#ifdef OC_ARM_ASM
@********************************************************************
@*                                                                  *
@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
@*                                                                  *
@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
@*                                                                  *
@********************************************************************
@ Original implementation:
@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
@ last mod: $Id: armidct.s 17728 2010-12-07 10:28:07Z tterribe $
@********************************************************************
	.text; .p2align 2
	.global	_oc_idct8x8_1_arm
	.global	_oc_idct8x8_arm
@ .type oc_idct8x8_1_arm, %function; oc_idct8x8_1_arm: @ PROC
_oc_idct8x8_1_arm:
@ Fast path for a block with only a (pre-scaled) DC coefficient:
@ fill all 64 output entries with the same value.
@ In:       r0 = ogg_int16_t *_y   (64-entry output block)
@           r1 = ogg_uint16_t _dc  (value broadcast to every coefficient)
@ Out:      r0 advanced past the end of the block.
@ Clobbers: r1, r2, r3, r12, flags.
	@ Duplicate the 16-bit DC value into both halves of a word, then
	@ into four registers, so each STMIA stores one 8-coefficient row.
	ORR	r1, r1, r1, LSL #16
	MOV	r2, r1
	MOV	r3, r1
	MOV	r12,r1
	STMIA	r0!,{r1,r2,r3,r12}	@ row 0
	STMIA	r0!,{r1,r2,r3,r12}	@ row 1
	STMIA	r0!,{r1,r2,r3,r12}	@ row 2
	STMIA	r0!,{r1,r2,r3,r12}	@ row 3
	STMIA	r0!,{r1,r2,r3,r12}	@ row 4
	STMIA	r0!,{r1,r2,r3,r12}	@ row 5
	STMIA	r0!,{r1,r2,r3,r12}	@ row 6
	STMIA	r0!,{r1,r2,r3,r12}	@ row 7
	MOV	PC, r14
@ .size oc_idct8x8_1_arm, .-oc_idct8x8_1_arm @ ENDP
@ .type oc_idct8x8_arm, %function; oc_idct8x8_arm: @ PROC
_oc_idct8x8_arm:
@ Full 8x8 inverse DCT entry point with sparseness dispatch.
@ In:  r0 = ogg_int16_t *_y      (destination block)
@      r1 = ogg_int16_t *_x      (source coefficients; zeroed on return
@                                 so the block is ready for the next use)
@ r2 = int _last_zzi             (1 + zig-zag index of the last nonzero
@                                 coefficient; small values mean only the
@                                 low-frequency corner can be nonzero)
@ Dispatch to a reduced transform when few coefficients are present.
	CMP	r2, #3
	BLE	oc_idct8x8_3_arm
	CMP	r2, #6
	BLE	oc_idct8x8_6_arm
	CMP	r2, #10
	BLE	oc_idct8x8_10_arm
oc_idct8x8_slow_arm:
	STMFD	r13!,{r4-r11,r14}
	SUB	r13,r13,#64*2		@ 64 int16 temp block on the stack
@ Row transforms
	STR	r0, [r13,#-4]!		@ Save _y; temp block is now at r13+4.
	ADD	r0, r13, #4		@ Write to temp storage.
	BL	idct8core_arm		@ Each call consumes one 16-byte input
	BL	idct8core_arm		@ row (advancing r1) and writes one
	BL	idct8core_arm		@ output column (advancing r0 by 2).
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	BL	idct8core_arm
	LDR	r0, [r13], #4		@ Write to the final destination.
	SUB	r2, r1, #8*16		@ r2 = start of input (r1 advanced 8 rows).
@ Clear input data for next block.
	MOV	r4, #0
	MOV	r5, #0
	MOV	r6, #0
	MOV	r7, #0
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	STMIA	r2!,{r4,r5,r6,r7}
	MOV	r1, r13			@ And read from temp storage.
@ Column transforms (the _down cores add the rounding bias and >>4).
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	BL	idct8core_down_arm
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r11,PC}
@ .size oc_idct8x8_arm, .-oc_idct8x8_arm @ ENDP
@ .type oc_idct8x8_10_arm, %function; oc_idct8x8_10_arm: @ PROC
oc_idct8x8_10_arm:
@ Reduced IDCT: at most the first 10 zig-zag coefficients are nonzero,
@ i.e. rows 0..3 contain at most 4, 3, 2 and 1 nonzero entries.
@ In:  r0 = ogg_int16_t *_y, r1 = ogg_int16_t *_x (zeroed on return).
	STMFD	r13!,{r4-r11,r14}
	SUB	r13,r13,#64*2		@ 64 int16 temp block on the stack
@ Row transforms
	MOV	r2, r0			@ Save _y.
	MOV	r0, r13			@ Write to temp storage.
	BL	idct4core_arm		@ row 0: up to 4 nonzero coefficients
	BL	idct3core_arm		@ row 1: up to 3
	BL	idct2core_arm		@ row 2: up to 2
	BL	idct1core_arm		@ row 3: up to 1
@ Clear input data for next block (r1 was advanced 4 rows by the cores;
@ each STR zeroes a pair of int16 coefficients).
	MOV	r4, #0
	STR	r4, [r1,#-4*16]!	@ x[0],x[1]
	STR	r4, [r1,#4]		@ x[2],x[3]
	STR	r4, [r1,#16]		@ x[8],x[9]
	STR	r4, [r1,#20]		@ x[10],x[11]
	STR	r4, [r1,#32]		@ x[16],x[17]
	STR	r4, [r1,#48]		@ x[24],x[25]
	MOV	r1, r13			@ Read from temp storage.
	MOV	r0, r2			@ Write to the final destination.
oc_idct8x8_10_arm_cols:
@ Column transforms: only the first 4 entries of each column can be
@ nonzero, so the 4-point down core suffices for all 8 columns.
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	BL	idct4core_down_arm
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r11,PC}
@ .size oc_idct8x8_10_arm, .-oc_idct8x8_10_arm @ ENDP
@ .type oc_idct8x8_6_arm, %function; oc_idct8x8_6_arm: @ PROC
oc_idct8x8_6_arm:
@ Reduced IDCT: at most the first 6 zig-zag coefficients are nonzero,
@ i.e. rows 0..2 contain at most 3, 2 and 1 nonzero entries.
@ In:  r0 = ogg_int16_t *_y, r1 = ogg_int16_t *_x (zeroed on return).
@ Note: r8 is not used by the 1/2/3-point cores, so it is not saved.
	STMFD	r13!,{r4-r7,r9-r11,r14}
	SUB	r13,r13,#64*2		@ 64 int16 temp block on the stack
@ Row transforms
	MOV	r2, r0			@ Save _y.
	MOV	r0, r13			@ Write to temp storage.
	BL	idct3core_arm		@ row 0: up to 3 nonzero coefficients
	BL	idct2core_arm		@ row 1: up to 2
	BL	idct1core_arm		@ row 2: up to 1
@ Clear input data for next block (r1 was advanced 3 rows by the cores).
	MOV	r4, #0
	STR	r4, [r1,#-3*16]!	@ x[0],x[1]
	STR	r4, [r1,#4]		@ x[2],x[3]
	STR	r4, [r1,#16]		@ x[8],x[9]
	STR	r4, [r1,#32]		@ x[16],x[17]
	MOV	r1, r13			@ Read from temp storage.
	MOV	r0, r2			@ Write to the final destination.
@ Column transforms: only the first 3 entries of each column can be
@ nonzero, so the 3-point down core suffices for all 8 columns.
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	BL	idct3core_down_arm
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r7,r9-r11,PC}
@ .size oc_idct8x8_6_arm, .-oc_idct8x8_6_arm @ ENDP
@ .type oc_idct8x8_3_arm, %function; oc_idct8x8_3_arm: @ PROC
oc_idct8x8_3_arm:
@ Reduced IDCT: at most the first 3 zig-zag coefficients are nonzero,
@ i.e. rows 0..1 contain at most 2 and 1 nonzero entries.
@ In:  r0 = ogg_int16_t *_y, r1 = ogg_int16_t *_x (zeroed on return).
@ Note: r8 is not used by the 1/2-point cores, so it is not saved.
	STMFD	r13!,{r4-r7,r9-r11,r14}
	SUB	r13,r13,#64*2		@ 64 int16 temp block on the stack
@ Row transforms
	MOV	r2, r0			@ Save _y.
	MOV	r0, r13			@ Write to temp storage.
	BL	idct2core_arm		@ row 0: up to 2 nonzero coefficients
	BL	idct1core_arm		@ row 1: up to 1
@ Clear input data for next block (r1 was advanced 2 rows by the cores).
	MOV	r4, #0
	STR	r4, [r1,#-2*16]!	@ x[0],x[1]
	STR	r4, [r1,#16]		@ x[8],x[9]
	MOV	r1, r13			@ Read from temp storage.
	MOV	r0, r2			@ Write to the final destination.
@ Column transforms: only the first 2 entries of each column can be
@ nonzero, so the 2-point down core suffices for all 8 columns.
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	BL	idct2core_down_arm
	ADD	r13,r13,#64*2
	LDMFD	r13!,{r4-r7,r9-r11,PC}
@ .size oc_idct8x8_3_arm, .-oc_idct8x8_3_arm @ ENDP
@ .type idct1core_arm, %function; idct1core_arm: @ PROC
idct1core_arm:
@ 1-point IDCT core: only x[0] may be nonzero, so every output is
@ (OC_C4S4*x[0])>>16, broadcast down one output column.
@ r0 = ogg_int16_t *_y (destination; advanced by 2 = one column)
@ r1 = const ogg_int16_t *_x (source; advanced by 16 = one row)
@ Clobbers: r3, r12, flags.  Output stride is 16 bytes (one 8-entry row).
	LDRSH	r3, [r1], #16
	@ r12 = OC_C4S4 = 0xB505 (cos(pi/4) in Q16), built with two
	@ instructions instead of a literal-pool load.
	MOV	r12,#0x05
	ORR	r12,r12,#0xB500
	MUL	r3, r12, r3
	@ Stall ? (MUL result needed by the next instruction on ARMv4)
	MOV	r3, r3, ASR #16
	STRH	r3, [r0], #2		@ y[0]
	STRH	r3, [r0, #14]		@ y[1]
	STRH	r3, [r0, #30]		@ y[2]
	STRH	r3, [r0, #46]		@ y[3]
	STRH	r3, [r0, #62]		@ y[4]
	STRH	r3, [r0, #78]		@ y[5]
	STRH	r3, [r0, #94]		@ y[6]
	STRH	r3, [r0, #110]		@ y[7]
	MOV	PC,R14
@ .size idct1core_arm, .-idct1core_arm @ ENDP
@ .type idct2core_arm, %function; idct2core_arm: @ PROC
idct2core_arm:
@ 2-point IDCT core (row pass, no rounding/shift): x[0] and x[1] may be
@ nonzero.  Reads one 16-byte row, writes one column with stride 16.
@ r0 = ogg_int16_t *_y (destination; advanced by 2)
@ r1 = const ogg_int16_t *_x (source; advanced by 16)
@ Clobbers: r3, r9-r12, flags.
	LDRSH	r9, [r1], #16		@ r9 = x[0]
	LDR	r12,OC_C4S4
	LDRSH	r11,[r1, #-14]		@ r11= x[1]
	LDR	r3, OC_C7S1
	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r10,OC_C1S7
	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
	MOV	r9, r9, ASR #16		@ r9 = t[0]
	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
	MOV	r3, r3, ASR #16		@ r3 = t[4]
	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
	MOV	r11,r11,ASR #16		@ r11= t[7]
	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
	MOV	r10,r10,ASR #16		@ r10= t[5]
	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]
	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]
	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]
	ADD	r3, r3, r9		@ r3 = t[0]+t[4]
	ADD	r11,r11,r9		@ r11= t[0]+t[7]
	STRH	r11,[r0], #2		@ y[0] = t[0]+t[7]
	STRH	r12,[r0, #14]		@ y[1] = t[0]+t[6]
	STRH	r10,[r0, #30]		@ y[2] = t[0]+t[5]
	STRH	r3, [r0, #46]		@ y[3] = t[0]+t[4]
	RSB	r3, r3, r9, LSL #1	@ r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
	RSB	r10,r10,r9, LSL #1	@ r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
	RSB	r12,r12,r9, LSL #1	@ r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
	RSB	r11,r11,r9, LSL #1	@ r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
	MOV	PC,r14
@ .size idct2core_arm, .-idct2core_arm @ ENDP
@ .type idct2core_down_arm, %function; idct2core_down_arm: @ PROC
idct2core_down_arm:
@ 2-point IDCT core, column ("down") pass: same butterfly as
@ idct2core_arm but adds the rounding bias (+8) and shifts the final
@ outputs right by 4.
@ r0 = ogg_int16_t *_y (destination; advanced by 2)
@ r1 = const ogg_int16_t *_x (source; advanced by 16)
@ Clobbers: r3-r7, r9-r12, flags.
	LDRSH	r9, [r1], #16		@ r9 = x[0]
	LDR	r12,OC_C4S4
	LDRSH	r11,[r1, #-14]		@ r11= x[1]
	LDR	r3, OC_C7S1
	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r10,OC_C1S7
	MUL	r3, r11,r3		@ r3 = t[4]<<16 = OC_C7S1*x[1]
	MOV	r9, r9, ASR #16		@ r9 = t[0]
	MUL	r11,r10,r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
	ADD	r9, r9, #8		@ r9 = t[0]+8 (rounding bias for >>4)
	MOV	r3, r3, ASR #16		@ r3 = t[4]
	MUL	r10,r12,r3		@ r10= t[5]<<16 = OC_C4S4*t[4]
	MOV	r11,r11,ASR #16		@ r11= t[7]
	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
	MOV	r10,r10,ASR #16		@ r10= t[5]
	ADD	r12,r9,r12,ASR #16	@ r12= t[0]+t[6]+8
	ADD	r12,r12,r10		@ r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
	SUB	r10,r12,r10,LSL #1	@ r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
	ADD	r3, r3, r9		@ r3 = t[0]+t[4]+8
	ADD	r11,r11,r9		@ r11= t[0]+t[7]+8
	@ TODO: This is wrong.
	@ The C code truncates to 16 bits by storing to RAM and doing the
	@ shifts later; we've got an extra 4 bits here.
	MOV	r4, r11,ASR #4
	MOV	r5, r12,ASR #4
	MOV	r6, r10,ASR #4
	MOV	r7, r3, ASR #4
	RSB	r3, r3, r9, LSL #1	@ r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
	RSB	r10,r10,r9, LSL #1	@ r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
	RSB	r12,r12,r9, LSL #1	@ r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
	RSB	r11,r11,r9, LSL #1	@ r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
	MOV	r3, r3, ASR #4
	MOV	r10,r10,ASR #4
	MOV	r12,r12,ASR #4
	MOV	r11,r11,ASR #4
	STRH	r4, [r0], #2		@ y[0] = t[0]+t[7]
	STRH	r5, [r0, #14]		@ y[1] = t[0]+t[6]
	STRH	r6, [r0, #30]		@ y[2] = t[0]+t[5]
	STRH	r7, [r0, #46]		@ y[3] = t[0]+t[4]
	STRH	r3, [r0, #62]		@ y[4] = t[0]-t[4]
	STRH	r10,[r0, #78]		@ y[5] = t[0]-t[5]
	STRH	r12,[r0, #94]		@ y[6] = t[0]-t[6]
	STRH	r11,[r0, #110]		@ y[7] = t[0]-t[7]
	MOV	PC,r14
@ .size idct2core_down_arm, .-idct2core_down_arm @ ENDP
@ .type idct3core_arm, %function; idct3core_arm: @ PROC
idct3core_arm:
@ 3-point IDCT core (row pass): x[0], x[1] and x[2] may be nonzero.
@ r0 = ogg_int16_t *_y (destination; advanced by 2)
@ r1 = const ogg_int16_t *_x (source; advanced by 16)
@ Clobbers: r3-r6, r9-r12, flags.
	LDRSH	r9, [r1], #16		@ r9 = x[0]
	LDR	r12,OC_C4S4		@ r12= OC_C4S4
	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
	LDR	r10,OC_C6S2		@ r10= OC_C6S2
	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
	LDRSH	r11,[r1, #-14]		@ r11= x[1]
	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
	MOV	r9, r9, ASR #16		@ r9 = t[0]
	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]
	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
	MOV	r4, r4, ASR #16		@ r4 = t[4]
	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
	MOV	r11,r11,ASR #16		@ r11= t[7]
	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
	ADD	r10,r9, r10,ASR #16	@ r10= t[1] = t[0]+t[2]
	RSB	r6, r10,r9, LSL #1	@ r6 = t[2] = t[0]-t[2]
	@ r3 = t2[0] = t[0]+t[3]
	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3] = t[0]-t[3]
	MOV	r12,r12,ASR #16		@ r12= t[6]
	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
	ADD	r11,r3, r11		@ r11= t2[0]+t[7]
	ADD	r5, r10,r5		@ r5 = t[1]+t2[6]
	ADD	r12,r6, r12		@ r12= t[2]+t2[5]
	ADD	r4, r9, r4		@ r4 = t2[3]+t[4]
	STRH	r11,[r0], #2		@ y[0] = t2[0]+t[7]
	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
	RSB	r11,r11,r3, LSL #1	@ r11= t2[0] - t[7]
	RSB	r5, r5, r10,LSL #1	@ r5 = t[1] - t2[6]
	RSB	r12,r12,r6, LSL #1	@ r12= t[2] - t2[5]
	RSB	r4, r4, r9, LSL #1	@ r4 = t2[3] - t[4]
	STRH	r4, [r0, #62]		@ y[4] = t2[3]-t[4]
	STRH	r12,[r0, #78]		@ y[5] = t[2]-t2[5]
	STRH	r5, [r0, #94]		@ y[6] = t[1]-t2[6]
	STRH	r11,[r0, #110]		@ y[7] = t2[0]-t[7]
	MOV	PC,R14
@ .size idct3core_arm, .-idct3core_arm @ ENDP
@ .type idct3core_down_arm, %function; idct3core_down_arm: @ PROC
idct3core_down_arm:
@ 3-point IDCT core, column ("down") pass: as idct3core_arm, but adds
@ the rounding bias (+8) and shifts the final outputs right by 4.
@ r0 = ogg_int16_t *_y (destination; advanced by 2)
@ r1 = const ogg_int16_t *_x (source; advanced by 16)
@ Clobbers: r3-r6, r9-r12, flags.
	LDRSH	r9, [r1], #16		@ r9 = x[0]
	LDR	r12,OC_C4S4		@ r12= OC_C4S4
	LDRSH	r3, [r1, #-12]		@ r3 = x[2]
	LDR	r10,OC_C6S2		@ r10= OC_C6S2
	MUL	r9, r12,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r4, OC_C2S6		@ r4 = OC_C2S6
	MUL	r10,r3, r10		@ r10= t[2]<<16 = OC_C6S2*x[2]
	LDRSH	r11,[r1, #-14]		@ r11= x[1]
	MUL	r3, r4, r3		@ r3 = t[3]<<16 = OC_C2S6*x[2]
	LDR	r4, OC_C7S1		@ r4 = OC_C7S1
	LDR	r5, OC_C1S7		@ r5 = OC_C1S7
	MOV	r9, r9, ASR #16		@ r9 = t[0]
	MUL	r4, r11,r4		@ r4 = t[4]<<16 = OC_C7S1*x[1]
	ADD	r9, r9, #8		@ r9 = t[0]+8 (rounding bias for >>4)
	MUL	r11,r5, r11		@ r11= t[7]<<16 = OC_C1S7*x[1]
	ADD	r3, r9, r3, ASR #16	@ r3 = t[0]+t[3]+8
	MOV	r4, r4, ASR #16		@ r4 = t[4]
	MUL	r5, r12,r4		@ r5 = t[5]<<16 = OC_C4S4*t[4]
	MOV	r11,r11,ASR #16		@ r11= t[7]
	MUL	r12,r11,r12		@ r12= t[6]<<16 = OC_C4S4*t[7]
	ADD	r10,r9, r10,ASR #16	@ r10= t[1]+8 = t[0]+t[2]+8
	RSB	r6, r10,r9, LSL #1	@ r6 = t[2]+8 = t[0]-t[2]+8
	@ r3 = t2[0]+8 = t[0]+t[3]+8
	RSB	r9, r3, r9, LSL #1	@ r9 = t2[3]+8 = t[0]-t[3]+8
	MOV	r12,r12,ASR #16		@ r12= t[6]
	ADD	r5, r12,r5, ASR #16	@ r5 = t2[6] = t[6]+t[5]
	RSB	r12,r5, r12,LSL #1	@ r12= t2[5] = t[6]-t[5]
	ADD	r11,r3, r11		@ r11= t2[0]+t[7] +8
	ADD	r5, r10,r5		@ r5 = t[1] +t2[6]+8
	ADD	r12,r6, r12		@ r12= t[2] +t2[5]+8
	ADD	r4, r9, r4		@ r4 = t2[3]+t[4] +8
	RSB	r3, r11,r3, LSL #1	@ r3 = t2[0] - t[7] + 8
	RSB	r10,r5, r10,LSL #1	@ r10= t[1] - t2[6] + 8
	RSB	r6, r12,r6, LSL #1	@ r6 = t[2] - t2[5] + 8
	RSB	r9, r4, r9, LSL #1	@ r9 = t2[3] - t[4] + 8
	@ TODO: This is wrong.
	@ The C code truncates to 16 bits by storing to RAM and doing the
	@ shifts later; we've got an extra 4 bits here.
	MOV	r11,r11,ASR #4
	MOV	r5, r5, ASR #4
	MOV	r12,r12,ASR #4
	MOV	r4, r4, ASR #4
	MOV	r9, r9, ASR #4
	MOV	r6, r6, ASR #4
	MOV	r10,r10,ASR #4
	MOV	r3, r3, ASR #4
	STRH	r11,[r0], #2		@ y[0] = t2[0]+t[7]
	STRH	r5, [r0, #14]		@ y[1] = t[1]+t2[6]
	STRH	r12,[r0, #30]		@ y[2] = t[2]+t2[5]
	STRH	r4, [r0, #46]		@ y[3] = t2[3]+t[4]
	STRH	r9, [r0, #62]		@ y[4] = t2[3]-t[4]
	STRH	r6, [r0, #78]		@ y[5] = t[2]-t2[5]
	STRH	r10,[r0, #94]		@ y[6] = t[1]-t2[6]
	STRH	r3, [r0, #110]		@ y[7] = t2[0]-t[7]
	MOV	PC,R14
@ .size idct3core_down_arm, .-idct3core_down_arm @ ENDP
@ .type idct4core_arm, %function; idct4core_arm: @ PROC
idct4core_arm:
@ 4-point IDCT core (row pass): x[0]..x[3] may be nonzero.
@ r0 = ogg_int16_t *_y (destination; advanced by 2)
@ r1 = const ogg_int16_t *_x (source; advanced by 16)
@ Clobbers: r3-r12, flags.
	LDRSH	r9, [r1], #16		@ r9 = x[0]
	LDR	r10,OC_C4S4		@ r10= OC_C4S4
	LDRSH	r12,[r1, #-12]		@ r12= x[2]
	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
	LDR	r12,OC_C1S7		@ r12= OC_C1S7
	LDRSH	r11,[r1, #-10]		@ r11= x[3]
	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
	MOV	r9, r9, ASR #16		@ r9 = t[0]
	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
	MOV	r6, r6, ASR #16		@ r6 = t[4]
@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
@ before multiplying, not after (this is not equivalent)
	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
	MOV	r3, r3, ASR #16		@ r3 = t[7]
	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2]
	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2]
	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3]
	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3]
	MOV	r3, r3, ASR #16		@ r3 = t2[6]
	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
	ADD	r11,r5, r11		@ r11= t[0]+t2[7]
	ADD	r6, r4, r6		@ r6 = t[1]+t3[6]
	ADD	r3, r10,r3		@ r3 = t[2]+t3[5]
	ADD	r7, r9, r7		@ r7 = t[3]+t2[4]
	STRH	r11,[r0], #2		@ y[0] = t[0]+t2[7]
	STRH	r6, [r0, #14]		@ y[1] = t[1]+t3[6]
	STRH	r3, [r0, #30]		@ y[2] = t[2]+t3[5]
	STRH	r7, [r0, #46]		@ y[3] = t[3]+t2[4]
	RSB	r11,r11,r5, LSL #1	@ r11= t[0]-t2[7]
	RSB	r6, r6, r4, LSL #1	@ r6 = t[1]-t3[6]
	RSB	r3, r3, r10,LSL #1	@ r3 = t[2]-t3[5]
	RSB	r7, r7, r9, LSL #1	@ r7 = t[3]-t2[4]
	STRH	r7, [r0, #62]		@ y[4] = t[3]-t2[4]
	STRH	r3, [r0, #78]		@ y[5] = t[2]-t3[5]
	STRH	r6, [r0, #94]		@ y[6] = t[1]-t3[6]
	STRH	r11, [r0, #110]		@ y[7] = t[0]-t2[7]
	MOV	PC,r14
@ .size idct4core_arm, .-idct4core_arm @ ENDP
@ .type idct4core_down_arm, %function; idct4core_down_arm: @ PROC
idct4core_down_arm:
@ 4-point IDCT core, column ("down") pass: as idct4core_arm, but adds
@ the rounding bias (+8) and shifts the final outputs right by 4.
@ r0 = ogg_int16_t *_y (destination; advanced by 2)
@ r1 = const ogg_int16_t *_x (source; advanced by 16)
@ Clobbers: r3-r12, flags.
	LDRSH	r9, [r1], #16		@ r9 = x[0]
	LDR	r10,OC_C4S4		@ r10= OC_C4S4
	LDRSH	r12,[r1, #-12]		@ r12= x[2]
	LDR	r4, OC_C6S2		@ r4 = OC_C6S2
	MUL	r9, r10,r9		@ r9 = t[0]<<16 = OC_C4S4*x[0]
	LDR	r5, OC_C2S6		@ r5 = OC_C2S6
	MUL	r4, r12,r4		@ r4 = t[2]<<16 = OC_C6S2*x[2]
	LDRSH	r3, [r1, #-14]		@ r3 = x[1]
	MUL	r5, r12,r5		@ r5 = t[3]<<16 = OC_C2S6*x[2]
	LDR	r6, OC_C7S1		@ r6 = OC_C7S1
	LDR	r12,OC_C1S7		@ r12= OC_C1S7
	LDRSH	r11,[r1, #-10]		@ r11= x[3]
	MUL	r6, r3, r6		@ r6 = t[4]<<16 = OC_C7S1*x[1]
	LDR	r7, OC_C5S3		@ r7 = OC_C5S3
	MUL	r3, r12,r3		@ r3 = t[7]<<16 = OC_C1S7*x[1]
	LDR	r8, OC_C3S5		@ r8 = OC_C3S5
	MUL	r7, r11,r7		@ r7 = -t[5]<<16 = OC_C5S3*x[3]
	MOV	r9, r9, ASR #16		@ r9 = t[0]
	MUL	r11,r8, r11		@ r11= t[6]<<16 = OC_C3S5*x[3]
	MOV	r6, r6, ASR #16		@ r6 = t[4]
@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
@ before multiplying, not after (this is not equivalent)
	SUB	r7, r6, r7, ASR #16	@ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
	RSB	r6, r7, r6, LSL #1	@ r6 = t[4]-t[5]
	MUL	r6, r10,r6		@ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
	MOV	r3, r3, ASR #16		@ r3 = t[7]
	ADD	r11,r3, r11,ASR #16	@ r11= t2[7]=t[7]+t[6]
	RSB	r3, r11,r3, LSL #1	@ r3 = t[7]-t[6]
	ADD	r9, r9, #8		@ r9 = t[0]+8 (rounding bias for >>4)
	MUL	r3, r10,r3		@ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
	ADD	r4, r9, r4, ASR #16	@ r4 = t[1] = t[0] + t[2] + 8
	RSB	r10,r4, r9, LSL #1	@ r10= t[2] = t[0] - t[2] + 8
	ADD	r5, r9, r5, ASR #16	@ r5 = t[0] = t[0] + t[3] + 8
	RSB	r9, r5, r9, LSL #1	@ r9 = t[3] = t[0] - t[3] + 8
	MOV	r3, r3, ASR #16		@ r3 = t2[6]
	ADD	r6, r3, r6, ASR #16	@ r6 = t3[6] = t2[6]+t2[5]
	RSB	r3, r6, r3, LSL #1	@ r3 = t3[5] = t2[6]-t2[5]
	ADD	r5, r5, r11		@ r5 = t[0]+t2[7]+8
	ADD	r4, r4, r6		@ r4 = t[1]+t3[6]+8
	ADD	r10,r10,r3		@ r10= t[2]+t3[5]+8
	ADD	r9, r9, r7		@ r9 = t[3]+t2[4]+8
	SUB	r11,r5, r11,LSL #1	@ r11= t[0]-t2[7]+8
	SUB	r6, r4, r6, LSL #1	@ r6 = t[1]-t3[6]+8
	SUB	r3, r10,r3, LSL #1	@ r3 = t[2]-t3[5]+8
	SUB	r7, r9, r7, LSL #1	@ r7 = t[3]-t2[4]+8
@ TODO: This is wrong.
@ The C code truncates to 16 bits by storing to RAM and doing the
@ shifts later; we've got an extra 4 bits here.
	MOV	r11,r11,ASR #4
	MOV	r6, r6, ASR #4
	MOV	r3, r3, ASR #4
	MOV	r7, r7, ASR #4
	MOV	r9, r9, ASR #4
	MOV	r10,r10,ASR #4
	MOV	r4, r4, ASR #4
	MOV	r5, r5, ASR #4
	STRH	r5,[r0], #2		@ y[0] = t[0]+t2[7]
	STRH	r4, [r0, #14]		@ y[1] = t[1]+t3[6]
	STRH	r10,[r0, #30]		@ y[2] = t[2]+t3[5]
	STRH	r9, [r0, #46]		@ y[3] = t[3]+t2[4]
	STRH	r7, [r0, #62]		@ y[4] = t[3]-t2[4]
	STRH	r3, [r0, #78]		@ y[5] = t[2]-t3[5]
	STRH	r6, [r0, #94]		@ y[6] = t[1]-t3[6]
	STRH	r11,[r0, #110]		@ y[7] = t[0]-t2[7]
	MOV	PC,r14
@ .size idct4core_down_arm, .-idct4core_down_arm @ ENDP
@ .type idct8core_arm, %function; idct8core_arm: @ PROC
idct8core_arm:
@ Full 8-point IDCT core (row pass, no rounding/shift).
@ r0 = ogg_int16_t *_y (destination; advanced by 2 = one column)
@ r1 = const ogg_int16_t *_x (source; advanced by 16 = one row;
@      the advanced value is preserved across the call via the stack)
@ Clobbers: r2-r12, flags.
	LDRSH	r2, [r1],#16		@ r2 = x[0]
	STMFD	r13!,{r1,r14}		@ Save (already advanced) r1 and LR.
	LDRSH	r6, [r1, #-8]		@ r6 = x[4]
	LDR	r12,OC_C4S4		@ r12= C4S4
	LDRSH	r4, [r1, #-12]		@ r4 = x[2]
	ADD	r2, r2, r6		@ r2 = x[0] + x[4]
	SUB	r6, r2, r6, LSL #1	@ r6 = x[0] - x[4]
@ For spec compliance, these sums must be truncated to 16-bit precision
@ _before_ the multiply (not after).
@ Sadly, ARMv4 provides no simple way to do that.
	MOV	r2, r2, LSL #16
	MOV	r6, r6, LSL #16
	MOV	r2, r2, ASR #16
	MOV	r6, r6, ASR #16
	MUL	r2, r12,r2		@ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
	LDRSH	r8, [r1, #-4]		@ r8 = x[6]
	LDR	r7, OC_C6S2		@ r7 = OC_C6S2
	MUL	r6, r12,r6		@ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
	LDR	r14,OC_C2S6		@ r14= OC_C2S6
	MUL	r3, r4, r7		@ r3 = OC_C6S2*x[2]
	LDR	r5, OC_C7S1		@ r5 = OC_C7S1
	MUL	r4, r14,r4		@ r4 = OC_C2S6*x[2]
	MOV	r3, r3, ASR #16		@ r3 = OC_C6S2*x[2]>>16
	MUL	r14,r8, r14		@ r14= OC_C2S6*x[6]
	MOV	r4, r4, ASR #16		@ r4 = OC_C2S6*x[2]>>16
	MUL	r8, r7, r8		@ r8 = OC_C6S2*x[6]
	LDR	r7, OC_C1S7		@ r7 = OC_C1S7
	SUB	r3, r3, r14,ASR #16	@ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
	LDRSH	r14,[r1, #-14]		@ r14= x[1]
	ADD	r4, r4, r8, ASR #16	@ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
	LDRSH	r8, [r1, #-2]		@ r8 = x[7]
	MUL	r9, r5, r14		@ r9 = OC_C7S1*x[1]
	LDRSH	r10,[r1, #-6]		@ r10= x[5]
	MUL	r14,r7, r14		@ r14= OC_C1S7*x[1]
	MOV	r9, r9, ASR #16		@ r9 = OC_C7S1*x[1]>>16
	MUL	r7, r8, r7		@ r7 = OC_C1S7*x[7]
	MOV	r14,r14,ASR #16		@ r14= OC_C1S7*x[1]>>16
	MUL	r8, r5, r8		@ r8 = OC_C7S1*x[7]
	LDRSH	r1, [r1, #-10]		@ r1 = x[3] (r1 no longer a pointer)
	LDR	r5, OC_C3S5		@ r5 = OC_C3S5
	LDR	r11,OC_C5S3		@ r11= OC_C5S3
	ADD	r8, r14,r8, ASR #16	@ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
	MUL	r14,r5, r10		@ r14= OC_C3S5*x[5]
	SUB	r9, r9, r7, ASR #16	@ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
	MUL	r10,r11,r10		@ r10= OC_C5S3*x[5]
	MOV	r14,r14,ASR #16		@ r14= OC_C3S5*x[5]>>16
	MUL	r11,r1, r11		@ r11= OC_C5S3*x[3]
	MOV	r10,r10,ASR #16		@ r10= OC_C5S3*x[5]>>16
	MUL	r1, r5, r1		@ r1 = OC_C3S5*x[3]
	SUB	r14,r14,r11,ASR #16	@r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
	ADD	r10,r10,r1, ASR #16	@r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
@ r10=t[6] r12=C4S4 r14=t[5]
@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
@ before multiplying, not after (this is not equivalent)
@ Stage 2
@ 4-5 butterfly
	ADD	r9, r9, r14		@ r9 = t2[4] = t[4]+t[5]
	SUB	r14,r9, r14, LSL #1	@ r14= t[4]-t[5]
	MUL	r14,r12,r14		@ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
@ 7-6 butterfly
	ADD	r8, r8, r10		@ r8 = t2[7] = t[7]+t[6]
	SUB	r10,r8, r10, LSL #1	@ r10= t[7]-t[6]
	MUL	r10,r12,r10		@ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
@ Stage 3
@ 0-3 butterfly
	ADD	r2, r4, r2, ASR #16	@ r2 = t2[0] = t[0] + t[3]
	SUB	r4, r2, r4, LSL #1	@ r4 = t2[3] = t[0] - t[3]
@ 1-2 butterfly
	ADD	r6, r3, r6, ASR #16	@ r6 = t2[1] = t[1] + t[2]
	SUB	r3, r6, r3, LSL #1	@ r3 = t2[2] = t[1] - t[2]
@ 6-5 butterfly
	MOV	r14,r14,ASR #16		@ r14= t2[5]
	ADD	r10,r14,r10,ASR #16	@ r10= t3[6] = t[6] + t[5]
	SUB	r14,r10,r14,LSL #1	@ r14= t3[5] = t[6] - t[5]
@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
@ r10=t3[6] r14=t3[5]
@ Stage 4
	ADD	r2, r2, r8		@ r2 = t[0] + t[7]
	ADD	r6, r6, r10		@ r6 = t[1] + t[6]
	ADD	r3, r3, r14		@ r3 = t[2] + t[5]
	ADD	r4, r4, r9		@ r4 = t[3] + t[4]
	SUB	r8, r2, r8, LSL #1	@ r8 = t[0] - t[7]
	SUB	r10,r6, r10,LSL #1	@ r10= t[1] - t[6]
	SUB	r14,r3, r14,LSL #1	@ r14= t[2] - t[5]
	SUB	r9, r4, r9, LSL #1	@ r9 = t[3] - t[4]
	STRH	r2, [r0], #2		@ y[0] = t[0]+t[7]
	STRH	r6, [r0, #14]		@ y[1] = t[1]+t[6]
	STRH	r3, [r0, #30]		@ y[2] = t[2]+t[5]
	STRH	r4, [r0, #46]		@ y[3] = t[3]+t[4]
	STRH	r9, [r0, #62]		@ y[4] = t[3]-t[4]
	STRH	r14,[r0, #78]		@ y[5] = t[2]-t[5]
	STRH	r10,[r0, #94]		@ y[6] = t[1]-t[6]
	STRH	r8, [r0, #110]		@ y[7] = t[0]-t[7]
	LDMFD	r13!,{r1,PC}		@ Restore advanced r1 and return.
@ .size idct8core_arm, .-idct8core_arm @ ENDP
@ .type idct8core_down_arm, %function; idct8core_down_arm: @ PROC
@ One 8-point 1-D inverse DCT (ARMv4 version) for the final, column pass:
@ identical in structure to idct8core_arm except that a rounding bias of 8
@ is folded in and every output is shifted right by 4 before being stored.
@ Reads eight consecutive ogg_int16_t coefficients (one row of _x, advancing
@ r1 by 16 bytes) and stores the eight results with a 16-byte stride (one
@ column of _y, advancing r0 by 2 bytes), i.e. the output is transposed.
@ Clobbers r2-r12, r14; r1 is used as a scratch register internally and
@ restored (advanced past the row) from the stack on exit.
idct8core_down_arm:
@ r0 = ogg_int16_t *_y (destination)
@ r1 = const ogg_int16_t *_x (source)
LDRSH r2, [r1],#16 @ r2 = x[0]
STMFD r13!,{r1,r14}
LDRSH r6, [r1, #-8] @ r6 = x[4]
LDR r12,OC_C4S4 @ r12= C4S4
LDRSH r4, [r1, #-12] @ r4 = x[2]
ADD r2, r2, r6 @ r2 = x[0] + x[4]
SUB r6, r2, r6, LSL #1 @ r6 = x[0] - x[4]
@ For spec compliance, these sums must be truncated to 16-bit precision
@ _before_ the multiply (not after).
@ Sadly, ARMv4 provides no simple way to do that.
MOV r2, r2, LSL #16 @ Truncate x[0]+x[4] to 16 bits (sign-extend)...
MOV r6, r6, LSL #16 @ ...and x[0]-x[4] likewise...
MOV r2, r2, ASR #16 @ ...by shifting up...
MOV r6, r6, ASR #16 @ ...and arithmetically back down.
MUL r2, r12,r2 @ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
LDRSH r8, [r1, #-4] @ r8 = x[6]
LDR r7, OC_C6S2 @ r7 = OC_C6S2
MUL r6, r12,r6 @ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
LDR r14,OC_C2S6 @ r14= OC_C2S6
MUL r3, r4, r7 @ r3 = OC_C6S2*x[2]
LDR r5, OC_C7S1 @ r5 = OC_C7S1
MUL r4, r14,r4 @ r4 = OC_C2S6*x[2]
MOV r3, r3, ASR #16 @ r3 = OC_C6S2*x[2]>>16
MUL r14,r8, r14 @ r14= OC_C2S6*x[6]
MOV r4, r4, ASR #16 @ r4 = OC_C2S6*x[2]>>16
MUL r8, r7, r8 @ r8 = OC_C6S2*x[6]
LDR r7, OC_C1S7 @ r7 = OC_C1S7
SUB r3, r3, r14,ASR #16 @ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
LDRSH r14,[r1, #-14] @ r14= x[1]
ADD r4, r4, r8, ASR #16 @ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
LDRSH r8, [r1, #-2] @ r8 = x[7]
MUL r9, r5, r14 @ r9 = OC_C7S1*x[1]
LDRSH r10,[r1, #-6] @ r10= x[5]
MUL r14,r7, r14 @ r14= OC_C1S7*x[1]
MOV r9, r9, ASR #16 @ r9 = OC_C7S1*x[1]>>16
MUL r7, r8, r7 @ r7 = OC_C1S7*x[7]
MOV r14,r14,ASR #16 @ r14= OC_C1S7*x[1]>>16
MUL r8, r5, r8 @ r8 = OC_C7S1*x[7]
LDRSH r1, [r1, #-10] @ r1 = x[3] (r1 now scratch; restored on exit)
LDR r5, OC_C3S5 @ r5 = OC_C3S5
LDR r11,OC_C5S3 @ r11= OC_C5S3
ADD r8, r14,r8, ASR #16 @ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
MUL r14,r5, r10 @ r14= OC_C3S5*x[5]
SUB r9, r9, r7, ASR #16 @ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
MUL r10,r11,r10 @ r10= OC_C5S3*x[5]
MOV r14,r14,ASR #16 @ r14= OC_C3S5*x[5]>>16
MUL r11,r1, r11 @ r11= OC_C5S3*x[3]
MOV r10,r10,ASR #16 @ r10= OC_C5S3*x[5]>>16
MUL r1, r5, r1 @ r1 = OC_C3S5*x[3]
SUB r14,r14,r11,ASR #16 @r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
ADD r10,r10,r1, ASR #16 @r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
@ r10=t[6] r12=C4S4 r14=t[5]
@ Stage 2
@ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
@ before multiplying, not after (this is not equivalent)
@ 4-5 butterfly
ADD r9, r9, r14 @ r9 = t2[4] = t[4]+t[5]
SUB r14,r9, r14, LSL #1 @ r14= t[4]-t[5]
MUL r14,r12,r14 @ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
@ 7-6 butterfly
ADD r8, r8, r10 @ r8 = t2[7] = t[7]+t[6]
SUB r10,r8, r10, LSL #1 @ r10= t[7]-t[6]
MUL r10,r12,r10 @ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
@ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
@ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
@ Stage 3
ADD r2, r2, #8<<16 @ r2 = t[0]+8<<16 (fold in the rounding bias early)
ADD r6, r6, #8<<16 @ r6 = t[1]+8<<16
@ 0-3 butterfly
ADD r2, r4, r2, ASR #16 @ r2 = t2[0] = t[0] + t[3] + 8
SUB r4, r2, r4, LSL #1 @ r4 = t2[3] = t[0] - t[3] + 8
@ 1-2 butterfly
ADD r6, r3, r6, ASR #16 @ r6 = t2[1] = t[1] + t[2] + 8
SUB r3, r6, r3, LSL #1 @ r3 = t2[2] = t[1] - t[2] + 8
@ 6-5 butterfly
MOV r14,r14,ASR #16 @ r14= t2[5]
ADD r10,r14,r10,ASR #16 @ r10= t3[6] = t[6] + t[5]
SUB r14,r10,r14,LSL #1 @ r14= t3[5] = t[6] - t[5]
@ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
@ r10=t3[6] r14=t3[5]
@ Stage 4
ADD r2, r2, r8 @ r2 = t[0] + t[7] + 8
ADD r6, r6, r10 @ r6 = t[1] + t[6] + 8
ADD r3, r3, r14 @ r3 = t[2] + t[5] + 8
ADD r4, r4, r9 @ r4 = t[3] + t[4] + 8
SUB r8, r2, r8, LSL #1 @ r8 = t[0] - t[7] + 8
SUB r10,r6, r10,LSL #1 @ r10= t[1] - t[6] + 8
SUB r14,r3, r14,LSL #1 @ r14= t[2] - t[5] + 8
SUB r9, r4, r9, LSL #1 @ r9 = t[3] - t[4] + 8
@ TODO: This is wrong.
@ The C code truncates to 16 bits by storing to RAM and doing the
@ shifts later; we've got an extra 4 bits here.
MOV r2, r2, ASR #4 @ Final scaling: each result becomes (t+8)>>4.
MOV r6, r6, ASR #4
MOV r3, r3, ASR #4
MOV r4, r4, ASR #4
MOV r8, r8, ASR #4
MOV r10,r10,ASR #4
MOV r14,r14,ASR #4
MOV r9, r9, ASR #4
STRH r2, [r0], #2 @ y[0] = t[0]+t[7]+8>>4
STRH r6, [r0, #14] @ y[1] = t[1]+t[6]+8>>4
STRH r3, [r0, #30] @ y[2] = t[2]+t[5]+8>>4
STRH r4, [r0, #46] @ y[3] = t[3]+t[4]+8>>4
STRH r9, [r0, #62] @ y[4] = t[3]-t[4]+8>>4
STRH r14,[r0, #78] @ y[5] = t[2]-t[5]+8>>4
STRH r10,[r0, #94] @ y[6] = t[1]-t[6]+8>>4
STRH r8, [r0, #110] @ y[7] = t[0]-t[7]+8>>4
LDMFD r13!,{r1,PC}
@ .size idct8core_down_arm, .-idct8core_down_arm @ ENDP
  737. .if OC_ARM_ASM_MEDIA
  738. .global _oc_idct8x8_1_v6
  739. .global _oc_idct8x8_v6
  740. @ .type oc_idct8x8_1_v6, %function; oc_idct8x8_1_v6: @ PROC
  741. _oc_idct8x8_1_v6:
  742. @ r0 = ogg_int16_t *_y
  743. @ r1 = ogg_uint16_t _dc
  744. ORR r2, r1, r1, LSL #16
  745. ORR r3, r1, r1, LSL #16
  746. STRD r2, r3, [r0], #8
  747. STRD r2, r3, [r0], #8
  748. STRD r2, r3, [r0], #8
  749. STRD r2, r3, [r0], #8
  750. STRD r2, r3, [r0], #8
  751. STRD r2, r3, [r0], #8
  752. STRD r2, r3, [r0], #8
  753. STRD r2, r3, [r0], #8
  754. STRD r2, r3, [r0], #8
  755. STRD r2, r3, [r0], #8
  756. STRD r2, r3, [r0], #8
  757. STRD r2, r3, [r0], #8
  758. STRD r2, r3, [r0], #8
  759. STRD r2, r3, [r0], #8
  760. STRD r2, r3, [r0], #8
  761. STRD r2, r3, [r0], #8
  762. MOV PC, r14
  763. @ .size oc_idct8x8_1_v6, .-oc_idct8x8_1_v6 @ ENDP
@ .type oc_idct8x8_v6, %function; oc_idct8x8_v6: @ PROC
@ Top-level 8x8 inverse DCT (ARMv6 media-instruction version).
@ r0 = ogg_int16_t *_y (64-entry output block)
@ r1 = ogg_int16_t *_x (64-entry input block; cleared on return so the
@      caller can reuse it for the next block)
@ r2 = int _last_zzi (zig-zag index of the last nonzero coefficient;
@      small values dispatch to cheaper specialized transforms)
_oc_idct8x8_v6:
@ r0 = ogg_int16_t *_y
@ r1 = ogg_int16_t *_x
@ r2 = int _last_zzi
CMP r2, #3
BLE oc_idct8x8_3_v6
@CMP r2, #6
@BLE oc_idct8x8_6_v6
CMP r2, #10
BLE oc_idct8x8_10_v6
@ General case: full 8-point transform on every row, then every column.
oc_idct8x8_slow_v6:
STMFD r13!,{r4-r11,r14}
SUB r13,r13,#64*2 @ Reserve a 64-entry ogg_int16_t scratch block.
@ Row transforms (each call handles two rows, writing transposed output).
STR r0, [r13,#-4]! @ Save the final destination below the scratch block.
ADD r0, r13, #4 @ Write to temp storage.
BL idct8_8core_v6
BL idct8_8core_v6
BL idct8_8core_v6
BL idct8_8core_v6
LDR r0, [r13], #4 @ Write to the final destination.
@ Clear input data for next block.
MOV r4, #0
MOV r5, #0
STRD r4, r5, [r1,#-8*16]! @ Rewind r1 to the block start (8 rows were read).
STRD r4, r5, [r1,#8]
STRD r4, r5, [r1,#16]
STRD r4, r5, [r1,#24]
STRD r4, r5, [r1,#32]
STRD r4, r5, [r1,#40]
STRD r4, r5, [r1,#48]
STRD r4, r5, [r1,#56]
STRD r4, r5, [r1,#64]
STRD r4, r5, [r1,#72]
STRD r4, r5, [r1,#80]
STRD r4, r5, [r1,#88]
STRD r4, r5, [r1,#96]
STRD r4, r5, [r1,#104]
STRD r4, r5, [r1,#112]
STRD r4, r5, [r1,#120]
MOV r1, r13 @ And read from temp storage.
@ Column transforms (two per call, with the final +8>>4 rounding).
BL idct8_8core_down_v6
BL idct8_8core_down_v6
BL idct8_8core_down_v6
BL idct8_8core_down_v6
ADD r13,r13,#64*2
LDMFD r13!,{r4-r11,PC}
@ .size oc_idct8x8_v6, .-oc_idct8x8_v6 @ ENDP
@ .type oc_idct8x8_10_v6, %function; oc_idct8x8_10_v6: @ PROC
@ 8x8 iDCT specialized for _last_zzi <= 10: only low-frequency coefficients
@ in the first four rows can be nonzero, so the row pass needs just one
@ 4-point call (rows 0-1) and one 2-point call (rows 2-3), and the column
@ pass uses 4-point transforms. Interface matches oc_idct8x8_slow_v6
@ (r0 = _y, r1 = _x; _x is cleared where it was read).
oc_idct8x8_10_v6:
STMFD r13!,{r4-r11,r14}
SUB r13,r13,#64*2+4 @ Scratch block + 4 spare bytes for 8-byte alignment.
@ Row transforms
MOV r2, r13
STR r0, [r13,#-4]!
AND r0, r2, #4 @ Align the stack. (r2 + (r2&4) is 8-byte aligned.)
ADD r0, r0, r2 @ Write to temp storage.
BL idct4_3core_v6
BL idct2_1core_v6
LDR r0, [r13], #4 @ Write to the final destination.
@ Clear input data for next block (only the coefficients that were read).
MOV r4, #0
MOV r5, #0
STRD r4, r5, [r1,#-4*16]! @ Rewind r1 to the block start (4 rows were read).
STRD r4, r5, [r1,#16]
STR r4, [r1,#32]
STR r4, [r1,#48]
AND r1, r13,#4 @ Align the stack.
ADD r1, r1, r13 @ And read from temp storage.
@ Column transforms
BL idct4_4core_down_v6
BL idct4_4core_down_v6
BL idct4_4core_down_v6
BL idct4_4core_down_v6
ADD r13,r13,#64*2+4
LDMFD r13!,{r4-r11,PC}
@ .size oc_idct8x8_10_v6, .-oc_idct8x8_10_v6 @ ENDP
@ .type oc_idct8x8_3_v6, %function; oc_idct8x8_3_v6: @ PROC
@ 8x8 iDCT specialized for _last_zzi <= 3: only x[0,0], x[0,1] and x[1,0]
@ can be nonzero, so a single 2-point row call and 2-point column
@ transforms suffice. Interface matches oc_idct8x8_slow_v6
@ (r0 = _y, r1 = _x; _x is cleared where it was read).
oc_idct8x8_3_v6:
STMFD r13!,{r4-r8,r14}
SUB r13,r13,#64*2 @ Reserve a 64-entry scratch block.
@ Row transforms
MOV r8, r0
MOV r0, r13 @ Write to temp storage.
BL idct2_1core_v6
@ Clear input data for next block (only the coefficients that were read).
MOV r4, #0
STR r4, [r1,#-2*16]! @ Rewind r1 to the block start (2 rows were read).
STR r4, [r1,#16]
MOV r1, r13 @ Read from temp storage.
MOV r0, r8 @ Write to the final destination.
@ Column transforms
BL idct2_2core_down_v6
BL idct2_2core_down_v6
BL idct2_2core_down_v6
BL idct2_2core_down_v6
ADD r13,r13,#64*2
LDMFD r13!,{r4-r8,PC}
@ .size oc_idct8x8_3_v6, .-oc_idct8x8_3_v6 @ ENDP
@ .type idct2_1core_v6, %function; idct2_1core_v6: @ PROC
@ First-pass (row) transform for two rows when only x[0,0], x[0,1] and
@ x[1,0] can be nonzero. Results for the row pair are packed <row1|row0>
@ into each 32-bit store and written with a 16-byte stride (transposed).
@ No rounding or shift is applied here (that happens in the *_down pass).
@ r0 = ogg_int16_t *_y (destination); advanced by 4 bytes.
@ r1 = const ogg_int16_t *_x (source); advanced past two rows (32 bytes).
@ Clobbers r2-r7, r12.
idct2_1core_v6:
@ r0 = ogg_int16_t *_y (destination)
@ r1 = const ogg_int16_t *_x (source)
@ Stage 1:
LDR r2, [r1], #16 @ r2 = <x[0,1]|x[0,0]>
LDR r3, OC_C4S4
LDRSH r6, [r1], #16 @ r6 = x[1,0]
SMULWB r12,r3, r2 @ r12= t[0,0]=OC_C4S4*x[0,0]>>16
LDRD r4, r5, OC_C7S1 @ r4 = OC_C7S1; r5 = OC_C1S7
SMULWB r6, r3, r6 @ r6 = t[1,0]=OC_C4S4*x[1,0]>>16
SMULWT r4, r4, r2 @ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
SMULWT r7, r5, r2 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
@ Stage 2:
SMULWB r5, r3, r4 @ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
PKHBT r12,r12,r6, LSL #16 @ r12= <t[1,0]|t[0,0]>
SMULWB r6, r3, r7 @ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
PKHBT r7, r7, r3 @ r7 = <0|t[0,7]> (top half of OC_C4S4 is 0)
@ Stage 3:
PKHBT r5, r6, r5, LSL #16 @ r5 = <t[0,5]|t[0,6]>
PKHBT r4, r4, r3 @ r4 = <0|t[0,4]>
SADDSUBX r5, r5, r5 @ r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
@ Stage 4:
PKHTB r6, r3, r5, ASR #16 @ r6 = <0|t[0,6]>
PKHBT r5, r5, r3 @ r5 = <0|t[0,5]>
SADD16 r3, r12,r7 @ r3 = t[0]+t[7]
STR r3, [r0], #4 @ y[0<<3] = t[0]+t[7]
SADD16 r3, r12,r6 @ r3 = t[0]+t[6]
STR r3, [r0, #12] @ y[1<<3] = t[0]+t[6]
SADD16 r3, r12,r5 @ r3 = t[0]+t[5]
STR r3, [r0, #28] @ y[2<<3] = t[0]+t[5]
SADD16 r3, r12,r4 @ r3 = t[0]+t[4]
STR r3, [r0, #44] @ y[3<<3] = t[0]+t[4]
SSUB16 r4, r12,r4 @ r4 = t[0]-t[4]
STR r4, [r0, #60] @ y[4<<3] = t[0]-t[4]
SSUB16 r5, r12,r5 @ r5 = t[0]-t[5]
STR r5, [r0, #76] @ y[5<<3] = t[0]-t[5]
SSUB16 r6, r12,r6 @ r6 = t[0]-t[6]
STR r6, [r0, #92] @ y[6<<3] = t[0]-t[6]
SSUB16 r7, r12,r7 @ r7 = t[0]-t[7]
STR r7, [r0, #108] @ y[7<<3] = t[0]-t[7]
MOV PC,r14
@ .size idct2_1core_v6, .-idct2_1core_v6 @ ENDP
  908. .endif
.balign 8
@ iDCT cosine constants: OC_CkSj == round(65536*cos(k*pi/16)) (Q16 fixed
@ point). Complementary pairs are kept adjacent so a single LDRD can fetch
@ both halves of a rotation; .balign 8 keeps each pair 8-byte aligned as
@ LDRD requires.
OC_C7S1:
.word 12785 @ 31F1
OC_C1S7:
.word 64277 @ FB15
OC_C6S2:
.word 25080 @ 61F8
OC_C2S6:
.word 60547 @ EC83
OC_C5S3:
.word 36410 @ 8E3A
OC_C3S5:
.word 54491 @ D4DB
OC_C4S4:
.word 46341 @ B505
  924. .if OC_ARM_ASM_MEDIA
@ .type idct2_2core_down_v6, %function; idct2_2core_down_v6: @ PROC
@ Final (column) pass counterpart of idct2_1core_v6: processes two packed
@ columns per call, folds in the rounding bias of 8, and shifts each
@ 16-bit lane right by 4 before storing.
@ r0 = ogg_int16_t *_y (destination); advanced by 4 bytes.
@ r1 = const ogg_int16_t *_x (source); advanced past two rows (32 bytes).
@ Clobbers r2-r7, r12.
idct2_2core_down_v6:
@ r0 = ogg_int16_t *_y (destination)
@ r1 = const ogg_int16_t *_x (source)
@ Stage 1:
LDR r2, [r1], #16 @ r2 = <x[0,1]|x[0,0]>
LDR r3, OC_C4S4
MOV r7 ,#8 @ r7 = 8
LDR r6, [r1], #16 @ r6 = <x[1,1]|x[1,0]>
SMLAWB r12,r3, r2, r7 @ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
LDRD r4, r5, OC_C7S1 @ r4 = OC_C7S1; r5 = OC_C1S7
SMLAWB r7, r3, r6, r7 @ r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
SMULWT r5, r5, r2 @ r5 = t[0,7]=OC_C1S7*x[0,1]>>16
PKHBT r12,r12,r7, LSL #16 @ r12= <t[1,0]+8|t[0,0]+8>
SMULWT r4, r4, r2 @ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
@ Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
PKHBT r7, r5, r5, LSL #16 @ r7 = <t[0,7]|t[0,7]>
@ Stage 2:
SMULWB r6, r3, r7 @ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
PKHBT r4, r4, r4, LSL #16 @ r4 = <t[0,4]|t[0,4]>
SMULWT r2, r3, r7 @ r2 = t[1,6]=OC_C4S4*t[1,7]>>16
SMULWB r5, r3, r4 @ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
PKHBT r6, r6, r2, LSL #16 @ r6 = <t[1,6]|t[0,6]>
SMULWT r2, r3, r4 @ r2 = t[1,5]=OC_C4S4*t[1,4]>>16
PKHBT r2, r5, r2, LSL #16 @ r2 = <t[1,5]|t[0,5]>
@ Stage 3:
SSUB16 r5, r6, r2 @ r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
SADD16 r6, r6, r2 @ r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
@ Stage 4:
@ Each packed sum is shifted down by 4 per lane: the top lane via ASR #4,
@ the bottom lane via LSL #16 then ASR #20, re-packed with PKHTB.
SADD16 r2, r12,r7 @ r2 = t[0]+t[7]+8
MOV r3, r2, ASR #4
MOV r2, r2, LSL #16
PKHTB r3, r3, r2, ASR #20 @ r3 = t[0]+t[7]+8>>4
STR r3, [r0], #4 @ y[0<<3] = t[0]+t[7]+8>>4
SADD16 r2, r12,r6 @ r2 = t[0]+t[6]+8
MOV r3, r2, ASR #4
MOV r2, r2, LSL #16
PKHTB r3, r3, r2, ASR #20 @ r3 = t[0]+t[6]+8>>4
STR r3, [r0, #12] @ y[1<<3] = t[0]+t[6]+8>>4
SADD16 r2, r12,r5 @ r2 = t[0]+t[5]+8
MOV r3, r2, ASR #4
MOV r2, r2, LSL #16
PKHTB r3, r3, r2, ASR #20 @ r3 = t[0]+t[5]+8>>4
STR r3, [r0, #28] @ y[2<<3] = t[0]+t[5]+8>>4
SADD16 r2, r12,r4 @ r2 = t[0]+t[4]+8
MOV r3, r2, ASR #4
MOV r2, r2, LSL #16
PKHTB r3, r3, r2, ASR #20 @ r3 = t[0]+t[4]+8>>4
STR r3, [r0, #44] @ y[3<<3] = t[0]+t[4]+8>>4
SSUB16 r4, r12,r4 @ r4 = t[0]-t[4]+8
MOV r3, r4, ASR #4
MOV r4, r4, LSL #16
PKHTB r3, r3, r4, ASR #20 @ r3 = t[0]-t[4]+8>>4
STR r3, [r0, #60] @ y[4<<3] = t[0]-t[4]+8>>4
SSUB16 r5, r12,r5 @ r5 = t[0]-t[5]+8
MOV r3, r5, ASR #4
MOV r5, r5, LSL #16
PKHTB r3, r3, r5, ASR #20 @ r3 = t[0]-t[5]+8>>4
STR r3, [r0, #76] @ y[5<<3] = t[0]-t[5]+8>>4
SSUB16 r6, r12,r6 @ r6 = t[0]-t[6]+8
MOV r3, r6, ASR #4
MOV r6, r6, LSL #16
PKHTB r3, r3, r6, ASR #20 @ r3 = t[0]-t[6]+8>>4
STR r3, [r0, #92] @ y[6<<3] = t[0]-t[6]+8>>4
SSUB16 r7, r12,r7 @ r7 = t[0]-t[7]+8
MOV r3, r7, ASR #4
MOV r7, r7, LSL #16
PKHTB r3, r3, r7, ASR #20 @ r3 = t[0]-t[7]+8>>4
STR r3, [r0, #108] @ y[7<<3] = t[0]-t[7]+8>>4
MOV PC,r14
@ .size idct2_2core_down_v6, .-idct2_2core_down_v6 @ ENDP
  996. @ In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
  997. @ pay for increased branch mis-prediction to get here, but in practice it
  998. @ doesn't seem to slow anything down to take it out, and it's less code this
  999. @ way.
  1000. .if 0
  1001. @ .type oc_idct8x8_6_v6, %function; oc_idct8x8_6_v6: @ PROC
  1002. _oc_idct8x8_6_v6:
  1003. STMFD r13!,{r4-r8,r10,r11,r14}
  1004. SUB r13,r13,#64*2+4
  1005. @ Row transforms
  1006. MOV r8, r0
  1007. AND r0, r13,#4 @ Align the stack.
  1008. ADD r0, r0, r13 @ Write to temp storage.
  1009. BL idct3_2core_v6
  1010. BL idct1core_v6
  1011. @ Clear input data for next block.
  1012. MOV r4, #0
  1013. MOV r5, #0
  1014. STRD r4, r5, [r1,#-3*16]!
  1015. STR r4, [r1,#16]
  1016. STR r4, [r1,#32]
  1017. AND r1, r13,#4 @ Align the stack.
  1018. MOV r0, r8 @ Write to the final destination.
  1019. ADD r1, r1, r13 @ And read from temp storage.
  1020. @ Column transforms
  1021. BL idct3_3core_down_v6
  1022. BL idct3_3core_down_v6
  1023. BL idct3_3core_down_v6
  1024. BL idct3_3core_down_v6
  1025. ADD r13,r13,#64*2+4
  1026. LDMFD r13!,{r4-r8,r10,r11,PC}
  1027. @ .size oc_idct8x8_6_v6, .-oc_idct8x8_6_v6 @ ENDP
  1028. @ .type idct1core_v6, %function; idct1core_v6: @ PROC
  1029. _idct1core_v6:
  1030. @ r0 = ogg_int16_t *_y (destination)
  1031. @ r1 = const ogg_int16_t *_x (source)
  1032. LDRSH r3, [r1], #16
  1033. MOV r12,#0x05
  1034. ORR r12,r12,#0xB500
  1035. MUL r3, r12, r3
  1036. @ Stall ?
  1037. MOV r3, r3, ASR #16
  1038. @ Don't need to actually store the odd lines; they won't be read.
  1039. STRH r3, [r0], #2
  1040. STRH r3, [r0, #30]
  1041. STRH r3, [r0, #62]
  1042. STRH r3, [r0, #94]
  1043. MOV PC,R14
  1044. @ .size idct1core_v6, .-idct1core_v6 @ ENDP
  1045. @ .type idct3_2core_v6, %function; idct3_2core_v6: @ PROC
  1046. _idct3_2core_v6:
  1047. @ r0 = ogg_int16_t *_y (destination)
  1048. @ r1 = const ogg_int16_t *_x (source)
  1049. @ Stage 1:
  1050. LDRD r4, r5, [r1], #16 @ r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
  1051. LDRD r10,r11,OC_C6S2_3_v6 @ r10= OC_C6S2; r11= OC_C2S6
  1052. @ Stall
  1053. SMULWB r3, r11,r5 @ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1054. LDR r11,OC_C4S4
  1055. SMULWB r2, r10,r5 @ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1056. LDR r5, [r1], #16 @ r5 = <x[1,1]|x[1,0]>
  1057. SMULWB r12,r11,r4 @ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
  1058. LDRD r6, r7, OC_C7S1_3_v6 @ r6 = OC_C7S1; r7 = OC_C1S7
  1059. SMULWB r10,r11,r5 @ r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
  1060. PKHBT r12,r12,r10,LSL #16 @ r12= <t[1,0]|t[0,0]>
  1061. SMULWT r10,r7, r5 @ r10= t[1,7]=OC_C1S7*x[1,1]>>16
  1062. PKHBT r2, r2, r11 @ r2 = <0|t[0,2]>
  1063. SMULWT r7, r7, r4 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1064. PKHBT r3, r3, r11 @ r3 = <0|t[0,3]>
  1065. SMULWT r5, r6, r5 @ r10= t[1,4]=OC_C7S1*x[1,1]>>16
  1066. PKHBT r7, r7, r10,LSL #16 @ r7 = <t[1,7]|t[0,7]>
  1067. SMULWT r4, r6, r4 @ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
  1068. @ Stage 2:
  1069. SMULWB r6, r11,r7 @ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  1070. PKHBT r4, r4, r5, LSL #16 @ r4 = <t[1,4]|t[0,4]>
  1071. SMULWT r10,r11,r7 @ r10= t[1,6]=OC_C4S4*t[1,7]>>16
  1072. SMULWB r5, r11,r4 @ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  1073. PKHBT r6, r6, r10,LSL #16 @ r6 = <t[1,6]|t[0,6]>
  1074. SMULWT r10,r11,r4 @ r10= t[1,5]=OC_C4S4*t[1,4]>>16
  1075. @ Stage 3:
  1076. B idct4_3core_stage3_v6
  1077. @ .size idct3_2core_v6, .-idct3_2core_v6 @ ENDP
@ Another copy so the LDRD offsets are less than +/- 255.
.balign 8
@ Q16 cosines (round(65536*cos(k*pi/16))), duplicated near their users
@ because LDRD literal addressing only reaches +/-255 bytes.
OC_C7S1_3_v6:
.word 12785 @ 31F1
OC_C1S7_3_v6:
.word 64277 @ FB15
OC_C6S2_3_v6:
.word 25080 @ 61F8
OC_C2S6_3_v6:
.word 60547 @ EC83
  1088. @ .type idct3_3core_down_v6, %function; idct3_3core_down_v6: @ PROC
  1089. _idct3_3core_down_v6:
  1090. @ r0 = ogg_int16_t *_y (destination)
  1091. @ r1 = const ogg_int16_t *_x (source)
  1092. @ Stage 1:
  1093. LDRD r10,r11,[r1], #16 @ r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
  1094. LDRD r6, r7, OC_C6S2_3_v6 @ r6 = OC_C6S2; r7 = OC_C2S6
  1095. LDR r4, [r1], #16 @ r4 = <x[1,1]|x[1,0]>
  1096. SMULWB r3, r7, r11 @ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1097. MOV r7,#8
  1098. SMULWB r2, r6, r11 @ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1099. LDR r11,OC_C4S4
  1100. SMLAWB r12,r11,r10,r7 @ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
  1101. @ Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
  1102. PKHBT r3, r3, r3, LSL #16 @ r3 = <t[0,3]|t[0,3]>
  1103. SMLAWB r5, r11,r4, r7 @ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
  1104. PKHBT r2, r2, r2, LSL #16 @ r2 = <t[0,2]|t[0,2]>
  1105. LDRD r6, r7, OC_C7S1_3_v6 @ r6 = OC_C7S1; r7 = OC_C1S7
  1106. PKHBT r12,r12,r5, LSL #16 @ r12= <t[1,0]+8|t[0,0]+8>
  1107. SMULWT r5, r7, r4 @ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1108. SMULWT r7, r7, r10 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1109. SMULWT r10,r6, r10 @ r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1110. PKHBT r7, r7, r5, LSL #16 @ r7 = <t[1,7]|t[0,7]>
  1111. SMULWT r4, r6, r4 @ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1112. @ Stage 2:
  1113. SMULWB r6, r11,r7 @ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  1114. PKHBT r4, r10,r4, LSL #16 @ r4 = <t[1,4]|t[0,4]>
  1115. SMULWT r10,r11,r7 @ r10= t[1,6]=OC_C4S4*t[1,7]>>16
  1116. SMULWB r5, r11,r4 @ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  1117. PKHBT r6, r6, r10,LSL #16 @ r6 = <t[1,6]|t[0,6]>
  1118. SMULWT r10,r11,r4 @ r10= t[1,5]=OC_C4S4*t[1,4]>>16
  1119. @ Stage 3:
  1120. B idct4_4core_down_stage3_v6
  1121. @ .size idct3_3core_down_v6, .-idct3_3core_down_v6 @ ENDP
  1122. .endif
@ .type idct4_3core_v6, %function; idct4_3core_v6: @ PROC
@ Row transform for two rows, using the first four coefficients of row 0
@ and the first three of row 1; writes two packed (transposed) columns with
@ no rounding. The labels idct4_3core_stage3_v6 and idct4_3core_stage3_5_v6
@ are shared tail entry points, branched to from idct3_2core_v6 and
@ idct8_8core_v6 respectively.
@ r0 = ogg_int16_t *_y (destination); advanced by 4 bytes.
@ r1 = const ogg_int16_t *_x (source); advanced past two rows (32 bytes).
idct4_3core_v6:
@ r0 = ogg_int16_t *_y (destination)
@ r1 = const ogg_int16_t *_x (source)
@ Stage 1:
LDRD r10,r11,[r1], #16 @ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
LDRD r2, r3, OC_C5S3_4_v6 @ r2 = OC_C5S3; r3 = OC_C3S5
LDRD r4, r5, [r1], #16 @ r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
SMULWT r9, r3, r11 @ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
SMULWT r8, r2, r11 @ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
PKHBT r9, r9, r2 @ r9 = <0|t[0,6]>
LDRD r6, r7, OC_C6S2_4_v6 @ r6 = OC_C6S2; r7 = OC_C2S6
PKHBT r8, r8, r2 @ r8 = <0|-t[0,5]>
SMULWB r3, r7, r11 @ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
SMULWB r2, r6, r11 @ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
LDR r11,OC_C4S4
SMULWB r12,r7, r5 @ r12= t[1,3]=OC_C2S6*x[1,2]>>16
SMULWB r5, r6, r5 @ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
PKHBT r3, r3, r12,LSL #16 @ r3 = <t[1,3]|t[0,3]>
SMULWB r12,r11,r10 @ r12= t[0,0]=OC_C4S4*x[0,0]>>16
PKHBT r2, r2, r5, LSL #16 @ r2 = <t[1,2]|t[0,2]>
SMULWB r5, r11,r4 @ r5 = t[1,0]=OC_C4S4*x[1,0]>>16
LDRD r6, r7, OC_C7S1_4_v6 @ r6 = OC_C7S1; r7 = OC_C1S7
PKHBT r12,r12,r5, LSL #16 @ r12= <t[1,0]|t[0,0]>
SMULWT r5, r7, r4 @ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
SMULWT r7, r7, r10 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
SMULWT r10,r6, r10 @ r10= t[0,4]=OC_C7S1*x[0,1]>>16
PKHBT r7, r7, r5, LSL #16 @ r7 = <t[1,7]|t[0,7]>
SMULWT r4, r6, r4 @ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
@ Stage 2:
SSUB16 r6, r7, r9 @ r6 = t[7]-t[6]
PKHBT r4, r10,r4, LSL #16 @ r4 = <t[1,4]|t[0,4]>
SADD16 r7, r7, r9 @ r7 = t[7]=t[7]+t[6]
SMULWT r9, r11,r6 @ r9 = t[1,6]=OC_C4S4*r6T>>16
SADD16 r5, r4, r8 @ r5 = t[4]-t[5] (r8 holds -t[5])
SMULWB r6, r11,r6 @ r6 = t[0,6]=OC_C4S4*r6B>>16
SSUB16 r4, r4, r8 @ r4 = t[4]=t[4]+t[5]
SMULWT r10,r11,r5 @ r10= t[1,5]=OC_C4S4*r5T>>16
PKHBT r6, r6, r9, LSL #16 @ r6 = <t[1,6]|t[0,6]>
SMULWB r5, r11,r5 @ r5 = t[0,5]=OC_C4S4*r5B>>16
@ Stage 3:
idct4_3core_stage3_v6:
SADD16 r11,r12,r2 @ r11= t[1]=t[0]+t[2]
PKHBT r10,r5, r10,LSL #16 @ r10= <t[1,5]|t[0,5]>
SSUB16 r2, r12,r2 @ r2 = t[2]=t[0]-t[2]
idct4_3core_stage3_5_v6:
SSUB16 r5, r6, r10 @ r5 = t[5]=t[6]-t[5]
SADD16 r6, r6, r10 @ r6 = t[6]=t[6]+t[5]
SADD16 r10,r12,r3 @ r10= t[0]=t[0]+t[3]
SSUB16 r3, r12,r3 @ r3 = t[3]=t[0]-t[3]
@ Stage 4:
SADD16 r12,r10,r7 @ r12= t[0]+t[7]
STR r12,[r0], #4 @ y[0<<3] = t[0]+t[7]
SADD16 r12,r11,r6 @ r12= t[1]+t[6]
STR r12,[r0, #12] @ y[1<<3] = t[1]+t[6]
SADD16 r12,r2, r5 @ r12= t[2]+t[5]
STR r12,[r0, #28] @ y[2<<3] = t[2]+t[5]
SADD16 r12,r3, r4 @ r12= t[3]+t[4]
STR r12,[r0, #44] @ y[3<<3] = t[3]+t[4]
SSUB16 r4, r3, r4 @ r4 = t[3]-t[4]
STR r4, [r0, #60] @ y[4<<3] = t[3]-t[4]
SSUB16 r5, r2, r5 @ r5 = t[2]-t[5]
STR r5, [r0, #76] @ y[5<<3] = t[2]-t[5]
SSUB16 r6, r11,r6 @ r6 = t[1]-t[6]
STR r6, [r0, #92] @ y[6<<3] = t[1]-t[6]
SSUB16 r7, r10,r7 @ r7 = t[0]-t[7]
STR r7, [r0, #108] @ y[7<<3] = t[0]-t[7]
MOV PC,r14
@ .size idct4_3core_v6, .-idct4_3core_v6 @ ENDP
@ Another copy so the LDRD offsets are less than +/- 255.
.balign 8
@ Q16 cosines (round(65536*cos(k*pi/16))), duplicated near their users
@ because LDRD literal addressing only reaches +/-255 bytes.
OC_C7S1_4_v6:
.word 12785 @ 31F1
OC_C1S7_4_v6:
.word 64277 @ FB15
OC_C6S2_4_v6:
.word 25080 @ 61F8
OC_C2S6_4_v6:
.word 60547 @ EC83
OC_C5S3_4_v6:
.word 36410 @ 8E3A
OC_C3S5_4_v6:
.word 54491 @ D4DB
@ .type idct4_4core_down_v6, %function; idct4_4core_down_v6: @ PROC
@ Column-pass counterpart of idct4_3core_v6: reads two packed rows, folds
@ in the +8 rounding bias, and jumps into idct8_8core_down_v6's shared
@ stage-3/4 tail (idct8_8core_down_stage3_5_v6) for the final >>4 and
@ stores. The label idct4_4core_down_stage3_v6 is itself a shared tail
@ entry point, branched to from idct3_3core_down_v6.
@ r0 = ogg_int16_t *_y (destination); r1 = const ogg_int16_t *_x (source,
@ advanced past two rows).
idct4_4core_down_v6:
@ r0 = ogg_int16_t *_y (destination)
@ r1 = const ogg_int16_t *_x (source)
@ Stage 1:
LDRD r10,r11,[r1], #16 @ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
LDRD r2, r3, OC_C5S3_4_v6 @ r2 = OC_C5S3; r3 = OC_C3S5
LDRD r4, r5, [r1], #16 @ r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
SMULWT r9, r3, r11 @ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
LDRD r6, r7, OC_C6S2_4_v6 @ r6 = OC_C6S2; r7 = OC_C2S6
SMULWT r8, r2, r11 @ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
@ Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
PKHBT r9, r9, r9, LSL #16 @ r9 = <t[0,6]|t[0,6]>
SMULWB r3, r7, r11 @ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
PKHBT r8, r8, r8, LSL #16 @ r8 = <-t[0,5]|-t[0,5]>
SMULWB r2, r6, r11 @ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
LDR r11,OC_C4S4
SMULWB r12,r7, r5 @ r12= t[1,3]=OC_C2S6*x[1,2]>>16
MOV r7,#8
SMULWB r5, r6, r5 @ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
PKHBT r3, r3, r12,LSL #16 @ r3 = <t[1,3]|t[0,3]>
SMLAWB r12,r11,r10,r7 @ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
PKHBT r2, r2, r5, LSL #16 @ r2 = <t[1,2]|t[0,2]>
SMLAWB r5, r11,r4 ,r7 @ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
LDRD r6, r7, OC_C7S1_4_v6 @ r6 = OC_C7S1; r7 = OC_C1S7
PKHBT r12,r12,r5, LSL #16 @ r12= <t[1,0]+8|t[0,0]+8>
SMULWT r5, r7, r4 @ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
SMULWT r7, r7, r10 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
SMULWT r10,r6, r10 @ r10= t[0,4]=OC_C7S1*x[0,1]>>16
PKHBT r7, r7, r5, LSL #16 @ r7 = <t[1,7]|t[0,7]>
SMULWT r4, r6, r4 @ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
@ Stage 2:
SSUB16 r6, r7, r9 @ r6 = t[7]-t[6]
PKHBT r4, r10,r4, LSL #16 @ r4 = <t[1,4]|t[0,4]>
SADD16 r7, r7, r9 @ r7 = t[7]=t[7]+t[6]
SMULWT r9, r11,r6 @ r9 = t[1,6]=OC_C4S4*r6T>>16
SADD16 r5, r4, r8 @ r5 = t[4]-t[5] (r8 holds -t[5])
SMULWB r6, r11,r6 @ r6 = t[0,6]=OC_C4S4*r6B>>16
SSUB16 r4, r4, r8 @ r4 = t[4]=t[4]+t[5]
SMULWT r10,r11,r5 @ r10= t[1,5]=OC_C4S4*r5T>>16
PKHBT r6, r6, r9, LSL #16 @ r6 = <t[1,6]|t[0,6]>
SMULWB r5, r11,r5 @ r5 = t[0,5]=OC_C4S4*r5B>>16
@ Stage 3:
idct4_4core_down_stage3_v6:
SADD16 r11,r12,r2 @ r11= t[1]+8=t[0]+t[2]+8
PKHBT r10,r5, r10,LSL #16 @ r10= <t[1,5]|t[0,5]>
SSUB16 r2, r12,r2 @ r2 = t[2]+8=t[0]-t[2]+8
B idct8_8core_down_stage3_5_v6
@ .size idct4_4core_down_v6, .-idct4_4core_down_v6 @ ENDP
@ .type idct8_8core_v6, %function; idct8_8core_v6: @ PROC
@ Full 8-point row transform on two rows at once (row 0 in the bottom
@ halfword, row 1 in the top halfword of each packed value), no rounding;
@ shares its stage-3/4 tail with idct4_3core_v6 (B idct4_3core_stage3_5_v6),
@ which performs the remaining butterflies and the transposed stores.
@ r0 = ogg_int16_t *_y; r1 = const ogg_int16_t *_x (advanced two rows).
@ r0/r14 are preserved on the stack because both are used as scratch.
idct8_8core_v6:
STMFD r13!,{r0,r14}
@ Stage 1:
@5-6 rotation by 3pi/16
LDRD r10,r11,OC_C5S3_4_v6 @ r10= OC_C5S3, r11= OC_C3S5
LDR r4, [r1,#8] @ r4 = <x[0,5]|x[0,4]>
LDR r7, [r1,#24] @ r7 = <x[1,5]|x[1,4]>
SMULWT r5, r11,r4 @ r5 = OC_C3S5*x[0,5]>>16
LDR r0, [r1,#4] @ r0 = <x[0,3]|x[0,2]>
SMULWT r3, r11,r7 @ r3 = OC_C3S5*x[1,5]>>16
LDR r12,[r1,#20] @ r12= <x[1,3]|x[1,2]>
SMULWT r6, r11,r0 @ r6 = OC_C3S5*x[0,3]>>16
SMULWT r11,r11,r12 @ r11= OC_C3S5*x[1,3]>>16
SMLAWT r6, r10,r4, r6 @ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
PKHBT r5, r5, r3, LSL #16 @ r5 = <r3|r5>
SMLAWT r11,r10,r7, r11 @ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
PKHBT r4, r4, r7, LSL #16 @ r4 = <x[1,4]|x[0,4]>
SMULWT r3, r10,r0 @ r3 = OC_C5S3*x[0,3]>>16
PKHBT r6, r6, r11,LSL #16 @ r6 = <t[1,6]|t[0,6]>
SMULWT r8, r10,r12 @ r8 = OC_C5S3*x[1,3]>>16
@2-3 rotation by 6pi/16
LDRD r10,r11,OC_C6S2_4_v6 @ r10= OC_C6S2, r11= OC_C2S6
PKHBT r3, r3, r8, LSL #16 @ r3 = <r8|r3>
LDR r8, [r1,#12] @ r8 = <x[0,7]|x[0,6]>
SMULWB r2, r10,r0 @ r2 = OC_C6S2*x[0,2]>>16
SSUB16 r5, r5, r3 @ r5 = <t[1,5]|t[0,5]>
SMULWB r9, r10,r12 @ r9 = OC_C6S2*x[1,2]>>16
LDR r7, [r1,#28] @ r7 = <x[1,7]|x[1,6]>
SMULWB r3, r10,r8 @ r3 = OC_C6S2*x[0,6]>>16
SMULWB r10,r10,r7 @ r10= OC_C6S2*x[1,6]>>16
PKHBT r2, r2, r9, LSL #16 @ r2 = <r9|r2>
SMLAWB r3, r11,r0, r3 @ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
SMLAWB r10,r11,r12,r10 @ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
SMULWB r9, r11,r8 @ r9 = OC_C2S6*x[0,6]>>16
PKHBT r3, r3, r10,LSL #16 @ r3 = <t[1,3]|t[0,3]>
SMULWB r12,r11,r7 @ r12= OC_C2S6*x[1,6]>>16
@4-7 rotation by 7pi/16
LDRD r10,r11,OC_C7S1_8_v6 @ r10= OC_C7S1, r11= OC_C1S7
PKHBT r9, r9, r12,LSL #16 @ r9 = <r12|r9>
LDR r0, [r1],#16 @ r0 = <x[0,1]|x[0,0]>
PKHTB r7, r7, r8, ASR #16 @ r7 = <x[1,7]|x[0,7]>
SSUB16 r2, r2, r9 @ r2 = <t[1,2]|t[0,2]>
SMULWB r9, r10,r7 @ r9 = OC_C7S1*x[0,7]>>16
LDR r14,[r1],#16 @ r14= <x[1,1]|x[1,0]>
SMULWT r12,r10,r7 @ r12= OC_C7S1*x[1,7]>>16
SMULWT r8, r10,r0 @ r8 = OC_C7S1*x[0,1]>>16
SMULWT r10,r10,r14 @ r10= OC_C7S1*x[1,1]>>16
SMLAWT r9, r11,r0, r9 @ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
PKHBT r8, r8, r10,LSL #16 @ r8 = <r10|r8>
SMLAWT r12,r11,r14,r12 @ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
PKHBT r0, r0, r14,LSL #16 @ r0 = <x[1,0]|x[0,0]>
SMULWB r10,r11,r7 @ r10= OC_C1S7*x[0,7]>>16
PKHBT r9, r9, r12,LSL #16 @ r9 = <t[1,7]|t[0,7]>
SMULWT r12,r11,r7 @ r12= OC_C1S7*x[1,7]>>16
@0-1 butterfly
LDR r11,OC_C4S4
PKHBT r10,r10,r12,LSL #16 @ r10= <r12|r10>
SADD16 r7, r0, r4 @ r7 = x[0]+x[4]
SSUB16 r10,r8, r10 @ r10= <t[1,4]|t[0,4]>
SSUB16 r4, r0, r4 @ r4 = x[0]-x[4]
SMULWB r8, r11,r7 @ r8 = t[0,0]=OC_C4S4*r7B>>16
SMULWT r12,r11,r7 @ r12= t[1,0]=OC_C4S4*r7T>>16
SMULWB r7, r11,r4 @ r7 = t[0,1]=OC_C4S4*r4B>>16
PKHBT r12,r8, r12,LSL #16 @ r12= <t[1,0]|t[0,0]>
SMULWT r8, r11,r4 @ r8 = t[1,1]=OC_C4S4*r4T>>16
@ Stage 2:
SADD16 r4, r10,r5 @ r4 = t[4]=t[4]+t[5]
PKHBT r8, r7, r8, LSL #16 @ r8 = <t[1,1]|t[0,1]>
SSUB16 r5, r10,r5 @ r5 = t[4]-t[5]
SMULWB r10,r11,r5 @ r10= t[0,5]=OC_C4S4*r5B>>16
SADD16 r7, r9, r6 @ r7 = t[7]=t[7]+t[6]
SMULWT r5, r11,r5 @ r5 = t[1,5]=OC_C4S4*r5T>>16
SSUB16 r6, r9, r6 @ r6 = t[7]-t[6]
SMULWB r9, r11,r6 @ r9 = t[0,6]=OC_C4S4*r6B>>16
PKHBT r10,r10,r5, LSL #16 @ r10= <t[1,5]|t[0,5]>
SMULWT r6, r11,r6 @ r6 = t[1,6]=OC_C4S4*r6T>>16
@ Stage 3:
SADD16 r11,r8, r2 @ r11= t[1]=t[1]+t[2]
PKHBT r6, r9, r6, LSL #16 @ r6 = <t[1,6]|t[0,6]>
SSUB16 r2, r8, r2 @ r2 = t[2]=t[1]-t[2]
LDMFD r13!,{r0,r14}
B idct4_3core_stage3_5_v6
@ .size idct8_8core_v6, .-idct8_8core_v6 @ ENDP
@ Another copy so the LDRD offsets are less than +/- 255.
.balign 8
@ Q16 cosines (round(65536*cos(k*pi/16))), duplicated near their users
@ because LDRD literal addressing only reaches +/-255 bytes.
OC_C7S1_8_v6:
.word 12785 @ 31F1
OC_C1S7_8_v6:
.word 64277 @ FB15
OC_C6S2_8_v6:
.word 25080 @ 61F8
OC_C2S6_8_v6:
.word 60547 @ EC83
OC_C5S3_8_v6:
.word 36410 @ 8E3A
OC_C3S5_8_v6:
.word 54491 @ D4DB
@ .type idct8_8core_down_v6, %function; idct8_8core_down_v6: @ PROC
@ Final-pass 8-point IDCT on two columns packed in the low/high halfwords of
@ each register. Identical structure to idct8_8core_v6, except t[0,*] and
@ t[1,*] are biased by 8 before Stage 2 so Stage 4 can emit y=(sum+8)>>4,
@ and the rounded outputs are stored with a stride of 8 ogg_int16_t's.
@ In:  r1 = input coefficient pointer (advanced by 32 bytes on return)
@      r0 = output pointer (saved/restored around the core; advanced by 4)
@ Clobbers r2-r12 and flags.
idct8_8core_down_v6:
STMFD r13!,{r0,r14}
@ Stage 1:
@5-6 rotation by 3pi/16
LDRD r10,r11,OC_C5S3_8_v6 @ r10= OC_C5S3, r11= OC_C3S5
LDR r4, [r1,#8] @ r4 = <x[0,5]|x[0,4]>
LDR r7, [r1,#24] @ r7 = <x[1,5]|x[1,4]>
SMULWT r5, r11,r4 @ r5 = OC_C3S5*x[0,5]>>16
LDR r0, [r1,#4] @ r0 = <x[0,3]|x[0,2]>
SMULWT r3, r11,r7 @ r3 = OC_C3S5*x[1,5]>>16
LDR r12,[r1,#20] @ r12= <x[1,3]|x[1,2]>
SMULWT r6, r11,r0 @ r6 = OC_C3S5*x[0,3]>>16
SMULWT r11,r11,r12 @ r11= OC_C3S5*x[1,3]>>16
SMLAWT r6, r10,r4, r6 @ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
PKHBT r5, r5, r3, LSL #16 @ r5 = <r3|r5>
SMLAWT r11,r10,r7, r11 @ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
PKHBT r4, r4, r7, LSL #16 @ r4 = <x[1,4]|x[0,4]>
SMULWT r3, r10,r0 @ r3 = OC_C5S3*x[0,3]>>16
PKHBT r6, r6, r11,LSL #16 @ r6 = <t[1,6]|t[0,6]>
SMULWT r8, r10,r12 @ r8 = OC_C5S3*x[1,3]>>16
@2-3 rotation by 6pi/16
LDRD r10,r11,OC_C6S2_8_v6 @ r10= OC_C6S2, r11= OC_C2S6
PKHBT r3, r3, r8, LSL #16 @ r3 = <r8|r3>
LDR r8, [r1,#12] @ r8 = <x[0,7]|x[0,6]>
SMULWB r2, r10,r0 @ r2 = OC_C6S2*x[0,2]>>16
SSUB16 r5, r5, r3 @ r5 = <t[1,5]|t[0,5]>
SMULWB r9, r10,r12 @ r9 = OC_C6S2*x[1,2]>>16
LDR r7, [r1,#28] @ r7 = <x[1,7]|x[1,6]>
SMULWB r3, r10,r8 @ r3 = OC_C6S2*x[0,6]>>16
SMULWB r10,r10,r7 @ r10= OC_C6S2*x[1,6]>>16
PKHBT r2, r2, r9, LSL #16 @ r2 = <r9|r2>
SMLAWB r3, r11,r0, r3 @ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
SMLAWB r10,r11,r12,r10 @ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
SMULWB r9, r11,r8 @ r9 = OC_C2S6*x[0,6]>>16
PKHBT r3, r3, r10,LSL #16 @ r3 = <t[1,3]|t[0,3]>
SMULWB r12,r11,r7 @ r12= OC_C2S6*x[1,6]>>16
@4-7 rotation by 7pi/16
LDRD r10,r11,OC_C7S1_8_v6 @ r10= OC_C7S1, r11= OC_C1S7
PKHBT r9, r9, r12,LSL #16 @ r9 = <r12|r9>
LDR r0, [r1],#16 @ r0 = <x[0,1]|x[0,0]>
PKHTB r7, r7, r8, ASR #16 @ r7 = <x[1,7]|x[0,7]>
SSUB16 r2, r2, r9 @ r2 = <t[1,2]|t[0,2]>
SMULWB r9, r10,r7 @ r9 = OC_C7S1*x[0,7]>>16
LDR r14,[r1],#16 @ r14= <x[1,1]|x[1,0]>
SMULWT r12,r10,r7 @ r12= OC_C7S1*x[1,7]>>16
SMULWT r8, r10,r0 @ r8 = OC_C7S1*x[0,1]>>16
SMULWT r10,r10,r14 @ r10= OC_C7S1*x[1,1]>>16
SMLAWT r9, r11,r0, r9 @ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
PKHBT r8, r8, r10,LSL #16 @ r8 = <r10|r8>
SMLAWT r12,r11,r14,r12 @ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
PKHBT r0, r0, r14,LSL #16 @ r0 = <x[1,0]|x[0,0]>
SMULWB r10,r11,r7 @ r10= OC_C1S7*x[0,7]>>16
PKHBT r9, r9, r12,LSL #16 @ r9 = <t[1,7]|t[0,7]>
SMULWT r12,r11,r7 @ r12= OC_C1S7*x[1,7]>>16
@0-1 butterfly
LDR r11,OC_C4S4
MOV r14,#8 @ rounding bias: makes Stage 4's >>4 round to nearest
PKHBT r10,r10,r12,LSL #16 @ r10= <r12|r10>
SADD16 r7, r0, r4 @ r7 = x[0]+x[4]
SSUB16 r10,r8, r10 @ r10= <t[1,4]|t[0,4]>
SMLAWB r8, r11,r7, r14 @ r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
SSUB16 r4, r0, r4 @ r4 = x[0]-x[4]
SMLAWT r12,r11,r7, r14 @ r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
SMLAWB r7, r11,r4, r14 @ r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
PKHBT r12,r8, r12,LSL #16 @ r12= <t[1,0]+8|t[0,0]+8>
SMLAWT r8, r11,r4, r14 @ r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
@ Stage 2:
SADD16 r4, r10,r5 @ r4 = t[4]=t[4]+t[5]
PKHBT r8, r7, r8, LSL #16 @ r8 = <t[1,1]+8|t[0,1]+8>
SSUB16 r5, r10,r5 @ r5 = t[4]-t[5]
SMULWB r10,r11,r5 @ r10= t[0,5]=OC_C4S4*r5B>>16
SADD16 r7, r9, r6 @ r7 = t[7]=t[7]+t[6]
SMULWT r5, r11,r5 @ r5 = t[1,5]=OC_C4S4*r5T>>16
SSUB16 r6, r9, r6 @ r6 = t[7]-t[6]
SMULWB r9, r11,r6 @ r9 = t[0,6]=OC_C4S4*r6B>>16
PKHBT r10,r10,r5, LSL #16 @ r10= <t[1,5]|t[0,5]>
SMULWT r6, r11,r6 @ r6 = t[1,6]=OC_C4S4*r6T>>16
@ Stage 3:
SADD16 r11,r8, r2 @ r11= t[1]+8=t[1]+t[2]+8
PKHBT r6, r9, r6, LSL #16 @ r6 = <t[1,6]|t[0,6]>
SSUB16 r2, r8, r2 @ r2 = t[2]+8=t[1]-t[2]+8
LDMFD r13!,{r0,r14}
idct8_8core_down_stage3_5_v6:
SSUB16 r5, r6, r10 @ r5 = t[5]=t[6]-t[5]
SADD16 r6, r6, r10 @ r6 = t[6]=t[6]+t[5]
SADD16 r10,r12,r3 @ r10= t[0]+8=t[0]+t[3]+8
SSUB16 r3, r12,r3 @ r3 = t[3]+8=t[0]-t[3]+8
@ Stage 4:
@ Each output pair is shifted down by 4 with the MOV/MOV/PKHTB sequence:
@ ASR #4 produces the high halfword, LSL #16 then ASR #20 the low one
@ (ARMv6 has no per-halfword arithmetic shift).
SADD16 r12,r10,r7 @ r12= t[0]+t[7]+8
SSUB16 r7, r10,r7 @ r7 = t[0]-t[7]+8
MOV r10,r12,ASR #4
MOV r12,r12,LSL #16
PKHTB r10,r10,r12,ASR #20 @ r10= t[0]+t[7]+8>>4
STR r10,[r0], #4 @ y[0<<3] = t[0]+t[7]+8>>4
SADD16 r12,r11,r6 @ r12= t[1]+t[6]+8
SSUB16 r6, r11,r6 @ r6 = t[1]-t[6]+8
MOV r10,r12,ASR #4
MOV r12,r12,LSL #16
PKHTB r10,r10,r12,ASR #20 @ r10= t[1]+t[6]+8>>4
STR r10,[r0, #12] @ y[1<<3] = t[1]+t[6]+8>>4
SADD16 r12,r2, r5 @ r12= t[2]+t[5]+8
SSUB16 r5, r2, r5 @ r5 = t[2]-t[5]+8
MOV r10,r12,ASR #4
MOV r12,r12,LSL #16
PKHTB r10,r10,r12,ASR #20 @ r10= t[2]+t[5]+8>>4
STR r10,[r0, #28] @ y[2<<3] = t[2]+t[5]+8>>4
SADD16 r12,r3, r4 @ r12= t[3]+t[4]+8
SSUB16 r4, r3, r4 @ r4 = t[3]-t[4]+8
MOV r10,r12,ASR #4
MOV r12,r12,LSL #16
PKHTB r10,r10,r12,ASR #20 @ r10= t[3]+t[4]+8>>4
STR r10,[r0, #44] @ y[3<<3] = t[3]+t[4]+8>>4
MOV r10,r4, ASR #4
MOV r4, r4, LSL #16
PKHTB r10,r10,r4, ASR #20 @ r10= t[3]-t[4]+8>>4
STR r10,[r0, #60] @ y[4<<3] = t[3]-t[4]+8>>4
MOV r10,r5, ASR #4
MOV r5, r5, LSL #16
PKHTB r10,r10,r5, ASR #20 @ r10= t[2]-t[5]+8>>4
STR r10,[r0, #76] @ y[5<<3] = t[2]-t[5]+8>>4
MOV r10,r6, ASR #4
MOV r6, r6, LSL #16
PKHTB r10,r10,r6, ASR #20 @ r10= t[1]-t[6]+8>>4
STR r10,[r0, #92] @ y[6<<3] = t[1]-t[6]+8>>4
MOV r10,r7, ASR #4
MOV r7, r7, LSL #16
PKHTB r10,r10,r7, ASR #20 @ r10= t[0]-t[7]+8>>4
STR r10,[r0, #108] @ y[7<<3] = t[0]-t[7]+8>>4
MOV PC,r14
@ .size idct8_8core_down_v6, .-idct8_8core_down_v6 @ ENDP
  1484. .endif
  1485. .if OC_ARM_ASM_NEON
  1486. .global _oc_idct8x8_1_neon
  1487. .global _oc_idct8x8_neon
@ 16-bit IDCT constants for the NEON paths; loaded as D0/D1 and indexed with
@ VMULL.S16 ...,D0[n]/D1[n]. Entries with the high bit set are negative as
@ signed 16-bit values; the code compensates with an extra add of x.
.balign 16
OC_IDCT_CONSTS_NEON:
.short 8
.short 64277 @ FB15 (C1S7)
.short 60547 @ EC83 (C2S6)
.short 54491 @ D4DB (C3S5)
.short 46341 @ B505 (C4S4)
.short 36410 @ 8E3A (C5S3)
.short 25080 @ 61F8 (C6S2)
.short 12785 @ 31F1 (C7S1)
  1498. @ .type oc_idct8x8_1_neon, %function; oc_idct8x8_1_neon: @ PROC
  1499. _oc_idct8x8_1_neon:
  1500. @ r0 = ogg_int16_t *_y
  1501. @ r1 = ogg_uint16_t _dc
  1502. VDUP.S16 Q0, r1
  1503. VMOV Q1, Q0
  1504. VST1.64 {D0, D1, D2, D3}, [r0,:128]!
  1505. VST1.64 {D0, D1, D2, D3}, [r0,:128]!
  1506. VST1.64 {D0, D1, D2, D3}, [r0,:128]!
  1507. VST1.64 {D0, D1, D2, D3}, [r0,:128]
  1508. MOV PC, r14
  1509. @ .size oc_idct8x8_1_neon, .-oc_idct8x8_1_neon @ ENDP
@ .type oc_idct8x8_neon, %function; oc_idct8x8_neon: @ PROC
@ Full 8x8 inverse DCT: y = (idct(x)+8)>>4, and x is cleared to zero.
@ r0 = ogg_int16_t *_y (output)
@ r1 = ogg_int16_t *_x (input, pre-transposed; zeroed on return)
@ r2 = int _last_zzi (<=10 selects the sparse oc_idct8x8_10_neon path)
_oc_idct8x8_neon:
@ r0 = ogg_int16_t *_y
@ r1 = ogg_int16_t *_x
@ r2 = int _last_zzi
CMP r2, #10
BLE oc_idct8x8_10_neon
oc_idct8x8_slow_neon:
VPUSH {D8-D15}
MOV r2, r1
ADR r3, OC_IDCT_CONSTS_NEON
@ Row transforms (input is pre-transposed)
VLD1.64 {D16,D17,D18,D19}, [r2,:128]!
VLD1.64 {D20,D21,D22,D23}, [r2,:128]!
VLD1.64 {D24,D25,D26,D27}, [r2,:128]!
VSUB.S16 Q1, Q8, Q12 @ Q1 = x[0]-x[4]
VLD1.64 {D28,D29,D30,D31}, [r2,:128]
VADD.S16 Q8, Q8, Q12 @ Q8 = x[0]+x[4]
VLD1.64 {D0,D1}, [r3,:128]
MOV r12, r14 @ stash LR; the stage123 helper is called twice
BL oc_idct8x8_stage123_neon
@ Stage 4
VSUB.S16 Q15,Q8, Q7 @ Q15 = y[7]=t[0]-t[7]
VADD.S16 Q8, Q8, Q7 @ Q8 = y[0]=t[0]+t[7]
VSUB.S16 Q14,Q9, Q3 @ Q14 = y[6]=t[1]-t[6]
VADD.S16 Q9, Q9, Q3 @ Q9 = y[1]=t[1]+t[6]
VSUB.S16 Q13,Q10,Q5 @ Q13 = y[5]=t[2]-t[5]
VADD.S16 Q10,Q10,Q5 @ Q10 = y[2]=t[2]+t[5]
VTRN.16 Q14,Q15
VSUB.S16 Q12,Q11,Q4 @ Q12 = y[4]=t[3]-t[4]
VADD.S16 Q11,Q11,Q4 @ Q11 = y[3]=t[3]+t[4]
@ 8x8 Transpose
VTRN.16 Q8, Q9
VTRN.16 Q10,Q11
VTRN.16 Q12,Q13
VTRN.32 Q8, Q10
VTRN.32 Q9, Q11
VTRN.32 Q12,Q14
VTRN.32 Q13,Q15
VSWP D17,D24
VSUB.S16 Q1, Q8, Q12 @ Q1 = x[0]-x[4]
VSWP D19,D26
VADD.S16 Q8, Q8, Q12 @ Q8 = x[0]+x[4]
VSWP D21,D28
VSWP D23,D30
@ Column transforms
BL oc_idct8x8_stage123_neon
@ We have to put the return address back in the LR, or the branch
@ predictor will not recognize the function return and mis-predict the
@ entire call stack.
MOV r14, r12
@ Stage 4
VSUB.S16 Q15,Q8, Q7 @ Q15 = y[7]=t[0]-t[7]
VADD.S16 Q8, Q8, Q7 @ Q8 = y[0]=t[0]+t[7]
VSUB.S16 Q14,Q9, Q3 @ Q14 = y[6]=t[1]-t[6]
VADD.S16 Q9, Q9, Q3 @ Q9 = y[1]=t[1]+t[6]
VSUB.S16 Q13,Q10,Q5 @ Q13 = y[5]=t[2]-t[5]
VADD.S16 Q10,Q10,Q5 @ Q10 = y[2]=t[2]+t[5]
VSUB.S16 Q12,Q11,Q4 @ Q12 = y[4]=t[3]-t[4]
VADD.S16 Q11,Q11,Q4 @ Q11 = y[3]=t[3]+t[4]
VMOV.I8 Q2,#0 @ zeros, stored below to clear the input block
VPOP {D8-D15}
VMOV.I8 Q3,#0
VRSHR.S16 Q8, Q8, #4 @ Q8 = y[0]+8>>4
VST1.64 {D4, D5, D6, D7}, [r1,:128]! @ x[ 0..15] = 0
VRSHR.S16 Q9, Q9, #4 @ Q9 = y[1]+8>>4
VRSHR.S16 Q10,Q10,#4 @ Q10 = y[2]+8>>4
VST1.64 {D4, D5, D6, D7}, [r1,:128]! @ x[16..31] = 0
VRSHR.S16 Q11,Q11,#4 @ Q11 = y[3]+8>>4
VRSHR.S16 Q12,Q12,#4 @ Q12 = y[4]+8>>4
VST1.64 {D4, D5, D6, D7}, [r1,:128]! @ x[32..47] = 0
VRSHR.S16 Q13,Q13,#4 @ Q13 = y[5]+8>>4
VRSHR.S16 Q14,Q14,#4 @ Q14 = y[6]+8>>4
VST1.64 {D4, D5, D6, D7}, [r1,:128] @ x[48..63] = 0
VRSHR.S16 Q15,Q15,#4 @ Q15 = y[7]+8>>4
VSTMIA r0, {D16-D31}
MOV PC, r14
@ .size oc_idct8x8_neon, .-oc_idct8x8_neon @ ENDP
@ .type oc_idct8x8_stage123_neon, %function; oc_idct8x8_stage123_neon: @ PROC
@ Stages 1-3 of the 8-point IDCT, on eight 16-bit lanes in parallel.
@ In:  Q8 = x[0]+x[4], Q1 = x[0]-x[4], Q9..Q15 = x[1]..x[7],
@      D0/D1 = OC_IDCT_CONSTS_NEON.
@ Out: Q8..Q11 = t[0]..t[3], Q4 = t[4], Q5 = t[5], Q3 = t[6], Q7 = t[7]
@      (the caller's Stage 4 combines them in exactly this assignment).
@ Constants >= 0x8000 (C1S7, C2S6, C3S5, C4S4, C5S3) are negative as signed
@ 16-bit values, so VMULL/VSHRN yields (OC_CnSm*x>>16)-x; an extra add of x
@ afterwards corrects for this.
oc_idct8x8_stage123_neon:
@ Stages 1 & 2
VMULL.S16 Q4, D18,D1[3]
VMULL.S16 Q5, D19,D1[3]
VMULL.S16 Q7, D30,D1[3]
VMULL.S16 Q6, D31,D1[3]
VMULL.S16 Q2, D30,D0[1]
VMULL.S16 Q3, D31,D0[1]
VSHRN.S32 D8, Q4, #16
VSHRN.S32 D9, Q5, #16 @ Q4 = (OC_C7S1*x[1]>>16)
VSHRN.S32 D14,Q7, #16
VSHRN.S32 D15,Q6, #16 @ Q7 = (OC_C7S1*x[7]>>16)
VSHRN.S32 D4, Q2, #16
VSHRN.S32 D5, Q3, #16 @ Q2 = (OC_C1S7*x[7]>>16)-x[7]
VSUB.S16 Q4, Q4, Q15
VADD.S16 Q7, Q7, Q9
VSUB.S16 Q4, Q4, Q2 @ Q4 = t[4]
VMULL.S16 Q2, D18,D0[1]
VMULL.S16 Q9, D19,D0[1]
VMULL.S16 Q5, D26,D0[3]
VMULL.S16 Q3, D27,D0[3]
VMULL.S16 Q6, D22,D0[3]
VMULL.S16 Q12,D23,D0[3]
VSHRN.S32 D4, Q2, #16
VSHRN.S32 D5, Q9, #16 @ Q2 = (OC_C1S7*x[1]>>16)-x[1]
VSHRN.S32 D10,Q5, #16
VSHRN.S32 D11,Q3, #16 @ Q5 = (OC_C3S5*x[5]>>16)-x[5]
VSHRN.S32 D12,Q6, #16
VSHRN.S32 D13,Q12,#16 @ Q6 = (OC_C3S5*x[3]>>16)-x[3]
VADD.S16 Q7, Q7, Q2 @ Q7 = t[7]
VSUB.S16 Q5, Q5, Q11
VADD.S16 Q6, Q6, Q11
VADD.S16 Q5, Q5, Q13
VADD.S16 Q6, Q6, Q13
VMULL.S16 Q9, D22,D1[1]
VMULL.S16 Q11,D23,D1[1]
VMULL.S16 Q15,D26,D1[1]
VMULL.S16 Q13,D27,D1[1]
VMULL.S16 Q2, D20,D1[2]
VMULL.S16 Q12,D21,D1[2]
VSHRN.S32 D18,Q9, #16
VSHRN.S32 D19,Q11,#16 @ Q9 = (OC_C5S3*x[3]>>16)-x[3]
VSHRN.S32 D30,Q15,#16
VSHRN.S32 D31,Q13,#16 @ Q15= (OC_C5S3*x[5]>>16)-x[5]
VSHRN.S32 D4, Q2, #16
VSHRN.S32 D5, Q12,#16 @ Q2 = (OC_C6S2*x[2]>>16)
VSUB.S16 Q5, Q5, Q9 @ Q5 = t[5]
VADD.S16 Q6, Q6, Q15 @ Q6 = t[6]
VSUB.S16 Q2, Q2, Q14
VMULL.S16 Q3, D28,D1[2]
VMULL.S16 Q11,D29,D1[2]
VMULL.S16 Q12,D28,D0[2]
VMULL.S16 Q9, D29,D0[2]
VMULL.S16 Q13,D20,D0[2]
VMULL.S16 Q15,D21,D0[2]
VSHRN.S32 D6, Q3, #16
VSHRN.S32 D7, Q11,#16 @ Q3 = (OC_C6S2*x[6]>>16)
VSHRN.S32 D24,Q12,#16
VSHRN.S32 D25,Q9, #16 @ Q12= (OC_C2S6*x[6]>>16)-x[6]
VSHRN.S32 D26,Q13,#16
VSHRN.S32 D27,Q15,#16 @ Q13= (OC_C2S6*x[2]>>16)-x[2]
VSUB.S16 Q9, Q4, Q5 @ Q9 = t[4]-t[5]
VSUB.S16 Q11,Q7, Q6 @ Q11= t[7]-t[6]
VADD.S16 Q3, Q3, Q10
VADD.S16 Q4, Q4, Q5 @ Q4 = t[4]=t[4]+t[5]
VADD.S16 Q7, Q7, Q6 @ Q7 = t[7]=t[7]+t[6]
VSUB.S16 Q2, Q2, Q12 @ Q2 = t[2]
VADD.S16 Q3, Q3, Q13 @ Q3 = t[3]
VMULL.S16 Q12,D16,D1[0]
VMULL.S16 Q13,D17,D1[0]
VMULL.S16 Q14,D2, D1[0]
VMULL.S16 Q15,D3, D1[0]
VMULL.S16 Q5, D18,D1[0]
VMULL.S16 Q6, D22,D1[0]
VSHRN.S32 D24,Q12,#16
VSHRN.S32 D25,Q13,#16
VSHRN.S32 D28,Q14,#16
VSHRN.S32 D29,Q15,#16
VMULL.S16 Q13,D19,D1[0]
VMULL.S16 Q15,D23,D1[0]
VADD.S16 Q8, Q8, Q12 @ Q8 = t[0]
VADD.S16 Q1, Q1, Q14 @ Q1 = t[1]
VSHRN.S32 D10,Q5, #16
VSHRN.S32 D12,Q6, #16
VSHRN.S32 D11,Q13,#16
VSHRN.S32 D13,Q15,#16
VADD.S16 Q5, Q5, Q9 @ Q5 = t[5]=OC_C4S4*(t[4]-t[5])>>16
VADD.S16 Q6, Q6, Q11 @ Q6 = t[6]=OC_C4S4*(t[7]-t[6])>>16
@ Stage 3
VSUB.S16 Q11,Q8, Q3 @ Q11 = t[3]=t[0]-t[3]
VADD.S16 Q8, Q8, Q3 @ Q8 = t[0]=t[0]+t[3]
VADD.S16 Q9, Q1, Q2 @ Q9 = t[1]=t[1]+t[2]
VADD.S16 Q3, Q6, Q5 @ Q3 = t[6]=t[6]+t[5]
VSUB.S16 Q10,Q1, Q2 @ Q10 = t[2]=t[1]-t[2]
VSUB.S16 Q5, Q6, Q5 @ Q5 = t[5]=t[6]-t[5]
MOV PC, r14
@ .size oc_idct8x8_stage123_neon, .-oc_idct8x8_stage123_neon @ ENDP
@ .type oc_idct8x8_10_neon, %function; oc_idct8x8_10_neon: @ PROC
@ Reduced IDCT for sparse blocks (taken when _last_zzi <= 10); terms from
@ coefficients that cannot be non-zero in that case are simply omitted.
@ r0 = ogg_int16_t *_y (output, y = idct(x)+8>>4)
@ r1 = ogg_int16_t *_x (input, pre-transposed; used portion zeroed on return)
oc_idct8x8_10_neon:
ADR r3, OC_IDCT_CONSTS_NEON
VLD1.64 {D0,D1}, [r3,:128]
MOV r2, r1
@ Row transforms (input is pre-transposed)
@ Stage 1
VLD1.64 {D16,D17,D18,D19},[r2,:128]!
MOV r12, #16 @ byte stride between input rows (also reused for the stores)
VMULL.S16 Q15,D16,D1[0] @ Q15= OC_C4S4*x[0]-(x[0]<<16)
VLD1.64 {D17}, [r2,:64], r12
VMULL.S16 Q2, D18,D0[1] @ Q2 = OC_C1S7*x[1]-(x[1]<<16)
VLD1.64 {D19}, [r2,:64]
VMULL.S16 Q14,D17,D0[2] @ Q14= OC_C2S6*x[2]-(x[2]<<16)
VMULL.S16 Q3, D19,D0[3] @ Q3 = OC_C3S5*x[3]-(x[3]<<16)
VMULL.S16 Q13,D19,D1[1] @ Q13= OC_C5S3*x[3]-(x[3]<<16)
VMULL.S16 Q12,D18,D1[3] @ Q12= OC_C7S1*x[1]
VMULL.S16 Q1, D17,D1[2] @ Q1 = OC_C6S2*x[2]
VSHRN.S32 D30,Q15,#16 @ D30= t[0]-x[0]
VSHRN.S32 D4, Q2, #16 @ D4 = t[7]-x[1]
VSHRN.S32 D31,Q14,#16 @ D31= t[3]-x[2]
VSHRN.S32 D6, Q3, #16 @ D6 = t[6]-x[3]
VSHRN.S32 D7, Q13,#16 @ D7 = -t[5]-x[3]
VSHRN.S32 D5, Q12,#16 @ D5 = t[4]
VSHRN.S32 D2, Q1, #16 @ D2 = t[2]
VADD.S16 D4, D4, D18 @ D4 = t[7]
VADD.S16 D6, D6, D19 @ D6 = t[6]
VADD.S16 D7, D7, D19 @ D7 = -t[5]
VADD.S16 Q15,Q15,Q8 @ D30= t[0]
@ D31= t[3]
@ Stages 2 & 3
VSUB.S16 Q12,Q2, Q3 @ D24= t[7]-t[6]
@ D25= t[4]'=t[4]+t[5]
VADD.S16 Q13,Q2, Q3 @ D26= t[7]=t[7]+t[6]
@ D27= t[4]-t[5]
VMULL.S16 Q11,D24,D1[0] @ Q11= OC_C4S4*(t[7]-t[6])
@ -(t[7]-t[6]<<16)
VMULL.S16 Q14,D27,D1[0] @ Q14= OC_C4S4*(t[4]-t[5])
@ -(t[4]-t[5]<<16)
VADD.S16 D16,D30,D31 @ D16= t[0]=t[0]+t[3]
VSUB.S16 D17,D30,D2 @ D17= t[2]=t[0]-t[2]
VADD.S16 D18,D30,D2 @ D18= t[1]=t[0]+t[2]
VSHRN.S32 D22,Q11,#16 @ D22= (OC_C4S4*(t[7]-t[6])>>16)
@ -(t[7]-t[6])
VSHRN.S32 D23,Q14,#16 @ D23= (OC_C4S4*(t[4]-t[5])>>16)
@ -(t[4]-t[5])
VSUB.S16 D19,D30,D31 @ D19= t[3]=t[0]-t[3]
VADD.S16 D22,D22,D24 @ D22= t[6]=OC_C4S4*(t[7]-t[6])>>16
VADD.S16 D23,D23,D27 @ D23= t[5]=OC_C4S4*(t[4]-t[5])>>16
VSUB.S16 D27,D22,D23 @ D27= t[5]=t[6]-t[5]
VADD.S16 D24,D22,D23 @ D24= t[6]=t[6]+t[5]
@ Stage 4
VSUB.S16 Q11,Q8, Q13 @ D22= y[7]=t[0]-t[7]
@ D23= y[5]=t[2]'-t[5]''
VSUB.S16 Q10,Q9, Q12 @ D20= y[6]=t[1]-t[6]
@ D21= y[4]=t[3]'-t[4]''
VADD.S16 Q8, Q8, Q13 @ D16= y[0]=t[0]+t[7]
@ D17= y[2]=t[2]'+t[5]''
VADD.S16 Q9, Q9, Q12 @ D18= y[1]=t[1]+t[6]
@ D19= y[3]=t[3]'+t[4]''
@ 8x4 transpose
VTRN.16 Q10,Q11 @ Q10= c5c4a5a4 c7c6a7a6
@ Q11= d5d4b5b4 d7d6b7b6
VTRN.16 Q8, Q9 @ Q8 = c3c2a3a2 c1c0a1a0
@ Q9 = d3d2b3b2 d1d0b1b0
VSWP D20,D21 @ Q10= c7c6a7a6 c5c4a5a4
VSWP D22,D23 @ Q11= d7d6b7b6 d5d4b5b4
VUZP.32 Q9, Q11 @ Q9 = b7b6b5b4 b3b2b1b0
@ Q11= d7d6d5d4 d3d2d1d0
VMULL.S16 Q15,D18,D0[1]
VMULL.S16 Q13,D22,D1[1]
VUZP.32 Q8, Q10 @ Q8 = a7a6a5a4 a3a2a1a0
@ Q10= c7c6c5c4 c3c2c1c0
@ Column transforms
@ Stages 1, 2, & 3
VMULL.S16 Q14,D19,D0[1] @ Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
VMULL.S16 Q12,D23,D1[1] @ Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
VMULL.S16 Q3, D22,D0[3]
VMULL.S16 Q2, D23,D0[3] @ Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
VSHRN.S32 D30,Q15,#16
VSHRN.S32 D31,Q14,#16 @ Q15= (OC_C1S7*x[1]>>16)-x[1]
VSHRN.S32 D26,Q13,#16
VSHRN.S32 D27,Q12,#16 @ Q13= (OC_C5S3*x[3]>>16)-x[3]
VSHRN.S32 D28,Q3, #16
VSHRN.S32 D29,Q2, #16 @ Q14= (OC_C3S5*x[3]>>16)-x[3]
VADD.S16 Q15,Q15,Q9 @ Q15= t[7]
VADD.S16 Q13,Q13,Q11 @ Q13= -t[5]
VADD.S16 Q14,Q14,Q11 @ Q14= t[6]
VMULL.S16 Q12,D18,D1[3]
VMULL.S16 Q2, D19,D1[3] @ Q2:Q12= OC_C7S1*x[1]
VMULL.S16 Q1, D16,D1[0]
VMULL.S16 Q11,D17,D1[0] @ Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
VMULL.S16 Q3, D20,D0[2]
VMULL.S16 Q9, D21,D0[2] @ Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
VSHRN.S32 D24,Q12,#16
VSHRN.S32 D25,Q2, #16 @ Q12= t[4]
VMULL.S16 Q2, D20,D1[2]
VSHRN.S32 D2, Q1, #16
VSHRN.S32 D3, Q11,#16 @ Q1 = (OC_C4S4*x[0]>>16)-x[0]
VMULL.S16 Q11,D21,D1[2] @ Q2:Q11= OC_C6S2*x[2]
VSHRN.S32 D6, Q3, #16
VSHRN.S32 D7, Q9, #16 @ Q3 = (OC_C2S6*x[2]>>16)-x[2]
VSUB.S16 Q9, Q15,Q14 @ Q9 = t[7]-t[6]
VADD.S16 Q15,Q15,Q14 @ Q15= t[7]=t[7]+t[6]
VSHRN.S32 D4, Q2, #16
VSHRN.S32 D5, Q11,#16 @ Q2 = t[2]
VADD.S16 Q1, Q1, Q8 @ Q1 = t[0]
VADD.S16 Q8, Q12,Q13 @ Q8 = t[4]-t[5]
VADD.S16 Q3, Q3, Q10 @ Q3 = t[3]
VMULL.S16 Q10,D16,D1[0]
VMULL.S16 Q11,D17,D1[0] @ Q11:Q10= OC_C4S4*(t[4]-t[5])
@ -(t[4]-t[5]<<16)
VSUB.S16 Q12,Q12,Q13 @ Q12= t[4]=t[4]+t[5]
VMULL.S16 Q14,D18,D1[0]
VMULL.S16 Q13,D19,D1[0] @ Q13:Q14= OC_C4S4*(t[7]-t[6])
@ -(t[7]-t[6]<<16)
VSHRN.S32 D20,Q10,#16
VSHRN.S32 D21,Q11,#16 @ Q10= (OC_C4S4*(t[4]-t[5])>>16)
@ -(t[4]-t[5])
VADD.S16 Q11,Q1, Q3 @ Q11= t[0]=t[0]+t[3]
VSUB.S16 Q3, Q1, Q3 @ Q3 = t[3]=t[0]-t[3]
VSHRN.S32 D28,Q14,#16
VSHRN.S32 D29,Q13,#16 @ Q14= (OC_C4S4*(t[7]-t[6])>>16)
@ -(t[7]-t[6])
VADD.S16 Q10,Q10,Q8 @ Q10=t[5]
VADD.S16 Q14,Q14,Q9 @ Q14=t[6]
VSUB.S16 Q13,Q14,Q10 @ Q13=t[5]=t[6]-t[5]
VADD.S16 Q14,Q14,Q10 @ Q14=t[6]=t[6]+t[5]
VADD.S16 Q10,Q1, Q2 @ Q10= t[1]=t[0]+t[2]
VSUB.S16 Q2, Q1, Q2 @ Q2 = t[2]=t[0]-t[2]
@ Stage 4
VADD.S16 Q8, Q11,Q15 @ Q8 = y[0]=t[0]+t[7]
VADD.S16 Q9, Q10,Q14 @ Q9 = y[1]=t[1]+t[6]
VSUB.S16 Q15,Q11,Q15 @ Q15 = y[7]=t[0]-t[7]
VSUB.S16 Q14,Q10,Q14 @ Q14 = y[6]=t[1]-t[6]
VADD.S16 Q10,Q2, Q13 @ Q10 = y[2]=t[2]+t[5]
VADD.S16 Q11,Q3, Q12 @ Q11 = y[3]=t[3]+t[4]
VSUB.S16 Q12,Q3, Q12 @ Q12 = y[4]=t[3]-t[4]
VSUB.S16 Q13,Q2, Q13 @ Q13 = y[5]=t[2]-t[5]
VMOV.I8 D2, #0 @ zeros, stored below into the used part of x
VRSHR.S16 Q8, Q8, #4 @ Q8 = y[0]+8>>4
VST1.64 {D2}, [r1,:64], r12 @ zero 4 shorts of x, advance one row
VRSHR.S16 Q9, Q9, #4 @ Q9 = y[1]+8>>4
VRSHR.S16 Q10,Q10,#4 @ Q10 = y[2]+8>>4
VST1.64 {D2}, [r1,:64], r12
VRSHR.S16 Q11,Q11,#4 @ Q11 = y[3]+8>>4
VRSHR.S16 Q12,Q12,#4 @ Q12 = y[4]+8>>4
VST1.64 {D2}, [r1,:64], r12
VRSHR.S16 Q13,Q13,#4 @ Q13 = y[5]+8>>4
VRSHR.S16 Q14,Q14,#4 @ Q14 = y[6]+8>>4
VST1.64 {D2}, [r1,:64]
VRSHR.S16 Q15,Q15,#4 @ Q15 = y[7]+8>>4
VSTMIA r0, {D16-D31}
MOV PC, r14
@ .size oc_idct8x8_10_neon, .-oc_idct8x8_10_neon @ ENDP
  1841. .endif
  1842. @ END
  1843. @ .section .note.GNU-stack,"",%progbits
  1844. #endif