  1. @********************************************************************
  2. @* *
  3. @* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
  4. @* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
  5. @* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  6. @* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
  7. @* *
  8. @* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
  9. @* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  10. @* *
  11. @********************************************************************
  12. @ Original implementation:
  13. @ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
  14. @ last mod: $Id: armidct.s 17430 2010-09-22 21:54:09Z tterribe $
  15. @********************************************************************
  16. .text
  17. @ .include "armopts-gnu.S"
  18. @ .set OC_ARM_ASM_EDSP, 0
  19. @ .set OC_ARM_ASM_MEDIA, 0
  20. @ .set OC_ARM_ASM_NEON, 0
  21. @ .set OC_ARM_CAN_UNALIGN, 0
  22. @ .set OC_ARM_CAN_UNALIGN_LDRD, 0
  23. .global oc_idct8x8_1_arm
  24. .global oc_idct8x8_arm
  25. oc_idct8x8_1_arm:
  26. @ r0 = ogg_int16_t *_y
  27. @ r1 = ogg_uint16_t _dc
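@ A rough C sketch of what this routine does (the _dc argument is stored
@ as-is, so any required scaling/rounding is assumed done by the caller):
@   for(i=0;i<64;i++)_y[i]=(ogg_int16_t)_dc;
@ _dc is duplicated into both halves of a word and written 16 bytes per STMIA.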
  28. ORR r1, r1, r1, LSL #16
  29. MOV r2, r1
  30. MOV r3, r1
  31. MOV r12,r1
  32. STMIA r0!,{r1,r2,r3,r12}
  33. STMIA r0!,{r1,r2,r3,r12}
  34. STMIA r0!,{r1,r2,r3,r12}
  35. STMIA r0!,{r1,r2,r3,r12}
  36. STMIA r0!,{r1,r2,r3,r12}
  37. STMIA r0!,{r1,r2,r3,r12}
  38. STMIA r0!,{r1,r2,r3,r12}
  39. STMIA r0!,{r1,r2,r3,r12}
  40. MOV PC, r14
  41. oc_idct8x8_arm:
  42. @ r0 = ogg_int16_t *_y
  43. @ r1 = ogg_int16_t *_x
  44. @ r2 = int _last_zzi
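@ _last_zzi is (roughly) one past the index of the last nonzero coefficient
@ in zig-zag order; small values mean only a few low-frequency coefficients
@ are present, so the cheaper partial transforms below can skip inputs that
@ are known to be zero.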
  45. CMP r2, #3
  46. BLE oc_idct8x8_3_arm
  47. CMP r2, #6
  48. BLE oc_idct8x8_6_arm
  49. CMP r2, #10
  50. BLE oc_idct8x8_10_arm
  51. oc_idct8x8_slow_arm:
  52. STMFD r13!,{r4-r11,r14}
  53. SUB r13,r13,#64*2
  54. @ Row transforms
  55. STR r0, [r13,#-4]!
  56. ADD r0, r13, #4 @ Write to temp storage.
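@ Each idct8core_arm call consumes one row of _x (r1 advances by 16 bytes per
@ call) and writes its eight results into the scratch buffer allocated above;
@ the saved r0 is reloaded afterwards so the column pass can write directly
@ to the caller's buffer.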
  57. BL idct8core_arm
  58. BL idct8core_arm
  59. BL idct8core_arm
  60. BL idct8core_arm
  61. BL idct8core_arm
  62. BL idct8core_arm
  63. BL idct8core_arm
  64. BL idct8core_arm
  65. LDR r0, [r13], #4 @ Write to the final destination.
  66. @ Clear input data for next block (decoder only).
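@ The input coefficients are zeroed for the next block unless _x and the
@ output buffer alias (r2 recovers the original _x from the post-incremented
@ r1); in that case the transform is in-place and the clear is skipped.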
  67. SUB r2, r1, #8*16
  68. CMP r0, r2
  69. MOV r1, r13 @ And read from temp storage.
  70. BEQ oc_idct8x8_slow_arm_cols
  71. MOV r4, #0
  72. MOV r5, #0
  73. MOV r6, #0
  74. MOV r7, #0
  75. STMIA r2!,{r4,r5,r6,r7}
  76. STMIA r2!,{r4,r5,r6,r7}
  77. STMIA r2!,{r4,r5,r6,r7}
  78. STMIA r2!,{r4,r5,r6,r7}
  79. STMIA r2!,{r4,r5,r6,r7}
  80. STMIA r2!,{r4,r5,r6,r7}
  81. STMIA r2!,{r4,r5,r6,r7}
  82. STMIA r2!,{r4,r5,r6,r7}
  83. oc_idct8x8_slow_arm_cols:
  84. @ Column transforms
  85. BL idct8core_down_arm
  86. BL idct8core_down_arm
  87. BL idct8core_down_arm
  88. BL idct8core_down_arm
  89. BL idct8core_down_arm
  90. BL idct8core_down_arm
  91. BL idct8core_down_arm
  92. BL idct8core_down_arm
  93. ADD r13,r13,#64*2
  94. LDMFD r13!,{r4-r11,PC}
  95. oc_idct8x8_10_arm:
  96. STMFD r13!,{r4-r11,r14}
  97. SUB r13,r13,#64*2
  98. @ Row transforms
  99. MOV r2, r0
  100. MOV r0, r13 @ Write to temp storage.
  101. BL idct4core_arm
  102. BL idct3core_arm
  103. BL idct2core_arm
  104. BL idct1core_arm
  105. @ Clear input data for next block (decoder only).
  106. SUB r0, r1, #4*16
  107. CMP r0, r2
  108. MOV r1, r13 @ Read from temp storage.
  109. BEQ oc_idct8x8_10_arm_cols
  110. MOV r4, #0
  111. STR r4, [r0]
  112. STR r4, [r0,#4]
  113. STR r4, [r0,#16]
  114. STR r4, [r0,#20]
  115. STR r4, [r0,#32]
  116. STR r4, [r0,#48]
  117. MOV r0, r2 @ Write to the final destination
  118. oc_idct8x8_10_arm_cols:
  119. @ Column transforms
  120. BL idct4core_down_arm
  121. BL idct4core_down_arm
  122. BL idct4core_down_arm
  123. BL idct4core_down_arm
  124. BL idct4core_down_arm
  125. BL idct4core_down_arm
  126. BL idct4core_down_arm
  127. BL idct4core_down_arm
  128. ADD r13,r13,#64*2
  129. LDMFD r13!,{r4-r11,PC}
  130. oc_idct8x8_6_arm:
  131. STMFD r13!,{r4-r7,r9-r11,r14}
  132. SUB r13,r13,#64*2
  133. @ Row transforms
  134. MOV r2, r0
  135. MOV r0, r13 @ Write to temp storage.
  136. BL idct3core_arm
  137. BL idct2core_arm
  138. BL idct1core_arm
  139. @ Clear input data for next block (decoder only).
  140. SUB r0, r1, #3*16
  141. CMP r0, r2
  142. MOV r1, r13 @ Read from temp storage.
  143. BEQ oc_idct8x8_6_arm_cols
  144. MOV r4, #0
  145. STR r4, [r0]
  146. STR r4, [r0,#4]
  147. STR r4, [r0,#16]
  148. STR r4, [r0,#32]
  149. MOV r0, r2 @ Write to the final destination
  150. oc_idct8x8_6_arm_cols:
  151. @ Column transforms
  152. BL idct3core_down_arm
  153. BL idct3core_down_arm
  154. BL idct3core_down_arm
  155. BL idct3core_down_arm
  156. BL idct3core_down_arm
  157. BL idct3core_down_arm
  158. BL idct3core_down_arm
  159. BL idct3core_down_arm
  160. ADD r13,r13,#64*2
  161. LDMFD r13!,{r4-r7,r9-r11,PC}
  162. oc_idct8x8_3_arm:
  163. STMFD r13!,{r4-r7,r9-r11,r14}
  164. SUB r13,r13,#64*2
  165. @ Row transforms
  166. MOV r2, r0
  167. MOV r0, r13 @ Write to temp storage.
  168. BL idct2core_arm
  169. BL idct1core_arm
  170. @ Clear input data for next block (decoder only).
  171. SUB r0, r1, #2*16
  172. CMP r0, r2
  173. MOV r1, r13 @ Read from temp storage.
  174. MOVNE r4, #0
  175. STRNE r4, [r0]
  176. STRNE r4, [r0,#16]
  177. MOVNE r0, r2 @ Write to the final destination
  178. @ Column transforms
  179. BL idct2core_down_arm
  180. BL idct2core_down_arm
  181. BL idct2core_down_arm
  182. BL idct2core_down_arm
  183. BL idct2core_down_arm
  184. BL idct2core_down_arm
  185. BL idct2core_down_arm
  186. BL idct2core_down_arm
  187. ADD r13,r13,#64*2
  188. LDMFD r13!,{r4-r7,r9-r11,PC}
  189. idct1core_arm:
  190. @ r0 = ogg_int16_t *_y (destination)
  191. @ r1 = const ogg_int16_t *_x (source)
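@ Only x[0] of this row is nonzero, so all eight outputs equal
@ OC_C4S4*x[0]>>16; the MOV/ORR pair below builds OC_C4S4 (0xB505 = 46341)
@ inline instead of loading it from the constant table.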
  192. LDRSH r3, [r1], #16
  193. MOV r12,#0x05
  194. ORR r12,r12,#0xB500
  195. MUL r3, r12, r3
  196. @ Stall ?
  197. MOV r3, r3, ASR #16
  198. STRH r3, [r0], #2
  199. STRH r3, [r0, #14]
  200. STRH r3, [r0, #30]
  201. STRH r3, [r0, #46]
  202. STRH r3, [r0, #62]
  203. STRH r3, [r0, #78]
  204. STRH r3, [r0, #94]
  205. STRH r3, [r0, #110]
  206. MOV PC,R14
  207. idct2core_arm:
  208. @ r0 = ogg_int16_t *_y (destination)
  209. @ r1 = const ogg_int16_t *_x (source)
  210. LDRSH r9, [r1], #16 @ r9 = x[0]
  211. LDR r12,OC_C4S4
  212. LDRSH r11,[r1, #-14] @ r11= x[1]
  213. LDR r3, OC_C7S1
  214. MUL r9, r12,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
  215. LDR r10,OC_C1S7
  216. MUL r3, r11,r3 @ r3 = t[4]<<16 = OC_C7S1*x[1]
  217. MOV r9, r9, ASR #16 @ r9 = t[0]
  218. MUL r11,r10,r11 @ r11= t[7]<<16 = OC_C1S7*x[1]
  219. MOV r3, r3, ASR #16 @ r3 = t[4]
  220. MUL r10,r12,r3 @ r10= t[5]<<16 = OC_C4S4*t[4]
  221. MOV r11,r11,ASR #16 @ r11= t[7]
  222. MUL r12,r11,r12 @ r12= t[6]<<16 = OC_C4S4*t[7]
  223. MOV r10,r10,ASR #16 @ r10= t[5]
  224. ADD r12,r9,r12,ASR #16 @ r12= t[0]+t[6]
  225. ADD r12,r12,r10 @ r12= t[0]+t2[6] = t[0]+t[6]+t[5]
  226. SUB r10,r12,r10,LSL #1 @ r10= t[0]+t2[5] = t[0]+t[6]-t[5]
  227. ADD r3, r3, r9 @ r3 = t[0]+t[4]
  228. ADD r11,r11,r9 @ r11= t[0]+t[7]
  229. STRH r11,[r0], #2 @ y[0] = t[0]+t[7]
  230. STRH r12,[r0, #14] @ y[1] = t[0]+t[6]
  231. STRH r10,[r0, #30] @ y[2] = t[0]+t[5]
  232. STRH r3, [r0, #46] @ y[3] = t[0]+t[4]
  233. RSB r3, r3, r9, LSL #1 @ r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
  234. RSB r10,r10,r9, LSL #1 @ r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
  235. RSB r12,r12,r9, LSL #1 @ r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
  236. RSB r11,r11,r9, LSL #1 @ r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
  237. STRH r3, [r0, #62] @ y[4] = t[0]-t[4]
  238. STRH r10,[r0, #78] @ y[5] = t[0]-t[5]
  239. STRH r12,[r0, #94] @ y[6] = t[0]-t[6]
  240. STRH r11,[r0, #110] @ y[7] = t[0]-t[7]
  241. MOV PC,r14
  242. idct2core_down_arm:
  243. @ r0 = ogg_int16_t *_y (destination)
  244. @ r1 = const ogg_int16_t *_x (source)
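@ The *_down variants are used for the column pass: they fold in the final
@ rounding of the transform by adding a bias of 8 (here to t[0]) up front and
@ arithmetic-shifting every output right by 4 before it is stored.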
  245. LDRSH r9, [r1], #16 @ r9 = x[0]
  246. LDR r12,OC_C4S4
  247. LDRSH r11,[r1, #-14] @ r11= x[1]
  248. LDR r3, OC_C7S1
  249. MUL r9, r12,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
  250. LDR r10,OC_C1S7
  251. MUL r3, r11,r3 @ r3 = t[4]<<16 = OC_C7S1*x[1]
  252. MOV r9, r9, ASR #16 @ r9 = t[0]
  253. MUL r11,r10,r11 @ r11= t[7]<<16 = OC_C1S7*x[1]
  254. ADD r9, r9, #8 @ r9 = t[0]+8
  255. MOV r3, r3, ASR #16 @ r3 = t[4]
  256. MUL r10,r12,r3 @ r10= t[5]<<16 = OC_C4S4*t[4]
  257. MOV r11,r11,ASR #16 @ r11= t[7]
  258. MUL r12,r11,r12 @ r12= t[6]<<16 = OC_C4S4*t[7]
  259. MOV r10,r10,ASR #16 @ r10= t[5]
  260. ADD r12,r9,r12,ASR #16 @ r12= t[0]+t[6]+8
  261. ADD r12,r12,r10 @ r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
  262. SUB r10,r12,r10,LSL #1 @ r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
  263. ADD r3, r3, r9 @ r3 = t[0]+t[4]+8
  264. ADD r11,r11,r9 @ r11= t[0]+t[7]+8
  265. @ TODO: This is wrong.
  266. @ The C code truncates to 16 bits by storing to RAM and doing the
  267. @ shifts later; we've got an extra 4 bits here.
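@ (In the reference C the t values here have already been wrapped to 16 bits
@ by the intermediate store, and each output is roughly (t+8)>>4 computed on
@ those 16-bit values, so results can differ whenever an intermediate sum
@ overflows 16 bits.)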
  268. MOV r4, r11,ASR #4
  269. MOV r5, r12,ASR #4
  270. MOV r6, r10,ASR #4
  271. MOV r7, r3, ASR #4
  272. RSB r3, r3, r9, LSL #1 @r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
  273. RSB r10,r10,r9, LSL #1 @r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
  274. RSB r12,r12,r9, LSL #1 @r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
  275. RSB r11,r11,r9, LSL #1 @r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
  276. MOV r3, r3, ASR #4
  277. MOV r10,r10,ASR #4
  278. MOV r12,r12,ASR #4
  279. MOV r11,r11,ASR #4
  280. STRH r4, [r0], #2 @ y[0] = t[0]+t[7]
  281. STRH r5, [r0, #14] @ y[1] = t[0]+t[6]
  282. STRH r6, [r0, #30] @ y[2] = t[0]+t[5]
  283. STRH r7, [r0, #46] @ y[3] = t[0]+t[4]
  284. STRH r3, [r0, #62] @ y[4] = t[0]-t[4]
  285. STRH r10,[r0, #78] @ y[5] = t[0]-t[5]
  286. STRH r12,[r0, #94] @ y[6] = t[0]-t[6]
  287. STRH r11,[r0, #110] @ y[7] = t[0]-t[7]
  288. MOV PC,r14
  289. idct3core_arm:
  290. LDRSH r9, [r1], #16 @ r9 = x[0]
  291. LDR r12,OC_C4S4 @ r12= OC_C4S4
  292. LDRSH r3, [r1, #-12] @ r3 = x[2]
  293. LDR r10,OC_C6S2 @ r10= OC_C6S2
  294. MUL r9, r12,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
  295. LDR r4, OC_C2S6 @ r4 = OC_C2S6
  296. MUL r10,r3, r10 @ r10= t[2]<<16 = OC_C6S2*x[2]
  297. LDRSH r11,[r1, #-14] @ r11= x[1]
  298. MUL r3, r4, r3 @ r3 = t[3]<<16 = OC_C2S6*x[2]
  299. LDR r4, OC_C7S1 @ r4 = OC_C7S1
  300. LDR r5, OC_C1S7 @ r5 = OC_C1S7
  301. MOV r9, r9, ASR #16 @ r9 = t[0]
  302. MUL r4, r11,r4 @ r4 = t[4]<<16 = OC_C7S1*x[1]
  303. ADD r3, r9, r3, ASR #16 @ r3 = t[0]+t[3]
  304. MUL r11,r5, r11 @ r11= t[7]<<16 = OC_C1S7*x[1]
  305. MOV r4, r4, ASR #16 @ r4 = t[4]
  306. MUL r5, r12,r4 @ r5 = t[5]<<16 = OC_C4S4*t[4]
  307. MOV r11,r11,ASR #16 @ r11= t[7]
  308. MUL r12,r11,r12 @ r12= t[6]<<16 = OC_C4S4*t[7]
  309. ADD r10,r9, r10,ASR #16 @ r10= t[1] = t[0]+t[2]
  310. RSB r6, r10,r9, LSL #1 @ r6 = t[2] = t[0]-t[2]
  311. @ r3 = t2[0] = t[0]+t[3]
  312. RSB r9, r3, r9, LSL #1 @ r9 = t2[3] = t[0]-t[3]
  313. MOV r12,r12,ASR #16 @ r12= t[6]
  314. ADD r5, r12,r5, ASR #16 @ r5 = t2[6] = t[6]+t[5]
  315. RSB r12,r5, r12,LSL #1 @ r12= t2[5] = t[6]-t[5]
  316. ADD r11,r3, r11 @ r11= t2[0]+t[7]
  317. ADD r5, r10,r5 @ r5 = t[1]+t2[6]
  318. ADD r12,r6, r12 @ r12= t[2]+t2[5]
  319. ADD r4, r9, r4 @ r4 = t2[3]+t[4]
  320. STRH r11,[r0], #2 @ y[0] = t[0]+t[7]
  321. STRH r5, [r0, #14] @ y[1] = t[1]+t2[6]
  322. STRH r12,[r0, #30] @ y[2] = t[2]+t2[5]
  323. STRH r4, [r0, #46] @ y[3] = t2[3]+t[4]
  324. RSB r11,r11,r3, LSL #1 @ r11= t2[0] - t[7]
  325. RSB r5, r5, r10,LSL #1 @ r5 = t[1] - t2[6]
  326. RSB r12,r12,r6, LSL #1 @ r12= t[2] - t2[5]
  327. RSB r4, r4, r9, LSL #1 @ r4 = t2[3] - t[4]
  328. STRH r4, [r0, #62] @ y[4] = t2[3]-t[4]
  329. STRH r12,[r0, #78] @ y[5] = t[2]-t2[5]
  330. STRH r5, [r0, #94] @ y[6] = t[1]-t2[6]
  331. STRH r11,[r0, #110] @ y[7] = t2[0]-t[7]
  332. MOV PC,R14
  333. idct3core_down_arm:
  334. LDRSH r9, [r1], #16 @ r9 = x[0]
  335. LDR r12,OC_C4S4 @ r12= OC_C4S4
  336. LDRSH r3, [r1, #-12] @ r3 = x[2]
  337. LDR r10,OC_C6S2 @ r10= OC_C6S2
  338. MUL r9, r12,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
  339. LDR r4, OC_C2S6 @ r4 = OC_C2S6
  340. MUL r10,r3, r10 @ r10= t[2]<<16 = OC_C6S2*x[2]
  341. LDRSH r11,[r1, #-14] @ r11= x[1]
  342. MUL r3, r4, r3 @ r3 = t[3]<<16 = OC_C2S6*x[2]
  343. LDR r4, OC_C7S1 @ r4 = OC_C7S1
  344. LDR r5, OC_C1S7 @ r5 = OC_C1S7
  345. MOV r9, r9, ASR #16 @ r9 = t[0]
  346. MUL r4, r11,r4 @ r4 = t[4]<<16 = OC_C7S1*x[1]
  347. ADD r9, r9, #8 @ r9 = t[0]+8
  348. MUL r11,r5, r11 @ r11= t[7]<<16 = OC_C1S7*x[1]
  349. ADD r3, r9, r3, ASR #16 @ r3 = t[0]+t[3]+8
  350. MOV r4, r4, ASR #16 @ r4 = t[4]
  351. MUL r5, r12,r4 @ r5 = t[5]<<16 = OC_C4S4*t[4]
  352. MOV r11,r11,ASR #16 @ r11= t[7]
  353. MUL r12,r11,r12 @ r12= t[6]<<16 = OC_C4S4*t[7]
  354. ADD r10,r9, r10,ASR #16 @ r10= t[1]+8 = t[0]+t[2]+8
  355. RSB r6, r10,r9, LSL #1 @ r6 = t[2]+8 = t[0]-t[2]+8
  356. @ r3 = t2[0]+8 = t[0]+t[3]+8
  357. RSB r9, r3, r9, LSL #1 @ r9 = t2[3]+8 = t[0]-t[3]+8
  358. MOV r12,r12,ASR #16 @ r12= t[6]
  359. ADD r5, r12,r5, ASR #16 @ r5 = t2[6] = t[6]+t[5]
  360. RSB r12,r5, r12,LSL #1 @ r12= t2[5] = t[6]-t[5]
  361. ADD r11,r3, r11 @ r11= t2[0]+t[7] +8
  362. ADD r5, r10,r5 @ r5 = t[1] +t2[6]+8
  363. ADD r12,r6, r12 @ r12= t[2] +t2[5]+8
  364. ADD r4, r9, r4 @ r4 = t2[3]+t[4] +8
  365. RSB r3, r11,r3, LSL #1 @ r3 = t2[0] - t[7] + 8
  366. RSB r10,r5, r10,LSL #1 @ r10= t[1] - t2[6] + 8
  367. RSB r6, r12,r6, LSL #1 @ r6 = t[2] - t2[5] + 8
  368. RSB r9, r4, r9, LSL #1 @ r9 = t2[3] - t[4] + 8
  369. @ TODO: This is wrong.
  370. @ The C code truncates to 16 bits by storing to RAM and doing the
  371. @ shifts later; we've got an extra 4 bits here.
  372. MOV r11,r11,ASR #4
  373. MOV r5, r5, ASR #4
  374. MOV r12,r12,ASR #4
  375. MOV r4, r4, ASR #4
  376. MOV r9, r9, ASR #4
  377. MOV r6, r6, ASR #4
  378. MOV r10,r10,ASR #4
  379. MOV r3, r3, ASR #4
  380. STRH r11,[r0], #2 @ y[0] = t[0]+t[7]
  381. STRH r5, [r0, #14] @ y[1] = t[1]+t2[6]
  382. STRH r12,[r0, #30] @ y[2] = t[2]+t2[5]
  383. STRH r4, [r0, #46] @ y[3] = t2[3]+t[4]
  384. STRH r9, [r0, #62] @ y[4] = t2[3]-t[4]
  385. STRH r6, [r0, #78] @ y[5] = t[2]-t2[5]
  386. STRH r10,[r0, #94] @ y[6] = t[1]-t2[6]
  387. STRH r3, [r0, #110] @ y[7] = t2[0]-t[7]
  388. MOV PC,R14
  389. idct4core_arm:
  390. @ r0 = ogg_int16_t *_y (destination)
  391. @ r1 = const ogg_int16_t *_x (source)
  392. LDRSH r9, [r1], #16 @ r9 = x[0]
  393. LDR r10,OC_C4S4 @ r10= OC_C4S4
  394. LDRSH r12,[r1, #-12] @ r12= x[2]
  395. LDR r4, OC_C6S2 @ r4 = OC_C6S2
  396. MUL r9, r10,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
  397. LDR r5, OC_C2S6 @ r5 = OC_C2S6
  398. MUL r4, r12,r4 @ r4 = t[2]<<16 = OC_C6S2*x[2]
  399. LDRSH r3, [r1, #-14] @ r3 = x[1]
  400. MUL r5, r12,r5 @ r5 = t[3]<<16 = OC_C2S6*x[2]
  401. LDR r6, OC_C7S1 @ r6 = OC_C7S1
  402. LDR r12,OC_C1S7 @ r12= OC_C1S7
  403. LDRSH r11,[r1, #-10] @ r11= x[3]
  404. MUL r6, r3, r6 @ r6 = t[4]<<16 = OC_C7S1*x[1]
  405. LDR r7, OC_C5S3 @ r7 = OC_C5S3
  406. MUL r3, r12,r3 @ r3 = t[7]<<16 = OC_C1S7*x[1]
  407. LDR r8, OC_C3S5 @ r8 = OC_C3S5
  408. MUL r7, r11,r7 @ r7 = -t[5]<<16 = OC_C5S3*x[3]
  409. MOV r9, r9, ASR #16 @ r9 = t[0]
  410. MUL r11,r8, r11 @ r11= t[6]<<16 = OC_C3S5*x[3]
  411. MOV r6, r6, ASR #16 @ r6 = t[4]
  412. @ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
  413. @ before multiplying, not after (this is not equivalent)
  414. SUB r7, r6, r7, ASR #16 @ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
  415. RSB r6, r7, r6, LSL #1 @ r6 = t[4]-t[5]
  416. MUL r6, r10,r6 @ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
  417. MOV r3, r3, ASR #16 @ r3 = t[7]
  418. ADD r11,r3, r11,ASR #16 @ r11= t2[7]=t[7]+t[6]
  419. RSB r3, r11,r3, LSL #1 @ r3 = t[7]-t[6]
  420. MUL r3, r10,r3 @ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
  421. ADD r4, r9, r4, ASR #16 @ r4 = t[1] = t[0] + t[2]
  422. RSB r10,r4, r9, LSL #1 @ r10= t[2] = t[0] - t[2]
  423. ADD r5, r9, r5, ASR #16 @ r5 = t[0] = t[0] + t[3]
  424. RSB r9, r5, r9, LSL #1 @ r9 = t[3] = t[0] - t[3]
  425. MOV r3, r3, ASR #16 @ r3 = t2[6]
  426. ADD r6, r3, r6, ASR #16 @ r6 = t3[6] = t2[6]+t2[5]
  427. RSB r3, r6, r3, LSL #1 @ r3 = t3[5] = t2[6]-t2[5]
  428. ADD r11,r5, r11 @ r11= t[0]+t2[7]
  429. ADD r6, r4, r6 @ r6 = t[1]+t3[6]
  430. ADD r3, r10,r3 @ r3 = t[2]+t3[5]
  431. ADD r7, r9, r7 @ r7 = t[3]+t2[4]
  432. STRH r11,[r0], #2 @ y[0] = t[0]+t[7]
  433. STRH r6, [r0, #14] @ y[1] = t[1]+t2[6]
  434. STRH r3, [r0, #30] @ y[2] = t[2]+t2[5]
  435. STRH r7, [r0, #46] @ y[3] = t2[3]+t[4]
  436. RSB r11,r11,r5, LSL #1 @ r11= t[0]-t2[7]
  437. RSB r6, r6, r4, LSL #1 @ r6 = t[1]-t3[6]
  438. RSB r3, r3, r10,LSL #1 @ r3 = t[2]-t3[5]
  439. RSB r7, r7, r9, LSL #1 @ r7 = t[3]-t2[4]
  440. STRH r7, [r0, #62] @ y[4] = t2[3]-t[4]
  441. STRH r3, [r0, #78] @ y[5] = t[2]-t2[5]
  442. STRH r6, [r0, #94] @ y[6] = t[1]-t2[6]
  443. STRH r11, [r0, #110] @ y[7] = t2[0]-t[7]
  444. MOV PC,r14
  445. idct4core_down_arm:
  446. @ r0 = ogg_int16_t *_y (destination)
  447. @ r1 = const ogg_int16_t *_x (source)
  448. LDRSH r9, [r1], #16 @ r9 = x[0]
  449. LDR r10,OC_C4S4 @ r10= OC_C4S4
  450. LDRSH r12,[r1, #-12] @ r12= x[2]
  451. LDR r4, OC_C6S2 @ r4 = OC_C6S2
  452. MUL r9, r10,r9 @ r9 = t[0]<<16 = OC_C4S4*x[0]
  453. LDR r5, OC_C2S6 @ r5 = OC_C2S6
  454. MUL r4, r12,r4 @ r4 = t[2]<<16 = OC_C6S2*x[2]
  455. LDRSH r3, [r1, #-14] @ r3 = x[1]
  456. MUL r5, r12,r5 @ r5 = t[3]<<16 = OC_C2S6*x[2]
  457. LDR r6, OC_C7S1 @ r6 = OC_C7S1
  458. LDR r12,OC_C1S7 @ r12= OC_C1S7
  459. LDRSH r11,[r1, #-10] @ r11= x[3]
  460. MUL r6, r3, r6 @ r6 = t[4]<<16 = OC_C7S1*x[1]
  461. LDR r7, OC_C5S3 @ r7 = OC_C5S3
  462. MUL r3, r12,r3 @ r3 = t[7]<<16 = OC_C1S7*x[1]
  463. LDR r8, OC_C3S5 @ r8 = OC_C3S5
  464. MUL r7, r11,r7 @ r7 = -t[5]<<16 = OC_C5S3*x[3]
  465. MOV r9, r9, ASR #16 @ r9 = t[0]
  466. MUL r11,r8, r11 @ r11= t[6]<<16 = OC_C3S5*x[3]
  467. MOV r6, r6, ASR #16 @ r6 = t[4]
  468. @ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
  469. @ before multiplying, not after (this is not equivalent)
  470. SUB r7, r6, r7, ASR #16 @ r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
  471. RSB r6, r7, r6, LSL #1 @ r6 = t[4]-t[5]
  472. MUL r6, r10,r6 @ r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
  473. MOV r3, r3, ASR #16 @ r3 = t[7]
  474. ADD r11,r3, r11,ASR #16 @ r11= t2[7]=t[7]+t[6]
  475. RSB r3, r11,r3, LSL #1 @ r3 = t[7]-t[6]
  476. ADD r9, r9, #8 @ r9 = t[0]+8
  477. MUL r3, r10,r3 @ r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
  478. ADD r4, r9, r4, ASR #16 @ r4 = t[1] = t[0] + t[2] + 8
  479. RSB r10,r4, r9, LSL #1 @ r10= t[2] = t[0] - t[2] + 8
  480. ADD r5, r9, r5, ASR #16 @ r5 = t[0] = t[0] + t[3] + 8
  481. RSB r9, r5, r9, LSL #1 @ r9 = t[3] = t[0] - t[3] + 8
  482. MOV r3, r3, ASR #16 @ r3 = t2[6]
  483. ADD r6, r3, r6, ASR #16 @ r6 = t3[6] = t2[6]+t2[5]
  484. RSB r3, r6, r3, LSL #1 @ r3 = t3[5] = t2[6]-t2[5]
  485. ADD r5, r5, r11 @ r5 = t[0]+t2[7]+8
  486. ADD r4, r4, r6 @ r4 = t[1]+t3[6]+8
  487. ADD r10,r10,r3 @ r10= t[2]+t3[5]+8
  488. ADD r9, r9, r7 @ r9 = t[3]+t2[4]+8
  489. SUB r11,r5, r11,LSL #1 @ r11= t[0]-t2[7]+8
  490. SUB r6, r4, r6, LSL #1 @ r6 = t[1]-t3[6]+8
  491. SUB r3, r10,r3, LSL #1 @ r3 = t[2]-t3[5]+8
  492. SUB r7, r9, r7, LSL #1 @ r7 = t[3]-t2[4]+8
  493. @ TODO: This is wrong.
  494. @ The C code truncates to 16 bits by storing to RAM and doing the
  495. @ shifts later; we've got an extra 4 bits here.
  496. MOV r11,r11,ASR #4
  497. MOV r6, r6, ASR #4
  498. MOV r3, r3, ASR #4
  499. MOV r7, r7, ASR #4
  500. MOV r9, r9, ASR #4
  501. MOV r10,r10,ASR #4
  502. MOV r4, r4, ASR #4
  503. MOV r5, r5, ASR #4
  504. STRH r5,[r0], #2 @ y[0] = t[0]+t[7]
  505. STRH r4, [r0, #14] @ y[1] = t[1]+t2[6]
  506. STRH r10,[r0, #30] @ y[2] = t[2]+t2[5]
  507. STRH r9, [r0, #46] @ y[3] = t2[3]+t[4]
  508. STRH r7, [r0, #62] @ y[4] = t2[3]-t[4]
  509. STRH r3, [r0, #78] @ y[5] = t[2]-t2[5]
  510. STRH r6, [r0, #94] @ y[6] = t[1]-t2[6]
  511. STRH r11,[r0, #110] @ y[7] = t2[0]-t[7]
  512. MOV PC,r14
  513. idct8core_arm:
  514. @ r0 = ogg_int16_t *_y (destination)
  515. @ r1 = const ogg_int16_t *_x (source)
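@ One call performs a single 8-point IDCT: the inputs are eight consecutive
@ int16s read through r1 (which advances by one 16-byte row per call) and the
@ outputs are written 16 bytes apart, i.e. down one column of the destination,
@ so the row pass leaves its results transposed for the column pass.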
  516. LDRSH r2, [r1],#16 @ r2 = x[0]
  517. STMFD r13!,{r1,r14}
  518. LDRSH r6, [r1, #-8] @ r6 = x[4]
  519. LDR r12,OC_C4S4 @ r12= C4S4
  520. LDRSH r4, [r1, #-12] @ r4 = x[2]
  521. ADD r2, r2, r6 @ r2 = x[0] + x[4]
  522. SUB r6, r2, r6, LSL #1 @ r6 = x[0] - x[4]
  523. @ For spec compliance, these sums must be truncated to 16-bit precision
  524. @ _before_ the multiply (not after).
  525. @ Sadly, ARMv4 provides no simple way to do that.
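@ The LSL #16/ASR #16 pairs below perform that truncation by hand at the cost
@ of four extra instructions; the ARMv6 code later in this file avoids it by
@ keeping values packed as 16-bit halves and using SMULW*/PKHBT.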
  526. MOV r2, r2, LSL #16
  527. MOV r6, r6, LSL #16
  528. MOV r2, r2, ASR #16
  529. MOV r6, r6, ASR #16
  530. MUL r2, r12,r2 @ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
  531. LDRSH r8, [r1, #-4] @ r8 = x[6]
  532. LDR r7, OC_C6S2 @ r7 = OC_C6S2
  533. MUL r6, r12,r6 @ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
  534. LDR r14,OC_C2S6 @ r14= OC_C2S6
  535. MUL r3, r4, r7 @ r3 = OC_C6S2*x[2]
  536. LDR r5, OC_C7S1 @ r5 = OC_C7S1
  537. MUL r4, r14,r4 @ r4 = OC_C2S6*x[2]
  538. MOV r3, r3, ASR #16 @ r3 = OC_C6S2*x[2]>>16
  539. MUL r14,r8, r14 @ r14= OC_C2S6*x[6]
  540. MOV r4, r4, ASR #16 @ r4 = OC_C2S6*x[2]>>16
  541. MUL r8, r7, r8 @ r8 = OC_C6S2*x[6]
  542. LDR r7, OC_C1S7 @ r7 = OC_C1S7
  543. SUB r3, r3, r14,ASR #16 @ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
  544. LDRSH r14,[r1, #-14] @ r14= x[1]
  545. ADD r4, r4, r8, ASR #16 @ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
  546. LDRSH r8, [r1, #-2] @ r8 = x[7]
  547. MUL r9, r5, r14 @ r9 = OC_C7S1*x[1]
  548. LDRSH r10,[r1, #-6] @ r10= x[5]
  549. MUL r14,r7, r14 @ r14= OC_C1S7*x[1]
  550. MOV r9, r9, ASR #16 @ r9 = OC_C7S1*x[1]>>16
  551. MUL r7, r8, r7 @ r7 = OC_C1S7*x[7]
  552. MOV r14,r14,ASR #16 @ r14= OC_C1S7*x[1]>>16
  553. MUL r8, r5, r8 @ r8 = OC_C7S1*x[7]
  554. LDRSH r1, [r1, #-10] @ r1 = x[3]
  555. LDR r5, OC_C3S5 @ r5 = OC_C3S5
  556. LDR r11,OC_C5S3 @ r11= OC_C5S3
  557. ADD r8, r14,r8, ASR #16 @ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
  558. MUL r14,r5, r10 @ r14= OC_C3S5*x[5]
  559. SUB r9, r9, r7, ASR #16 @ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
  560. MUL r10,r11,r10 @ r10= OC_C5S3*x[5]
  561. MOV r14,r14,ASR #16 @ r14= OC_C3S5*x[5]>>16
  562. MUL r11,r1, r11 @ r11= OC_C5S3*x[3]
  563. MOV r10,r10,ASR #16 @ r10= OC_C5S3*x[5]>>16
  564. MUL r1, r5, r1 @ r1 = OC_C3S5*x[3]
  565. SUB r14,r14,r11,ASR #16 @r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
  566. ADD r10,r10,r1, ASR #16 @r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
  567. @ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
  568. @ r10=t[6] r12=C4S4 r14=t[5]
  569. @ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
  570. @ before multiplying, not after (this is not equivalent)
  571. @ Stage 2
  572. @ 4-5 butterfly
  573. ADD r9, r9, r14 @ r9 = t2[4] = t[4]+t[5]
  574. SUB r14,r9, r14, LSL #1 @ r14= t[4]-t[5]
  575. MUL r14,r12,r14 @ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
  576. @ 7-6 butterfly
  577. ADD r8, r8, r10 @ r8 = t2[7] = t[7]+t[6]
  578. SUB r10,r8, r10, LSL #1 @ r10= t[7]-t[6]
  579. MUL r10,r12,r10 @ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
  580. @ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
  581. @ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
  582. @ Stage 3
  583. @ 0-3 butterfly
  584. ADD r2, r4, r2, ASR #16 @ r2 = t2[0] = t[0] + t[3]
  585. SUB r4, r2, r4, LSL #1 @ r4 = t2[3] = t[0] - t[3]
  586. @ 1-2 butterfly
  587. ADD r6, r3, r6, ASR #16 @ r6 = t2[1] = t[1] + t[2]
  588. SUB r3, r6, r3, LSL #1 @ r3 = t2[2] = t[1] - t[2]
  589. @ 6-5 butterfly
  590. MOV r14,r14,ASR #16 @ r14= t2[5]
  591. ADD r10,r14,r10,ASR #16 @ r10= t3[6] = t[6] + t[5]
  592. SUB r14,r10,r14,LSL #1 @ r14= t3[5] = t[6] - t[5]
  593. @ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
  594. @ r10=t3[6] r14=t3[5]
  595. @ Stage 4
  596. ADD r2, r2, r8 @ r2 = t[0] + t[7]
  597. ADD r6, r6, r10 @ r6 = t[1] + t[6]
  598. ADD r3, r3, r14 @ r3 = t[2] + t[5]
  599. ADD r4, r4, r9 @ r4 = t[3] + t[4]
  600. SUB r8, r2, r8, LSL #1 @ r8 = t[0] - t[7]
  601. SUB r10,r6, r10,LSL #1 @ r10= t[1] - t[6]
  602. SUB r14,r3, r14,LSL #1 @ r14= t[2] - t[5]
  603. SUB r9, r4, r9, LSL #1 @ r9 = t[3] - t[4]
  604. STRH r2, [r0], #2 @ y[0] = t[0]+t[7]
  605. STRH r6, [r0, #14] @ y[1] = t[1]+t[6]
  606. STRH r3, [r0, #30] @ y[2] = t[2]+t[5]
  607. STRH r4, [r0, #46] @ y[3] = t[3]+t[4]
  608. STRH r9, [r0, #62] @ y[4] = t[3]-t[4]
  609. STRH r14,[r0, #78] @ y[5] = t[2]-t[5]
  610. STRH r10,[r0, #94] @ y[6] = t[1]-t[6]
  611. STRH r8, [r0, #110] @ y[7] = t[0]-t[7]
  612. LDMFD r13!,{r1,PC}
  613. idct8core_down_arm:
  614. @ r0 = ogg_int16_t *_y (destination)
  615. @ r1 = const ogg_int16_t *_x (source)
  616. LDRSH r2, [r1],#16 @ r2 = x[0]
  617. STMFD r13!,{r1,r14}
  618. LDRSH r6, [r1, #-8] @ r6 = x[4]
  619. LDR r12,OC_C4S4 @ r12= C4S4
  620. LDRSH r4, [r1, #-12] @ r4 = x[2]
  621. ADD r2, r2, r6 @ r2 = x[0] + x[4]
  622. SUB r6, r2, r6, LSL #1 @ r6 = x[0] - x[4]
  623. @ For spec compliance, these sums must be truncated to 16-bit precision
  624. @ _before_ the multiply (not after).
  625. @ Sadly, ARMv4 provides no simple way to do that.
  626. MOV r2, r2, LSL #16
  627. MOV r6, r6, LSL #16
  628. MOV r2, r2, ASR #16
  629. MOV r6, r6, ASR #16
  630. MUL r2, r12,r2 @ r2 = t[0]<<16 = C4S4*(x[0]+x[4])
  631. LDRSH r8, [r1, #-4] @ r8 = x[6]
  632. LDR r7, OC_C6S2 @ r7 = OC_C6S2
  633. MUL r6, r12,r6 @ r6 = t[1]<<16 = C4S4*(x[0]-x[4])
  634. LDR r14,OC_C2S6 @ r14= OC_C2S6
  635. MUL r3, r4, r7 @ r3 = OC_C6S2*x[2]
  636. LDR r5, OC_C7S1 @ r5 = OC_C7S1
  637. MUL r4, r14,r4 @ r4 = OC_C2S6*x[2]
  638. MOV r3, r3, ASR #16 @ r3 = OC_C6S2*x[2]>>16
  639. MUL r14,r8, r14 @ r14= OC_C2S6*x[6]
  640. MOV r4, r4, ASR #16 @ r4 = OC_C2S6*x[2]>>16
  641. MUL r8, r7, r8 @ r8 = OC_C6S2*x[6]
  642. LDR r7, OC_C1S7 @ r7 = OC_C1S7
  643. SUB r3, r3, r14,ASR #16 @ r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
  644. LDRSH r14,[r1, #-14] @ r14= x[1]
  645. ADD r4, r4, r8, ASR #16 @ r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
  646. LDRSH r8, [r1, #-2] @ r8 = x[7]
  647. MUL r9, r5, r14 @ r9 = OC_C7S1*x[1]
  648. LDRSH r10,[r1, #-6] @ r10= x[5]
  649. MUL r14,r7, r14 @ r14= OC_C1S7*x[1]
  650. MOV r9, r9, ASR #16 @ r9 = OC_C7S1*x[1]>>16
  651. MUL r7, r8, r7 @ r7 = OC_C1S7*x[7]
  652. MOV r14,r14,ASR #16 @ r14= OC_C1S7*x[1]>>16
  653. MUL r8, r5, r8 @ r8 = OC_C7S1*x[7]
  654. LDRSH r1, [r1, #-10] @ r1 = x[3]
  655. LDR r5, OC_C3S5 @ r5 = OC_C3S5
  656. LDR r11,OC_C5S3 @ r11= OC_C5S3
  657. ADD r8, r14,r8, ASR #16 @ r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
  658. MUL r14,r5, r10 @ r14= OC_C3S5*x[5]
  659. SUB r9, r9, r7, ASR #16 @ r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
  660. MUL r10,r11,r10 @ r10= OC_C5S3*x[5]
  661. MOV r14,r14,ASR #16 @ r14= OC_C3S5*x[5]>>16
  662. MUL r11,r1, r11 @ r11= OC_C5S3*x[3]
  663. MOV r10,r10,ASR #16 @ r10= OC_C5S3*x[5]>>16
  664. MUL r1, r5, r1 @ r1 = OC_C3S5*x[3]
  665. SUB r14,r14,r11,ASR #16 @r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
  666. ADD r10,r10,r1, ASR #16 @r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
  667. @ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
  668. @ r10=t[6] r12=C4S4 r14=t[5]
  669. @ Stage 2
  670. @ TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
  671. @ before multiplying, not after (this is not equivalent)
  672. @ 4-5 butterfly
  673. ADD r9, r9, r14 @ r9 = t2[4] = t[4]+t[5]
  674. SUB r14,r9, r14, LSL #1 @ r14= t[4]-t[5]
  675. MUL r14,r12,r14 @ r14= t2[5]<<16 = C4S4*(t[4]-t[5])
  676. @ 7-6 butterfly
  677. ADD r8, r8, r10 @ r8 = t2[7] = t[7]+t[6]
  678. SUB r10,r8, r10, LSL #1 @ r10= t[7]-t[6]
  679. MUL r10,r12,r10 @ r10= t2[6]<<16 = C4S4*(t[7]-t[6])
  680. @ r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
  681. @ r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
  682. @ Stage 3
  683. ADD r2, r2, #8<<16 @ r2 = t[0]+8<<16
  684. ADD r6, r6, #8<<16 @ r6 = t[1]+8<<16
  685. @ 0-3 butterfly
  686. ADD r2, r4, r2, ASR #16 @ r2 = t2[0] = t[0] + t[3] + 8
  687. SUB r4, r2, r4, LSL #1 @ r4 = t2[3] = t[0] - t[3] + 8
  688. @ 1-2 butterfly
  689. ADD r6, r3, r6, ASR #16 @ r6 = t2[1] = t[1] + t[2] + 8
  690. SUB r3, r6, r3, LSL #1 @ r3 = t2[2] = t[1] - t[2] + 8
  691. @ 6-5 butterfly
  692. MOV r14,r14,ASR #16 @ r14= t2[5]
  693. ADD r10,r14,r10,ASR #16 @ r10= t3[6] = t[6] + t[5]
  694. SUB r14,r10,r14,LSL #1 @ r14= t3[5] = t[6] - t[5]
  695. @ r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
  696. @ r10=t3[6] r14=t3[5]
  697. @ Stage 4
  698. ADD r2, r2, r8 @ r2 = t[0] + t[7] + 8
  699. ADD r6, r6, r10 @ r6 = t[1] + t[6] + 8
  700. ADD r3, r3, r14 @ r3 = t[2] + t[5] + 8
  701. ADD r4, r4, r9 @ r4 = t[3] + t[4] + 8
  702. SUB r8, r2, r8, LSL #1 @ r8 = t[0] - t[7] + 8
  703. SUB r10,r6, r10,LSL #1 @ r10= t[1] - t[6] + 8
  704. SUB r14,r3, r14,LSL #1 @ r14= t[2] - t[5] + 8
  705. SUB r9, r4, r9, LSL #1 @ r9 = t[3] - t[4] + 8
  706. @ TODO: This is wrong.
  707. @ The C code truncates to 16 bits by storing to RAM and doing the
  708. @ shifts later; we've got an extra 4 bits here.
  709. MOV r2, r2, ASR #4
  710. MOV r6, r6, ASR #4
  711. MOV r3, r3, ASR #4
  712. MOV r4, r4, ASR #4
  713. MOV r8, r8, ASR #4
  714. MOV r10,r10,ASR #4
  715. MOV r14,r14,ASR #4
  716. MOV r9, r9, ASR #4
  717. STRH r2, [r0], #2 @ y[0] = t[0]+t[7]
  718. STRH r6, [r0, #14] @ y[1] = t[1]+t[6]
  719. STRH r3, [r0, #30] @ y[2] = t[2]+t[5]
  720. STRH r4, [r0, #46] @ y[3] = t[3]+t[4]
  721. STRH r9, [r0, #62] @ y[4] = t[3]-t[4]
  722. STRH r14,[r0, #78] @ y[5] = t[2]-t[5]
  723. STRH r10,[r0, #94] @ y[6] = t[1]-t[6]
  724. STRH r8, [r0, #110] @ y[7] = t[0]-t[7]
  725. LDMFD r13!,{r1,PC}
  726. .if OC_ARM_ASM_MEDIA
  727. .global oc_idct8x8_1_v6
  728. .global oc_idct8x8_v6
  729. oc_idct8x8_1_v6:
  730. @ r0 = ogg_int16_t *_y
  731. @ r1 = ogg_uint16_t _dc
  732. ORR r2, r1, r1, LSL #16
  733. ORR r3, r1, r1, LSL #16
  734. STRD r2, [r0], #8
  735. STRD r2, [r0], #8
  736. STRD r2, [r0], #8
  737. STRD r2, [r0], #8
  738. STRD r2, [r0], #8
  739. STRD r2, [r0], #8
  740. STRD r2, [r0], #8
  741. STRD r2, [r0], #8
  742. STRD r2, [r0], #8
  743. STRD r2, [r0], #8
  744. STRD r2, [r0], #8
  745. STRD r2, [r0], #8
  746. STRD r2, [r0], #8
  747. STRD r2, [r0], #8
  748. STRD r2, [r0], #8
  749. STRD r2, [r0], #8
  750. MOV PC, r14
  751. oc_idct8x8_v6:
  752. @ r0 = ogg_int16_t *_y
  753. @ r1 = ogg_int16_t *_x
  754. @ r2 = int _last_zzi
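@ The v6 cores use the packed halfword instructions (SMULW*, SADD16, SSUB16,
@ PKHBT/PKHTB) to process two rows or two columns per call, so a full pass
@ below needs only four calls instead of eight.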
  755. CMP r2, #3
  756. BLE oc_idct8x8_3_v6
  757. @CMP r2, #6
  758. @BLE oc_idct8x8_6_v6
  759. CMP r2, #10
  760. BLE oc_idct8x8_10_v6
  761. oc_idct8x8_slow_v6:
  762. STMFD r13!,{r4-r11,r14}
  763. SUB r13,r13,#64*2
  764. @ Row transforms
  765. STR r0, [r13,#-4]!
  766. ADD r0, r13, #4 @ Write to temp storage.
  767. BL idct8_8core_v6
  768. BL idct8_8core_v6
  769. BL idct8_8core_v6
  770. BL idct8_8core_v6
  771. LDR r0, [r13], #4 @ Write to the final destination.
  772. @ Clear input data for next block (decoder only).
  773. SUB r2, r1, #8*16
  774. CMP r0, r2
  775. MOV r1, r13 @ And read from temp storage.
  776. BEQ oc_idct8x8_slow_v6_cols
  777. MOV r4, #0
  778. MOV r5, #0
  779. STRD r4, [r2], #8
  780. STRD r4, [r2], #8
  781. STRD r4, [r2], #8
  782. STRD r4, [r2], #8
  783. STRD r4, [r2], #8
  784. STRD r4, [r2], #8
  785. STRD r4, [r2], #8
  786. STRD r4, [r2], #8
  787. STRD r4, [r2], #8
  788. STRD r4, [r2], #8
  789. STRD r4, [r2], #8
  790. STRD r4, [r2], #8
  791. STRD r4, [r2], #8
  792. STRD r4, [r2], #8
  793. STRD r4, [r2], #8
  794. STRD r4, [r2], #8
  795. oc_idct8x8_slow_v6_cols:
  796. @ Column transforms
  797. BL idct8_8core_down_v6
  798. BL idct8_8core_down_v6
  799. BL idct8_8core_down_v6
  800. BL idct8_8core_down_v6
  801. ADD r13,r13,#64*2
  802. LDMFD r13!,{r4-r11,PC}
  803. oc_idct8x8_10_v6:
  804. STMFD r13!,{r4-r11,r14}
  805. SUB r13,r13,#64*2+4
  806. @ Row transforms
  807. MOV r2, r13
  808. STR r0, [r13,#-4]!
  809. AND r0, r2, #4 @ Align the stack.
  810. ADD r0, r0, r2 @ Write to temp storage.
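@ r2+(r2&4) rounds the scratch pointer up to a multiple of 8, presumably so
@ the LDRD/STRD doubleword accesses on the scratch buffer are 8-byte aligned.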
  811. BL idct4_3core_v6
  812. BL idct2_1core_v6
  813. LDR r0, [r13], #4 @ Write to the final destination.
  814. @ Clear input data for next block (decoder only).
  815. SUB r2, r1, #4*16
  816. CMP r0, r2
  817. AND r1, r13,#4 @ Align the stack.
  818. BEQ oc_idct8x8_10_v6_cols
  819. MOV r4, #0
  820. MOV r5, #0
  821. STRD r4, [r2]
  822. STRD r4, [r2,#16]
  823. STR r4, [r2,#32]
  824. STR r4, [r2,#48]
  825. oc_idct8x8_10_v6_cols:
  826. @ Column transforms
  827. ADD r1, r1, r13 @ And read from temp storage.
  828. BL idct4_4core_down_v6
  829. BL idct4_4core_down_v6
  830. BL idct4_4core_down_v6
  831. BL idct4_4core_down_v6
  832. ADD r13,r13,#64*2+4
  833. LDMFD r13!,{r4-r11,PC}
  834. oc_idct8x8_3_v6:
  835. STMFD r13!,{r4-r8,r14}
  836. SUB r13,r13,#64*2
  837. @ Row transforms
  838. MOV r8, r0
  839. MOV r0, r13 @ Write to temp storage.
  840. BL idct2_1core_v6
  841. @ Clear input data for next block (decoder only).
  842. SUB r0, r1, #2*16
  843. CMP r0, r8
  844. MOV r1, r13 @ Read from temp storage.
  845. MOVNE r4, #0
  846. STRNE r4, [r0]
  847. STRNE r4, [r0,#16]
  848. MOVNE r0, r8 @ Write to the final destination.
  849. @ Column transforms
  850. BL idct2_2core_down_v6
  851. BL idct2_2core_down_v6
  852. BL idct2_2core_down_v6
  853. BL idct2_2core_down_v6
  854. ADD r13,r13,#64*2
  855. LDMFD r13!,{r4-r8,PC}
  856. idct2_1core_v6:
  857. @ r0 = ogg_int16_t *_y (destination)
  858. @ r1 = const ogg_int16_t *_x (source)
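@ Notation: <a|b> is a packed word with a in the top 16 bits and b in the
@ bottom 16 bits; x[r,c] is coefficient c of row r. This core transforms two
@ rows at once, the first with two nonzero coefficients and the second with
@ just a DC.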
  859. @ Stage 1:
  860. LDR r2, [r1], #16 @ r2 = <x[0,1]|x[0,0]>
  861. LDR r3, OC_C4S4
  862. LDRSH r6, [r1], #16 @ r6 = x[1,0]
  863. SMULWB r12,r3, r2 @ r12= t[0,0]=OC_C4S4*x[0,0]>>16
  864. LDRD r4, OC_C7S1 @ r4 = OC_C7S1; r5 = OC_C1S7
  865. SMULWB r6, r3, r6 @ r6 = t[1,0]=OC_C4S4*x[1,0]>>16
  866. SMULWT r4, r4, r2 @ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
  867. SMULWT r7, r5, r2 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  868. @ Stage 2:
  869. SMULWB r5, r3, r4 @ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  870. PKHBT r12,r12,r6, LSL #16 @ r12= <t[1,0]|t[0,0]>
  871. SMULWB r6, r3, r7 @ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  872. PKHBT r7, r7, r3 @ r7 = <0|t[0,7]>
  873. @ Stage 3:
  874. PKHBT r5, r6, r5, LSL #16 @ r5 = <t[0,5]|t[0,6]>
  875. PKHBT r4, r4, r3 @ r4 = <0|t[0,4]>
  876. SADDSUBX r5, r5, r5 @ r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
  877. @ Stage 4:
  878. PKHTB r6, r3, r5, ASR #16 @ r6 = <0|t[0,6]>
  879. PKHBT r5, r5, r3 @ r5 = <0|t[0,5]>
  880. SADD16 r3, r12,r7 @ r3 = t[0]+t[7]
  881. STR r3, [r0], #4 @ y[0<<3] = t[0]+t[7]
  882. SADD16 r3, r12,r6 @ r3 = t[0]+t[6]
  883. STR r3, [r0, #12] @ y[1<<3] = t[0]+t[6]
  884. SADD16 r3, r12,r5 @ r3 = t[0]+t[5]
  885. STR r3, [r0, #28] @ y[2<<3] = t[0]+t[5]
  886. SADD16 r3, r12,r4 @ r3 = t[0]+t[4]
  887. STR r3, [r0, #44] @ y[3<<3] = t[0]+t[4]
  888. SSUB16 r4, r12,r4 @ r4 = t[0]-t[4]
  889. STR r4, [r0, #60] @ y[4<<3] = t[0]-t[4]
  890. SSUB16 r5, r12,r5 @ r5 = t[0]-t[5]
  891. STR r5, [r0, #76] @ y[5<<3] = t[0]-t[5]
  892. SSUB16 r6, r12,r6 @ r6 = t[0]-t[6]
  893. STR r6, [r0, #92] @ y[6<<3] = t[0]-t[6]
  894. SSUB16 r7, r12,r7 @ r7 = t[0]-t[7]
  895. STR r7, [r0, #108] @ y[7<<3] = t[0]-t[7]
  896. MOV PC,r14
  897. .endif
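@ DCT constants in 16.16 fixed point: OC_CiS(8-i) = round(65536*cos(i*pi/16))
@ (equivalently 65536*sin((8-i)*pi/16)); e.g. OC_C4S4 = 46341 ~= 65536/sqrt(2).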
  898. .balign 8
  899. OC_C7S1:
  900. .word 12785 @ 31F1
  901. OC_C1S7:
  902. .word 64277 @ FB15
  903. OC_C6S2:
  904. .word 25080 @ 61F8
  905. OC_C2S6:
  906. .word 60547 @ EC83
  907. OC_C5S3:
  908. .word 36410 @ 8E3A
  909. OC_C3S5:
  910. .word 54491 @ D4DB
  911. OC_C4S4:
  912. .word 46341 @ B505
  913. .if OC_ARM_ASM_MEDIA
  914. idct2_2core_down_v6:
  915. @ r0 = ogg_int16_t *_y (destination)
  916. @ r1 = const ogg_int16_t *_x (source)
  917. @ Stage 1:
  918. LDR r2, [r1], #16 @ r2 = <x[0,1]|x[0,0]>
  919. LDR r3, OC_C4S4
  920. MOV r7 ,#8 @ r7 = 8
  921. LDR r6, [r1], #16 @ r6 = <x[1,1]|x[1,0]>
  922. SMLAWB r12,r3, r2, r7 @ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
  923. LDRD r4, OC_C7S1 @ r4 = OC_C7S1; r5 = OC_C1S7
  924. SMLAWB r7, r3, r6, r7 @ r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
  925. SMULWT r5, r5, r2 @ r5 = t[0,7]=OC_C1S7*x[0,1]>>16
  926. PKHBT r12,r12,r7, LSL #16 @ r12= <t[1,0]+8|t[0,0]+8>
  927. SMULWT r4, r4, r2 @ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
  928. @ Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
  929. PKHBT r7, r5, r5, LSL #16 @ r7 = <t[0,7]|t[0,7]>
  930. @ Stage 2:
  931. SMULWB r6, r3, r7 @ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  932. PKHBT r4, r4, r4, LSL #16 @ r4 = <t[0,4]|t[0,4]>
  933. SMULWT r2, r3, r7 @ r2 = t[1,6]=OC_C4S4*t[1,7]>>16
  934. SMULWB r5, r3, r4 @ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  935. PKHBT r6, r6, r2, LSL #16 @ r6 = <t[1,6]|t[0,6]>
  936. SMULWT r2, r3, r4 @ r2 = t[1,5]=OC_C4S4*t[1,4]>>16
  937. PKHBT r2, r5, r2, LSL #16 @ r2 = <t[1,5]|t[0,5]>
  938. @ Stage 3:
  939. SSUB16 r5, r6, r2 @ r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
  940. SADD16 r6, r6, r2 @ r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
  941. @ Stage 4:
  942. SADD16 r2, r12,r7 @ r2 = t[0]+t[7]+8
  943. MOV r3, r2, ASR #4
  944. MOV r2, r2, LSL #16
  945. PKHTB r3, r3, r2, ASR #20 @ r3 = t[0]+t[7]+8>>4
  946. STR r3, [r0], #4 @ y[0<<3] = t[0]+t[7]+8>>4
  947. SADD16 r2, r12,r6 @ r2 = t[0]+t[6]+8
  948. MOV r3, r2, ASR #4
  949. MOV r2, r2, LSL #16
  950. PKHTB r3, r3, r2, ASR #20 @ r3 = t[0]+t[6]+8>>4
  951. STR r3, [r0, #12] @ y[1<<3] = t[0]+t[6]+8>>4
  952. SADD16 r2, r12,r5 @ r2 = t[0]+t[5]+8
  953. MOV r3, r2, ASR #4
  954. MOV r2, r2, LSL #16
  955. PKHTB r3, r3, r2, ASR #20 @ r3 = t[0]+t[5]+8>>4
  956. STR r3, [r0, #28] @ y[2<<3] = t[0]+t[5]+8>>4
  957. SADD16 r2, r12,r4 @ r2 = t[0]+t[4]+8
  958. MOV r3, r2, ASR #4
  959. MOV r2, r2, LSL #16
  960. PKHTB r3, r3, r2, ASR #20 @ r3 = t[0]+t[4]+8>>4
  961. STR r3, [r0, #44] @ y[3<<3] = t[0]+t[4]+8>>4
  962. SSUB16 r4, r12,r4 @ r4 = t[0]-t[4]+8
  963. MOV r3, r4, ASR #4
  964. MOV r4, r4, LSL #16
  965. PKHTB r3, r3, r4, ASR #20 @ r3 = t[0]-t[4]+8>>4
  966. STR r3, [r0, #60] @ y[4<<3] = t[0]-t[4]+8>>4
  967. SSUB16 r5, r12,r5 @ r5 = t[0]-t[5]+8
  968. MOV r3, r5, ASR #4
  969. MOV r5, r5, LSL #16
  970. PKHTB r3, r3, r5, ASR #20 @ r3 = t[0]-t[5]+8>>4
  971. STR r3, [r0, #76] @ y[5<<3] = t[0]-t[5]+8>>4
  972. SSUB16 r6, r12,r6 @ r6 = t[0]-t[6]+8
  973. MOV r3, r6, ASR #4
  974. MOV r6, r6, LSL #16
  975. PKHTB r3, r3, r6, ASR #20 @ r3 = t[0]-t[6]+8>>4
  976. STR r3, [r0, #92] @ y[6<<3] = t[0]-t[6]+8>>4
  977. SSUB16 r7, r12,r7 @ r7 = t[0]-t[7]+8
  978. MOV r3, r7, ASR #4
  979. MOV r7, r7, LSL #16
  980. PKHTB r3, r3, r7, ASR #20 @ r3 = t[0]-t[7]+8>>4
  981. STR r3, [r0, #108] @ y[7<<3] = t[0]-t[7]+8>>4
  982. MOV PC,r14
  983. @ In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
  984. @ pay for increased branch mis-prediction to get here, but in practice it
  985. @ doesn't seem to slow anything down to take it out, and it's less code this
  986. @ way.
  987. .if 0
  988. oc_idct8x8_6_v6:
  989. STMFD r13!,{r4-r8,r10,r11,r14}
  990. SUB r13,r13,#64*2+4
  991. @ Row transforms
  992. MOV r8, r0
  993. AND r0, r13,#4 @ Align the stack.
  994. ADD r0, r0, r13 @ Write to temp storage.
  995. BL idct3_2core_v6
  996. BL idct1core_v6
  997. @ Clear input data for next block (decoder only).
  998. SUB r0, r1, #3*16
  999. CMP r0, r8
  1000. AND r1, r13,#4 @ Align the stack.
  1001. BEQ oc_idct8x8_6_v6_cols
  1002. MOV r4, #0
  1003. MOV r5, #0
  1004. STRD r4, [r0]
  1005. STR r4, [r0,#16]
  1006. STR r4, [r0,#32]
  1007. MOV r0, r8 @ Write to the final destination.
  1008. oc_idct8x8_6_v6_cols:
  1009. @ Column transforms
  1010. ADD r1, r1, r13 @ And read from temp storage.
  1011. BL idct3_3core_down_v6
  1012. BL idct3_3core_down_v6
  1013. BL idct3_3core_down_v6
  1014. BL idct3_3core_down_v6
  1015. ADD r13,r13,#64*2+4
  1016. LDMFD r13!,{r4-r8,r10,r11,PC}
  1017. idct1core_v6:
  1018. @ r0 = ogg_int16_t *_y (destination)
  1019. @ r1 = const ogg_int16_t *_x (source)
  1020. LDRSH r3, [r1], #16
  1021. MOV r12,#0x05
  1022. ORR r12,r12,#0xB500
  1023. MUL r3, r12, r3
  1024. @ Stall ?
  1025. MOV r3, r3, ASR #16
  1026. @ Don't need to actually store the odd lines; they won't be read.
  1027. STRH r3, [r0], #2
  1028. STRH r3, [r0, #30]
  1029. STRH r3, [r0, #62]
  1030. STRH r3, [r0, #94]
  1031. MOV PC,R14
  1032. idct3_2core_v6:
  1033. @ r0 = ogg_int16_t *_y (destination)
  1034. @ r1 = const ogg_int16_t *_x (source)
  1035. @ Stage 1:
  1036. LDRD r4, [r1], #16 @ r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
  1037. LDRD r10,OC_C6S2_3_v6 @ r10= OC_C6S2; r11= OC_C2S6
  1038. @ Stall
  1039. SMULWB r3, r11,r5 @ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1040. LDR r11,OC_C4S4
  1041. SMULWB r2, r10,r5 @ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1042. LDR r5, [r1], #16 @ r5 = <x[1,1]|x[1,0]>
  1043. SMULWB r12,r11,r4 @ r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
  1044. LDRD r6, OC_C7S1_3_v6 @ r6 = OC_C7S1; r7 = OC_C1S7
  1045. SMULWB r10,r11,r5 @ r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
  1046. PKHBT r12,r12,r10,LSL #16 @ r12= <t[1,0]|t[0,0]>
  1047. SMULWT r10,r7, r5 @ r10= t[1,7]=OC_C1S7*x[1,1]>>16
  1048. PKHBT r2, r2, r11 @ r2 = <0|t[0,2]>
  1049. SMULWT r7, r7, r4 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1050. PKHBT r3, r3, r11 @ r3 = <0|t[0,3]>
  1051. SMULWT r5, r6, r5 @ r5 = t[1,4]=OC_C7S1*x[1,1]>>16
  1052. PKHBT r7, r7, r10,LSL #16 @ r7 = <t[1,7]|t[0,7]>
  1053. SMULWT r4, r6, r4 @ r4 = t[0,4]=OC_C7S1*x[0,1]>>16
  1054. @ Stage 2:
  1055. SMULWB r6, r11,r7 @ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  1056. PKHBT r4, r4, r5, LSL #16 @ r4 = <t[1,4]|t[0,4]>
  1057. SMULWT r10,r11,r7 @ r10= t[1,6]=OC_C4S4*t[1,7]>>16
  1058. SMULWB r5, r11,r4 @ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  1059. PKHBT r6, r6, r10,LSL #16 @ r6 = <t[1,6]|t[0,6]>
  1060. SMULWT r10,r11,r4 @ r10= t[1,5]=OC_C4S4*t[1,4]>>16
  1061. @ Stage 3:
  1062. B idct4_3core_stage3_v6
  1063. @ Another copy so the LDRD offsets are less than +/- 255.
  1064. .balign 8
  1065. OC_C7S1_3_v6:
  1066. .word 12785 @ 31F1
  1067. OC_C1S7_3_v6:
  1068. .word 64277 @ FB15
  1069. OC_C6S2_3_v6:
  1070. .word 25080 @ 61F8
  1071. OC_C2S6_3_v6:
  1072. .word 60547 @ EC83
  1073. idct3_3core_down_v6:
  1074. @ r0 = ogg_int16_t *_y (destination)
  1075. @ r1 = const ogg_int16_t *_x (source)
  1076. @ Stage 1:
  1077. LDRD r10,[r1], #16 @ r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
  1078. LDRD r6, OC_C6S2_3_v6 @ r6 = OC_C6S2; r7 = OC_C2S6
  1079. LDR r4, [r1], #16 @ r4 = <x[1,1]|x[1,0]>
  1080. SMULWB r3, r7, r11 @ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1081. MOV r7,#8
  1082. SMULWB r2, r6, r11 @ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1083. LDR r11,OC_C4S4
  1084. SMLAWB r12,r11,r10,r7 @ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
  1085. @ Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
  1086. PKHBT r3, r3, r3, LSL #16 @ r3 = <t[0,3]|t[0,3]>
  1087. SMLAWB r5, r11,r4, r7 @ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
  1088. PKHBT r2, r2, r2, LSL #16 @ r2 = <t[0,2]|t[0,2]>
  1089. LDRD r6, OC_C7S1_3_v6 @ r6 = OC_C7S1; r7 = OC_C1S7
  1090. PKHBT r12,r12,r5, LSL #16 @ r12= <t[1,0]+8|t[0,0]+8>
  1091. SMULWT r5, r7, r4 @ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1092. SMULWT r7, r7, r10 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1093. SMULWT r10,r6, r10 @ r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1094. PKHBT r7, r7, r5, LSL #16 @ r7 = <t[1,7]|t[0,7]>
  1095. SMULWT r4, r6, r4 @ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1096. @ Stage 2:
  1097. SMULWB r6, r11,r7 @ r6 = t[0,6]=OC_C4S4*t[0,7]>>16
  1098. PKHBT r4, r10,r4, LSL #16 @ r4 = <t[1,4]|t[0,4]>
  1099. SMULWT r10,r11,r7 @ r10= t[1,6]=OC_C4S4*t[1,7]>>16
  1100. SMULWB r5, r11,r4 @ r5 = t[0,5]=OC_C4S4*t[0,4]>>16
  1101. PKHBT r6, r6, r10,LSL #16 @ r6 = <t[1,6]|t[0,6]>
  1102. SMULWT r10,r11,r4 @ r10= t[1,5]=OC_C4S4*t[1,4]>>16
  1103. @ Stage 3:
  1104. B idct4_4core_down_stage3_v6
  1105. .endif
  1106. idct4_3core_v6:
  1107. @ r0 = ogg_int16_t *_y (destination)
  1108. @ r1 = const ogg_int16_t *_x (source)
  1109. @ Stage 1:
  1110. LDRD r10,[r1], #16 @ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
  1111. LDRD r2, OC_C5S3_4_v6 @ r2 = OC_C5S3; r3 = OC_C3S5
  1112. LDRD r4, [r1], #16 @ r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
  1113. SMULWT r9, r3, r11 @ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
  1114. SMULWT r8, r2, r11 @ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
  1115. PKHBT r9, r9, r2 @ r9 = <0|t[0,6]>
  1116. LDRD r6, OC_C6S2_4_v6 @ r6 = OC_C6S2; r7 = OC_C2S6
  1117. PKHBT r8, r8, r2 @ r8 = <0|-t[0,5]>
  1118. SMULWB r3, r7, r11 @ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1119. SMULWB r2, r6, r11 @ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1120. LDR r11,OC_C4S4
  1121. SMULWB r12,r7, r5 @ r12= t[1,3]=OC_C2S6*x[1,2]>>16
  1122. SMULWB r5, r6, r5 @ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
  1123. PKHBT r3, r3, r12,LSL #16 @ r3 = <t[1,3]|t[0,3]>
  1124. SMULWB r12,r11,r10 @ r12= t[0,0]=OC_C4S4*x[0,0]>>16
  1125. PKHBT r2, r2, r5, LSL #16 @ r2 = <t[1,2]|t[0,2]>
  1126. SMULWB r5, r11,r4 @ r5 = t[1,0]=OC_C4S4*x[1,0]>>16
  1127. LDRD r6, OC_C7S1_4_v6 @ r6 = OC_C7S1; r7 = OC_C1S7
  1128. PKHBT r12,r12,r5, LSL #16 @ r12= <t[1,0]|t[0,0]>
  1129. SMULWT r5, r7, r4 @ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1130. SMULWT r7, r7, r10 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1131. SMULWT r10,r6, r10 @ r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1132. PKHBT r7, r7, r5, LSL #16 @ r7 = <t[1,7]|t[0,7]>
  1133. SMULWT r4, r6, r4 @ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1134. @ Stage 2:
  1135. SSUB16 r6, r7, r9 @ r6 = t[7]-t[6]
  1136. PKHBT r4, r10,r4, LSL #16 @ r4 = <t[1,4]|t[0,4]>
  1137. SADD16 r7, r7, r9 @ r7 = t[7]=t[7]+t[6]
  1138. SMULWT r9, r11,r6 @ r9 = t[1,6]=OC_C4S4*r6T>>16
  1139. SADD16 r5, r4, r8 @ r5 = t[4]-t[5]
  1140. SMULWB r6, r11,r6 @ r6 = t[0,6]=OC_C4S4*r6B>>16
  1141. SSUB16 r4, r4, r8 @ r4 = t[4]=t[4]+t[5]
  1142. SMULWT r10,r11,r5 @ r10= t[1,5]=OC_C4S4*r5T>>16
  1143. PKHBT r6, r6, r9, LSL #16 @ r6 = <t[1,6]|t[0,6]>
  1144. SMULWB r5, r11,r5 @ r5 = t[0,5]=OC_C4S4*r5B>>16
  1145. @ Stage 3:
  1146. idct4_3core_stage3_v6:
  1147. SADD16 r11,r12,r2 @ r11= t[1]=t[0]+t[2]
  1148. PKHBT r10,r5, r10,LSL #16 @ r10= <t[1,5]|t[0,5]>
  1149. SSUB16 r2, r12,r2 @ r2 = t[2]=t[0]-t[2]
  1150. idct4_3core_stage3_5_v6:
  1151. SSUB16 r5, r6, r10 @ r5 = t[5]=t[6]-t[5]
  1152. SADD16 r6, r6, r10 @ r6 = t[6]=t[6]+t[5]
  1153. SADD16 r10,r12,r3 @ r10= t[0]=t[0]+t[3]
  1154. SSUB16 r3, r12,r3 @ r3 = t[3]=t[0]-t[3]
  1155. @ Stage 4:
  1156. SADD16 r12,r10,r7 @ r12= t[0]+t[7]
  1157. STR r12,[r0], #4 @ y[0<<3] = t[0]+t[7]
  1158. SADD16 r12,r11,r6 @ r12= t[1]+t[6]
  1159. STR r12,[r0, #12] @ y[1<<3] = t[1]+t[6]
  1160. SADD16 r12,r2, r5 @ r12= t[2]+t[5]
  1161. STR r12,[r0, #28] @ y[2<<3] = t[2]+t[5]
  1162. SADD16 r12,r3, r4 @ r12= t[3]+t[4]
  1163. STR r12,[r0, #44] @ y[3<<3] = t[3]+t[4]
  1164. SSUB16 r4, r3, r4 @ r4 = t[3]-t[4]
  1165. STR r4, [r0, #60] @ y[4<<3] = t[3]-t[4]
  1166. SSUB16 r5, r2, r5 @ r5 = t[2]-t[5]
  1167. STR r5, [r0, #76] @ y[5<<3] = t[2]-t[5]
  1168. SSUB16 r6, r11,r6 @ r6 = t[1]-t[6]
  1169. STR r6, [r0, #92] @ y[6<<3] = t[1]-t[6]
  1170. SSUB16 r7, r10,r7 @ r7 = t[0]-t[7]
  1171. STR r7, [r0, #108] @ y[7<<3] = t[0]-t[7]
  1172. MOV PC,r14
  1173. @ Another copy so the LDRD offsets are less than +/- 255.
  1174. .balign 8
  1175. OC_C7S1_4_v6:
  1176. .word 12785 @ 31F1
  1177. OC_C1S7_4_v6:
  1178. .word 64277 @ FB15
  1179. OC_C6S2_4_v6:
  1180. .word 25080 @ 61F8
  1181. OC_C2S6_4_v6:
  1182. .word 60547 @ EC83
  1183. OC_C5S3_4_v6:
  1184. .word 36410 @ 8E3A
  1185. OC_C3S5_4_v6:
  1186. .word 54491 @ D4DB
  1187. idct4_4core_down_v6:
  1188. @ r0 = ogg_int16_t *_y (destination)
  1189. @ r1 = const ogg_int16_t *_x (source)
  1190. @ Stage 1:
  1191. LDRD r10,[r1], #16 @ r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
  1192. LDRD r2, OC_C5S3_4_v6 @ r2 = OC_C5S3; r3 = OC_C3S5
  1193. LDRD r4, [r1], #16 @ r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
  1194. SMULWT r9, r3, r11 @ r9 = t[0,6]=OC_C3S5*x[0,3]>>16
  1195. LDRD r6, OC_C6S2_4_v6 @ r6 = OC_C6S2; r7 = OC_C2S6
  1196. SMULWT r8, r2, r11 @ r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
  1197. @ Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
  1198. PKHBT r9, r9, r9, LSL #16 @ r9 = <t[0,6]|t[0,6]>
  1199. SMULWB r3, r7, r11 @ r3 = t[0,3]=OC_C2S6*x[0,2]>>16
  1200. PKHBT r8, r8, r8, LSL #16 @ r8 = <-t[0,5]|-t[0,5]>
  1201. SMULWB r2, r6, r11 @ r2 = t[0,2]=OC_C6S2*x[0,2]>>16
  1202. LDR r11,OC_C4S4
  1203. SMULWB r12,r7, r5 @ r12= t[1,3]=OC_C2S6*x[1,2]>>16
  1204. MOV r7,#8
  1205. SMULWB r5, r6, r5 @ r5 = t[1,2]=OC_C6S2*x[1,2]>>16
  1206. PKHBT r3, r3, r12,LSL #16 @ r3 = <t[1,3]|t[0,3]>
  1207. SMLAWB r12,r11,r10,r7 @ r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
  1208. PKHBT r2, r2, r5, LSL #16 @ r2 = <t[1,2]|t[0,2]>
  1209. SMLAWB r5, r11,r4 ,r7 @ r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
  1210. LDRD r6, OC_C7S1_4_v6 @ r6 = OC_C7S1; r7 = OC_C1S7
  1211. PKHBT r12,r12,r5, LSL #16 @ r12= <t[1,0]+8|t[0,0]+8>
  1212. SMULWT r5, r7, r4 @ r5 = t[1,7]=OC_C1S7*x[1,1]>>16
  1213. SMULWT r7, r7, r10 @ r7 = t[0,7]=OC_C1S7*x[0,1]>>16
  1214. SMULWT r10,r6, r10 @ r10= t[0,4]=OC_C7S1*x[0,1]>>16
  1215. PKHBT r7, r7, r5, LSL #16 @ r7 = <t[1,7]|t[0,7]>
  1216. SMULWT r4, r6, r4 @ r4 = t[1,4]=OC_C7S1*x[1,1]>>16
  1217. @ Stage 2:
  1218. SSUB16 r6, r7, r9 @ r6 = t[7]-t[6]
  1219. PKHBT r4, r10,r4, LSL #16 @ r4 = <t[1,4]|t[0,4]>
  1220. SADD16 r7, r7, r9 @ r7 = t[7]=t[7]+t[6]
  1221. SMULWT r9, r11,r6 @ r9 = t[1,6]=OC_C4S4*r6T>>16
  1222. SADD16 r5, r4, r8 @ r5 = t[4]-t[5]
  1223. SMULWB r6, r11,r6 @ r6 = t[0,6]=OC_C4S4*r6B>>16
  1224. SSUB16 r4, r4, r8 @ r4 = t[4]=t[4]+t[5]
  1225. SMULWT r10,r11,r5 @ r10= t[1,5]=OC_C4S4*r5T>>16
  1226. PKHBT r6, r6, r9, LSL #16 @ r6 = <t[1,6]|t[0,6]>
  1227. SMULWB r5, r11,r5 @ r5 = t[0,5]=OC_C4S4*r5B>>16
  1228. @ Stage 3:
  1229. idct4_4core_down_stage3_v6:
  1230. SADD16 r11,r12,r2 @ r11= t[1]+8=t[0]+t[2]+8
  1231. PKHBT r10,r5, r10,LSL #16 @ r10= <t[1,5]|t[0,5]>
  1232. SSUB16 r2, r12,r2 @ r2 = t[2]+8=t[0]-t[2]+8
  1233. B idct8_8core_down_stage3_5_v6
  1234. idct8_8core_v6:
  1235. STMFD r13!,{r0,r14}
  1236. @ Stage 1:
        @5-6 rotation by 3pi/16
        LDRD r10,OC_C5S3_4_v6 @ r10= OC_C5S3, r11= OC_C3S5
        LDR r4, [r1,#8] @ r4 = <x[0,5]|x[0,4]>
        LDR r7, [r1,#24] @ r7 = <x[1,5]|x[1,4]>
        SMULWT r5, r11,r4 @ r5 = OC_C3S5*x[0,5]>>16
        LDR r0, [r1,#4] @ r0 = <x[0,3]|x[0,2]>
        SMULWT r3, r11,r7 @ r3 = OC_C3S5*x[1,5]>>16
        LDR r12,[r1,#20] @ r12= <x[1,3]|x[1,2]>
        SMULWT r6, r11,r0 @ r6 = OC_C3S5*x[0,3]>>16
        SMULWT r11,r11,r12 @ r11= OC_C3S5*x[1,3]>>16
        SMLAWT r6, r10,r4, r6 @ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
        PKHBT r5, r5, r3, LSL #16 @ r5 = <r3|r5>
        SMLAWT r11,r10,r7, r11 @ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
        PKHBT r4, r4, r7, LSL #16 @ r4 = <x[1,4]|x[0,4]>
        SMULWT r3, r10,r0 @ r3 = OC_C5S3*x[0,3]>>16
        PKHBT r6, r6, r11,LSL #16 @ r6 = <t[1,6]|t[0,6]>
        SMULWT r8, r10,r12 @ r8 = OC_C5S3*x[1,3]>>16
        @2-3 rotation by 6pi/16
        LDRD r10,OC_C6S2_4_v6 @ r10= OC_C6S2, r11= OC_C2S6
        PKHBT r3, r3, r8, LSL #16 @ r3 = <r8|r3>
        LDR r8, [r1,#12] @ r8 = <x[0,7]|x[0,6]>
        SMULWB r2, r10,r0 @ r2 = OC_C6S2*x[0,2]>>16
        SSUB16 r5, r5, r3 @ r5 = <t[1,5]|t[0,5]>
        SMULWB r9, r10,r12 @ r9 = OC_C6S2*x[1,2]>>16
        LDR r7, [r1,#28] @ r7 = <x[1,7]|x[1,6]>
        SMULWB r3, r10,r8 @ r3 = OC_C6S2*x[0,6]>>16
        SMULWB r10,r10,r7 @ r10= OC_C6S2*x[1,6]>>16
        PKHBT r2, r2, r9, LSL #16 @ r2 = <r9|r2>
        SMLAWB r3, r11,r0, r3 @ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
        SMLAWB r10,r11,r12,r10 @ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
        SMULWB r9, r11,r8 @ r9 = OC_C2S6*x[0,6]>>16
        PKHBT r3, r3, r10,LSL #16 @ r3 = <t[1,3]|t[0,3]>
        SMULWB r12,r11,r7 @ r12= OC_C2S6*x[1,6]>>16
        @4-7 rotation by 7pi/16
        LDRD r10,OC_C7S1_8_v6 @ r10= OC_C7S1, r11= OC_C1S7
        PKHBT r9, r9, r12,LSL #16 @ r9 = <r12|r9>
        LDR r0, [r1],#16 @ r0 = <x[0,1]|x[0,0]>
        PKHTB r7, r7, r8, ASR #16 @ r7 = <x[1,7]|x[0,7]>
        SSUB16 r2, r2, r9 @ r2 = <t[1,2]|t[0,2]>
        SMULWB r9, r10,r7 @ r9 = OC_C7S1*x[0,7]>>16
        LDR r14,[r1],#16 @ r14= <x[1,1]|x[1,0]>
        SMULWT r12,r10,r7 @ r12= OC_C7S1*x[1,7]>>16
        SMULWT r8, r10,r0 @ r8 = OC_C7S1*x[0,1]>>16
        SMULWT r10,r10,r14 @ r10= OC_C7S1*x[1,1]>>16
        SMLAWT r9, r11,r0, r9 @ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
        PKHBT r8, r8, r10,LSL #16 @ r8 = <r10|r8>
        SMLAWT r12,r11,r14,r12 @ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
        PKHBT r0, r0, r14,LSL #16 @ r0 = <x[1,0]|x[0,0]>
        SMULWB r10,r11,r7 @ r10= OC_C1S7*x[0,7]>>16
        PKHBT r9, r9, r12,LSL #16 @ r9 = <t[1,7]|t[0,7]>
        SMULWT r12,r11,r7 @ r12= OC_C1S7*x[1,7]>>16
        @0-1 butterfly
        LDR r11,OC_C4S4
        PKHBT r10,r10,r12,LSL #16 @ r10= <r12|r10>
        SADD16 r7, r0, r4 @ r7 = x[0]+x[4]
        SSUB16 r10,r8, r10 @ r10= <t[1,4]|t[0,4]>
        SSUB16 r4, r0, r4 @ r4 = x[0]-x[4]
        SMULWB r8, r11,r7 @ r8 = t[0,0]=OC_C4S4*r7B>>16
        SMULWT r12,r11,r7 @ r12= t[1,0]=OC_C4S4*r7T>>16
        SMULWB r7, r11,r4 @ r7 = t[0,1]=OC_C4S4*r4B>>16
        PKHBT r12,r8, r12,LSL #16 @ r12= <t[1,0]|t[0,0]>
        SMULWT r8, r11,r4 @ r8 = t[1,1]=OC_C4S4*r4T>>16
        @ Stage 2:
        SADD16 r4, r10,r5 @ r4 = t[4]=t[4]+t[5]
        PKHBT r8, r7, r8, LSL #16 @ r8 = <t[1,1]|t[0,1]>
        SSUB16 r5, r10,r5 @ r5 = t[4]-t[5]
        SMULWB r10,r11,r5 @ r10= t[0,5]=OC_C4S4*r5B>>16
        SADD16 r7, r9, r6 @ r7 = t[7]=t[7]+t[6]
        SMULWT r5, r11,r5 @ r5 = t[1,5]=OC_C4S4*r5T>>16
        SSUB16 r6, r9, r6 @ r6 = t[7]-t[6]
        SMULWB r9, r11,r6 @ r9 = t[0,6]=OC_C4S4*r6B>>16
        PKHBT r10,r10,r5, LSL #16 @ r10= <t[1,5]|t[0,5]>
        SMULWT r6, r11,r6 @ r6 = t[1,6]=OC_C4S4*r6T>>16
        @ Stage 3:
        SADD16 r11,r8, r2 @ r11= t[1]=t[1]+t[2]
        PKHBT r6, r9, r6, LSL #16 @ r6 = <t[1,6]|t[0,6]>
        SSUB16 r2, r8, r2 @ r2 = t[2]=t[1]-t[2]
        LDMFD r13!,{r0,r14}
        B idct4_3core_stage3_5_v6
        @ Another copy so the LDRD offsets are less than +/- 255.
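        @ (LDRD can only encode an 8-bit immediate offset, so PC-relative loads
        @ of these constants must stay within +/-255 bytes of the pool.)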
        .balign 8
OC_C7S1_8_v6:
        .word 12785 @ 31F1
OC_C1S7_8_v6:
        .word 64277 @ FB15
OC_C6S2_8_v6:
        .word 25080 @ 61F8
OC_C2S6_8_v6:
        .word 60547 @ EC83
OC_C5S3_8_v6:
        .word 36410 @ 8E3A
OC_C3S5_8_v6:
        .word 54491 @ D4DB
idct8_8core_down_v6:
        STMFD r13!,{r0,r14}
        @ Stage 1:
        @5-6 rotation by 3pi/16
        LDRD r10,OC_C5S3_8_v6 @ r10= OC_C5S3, r11= OC_C3S5
        LDR r4, [r1,#8] @ r4 = <x[0,5]|x[0,4]>
        LDR r7, [r1,#24] @ r7 = <x[1,5]|x[1,4]>
        SMULWT r5, r11,r4 @ r5 = OC_C3S5*x[0,5]>>16
        LDR r0, [r1,#4] @ r0 = <x[0,3]|x[0,2]>
        SMULWT r3, r11,r7 @ r3 = OC_C3S5*x[1,5]>>16
        LDR r12,[r1,#20] @ r12= <x[1,3]|x[1,2]>
        SMULWT r6, r11,r0 @ r6 = OC_C3S5*x[0,3]>>16
        SMULWT r11,r11,r12 @ r11= OC_C3S5*x[1,3]>>16
        SMLAWT r6, r10,r4, r6 @ r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
        PKHBT r5, r5, r3, LSL #16 @ r5 = <r3|r5>
        SMLAWT r11,r10,r7, r11 @ r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
        PKHBT r4, r4, r7, LSL #16 @ r4 = <x[1,4]|x[0,4]>
        SMULWT r3, r10,r0 @ r3 = OC_C5S3*x[0,3]>>16
        PKHBT r6, r6, r11,LSL #16 @ r6 = <t[1,6]|t[0,6]>
        SMULWT r8, r10,r12 @ r8 = OC_C5S3*x[1,3]>>16
        @2-3 rotation by 6pi/16
        LDRD r10,OC_C6S2_8_v6 @ r10= OC_C6S2, r11= OC_C2S6
        PKHBT r3, r3, r8, LSL #16 @ r3 = <r8|r3>
        LDR r8, [r1,#12] @ r8 = <x[0,7]|x[0,6]>
        SMULWB r2, r10,r0 @ r2 = OC_C6S2*x[0,2]>>16
        SSUB16 r5, r5, r3 @ r5 = <t[1,5]|t[0,5]>
        SMULWB r9, r10,r12 @ r9 = OC_C6S2*x[1,2]>>16
        LDR r7, [r1,#28] @ r7 = <x[1,7]|x[1,6]>
        SMULWB r3, r10,r8 @ r3 = OC_C6S2*x[0,6]>>16
        SMULWB r10,r10,r7 @ r10= OC_C6S2*x[1,6]>>16
        PKHBT r2, r2, r9, LSL #16 @ r2 = <r9|r2>
        SMLAWB r3, r11,r0, r3 @ r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
        SMLAWB r10,r11,r12,r10 @ r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
        SMULWB r9, r11,r8 @ r9 = OC_C2S6*x[0,6]>>16
        PKHBT r3, r3, r10,LSL #16 @ r3 = <t[1,3]|t[0,3]>
        SMULWB r12,r11,r7 @ r12= OC_C2S6*x[1,6]>>16
        @4-7 rotation by 7pi/16
        LDRD r10,OC_C7S1_8_v6 @ r10= OC_C7S1, r11= OC_C1S7
        PKHBT r9, r9, r12,LSL #16 @ r9 = <r12|r9>
        LDR r0, [r1],#16 @ r0 = <x[0,1]|x[0,0]>
        PKHTB r7, r7, r8, ASR #16 @ r7 = <x[1,7]|x[0,7]>
        SSUB16 r2, r2, r9 @ r2 = <t[1,2]|t[0,2]>
        SMULWB r9, r10,r7 @ r9 = OC_C7S1*x[0,7]>>16
        LDR r14,[r1],#16 @ r14= <x[1,1]|x[1,0]>
        SMULWT r12,r10,r7 @ r12= OC_C7S1*x[1,7]>>16
        SMULWT r8, r10,r0 @ r8 = OC_C7S1*x[0,1]>>16
        SMULWT r10,r10,r14 @ r10= OC_C7S1*x[1,1]>>16
        SMLAWT r9, r11,r0, r9 @ r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
        PKHBT r8, r8, r10,LSL #16 @ r8 = <r10|r8>
        SMLAWT r12,r11,r14,r12 @ r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
        PKHBT r0, r0, r14,LSL #16 @ r0 = <x[1,0]|x[0,0]>
        SMULWB r10,r11,r7 @ r10= OC_C1S7*x[0,7]>>16
        PKHBT r9, r9, r12,LSL #16 @ r9 = <t[1,7]|t[0,7]>
        SMULWT r12,r11,r7 @ r12= OC_C1S7*x[1,7]>>16
        @0-1 butterfly
        LDR r11,OC_C4S4
        MOV r14,#8
        PKHBT r10,r10,r12,LSL #16 @ r10= <r12|r10>
        SADD16 r7, r0, r4 @ r7 = x[0]+x[4]
        SSUB16 r10,r8, r10 @ r10= <t[1,4]|t[0,4]>
        SMLAWB r8, r11,r7, r14 @ r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
        SSUB16 r4, r0, r4 @ r4 = x[0]-x[4]
        SMLAWT r12,r11,r7, r14 @ r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
        SMLAWB r7, r11,r4, r14 @ r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
        PKHBT r12,r8, r12,LSL #16 @ r12= <t[1,0]+8|t[0,0]+8>
        SMLAWT r8, r11,r4, r14 @ r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
        @ Stage 2:
        SADD16 r4, r10,r5 @ r4 = t[4]=t[4]+t[5]
        PKHBT r8, r7, r8, LSL #16 @ r8 = <t[1,1]+8|t[0,1]+8>
        SSUB16 r5, r10,r5 @ r5 = t[4]-t[5]
        SMULWB r10,r11,r5 @ r10= t[0,5]=OC_C4S4*r5B>>16
        SADD16 r7, r9, r6 @ r7 = t[7]=t[7]+t[6]
        SMULWT r5, r11,r5 @ r5 = t[1,5]=OC_C4S4*r5T>>16
        SSUB16 r6, r9, r6 @ r6 = t[7]-t[6]
        SMULWB r9, r11,r6 @ r9 = t[0,6]=OC_C4S4*r6B>>16
        PKHBT r10,r10,r5, LSL #16 @ r10= <t[1,5]|t[0,5]>
        SMULWT r6, r11,r6 @ r6 = t[1,6]=OC_C4S4*r6T>>16
        @ Stage 3:
        SADD16 r11,r8, r2 @ r11= t[1]+8=t[1]+t[2]+8
        PKHBT r6, r9, r6, LSL #16 @ r6 = <t[1,6]|t[0,6]>
        SSUB16 r2, r8, r2 @ r2 = t[2]+8=t[1]-t[2]+8
        LDMFD r13!,{r0,r14}
idct8_8core_down_stage3_5_v6:
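        @ Shared tail for the *_down cores: finishes stage 3, runs stage 4, and
        @ stores each packed pair of results shifted down by 4 (the +8 rounding
        @ bias was already added into the t[...]+8 terms above).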
        SSUB16 r5, r6, r10 @ r5 = t[5]=t[6]-t[5]
        SADD16 r6, r6, r10 @ r6 = t[6]=t[6]+t[5]
        SADD16 r10,r12,r3 @ r10= t[0]+8=t[0]+t[3]+8
        SSUB16 r3, r12,r3 @ r3 = t[3]+8=t[0]-t[3]+8
        @ Stage 4:
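        @ Each register holds two 16-bit results that must each be shifted down
        @ by 4; a single ASR would bleed the top halfword into the bottom one.
        @ The MOV/MOV/PKHTB triple below shifts the halves independently: the
        @ ASR #4 yields the correct top half, and PKHTB ...,ASR #20 extracts
        @ (bottom<<16)>>20, i.e. the bottom half shifted down by 4.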
        SADD16 r12,r10,r7 @ r12= t[0]+t[7]+8
        SSUB16 r7, r10,r7 @ r7 = t[0]-t[7]+8
        MOV r10,r12,ASR #4
        MOV r12,r12,LSL #16
        PKHTB r10,r10,r12,ASR #20 @ r10= t[0]+t[7]+8>>4
        STR r10,[r0], #4 @ y[0<<3] = t[0]+t[7]+8>>4
        SADD16 r12,r11,r6 @ r12= t[1]+t[6]+8
        SSUB16 r6, r11,r6 @ r6 = t[1]-t[6]+8
        MOV r10,r12,ASR #4
        MOV r12,r12,LSL #16
        PKHTB r10,r10,r12,ASR #20 @ r10= t[1]+t[6]+8>>4
        STR r10,[r0, #12] @ y[1<<3] = t[1]+t[6]+8>>4
        SADD16 r12,r2, r5 @ r12= t[2]+t[5]+8
        SSUB16 r5, r2, r5 @ r5 = t[2]-t[5]+8
        MOV r10,r12,ASR #4
        MOV r12,r12,LSL #16
        PKHTB r10,r10,r12,ASR #20 @ r10= t[2]+t[5]+8>>4
        STR r10,[r0, #28] @ y[2<<3] = t[2]+t[5]+8>>4
        SADD16 r12,r3, r4 @ r12= t[3]+t[4]+8
        SSUB16 r4, r3, r4 @ r4 = t[3]-t[4]+8
        MOV r10,r12,ASR #4
        MOV r12,r12,LSL #16
        PKHTB r10,r10,r12,ASR #20 @ r10= t[3]+t[4]+8>>4
        STR r10,[r0, #44] @ y[3<<3] = t[3]+t[4]+8>>4
        MOV r10,r4, ASR #4
        MOV r4, r4, LSL #16
        PKHTB r10,r10,r4, ASR #20 @ r10= t[3]-t[4]+8>>4
        STR r10,[r0, #60] @ y[4<<3] = t[3]-t[4]+8>>4
        MOV r10,r5, ASR #4
        MOV r5, r5, LSL #16
        PKHTB r10,r10,r5, ASR #20 @ r10= t[2]-t[5]+8>>4
        STR r10,[r0, #76] @ y[5<<3] = t[2]-t[5]+8>>4
        MOV r10,r6, ASR #4
        MOV r6, r6, LSL #16
        PKHTB r10,r10,r6, ASR #20 @ r10= t[1]-t[6]+8>>4
        STR r10,[r0, #92] @ y[6<<3] = t[1]-t[6]+8>>4
        MOV r10,r7, ASR #4
        MOV r7, r7, LSL #16
        PKHTB r10,r10,r7, ASR #20 @ r10= t[0]-t[7]+8>>4
        STR r10,[r0, #108] @ y[7<<3] = t[0]-t[7]+8>>4
        MOV PC,r14
        .endif
        .if OC_ARM_ASM_NEON
        .global oc_idct8x8_1_neon
        .global oc_idct8x8_neon
        .balign 16
OC_IDCT_CONSTS_NEON:
        .short 8
        .short 64277 @ FB15 (C1S7)
        .short 60547 @ EC83 (C2S6)
        .short 54491 @ D4DB (C3S5)
        .short 46341 @ B505 (C4S4)
        .short 36410 @ 8E3A (C5S3)
        .short 25080 @ 61F8 (C6S2)
        .short 12785 @ 31F1 (C7S1)
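        @ These are round(2**16*cos(k*pi/16)) for k=1..7; e.g.
        @ 65536*cos(pi/4) ~= 46341 (C4S4).  The values at or above 0x8000 read
        @ as negative in the signed 16-bit multiplies below, which is why those
        @ products appear as (OC_C*x>>16)-x and the missing x is compensated
        @ for in the surrounding adds and subtracts.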
oc_idct8x8_1_neon:
        @ r0 = ogg_int16_t *_y
        @ r1 = ogg_uint16_t _dc
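        @ Replicates _dc across all 64 output coefficients: four 32-byte stores
        @ of Q0:Q1.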
        VDUP.S16 Q0, r1
        VMOV Q1, Q0
        VST1.64 {D0, D1, D2, D3}, [r0,:128]!
        VST1.64 {D0, D1, D2, D3}, [r0,:128]!
        VST1.64 {D0, D1, D2, D3}, [r0,:128]!
        VST1.64 {D0, D1, D2, D3}, [r0,:128]
        MOV PC, r14
oc_idct8x8_neon:
        @ r0 = ogg_int16_t *_y
        @ r1 = ogg_int16_t *_x
        @ r2 = int _last_zzi
        CMP r2, #10
        BLE oc_idct8x8_10_neon
oc_idct8x8_slow_neon:
        VPUSH {D8-D15}
        MOV r2, r1
        ADR r3, OC_IDCT_CONSTS_NEON
        @ Row transforms (input is pre-transposed)
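        @ (The explicit 8x8 transpose below then puts the data back into the
        @ order needed for the column pass.)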
        VLD1.64 {D16,D17,D18,D19}, [r2,:128]!
        VLD1.64 {D20,D21,D22,D23}, [r2,:128]!
        VLD1.64 {D24,D25,D26,D27}, [r2,:128]!
        VSUB.S16 Q1, Q8, Q12 @ Q1 = x[0]-x[4]
        VLD1.64 {D28,D29,D30,D31}, [r2,:128]
        VADD.S16 Q8, Q8, Q12 @ Q8 = x[0]+x[4]
        VLD1.64 {D0,D1}, [r3,:128]
        MOV r12, r14
        BL oc_idct8x8_stage123_neon
        @ Stage 4
        VSUB.S16 Q15,Q8, Q7 @ Q15 = y[7]=t[0]-t[7]
        VADD.S16 Q8, Q8, Q7 @ Q8 = y[0]=t[0]+t[7]
        VSUB.S16 Q14,Q9, Q3 @ Q14 = y[6]=t[1]-t[6]
        VADD.S16 Q9, Q9, Q3 @ Q9 = y[1]=t[1]+t[6]
        VSUB.S16 Q13,Q10,Q5 @ Q13 = y[5]=t[2]-t[5]
        VADD.S16 Q10,Q10,Q5 @ Q10 = y[2]=t[2]+t[5]
        VTRN.16 Q14,Q15
        VSUB.S16 Q12,Q11,Q4 @ Q12 = y[4]=t[3]-t[4]
        VADD.S16 Q11,Q11,Q4 @ Q11 = y[3]=t[3]+t[4]
        @ 8x8 Transpose
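        @ VTRN.16 swaps 16-bit lanes between register pairs, VTRN.32 swaps
        @ 32-bit lanes between pairs of pairs, and the VSWPs exchange D halves,
        @ completing a full 8x8 transpose of the block held in Q8-Q15.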
        VTRN.16 Q8, Q9
        VTRN.16 Q10,Q11
        VTRN.16 Q12,Q13
        VTRN.32 Q8, Q10
        VTRN.32 Q9, Q11
        VTRN.32 Q12,Q14
        VTRN.32 Q13,Q15
        VSWP D17,D24
        VSUB.S16 Q1, Q8, Q12 @ Q1 = x[0]-x[4]
        VSWP D19,D26
        VADD.S16 Q8, Q8, Q12 @ Q8 = x[0]+x[4]
        VSWP D21,D28
        VSWP D23,D30
        @ Column transforms
        BL oc_idct8x8_stage123_neon
        CMP r0,r1
        @ We have to put the return address back in the LR, or the branch
        @ predictor will not recognize the function return and mis-predict the
        @ entire call stack.
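        @ (r12 preserved it across both BLs; see the MOV r12,r14 before the
        @ first call.)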
        MOV r14, r12
        @ Stage 4
        VSUB.S16 Q15,Q8, Q7 @ Q15 = y[7]=t[0]-t[7]
        VADD.S16 Q8, Q8, Q7 @ Q8 = y[0]=t[0]+t[7]
        VSUB.S16 Q14,Q9, Q3 @ Q14 = y[6]=t[1]-t[6]
        VADD.S16 Q9, Q9, Q3 @ Q9 = y[1]=t[1]+t[6]
        VSUB.S16 Q13,Q10,Q5 @ Q13 = y[5]=t[2]-t[5]
        VADD.S16 Q10,Q10,Q5 @ Q10 = y[2]=t[2]+t[5]
        VSUB.S16 Q12,Q11,Q4 @ Q12 = y[4]=t[3]-t[4]
        VADD.S16 Q11,Q11,Q4 @ Q11 = y[3]=t[3]+t[4]
        BEQ oc_idct8x8_slow_neon_noclear
        VMOV.I8 Q2,#0
        VPOP {D8-D15}
        VMOV.I8 Q3,#0
        VRSHR.S16 Q8, Q8, #4 @ Q8 = y[0]+8>>4
        VST1.64 {D4, D5, D6, D7}, [r1,:128]!
        VRSHR.S16 Q9, Q9, #4 @ Q9 = y[1]+8>>4
        VRSHR.S16 Q10,Q10,#4 @ Q10 = y[2]+8>>4
        VST1.64 {D4, D5, D6, D7}, [r1,:128]!
        VRSHR.S16 Q11,Q11,#4 @ Q11 = y[3]+8>>4
        VRSHR.S16 Q12,Q12,#4 @ Q12 = y[4]+8>>4
        VST1.64 {D4, D5, D6, D7}, [r1,:128]!
        VRSHR.S16 Q13,Q13,#4 @ Q13 = y[5]+8>>4
        VRSHR.S16 Q14,Q14,#4 @ Q14 = y[6]+8>>4
        VST1.64 {D4, D5, D6, D7}, [r1,:128]
        VRSHR.S16 Q15,Q15,#4 @ Q15 = y[7]+8>>4
        VSTMIA r0, {D16-D31}
        MOV PC, r14
oc_idct8x8_slow_neon_noclear:
        VPOP {D8-D15}
        VRSHR.S16 Q8, Q8, #4 @ Q8 = y[0]+8>>4
        VRSHR.S16 Q9, Q9, #4 @ Q9 = y[1]+8>>4
        VRSHR.S16 Q10,Q10,#4 @ Q10 = y[2]+8>>4
        VRSHR.S16 Q11,Q11,#4 @ Q11 = y[3]+8>>4
        VRSHR.S16 Q12,Q12,#4 @ Q12 = y[4]+8>>4
        VRSHR.S16 Q13,Q13,#4 @ Q13 = y[5]+8>>4
        VRSHR.S16 Q14,Q14,#4 @ Q14 = y[6]+8>>4
        VRSHR.S16 Q15,Q15,#4 @ Q15 = y[7]+8>>4
        VSTMIA r0, {D16-D31}
        MOV PC, r14
oc_idct8x8_stage123_neon:
        @ Stages 1 & 2
        VMULL.S16 Q4, D18,D1[3]
        VMULL.S16 Q5, D19,D1[3]
        VMULL.S16 Q7, D30,D1[3]
        VMULL.S16 Q6, D31,D1[3]
        VMULL.S16 Q2, D30,D0[1]
        VMULL.S16 Q3, D31,D0[1]
        VSHRN.S32 D8, Q4, #16
        VSHRN.S32 D9, Q5, #16 @ Q4 = (OC_C7S1*x[1]>>16)
        VSHRN.S32 D14,Q7, #16
        VSHRN.S32 D15,Q6, #16 @ Q7 = (OC_C7S1*x[7]>>16)
        VSHRN.S32 D4, Q2, #16
        VSHRN.S32 D5, Q3, #16 @ Q2 = (OC_C1S7*x[7]>>16)-x[7]
        VSUB.S16 Q4, Q4, Q15
        VADD.S16 Q7, Q7, Q9
        VSUB.S16 Q4, Q4, Q2 @ Q4 = t[4]
        VMULL.S16 Q2, D18,D0[1]
        VMULL.S16 Q9, D19,D0[1]
        VMULL.S16 Q5, D26,D0[3]
        VMULL.S16 Q3, D27,D0[3]
        VMULL.S16 Q6, D22,D0[3]
        VMULL.S16 Q12,D23,D0[3]
        VSHRN.S32 D4, Q2, #16
        VSHRN.S32 D5, Q9, #16 @ Q2 = (OC_C1S7*x[1]>>16)-x[1]
        VSHRN.S32 D10,Q5, #16
        VSHRN.S32 D11,Q3, #16 @ Q5 = (OC_C3S5*x[5]>>16)-x[5]
        VSHRN.S32 D12,Q6, #16
        VSHRN.S32 D13,Q12,#16 @ Q6 = (OC_C3S5*x[3]>>16)-x[3]
        VADD.S16 Q7, Q7, Q2 @ Q7 = t[7]
        VSUB.S16 Q5, Q5, Q11
        VADD.S16 Q6, Q6, Q11
        VADD.S16 Q5, Q5, Q13
        VADD.S16 Q6, Q6, Q13
        VMULL.S16 Q9, D22,D1[1]
        VMULL.S16 Q11,D23,D1[1]
        VMULL.S16 Q15,D26,D1[1]
        VMULL.S16 Q13,D27,D1[1]
        VMULL.S16 Q2, D20,D1[2]
        VMULL.S16 Q12,D21,D1[2]
        VSHRN.S32 D18,Q9, #16
        VSHRN.S32 D19,Q11,#16 @ Q9 = (OC_C5S3*x[3]>>16)-x[3]
        VSHRN.S32 D30,Q15,#16
        VSHRN.S32 D31,Q13,#16 @ Q15= (OC_C5S3*x[5]>>16)-x[5]
        VSHRN.S32 D4, Q2, #16
        VSHRN.S32 D5, Q12,#16 @ Q2 = (OC_C6S2*x[2]>>16)
        VSUB.S16 Q5, Q5, Q9 @ Q5 = t[5]
        VADD.S16 Q6, Q6, Q15 @ Q6 = t[6]
        VSUB.S16 Q2, Q2, Q14
        VMULL.S16 Q3, D28,D1[2]
        VMULL.S16 Q11,D29,D1[2]
        VMULL.S16 Q12,D28,D0[2]
        VMULL.S16 Q9, D29,D0[2]
        VMULL.S16 Q13,D20,D0[2]
        VMULL.S16 Q15,D21,D0[2]
        VSHRN.S32 D6, Q3, #16
        VSHRN.S32 D7, Q11,#16 @ Q3 = (OC_C6S2*x[6]>>16)
        VSHRN.S32 D24,Q12,#16
        VSHRN.S32 D25,Q9, #16 @ Q12= (OC_C2S6*x[6]>>16)-x[6]
        VSHRN.S32 D26,Q13,#16
        VSHRN.S32 D27,Q15,#16 @ Q13= (OC_C2S6*x[2]>>16)-x[2]
        VSUB.S16 Q9, Q4, Q5 @ Q9 = t[4]-t[5]
        VSUB.S16 Q11,Q7, Q6 @ Q11= t[7]-t[6]
        VADD.S16 Q3, Q3, Q10
        VADD.S16 Q4, Q4, Q5 @ Q4 = t[4]=t[4]+t[5]
        VADD.S16 Q7, Q7, Q6 @ Q7 = t[7]=t[7]+t[6]
        VSUB.S16 Q2, Q2, Q12 @ Q2 = t[2]
        VADD.S16 Q3, Q3, Q13 @ Q3 = t[3]
        VMULL.S16 Q12,D16,D1[0]
        VMULL.S16 Q13,D17,D1[0]
        VMULL.S16 Q14,D2, D1[0]
        VMULL.S16 Q15,D3, D1[0]
        VMULL.S16 Q5, D18,D1[0]
        VMULL.S16 Q6, D22,D1[0]
        VSHRN.S32 D24,Q12,#16
        VSHRN.S32 D25,Q13,#16
        VSHRN.S32 D28,Q14,#16
        VSHRN.S32 D29,Q15,#16
        VMULL.S16 Q13,D19,D1[0]
        VMULL.S16 Q15,D23,D1[0]
        VADD.S16 Q8, Q8, Q12 @ Q8 = t[0]
        VADD.S16 Q1, Q1, Q14 @ Q1 = t[1]
        VSHRN.S32 D10,Q5, #16
        VSHRN.S32 D12,Q6, #16
        VSHRN.S32 D11,Q13,#16
        VSHRN.S32 D13,Q15,#16
        VADD.S16 Q5, Q5, Q9 @ Q5 = t[5]=OC_C4S4*(t[4]-t[5])>>16
        VADD.S16 Q6, Q6, Q11 @ Q6 = t[6]=OC_C4S4*(t[7]-t[6])>>16
        @ Stage 3
        VSUB.S16 Q11,Q8, Q3 @ Q11 = t[3]=t[0]-t[3]
        VADD.S16 Q8, Q8, Q3 @ Q8 = t[0]=t[0]+t[3]
        VADD.S16 Q9, Q1, Q2 @ Q9 = t[1]=t[1]+t[2]
        VADD.S16 Q3, Q6, Q5 @ Q3 = t[6]=t[6]+t[5]
        VSUB.S16 Q10,Q1, Q2 @ Q10 = t[2]=t[1]-t[2]
        VSUB.S16 Q5, Q6, Q5 @ Q5 = t[5]=t[6]-t[5]
        MOV PC, r14
oc_idct8x8_10_neon:
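        @ Fast path for _last_zzi<=10: the first ten coefficients in zig-zag
        @ order all lie in the upper-left 4x4 quadrant, so only that 4x4
        @ sub-block is loaded and transformed; the rest is treated as zero.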
        ADR r3, OC_IDCT_CONSTS_NEON
        VLD1.64 {D0,D1}, [r3,:128]
        MOV r2, r1
        @ Row transforms (input is pre-transposed)
        @ Stage 1
        VLD1.64 {D16,D17,D18,D19},[r2,:128]!
        MOV r12, #16
        VMULL.S16 Q15,D16,D1[0] @ Q15= OC_C4S4*x[0]-(x[0]<<16)
        VLD1.64 {D17}, [r2,:64], r12
        VMULL.S16 Q2, D18,D0[1] @ Q2 = OC_C1S7*x[1]-(x[1]<<16)
        VLD1.64 {D19}, [r2,:64]
        VMULL.S16 Q14,D17,D0[2] @ Q14= OC_C2S6*x[2]-(x[2]<<16)
        VMULL.S16 Q3, D19,D0[3] @ Q3 = OC_C3S5*x[3]-(x[3]<<16)
        VMULL.S16 Q13,D19,D1[1] @ Q13= OC_C5S3*x[3]-(x[3]<<16)
        VMULL.S16 Q12,D18,D1[3] @ Q12= OC_C7S1*x[1]
        VMULL.S16 Q1, D17,D1[2] @ Q1 = OC_C6S2*x[2]
        VSHRN.S32 D30,Q15,#16 @ D30= t[0]-x[0]
        VSHRN.S32 D4, Q2, #16 @ D4 = t[7]-x[1]
        VSHRN.S32 D31,Q14,#16 @ D31= t[3]-x[2]
        VSHRN.S32 D6, Q3, #16 @ D6 = t[6]-x[3]
        VSHRN.S32 D7, Q13,#16 @ D7 = -t[5]-x[3]
        VSHRN.S32 D5, Q12,#16 @ D5 = t[4]
        VSHRN.S32 D2, Q1, #16 @ D2 = t[2]
        VADD.S16 D4, D4, D18 @ D4 = t[7]
        VADD.S16 D6, D6, D19 @ D6 = t[6]
        VADD.S16 D7, D7, D19 @ D7 = -t[5]
        VADD.S16 Q15,Q15,Q8 @ D30= t[0]
                        @ D31= t[3]
        @ Stages 2 & 3
        VSUB.S16 Q12,Q2, Q3 @ D24= t[7]-t[6]
                        @ D25= t[4]'=t[4]+t[5]
        VADD.S16 Q13,Q2, Q3 @ D26= t[7]=t[7]+t[6]
                        @ D27= t[4]-t[5]
        VMULL.S16 Q11,D24,D1[0] @ Q11= OC_C4S4*(t[7]-t[6])
                        @ -(t[7]-t[6]<<16)
        VMULL.S16 Q14,D27,D1[0] @ Q14= OC_C4S4*(t[4]-t[5])
                        @ -(t[4]-t[5]<<16)
        VADD.S16 D16,D30,D31 @ D16= t[0]=t[0]+t[3]
        VSUB.S16 D17,D30,D2 @ D17= t[2]=t[0]-t[2]
        VADD.S16 D18,D30,D2 @ D18= t[1]=t[0]+t[2]
        VSHRN.S32 D22,Q11,#16 @ D22= (OC_C4S4*(t[7]-t[6])>>16)
                        @ -(t[7]-t[6])
        VSHRN.S32 D23,Q14,#16 @ D23= (OC_C4S4*(t[4]-t[5])>>16)
                        @ -(t[4]-t[5])
        VSUB.S16 D19,D30,D31 @ D19= t[3]=t[0]-t[3]
        VADD.S16 D22,D22,D24 @ D22= t[6]=OC_C4S4*(t[7]-t[6])>>16
        VADD.S16 D23,D23,D27 @ D23= t[5]=OC_C4S4*(t[4]-t[5])>>16
        VSUB.S16 D27,D22,D23 @ D27= t[5]=t[6]-t[5]
        VADD.S16 D24,D22,D23 @ D24= t[6]=t[6]+t[5]
        @ Stage 4
        VSUB.S16 Q11,Q8, Q13 @ D22= y[7]=t[0]-t[7]
                        @ D23= y[5]=t[2]'-t[5]''
        VSUB.S16 Q10,Q9, Q12 @ D20= y[6]=t[1]-t[6]
                        @ D21= y[4]=t[3]'-t[4]''
        VADD.S16 Q8, Q8, Q13 @ D16= y[0]=t[0]+t[7]
                        @ D17= y[2]=t[2]'+t[5]''
        VADD.S16 Q9, Q9, Q12 @ D18= y[1]=t[1]+t[6]
                        @ D19= y[3]=t[3]'+t[4]''
        @ 8x4 transpose
        VTRN.16 Q10,Q11 @ Q10= c5c4a5a4 c7c6a7a6
                        @ Q11= d5d4b5b4 d7d6b7b6
        VTRN.16 Q8, Q9 @ Q8 = c3c2a3a2 c1c0a1a0
                        @ Q9 = d3d2b3b2 d1d0b1b0
        VSWP D20,D21 @ Q10= c7c6a7a6 c5c4a5a4
        VSWP D22,D23 @ Q11= d7d6b7b6 d5d4b5b4
        VUZP.32 Q9, Q11 @ Q9 = b7b6b5b4 b3b2b1b0
                        @ Q11= d7d6d5d4 d3d2d1d0
        VMULL.S16 Q15,D18,D0[1]
        VMULL.S16 Q13,D22,D1[1]
        VUZP.32 Q8, Q10 @ Q8 = a7a6a5a4 a3a2a1a0
                        @ Q10= c7c6c5c4 c3c2c1c0
        @ Column transforms
        @ Stages 1, 2, & 3
        VMULL.S16 Q14,D19,D0[1] @ Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
        VMULL.S16 Q12,D23,D1[1] @ Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
        VMULL.S16 Q3, D22,D0[3]
        VMULL.S16 Q2, D23,D0[3] @ Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
        VSHRN.S32 D30,Q15,#16
        VSHRN.S32 D31,Q14,#16 @ Q15= (OC_C1S7*x[1]>>16)-x[1]
        VSHRN.S32 D26,Q13,#16
        VSHRN.S32 D27,Q12,#16 @ Q13= (OC_C5S3*x[3]>>16)-x[3]
        VSHRN.S32 D28,Q3, #16
        VSHRN.S32 D29,Q2, #16 @ Q14= (OC_C3S5*x[3]>>16)-x[3]
        VADD.S16 Q15,Q15,Q9 @ Q15= t[7]
        VADD.S16 Q13,Q13,Q11 @ Q13= -t[5]
        VADD.S16 Q14,Q14,Q11 @ Q14= t[6]
        VMULL.S16 Q12,D18,D1[3]
        VMULL.S16 Q2, D19,D1[3] @ Q2:Q12= OC_C7S1*x[1]
        VMULL.S16 Q1, D16,D1[0]
        VMULL.S16 Q11,D17,D1[0] @ Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
        VMULL.S16 Q3, D20,D0[2]
        VMULL.S16 Q9, D21,D0[2] @ Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
        VSHRN.S32 D24,Q12,#16
        VSHRN.S32 D25,Q2, #16 @ Q12= t[4]
        VMULL.S16 Q2, D20,D1[2]
        VSHRN.S32 D2, Q1, #16
        VSHRN.S32 D3, Q11,#16 @ Q1 = (OC_C4S4*x[0]>>16)-x[0]
        VMULL.S16 Q11,D21,D1[2] @ Q2:Q11= OC_C6S2*x[2]
        VSHRN.S32 D6, Q3, #16
        VSHRN.S32 D7, Q9, #16 @ Q3 = (OC_C2S6*x[2]>>16)-x[2]
        VSUB.S16 Q9, Q15,Q14 @ Q9 = t[7]-t[6]
        VADD.S16 Q15,Q15,Q14 @ Q15= t[7]=t[7]+t[6]
        VSHRN.S32 D4, Q2, #16
        VSHRN.S32 D5, Q11,#16 @ Q2 = t[2]
        VADD.S16 Q1, Q1, Q8 @ Q1 = t[0]
        VADD.S16 Q8, Q12,Q13 @ Q8 = t[4]-t[5]
        VADD.S16 Q3, Q3, Q10 @ Q3 = t[3]
        VMULL.S16 Q10,D16,D1[0]
        VMULL.S16 Q11,D17,D1[0] @ Q11:Q10= OC_C4S4*(t[4]-t[5])
                        @ -(t[4]-t[5]<<16)
        VSUB.S16 Q12,Q12,Q13 @ Q12= t[4]=t[4]+t[5]
        VMULL.S16 Q14,D18,D1[0]
        VMULL.S16 Q13,D19,D1[0] @ Q13:Q14= OC_C4S4*(t[7]-t[6])
                        @ -(t[7]-t[6]<<16)
        VSHRN.S32 D20,Q10,#16
        VSHRN.S32 D21,Q11,#16 @ Q10= (OC_C4S4*(t[4]-t[5])>>16)
                        @ -(t[4]-t[5])
        VADD.S16 Q11,Q1, Q3 @ Q11= t[0]=t[0]+t[3]
        VSUB.S16 Q3, Q1, Q3 @ Q3 = t[3]=t[0]-t[3]
        VSHRN.S32 D28,Q14,#16
        VSHRN.S32 D29,Q13,#16 @ Q14= (OC_C4S4*(t[7]-t[6])>>16)
                        @ -(t[7]-t[6])
        VADD.S16 Q10,Q10,Q8 @ Q10=t[5]
        VADD.S16 Q14,Q14,Q9 @ Q14=t[6]
        VSUB.S16 Q13,Q14,Q10 @ Q13=t[5]=t[6]-t[5]
        VADD.S16 Q14,Q14,Q10 @ Q14=t[6]=t[6]+t[5]
        VADD.S16 Q10,Q1, Q2 @ Q10= t[1]=t[0]+t[2]
        VSUB.S16 Q2, Q1, Q2 @ Q2 = t[2]=t[0]-t[2]
        @ Stage 4
        CMP r0, r1
        VADD.S16 Q8, Q11,Q15 @ Q8 = y[0]=t[0]+t[7]
        VADD.S16 Q9, Q10,Q14 @ Q9 = y[1]=t[1]+t[6]
        VSUB.S16 Q15,Q11,Q15 @ Q15 = y[7]=t[0]-t[7]
        VSUB.S16 Q14,Q10,Q14 @ Q14 = y[6]=t[1]-t[6]
        VADD.S16 Q10,Q2, Q13 @ Q10 = y[2]=t[2]+t[5]
        VADD.S16 Q11,Q3, Q12 @ Q11 = y[3]=t[3]+t[4]
        VSUB.S16 Q12,Q3, Q12 @ Q12 = y[4]=t[3]-t[4]
        VSUB.S16 Q13,Q2, Q13 @ Q13 = y[5]=t[2]-t[5]
        BEQ oc_idct8x8_10_neon_noclear
        VMOV.I8 D2, #0
        VRSHR.S16 Q8, Q8, #4 @ Q8 = y[0]+8>>4
        VST1.64 {D2}, [r1,:64], r12
        VRSHR.S16 Q9, Q9, #4 @ Q9 = y[1]+8>>4
        VRSHR.S16 Q10,Q10,#4 @ Q10 = y[2]+8>>4
        VST1.64 {D2}, [r1,:64], r12
        VRSHR.S16 Q11,Q11,#4 @ Q11 = y[3]+8>>4
        VRSHR.S16 Q12,Q12,#4 @ Q12 = y[4]+8>>4
        VST1.64 {D2}, [r1,:64], r12
        VRSHR.S16 Q13,Q13,#4 @ Q13 = y[5]+8>>4
        VRSHR.S16 Q14,Q14,#4 @ Q14 = y[6]+8>>4
        VST1.64 {D2}, [r1,:64]
        VRSHR.S16 Q15,Q15,#4 @ Q15 = y[7]+8>>4
        VSTMIA r0, {D16-D31}
        MOV PC, r14
oc_idct8x8_10_neon_noclear:
        VRSHR.S16 Q8, Q8, #4 @ Q8 = y[0]+8>>4
        VRSHR.S16 Q9, Q9, #4 @ Q9 = y[1]+8>>4
        VRSHR.S16 Q10,Q10,#4 @ Q10 = y[2]+8>>4
        VRSHR.S16 Q11,Q11,#4 @ Q11 = y[3]+8>>4
        VRSHR.S16 Q12,Q12,#4 @ Q12 = y[4]+8>>4
        VRSHR.S16 Q13,Q13,#4 @ Q13 = y[5]+8>>4
        VRSHR.S16 Q14,Q14,#4 @ Q14 = y[6]+8>>4
        VRSHR.S16 Q15,Q15,#4 @ Q15 = y[7]+8>>4
        VSTMIA r0, {D16-D31}
        MOV PC, r14
        .endif
@ END