@ armfrag.asm
#ifdef OC_ARM_ASM
@********************************************************************
@*                                                                  *
@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
@*                                                                  *
@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
@*                                                                  *
@********************************************************************
@ Original implementation:
@  Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
@ last mod: $Id: armfrag.s 17874 2011-02-24 14:49:11Z tterribe $
@********************************************************************
  17. .text; .p2align 2
  18. @ Vanilla ARM v4 versions
  19. .global _oc_frag_copy_list_arm
  20. .global _oc_frag_recon_intra_arm
  21. .global _oc_frag_recon_inter_arm
  22. .global _oc_frag_recon_inter2_arm
  23. @ .type oc_frag_copy_list_arm, %function; oc_frag_copy_list_arm: @ PROC
  24. _oc_frag_copy_list_arm:
  25. @ r0 = _dst_frame
  26. @ r1 = _src_frame
  27. @ r2 = _ystride
  28. @ r3 = _fragis
  29. @ <> = _nfragis
  30. @ <> = _frag_buf_offs
  31. LDR r12,[r13] @ r12 = _nfragis
  32. STMFD r13!,{r4-r6,r11,r14}
  33. SUBS r12, r12, #1
  34. LDR r4,[r3],#4 @ r4 = _fragis[fragii]
  35. LDRGE r14,[r13,#4*6] @ r14 = _frag_buf_offs
  36. BLT ofcl_arm_end
  37. SUB r2, r2, #4
  38. ofcl_arm_lp:
  39. LDR r11,[r14,r4,LSL #2] @ r11 = _frag_buf_offs[_fragis[fragii]]
  40. SUBS r12, r12, #1
  41. @ Stall (on XScale)
  42. ADD r4, r1, r11 @ r4 = _src_frame+frag_buf_off
  43. LDR r6, [r4], #4
  44. ADD r11,r0, r11 @ r11 = _dst_frame+frag_buf_off
  45. LDR r5, [r4], r2
  46. STR r6, [r11],#4
  47. LDR r6, [r4], #4
  48. STR r5, [r11],r2
  49. LDR r5, [r4], r2
  50. STR r6, [r11],#4
  51. LDR r6, [r4], #4
  52. STR r5, [r11],r2
  53. LDR r5, [r4], r2
  54. STR r6, [r11],#4
  55. LDR r6, [r4], #4
  56. STR r5, [r11],r2
  57. LDR r5, [r4], r2
  58. STR r6, [r11],#4
  59. LDR r6, [r4], #4
  60. STR r5, [r11],r2
  61. LDR r5, [r4], r2
  62. STR r6, [r11],#4
  63. LDR r6, [r4], #4
  64. STR r5, [r11],r2
  65. LDR r5, [r4], r2
  66. STR r6, [r11],#4
  67. LDR r6, [r4], #4
  68. STR r5, [r11],r2
  69. LDR r5, [r4], r2
  70. STR r6, [r11],#4
  71. LDR r6, [r4], #4
  72. STR r5, [r11],r2
  73. LDR r5, [r4]
  74. LDRGE r4,[r3],#4 @ r4 = _fragis[fragii]
  75. STR r6, [r11],#4
  76. STR r5, [r11]
  77. BGE ofcl_arm_lp
  78. ofcl_arm_end:
  79. LDMFD r13!,{r4-r6,r11,PC}
  80. _oc_frag_recon_intra_arm:
  81. @ r0 = unsigned char *_dst
  82. @ r1 = int _ystride
  83. @ r2 = const ogg_int16_t _residue[64]
  84. STMFD r13!,{r4,r5,r14}
  85. MOV r14,#8
  86. MOV r5, #255
  87. SUB r1, r1, #7
  88. ofrintra_lp_arm:
  89. LDRSH r3, [r2], #2
  90. LDRSH r4, [r2], #2
  91. LDRSH r12,[r2], #2
  92. ADDS r3, r3, #128
  93. CMPGT r5, r3
  94. EORLT r3, r5, r3, ASR #32
  95. STRB r3, [r0], #1
  96. ADDS r4, r4, #128
  97. CMPGT r5, r4
  98. EORLT r4, r5, r4, ASR #32
  99. LDRSH r3, [r2], #2
  100. STRB r4, [r0], #1
  101. ADDS r12,r12,#128
  102. CMPGT r5, r12
  103. EORLT r12,r5, r12,ASR #32
  104. LDRSH r4, [r2], #2
  105. STRB r12,[r0], #1
  106. ADDS r3, r3, #128
  107. CMPGT r5, r3
  108. EORLT r3, r5, r3, ASR #32
  109. LDRSH r12,[r2], #2
  110. STRB r3, [r0], #1
  111. ADDS r4, r4, #128
  112. CMPGT r5, r4
  113. EORLT r4, r5, r4, ASR #32
  114. LDRSH r3, [r2], #2
  115. STRB r4, [r0], #1
  116. ADDS r12,r12,#128
  117. CMPGT r5, r12
  118. EORLT r12,r5, r12,ASR #32
  119. LDRSH r4, [r2], #2
  120. STRB r12,[r0], #1
  121. ADDS r3, r3, #128
  122. CMPGT r5, r3
  123. EORLT r3, r5, r3, ASR #32
  124. STRB r3, [r0], #1
  125. ADDS r4, r4, #128
  126. CMPGT r5, r4
  127. EORLT r4, r5, r4, ASR #32
  128. STRB r4, [r0], r1
  129. SUBS r14,r14,#1
  130. BGT ofrintra_lp_arm
  131. LDMFD r13!,{r4,r5,PC}
  132. @ .size oc_frag_copy_list_arm, .-oc_frag_copy_list_arm @ ENDP
  133. @ .type oc_frag_recon_inter_arm, %function; oc_frag_recon_inter_arm: @ PROC
  134. _oc_frag_recon_inter_arm:
  135. @ r0 = unsigned char *dst
  136. @ r1 = const unsigned char *src
  137. @ r2 = int ystride
  138. @ r3 = const ogg_int16_t residue[64]
  139. STMFD r13!,{r5,r9-r11,r14}
  140. MOV r9, #8
  141. MOV r5, #255
  142. SUB r2, r2, #7
  143. ofrinter_lp_arm:
  144. LDRSH r12,[r3], #2
  145. LDRB r14,[r1], #1
  146. LDRSH r11,[r3], #2
  147. LDRB r10,[r1], #1
  148. ADDS r12,r12,r14
  149. CMPGT r5, r12
  150. EORLT r12,r5, r12,ASR #32
  151. STRB r12,[r0], #1
  152. ADDS r11,r11,r10
  153. CMPGT r5, r11
  154. LDRSH r12,[r3], #2
  155. LDRB r14,[r1], #1
  156. EORLT r11,r5, r11,ASR #32
  157. STRB r11,[r0], #1
  158. ADDS r12,r12,r14
  159. CMPGT r5, r12
  160. LDRSH r11,[r3], #2
  161. LDRB r10,[r1], #1
  162. EORLT r12,r5, r12,ASR #32
  163. STRB r12,[r0], #1
  164. ADDS r11,r11,r10
  165. CMPGT r5, r11
  166. LDRSH r12,[r3], #2
  167. LDRB r14,[r1], #1
  168. EORLT r11,r5, r11,ASR #32
  169. STRB r11,[r0], #1
  170. ADDS r12,r12,r14
  171. CMPGT r5, r12
  172. LDRSH r11,[r3], #2
  173. LDRB r10,[r1], #1
  174. EORLT r12,r5, r12,ASR #32
  175. STRB r12,[r0], #1
  176. ADDS r11,r11,r10
  177. CMPGT r5, r11
  178. LDRSH r12,[r3], #2
  179. LDRB r14,[r1], #1
  180. EORLT r11,r5, r11,ASR #32
  181. STRB r11,[r0], #1
  182. ADDS r12,r12,r14
  183. CMPGT r5, r12
  184. LDRSH r11,[r3], #2
  185. LDRB r10,[r1], r2
  186. EORLT r12,r5, r12,ASR #32
  187. STRB r12,[r0], #1
  188. ADDS r11,r11,r10
  189. CMPGT r5, r11
  190. EORLT r11,r5, r11,ASR #32
  191. STRB r11,[r0], r2
  192. SUBS r9, r9, #1
  193. BGT ofrinter_lp_arm
  194. LDMFD r13!,{r5,r9-r11,PC}
  195. @ .size oc_frag_recon_inter_arm, .-oc_frag_recon_inter_arm @ ENDP
  196. @ .type oc_frag_recon_inter2_arm, %function; oc_frag_recon_inter2_arm: @ PROC
  197. _oc_frag_recon_inter2_arm:
  198. @ r0 = unsigned char *dst
  199. @ r1 = const unsigned char *src1
  200. @ r2 = const unsigned char *src2
  201. @ r3 = int ystride
  202. LDR r12,[r13]
  203. @ r12= const ogg_int16_t residue[64]
  204. STMFD r13!,{r4-r8,r14}
  205. MOV r14,#8
  206. MOV r8, #255
  207. SUB r3, r3, #7
  208. ofrinter2_lp_arm:
  209. LDRB r5, [r1], #1
  210. LDRB r6, [r2], #1
  211. LDRSH r4, [r12],#2
  212. LDRB r7, [r1], #1
  213. ADD r5, r5, r6
  214. ADDS r5, r4, r5, LSR #1
  215. CMPGT r8, r5
  216. LDRB r6, [r2], #1
  217. LDRSH r4, [r12],#2
  218. EORLT r5, r8, r5, ASR #32
  219. STRB r5, [r0], #1
  220. ADD r7, r7, r6
  221. ADDS r7, r4, r7, LSR #1
  222. CMPGT r8, r7
  223. LDRB r5, [r1], #1
  224. LDRB r6, [r2], #1
  225. LDRSH r4, [r12],#2
  226. EORLT r7, r8, r7, ASR #32
  227. STRB r7, [r0], #1
  228. ADD r5, r5, r6
  229. ADDS r5, r4, r5, LSR #1
  230. CMPGT r8, r5
  231. LDRB r7, [r1], #1
  232. LDRB r6, [r2], #1
  233. LDRSH r4, [r12],#2
  234. EORLT r5, r8, r5, ASR #32
  235. STRB r5, [r0], #1
  236. ADD r7, r7, r6
  237. ADDS r7, r4, r7, LSR #1
  238. CMPGT r8, r7
  239. LDRB r5, [r1], #1
  240. LDRB r6, [r2], #1
  241. LDRSH r4, [r12],#2
  242. EORLT r7, r8, r7, ASR #32
  243. STRB r7, [r0], #1
  244. ADD r5, r5, r6
  245. ADDS r5, r4, r5, LSR #1
  246. CMPGT r8, r5
  247. LDRB r7, [r1], #1
  248. LDRB r6, [r2], #1
  249. LDRSH r4, [r12],#2
  250. EORLT r5, r8, r5, ASR #32
  251. STRB r5, [r0], #1
  252. ADD r7, r7, r6
  253. ADDS r7, r4, r7, LSR #1
  254. CMPGT r8, r7
  255. LDRB r5, [r1], #1
  256. LDRB r6, [r2], #1
  257. LDRSH r4, [r12],#2
  258. EORLT r7, r8, r7, ASR #32
  259. STRB r7, [r0], #1
  260. ADD r5, r5, r6
  261. ADDS r5, r4, r5, LSR #1
  262. CMPGT r8, r5
  263. LDRB r7, [r1], r3
  264. LDRB r6, [r2], r3
  265. LDRSH r4, [r12],#2
  266. EORLT r5, r8, r5, ASR #32
  267. STRB r5, [r0], #1
  268. ADD r7, r7, r6
  269. ADDS r7, r4, r7, LSR #1
  270. CMPGT r8, r7
  271. EORLT r7, r8, r7, ASR #32
  272. STRB r7, [r0], r3
  273. SUBS r14,r14,#1
  274. BGT ofrinter2_lp_arm
  275. LDMFD r13!,{r4-r8,PC}
  276. @ .size oc_frag_recon_inter2_arm, .-oc_frag_recon_inter2_arm @ ENDP
  277. .if OC_ARM_ASM_EDSP
  278. .global _oc_frag_copy_list_edsp
  279. @ .type oc_frag_copy_list_edsp, %function; oc_frag_copy_list_edsp: @ PROC
  280. _oc_frag_copy_list_edsp:
  281. @ r0 = _dst_frame
  282. @ r1 = _src_frame
  283. @ r2 = _ystride
  284. @ r3 = _fragis
  285. @ <> = _nfragis
  286. @ <> = _frag_buf_offs
  287. LDR r12,[r13] @ r12 = _nfragis
  288. STMFD r13!,{r4-r11,r14}
  289. SUBS r12, r12, #1
  290. LDRGE r5, [r3],#4 @ r5 = _fragis[fragii]
  291. LDRGE r14,[r13,#4*10] @ r14 = _frag_buf_offs
  292. BLT ofcl_edsp_end
  293. ofcl_edsp_lp:
  294. MOV r4, r1
  295. LDR r5, [r14,r5, LSL #2] @ r5 = _frag_buf_offs[_fragis[fragii]]
  296. SUBS r12, r12, #1
  297. @ Stall (on XScale)
  298. LDRD r6, r7, [r4, r5]! @ r4 = _src_frame+frag_buf_off
  299. LDRD r8, r9, [r4, r2]!
  300. @ Stall
  301. STRD r6, r7, [r5, r0]! @ r5 = _dst_frame+frag_buf_off
  302. STRD r8, r9, [r5, r2]!
  303. @ Stall
  304. LDRD r6, r7, [r4, r2]! @ On Xscale at least, doing 3 consecutive
  305. LDRD r8, r9, [r4, r2]! @ loads causes a stall, but thats no worse
  306. LDRD r10,r11,[r4, r2]! @ than us only doing 2, and having to do
  307. @ another pair of LDRD/STRD later on.
  308. @ Stall
  309. STRD r6, r7, [r5, r2]!
  310. STRD r8, r9, [r5, r2]!
  311. STRD r10,r11,[r5, r2]!
  312. LDRD r6, r7, [r4, r2]!
  313. LDRD r8, r9, [r4, r2]!
  314. LDRD r10,r11,[r4, r2]!
  315. STRD r6, r7, [r5, r2]!
  316. STRD r8, r9, [r5, r2]!
  317. STRD r10,r11,[r5, r2]!
  318. LDRGE r5, [r3],#4 @ r5 = _fragis[fragii]
  319. BGE ofcl_edsp_lp
  320. ofcl_edsp_end:
  321. LDMFD r13!,{r4-r11,PC}
  322. @ .size oc_frag_copy_list_edsp, .-oc_frag_copy_list_edsp @ ENDP
  323. .endif
  324. .if OC_ARM_ASM_MEDIA
  325. .global _oc_frag_recon_intra_v6
  326. .global _oc_frag_recon_inter_v6
  327. .global _oc_frag_recon_inter2_v6
  328. @ .type oc_frag_recon_intra_v6, %function; oc_frag_recon_intra_v6: @ PROC
  329. _oc_frag_recon_intra_v6:
  330. @ r0 = unsigned char *_dst
  331. @ r1 = int _ystride
  332. @ r2 = const ogg_int16_t _residue[64]
  333. STMFD r13!,{r4-r6,r14}
  334. MOV r14,#8
  335. MOV r12,r2
  336. MOV r6, #0x0080
  337. MOVT r6, #0x0080
  338. ofrintra_v6_lp:
  339. LDRD r2, r3, [r12],#8 @ r2 = 11110000 r3 = 33332222
  340. LDRD r4, r5, [r12],#8 @ r4 = 55554444 r5 = 77776666
  341. SUBS r14,r14,#1
  342. QADD16 r2, r2, r6
  343. QADD16 r3, r3, r6
  344. QADD16 r4, r4, r6
  345. QADD16 r5, r5, r6
  346. USAT16 r2, #8, r2 @ r2 = __11__00
  347. USAT16 r3, #8, r3 @ r3 = __33__22
  348. USAT16 r4, #8, r4 @ r4 = __55__44
  349. USAT16 r5, #8, r5 @ r5 = __77__66
  350. ORR r2, r2, r2, LSR #8 @ r2 = __111100
  351. ORR r3, r3, r3, LSR #8 @ r3 = __333322
  352. ORR r4, r4, r4, LSR #8 @ r4 = __555544
  353. ORR r5, r5, r5, LSR #8 @ r5 = __777766
  354. PKHBT r2, r2, r3, LSL #16 @ r2 = 33221100
  355. PKHBT r3, r4, r5, LSL #16 @ r3 = 77665544
  356. STRD r2, r3, [r0], r1
  357. BGT ofrintra_v6_lp
  358. LDMFD r13!,{r4-r6,PC}
  359. @ .size oc_frag_recon_intra_v6, .-oc_frag_recon_intra_v6 @ ENDP
  360. @ .type oc_frag_recon_inter_v6, %function; oc_frag_recon_inter_v6: @ PROC
  361. _oc_frag_recon_inter_v6:
  362. @ r0 = unsigned char *_dst
  363. @ r1 = const unsigned char *_src
  364. @ r2 = int _ystride
  365. @ r3 = const ogg_int16_t _residue[64]
  366. STMFD r13!,{r4-r7,r14}
  367. MOV r14,#8
  368. ofrinter_v6_lp:
  369. LDRD r6, r7, [r3], #8 @ r6 = 11110000 r7 = 33332222
  370. SUBS r14,r14,#1
  371. .if OC_ARM_CAN_UNALIGN_LDRD
  372. LDRD r4, r5, [r1], r2 @ Unaligned ; r4 = 33221100 r5 = 77665544
  373. .else
  374. LDR r5, [r1, #4]
  375. LDR r4, [r1], r2
  376. .endif
  377. PKHBT r12,r6, r7, LSL #16 @ r12= 22220000
  378. PKHTB r7, r7, r6, ASR #16 @ r7 = 33331111
  379. UXTB16 r6,r4 @ r6 = __22__00
  380. UXTB16 r4,r4, ROR #8 @ r4 = __33__11
  381. QADD16 r12,r12,r6 @ r12= xx22xx00
  382. QADD16 r4, r7, r4 @ r4 = xx33xx11
  383. LDRD r6, r7, [r3], #8 @ r6 = 55554444 r7 = 77776666
  384. USAT16 r4, #8, r4 @ r4 = __33__11
  385. USAT16 r12,#8,r12 @ r12= __22__00
  386. ORR r4, r12,r4, LSL #8 @ r4 = 33221100
  387. PKHBT r12,r6, r7, LSL #16 @ r12= 66664444
  388. PKHTB r7, r7, r6, ASR #16 @ r7 = 77775555
  389. UXTB16 r6,r5 @ r6 = __66__44
  390. UXTB16 r5,r5, ROR #8 @ r5 = __77__55
  391. QADD16 r12,r12,r6 @ r12= xx66xx44
  392. QADD16 r5, r7, r5 @ r5 = xx77xx55
  393. USAT16 r12,#8, r12 @ r12= __66__44
  394. USAT16 r5, #8, r5 @ r4 = __77__55
  395. ORR r5, r12,r5, LSL #8 @ r5 = 33221100
  396. STRD r4, r5, [r0], r2
  397. BGT ofrinter_v6_lp
  398. LDMFD r13!,{r4-r7,PC}
  399. @ .size oc_frag_recon_inter_v6, .-oc_frag_recon_inter_v6 @ ENDP
  400. @ .type oc_frag_recon_inter2_v6, %function; oc_frag_recon_inter2_v6: @ PROC
  401. _oc_frag_recon_inter2_v6:
  402. @ r0 = unsigned char *_dst
  403. @ r1 = const unsigned char *_src1
  404. @ r2 = const unsigned char *_src2
  405. @ r3 = int _ystride
  406. LDR r12,[r13]
  407. @ r12= const ogg_int16_t _residue[64]
  408. STMFD r13!,{r4-r9,r14}
  409. MOV r14,#8
  410. ofrinter2_v6_lp:
  411. LDRD r6, r7, [r12,#8] @ r6 = 55554444 r7 = 77776666
  412. SUBS r14,r14,#1
  413. LDR r4, [r1, #4] @ Unaligned ; r4 = src1[1] = 77665544
  414. LDR r5, [r2, #4] @ Unaligned ; r5 = src2[1] = 77665544
  415. PKHBT r8, r6, r7, LSL #16 @ r8 = 66664444
  416. PKHTB r9, r7, r6, ASR #16 @ r9 = 77775555
  417. UHADD8 r4, r4, r5 @ r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
  418. UXTB16 r5, r4 @ r5 = __66__44
  419. UXTB16 r4, r4, ROR #8 @ r4 = __77__55
  420. QADD16 r8, r8, r5 @ r8 = xx66xx44
  421. QADD16 r9, r9, r4 @ r9 = xx77xx55
  422. LDRD r6, r7, [r12],#16 @ r6 = 33332222 r7 = 11110000
  423. USAT16 r8, #8, r8 @ r8 = __66__44
  424. LDR r4, [r1], r3 @ Unaligned ; r4 = src1[0] = 33221100
  425. USAT16 r9, #8, r9 @ r9 = __77__55
  426. LDR r5, [r2], r3 @ Unaligned ; r5 = src2[0] = 33221100
  427. ORR r9, r8, r9, LSL #8 @ r9 = 77665544
  428. PKHBT r8, r6, r7, LSL #16 @ r8 = 22220000
  429. UHADD8 r4, r4, r5 @ r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
  430. PKHTB r7, r7, r6, ASR #16 @ r7 = 33331111
  431. UXTB16 r5, r4 @ r5 = __22__00
  432. UXTB16 r4, r4, ROR #8 @ r4 = __33__11
  433. QADD16 r8, r8, r5 @ r8 = xx22xx00
  434. QADD16 r7, r7, r4 @ r7 = xx33xx11
  435. USAT16 r8, #8, r8 @ r8 = __22__00
  436. USAT16 r7, #8, r7 @ r7 = __33__11
  437. ORR r8, r8, r7, LSL #8 @ r8 = 33221100
  438. STRD r8, r9, [r0], r3
  439. BGT ofrinter2_v6_lp
  440. LDMFD r13!,{r4-r9,PC}
  441. @ .size oc_frag_recon_inter2_v6, .-oc_frag_recon_inter2_v6 @ ENDP
  442. .endif
  443. .if OC_ARM_ASM_NEON
  444. .global _oc_frag_copy_list_neon
  445. .global _oc_frag_recon_intra_neon
  446. .global _oc_frag_recon_inter_neon
  447. .global _oc_frag_recon_inter2_neon
  448. @ .type oc_frag_copy_list_neon, %function; oc_frag_copy_list_neon: @ PROC
  449. _oc_frag_copy_list_neon:
  450. @ r0 = _dst_frame
  451. @ r1 = _src_frame
  452. @ r2 = _ystride
  453. @ r3 = _fragis
  454. @ <> = _nfragis
  455. @ <> = _frag_buf_offs
  456. LDR r12,[r13] @ r12 = _nfragis
  457. STMFD r13!,{r4-r7,r14}
  458. CMP r12, #1
  459. LDRGE r6, [r3] @ r6 = _fragis[fragii]
  460. LDRGE r14,[r13,#4*6] @ r14 = _frag_buf_offs
  461. BLT ofcl_neon_end
  462. @ Stall (2 on Xscale)
  463. LDR r6, [r14,r6, LSL #2] @ r6 = _frag_buf_offs[_fragis[fragii]]
  464. @ Stall (on XScale)
  465. MOV r7, r6 @ Guarantee PLD points somewhere valid.
  466. ofcl_neon_lp:
  467. ADD r4, r1, r6
  468. VLD1.64 {D0}, [r4,:64], r2
  469. ADD r5, r0, r6
  470. VLD1.64 {D1}, [r4,:64], r2
  471. SUBS r12, r12, #1
  472. VLD1.64 {D2}, [r4,:64], r2
  473. LDRGT r6, [r3,#4]! @ r6 = _fragis[fragii]
  474. VLD1.64 {D3}, [r4,:64], r2
  475. LDRGT r6, [r14,r6, LSL #2] @ r6 = _frag_buf_offs[_fragis[fragii]]
  476. VLD1.64 {D4}, [r4,:64], r2
  477. ADDGT r7, r1, r6
  478. VLD1.64 {D5}, [r4,:64], r2
  479. PLD [r7]
  480. VLD1.64 {D6}, [r4,:64], r2
  481. PLD [r7, r2]
  482. VLD1.64 {D7}, [r4,:64]
  483. PLD [r7, r2, LSL #1]
  484. VST1.64 {D0}, [r5,:64], r2
  485. ADDGT r7, r7, r2, LSL #2
  486. VST1.64 {D1}, [r5,:64], r2
  487. PLD [r7, -r2]
  488. VST1.64 {D2}, [r5,:64], r2
  489. PLD [r7]
  490. VST1.64 {D3}, [r5,:64], r2
  491. PLD [r7, r2]
  492. VST1.64 {D4}, [r5,:64], r2
  493. PLD [r7, r2, LSL #1]
  494. VST1.64 {D5}, [r5,:64], r2
  495. ADDGT r7, r7, r2, LSL #2
  496. VST1.64 {D6}, [r5,:64], r2
  497. PLD [r7, -r2]
  498. VST1.64 {D7}, [r5,:64]
  499. BGT ofcl_neon_lp
  500. ofcl_neon_end:
  501. LDMFD r13!,{r4-r7,PC}
  502. @ .size oc_frag_copy_list_neon, .-oc_frag_copy_list_neon @ ENDP
  503. @ .type oc_frag_recon_intra_neon, %function; oc_frag_recon_intra_neon: @ PROC
  504. _oc_frag_recon_intra_neon:
  505. @ r0 = unsigned char *_dst
  506. @ r1 = int _ystride
  507. @ r2 = const ogg_int16_t _residue[64]
  508. VMOV.I16 Q0, #128
  509. VLDMIA r2, {D16-D31} @ D16= 3333222211110000 etc ; 9(8) cycles
  510. VQADD.S16 Q8, Q8, Q0
  511. VQADD.S16 Q9, Q9, Q0
  512. VQADD.S16 Q10,Q10,Q0
  513. VQADD.S16 Q11,Q11,Q0
  514. VQADD.S16 Q12,Q12,Q0
  515. VQADD.S16 Q13,Q13,Q0
  516. VQADD.S16 Q14,Q14,Q0
  517. VQADD.S16 Q15,Q15,Q0
  518. VQMOVUN.S16 D16,Q8 @ D16= 7766554433221100 ; 1 cycle
  519. VQMOVUN.S16 D17,Q9 @ D17= FFEEDDCCBBAA9988 ; 1 cycle
  520. VQMOVUN.S16 D18,Q10 @ D18= NNMMLLKKJJIIHHGG ; 1 cycle
  521. VST1.64 {D16},[r0,:64], r1
  522. VQMOVUN.S16 D19,Q11 @ D19= VVUUTTSSRRQQPPOO ; 1 cycle
  523. VST1.64 {D17},[r0,:64], r1
  524. VQMOVUN.S16 D20,Q12 @ D20= ddccbbaaZZYYXXWW ; 1 cycle
  525. VST1.64 {D18},[r0,:64], r1
  526. VQMOVUN.S16 D21,Q13 @ D21= llkkjjiihhggffee ; 1 cycle
  527. VST1.64 {D19},[r0,:64], r1
  528. VQMOVUN.S16 D22,Q14 @ D22= ttssrrqqppoonnmm ; 1 cycle
  529. VST1.64 {D20},[r0,:64], r1
  530. VQMOVUN.S16 D23,Q15 @ D23= !!,:@zzyyxxwwvvuu ; 1 cycle
  531. VST1.64 {D21},[r0,:64], r1
  532. VST1.64 {D22},[r0,:64], r1
  533. VST1.64 {D23},[r0,:64], r1
  534. MOV PC,R14
  535. @ .size oc_frag_recon_intra_neon, .-oc_frag_recon_intra_neon @ ENDP
  536. @ .type oc_frag_recon_inter_neon, %function; oc_frag_recon_inter_neon: @ PROC
  537. _oc_frag_recon_inter_neon:
  538. @ r0 = unsigned char *_dst
  539. @ r1 = const unsigned char *_src
  540. @ r2 = int _ystride
  541. @ r3 = const ogg_int16_t _residue[64]
  542. VLDMIA r3, {D16-D31} @ D16= 3333222211110000 etc ; 9(8) cycles
  543. VLD1.64 {D0}, [r1], r2
  544. VLD1.64 {D2}, [r1], r2
  545. VMOVL.U8 Q0, D0 @ Q0 = __77__66__55__44__33__22__11__00
  546. VLD1.64 {D4}, [r1], r2
  547. VMOVL.U8 Q1, D2 @ etc
  548. VLD1.64 {D6}, [r1], r2
  549. VMOVL.U8 Q2, D4
  550. VMOVL.U8 Q3, D6
  551. VQADD.S16 Q8, Q8, Q0
  552. VLD1.64 {D0}, [r1], r2
  553. VQADD.S16 Q9, Q9, Q1
  554. VLD1.64 {D2}, [r1], r2
  555. VQADD.S16 Q10,Q10,Q2
  556. VLD1.64 {D4}, [r1], r2
  557. VQADD.S16 Q11,Q11,Q3
  558. VLD1.64 {D6}, [r1], r2
  559. VMOVL.U8 Q0, D0
  560. VMOVL.U8 Q1, D2
  561. VMOVL.U8 Q2, D4
  562. VMOVL.U8 Q3, D6
  563. VQADD.S16 Q12,Q12,Q0
  564. VQADD.S16 Q13,Q13,Q1
  565. VQADD.S16 Q14,Q14,Q2
  566. VQADD.S16 Q15,Q15,Q3
  567. VQMOVUN.S16 D16,Q8
  568. VQMOVUN.S16 D17,Q9
  569. VQMOVUN.S16 D18,Q10
  570. VST1.64 {D16},[r0,:64], r2
  571. VQMOVUN.S16 D19,Q11
  572. VST1.64 {D17},[r0,:64], r2
  573. VQMOVUN.S16 D20,Q12
  574. VST1.64 {D18},[r0,:64], r2
  575. VQMOVUN.S16 D21,Q13
  576. VST1.64 {D19},[r0,:64], r2
  577. VQMOVUN.S16 D22,Q14
  578. VST1.64 {D20},[r0,:64], r2
  579. VQMOVUN.S16 D23,Q15
  580. VST1.64 {D21},[r0,:64], r2
  581. VST1.64 {D22},[r0,:64], r2
  582. VST1.64 {D23},[r0,:64], r2
  583. MOV PC,R14
  584. @ .size oc_frag_recon_inter_neon, .-oc_frag_recon_inter_neon @ ENDP
  585. @ .type oc_frag_recon_inter2_neon, %function; oc_frag_recon_inter2_neon: @ PROC
  586. _oc_frag_recon_inter2_neon:
  587. @ r0 = unsigned char *_dst
  588. @ r1 = const unsigned char *_src1
  589. @ r2 = const unsigned char *_src2
  590. @ r3 = int _ystride
  591. LDR r12,[r13]
  592. @ r12= const ogg_int16_t _residue[64]
  593. VLDMIA r12,{D16-D31}
  594. VLD1.64 {D0}, [r1], r3
  595. VLD1.64 {D4}, [r2], r3
  596. VLD1.64 {D1}, [r1], r3
  597. VLD1.64 {D5}, [r2], r3
  598. VHADD.U8 Q2, Q0, Q2 @ Q2 = FFEEDDCCBBAA99887766554433221100
  599. VLD1.64 {D2}, [r1], r3
  600. VLD1.64 {D6}, [r2], r3
  601. VMOVL.U8 Q0, D4 @ Q0 = __77__66__55__44__33__22__11__00
  602. VLD1.64 {D3}, [r1], r3
  603. VMOVL.U8 Q2, D5 @ etc
  604. VLD1.64 {D7}, [r2], r3
  605. VHADD.U8 Q3, Q1, Q3
  606. VQADD.S16 Q8, Q8, Q0
  607. VQADD.S16 Q9, Q9, Q2
  608. VLD1.64 {D0}, [r1], r3
  609. VMOVL.U8 Q1, D6
  610. VLD1.64 {D4}, [r2], r3
  611. VMOVL.U8 Q3, D7
  612. VLD1.64 {D1}, [r1], r3
  613. VQADD.S16 Q10,Q10,Q1
  614. VLD1.64 {D5}, [r2], r3
  615. VQADD.S16 Q11,Q11,Q3
  616. VLD1.64 {D2}, [r1], r3
  617. VHADD.U8 Q2, Q0, Q2
  618. VLD1.64 {D6}, [r2], r3
  619. VLD1.64 {D3}, [r1], r3
  620. VMOVL.U8 Q0, D4
  621. VLD1.64 {D7}, [r2], r3
  622. VMOVL.U8 Q2, D5
  623. VHADD.U8 Q3, Q1, Q3
  624. VQADD.S16 Q12,Q12,Q0
  625. VQADD.S16 Q13,Q13,Q2
  626. VMOVL.U8 Q1, D6
  627. VMOVL.U8 Q3, D7
  628. VQADD.S16 Q14,Q14,Q1
  629. VQADD.S16 Q15,Q15,Q3
  630. VQMOVUN.S16 D16,Q8
  631. VQMOVUN.S16 D17,Q9
  632. VQMOVUN.S16 D18,Q10
  633. VST1.64 {D16},[r0,:64], r3
  634. VQMOVUN.S16 D19,Q11
  635. VST1.64 {D17},[r0,:64], r3
  636. VQMOVUN.S16 D20,Q12
  637. VST1.64 {D18},[r0,:64], r3
  638. VQMOVUN.S16 D21,Q13
  639. VST1.64 {D19},[r0,:64], r3
  640. VQMOVUN.S16 D22,Q14
  641. VST1.64 {D20},[r0,:64], r3
  642. VQMOVUN.S16 D23,Q15
  643. VST1.64 {D21},[r0,:64], r3
  644. VST1.64 {D22},[r0,:64], r3
  645. VST1.64 {D23},[r0,:64], r3
  646. MOV PC,R14
  647. @ .size oc_frag_recon_inter2_neon, .-oc_frag_recon_inter2_neon @ ENDP
  648. .endif
  649. @ END
  650. @ .section .note.GNU-stack,"",%progbits
  651. #endif