@ armfrag-gnu.S
@********************************************************************
@* *
@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
@* *
@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
@* *
@********************************************************************
@ Original implementation:
@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
@ last mod: $Id: armfrag.s 17432 2010-09-23 04:03:25Z tterribe $
@********************************************************************
  @ Everything in this file is code.
          .text

  @ Build-option switches, left disabled exactly as found.
  @ NOTE(review): the `.if OC_ARM_ASM_EDSP/MEDIA/NEON` and
  @ `.if OC_ARM_CAN_UNALIGN_LDRD` tests later in this file read these
  @ symbols, so with the lines below commented out they presumably have to
  @ be supplied some other way (e.g. on the assembler command line) --
  @ confirm against the build.
  @       .include "armopts-gnu.S"
  @       .set OC_ARM_ASM_EDSP, 0
  @       .set OC_ARM_ASM_MEDIA, 0
  @       .set OC_ARM_ASM_NEON, 0
  @       .set OC_ARM_CAN_UNALIGN, 0
  @       .set OC_ARM_CAN_UNALIGN_LDRD, 0

  @ Vanilla ARM v4 versions: exported entry points.
          .globl  oc_frag_copy_list_arm
          .globl  oc_frag_recon_intra_arm
          .globl  oc_frag_recon_inter_arm
          .globl  oc_frag_recon_inter2_arm
  @ ----------------------------------------------------------------------
  @ oc_frag_copy_list_arm
  @   For every index in _fragis, copies one fragment of 8 rows x 8 bytes
  @   from _src_frame to _dst_frame at the byte offset
  @   _frag_buf_offs[index].
  @
  @ In:   r0        = _dst_frame
  @       r1        = _src_frame
  @       r2        = _ystride        (bytes from one row to the next)
  @       r3        = _fragis         (array of fragment indices, words)
  @       [sp]      = _nfragis        (number of entries in _fragis)
  @       [sp, #4]  = _frag_buf_offs  (word table indexed by fragment index)
  @ Out:  nothing
  @ Regs: r4-r6, r11, lr are saved and restored; r2, r3, r12 and the flags
  @       are modified.
  @ Note: rows are moved as two word loads/stores each -- presumably every
  @       fragment row is word aligned (TODO confirm with callers).
  @ ----------------------------------------------------------------------
  oc_frag_copy_list_arm:
          ldr     r12, [sp]               @ r12 = _nfragis
          stmfd   sp!, {r4-r6, r11, lr}   @ 5 words pushed: stack args now at sp+20 / sp+24
          subs    r12, r12, #1            @ r12 = fragments remaining after the first
          @ Only touch _fragis when there is at least one entry, matching the
          @ guarded first load in the EDSP and NEON variants of this routine.
          ldrge   r4, [r3], #4            @ r4 = _fragis[fragii]
          ldrge   lr, [sp, #4*6]          @ lr = _frag_buf_offs
          blt     ofcl_arm_end            @ _nfragis <= 0: nothing to copy
          sub     r2, r2, #4              @ row step once the first word has post-incremented by 4
  ofcl_arm_lp:
          ldr     r11, [lr, r4, lsl #2]   @ r11 = _frag_buf_offs[_fragis[fragii]]
          subs    r12, r12, #1            @ flags stay live until the ldrge/bge at the loop tail
          @ Stall (on XScale)
          add     r4, r1, r11             @ r4  = _src_frame + frag_buf_off
          ldr     r6, [r4], #4            @ row 0, bytes 0-3
          add     r11, r0, r11            @ r11 = _dst_frame + frag_buf_off
          ldr     r5, [r4], r2            @ row 0, bytes 4-7; step to row 1
          @ Rows 1..6: store the previous row while loading the next one.
          .rept   6
          str     r6, [r11], #4
          ldr     r6, [r4], #4
          str     r5, [r11], r2
          ldr     r5, [r4], r2
          .endr
          @ Row 7: last loads need no source write-back.
          str     r6, [r11], #4
          ldr     r6, [r4], #4
          str     r5, [r11], r2
          ldr     r5, [r4]
          ldrge   r4, [r3], #4            @ r4 = _fragis[fragii] for the next fragment, if any
          str     r6, [r11], #4
          str     r5, [r11]
          bge     ofcl_arm_lp
  ofcl_arm_end:
          ldmfd   sp!, {r4-r6, r11, pc}
  @ ----------------------------------------------------------------------
  @ oc_frag_recon_intra_arm
  @   Writes an 8x8 block of bytes: each output byte is the matching
  @   16-bit residue plus 128, clamped to the range 0..255.
  @
  @ In:   r0 = unsigned char *_dst
  @       r1 = int _ystride
  @       r2 = const ogg_int16_t _residue[64]
  @ Regs: r4, r5, lr saved/restored; r0-r3, ip and the flags are modified.
  @ ----------------------------------------------------------------------

  @ Bias one sample by 128 and clamp it.  r5 must hold 255.
  @   sum <  0  : ADDS leaves LT, so px = 255 ^ 0xFFFFFFFF (low byte 0)
  @   sum == 0  : neither CMPGT nor EORLT runs, px stays 0
  @   sum > 255 : CMPGT leaves LT, so px = 255 ^ 0 = 255
  @ Only the low byte of px is meaningful afterwards (it is stored by STRB).
  .macro  ofrintra_arm_bias_clamp px
          adds    \px, \px, #128
          cmpgt   r5, \px
          eorlt   \px, r5, \px, asr #32
  .endm

  oc_frag_recon_intra_arm:
          stmdb   sp!, {r4, r5, lr}
          mov     lr, #8                  @ lr = rows left
          mov     r5, #255                @ r5 = clamp ceiling
          sub     r1, r1, #7              @ 7 bytes of each row are stepped by post-increment
  .Lofrintra_arm_row:
          ldrsh   r3, [r2], #2
          ldrsh   r4, [r2], #2
          ldrsh   ip, [r2], #2
          @ pixel 0
          ofrintra_arm_bias_clamp r3
          strb    r3, [r0], #1
          @ pixel 1
          ofrintra_arm_bias_clamp r4
          ldrsh   r3, [r2], #2
          strb    r4, [r0], #1
          @ pixel 2
          ofrintra_arm_bias_clamp ip
          ldrsh   r4, [r2], #2
          strb    ip, [r0], #1
          @ pixel 3
          ofrintra_arm_bias_clamp r3
          ldrsh   ip, [r2], #2
          strb    r3, [r0], #1
          @ pixel 4
          ofrintra_arm_bias_clamp r4
          ldrsh   r3, [r2], #2
          strb    r4, [r0], #1
          @ pixel 5
          ofrintra_arm_bias_clamp ip
          ldrsh   r4, [r2], #2
          strb    ip, [r0], #1
          @ pixel 6
          ofrintra_arm_bias_clamp r3
          strb    r3, [r0], #1
          @ pixel 7: the store also moves _dst to the start of the next row
          ofrintra_arm_bias_clamp r4
          strb    r4, [r0], r1
          subs    lr, lr, #1
          bgt     .Lofrintra_arm_row
          ldmia   sp!, {r4, r5, pc}
  @ ----------------------------------------------------------------------
  @ oc_frag_recon_inter_arm
  @   Writes an 8x8 block of bytes: each output byte is the matching
  @   source byte plus the matching 16-bit residue, clamped to 0..255.
  @   The same stride is applied to src and dst.
  @
  @ In:   r0 = unsigned char *dst
  @       r1 = const unsigned char *src
  @       r2 = int ystride
  @       r3 = const ogg_int16_t residue[64]
  @ Regs: r5, r9-r11, lr saved/restored; r0-r3, ip and the flags are
  @       modified.
  @ ----------------------------------------------------------------------

  @ One pipelined pixel: finish `cur` (residue) + `pred` (source byte),
  @ clamp against r5 = 255 and store it, while fetching the residue/source
  @ byte of the pixel after next into nres/npred.  The two loads sit
  @ between CMPGT and EORLT and do not change the flags.
  .macro  ofrinter_arm_px cur, pred, nres, npred
          adds    \cur, \cur, \pred
          cmpgt   r5, \cur
          ldrsh   \nres, [r3], #2
          ldrb    \npred, [r1], #1
          eorlt   \cur, r5, \cur, asr #32
          strb    \cur, [r0], #1
  .endm

  oc_frag_recon_inter_arm:
          stmdb   sp!, {r5, r9-r11, lr}
          mov     r9, #8                  @ r9 = rows left
          mov     r5, #255                @ r5 = clamp ceiling
          sub     r2, r2, #7              @ 7 bytes of each row are stepped by post-increment
  .Lofrinter_arm_row:
          ldrsh   ip, [r3], #2            @ residue 0
          ldrb    lr, [r1], #1            @ src 0
          ldrsh   r11, [r3], #2           @ residue 1
          ldrb    r10, [r1], #1           @ src 1
          @ pixel 0 (nothing left to prefetch into its own registers yet)
          adds    ip, ip, lr
          cmpgt   r5, ip
          eorlt   ip, r5, ip, asr #32
          strb    ip, [r0], #1
          @ pixels 1..5, alternating between the (r11,r10) and (ip,lr) pairs
          ofrinter_arm_px r11, r10, ip, lr
          ofrinter_arm_px ip, lr, r11, r10
          ofrinter_arm_px r11, r10, ip, lr
          ofrinter_arm_px ip, lr, r11, r10
          ofrinter_arm_px r11, r10, ip, lr
          @ pixel 6: the last source load also steps src to the next row
          adds    ip, ip, lr
          cmpgt   r5, ip
          ldrsh   r11, [r3], #2
          ldrb    r10, [r1], r2
          eorlt   ip, r5, ip, asr #32
          strb    ip, [r0], #1
          @ pixel 7: the store also steps dst to the next row
          adds    r11, r11, r10
          cmpgt   r5, r11
          eorlt   r11, r5, r11, asr #32
          strb    r11, [r0], r2
          subs    r9, r9, #1
          bgt     .Lofrinter_arm_row
          ldmia   sp!, {r5, r9-r11, pc}
  @ ----------------------------------------------------------------------
  @ oc_frag_recon_inter2_arm
  @   Writes an 8x8 block of bytes: each output byte is
  @   ((src1 + src2) >> 1) + residue, clamped to 0..255.  The same stride
  @   is applied to src1, src2 and dst.
  @
  @ In:   r0   = unsigned char *dst
  @       r1   = const unsigned char *src1
  @       r2   = const unsigned char *src2
  @       r3   = int ystride
  @       [sp] = const ogg_int16_t residue[64]
  @ Regs: r4-r8, lr saved/restored; r0-r3, ip and the flags are modified.
  @ ----------------------------------------------------------------------

  @ One pipelined pixel.  On entry `cur` holds the src1 byte, r6 the src2
  @ byte and r4 the residue of the pixel being finished; r8 holds 255.
  @ While it is clamped and stored, the next pixel's src1 byte goes to
  @ `nxt`, its src2 byte to r6 and its residue to r4 (loads leave the
  @ flags untouched).
  .macro  ofrinter2_arm_px cur, nxt
          add     \cur, \cur, r6
          adds    \cur, r4, \cur, lsr #1
          cmpgt   r8, \cur
          ldrb    \nxt, [r1], #1
          ldrb    r6, [r2], #1
          ldrsh   r4, [ip], #2
          eorlt   \cur, r8, \cur, asr #32
          strb    \cur, [r0], #1
  .endm

  oc_frag_recon_inter2_arm:
          ldr     ip, [sp]                @ ip = residue (read before the push moves sp)
          stmdb   sp!, {r4-r8, lr}
          mov     lr, #8                  @ lr = rows left
          mov     r8, #255                @ r8 = clamp ceiling
          sub     r3, r3, #7              @ 7 bytes of each row are stepped by post-increment
  .Lofrinter2_arm_row:
          @ pixel 0, with pixel 1's inputs fetched along the way
          ldrb    r5, [r1], #1
          ldrb    r6, [r2], #1
          ldrsh   r4, [ip], #2
          ldrb    r7, [r1], #1
          add     r5, r5, r6
          adds    r5, r4, r5, lsr #1
          cmpgt   r8, r5
          ldrb    r6, [r2], #1
          ldrsh   r4, [ip], #2
          eorlt   r5, r8, r5, asr #32
          strb    r5, [r0], #1
          @ pixels 1..5, alternating r7/r5 as the working register
          ofrinter2_arm_px r7, r5
          ofrinter2_arm_px r5, r7
          ofrinter2_arm_px r7, r5
          ofrinter2_arm_px r5, r7
          ofrinter2_arm_px r7, r5
          @ pixel 6: the last source loads also step src1/src2 to the next row
          add     r5, r5, r6
          adds    r5, r4, r5, lsr #1
          cmpgt   r8, r5
          ldrb    r7, [r1], r3
          ldrb    r6, [r2], r3
          ldrsh   r4, [ip], #2
          eorlt   r5, r8, r5, asr #32
          strb    r5, [r0], #1
          @ pixel 7: the store also steps dst to the next row
          add     r7, r7, r6
          adds    r7, r4, r7, lsr #1
          cmpgt   r8, r7
          eorlt   r7, r8, r7, asr #32
          strb    r7, [r0], r3
          subs    lr, lr, #1
          bgt     .Lofrinter2_arm_row
          ldmia   sp!, {r4-r8, pc}
  .if OC_ARM_ASM_EDSP
          .globl  oc_frag_copy_list_edsp

  @ ----------------------------------------------------------------------
  @ oc_frag_copy_list_edsp
  @   LDRD/STRD variant of the fragment list copy: for every index in
  @   _fragis, copies 8 rows of 8 bytes from _src_frame to _dst_frame at
  @   byte offset _frag_buf_offs[index].
  @
  @ In:   r0        = _dst_frame
  @       r1        = _src_frame
  @       r2        = _ystride
  @       r3        = _fragis
  @       [sp]      = _nfragis
  @       [sp, #4]  = _frag_buf_offs
  @ Regs: r4-r11, lr saved/restored; r3, ip and the flags are modified.
  @ Note: every row is moved with one doubleword access -- presumably rows
  @       must be 8-byte aligned on the targeted cores (TODO confirm).
  @ ----------------------------------------------------------------------
  oc_frag_copy_list_edsp:
          ldr     ip, [sp]                @ ip = _nfragis
          stmdb   sp!, {r4-r11, lr}       @ 9 words pushed: stack args now at sp+36 / sp+40
          subs    ip, ip, #1
          ldrge   r5, [r3], #4            @ r5 = _fragis[fragii]
          ldrge   lr, [sp, #40]           @ lr = _frag_buf_offs
          blt     .Lofcl_edsp_done        @ empty list
  .Lofcl_edsp_frag:
          mov     r4, r1
          ldr     r5, [lr, r5, lsl #2]    @ r5 = _frag_buf_offs[_fragis[fragii]]
          subs    ip, ip, #1              @ flags consumed by the ldrge/bge at the loop tail
          @ Stall (on XScale)
          ldrd    r6, [r4, r5]!           @ row 0; r4 = _src_frame + frag_buf_off
          ldrd    r8, [r4, r2]!           @ row 1
          @ Stall
          strd    r6, [r5, r0]!           @ row 0; r5 = _dst_frame + frag_buf_off
          strd    r8, [r5, r2]!           @ row 1
          @ Rows 2-4, then rows 5-7, three at a time.  Per the original note:
          @ on XScale three consecutive loads stall, but that is no worse than
          @ doing two and needing one more LDRD/STRD pair later.
          .rept   2
          ldrd    r6, [r4, r2]!
          ldrd    r8, [r4, r2]!
          ldrd    r10, [r4, r2]!
          strd    r6, [r5, r2]!
          strd    r8, [r5, r2]!
          strd    r10, [r5, r2]!
          .endr
          ldrge   r5, [r3], #4            @ r5 = _fragis[fragii] for the next fragment, if any
          bge     .Lofcl_edsp_frag
  .Lofcl_edsp_done:
          ldmia   sp!, {r4-r11, pc}
  .endif
  .if OC_ARM_ASM_MEDIA
          .globl  oc_frag_recon_intra_v6
          .globl  oc_frag_recon_inter_v6
          .globl  oc_frag_recon_inter2_v6

  @ Lane notation used below: digits name pixels of the current row, lowest
  @ pixel on the right; "__" is a zero byte, "xx" a not-yet-saturated byte.

  @ ----------------------------------------------------------------------
  @ oc_frag_recon_intra_v6
  @   8x8 block: dst = saturate_u8(residue + 128), one row per iteration.
  @
  @ In:   r0 = unsigned char *_dst
  @       r1 = int _ystride
  @       r2 = const ogg_int16_t _residue[64]
  @ Regs: r4-r6, lr saved/restored; r0, r2, r3, ip and flags are modified.
  @ ----------------------------------------------------------------------
  oc_frag_recon_intra_v6:
          stmdb   sp!, {r4-r6, lr}
          mov     lr, #8                  @ lr = rows left
          mov     ip, r2                  @ ip = residue pointer (r2/r3 become data)
          ldr     r6, =0x00800080         @ +128 in both halfword lanes
  .Lofrintra_v6_row:
          ldrd    r2, [ip], #8            @ r2 = 11110000  r3 = 33332222
          ldrd    r4, [ip], #8            @ r4 = 55554444  r5 = 77776666
          subs    lr, lr, #1              @ flags consumed by the bgt at the loop tail
          qadd16  r2, r2, r6
          qadd16  r3, r3, r6
          qadd16  r4, r4, r6
          qadd16  r5, r5, r6
          usat16  r2, #8, r2              @ r2 = __11__00
          usat16  r3, #8, r3              @ r3 = __33__22
          usat16  r4, #8, r4              @ r4 = __55__44
          usat16  r5, #8, r5              @ r5 = __77__66
          orr     r2, r2, r2, lsr #8      @ r2 = __111100
          orr     r3, r3, r3, lsr #8      @ r3 = __333322
          orr     r4, r4, r4, lsr #8      @ r4 = __555544
          orr     r5, r5, r5, lsr #8      @ r5 = __777766
          pkhbt   r2, r2, r3, lsl #16     @ r2 = 33221100
          pkhbt   r3, r4, r5, lsl #16     @ r3 = 77665544
          strd    r2, [r0], r1            @ store the row, step dst
          bgt     .Lofrintra_v6_row
          ldmia   sp!, {r4-r6, pc}

  @ ----------------------------------------------------------------------
  @ oc_frag_recon_inter_v6
  @   8x8 block: dst = saturate_u8(src + residue), one row per iteration.
  @
  @ In:   r0 = unsigned char *_dst
  @       r1 = const unsigned char *_src
  @       r2 = int _ystride
  @       r3 = const ogg_int16_t _residue[64]
  @ Regs: r4-r7, lr saved/restored; r0, r1, r3, ip and flags are modified.
  @ Note: the original comments mark the source load as "Unaligned", i.e.
  @       _src is presumably not guaranteed to be word aligned.
  @ ----------------------------------------------------------------------
  oc_frag_recon_inter_v6:
          stmdb   sp!, {r4-r7, lr}
          mov     lr, #8                  @ lr = rows left
  .Lofrinter_v6_row:
          ldrd    r6, [r3], #8            @ r6 = 11110000  r7 = 33332222
          subs    lr, lr, #1              @ flags consumed by the bgt at the loop tail
  .if OC_ARM_CAN_UNALIGN_LDRD
          ldrd    r4, [r1], r2            @ Unaligned ; r4 = 33221100  r5 = 77665544
  .else
          ldr     r5, [r1, #4]            @ r5 = 77665544
          ldr     r4, [r1], r2            @ r4 = 33221100 ; step src
  .endif
          pkhbt   ip, r6, r7, lsl #16     @ ip = 22220000
          pkhtb   r7, r7, r6, asr #16     @ r7 = 33331111
          uxtb16  r6, r4                  @ r6 = __22__00
          uxtb16  r4, r4, ror #8          @ r4 = __33__11
          qadd16  ip, ip, r6              @ ip = xx22xx00
          qadd16  r4, r7, r4              @ r4 = xx33xx11
          ldrd    r6, [r3], #8            @ r6 = 55554444  r7 = 77776666
          usat16  r4, #8, r4              @ r4 = __33__11
          usat16  ip, #8, ip              @ ip = __22__00
          orr     r4, ip, r4, lsl #8      @ r4 = 33221100
          pkhbt   ip, r6, r7, lsl #16     @ ip = 66664444
          pkhtb   r7, r7, r6, asr #16     @ r7 = 77775555
          uxtb16  r6, r5                  @ r6 = __66__44
          uxtb16  r5, r5, ror #8          @ r5 = __77__55
          qadd16  ip, ip, r6              @ ip = xx66xx44
          qadd16  r5, r7, r5              @ r5 = xx77xx55
          usat16  ip, #8, ip              @ ip = __66__44
          usat16  r5, #8, r5              @ r5 = __77__55
          orr     r5, ip, r5, lsl #8      @ r5 = 77665544
          strd    r4, [r0], r2            @ store the row, step dst
          bgt     .Lofrinter_v6_row
          ldmia   sp!, {r4-r7, pc}

  @ ----------------------------------------------------------------------
  @ oc_frag_recon_inter2_v6
  @   8x8 block: dst = saturate_u8(((src1 + src2) >> 1) + residue), one
  @   row per iteration (UHADD8 supplies the per-byte halving add).
  @
  @ In:   r0   = unsigned char *_dst
  @       r1   = const unsigned char *_src1
  @       r2   = const unsigned char *_src2
  @       r3   = int _ystride
  @       [sp] = const ogg_int16_t _residue[64]
  @ Regs: r4-r9, lr saved/restored; r0-r2, ip and flags are modified.
  @ ----------------------------------------------------------------------
  oc_frag_recon_inter2_v6:
          ldr     ip, [sp]                @ ip = _residue (read before the push moves sp)
          stmdb   sp!, {r4-r9, lr}
          mov     lr, #8                  @ lr = rows left
  .Lofrinter2_v6_row:
          @ Upper half of the row (pixels 4-7) first.
          ldrd    r6, [ip, #8]            @ r6 = 55554444  r7 = 77776666
          subs    lr, lr, #1              @ flags consumed by the bgt at the loop tail
          ldr     r4, [r1, #4]            @ Unaligned ; r4 = src1[1] = 77665544
          ldr     r5, [r2, #4]            @ Unaligned ; r5 = src2[1] = 77665544
          pkhbt   r8, r6, r7, lsl #16     @ r8 = 66664444
          pkhtb   r9, r7, r6, asr #16     @ r9 = 77775555
          uhadd8  r4, r4, r5              @ r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
          uxtb16  r5, r4                  @ r5 = __66__44
          uxtb16  r4, r4, ror #8          @ r4 = __77__55
          qadd16  r8, r8, r5              @ r8 = xx66xx44
          qadd16  r9, r9, r4              @ r9 = xx77xx55
          @ Lower half (pixels 0-3); the residue pointer moves on a full row.
          ldrd    r6, [ip], #16           @ r6 = 11110000  r7 = 33332222
          usat16  r8, #8, r8              @ r8 = __66__44
          ldr     r4, [r1], r3            @ Unaligned ; r4 = src1[0] = 33221100
          usat16  r9, #8, r9              @ r9 = __77__55
          ldr     r5, [r2], r3            @ Unaligned ; r5 = src2[0] = 33221100
          orr     r9, r8, r9, lsl #8      @ r9 = 77665544
          pkhbt   r8, r6, r7, lsl #16     @ r8 = 22220000
          uhadd8  r4, r4, r5              @ r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
          pkhtb   r7, r7, r6, asr #16     @ r7 = 33331111
          uxtb16  r5, r4                  @ r5 = __22__00
          uxtb16  r4, r4, ror #8          @ r4 = __33__11
          qadd16  r8, r8, r5              @ r8 = xx22xx00
          qadd16  r7, r7, r4              @ r7 = xx33xx11
          usat16  r8, #8, r8              @ r8 = __22__00
          usat16  r7, #8, r7              @ r7 = __33__11
          orr     r8, r8, r7, lsl #8      @ r8 = 33221100
          strd    r8, [r0], r3            @ store r8:r9 as the row, step dst
          bgt     .Lofrinter2_v6_row
          ldmia   sp!, {r4-r9, pc}
  .endif
  .if OC_ARM_ASM_NEON
          .globl  oc_frag_copy_list_neon
          .globl  oc_frag_recon_intra_neon
          .globl  oc_frag_recon_inter_neon
          .globl  oc_frag_recon_inter2_neon

  @ Shared tail of the three NEON reconstruction routines: narrows the
  @ eight rows of signed 16-bit sums held in q8..q15 to unsigned bytes with
  @ saturation (into d16..d23) and stores one 8-byte row per VST1 through
  @ r0, stepping r0 by `stride` after every row.  Stores are interleaved
  @ with the narrowing exactly as in the original code.
  .macro  oc_frag_neon_narrow_store stride
          vqmovun.s16 d16, q8
          vqmovun.s16 d17, q9
          vqmovun.s16 d18, q10
          vst1.64 {d16}, [r0,:64], \stride
          vqmovun.s16 d19, q11
          vst1.64 {d17}, [r0,:64], \stride
          vqmovun.s16 d20, q12
          vst1.64 {d18}, [r0,:64], \stride
          vqmovun.s16 d21, q13
          vst1.64 {d19}, [r0,:64], \stride
          vqmovun.s16 d22, q14
          vst1.64 {d20}, [r0,:64], \stride
          vqmovun.s16 d23, q15
          vst1.64 {d21}, [r0,:64], \stride
          vst1.64 {d22}, [r0,:64], \stride
          vst1.64 {d23}, [r0,:64], \stride
  .endm

  @ ----------------------------------------------------------------------
  @ oc_frag_copy_list_neon
  @   For every index in _fragis, copies 8 rows of 8 bytes from _src_frame
  @   to _dst_frame at byte offset _frag_buf_offs[index], issuing PLD
  @   prefetches for the next fragment's source rows along the way.
  @
  @ In:   r0        = _dst_frame
  @       r1        = _src_frame
  @       r2        = _ystride
  @       r3        = _fragis
  @       [sp]      = _nfragis
  @       [sp, #4]  = _frag_buf_offs
  @ Regs: r4-r7, lr saved/restored; r3, ip, d0-d7 and flags are modified.
  @ Note: every row access carries a :64 alignment hint.
  @ ----------------------------------------------------------------------
  oc_frag_copy_list_neon:
          ldr     ip, [sp]                @ ip = _nfragis
          stmdb   sp!, {r4-r7, lr}        @ 5 words pushed: stack args now at sp+20 / sp+24
          cmp     ip, #1
          ldrge   r6, [r3]                @ r6 = _fragis[fragii]
          ldrge   lr, [sp, #24]           @ lr = _frag_buf_offs
          blt     .Lofcl_neon_done        @ empty list
          @ Stall (2 on Xscale)
          ldr     r6, [lr, r6, lsl #2]    @ r6 = _frag_buf_offs[_fragis[fragii]]
          @ Stall (on XScale)
          mov     r7, r6                  @ seed r7 so the PLDs below never see an unset register
  .Lofcl_neon_frag:
          add     r4, r1, r6              @ r4 = source rows of this fragment
          vld1.64 {d0}, [r4,:64], r2
          add     r5, r0, r6              @ r5 = destination rows of this fragment
          vld1.64 {d1}, [r4,:64], r2
          subs    ip, ip, #1              @ GT from here on means another fragment follows
          vld1.64 {d2}, [r4,:64], r2
          ldrgt   r6, [r3, #4]!           @ r6 = _fragis[fragii]
          vld1.64 {d3}, [r4,:64], r2
          ldrgt   r6, [lr, r6, lsl #2]    @ r6 = _frag_buf_offs[_fragis[fragii]]
          vld1.64 {d4}, [r4,:64], r2
          addgt   r7, r1, r6              @ r7 = next fragment's source rows
          vld1.64 {d5}, [r4,:64], r2
          pld     [r7]
          vld1.64 {d6}, [r4,:64], r2
          pld     [r7, r2]
          vld1.64 {d7}, [r4,:64]
          pld     [r7, r2, lsl #1]
          vst1.64 {d0}, [r5,:64], r2
          addgt   r7, r7, r2, lsl #2
          vst1.64 {d1}, [r5,:64], r2
          pld     [r7, -r2]
          vst1.64 {d2}, [r5,:64], r2
          pld     [r7]
          vst1.64 {d3}, [r5,:64], r2
          pld     [r7, r2]
          vst1.64 {d4}, [r5,:64], r2
          pld     [r7, r2, lsl #1]
          vst1.64 {d5}, [r5,:64], r2
          addgt   r7, r7, r2, lsl #2
          vst1.64 {d6}, [r5,:64], r2
          pld     [r7, -r2]
          vst1.64 {d7}, [r5,:64]
          bgt     .Lofcl_neon_frag
  .Lofcl_neon_done:
          ldmia   sp!, {r4-r7, pc}

  @ ----------------------------------------------------------------------
  @ oc_frag_recon_intra_neon
  @   8x8 block: dst = saturate_u8(residue + 128).
  @
  @ In:   r0 = unsigned char *_dst
  @       r1 = int _ystride
  @       r2 = const ogg_int16_t _residue[64]
  @ Regs: r0, r3, q0 and q8-q15 are modified; nothing is saved.
  @ ----------------------------------------------------------------------
  oc_frag_recon_intra_neon:
          mov     r3, #128
          vdup.s16 q0, r3                 @ q0 = 128 in all eight lanes
          vldmia  r2, {d16-d31}           @ all 64 residues; D16= 3333222211110000 etc ; 9(8) cycles
          .irp    row, q8, q9, q10, q11, q12, q13, q14, q15
          vqadd.s16 \row, \row, q0
          .endr
          oc_frag_neon_narrow_store r1
          mov     pc, lr

  @ ----------------------------------------------------------------------
  @ oc_frag_recon_inter_neon
  @   8x8 block: dst = saturate_u8(src + residue).  The same stride is
  @   applied to src and dst; source loads carry no alignment hint.
  @
  @ In:   r0 = unsigned char *_dst
  @       r1 = const unsigned char *_src
  @       r2 = int _ystride
  @       r3 = const ogg_int16_t _residue[64]
  @ Regs: r0, r1, q0-q3 and q8-q15 are modified; nothing is saved.
  @ ----------------------------------------------------------------------
  oc_frag_recon_inter_neon:
          vldmia  r3, {d16-d31}           @ all 64 residues; D16= 3333222211110000 etc ; 9(8) cycles
          @ Rows 0-3: load, widen to 16 bits, add to the residues.
          vld1.64 {d0}, [r1], r2
          vld1.64 {d2}, [r1], r2
          vmovl.u8 q0, d0                 @ Q0 = __77__66__55__44__33__22__11__00
          vld1.64 {d4}, [r1], r2
          vmovl.u8 q1, d2                 @ etc
          vld1.64 {d6}, [r1], r2
          vmovl.u8 q2, d4
          vmovl.u8 q3, d6
          vqadd.s16 q8, q8, q0
          @ Rows 4-7: loads overlap the tail of the first four additions.
          vld1.64 {d0}, [r1], r2
          vqadd.s16 q9, q9, q1
          vld1.64 {d2}, [r1], r2
          vqadd.s16 q10, q10, q2
          vld1.64 {d4}, [r1], r2
          vqadd.s16 q11, q11, q3
          vld1.64 {d6}, [r1], r2
          vmovl.u8 q0, d0
          vmovl.u8 q1, d2
          vmovl.u8 q2, d4
          vmovl.u8 q3, d6
          vqadd.s16 q12, q12, q0
          vqadd.s16 q13, q13, q1
          vqadd.s16 q14, q14, q2
          vqadd.s16 q15, q15, q3
          oc_frag_neon_narrow_store r2
          mov     pc, lr

  @ ----------------------------------------------------------------------
  @ oc_frag_recon_inter2_neon
  @   8x8 block: dst = saturate_u8(((src1 + src2) >> 1) + residue), with
  @   VHADD.U8 supplying the per-byte halving add.  The same stride is
  @   applied to src1, src2 and dst.
  @
  @ In:   r0   = unsigned char *_dst
  @       r1   = const unsigned char *_src1
  @       r2   = const unsigned char *_src2
  @       r3   = int _ystride
  @       [sp] = const ogg_int16_t _residue[64]
  @ Regs: r0-r2, ip, q0-q3 and q8-q15 are modified; nothing is saved.
  @ ----------------------------------------------------------------------
  oc_frag_recon_inter2_neon:
          ldr     ip, [sp]                @ ip = _residue
          vldmia  ip, {d16-d31}           @ all 64 residues
          @ Rows 0-3.
          vld1.64 {d0}, [r1], r3
          vld1.64 {d4}, [r2], r3
          vld1.64 {d1}, [r1], r3
          vld1.64 {d5}, [r2], r3
          vhadd.u8 q2, q0, q2             @ Q2 = FFEEDDCCBBAA99887766554433221100
          vld1.64 {d2}, [r1], r3
          vld1.64 {d6}, [r2], r3
          vmovl.u8 q0, d4                 @ Q0 = __77__66__55__44__33__22__11__00
          vld1.64 {d3}, [r1], r3
          vmovl.u8 q2, d5                 @ etc
          vld1.64 {d7}, [r2], r3
          vhadd.u8 q3, q1, q3
          vqadd.s16 q8, q8, q0
          vqadd.s16 q9, q9, q2
          @ Rows 4-7: loads overlap the arithmetic for rows 2-3.
          vld1.64 {d0}, [r1], r3
          vmovl.u8 q1, d6
          vld1.64 {d4}, [r2], r3
          vmovl.u8 q3, d7
          vld1.64 {d1}, [r1], r3
          vqadd.s16 q10, q10, q1
          vld1.64 {d5}, [r2], r3
          vqadd.s16 q11, q11, q3
          vld1.64 {d2}, [r1], r3
          vhadd.u8 q2, q0, q2
          vld1.64 {d6}, [r2], r3
          vld1.64 {d3}, [r1], r3
          vmovl.u8 q0, d4
          vld1.64 {d7}, [r2], r3
          vmovl.u8 q2, d5
          vhadd.u8 q3, q1, q3
          vqadd.s16 q12, q12, q0
          vqadd.s16 q13, q13, q2
          vmovl.u8 q1, d6
          vmovl.u8 q3, d7
          vqadd.s16 q14, q14, q1
          vqadd.s16 q15, q15, q3
          oc_frag_neon_narrow_store r3
          mov     pc, lr
  .endif
@ END