@ armloop.asm
#ifdef OC_ARM_ASM
@********************************************************************
@* *
@* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
@* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
@* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
@* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
@* *
@* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
@* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
@* *
@********************************************************************
@ Original implementation:
@ Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
@ last mod: $Id: armloop.s 17481 2010-10-03 22:49:42Z tterribe $
@********************************************************************
@ GNU as (ARM) syntax.  Theora loop-filter routines: a plain ARMv4
@ version, an ARMv6 media-extension version (guarded by
@ OC_ARM_ASM_MEDIA), and a NEON version (guarded by OC_ARM_ASM_NEON).
.text; .p2align 2
.global _oc_loop_filter_frag_rows_arm
@ Which bit this is depends on the order of packing within a bitfield.
@ Hopefully that doesn't change among any of the relevant compilers.
.set OC_FRAG_CODED_FLAG, 1
@ Vanilla ARM v4 version
@ .type loop_filter_h_arm, %function; loop_filter_h_arm: @ PROC
loop_filter_h_arm:
@ Filter a vertical block edge: for each of 8 rows, filter the four
@ horizontally-adjacent pixels _pix[-2..1] around the edge, using the
@ precomputed lflim() lookup table at _bv (indexed by the raw filter
@ value R, which may be negative).
@ r0 = unsigned char *_pix  (points just right of the edge)
@ r1 = int _ystride
@ r2 = int *_bv             (signed byte table: _bv[R] = lflim(R,L))
@ preserves r0-r3
 STMFD r13!,{r3-r6,r14}
 MOV r14,#8                     @ r14= row counter (8 rows per edge)
 MOV r6, #255                   @ r6 = 255, used by the clamp idiom below
lfh_arm_lp:
 LDRB r3, [r0, #-2]             @ r3 = _pix[0]
 LDRB r12,[r0, #1]              @ r12= _pix[3]
 LDRB r4, [r0, #-1]             @ r4 = _pix[1]
 LDRB r5, [r0]                  @ r5 = _pix[2]
 SUB r3, r3, r12                @ r3 = _pix[0]-_pix[3]+4
 ADD r3, r3, #4
 SUB r12,r5, r4                 @ r12= _pix[2]-_pix[1]
 ADD r12,r12,r12,LSL #1         @ r12= 3*(_pix[2]-_pix[1])
 ADD r12,r12,r3                 @ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
 MOV r12,r12,ASR #3             @ r12= R (signed)
 LDRSB r12,[r2, r12]            @ r12= f = lflim(R,L), sign-extended
@ Stall (2 on Xscale)
@ Clamp _pix[1]+f and _pix[2]-f to [0,255] without branches:
@ if the ADDS/SUBS result is negative, GT fails so flags are still from
@ the add; LT then holds and (x ASR #32)==~0, so 255^~0 stores byte 0.
@ If the result is in (255,+inf), CMPGT sets LT and (x ASR #32)==0, so
@ the EOR yields 255.  In-range results are left untouched.
 ADDS r4, r4, r12
 CMPGT r6, r4
 EORLT r4, r6, r4, ASR #32
 SUBS r5, r5, r12
 CMPGT r6, r5
 EORLT r5, r6, r5, ASR #32
 STRB r4, [r0, #-1]             @ _pix[1] = clamped value
 STRB r5, [r0], r1              @ _pix[2] = clamped value; advance one row
 SUBS r14,r14,#1
 BGT lfh_arm_lp
 SUB r0, r0, r1, LSL #3         @ rewind 8 rows so r0 is preserved
 LDMFD r13!,{r3-r6,PC}
@ @ .size loop_filter_h_arm, .-loop_filter_h_arm @ ENDP
@ .type loop_filter_v_arm, %function; loop_filter_v_arm: @ PROC
loop_filter_v_arm:
@ Filter a horizontal block edge: for each of 8 columns, filter the four
@ vertically-adjacent pixels at _pix[-2*_ystride.._ystride], using the
@ same lflim() table and branchless clamp as loop_filter_h_arm.
@ r0 = unsigned char *_pix  (points to the row just below the edge)
@ r1 = int _ystride
@ r2 = int *_bv             (signed byte table: _bv[R] = lflim(R,L))
@ preserves r0-r3
 STMFD r13!,{r3-r6,r14}
 MOV r14,#8                     @ r14= column counter (8 columns per edge)
 MOV r6, #255                   @ r6 = 255, used by the clamp idiom below
lfv_arm_lp:
 LDRB r3, [r0, -r1, LSL #1]     @ r3 = _pix[0] (two rows above)
 LDRB r12,[r0, r1]              @ r12= _pix[3] (one row below)
 LDRB r4, [r0, -r1]             @ r4 = _pix[1]
 LDRB r5, [r0]                  @ r5 = _pix[2]
 SUB r3, r3, r12                @ r3 = _pix[0]-_pix[3]+4
 ADD r3, r3, #4
 SUB r12,r5, r4                 @ r12= _pix[2]-_pix[1]
 ADD r12,r12,r12,LSL #1         @ r12= 3*(_pix[2]-_pix[1])
 ADD r12,r12,r3                 @ r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
 MOV r12,r12,ASR #3             @ r12= R (signed)
 LDRSB r12,[r2, r12]            @ r12= f = lflim(R,L), sign-extended
@ Stall (2 on Xscale)
@ Branchless clamp to [0,255]; see loop_filter_h_arm for the flag trick.
 ADDS r4, r4, r12
 CMPGT r6, r4
 EORLT r4, r6, r4, ASR #32
 SUBS r5, r5, r12
 CMPGT r6, r5
 EORLT r5, r6, r5, ASR #32
 STRB r4, [r0, -r1]             @ _pix[1] = clamped value
 STRB r5, [r0], #1              @ _pix[2] = clamped value; next column
 SUBS r14,r14,#1
 BGT lfv_arm_lp
 SUB r0, r0, #8                 @ rewind 8 columns so r0 is preserved
 LDMFD r13!,{r3-r6,PC}
@ @ .size loop_filter_v_arm, .-loop_filter_v_arm @ ENDP
@ .type oc_loop_filter_frag_rows_arm, %function; oc_loop_filter_frag_rows_arm: @ PROC
_oc_loop_filter_frag_rows_arm:
@ Apply the loop filter to every coded fragment in rows
@ [_fragi0,_fragi0_end): for each coded fragment, filter its left edge
@ (unless it is first in its row), its top edge (unless the row is the
@ top row), and the right/bottom edges it shares with an UNcoded
@ neighbor (those edges are otherwise filtered when the neighbor is
@ processed).
@ Arguments in registers (first four) and on the caller's stack (rest):
@ r0 = _ref_frame_data
@ r1 = _ystride
@ r2 = _bv
@ r3 = _frags
@ r4 = _fragi0
@ r5 = _fragi0_end
@ r6 = _fragi_top
@ r7 = _fragi_bot
@ r8 = _frag_buf_offs
@ r9 = _nhfrags
 MOV r12,r13                    @ r12= caller sp, to reach stacked args
 STMFD r13!,{r0,r4-r11,r14}     @ r0 saved so it can be reloaded per fragment
 LDMFD r12,{r4-r9}              @ load the six stack arguments
 ADD r2, r2, #127               @ _bv += 127: bias so _bv[R] works for R<0
 CMP r4, r5                     @ if(_fragi0>=_fragi0_end)
 BGE oslffri_arm_end            @ bail
 SUBS r9, r9, #1                @ r9 = _nhfrags-1 if (r9<=0)
 BLE oslffri_arm_end            @ bail
 ADD r3, r3, r4, LSL #2         @ r3 = &_frags[fragi]
 ADD r8, r8, r4, LSL #2         @ r8 = &_frag_buf_offs[fragi]
 SUB r7, r7, r9                 @ _fragi_bot -= _nhfrags;
oslffri_arm_lp1:                @ per-row loop
 MOV r10,r4                     @ r10= fragi = _fragi0
 ADD r11,r4, r9                 @ r11= fragi_end-1=fragi+_nhfrags-1
oslffri_arm_lp2:                @ per-fragment loop
 LDR r14,[r3], #4               @ r14= _frags[fragi] _frags++
 LDR r0, [r13]                  @ r0 = _ref_frame_data (reload saved copy)
 LDR r12,[r8], #4               @ r12= _frag_buf_offs[fragi] _frag_buf_offs++
 TST r14,#OC_FRAG_CODED_FLAG
 BEQ oslffri_arm_uncoded
 CMP r10,r4                     @ if (fragi>_fragi0)
 ADD r0, r0, r12                @ r0 = _ref_frame_data + _frag_buf_offs[fragi]
 BLGT loop_filter_h_arm         @ filter left edge
 CMP r4, r6                     @ if (_fragi0>_fragi_top)
 BLGT loop_filter_v_arm         @ filter top edge
 CMP r10,r11                    @ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
 LDRLT r12,[r3]                 @ r12 = _frags[fragi+1]
 ADD r0, r0, #8                 @ advance to the right edge
 ADD r10,r10,#1                 @ r10 = fragi+1;
 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
 CMPLT r12,#OC_FRAG_CODED_FLAG  @ && _frags[fragi+1].coded==0
 BLLT loop_filter_h_arm         @ filter right edge vs. uncoded neighbor
 CMP r10,r7                     @ if (fragi<_fragi_bot)
 LDRLT r12,[r3, r9, LSL #2]     @ r12 = _frags[fragi+1+_nhfrags-1]
 SUB r0, r0, #8                 @ back to the left edge...
 ADD r0, r0, r1, LSL #3         @ ...and down 8 rows (bottom edge)
 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
 CMPLT r12,#OC_FRAG_CODED_FLAG  @ && fragment below is uncoded
 BLLT loop_filter_v_arm         @ filter bottom edge vs. uncoded neighbor
 CMP r10,r11                    @ while(fragi<=fragi_end-1)
 BLE oslffri_arm_lp2
 MOV r4, r10                    @ r4 = fragi0 += _nhfrags
 CMP r4, r5
 BLT oslffri_arm_lp1
oslffri_arm_end:
 LDMFD r13!,{r0,r4-r11,PC}
oslffri_arm_uncoded:            @ skip an uncoded fragment; same loop tail
 ADD r10,r10,#1
 CMP r10,r11
 BLE oslffri_arm_lp2
 MOV r4, r10                    @ r4 = _fragi0 += _nhfrags
 CMP r4, r5
 BLT oslffri_arm_lp1
 LDMFD r13!,{r0,r4-r11,PC}
@ @ .size oc_loop_filter_frag_rows_arm, .-oc_loop_filter_frag_rows_arm @ ENDP
.if OC_ARM_ASM_MEDIA
.global _oc_loop_filter_init_v6
.global _oc_loop_filter_frag_rows_v6
@ .type oc_loop_filter_init_v6, %function; oc_loop_filter_init_v6: @ PROC
_oc_loop_filter_init_v6:
@ Precompute the SIMD limit constant for the v6 filters: store
@ ll = (255-2*L)&0xFF replicated into all four bytes of *(int *)_bv.
@ r0 = _bv
@ r1 = _flimit (=L from the spec)
 MVN r1, r1, LSL #1             @ r1 = <0xFFFFFF|255-2*L>
 AND r1, r1, #255               @ r1 = ll=(255-2*L)&0xFF
 ORR r1, r1, r1, LSL #8         @ r1 = <ll|ll>
 PKHBT r1, r1, r1, LSL #16      @ r1 = <ll|ll|ll|ll>
 STR r1, [r0]
 MOV PC,r14
@ @ .size oc_loop_filter_init_v6, .-oc_loop_filter_init_v6 @ ENDP
@ We could use the same strategy as the v filter below, but that would require
@ 40 instructions to load the data and transpose it into columns and another
@ 32 to write out the results at the end, plus the 52 instructions to do the
@ filtering itself.
@ This is slightly less, and less code, even assuming we could have shared the
@ 52 instructions in the middle with the other function.
@ It executes slightly fewer instructions than the ARMv6 approach David Conrad
@ proposed for FFmpeg, but not by much:
@ http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
@ His is a lot less code, though, because it only does two rows at once instead
@ of four.
@ .type loop_filter_h_v6, %function; loop_filter_h_v6: @ PROC
loop_filter_h_v6:
@ ARMv6 horizontal edge filter: processes all 8 rows by calling the
@ 4-row core twice (second call offset by 4*_ystride).
@ r0 = unsigned char *_pix
@ r1 = int _ystride
@ r2 = int _ll        (ll byte replicated x4, from oc_loop_filter_init_v6)
@ preserves r0-r3
 STMFD r13!,{r4-r11,r14}
 MOV r12, 0x0003
 MOVT r12, 0x1                  @ r12= 0x10003: SMLAD multiplier <1|3> + bias
 BL loop_filter_h_core_v6       @ rows 0-3
 ADD r0, r0, r1, LSL #2
 BL loop_filter_h_core_v6       @ rows 4-7
 SUB r0, r0, r1, LSL #2         @ restore r0
 LDMFD r13!,{r4-r11,PC}
@ @ .size loop_filter_h_v6, .-loop_filter_h_v6 @ ENDP
@ .type loop_filter_h_core_v6, %function; loop_filter_h_core_v6: @ PROC
loop_filter_h_core_v6:
@ Filter 4 rows across a vertical edge with ARMv6 SIMD: gather the four
@ pixels around the edge from 4 rows (p,q,r,s), compute the filter value
@ per row with SMLAD, then apply lflim() via saturating byte arithmetic
@ and scatter the two modified pixels of each row back.
@ r0 = unsigned char *_pix
@ r1 = int _ystride
@ r2 = int _ll        (ll byte replicated x4)
@ r12= 0x10003        (SMLAD multiplier <1|3> plus rounding bias)
@ Preserves r0-r3, r12; Clobbers r4-r11.
 LDR r4,[r0, #-2]!              @ r4 = <p3|p2|p1|p0>
@ Single issue
 LDR r5,[r0, r1]!               @ r5 = <q3|q2|q1|q0>
 UXTB16 r6, r4, ROR #16         @ r6 = <p0|p2>
 UXTB16 r4, r4, ROR #8          @ r4 = <p3|p1>
 UXTB16 r7, r5, ROR #16         @ r7 = <q0|q2>
 UXTB16 r5, r5, ROR #8          @ r5 = <q3|q1>
 PKHBT r8, r4, r5, LSL #16      @ r8 = <__|q1|__|p1>
 PKHBT r9, r6, r7, LSL #16      @ r9 = <__|q2|__|p2>
 SSUB16 r6, r4, r6              @ r6 = <p3-p0|p1-p2>
 SMLAD r6, r6, r12,r12          @ r6 = <????|(p3-p0)+3*(p1-p2)+3>
 SSUB16 r7, r5, r7              @ r7 = <q3-q0|q1-q2>
 SMLAD r7, r7, r12,r12          @ r7 = <????|(q0-q3)+3*(q2-q1)+4>
 LDR r4,[r0, r1]!               @ r4 = <r3|r2|r1|r0>
 MOV r6, r6, ASR #3             @ r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
 LDR r5,[r0, r1]!               @ r5 = <s3|s2|s1|s0>
 PKHBT r11,r6, r7, LSL #13      @ r11= <??|-R_q|??|-R_p>
 UXTB16 r6, r4, ROR #16         @ r6 = <r0|r2>
 UXTB16 r11,r11                 @ r11= <__|-R_q|__|-R_p>
 UXTB16 r4, r4, ROR #8          @ r4 = <r3|r1>
 UXTB16 r7, r5, ROR #16         @ r7 = <s0|s2>
 PKHBT r10,r6, r7, LSL #16      @ r10= <__|s2|__|r2>
 SSUB16 r6, r4, r6              @ r6 = <r3-r0|r1-r2>
 UXTB16 r5, r5, ROR #8          @ r5 = <s3|s1>
 SMLAD r6, r6, r12,r12          @ r6 = <????|(r3-r0)+3*(r2-r1)+3>
 SSUB16 r7, r5, r7              @ r7 = <s3-s0|s1-s2>
 SMLAD r7, r7, r12,r12          @ r7 = <????|(s0-s3)+3*(s2-s1)+4>
 ORR r9, r9, r10, LSL #8        @ r9 = <s2|q2|r2|p2>
 MOV r6, r6, ASR #3             @ r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
 PKHBT r10,r4, r5, LSL #16      @ r10= <__|s1|__|r1>
 PKHBT r6, r6, r7, LSL #13      @ r6 = <??|-R_s|??|-R_r>
 ORR r8, r8, r10, LSL #8        @ r8 = <s1|q1|r1|p1>
 UXTB16 r6, r6                  @ r6 = <__|-R_s|__|-R_r>
 MOV r10,#0
 ORR r6, r11,r6, LSL #8         @ r6 = <-R_s|-R_q|-R_r|-R_p>
@ Single issue
@ There's no min, max or abs instruction.
@ SSUB8 and SEL will work for abs, and we can do all the rest with
@ unsigned saturated adds, which means the GE flags are still all
@ set when we're done computing lflim(abs(R_i),L).
@ This allows us to both add and subtract, and split the results by
@ the original sign of R_i.
 SSUB8 r7, r10,r6
@ Single issue
 SEL r7, r7, r6                 @ r7 = abs(R_i)
@ Single issue
 UQADD8 r4, r7, r2              @ r4 = 255-max(2*L-abs(R_i),0)
@ Single issue
 UQADD8 r7, r7, r4
@ Single issue
 UQSUB8 r7, r7, r4              @ r7 = min(abs(R_i),max(2*L-abs(R_i),0))
@ Single issue
@ Compute both signs; SEL (using the GE flags from the sign split above)
@ picks the correct one per byte.
 UQSUB8 r4, r8, r7
 UQADD8 r5, r9, r7
 UQADD8 r8, r8, r7
 UQSUB8 r9, r9, r7
 SEL r8, r8, r4                 @ r8 = p1+lflim(R_i,L)
 SEL r9, r9, r5                 @ r9 = p2-lflim(R_i,L)
@ Scatter the two filtered bytes of each row back, walking upward.
 MOV r5, r9, LSR #24            @ r5 = s2
 STRB r5, [r0,#2]!
 MOV r4, r8, LSR #24            @ r4 = s1
 STRB r4, [r0,#-1]
 MOV r5, r9, LSR #8             @ r5 = r2
 STRB r5, [r0,-r1]!
 MOV r4, r8, LSR #8             @ r4 = r1
 STRB r4, [r0,#-1]
 MOV r5, r9, LSR #16            @ r5 = q2
 STRB r5, [r0,-r1]!
 MOV r4, r8, LSR #16            @ r4 = q1
 STRB r4, [r0,#-1]
@ Single issue
 STRB r9, [r0,-r1]!             @ p2
@ Single issue
 STRB r8, [r0,#-1]              @ p1; r0 is back at its entry value
 MOV PC,r14
@ @ .size loop_filter_h_core_v6, .-loop_filter_h_core_v6 @ ENDP
@ This uses the same strategy as the MMXEXT version for x86, except that UHADD8
@ computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
@ This works just as well, with the following procedure for computing the
@ filter value, f:
@ u = ~UHADD8(p1,~p2);
@ v = UHADD8(~p1,p2);
@ m = v-u;
@ a = m^UHADD8(m^p0,m^~p3);
@ f = UHADD8(UHADD8(a,u1),v1);
@ where f = 127+R, with R in [-127,128] defined as in the spec.
@ This is exactly the same amount of arithmetic as the version that uses PAVGB
@ as the basic operator.
@ It executes about 2/3 the number of instructions of David Conrad's approach,
@ but requires more code, because it does all eight columns at once, instead
@ of four at a time.
@ .type loop_filter_v_v6, %function; loop_filter_v_v6: @ PROC
loop_filter_v_v6:
@ ARMv6 vertical edge filter: all 8 columns at once, 4 per register
@ half.  Row naming: p0-p3 are the four rows for columns 0-3 and p4-p7
@ the same rows for columns 4-7 (loaded pairwise with LDRD).
@ r0 = unsigned char *_pix  (row just below the edge)
@ r1 = int _ystride
@ r2 = int _ll        (ll byte replicated x4)
@ preserves r0-r11
 STMFD r13!,{r4-r11,r14}
 LDRD r6, r7, [r0, -r1]!        @ r7, r6 = <p5|p1>
 LDRD r4, r5, [r0, -r1]         @ r5, r4 = <p4|p0>
 LDRD r8, r9, [r0, r1]!         @ r9, r8 = <p6|p2>
 MVN r14,r6                     @ r14= ~p1
 LDRD r10,r11,[r0, r1]          @ r11,r10= <p7|p3>
@ Filter the first four columns.
 MVN r12,r8                     @ r12= ~p2
 UHADD8 r14,r14,r8              @ r14= v1=~p1+p2>>1
 UHADD8 r12,r12,r6              @ r12= p1+~p2>>1
 MVN r10, r10                   @ r10=~p3
 MVN r12,r12                    @ r12= u1=~p1+p2+1>>1
 SSUB8 r14,r14,r12              @ r14= m1=v1-u1
@ Single issue
 EOR r4, r4, r14                @ r4 = m1^p0
 EOR r10,r10,r14                @ r10= m1^~p3
 UHADD8 r4, r4, r10             @ r4 = (m1^p0)+(m1^~p3)>>1
@ Single issue
 EOR r4, r4, r14                @ r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
 SADD8 r14,r14,r12              @ r14= v1=m1+u1
 UHADD8 r4, r4, r12             @ r4 = a1+u1>>1
 MVN r12,r9                     @ r12= ~p6
 UHADD8 r4, r4, r14             @ r4 = f1=(a1+u1>>1)+v1>>1
@ Filter the second four columns.
 MVN r14,r7                     @ r14= ~p5
 UHADD8 r12,r12,r7              @ r12= p5+~p6>>1
 UHADD8 r14,r14,r9              @ r14= v2=~p5+p6>>1
 MVN r12,r12                    @ r12= u2=~p5+p6+1>>1
 MVN r11,r11                    @ r11=~p7
 SSUB8 r10,r14,r12              @ r10= m2=v2-u2
@ Single issue
 EOR r5, r5, r10                @ r5 = m2^p4
 EOR r11,r11,r10                @ r11= m2^~p7
 UHADD8 r5, r5, r11             @ r5 = (m2^p4)+(m2^~p7)>>1
@ Single issue
 EOR r5, r5, r10                @ r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
@ Single issue
 UHADD8 r5, r5, r12             @ r5 = a2+u2>>1
 MOV r12, #0x7F7F               @ r12 = {127}x4
 MOVT r12, #0x7F7F              @ r12 = {127}x4
 UHADD8 r5, r5, r14             @ r5 = f2=(a2+u2>>1)+v2>>1
@ Now split f[i] by sign.
@ There's no min or max instruction.
@ We could use SSUB8 and SEL, but this is just as many instructions and
@ dual issues more (for v7 without NEON).
 UQSUB8 r10,r4, r12             @ r10= R_i>0?R_i:0
 UQSUB8 r4, r12,r4              @ r4 = R_i<0?-R_i:0
 UQADD8 r11,r10,r2              @ r11= 255-max(2*L-abs(R_i<0),0)
 UQADD8 r14,r4, r2              @ r14= 255-max(2*L-abs(R_i>0),0)
 UQADD8 r10,r10,r11
 UQADD8 r4, r4, r14
 UQSUB8 r10,r10,r11             @ r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
 UQSUB8 r4, r4, r14             @ r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
 UQSUB8 r11,r5, r12             @ r11= R_i>0?R_i:0
 UQADD8 r6, r6, r10
 UQSUB8 r8, r8, r10
 UQSUB8 r5, r12,r5              @ r5 = R_i<0?-R_i:0
 UQSUB8 r6, r6, r4              @ r6 = p1+lflim(R_i,L)
 UQADD8 r8, r8, r4              @ r8 = p2-lflim(R_i,L)
 UQADD8 r10,r11,r2              @ r10= 255-max(2*L-abs(R_i<0),0)
 UQADD8 r14,r5, r2              @ r14= 255-max(2*L-abs(R_i>0),0)
 UQADD8 r11,r11,r10
 UQADD8 r5, r5, r14
 UQSUB8 r11,r11,r10             @ r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
 UQSUB8 r5, r5, r14             @ r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
 UQADD8 r7, r7, r11
 UQSUB8 r9, r9, r11
 UQSUB8 r7, r7, r5              @ r7 = p5+lflim(R_i,L)
 STRD r6, r7, [r0, -r1]         @ [p5:p1] = [r7: r6]
 UQADD8 r9, r9, r5              @ r9 = p6-lflim(R_i,L)
 STRD r8, r9, [r0]              @ [p6:p2] = [r9: r8]
 LDMFD r13!,{r4-r11,PC}
@ @ .size loop_filter_v_v6, .-loop_filter_v_v6 @ ENDP
@ .type oc_loop_filter_frag_rows_v6, %function; oc_loop_filter_frag_rows_v6: @ PROC
_oc_loop_filter_frag_rows_v6:
@ ARMv6 version of the fragment-row driver; identical control flow to
@ _oc_loop_filter_frag_rows_arm (see its header for the edge-selection
@ logic), but passes the replicated ll constant from *_bv instead of a
@ table pointer, and calls the v6 filter kernels.
@ r0 = _ref_frame_data
@ r1 = _ystride
@ r2 = _bv
@ r3 = _frags
@ r4 = _fragi0
@ r5 = _fragi0_end
@ r6 = _fragi_top
@ r7 = _fragi_bot
@ r8 = _frag_buf_offs
@ r9 = _nhfrags
 MOV r12,r13                    @ r12= caller sp, to reach stacked args
 STMFD r13!,{r0,r4-r11,r14}     @ r0 saved so it can be reloaded per fragment
 LDMFD r12,{r4-r9}              @ load the six stack arguments
 LDR r2, [r2]                   @ ll = *(int *)_bv
 CMP r4, r5                     @ if(_fragi0>=_fragi0_end)
 BGE oslffri_v6_end             @ bail
 SUBS r9, r9, #1                @ r9 = _nhfrags-1 if (r9<=0)
 BLE oslffri_v6_end             @ bail
 ADD r3, r3, r4, LSL #2         @ r3 = &_frags[fragi]
 ADD r8, r8, r4, LSL #2         @ r8 = &_frag_buf_offs[fragi]
 SUB r7, r7, r9                 @ _fragi_bot -= _nhfrags;
oslffri_v6_lp1:                 @ per-row loop
 MOV r10,r4                     @ r10= fragi = _fragi0
 ADD r11,r4, r9                 @ r11= fragi_end-1=fragi+_nhfrags-1
oslffri_v6_lp2:                 @ per-fragment loop
 LDR r14,[r3], #4               @ r14= _frags[fragi] _frags++
 LDR r0, [r13]                  @ r0 = _ref_frame_data (reload saved copy)
 LDR r12,[r8], #4               @ r12= _frag_buf_offs[fragi] _frag_buf_offs++
 TST r14,#OC_FRAG_CODED_FLAG
 BEQ oslffri_v6_uncoded
 CMP r10,r4                     @ if (fragi>_fragi0)
 ADD r0, r0, r12                @ r0 = _ref_frame_data + _frag_buf_offs[fragi]
 BLGT loop_filter_h_v6          @ filter left edge
 CMP r4, r6                     @ if (fragi0>_fragi_top)
 BLGT loop_filter_v_v6          @ filter top edge
 CMP r10,r11                    @ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
 LDRLT r12,[r3]                 @ r12 = _frags[fragi+1]
 ADD r0, r0, #8                 @ advance to the right edge
 ADD r10,r10,#1                 @ r10 = fragi+1;
 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
 CMPLT r12,#OC_FRAG_CODED_FLAG  @ && _frags[fragi+1].coded==0
 BLLT loop_filter_h_v6          @ filter right edge vs. uncoded neighbor
 CMP r10,r7                     @ if (fragi<_fragi_bot)
 LDRLT r12,[r3, r9, LSL #2]     @ r12 = _frags[fragi+1+_nhfrags-1]
 SUB r0, r0, #8                 @ back to the left edge...
 ADD r0, r0, r1, LSL #3         @ ...and down 8 rows (bottom edge)
 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
 CMPLT r12,#OC_FRAG_CODED_FLAG  @ && fragment below is uncoded
 BLLT loop_filter_v_v6          @ filter bottom edge vs. uncoded neighbor
 CMP r10,r11                    @ while(fragi<=fragi_end-1)
 BLE oslffri_v6_lp2
 MOV r4, r10                    @ r4 = fragi0 += nhfrags
 CMP r4, r5
 BLT oslffri_v6_lp1
oslffri_v6_end:
 LDMFD r13!,{r0,r4-r11,PC}
oslffri_v6_uncoded:             @ skip an uncoded fragment; same loop tail
 ADD r10,r10,#1
 CMP r10,r11
 BLE oslffri_v6_lp2
 MOV r4, r10                    @ r4 = fragi0 += nhfrags
 CMP r4, r5
 BLT oslffri_v6_lp1
 LDMFD r13!,{r0,r4-r11,PC}
@ @ .size oc_loop_filter_frag_rows_v6, .-oc_loop_filter_frag_rows_v6 @ ENDP
.endif
.if OC_ARM_ASM_NEON
.global _oc_loop_filter_init_neon
.global _oc_loop_filter_frag_rows_neon
@ .type oc_loop_filter_init_neon, %function; oc_loop_filter_init_neon: @ PROC
_oc_loop_filter_init_neon:
@ Precompute the NEON limit constant: store 2*L replicated across eight
@ 16-bit lanes into the 16-byte (128-bit aligned) buffer at _bv.
@ r0 = _bv
@ r1 = _flimit (=L from the spec)
 MOV r1, r1, LSL #1             @ r1 = 2*L
 VDUP.S16 Q15, r1               @ Q15= 2L in U16s
 VST1.64 {D30,D31}, [r0,:128]
 MOV PC,r14
@ @ .size oc_loop_filter_init_neon, .-oc_loop_filter_init_neon @ ENDP
@ .type loop_filter_h_neon, %function; loop_filter_h_neon: @ PROC
loop_filter_h_neon:
@ NEON horizontal edge filter: all 8 rows at once.  Loads the 4 pixels
@ around the edge from each row with interleaved 2-element structure
@ loads, transposes to columns, computes f, applies lflim(), and stores
@ the two modified columns back row by row.
@ r0 = unsigned char *_pix
@ r1 = int _ystride
@ r2 = int *_bv        (unused here; Q15 is preloaded by the caller)
@ preserves r0-r3
@ We assume Q15= 2*L in U16s
@ My best guesses at cycle counts (and latency)--vvv
 SUB r12,r0, #2
@ Doing a 2-element structure load saves doing two VTRN's below, at the
@ cost of using two more slower single-lane loads vs. the faster
@ all-lane loads.
@ It's less code this way, though, and benches a hair faster, but it
@ leaves D2 and D4 swapped.
 VLD2.16 {D0[],D2[]}, [r12], r1 @ D0 = ____________1100 2,1
                                @ D2 = ____________3322
 VLD2.16 {D4[],D6[]}, [r12], r1 @ D4 = ____________5544 2,1
                                @ D6 = ____________7766
 VLD2.16 {D0[1],D2[1]},[r12], r1 @ D0 = ________99881100 3,1
                                @ D2 = ________BBAA3322
 VLD2.16 {D4[1],D6[1]},[r12], r1 @ D4 = ________DDCC5544 3,1
                                @ D6 = ________FFEE7766
 VLD2.16 {D0[2],D2[2]},[r12], r1 @ D0 = ____GGHH99881100 3,1
                                @ D2 = ____JJIIBBAA3322
 VLD2.16 {D4[2],D6[2]},[r12], r1 @ D4 = ____KKLLDDCC5544 3,1
                                @ D6 = ____NNMMFFEE7766
 VLD2.16 {D0[3],D2[3]},[r12], r1 @ D0 = PPOOGGHH99881100 3,1
                                @ D2 = RRQQJJIIBBAA3322
 VLD2.16 {D4[3],D6[3]},[r12], r1 @ D4 = TTSSKKLLDDCC5544 3,1
                                @ D6 = VVUUNNMMFFEE7766
 VTRN.8 D0, D4                  @ D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511 1,1
 VTRN.8 D2, D6                  @ D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1
 VSUBL.U8 Q0, D0, D6            @ Q0 = 00 - 33 in S16s 1,3
 VSUBL.U8 Q8, D2, D4            @ Q8 = 22 - 11 in S16s 1,3
@ PLDs below prefetch the next fragment while the arithmetic runs.
 ADD r12,r0, #8
 VADD.S16 Q0, Q0, Q8            @ 1,3
 PLD [r12]
 VADD.S16 Q0, Q0, Q8            @ 1,3
 PLD [r12,r1]
 VADD.S16 Q0, Q0, Q8            @ Q0 = [0-3]+3*[2-1] 1,3
 PLD [r12,r1, LSL #1]
 VRSHR.S16 Q0, Q0, #3           @ Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
 ADD r12,r12,r1, LSL #2
@ We want to do
@ f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
@ = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
@ = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
@ = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
@ = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
@ So we've reduced the left and right hand terms to be the same, except
@ for a negation.
@ Stall x3
 VABS.S16 Q9, Q0                @ Q9 = |f| in U16s 1,4
 PLD [r12,-r1]
 VSHR.S16 Q0, Q0, #15           @ Q0 = -1 or 0 according to sign 1,3
 PLD [r12]
 VQSUB.U16 Q10,Q15,Q9           @ Q10= MAX(2L-|f|,0) in U16s 1,4
 PLD [r12,r1]
 VMOVL.U8 Q1, D2                @ Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
 PLD [r12,r1,LSL #1]
 VMIN.U16 Q9, Q10,Q9            @ Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
 ADD r12,r12,r1, LSL #2
@ Now we need to correct for the sign of f.
@ For negative elements of Q0, we want to subtract the appropriate
@ element of Q9. For positive elements we want to add them. No NEON
@ instruction exists to do this, so we need to negate the negative
@ elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
 VADD.S16 Q9, Q9, Q0            @ 1,3
 PLD [r12,-r1]
 VEOR.S16 Q9, Q9, Q0            @ Q9 = real value of f 1,3
@ Bah. No VRSBW.U8
@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
 VADDW.U8 Q2, Q9, D4            @ Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
 VSUB.S16 Q1, Q1, Q9            @ Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
 VQMOVUN.S16 D4, Q2             @ D4 = TTPPLLHHDD995511 1,1
 VQMOVUN.S16 D2, Q1             @ D2 = UUQQMMIIEEAA6622 1,1
 SUB r12,r0, #1
 VTRN.8 D4, D2                  @ D4 = QQPPIIHHAA992211 D2 = MMLLEEDD6655 1,1
@ Store the two filtered bytes of each row (16 bits at _pix-1).
 VST1.16 {D4[0]}, [r12], r1
 VST1.16 {D2[0]}, [r12], r1
 VST1.16 {D4[1]}, [r12], r1
 VST1.16 {D2[1]}, [r12], r1
 VST1.16 {D4[2]}, [r12], r1
 VST1.16 {D2[2]}, [r12], r1
 VST1.16 {D4[3]}, [r12], r1
 VST1.16 {D2[3]}, [r12], r1
 MOV PC,r14
@ @ .size loop_filter_h_neon, .-loop_filter_h_neon @ ENDP
@ .type loop_filter_v_neon, %function; loop_filter_v_neon: @ PROC
loop_filter_v_neon:
@ NEON vertical edge filter: all 8 columns at once.  The four rows
@ around the edge load directly into one D register each (no transpose
@ needed), then the same f/lflim computation as loop_filter_h_neon.
@ r0 = unsigned char *_pix  (row just below the edge)
@ r1 = int _ystride
@ r2 = int *_bv        (unused here; Q15 is preloaded by the caller)
@ preserves r0-r3
@ We assume Q15= 2*L in U16s
@ My best guesses at cycle counts (and latency)--vvv
 SUB r12,r0, r1, LSL #1
 VLD1.64 {D0}, [r12,:64], r1    @ D0 = SSOOKKGGCC884400 2,1
 VLD1.64 {D2}, [r12,:64], r1    @ D2 = TTPPLLHHDD995511 2,1
 VLD1.64 {D4}, [r12,:64], r1    @ D4 = UUQQMMIIEEAA6622 2,1
 VLD1.64 {D6}, [r12,:64]        @ D6 = VVRRNNJJFFBB7733 2,1
 VSUBL.U8 Q8, D4, D2            @ Q8 = 22 - 11 in S16s 1,3
 VSUBL.U8 Q0, D0, D6            @ Q0 = 00 - 33 in S16s 1,3
 ADD r12, #8                    @ prefetch address for the next fragment
 VADD.S16 Q0, Q0, Q8            @ 1,3
 PLD [r12]
 VADD.S16 Q0, Q0, Q8            @ 1,3
 PLD [r12,r1]
 VADD.S16 Q0, Q0, Q8            @ Q0 = [0-3]+3*[2-1] 1,3
 SUB r12, r0, r1                @ r12= &_pix[-_ystride] for the stores below
 VRSHR.S16 Q0, Q0, #3           @ Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
@ We want to do
@ f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
@ = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
@ = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
@ = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
@ = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
@ So we've reduced the left and right hand terms to be the same, except
@ for a negation.
@ Stall x3
 VABS.S16 Q9, Q0                @ Q9 = |f| in U16s 1,4
 VSHR.S16 Q0, Q0, #15           @ Q0 = -1 or 0 according to sign 1,3
@ Stall x2
 VQSUB.U16 Q10,Q15,Q9           @ Q10= MAX(2L-|f|,0) in U16s 1,4
 VMOVL.U8 Q2, D4                @ Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
@ Stall x2
 VMIN.U16 Q9, Q10,Q9            @ Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
@ Now we need to correct for the sign of f.
@ For negative elements of Q0, we want to subtract the appropriate
@ element of Q9. For positive elements we want to add them. No NEON
@ instruction exists to do this, so we need to negate the negative
@ elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
@ Stall x3
 VADD.S16 Q9, Q9, Q0            @ 1,3
@ Stall x2
 VEOR.S16 Q9, Q9, Q0            @ Q9 = real value of f 1,3
@ Bah. No VRSBW.U8
@ Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
 VADDW.U8 Q1, Q9, D2            @ Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11 1,3
 VSUB.S16 Q2, Q2, Q9            @ Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
 VQMOVUN.S16 D2, Q1             @ D2 = TTPPLLHHDD995511 1,1
 VQMOVUN.S16 D4, Q2             @ D4 = UUQQMMIIEEAA6622 1,1
@ Write back the two modified rows.
 VST1.64 {D2}, [r12,:64], r1
 VST1.64 {D4}, [r12,:64], r1
 MOV PC,r14
@ @ .size loop_filter_v_neon, .-loop_filter_v_neon @ ENDP
@ .type oc_loop_filter_frag_rows_neon, %function; oc_loop_filter_frag_rows_neon: @ PROC
_oc_loop_filter_frag_rows_neon:
@ NEON version of the fragment-row driver; identical control flow to
@ _oc_loop_filter_frag_rows_arm (see its header for the edge-selection
@ logic), but preloads Q15 = 2*L from *_bv once and calls the NEON
@ filter kernels.
@ r0 = _ref_frame_data
@ r1 = _ystride
@ r2 = _bv
@ r3 = _frags
@ r4 = _fragi0
@ r5 = _fragi0_end
@ r6 = _fragi_top
@ r7 = _fragi_bot
@ r8 = _frag_buf_offs
@ r9 = _nhfrags
 MOV r12,r13                    @ r12= caller sp, to reach stacked args
 STMFD r13!,{r0,r4-r11,r14}     @ r0 saved so it can be reloaded per fragment
 LDMFD r12,{r4-r9}              @ load the six stack arguments
 CMP r4, r5                     @ if(_fragi0>=_fragi0_end)
 BGE oslffri_neon_end           @ bail
 SUBS r9, r9, #1                @ r9 = _nhfrags-1 if (r9<=0)
 BLE oslffri_neon_end           @ bail
 VLD1.64 {D30,D31}, [r2,:128]   @ Q15= 2L in U16s
 ADD r3, r3, r4, LSL #2         @ r3 = &_frags[fragi]
 ADD r8, r8, r4, LSL #2         @ r8 = &_frag_buf_offs[fragi]
 SUB r7, r7, r9                 @ _fragi_bot -= _nhfrags;
oslffri_neon_lp1:               @ per-row loop
 MOV r10,r4                     @ r10= fragi = _fragi0
 ADD r11,r4, r9                 @ r11= fragi_end-1=fragi+_nhfrags-1
oslffri_neon_lp2:               @ per-fragment loop
 LDR r14,[r3], #4               @ r14= _frags[fragi] _frags++
 LDR r0, [r13]                  @ r0 = _ref_frame_data (reload saved copy)
 LDR r12,[r8], #4               @ r12= _frag_buf_offs[fragi] _frag_buf_offs++
 TST r14,#OC_FRAG_CODED_FLAG
 BEQ oslffri_neon_uncoded
 CMP r10,r4                     @ if (fragi>_fragi0)
 ADD r0, r0, r12                @ r0 = _ref_frame_data + _frag_buf_offs[fragi]
 BLGT loop_filter_h_neon        @ filter left edge
 CMP r4, r6                     @ if (_fragi0>_fragi_top)
 BLGT loop_filter_v_neon        @ filter top edge
 CMP r10,r11                    @ if(fragi+1<fragi_end)===(fragi<fragi_end-1)
 LDRLT r12,[r3]                 @ r12 = _frags[fragi+1]
 ADD r0, r0, #8                 @ advance to the right edge
 ADD r10,r10,#1                 @ r10 = fragi+1;
 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
 CMPLT r12,#OC_FRAG_CODED_FLAG  @ && _frags[fragi+1].coded==0
 BLLT loop_filter_h_neon        @ filter right edge vs. uncoded neighbor
 CMP r10,r7                     @ if (fragi<_fragi_bot)
 LDRLT r12,[r3, r9, LSL #2]     @ r12 = _frags[fragi+1+_nhfrags-1]
 SUB r0, r0, #8                 @ back to the left edge...
 ADD r0, r0, r1, LSL #3         @ ...and down 8 rows (bottom edge)
 ANDLT r12,r12,#OC_FRAG_CODED_FLAG
 CMPLT r12,#OC_FRAG_CODED_FLAG  @ && fragment below is uncoded
 BLLT loop_filter_v_neon        @ filter bottom edge vs. uncoded neighbor
 CMP r10,r11                    @ while(fragi<=fragi_end-1)
 BLE oslffri_neon_lp2
 MOV r4, r10                    @ r4 = _fragi0 += _nhfrags
 CMP r4, r5
 BLT oslffri_neon_lp1
oslffri_neon_end:
 LDMFD r13!,{r0,r4-r11,PC}
oslffri_neon_uncoded:           @ skip an uncoded fragment; same loop tail
 ADD r10,r10,#1
 CMP r10,r11
 BLE oslffri_neon_lp2
 MOV r4, r10                    @ r4 = _fragi0 += _nhfrags
 CMP r4, r5
 BLT oslffri_neon_lp1
 LDMFD r13!,{r0,r4-r11,PC}
@ @ .size oc_loop_filter_frag_rows_neon, .-oc_loop_filter_frag_rows_neon @ ENDP
.endif
@ END
@ .section .note.GNU-stack,"",%progbits
#endif