  1. #ifndef __LP64__ // ESENTHEL CHANGED
  2. /*
  3. * ARMv7 NEON optimizations for libjpeg-turbo
  4. *
  5. * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
  6. * All Rights Reserved.
  7. * Author: Siarhei Siamashka <[email protected]>
  8. * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
  9. * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
  10. * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
  11. * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
  12. *
  13. * This software is provided 'as-is', without any express or implied
  14. * warranty. In no event will the authors be held liable for any damages
  15. * arising from the use of this software.
  16. *
  17. * Permission is granted to anyone to use this software for any purpose,
  18. * including commercial applications, and to alter it and redistribute it
  19. * freely, subject to the following restrictions:
  20. *
  21. * 1. The origin of this software must not be misrepresented; you must not
  22. * claim that you wrote the original software. If you use this software
  23. * in a product, an acknowledgment in the product documentation would be
  24. * appreciated but is not required.
  25. * 2. Altered source versions must be plainly marked as such, and must not be
  26. * misrepresented as being the original software.
  27. * 3. This notice may not be removed or altered from any source distribution.
  28. */
  29. #if defined(__linux__) && defined(__ELF__)
  30. .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
  31. #endif
  32. .text
  33. #ifndef __APPLE__ // ESENTHEL CHANGED
  34. .fpu neon
  35. .arch armv7a
  36. .object_arch armv4
  37. .arm
  38. .syntax unified
  39. #endif // ESENTHEL CHANGED
  40. #define RESPECT_STRICT_ALIGNMENT 1
  41. /*****************************************************************************/
  42. /* Supplementary macro for setting function attributes */
  43. .macro asm_function fname
  44. #ifdef __APPLE__
  45. .globl _\fname
  46. _\fname:
  47. #else
  48. .global \fname
  49. #ifdef __ELF__
  50. .hidden \fname
  51. .type \fname, %function
  52. #endif
  53. \fname:
  54. #endif
  55. .endm
  56. /* Transpose a block of 4x4 coefficients in four 64-bit registers */
  57. .macro transpose_4x4 x0, x1, x2, x3
  58. vtrn.16 \x0, \x1
  59. vtrn.16 \x2, \x3
  60. vtrn.32 \x0, \x2
  61. vtrn.32 \x1, \x3
  62. .endm
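/*
 * The four vtrn steps above amount to an in-place transpose of a 4x4 block of
 * 16-bit values held in the four d registers. A minimal scalar sketch of the
 * same operation (hypothetical reference helper, not part of libjpeg-turbo):
 *
 *   #include <stdint.h>
 *
 *   static void transpose_4x4_ref(int16_t m[4][4])
 *   {
 *     for (int i = 0; i < 4; i++)
 *       for (int j = i + 1; j < 4; j++) {
 *         int16_t t = m[i][j];
 *         m[i][j] = m[j][i];
 *         m[j][i] = t;
 *       }
 *   }
 */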
  63. #define CENTERJSAMPLE 128
  64. /*****************************************************************************/
  65. /*
  66. * Perform dequantization and inverse DCT on one block of coefficients.
  67. *
  68. * GLOBAL(void)
  69. * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
  70. * JSAMPARRAY output_buf, JDIMENSION output_col)
  71. */
  72. #define FIX_0_298631336 (2446)
  73. #define FIX_0_390180644 (3196)
  74. #define FIX_0_541196100 (4433)
  75. #define FIX_0_765366865 (6270)
  76. #define FIX_0_899976223 (7373)
  77. #define FIX_1_175875602 (9633)
  78. #define FIX_1_501321110 (12299)
  79. #define FIX_1_847759065 (15137)
  80. #define FIX_1_961570560 (16069)
  81. #define FIX_2_053119869 (16819)
  82. #define FIX_2_562915447 (20995)
  83. #define FIX_3_072711026 (25172)
  84. #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
  85. #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
  86. #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
  87. #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
  88. #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
  89. #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
  90. #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
  91. #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
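/*
 * The FIX_* values above are the usual libjpeg 13-bit fixed-point constants
 * (CONST_BITS = 13, as in jidctint.c): each is the corresponding real constant
 * scaled by 2^13 and rounded. A minimal sketch of the derivation:
 *
 *   #include <stdint.h>
 *
 *   #define CONST_BITS  13
 *   #define FIX(x)      ((int32_t)((x) * (1 << CONST_BITS) + 0.5))
 *   // FIX(0.298631336) == 2446, FIX(1.175875602) == 9633, FIX(3.072711026) == 25172
 */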
  92. /*
  93. * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
  94. * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
  95. */
  96. #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) \
  97. { \
  98. DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
  99. JLONG q1, q2, q3, q4, q5, q6, q7; \
  100. JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
  101. \
  102. /* 1-D iDCT input data */ \
  103. row0 = xrow0; \
  104. row1 = xrow1; \
  105. row2 = xrow2; \
  106. row3 = xrow3; \
  107. row4 = xrow4; \
  108. row5 = xrow5; \
  109. row6 = xrow6; \
  110. row7 = xrow7; \
  111. \
  112. q5 = row7 + row3; \
  113. q4 = row5 + row1; \
  114. q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
  115. MULTIPLY(q4, FIX_1_175875602); \
  116. q7 = MULTIPLY(q5, FIX_1_175875602) + \
  117. MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
  118. q2 = MULTIPLY(row2, FIX_0_541196100) + \
  119. MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
  120. q4 = q6; \
  121. q3 = ((JLONG) row0 - (JLONG) row4) << 13; \
  122. q6 += MULTIPLY(row5, -FIX_2_562915447) + \
  123. MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
  124. /* now we can use q1 (reloadable constants have been used up) */ \
  125. q1 = q3 + q2; \
  126. q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
  127. MULTIPLY(row1, -FIX_0_899976223); \
  128. q5 = q7; \
  129. q1 = q1 + q6; \
  130. q7 += MULTIPLY(row7, -FIX_0_899976223) + \
  131. MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
  132. \
  133. /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
  134. tmp11_plus_tmp2 = q1; \
  135. row1 = 0; \
  136. \
  137. q1 = q1 - q6; \
  138. q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
  139. MULTIPLY(row3, -FIX_2_562915447); \
  140. q1 = q1 - q6; \
  141. q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
  142. MULTIPLY(row6, FIX_0_541196100); \
  143. q3 = q3 - q2; \
  144. \
  145. /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
  146. tmp11_minus_tmp2 = q1; \
  147. \
  148. q1 = ((JLONG) row0 + (JLONG) row4) << 13; \
  149. q2 = q1 + q6; \
  150. q1 = q1 - q6; \
  151. \
  152. /* pick up the results */ \
  153. tmp0 = q4; \
  154. tmp1 = q5; \
  155. tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
  156. tmp3 = q7; \
  157. tmp10 = q2; \
  158. tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
  159. tmp12 = q3; \
  160. tmp13 = q1; \
  161. }
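/*
 * For reference, the tmp* values produced by REF_1D_IDCT are combined into the
 * eight outputs and descaled the same way as in jidctint.c. A sketch, assuming
 * CONST_BITS = 13, PASS1_BITS = 2 and the usual rounding DESCALE() macro:
 *
 *   #define PASS1_BITS     2
 *   #define DESCALE(x, n)  (((x) + ((JLONG)1 << ((n) - 1))) >> (n))
 *
 *   out0 = DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);  // >> 11 in pass 1
 *   out7 = DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
 *   out1 = DESCALE(tmp11 + tmp2, CONST_BITS - PASS1_BITS);
 *   out6 = DESCALE(tmp11 - tmp2, CONST_BITS - PASS1_BITS);
 *   out2 = DESCALE(tmp12 + tmp1, CONST_BITS - PASS1_BITS);
 *   out5 = DESCALE(tmp12 - tmp1, CONST_BITS - PASS1_BITS);
 *   out3 = DESCALE(tmp13 + tmp0, CONST_BITS - PASS1_BITS);
 *   out4 = DESCALE(tmp13 - tmp0, CONST_BITS - PASS1_BITS);
 *
 * This is what the vrshrn.s32 #11 instructions implement in pass 1 below; pass 2
 * descales by CONST_BITS + PASS1_BITS + 3 = 18 instead, split into a plain #16
 * narrowing shift plus a rounding #2 shift in the epilogue.
 */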
  162. #define XFIX_0_899976223 d0[0]
  163. #define XFIX_0_541196100 d0[1]
  164. #define XFIX_2_562915447 d0[2]
  165. #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
  166. #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
  167. #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
  168. #define XFIX_0_541196100_PLUS_0_765366865 d1[2]
  169. #define XFIX_1_175875602 d1[3]
  170. #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
  171. #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
  172. #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
  173. #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
  174. .balign 16
  175. jsimd_idct_islow_neon_consts:
  176. .short FIX_0_899976223 /* d0[0] */
  177. .short FIX_0_541196100 /* d0[1] */
  178. .short FIX_2_562915447 /* d0[2] */
  179. .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
  180. .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
  181. .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
  182. .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
  183. .short FIX_1_175875602 /* d1[3] */
  184. /* reloadable constants */
  185. .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
  186. .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
  187. .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
  188. .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
  189. asm_function jsimd_idct_islow_neon
  190. DCT_TABLE .req r0
  191. COEF_BLOCK .req r1
  192. OUTPUT_BUF .req r2
  193. OUTPUT_COL .req r3
  194. TMP1 .req r0
  195. TMP2 .req r1
  196. TMP3 .req r2
  197. TMP4 .req ip
  198. ROW0L .req d16
  199. ROW0R .req d17
  200. ROW1L .req d18
  201. ROW1R .req d19
  202. ROW2L .req d20
  203. ROW2R .req d21
  204. ROW3L .req d22
  205. ROW3R .req d23
  206. ROW4L .req d24
  207. ROW4R .req d25
  208. ROW5L .req d26
  209. ROW5R .req d27
  210. ROW6L .req d28
  211. ROW6R .req d29
  212. ROW7L .req d30
  213. ROW7R .req d31
  214. /* Load and dequantize coefficients into NEON registers
  215. * with the following allocation:
  216. * 0 1 2 3 | 4 5 6 7
  217. * ---------+--------
  218. * 0 | d16 | d17 ( q8 )
  219. * 1 | d18 | d19 ( q9 )
  220. * 2 | d20 | d21 ( q10 )
  221. * 3 | d22 | d23 ( q11 )
  222. * 4 | d24 | d25 ( q12 )
  223. * 5 | d26 | d27 ( q13 )
  224. * 6 | d28 | d29 ( q14 )
  225. * 7 | d30 | d31 ( q15 )
  226. */
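/*
 * The interleaved vld1/vmul sequence below performs the per-coefficient
 * dequantization step. A rough scalar equivalent (hypothetical helper, with
 * jpeglib's 16-bit coefficient and multiplier types approximated by int16_t):
 *
 *   #include <stdint.h>
 *
 *   static void dequantize_block(int16_t block[64], const int16_t coef[64],
 *                                const int16_t quant[64])
 *   {
 *     for (int i = 0; i < 64; i++)
 *       block[i] = (int16_t)(coef[i] * quant[i]);  // vmul.s16 keeps the low 16 bits
 *     // block[] then feeds the two 1-D IDCT passes
 *   }
 */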
  227. adr ip, jsimd_idct_islow_neon_consts
  228. vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
  229. vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
  230. vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
  231. vmul.s16 q8, q8, q0
  232. vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
  233. vmul.s16 q9, q9, q1
  234. vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
  235. vmul.s16 q10, q10, q2
  236. vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
  237. vmul.s16 q11, q11, q3
  238. vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
  239. vmul.s16 q12, q12, q0
  240. vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
  241. vmul.s16 q14, q14, q2
  242. vmul.s16 q13, q13, q1
  243. vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
  244. add ip, ip, #16
  245. vmul.s16 q15, q15, q3
  246. vpush {d8-d15} /* save NEON registers */
  247. /* 1-D IDCT, pass 1, left 4x8 half */
  248. vadd.s16 d4, ROW7L, ROW3L
  249. vadd.s16 d5, ROW5L, ROW1L
  250. vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
  251. vmlal.s16 q6, d5, XFIX_1_175875602
  252. vmull.s16 q7, d4, XFIX_1_175875602
  253. /* Check for the zero coefficients in the right 4x8 half */
  254. push {r4, r5}
  255. vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
  256. vsubl.s16 q3, ROW0L, ROW4L
  257. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
  258. vmull.s16 q2, ROW2L, XFIX_0_541196100
  259. vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
  260. orr r0, r4, r5
  261. vmov q4, q6
  262. vmlsl.s16 q6, ROW5L, XFIX_2_562915447
  263. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
  264. vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
  265. vshl.s32 q3, q3, #13
  266. orr r0, r0, r4
  267. vmlsl.s16 q4, ROW1L, XFIX_0_899976223
  268. orr r0, r0, r5
  269. vadd.s32 q1, q3, q2
  270. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
  271. vmov q5, q7
  272. vadd.s32 q1, q1, q6
  273. orr r0, r0, r4
  274. vmlsl.s16 q7, ROW7L, XFIX_0_899976223
  275. orr r0, r0, r5
  276. vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
  277. vrshrn.s32 ROW1L, q1, #11
  278. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
  279. vsub.s32 q1, q1, q6
  280. vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
  281. orr r0, r0, r4
  282. vmlsl.s16 q5, ROW3L, XFIX_2_562915447
  283. orr r0, r0, r5
  284. vsub.s32 q1, q1, q6
  285. vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
  286. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
  287. vmlal.s16 q6, ROW6L, XFIX_0_541196100
  288. vsub.s32 q3, q3, q2
  289. orr r0, r0, r4
  290. vrshrn.s32 ROW6L, q1, #11
  291. orr r0, r0, r5
  292. vadd.s32 q1, q3, q5
  293. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
  294. vsub.s32 q3, q3, q5
  295. vaddl.s16 q5, ROW0L, ROW4L
  296. orr r0, r0, r4
  297. vrshrn.s32 ROW2L, q1, #11
  298. orr r0, r0, r5
  299. vrshrn.s32 ROW5L, q3, #11
  300. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
  301. vshl.s32 q5, q5, #13
  302. vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
  303. orr r0, r0, r4
  304. vadd.s32 q2, q5, q6
  305. orrs r0, r0, r5
  306. vsub.s32 q1, q5, q6
  307. vadd.s32 q6, q2, q7
  308. ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
  309. vsub.s32 q2, q2, q7
  310. vadd.s32 q5, q1, q4
  311. orr r0, r4, r5
  312. vsub.s32 q3, q1, q4
  313. pop {r4, r5}
  314. vrshrn.s32 ROW7L, q2, #11
  315. vrshrn.s32 ROW3L, q5, #11
  316. vrshrn.s32 ROW0L, q6, #11
  317. vrshrn.s32 ROW4L, q3, #11
  318. beq 3f /* Go to do some special handling for the sparse
  319. right 4x8 half */
  320. /* 1-D IDCT, pass 1, right 4x8 half */
  321. vld1.s16 {d2}, [ip, :64] /* reload constants */
  322. vadd.s16 d10, ROW7R, ROW3R
  323. vadd.s16 d8, ROW5R, ROW1R
  324. /* Transpose left 4x8 half */
  325. vtrn.16 ROW6L, ROW7L
  326. vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
  327. vmlal.s16 q6, d8, XFIX_1_175875602
  328. vtrn.16 ROW2L, ROW3L
  329. vmull.s16 q7, d10, XFIX_1_175875602
  330. vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
  331. vtrn.16 ROW0L, ROW1L
  332. vsubl.s16 q3, ROW0R, ROW4R
  333. vmull.s16 q2, ROW2R, XFIX_0_541196100
  334. vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
  335. vtrn.16 ROW4L, ROW5L
  336. vmov q4, q6
  337. vmlsl.s16 q6, ROW5R, XFIX_2_562915447
  338. vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
  339. vtrn.32 ROW1L, ROW3L
  340. vshl.s32 q3, q3, #13
  341. vmlsl.s16 q4, ROW1R, XFIX_0_899976223
  342. vtrn.32 ROW4L, ROW6L
  343. vadd.s32 q1, q3, q2
  344. vmov q5, q7
  345. vadd.s32 q1, q1, q6
  346. vtrn.32 ROW0L, ROW2L
  347. vmlsl.s16 q7, ROW7R, XFIX_0_899976223
  348. vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
  349. vrshrn.s32 ROW1R, q1, #11
  350. vtrn.32 ROW5L, ROW7L
  351. vsub.s32 q1, q1, q6
  352. vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
  353. vmlsl.s16 q5, ROW3R, XFIX_2_562915447
  354. vsub.s32 q1, q1, q6
  355. vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
  356. vmlal.s16 q6, ROW6R, XFIX_0_541196100
  357. vsub.s32 q3, q3, q2
  358. vrshrn.s32 ROW6R, q1, #11
  359. vadd.s32 q1, q3, q5
  360. vsub.s32 q3, q3, q5
  361. vaddl.s16 q5, ROW0R, ROW4R
  362. vrshrn.s32 ROW2R, q1, #11
  363. vrshrn.s32 ROW5R, q3, #11
  364. vshl.s32 q5, q5, #13
  365. vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
  366. vadd.s32 q2, q5, q6
  367. vsub.s32 q1, q5, q6
  368. vadd.s32 q6, q2, q7
  369. vsub.s32 q2, q2, q7
  370. vadd.s32 q5, q1, q4
  371. vsub.s32 q3, q1, q4
  372. vrshrn.s32 ROW7R, q2, #11
  373. vrshrn.s32 ROW3R, q5, #11
  374. vrshrn.s32 ROW0R, q6, #11
  375. vrshrn.s32 ROW4R, q3, #11
  376. /* Transpose right 4x8 half */
  377. vtrn.16 ROW6R, ROW7R
  378. vtrn.16 ROW2R, ROW3R
  379. vtrn.16 ROW0R, ROW1R
  380. vtrn.16 ROW4R, ROW5R
  381. vtrn.32 ROW1R, ROW3R
  382. vtrn.32 ROW4R, ROW6R
  383. vtrn.32 ROW0R, ROW2R
  384. vtrn.32 ROW5R, ROW7R
  385. 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
  386. vld1.s16 {d2}, [ip, :64] /* reload constants */
  387. vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
  388. vmlal.s16 q6, ROW1L, XFIX_1_175875602
  389. vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
  390. vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
  391. vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
  392. vmlal.s16 q7, ROW3L, XFIX_1_175875602
  393. vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
  394. vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
  395. vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
  396. vmull.s16 q2, ROW2L, XFIX_0_541196100
  397. vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
  398. vmov q4, q6
  399. vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
  400. vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
  401. vshl.s32 q3, q3, #13
  402. vmlsl.s16 q4, ROW1L, XFIX_0_899976223
  403. vadd.s32 q1, q3, q2
  404. vmov q5, q7
  405. vadd.s32 q1, q1, q6
  406. vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
  407. vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
  408. vshrn.s32 ROW1L, q1, #16
  409. vsub.s32 q1, q1, q6
  410. vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
  411. vmlsl.s16 q5, ROW3L, XFIX_2_562915447
  412. vsub.s32 q1, q1, q6
  413. vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
  414. vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
  415. vsub.s32 q3, q3, q2
  416. vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
  417. vadd.s32 q1, q3, q5
  418. vsub.s32 q3, q3, q5
  419. vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
  420. vshrn.s32 ROW2L, q1, #16
  421. vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
  422. vshl.s32 q5, q5, #13
  423. vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
  424. vadd.s32 q2, q5, q6
  425. vsub.s32 q1, q5, q6
  426. vadd.s32 q6, q2, q7
  427. vsub.s32 q2, q2, q7
  428. vadd.s32 q5, q1, q4
  429. vsub.s32 q3, q1, q4
  430. vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
  431. vshrn.s32 ROW3L, q5, #16
  432. vshrn.s32 ROW0L, q6, #16
  433. vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
  434. /* 1-D IDCT, pass 2, right 4x8 half */
  435. vld1.s16 {d2}, [ip, :64] /* reload constants */
  436. vmull.s16 q6, ROW5R, XFIX_1_175875602
  437. vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
  438. vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
  439. vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
  440. vmull.s16 q7, ROW7R, XFIX_1_175875602
  441. vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
  442. vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
  443. vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
  444. vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
  445. vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
  446. vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
  447. vmov q4, q6
  448. vmlsl.s16 q6, ROW5R, XFIX_2_562915447
  449. vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
  450. vshl.s32 q3, q3, #13
  451. vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
  452. vadd.s32 q1, q3, q2
  453. vmov q5, q7
  454. vadd.s32 q1, q1, q6
  455. vmlsl.s16 q7, ROW7R, XFIX_0_899976223
  456. vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
  457. vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
  458. vsub.s32 q1, q1, q6
  459. vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
  460. vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
  461. vsub.s32 q1, q1, q6
  462. vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
  463. vmlal.s16 q6, ROW6R, XFIX_0_541196100
  464. vsub.s32 q3, q3, q2
  465. vshrn.s32 ROW6R, q1, #16
  466. vadd.s32 q1, q3, q5
  467. vsub.s32 q3, q3, q5
  468. vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
  469. vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
  470. vshrn.s32 ROW5R, q3, #16
  471. vshl.s32 q5, q5, #13
  472. vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
  473. vadd.s32 q2, q5, q6
  474. vsub.s32 q1, q5, q6
  475. vadd.s32 q6, q2, q7
  476. vsub.s32 q2, q2, q7
  477. vadd.s32 q5, q1, q4
  478. vsub.s32 q3, q1, q4
  479. vshrn.s32 ROW7R, q2, #16
  480. vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
  481. vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
  482. vshrn.s32 ROW4R, q3, #16
  483. 2: /* Descale to 8-bit and range limit */
  484. vqrshrn.s16 d16, q8, #2
  485. vqrshrn.s16 d17, q9, #2
  486. vqrshrn.s16 d18, q10, #2
  487. vqrshrn.s16 d19, q11, #2
  488. vpop {d8-d15} /* restore NEON registers */
  489. vqrshrn.s16 d20, q12, #2
  490. /* Transpose the final 8-bit samples and do signed->unsigned conversion */
  491. vtrn.16 q8, q9
  492. vqrshrn.s16 d21, q13, #2
  493. vqrshrn.s16 d22, q14, #2
  494. vmov.u8 q0, #(CENTERJSAMPLE)
  495. vqrshrn.s16 d23, q15, #2
  496. vtrn.8 d16, d17
  497. vtrn.8 d18, d19
  498. vadd.u8 q8, q8, q0
  499. vadd.u8 q9, q9, q0
  500. vtrn.16 q10, q11
  501. /* Store results to the output buffer */
  502. ldmia OUTPUT_BUF!, {TMP1, TMP2}
  503. add TMP1, TMP1, OUTPUT_COL
  504. add TMP2, TMP2, OUTPUT_COL
  505. vst1.8 {d16}, [TMP1]
  506. vtrn.8 d20, d21
  507. vst1.8 {d17}, [TMP2]
  508. ldmia OUTPUT_BUF!, {TMP1, TMP2}
  509. add TMP1, TMP1, OUTPUT_COL
  510. add TMP2, TMP2, OUTPUT_COL
  511. vst1.8 {d18}, [TMP1]
  512. vadd.u8 q10, q10, q0
  513. vst1.8 {d19}, [TMP2]
  514. ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
  515. add TMP1, TMP1, OUTPUT_COL
  516. add TMP2, TMP2, OUTPUT_COL
  517. add TMP3, TMP3, OUTPUT_COL
  518. add TMP4, TMP4, OUTPUT_COL
  519. vtrn.8 d22, d23
  520. vst1.8 {d20}, [TMP1]
  521. vadd.u8 q11, q11, q0
  522. vst1.8 {d21}, [TMP2]
  523. vst1.8 {d22}, [TMP3]
  524. vst1.8 {d23}, [TMP4]
  525. bx lr
  526. 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
  527. /* Transpose left 4x8 half */
  528. vtrn.16 ROW6L, ROW7L
  529. vtrn.16 ROW2L, ROW3L
  530. vtrn.16 ROW0L, ROW1L
  531. vtrn.16 ROW4L, ROW5L
  532. vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
  533. vtrn.32 ROW1L, ROW3L
  534. vtrn.32 ROW4L, ROW6L
  535. vtrn.32 ROW0L, ROW2L
  536. vtrn.32 ROW5L, ROW7L
  537. cmp r0, #0
  538. beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
  539. pass */
  540. /* Only row 0 is non-zero for the right 4x8 half */
  541. vdup.s16 ROW1R, ROW0R[1]
  542. vdup.s16 ROW2R, ROW0R[2]
  543. vdup.s16 ROW3R, ROW0R[3]
  544. vdup.s16 ROW4R, ROW0R[0]
  545. vdup.s16 ROW5R, ROW0R[1]
  546. vdup.s16 ROW6R, ROW0R[2]
  547. vdup.s16 ROW7R, ROW0R[3]
  548. vdup.s16 ROW0R, ROW0R[0]
  549. b 1b /* Go to 'normal' second pass */
  550. 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
  551. vld1.s16 {d2}, [ip, :64] /* reload constants */
  552. vmull.s16 q6, ROW1L, XFIX_1_175875602
  553. vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
  554. vmull.s16 q7, ROW3L, XFIX_1_175875602
  555. vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
  556. vmull.s16 q2, ROW2L, XFIX_0_541196100
  557. vshll.s16 q3, ROW0L, #13
  558. vmov q4, q6
  559. vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
  560. vmlsl.s16 q4, ROW1L, XFIX_0_899976223
  561. vadd.s32 q1, q3, q2
  562. vmov q5, q7
  563. vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
  564. vadd.s32 q1, q1, q6
  565. vadd.s32 q6, q6, q6
  566. vmlsl.s16 q5, ROW3L, XFIX_2_562915447
  567. vshrn.s32 ROW1L, q1, #16
  568. vsub.s32 q1, q1, q6
  569. vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
  570. vsub.s32 q3, q3, q2
  571. vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
  572. vadd.s32 q1, q3, q5
  573. vsub.s32 q3, q3, q5
  574. vshll.s16 q5, ROW0L, #13
  575. vshrn.s32 ROW2L, q1, #16
  576. vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
  577. vadd.s32 q2, q5, q6
  578. vsub.s32 q1, q5, q6
  579. vadd.s32 q6, q2, q7
  580. vsub.s32 q2, q2, q7
  581. vadd.s32 q5, q1, q4
  582. vsub.s32 q3, q1, q4
  583. vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
  584. vshrn.s32 ROW3L, q5, #16
  585. vshrn.s32 ROW0L, q6, #16
  586. vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
  587. /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
  588. vld1.s16 {d2}, [ip, :64] /* reload constants */
  589. vmull.s16 q6, ROW5L, XFIX_1_175875602
  590. vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
  591. vmull.s16 q7, ROW7L, XFIX_1_175875602
  592. vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
  593. vmull.s16 q2, ROW6L, XFIX_0_541196100
  594. vshll.s16 q3, ROW4L, #13
  595. vmov q4, q6
  596. vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
  597. vmlsl.s16 q4, ROW5L, XFIX_0_899976223
  598. vadd.s32 q1, q3, q2
  599. vmov q5, q7
  600. vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
  601. vadd.s32 q1, q1, q6
  602. vadd.s32 q6, q6, q6
  603. vmlsl.s16 q5, ROW7L, XFIX_2_562915447
  604. vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
  605. vsub.s32 q1, q1, q6
  606. vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
  607. vsub.s32 q3, q3, q2
  608. vshrn.s32 ROW6R, q1, #16
  609. vadd.s32 q1, q3, q5
  610. vsub.s32 q3, q3, q5
  611. vshll.s16 q5, ROW4L, #13
  612. vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
  613. vshrn.s32 ROW5R, q3, #16
  614. vadd.s32 q2, q5, q6
  615. vsub.s32 q1, q5, q6
  616. vadd.s32 q6, q2, q7
  617. vsub.s32 q2, q2, q7
  618. vadd.s32 q5, q1, q4
  619. vsub.s32 q3, q1, q4
  620. vshrn.s32 ROW7R, q2, #16
  621. vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
  622. vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
  623. vshrn.s32 ROW4R, q3, #16
  624. b 2b /* Go to epilogue */
  625. .unreq DCT_TABLE
  626. .unreq COEF_BLOCK
  627. .unreq OUTPUT_BUF
  628. .unreq OUTPUT_COL
  629. .unreq TMP1
  630. .unreq TMP2
  631. .unreq TMP3
  632. .unreq TMP4
  633. .unreq ROW0L
  634. .unreq ROW0R
  635. .unreq ROW1L
  636. .unreq ROW1R
  637. .unreq ROW2L
  638. .unreq ROW2R
  639. .unreq ROW3L
  640. .unreq ROW3R
  641. .unreq ROW4L
  642. .unreq ROW4R
  643. .unreq ROW5L
  644. .unreq ROW5R
  645. .unreq ROW6L
  646. .unreq ROW6R
  647. .unreq ROW7L
  648. .unreq ROW7R
  649. /*****************************************************************************/
  650. /*
  651. * jsimd_idct_ifast_neon
  652. *
  653. * This function contains a fast, but less accurate, integer implementation of
  654. * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
  655. * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
  656. * function from jidctfst.c.
  657. *
  658. * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions.
  659. * In the ARM NEON case, however, some extra additions are required because the
  660. * VQDMULH instruction can't handle constants larger than 1. Expressions
  661. * like "x * 1.082392200" therefore have to be converted to "x * 0.082392200 + x",
  662. * which introduces an extra addition. Overall, there are 6 extra additions
  663. * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
  664. */
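/*
 * A minimal scalar sketch of the trick described above (hypothetical helper).
 * VQDMULH returns roughly (a * b * 2) >> 16 (with saturation, omitted here),
 * so only fractional multipliers fit in a Q15 constant; the integer part is
 * added back explicitly. The multipliers are the same values used in the
 * constant table that follows:
 *
 *   #include <stdint.h>
 *
 *   static inline int16_t qdmulh_s16(int16_t a, int16_t b)
 *   {
 *     return (int16_t)(((int32_t)a * b * 2) >> 16);
 *   }
 *   // x * 1.414213562 ~= x     + qdmulh_s16(x, 362 * 128 - 256 * 128)
 *   // x * 1.082392200 ~= x     + qdmulh_s16(x, 277 * 128 - 256 * 128)
 *   // x * 2.613125930 ~= 2 * x + qdmulh_s16(x, 669 * 128 - 512 * 128)
 */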
  665. #define XFIX_1_082392200 d0[0]
  666. #define XFIX_1_414213562 d0[1]
  667. #define XFIX_1_847759065 d0[2]
  668. #define XFIX_2_613125930 d0[3]
  669. .balign 16
  670. jsimd_idct_ifast_neon_consts:
  671. .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
  672. .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
  673. .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
  674. .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
  675. asm_function jsimd_idct_ifast_neon
  676. DCT_TABLE .req r0
  677. COEF_BLOCK .req r1
  678. OUTPUT_BUF .req r2
  679. OUTPUT_COL .req r3
  680. TMP1 .req r0
  681. TMP2 .req r1
  682. TMP3 .req r2
  683. TMP4 .req ip
  684. /* Load and dequantize coefficients into NEON registers
  685. * with the following allocation:
  686. * 0 1 2 3 | 4 5 6 7
  687. * ---------+--------
  688. * 0 | d16 | d17 ( q8 )
  689. * 1 | d18 | d19 ( q9 )
  690. * 2 | d20 | d21 ( q10 )
  691. * 3 | d22 | d23 ( q11 )
  692. * 4 | d24 | d25 ( q12 )
  693. * 5 | d26 | d27 ( q13 )
  694. * 6 | d28 | d29 ( q14 )
  695. * 7 | d30 | d31 ( q15 )
  696. */
  697. adr ip, jsimd_idct_ifast_neon_consts
  698. vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
  699. vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
  700. vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
  701. vmul.s16 q8, q8, q0
  702. vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
  703. vmul.s16 q9, q9, q1
  704. vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
  705. vmul.s16 q10, q10, q2
  706. vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
  707. vmul.s16 q11, q11, q3
  708. vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
  709. vmul.s16 q12, q12, q0
  710. vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
  711. vmul.s16 q14, q14, q2
  712. vmul.s16 q13, q13, q1
  713. vld1.16 {d0}, [ip, :64] /* load constants */
  714. vmul.s16 q15, q15, q3
  715. vpush {d8-d13} /* save NEON registers */
  716. /* 1-D IDCT, pass 1 */
  717. vsub.s16 q2, q10, q14
  718. vadd.s16 q14, q10, q14
  719. vsub.s16 q1, q11, q13
  720. vadd.s16 q13, q11, q13
  721. vsub.s16 q5, q9, q15
  722. vadd.s16 q15, q9, q15
  723. vqdmulh.s16 q4, q2, XFIX_1_414213562
  724. vqdmulh.s16 q6, q1, XFIX_2_613125930
  725. vadd.s16 q3, q1, q1
  726. vsub.s16 q1, q5, q1
  727. vadd.s16 q10, q2, q4
  728. vqdmulh.s16 q4, q1, XFIX_1_847759065
  729. vsub.s16 q2, q15, q13
  730. vadd.s16 q3, q3, q6
  731. vqdmulh.s16 q6, q2, XFIX_1_414213562
  732. vadd.s16 q1, q1, q4
  733. vqdmulh.s16 q4, q5, XFIX_1_082392200
  734. vsub.s16 q10, q10, q14
  735. vadd.s16 q2, q2, q6
  736. vsub.s16 q6, q8, q12
  737. vadd.s16 q12, q8, q12
  738. vadd.s16 q9, q5, q4
  739. vadd.s16 q5, q6, q10
  740. vsub.s16 q10, q6, q10
  741. vadd.s16 q6, q15, q13
  742. vadd.s16 q8, q12, q14
  743. vsub.s16 q3, q6, q3
  744. vsub.s16 q12, q12, q14
  745. vsub.s16 q3, q3, q1
  746. vsub.s16 q1, q9, q1
  747. vadd.s16 q2, q3, q2
  748. vsub.s16 q15, q8, q6
  749. vadd.s16 q1, q1, q2
  750. vadd.s16 q8, q8, q6
  751. vadd.s16 q14, q5, q3
  752. vsub.s16 q9, q5, q3
  753. vsub.s16 q13, q10, q2
  754. vadd.s16 q10, q10, q2
  755. /* Transpose */
  756. vtrn.16 q8, q9
  757. vsub.s16 q11, q12, q1
  758. vtrn.16 q14, q15
  759. vadd.s16 q12, q12, q1
  760. vtrn.16 q10, q11
  761. vtrn.16 q12, q13
  762. vtrn.32 q9, q11
  763. vtrn.32 q12, q14
  764. vtrn.32 q8, q10
  765. vtrn.32 q13, q15
  766. vswp d28, d21
  767. vswp d26, d19
  768. /* 1-D IDCT, pass 2 */
  769. vsub.s16 q2, q10, q14
  770. vswp d30, d23
  771. vadd.s16 q14, q10, q14
  772. vswp d24, d17
  773. vsub.s16 q1, q11, q13
  774. vadd.s16 q13, q11, q13
  775. vsub.s16 q5, q9, q15
  776. vadd.s16 q15, q9, q15
  777. vqdmulh.s16 q4, q2, XFIX_1_414213562
  778. vqdmulh.s16 q6, q1, XFIX_2_613125930
  779. vadd.s16 q3, q1, q1
  780. vsub.s16 q1, q5, q1
  781. vadd.s16 q10, q2, q4
  782. vqdmulh.s16 q4, q1, XFIX_1_847759065
  783. vsub.s16 q2, q15, q13
  784. vadd.s16 q3, q3, q6
  785. vqdmulh.s16 q6, q2, XFIX_1_414213562
  786. vadd.s16 q1, q1, q4
  787. vqdmulh.s16 q4, q5, XFIX_1_082392200
  788. vsub.s16 q10, q10, q14
  789. vadd.s16 q2, q2, q6
  790. vsub.s16 q6, q8, q12
  791. vadd.s16 q12, q8, q12
  792. vadd.s16 q9, q5, q4
  793. vadd.s16 q5, q6, q10
  794. vsub.s16 q10, q6, q10
  795. vadd.s16 q6, q15, q13
  796. vadd.s16 q8, q12, q14
  797. vsub.s16 q3, q6, q3
  798. vsub.s16 q12, q12, q14
  799. vsub.s16 q3, q3, q1
  800. vsub.s16 q1, q9, q1
  801. vadd.s16 q2, q3, q2
  802. vsub.s16 q15, q8, q6
  803. vadd.s16 q1, q1, q2
  804. vadd.s16 q8, q8, q6
  805. vadd.s16 q14, q5, q3
  806. vsub.s16 q9, q5, q3
  807. vsub.s16 q13, q10, q2
  808. vpop {d8-d13} /* restore NEON registers */
  809. vadd.s16 q10, q10, q2
  810. vsub.s16 q11, q12, q1
  811. vadd.s16 q12, q12, q1
  812. /* Descale to 8-bit and range limit */
  813. vmov.u8 q0, #0x80
  814. vqshrn.s16 d16, q8, #5
  815. vqshrn.s16 d17, q9, #5
  816. vqshrn.s16 d18, q10, #5
  817. vqshrn.s16 d19, q11, #5
  818. vqshrn.s16 d20, q12, #5
  819. vqshrn.s16 d21, q13, #5
  820. vqshrn.s16 d22, q14, #5
  821. vqshrn.s16 d23, q15, #5
  822. vadd.u8 q8, q8, q0
  823. vadd.u8 q9, q9, q0
  824. vadd.u8 q10, q10, q0
  825. vadd.u8 q11, q11, q0
  826. /* Transpose the final 8-bit samples */
  827. vtrn.16 q8, q9
  828. vtrn.16 q10, q11
  829. vtrn.32 q8, q10
  830. vtrn.32 q9, q11
  831. vtrn.8 d16, d17
  832. vtrn.8 d18, d19
  833. /* Store results to the output buffer */
  834. ldmia OUTPUT_BUF!, {TMP1, TMP2}
  835. add TMP1, TMP1, OUTPUT_COL
  836. add TMP2, TMP2, OUTPUT_COL
  837. vst1.8 {d16}, [TMP1]
  838. vst1.8 {d17}, [TMP2]
  839. ldmia OUTPUT_BUF!, {TMP1, TMP2}
  840. add TMP1, TMP1, OUTPUT_COL
  841. add TMP2, TMP2, OUTPUT_COL
  842. vst1.8 {d18}, [TMP1]
  843. vtrn.8 d20, d21
  844. vst1.8 {d19}, [TMP2]
  845. ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
  846. add TMP1, TMP1, OUTPUT_COL
  847. add TMP2, TMP2, OUTPUT_COL
  848. add TMP3, TMP3, OUTPUT_COL
  849. add TMP4, TMP4, OUTPUT_COL
  850. vst1.8 {d20}, [TMP1]
  851. vtrn.8 d22, d23
  852. vst1.8 {d21}, [TMP2]
  853. vst1.8 {d22}, [TMP3]
  854. vst1.8 {d23}, [TMP4]
  855. bx lr
  856. .unreq DCT_TABLE
  857. .unreq COEF_BLOCK
  858. .unreq OUTPUT_BUF
  859. .unreq OUTPUT_COL
  860. .unreq TMP1
  861. .unreq TMP2
  862. .unreq TMP3
  863. .unreq TMP4
  864. /*****************************************************************************/
  865. /*
  866. * jsimd_idct_4x4_neon
  867. *
  868. * This function contains inverse-DCT code for producing reduced-size
  869. * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
  870. * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
  871. * function from jpeg-6b (jidctred.c).
  872. *
  873. * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
  874. * requires far fewer arithmetic operations and hence should be faster.
  875. * The primary purpose of this particular NEON-optimized function is
  876. * bit-exact compatibility with jpeg-6b.
  877. *
  878. * TODO: slightly better instruction scheduling could be achieved by expanding
  879. * the idct_helper/transpose_4x4 macros and reordering instructions,
  880. * but readability would suffer somewhat.
  881. */
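/*
 * As in jidctred.c, the DCT coefficients in row 4 are not loaded at all (note
 * the gap in the register allocation below), and each pass descales with one
 * extra bit compared to the full 8x8 transform. A sketch of the rounding
 * performed by the idct_helper invocations further down, assuming
 * PASS1_BITS = 2 and the CONST_BITS value defined next:
 *
 *   #define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
 *   // pass 1:  DESCALE(..., CONST_BITS - PASS1_BITS + 1)      ->  shift by 12
 *   // pass 2:  DESCALE(..., CONST_BITS + PASS1_BITS + 3 + 1)  ->  shift by 19
 */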
  882. #define CONST_BITS 13
  883. #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
  884. #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
  885. #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
  886. #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
  887. #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
  888. #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
  889. #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
  890. #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
  891. #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
  892. #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
  893. #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
  894. #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
  895. #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
  896. #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
  897. .balign 16
  898. jsimd_idct_4x4_neon_consts:
  899. .short FIX_1_847759065 /* d0[0] */
  900. .short -FIX_0_765366865 /* d0[1] */
  901. .short -FIX_0_211164243 /* d0[2] */
  902. .short FIX_1_451774981 /* d0[3] */
  903. .short -FIX_2_172734803 /* d1[0] */
  904. .short FIX_1_061594337 /* d1[1] */
  905. .short -FIX_0_509795579 /* d1[2] */
  906. .short -FIX_0_601344887 /* d1[3] */
  907. .short FIX_0_899976223 /* d2[0] */
  908. .short FIX_2_562915447 /* d2[1] */
  909. .short 1 << (CONST_BITS+1) /* d2[2] */
  910. .short 0 /* d2[3] */
  911. .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
  912. vmull.s16 q14, \x4, d2[2]
  913. vmlal.s16 q14, \x8, d0[0]
  914. vmlal.s16 q14, \x14, d0[1]
  915. vmull.s16 q13, \x16, d1[2]
  916. vmlal.s16 q13, \x12, d1[3]
  917. vmlal.s16 q13, \x10, d2[0]
  918. vmlal.s16 q13, \x6, d2[1]
  919. vmull.s16 q15, \x4, d2[2]
  920. vmlsl.s16 q15, \x8, d0[0]
  921. vmlsl.s16 q15, \x14, d0[1]
  922. vmull.s16 q12, \x16, d0[2]
  923. vmlal.s16 q12, \x12, d0[3]
  924. vmlal.s16 q12, \x10, d1[0]
  925. vmlal.s16 q12, \x6, d1[1]
  926. vadd.s32 q10, q14, q13
  927. vsub.s32 q14, q14, q13
  928. .if \shift > 16
  929. vrshr.s32 q10, q10, #\shift
  930. vrshr.s32 q14, q14, #\shift
  931. vmovn.s32 \y26, q10
  932. vmovn.s32 \y29, q14
  933. .else
  934. vrshrn.s32 \y26, q10, #\shift
  935. vrshrn.s32 \y29, q14, #\shift
  936. .endif
  937. vadd.s32 q10, q15, q12
  938. vsub.s32 q15, q15, q12
  939. .if \shift > 16
  940. vrshr.s32 q10, q10, #\shift
  941. vrshr.s32 q15, q15, #\shift
  942. vmovn.s32 \y27, q10
  943. vmovn.s32 \y28, q15
  944. .else
  945. vrshrn.s32 \y27, q10, #\shift
  946. vrshrn.s32 \y28, q15, #\shift
  947. .endif
  948. .endm
  949. asm_function jsimd_idct_4x4_neon
  950. DCT_TABLE .req r0
  951. COEF_BLOCK .req r1
  952. OUTPUT_BUF .req r2
  953. OUTPUT_COL .req r3
  954. TMP1 .req r0
  955. TMP2 .req r1
  956. TMP3 .req r2
  957. TMP4 .req ip
  958. vpush {d8-d15}
  959. /* Load constants (d3 is just used for padding) */
  960. adr TMP4, jsimd_idct_4x4_neon_consts
  961. vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
  962. /* Load all COEF_BLOCK into NEON registers with the following allocation:
  963. * 0 1 2 3 | 4 5 6 7
  964. * ---------+--------
  965. * 0 | d4 | d5
  966. * 1 | d6 | d7
  967. * 2 | d8 | d9
  968. * 3 | d10 | d11
  969. * 4 | - | -
  970. * 5 | d12 | d13
  971. * 6 | d14 | d15
  972. * 7 | d16 | d17
  973. */
  974. vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
  975. vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
  976. add COEF_BLOCK, COEF_BLOCK, #16
  977. vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
  978. vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
  979. /* dequantize */
  980. vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
  981. vmul.s16 q2, q2, q9
  982. vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
  983. vmul.s16 q3, q3, q10
  984. vmul.s16 q4, q4, q11
  985. add DCT_TABLE, DCT_TABLE, #16
  986. vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
  987. vmul.s16 q5, q5, q12
  988. vmul.s16 q6, q6, q13
  989. vld1.16 {d30, d31}, [DCT_TABLE, :128]!
  990. vmul.s16 q7, q7, q14
  991. vmul.s16 q8, q8, q15
  992. /* Pass 1 */
  993. idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
  994. transpose_4x4 d4, d6, d8, d10
  995. idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
  996. transpose_4x4 d5, d7, d9, d11
  997. /* Pass 2 */
  998. idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
  999. transpose_4x4 d26, d27, d28, d29
  1000. /* Range limit */
  1001. vmov.u16 q15, #0x80
  1002. vadd.s16 q13, q13, q15
  1003. vadd.s16 q14, q14, q15
  1004. vqmovun.s16 d26, q13
  1005. vqmovun.s16 d27, q14
  1006. /* Store results to the output buffer */
  1007. ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
  1008. add TMP1, TMP1, OUTPUT_COL
  1009. add TMP2, TMP2, OUTPUT_COL
  1010. add TMP3, TMP3, OUTPUT_COL
  1011. add TMP4, TMP4, OUTPUT_COL
  1012. #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
  1013. /* We can use far fewer instructions on little-endian systems if the
  1014. * OS kernel is not configured to trap unaligned memory accesses.
  1015. */
  1016. vst1.32 {d26[0]}, [TMP1]!
  1017. vst1.32 {d27[0]}, [TMP3]!
  1018. vst1.32 {d26[1]}, [TMP2]!
  1019. vst1.32 {d27[1]}, [TMP4]!
  1020. #else
  1021. vst1.8 {d26[0]}, [TMP1]!
  1022. vst1.8 {d27[0]}, [TMP3]!
  1023. vst1.8 {d26[1]}, [TMP1]!
  1024. vst1.8 {d27[1]}, [TMP3]!
  1025. vst1.8 {d26[2]}, [TMP1]!
  1026. vst1.8 {d27[2]}, [TMP3]!
  1027. vst1.8 {d26[3]}, [TMP1]!
  1028. vst1.8 {d27[3]}, [TMP3]!
  1029. vst1.8 {d26[4]}, [TMP2]!
  1030. vst1.8 {d27[4]}, [TMP4]!
  1031. vst1.8 {d26[5]}, [TMP2]!
  1032. vst1.8 {d27[5]}, [TMP4]!
  1033. vst1.8 {d26[6]}, [TMP2]!
  1034. vst1.8 {d27[6]}, [TMP4]!
  1035. vst1.8 {d26[7]}, [TMP2]!
  1036. vst1.8 {d27[7]}, [TMP4]!
  1037. #endif
  1038. vpop {d8-d15}
  1039. bx lr
  1040. .unreq DCT_TABLE
  1041. .unreq COEF_BLOCK
  1042. .unreq OUTPUT_BUF
  1043. .unreq OUTPUT_COL
  1044. .unreq TMP1
  1045. .unreq TMP2
  1046. .unreq TMP3
  1047. .unreq TMP4
  1048. .purgem idct_helper
  1049. /*****************************************************************************/
  1050. /*
  1051. * jsimd_idct_2x2_neon
  1052. *
  1053. * This function contains inverse-DCT code for producing reduced-size
  1054. * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
  1055. * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
  1056. * function from jpeg-6b (jidctred.c).
  1057. *
  1058. * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
  1059. * requires far fewer arithmetic operations and hence should be faster.
  1060. * The primary purpose of this particular NEON-optimized function is
  1061. * bit-exact compatibility with jpeg-6b.
  1062. */
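/*
 * As in jidctred.c, only coefficient rows/columns 0, 1, 3, 5 and 7 are used
 * (note the gaps in the register allocation below). A scalar sketch of one
 * 1-D pass as implemented by the idct_helper macro that follows, using the
 * FIX_* constants defined above and a rounding right shift; names here are
 * illustrative only:
 *
 *   int32_t even = (int32_t)row0 << (CONST_BITS + 2);
 *   int32_t odd  = row7 * -FIX_0_720959822 + row5 * FIX_0_850430095 +
 *                  row3 * -FIX_1_272758580 + row1 * FIX_3_624509785;
 *   out0 = (even + odd + (1 << (shift - 1))) >> shift;  // shift = 13 in pass 1, 20 in pass 2
 *   out1 = (even - odd + (1 << (shift - 1))) >> shift;
 */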
  1063. .balign 8
  1064. jsimd_idct_2x2_neon_consts:
  1065. .short -FIX_0_720959822 /* d0[0] */
  1066. .short FIX_0_850430095 /* d0[1] */
  1067. .short -FIX_1_272758580 /* d0[2] */
  1068. .short FIX_3_624509785 /* d0[3] */
  1069. .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
  1070. vshll.s16 q14, \x4, #15
  1071. vmull.s16 q13, \x6, d0[3]
  1072. vmlal.s16 q13, \x10, d0[2]
  1073. vmlal.s16 q13, \x12, d0[1]
  1074. vmlal.s16 q13, \x16, d0[0]
  1075. vadd.s32 q10, q14, q13
  1076. vsub.s32 q14, q14, q13
  1077. .if \shift > 16
  1078. vrshr.s32 q10, q10, #\shift
  1079. vrshr.s32 q14, q14, #\shift
  1080. vmovn.s32 \y26, q10
  1081. vmovn.s32 \y27, q14
  1082. .else
  1083. vrshrn.s32 \y26, q10, #\shift
  1084. vrshrn.s32 \y27, q14, #\shift
  1085. .endif
  1086. .endm
  1087. asm_function jsimd_idct_2x2_neon
  1088. DCT_TABLE .req r0
  1089. COEF_BLOCK .req r1
  1090. OUTPUT_BUF .req r2
  1091. OUTPUT_COL .req r3
  1092. TMP1 .req r0
  1093. TMP2 .req ip
  1094. vpush {d8-d15}
  1095. /* Load constants */
  1096. adr TMP2, jsimd_idct_2x2_neon_consts
  1097. vld1.16 {d0}, [TMP2, :64]
  1098. /* Load all COEF_BLOCK into NEON registers with the following allocation:
  1099. * 0 1 2 3 | 4 5 6 7
  1100. * ---------+--------
  1101. * 0 | d4 | d5
  1102. * 1 | d6 | d7
  1103. * 2 | - | -
  1104. * 3 | d10 | d11
  1105. * 4 | - | -
  1106. * 5 | d12 | d13
  1107. * 6 | - | -
  1108. * 7 | d16 | d17
  1109. */
  1110. vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
  1111. add COEF_BLOCK, COEF_BLOCK, #16
  1112. vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
  1113. add COEF_BLOCK, COEF_BLOCK, #16
  1114. vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
  1115. add COEF_BLOCK, COEF_BLOCK, #16
  1116. vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
  1117. /* Dequantize */
  1118. vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
  1119. vmul.s16 q2, q2, q9
  1120. vmul.s16 q3, q3, q10
  1121. add DCT_TABLE, DCT_TABLE, #16
  1122. vld1.16 {d24, d25}, [DCT_TABLE, :128]!
  1123. vmul.s16 q5, q5, q12
  1124. add DCT_TABLE, DCT_TABLE, #16
  1125. vld1.16 {d26, d27}, [DCT_TABLE, :128]!
  1126. vmul.s16 q6, q6, q13
  1127. add DCT_TABLE, DCT_TABLE, #16
  1128. vld1.16 {d30, d31}, [DCT_TABLE, :128]!
  1129. vmul.s16 q8, q8, q15
  1130. /* Pass 1 */
  1131. #if 0
  1132. idct_helper d4, d6, d10, d12, d16, 13, d4, d6
  1133. transpose_4x4 d4, d6, d8, d10
  1134. idct_helper d5, d7, d11, d13, d17, 13, d5, d7
  1135. transpose_4x4 d5, d7, d9, d11
  1136. #else
  1137. vmull.s16 q13, d6, d0[3]
  1138. vmlal.s16 q13, d10, d0[2]
  1139. vmlal.s16 q13, d12, d0[1]
  1140. vmlal.s16 q13, d16, d0[0]
  1141. vmull.s16 q12, d7, d0[3]
  1142. vmlal.s16 q12, d11, d0[2]
  1143. vmlal.s16 q12, d13, d0[1]
  1144. vmlal.s16 q12, d17, d0[0]
  1145. vshll.s16 q14, d4, #15
  1146. vshll.s16 q15, d5, #15
  1147. vadd.s32 q10, q14, q13
  1148. vsub.s32 q14, q14, q13
  1149. vrshrn.s32 d4, q10, #13
  1150. vrshrn.s32 d6, q14, #13
  1151. vadd.s32 q10, q15, q12
  1152. vsub.s32 q14, q15, q12
  1153. vrshrn.s32 d5, q10, #13
  1154. vrshrn.s32 d7, q14, #13
  1155. vtrn.16 q2, q3
  1156. vtrn.32 q3, q5
  1157. #endif
  1158. /* Pass 2 */
  1159. idct_helper d4, d6, d10, d7, d11, 20, d26, d27
  1160. /* Range limit */
  1161. vmov.u16 q15, #0x80
  1162. vadd.s16 q13, q13, q15
  1163. vqmovun.s16 d26, q13
  1164. vqmovun.s16 d27, q13
  1165. /* Store results to the output buffer */
  1166. ldmia OUTPUT_BUF, {TMP1, TMP2}
  1167. add TMP1, TMP1, OUTPUT_COL
  1168. add TMP2, TMP2, OUTPUT_COL
  1169. vst1.8 {d26[0]}, [TMP1]!
  1170. vst1.8 {d27[4]}, [TMP1]!
  1171. vst1.8 {d26[1]}, [TMP2]!
  1172. vst1.8 {d27[5]}, [TMP2]!
  1173. vpop {d8-d15}
  1174. bx lr
  1175. .unreq DCT_TABLE
  1176. .unreq COEF_BLOCK
  1177. .unreq OUTPUT_BUF
  1178. .unreq OUTPUT_COL
  1179. .unreq TMP1
  1180. .unreq TMP2
  1181. .purgem idct_helper
  1182. /*****************************************************************************/
  1183. /*
  1184. * jsimd_ycc_extrgb_convert_neon
  1185. * jsimd_ycc_extbgr_convert_neon
  1186. * jsimd_ycc_extrgbx_convert_neon
  1187. * jsimd_ycc_extbgrx_convert_neon
  1188. * jsimd_ycc_extxbgr_convert_neon
  1189. * jsimd_ycc_extxrgb_convert_neon
  1190. *
  1191. * Colorspace conversion YCbCr -> RGB
  1192. */
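/*
 * A rough scalar sketch of the conversion implemented by the macros below,
 * using the standard JFIF equations (R = Y + 1.40200*Cr, G = Y - 0.34414*Cb
 * - 0.71414*Cr, B = Y + 1.77200*Cb) with the same fixed-point multipliers
 * that appear in the stage1/stage2 comments (Q15 for the green terms, Q14 for
 * red and blue); variable names here are illustrative only:
 *
 *   int cb = u - 128, cr = v - 128;
 *   int r  = y + ((22971 * cr + (1 << 13)) >> 14);
 *   int g  = y + ((-11277 * cb - 23401 * cr + (1 << 14)) >> 15);
 *   int b  = y + ((29033 * cb + (1 << 13)) >> 14);
 *   // each channel is then clamped to [0, 255] (vqmovun.s16 / vqshlu.s16)
 */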
.macro do_load size
.if \size == 8
vld1.8 {d4}, [U, :64]!
vld1.8 {d5}, [V, :64]!
vld1.8 {d0}, [Y, :64]!
pld [U, #64]
pld [V, #64]
pld [Y, #64]
.elseif \size == 4
vld1.8 {d4[0]}, [U]!
vld1.8 {d4[1]}, [U]!
vld1.8 {d4[2]}, [U]!
vld1.8 {d4[3]}, [U]!
vld1.8 {d5[0]}, [V]!
vld1.8 {d5[1]}, [V]!
vld1.8 {d5[2]}, [V]!
vld1.8 {d5[3]}, [V]!
vld1.8 {d0[0]}, [Y]!
vld1.8 {d0[1]}, [Y]!
vld1.8 {d0[2]}, [Y]!
vld1.8 {d0[3]}, [Y]!
.elseif \size == 2
vld1.8 {d4[4]}, [U]!
vld1.8 {d4[5]}, [U]!
vld1.8 {d5[4]}, [V]!
vld1.8 {d5[5]}, [V]!
vld1.8 {d0[4]}, [Y]!
vld1.8 {d0[5]}, [Y]!
.elseif \size == 1
vld1.8 {d4[6]}, [U]!
vld1.8 {d5[6]}, [V]!
vld1.8 {d0[6]}, [Y]!
.else
.error unsupported macroblock size
.endif
.endm
.macro do_store bpp, size
.if \bpp == 24
.if \size == 8
vst3.8 {d10, d11, d12}, [RGB]!
.elseif \size == 4
vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
.elseif \size == 2
vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
.elseif \size == 1
vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
.else
.error unsupported macroblock size
.endif
.elseif \bpp == 32
.if \size == 8
vst4.8 {d10, d11, d12, d13}, [RGB]!
.elseif \size == 4
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
.elseif \size == 2
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
.elseif \size == 1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
.else
.error unsupported macroblock size
.endif
.elseif \bpp == 16
.if \size == 8
vst1.16 {q15}, [RGB]!
.elseif \size == 4
vst1.16 {d30}, [RGB]!
.elseif \size == 2
vst1.16 {d31[0]}, [RGB]!
vst1.16 {d31[1]}, [RGB]!
.elseif \size == 1
vst1.16 {d31[2]}, [RGB]!
.else
.error unsupported macroblock size
.endif
.else
.error unsupported bpp
.endif
.endm
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
/*
 * 2-stage pipelined YCbCr->RGB conversion
 */
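/*
 * do_yuv_to_rgb_stage1 performs the widening chroma multiplies and
 * do_yuv_to_rgb_stage2 the narrowing shifts, the addition of Y and the
 * saturating pack.  The fused "stage2 + store + load + stage1" macro below
 * interleaves the two stages of consecutive 8-pixel groups to hide the
 * multiply latency.
 */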
.macro do_yuv_to_rgb_stage1
vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
.endm
.macro do_yuv_to_rgb_stage2
vrshrn.s32 d20, q10, #15
vrshrn.s32 d21, q11, #15
vrshrn.s32 d24, q12, #14
vrshrn.s32 d25, q13, #14
vrshrn.s32 d28, q14, #14
vrshrn.s32 d29, q15, #14
vaddw.u8 q11, q10, d0
vaddw.u8 q12, q12, d0
vaddw.u8 q14, q14, d0
.if \bpp != 16
vqmovun.s16 d1\g_offs, q11
vqmovun.s16 d1\r_offs, q12
vqmovun.s16 d1\b_offs, q14
.else /* rgb565 */
vqshlu.s16 q13, q11, #8
vqshlu.s16 q15, q12, #8
vqshlu.s16 q14, q14, #8
vsri.u16 q15, q13, #5
vsri.u16 q15, q14, #11
.endif
.endm
.macro do_yuv_to_rgb_stage2_store_load_stage1
/* "do_yuv_to_rgb_stage2" and "store" */
vrshrn.s32 d20, q10, #15
/* "load" and "do_yuv_to_rgb_stage1" */
pld [U, #64]
vrshrn.s32 d21, q11, #15
pld [V, #64]
vrshrn.s32 d24, q12, #14
vrshrn.s32 d25, q13, #14
vld1.8 {d4}, [U, :64]!
vrshrn.s32 d28, q14, #14
vld1.8 {d5}, [V, :64]!
vrshrn.s32 d29, q15, #14
vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
vaddw.u8 q11, q10, d0
vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
vaddw.u8 q12, q12, d0
vaddw.u8 q14, q14, d0
.if \bpp != 16 /**************** rgb24/rgb32 ******************************/
vqmovun.s16 d1\g_offs, q11
pld [Y, #64]
vqmovun.s16 d1\r_offs, q12
vld1.8 {d0}, [Y, :64]!
vqmovun.s16 d1\b_offs, q14
vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
do_store \bpp, 8
vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
.else /**************************** rgb565 ********************************/
vqshlu.s16 q13, q11, #8
pld [Y, #64]
vqshlu.s16 q15, q12, #8
vqshlu.s16 q14, q14, #8
vld1.8 {d0}, [Y, :64]!
vmull.s16 q11, d7, d1[1]
vmlal.s16 q11, d9, d1[2]
vsri.u16 q15, q13, #5
vmull.s16 q12, d8, d1[0]
vsri.u16 q15, q14, #11
vmull.s16 q13, d9, d1[0]
vmull.s16 q14, d6, d1[3]
do_store \bpp, 8
vmull.s16 q15, d7, d1[3]
.endif
.endm
.macro do_yuv_to_rgb
do_yuv_to_rgb_stage1
do_yuv_to_rgb_stage2
.endm
/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */
.balign 16
jsimd_ycc_\colorid\()_neon_consts:
.short 0, 0, 0, 0
.short 22971, -11277, -23401, 29033
.short -128, -128, -128, -128
.short -128, -128, -128, -128
asm_function jsimd_ycc_\colorid\()_convert_neon
OUTPUT_WIDTH .req r0
INPUT_BUF .req r1
INPUT_ROW .req r2
OUTPUT_BUF .req r3
NUM_ROWS .req r4
INPUT_BUF0 .req r5
INPUT_BUF1 .req r6
INPUT_BUF2 .req INPUT_BUF
RGB .req r7
Y .req r8
U .req r9
V .req r10
N .req ip
/* Load constants to d1, d2, d3 (d0 is just used for padding) */
adr ip, jsimd_ycc_\colorid\()_neon_consts
vld1.16 {d0, d1, d2, d3}, [ip, :128]
/* Save ARM registers and handle input arguments */
push {r4, r5, r6, r7, r8, r9, r10, lr}
ldr NUM_ROWS, [sp, #(4 * 8)]
ldr INPUT_BUF0, [INPUT_BUF]
ldr INPUT_BUF1, [INPUT_BUF, #4]
ldr INPUT_BUF2, [INPUT_BUF, #8]
.unreq INPUT_BUF
/* Save NEON registers */
vpush {d8-d15}
/* Initially set d10, d11, d12, d13 to 0xFF */
vmov.u8 q5, #255
vmov.u8 q6, #255
/* Outer loop over scanlines */
cmp NUM_ROWS, #1
blt 9f
0:
ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
mov N, OUTPUT_WIDTH
ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
add INPUT_ROW, INPUT_ROW, #1
ldr RGB, [OUTPUT_BUF], #4
/* Inner loop over pixels */
subs N, N, #8
blt 3f
do_load 8
do_yuv_to_rgb_stage1
subs N, N, #8
blt 2f
1:
do_yuv_to_rgb_stage2_store_load_stage1
subs N, N, #8
bge 1b
2:
do_yuv_to_rgb_stage2
do_store \bpp, 8
tst N, #7
beq 8f
3:
tst N, #4
beq 3f
do_load 4
3:
tst N, #2
beq 4f
do_load 2
4:
tst N, #1
beq 5f
do_load 1
5:
do_yuv_to_rgb
tst N, #4
beq 6f
do_store \bpp, 4
6:
tst N, #2
beq 7f
do_store \bpp, 2
7:
tst N, #1
beq 8f
do_store \bpp, 1
8:
subs NUM_ROWS, NUM_ROWS, #1
bgt 0b
9:
/* Restore all registers and return */
vpop {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, pc}
.unreq OUTPUT_WIDTH
.unreq INPUT_ROW
.unreq OUTPUT_BUF
.unreq NUM_ROWS
.unreq INPUT_BUF0
.unreq INPUT_BUF1
.unreq INPUT_BUF2
.unreq RGB
.unreq Y
.unreq U
.unreq V
.unreq N
.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
.endm
/*--------------------------------- id ----- bpp R G B */
generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
.purgem do_load
.purgem do_store
/*****************************************************************************/
/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
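/*
 * Reference sketch (plain C, not assembled; the function name is
 * illustrative only): the per-pixel arithmetic implemented below.  The
 * constants are the usual BT.601 weights scaled by 2^16 (19595 ~ 0.29900,
 * 38470 ~ 0.58700, 7471 ~ 0.11400, 11059 ~ 0.16874, 21709 ~ 0.33126,
 * 32768 = 0.5, 27439 ~ 0.41869, 5329 ~ 0.08131).  The chroma accumulators
 * start from 128*2^16 + 32767, which supplies both the +128 offset and the
 * bias for the truncating shift.
 *
 *   #include <stdint.h>
 *
 *   static void rgb_to_ycc_pixel(uint8_t r, uint8_t g, uint8_t b,
 *                                uint8_t *y, uint8_t *cb, uint8_t *cr) {
 *       uint32_t ya  = 19595u * r + 38470u * g + 7471u * b;
 *       uint32_t cba = (128u << 16) + 32767u
 *                    - 11059u * r - 21709u * g + 32768u * b;
 *       uint32_t cra = (128u << 16) + 32767u
 *                    + 32768u * r - 27439u * g - 5329u * b;
 *       *y  = (uint8_t)((ya + 32768u) >> 16);  // vrshrn.u32 #16 (rounded)
 *       *cb = (uint8_t)(cba >> 16);            // vshrn.u32 #16 (bias added)
 *       *cr = (uint8_t)(cra >> 16);
 *   }
 */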
.macro do_store size
.if \size == 8
vst1.8 {d20}, [Y]!
vst1.8 {d21}, [U]!
vst1.8 {d22}, [V]!
.elseif \size == 4
vst1.8 {d20[0]}, [Y]!
vst1.8 {d20[1]}, [Y]!
vst1.8 {d20[2]}, [Y]!
vst1.8 {d20[3]}, [Y]!
vst1.8 {d21[0]}, [U]!
vst1.8 {d21[1]}, [U]!
vst1.8 {d21[2]}, [U]!
vst1.8 {d21[3]}, [U]!
vst1.8 {d22[0]}, [V]!
vst1.8 {d22[1]}, [V]!
vst1.8 {d22[2]}, [V]!
vst1.8 {d22[3]}, [V]!
.elseif \size == 2
vst1.8 {d20[4]}, [Y]!
vst1.8 {d20[5]}, [Y]!
vst1.8 {d21[4]}, [U]!
vst1.8 {d21[5]}, [U]!
vst1.8 {d22[4]}, [V]!
vst1.8 {d22[5]}, [V]!
.elseif \size == 1
vst1.8 {d20[6]}, [Y]!
vst1.8 {d21[6]}, [U]!
vst1.8 {d22[6]}, [V]!
.else
.error unsupported macroblock size
.endif
.endm
.macro do_load bpp, size
.if \bpp == 24
.if \size == 8
vld3.8 {d10, d11, d12}, [RGB]!
pld [RGB, #128]
.elseif \size == 4
vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
.elseif \size == 2
vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
.elseif \size == 1
vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
.else
.error unsupported macroblock size
.endif
.elseif \bpp == 32
.if \size == 8
vld4.8 {d10, d11, d12, d13}, [RGB]!
pld [RGB, #128]
.elseif \size == 4
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
.elseif \size == 2
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
.elseif \size == 1
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
.else
.error unsupported macroblock size
.endif
.else
.error unsupported bpp
.endif
.endm
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
/*
 * 2-stage pipelined RGB->YCbCr conversion
 */
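/*
 * do_rgb_to_yuv_stage1 performs the widening multiply-accumulates and
 * do_rgb_to_yuv_stage2 the narrowing shifts and the final pack to 8 bits.
 * The fused "stage2 + store + load + stage1" macro interleaves the two
 * stages of consecutive 8-pixel groups with the loads and stores to hide
 * the multiply latency.
 */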
.macro do_rgb_to_yuv_stage1
vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
vmull.u16 q7, d4, d0[0]
vmlal.u16 q7, d6, d0[1]
vmlal.u16 q7, d8, d0[2]
vmull.u16 q8, d5, d0[0]
vmlal.u16 q8, d7, d0[1]
vmlal.u16 q8, d9, d0[2]
vrev64.32 q9, q1
vrev64.32 q13, q1
vmlsl.u16 q9, d4, d0[3]
vmlsl.u16 q9, d6, d1[0]
vmlal.u16 q9, d8, d1[1]
vmlsl.u16 q13, d5, d0[3]
vmlsl.u16 q13, d7, d1[0]
vmlal.u16 q13, d9, d1[1]
vrev64.32 q14, q1
vrev64.32 q15, q1
vmlal.u16 q14, d4, d1[1]
vmlsl.u16 q14, d6, d1[2]
vmlsl.u16 q14, d8, d1[3]
vmlal.u16 q15, d5, d1[1]
vmlsl.u16 q15, d7, d1[2]
vmlsl.u16 q15, d9, d1[3]
.endm
.macro do_rgb_to_yuv_stage2
vrshrn.u32 d20, q7, #16
vrshrn.u32 d21, q8, #16
vshrn.u32 d22, q9, #16
vshrn.u32 d23, q13, #16
vshrn.u32 d24, q14, #16
vshrn.u32 d25, q15, #16
vmovn.u16 d20, q10 /* d20 = y */
vmovn.u16 d21, q11 /* d21 = u */
vmovn.u16 d22, q12 /* d22 = v */
.endm
.macro do_rgb_to_yuv
do_rgb_to_yuv_stage1
do_rgb_to_yuv_stage2
.endm
.macro do_rgb_to_yuv_stage2_store_load_stage1
vrshrn.u32 d20, q7, #16
vrshrn.u32 d21, q8, #16
vshrn.u32 d22, q9, #16
vrev64.32 q9, q1
vshrn.u32 d23, q13, #16
vrev64.32 q13, q1
vshrn.u32 d24, q14, #16
vshrn.u32 d25, q15, #16
do_load \bpp, 8
vmovn.u16 d20, q10 /* d20 = y */
vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
vmovn.u16 d21, q11 /* d21 = u */
vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
vmovn.u16 d22, q12 /* d22 = v */
vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
vmull.u16 q7, d4, d0[0]
vmlal.u16 q7, d6, d0[1]
vmlal.u16 q7, d8, d0[2]
vst1.8 {d20}, [Y]!
vmull.u16 q8, d5, d0[0]
vmlal.u16 q8, d7, d0[1]
vmlal.u16 q8, d9, d0[2]
vmlsl.u16 q9, d4, d0[3]
vmlsl.u16 q9, d6, d1[0]
vmlal.u16 q9, d8, d1[1]
vst1.8 {d21}, [U]!
vmlsl.u16 q13, d5, d0[3]
vmlsl.u16 q13, d7, d1[0]
vmlal.u16 q13, d9, d1[1]
vrev64.32 q14, q1
vrev64.32 q15, q1
vmlal.u16 q14, d4, d1[1]
vmlsl.u16 q14, d6, d1[2]
vmlsl.u16 q14, d8, d1[3]
vst1.8 {d22}, [V]!
vmlal.u16 q15, d5, d1[1]
vmlsl.u16 q15, d7, d1[2]
vmlsl.u16 q15, d9, d1[3]
.endm
.balign 16
jsimd_\colorid\()_ycc_neon_consts:
.short 19595, 38470, 7471, 11059
.short 21709, 32768, 27439, 5329
.short 32767, 128, 32767, 128
.short 32767, 128, 32767, 128
asm_function jsimd_\colorid\()_ycc_convert_neon
OUTPUT_WIDTH .req r0
INPUT_BUF .req r1
OUTPUT_BUF .req r2
OUTPUT_ROW .req r3
NUM_ROWS .req r4
OUTPUT_BUF0 .req r5
OUTPUT_BUF1 .req r6
OUTPUT_BUF2 .req OUTPUT_BUF
RGB .req r7
Y .req r8
U .req r9
V .req r10
N .req ip
/* Load constants to d0, d1, d2, d3 */
adr ip, jsimd_\colorid\()_ycc_neon_consts
vld1.16 {d0, d1, d2, d3}, [ip, :128]
/* Save ARM registers and handle input arguments */
push {r4, r5, r6, r7, r8, r9, r10, lr}
ldr NUM_ROWS, [sp, #(4 * 8)]
ldr OUTPUT_BUF0, [OUTPUT_BUF]
ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
.unreq OUTPUT_BUF
/* Save NEON registers */
vpush {d8-d15}
/* Outer loop over scanlines */
cmp NUM_ROWS, #1
blt 9f
0:
ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
mov N, OUTPUT_WIDTH
ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
add OUTPUT_ROW, OUTPUT_ROW, #1
ldr RGB, [INPUT_BUF], #4
/* Inner loop over pixels */
subs N, N, #8
blt 3f
do_load \bpp, 8
do_rgb_to_yuv_stage1
subs N, N, #8
blt 2f
1:
do_rgb_to_yuv_stage2_store_load_stage1
subs N, N, #8
bge 1b
2:
do_rgb_to_yuv_stage2
do_store 8
tst N, #7
beq 8f
3:
tst N, #4
beq 3f
do_load \bpp, 4
3:
tst N, #2
beq 4f
do_load \bpp, 2
4:
tst N, #1
beq 5f
do_load \bpp, 1
5:
do_rgb_to_yuv
tst N, #4
beq 6f
do_store 4
6:
tst N, #2
beq 7f
do_store 2
7:
tst N, #1
beq 8f
do_store 1
8:
subs NUM_ROWS, NUM_ROWS, #1
bgt 0b
9:
/* Restore all registers and return */
vpop {d8-d15}
pop {r4, r5, r6, r7, r8, r9, r10, pc}
.unreq OUTPUT_WIDTH
.unreq OUTPUT_ROW
.unreq INPUT_BUF
.unreq NUM_ROWS
.unreq OUTPUT_BUF0
.unreq OUTPUT_BUF1
.unreq OUTPUT_BUF2
.unreq RGB
.unreq Y
.unreq U
.unreq V
.unreq N
.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1
.endm
/*--------------------------------- id ----- bpp R G B */
generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
.purgem do_load
.purgem do_store
/*****************************************************************************/
/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of VST1.16 instructions
 */
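/*
 * Reference sketch (plain C, not assembled; the function name is
 * illustrative only): the level shift this routine performs with
 * vsubl.u8 and d0 = 128.
 *
 *   #include <stdint.h>
 *
 *   static void convsamp_ref(const uint8_t *const sample_data[8],
 *                            unsigned start_col, int16_t workspace[64]) {
 *       for (int row = 0; row < 8; row++)
 *           for (int col = 0; col < 8; col++)
 *               workspace[row * 8 + col] =
 *                   (int16_t)((int)sample_data[row][start_col + col] - 128);
 *   }
 */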
asm_function jsimd_convsamp_neon
SAMPLE_DATA .req r0
START_COL .req r1
WORKSPACE .req r2
TMP1 .req r3
TMP2 .req r4
TMP3 .req r5
TMP4 .req ip
push {r4, r5}
vmov.u8 d0, #128
ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
add TMP1, TMP1, START_COL
add TMP2, TMP2, START_COL
add TMP3, TMP3, START_COL
add TMP4, TMP4, START_COL
vld1.8 {d16}, [TMP1]
vsubl.u8 q8, d16, d0
vld1.8 {d18}, [TMP2]
vsubl.u8 q9, d18, d0
vld1.8 {d20}, [TMP3]
vsubl.u8 q10, d20, d0
vld1.8 {d22}, [TMP4]
ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
vsubl.u8 q11, d22, d0
vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
add TMP1, TMP1, START_COL
add TMP2, TMP2, START_COL
vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
add TMP3, TMP3, START_COL
add TMP4, TMP4, START_COL
vld1.8 {d24}, [TMP1]
vsubl.u8 q12, d24, d0
vld1.8 {d26}, [TMP2]
vsubl.u8 q13, d26, d0
vld1.8 {d28}, [TMP3]
vsubl.u8 q14, d28, d0
vld1.8 {d30}, [TMP4]
vsubl.u8 q15, d30, d0
vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
pop {r4, r5}
bx lr
.unreq SAMPLE_DATA
.unreq START_COL
.unreq WORKSPACE
.unreq TMP1
.unreq TMP2
.unreq TMP3
.unreq TMP4
/*****************************************************************************/
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */
#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]
.balign 16
jsimd_fdct_ifast_neon_consts:
.short (98 * 128) /* XFIX_0_382683433 */
.short (139 * 128) /* XFIX_0_541196100 */
.short (181 * 128) /* XFIX_0_707106781 */
.short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
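/*
 * Note on the scaling (derived from the vqdmulh semantics, added for
 * clarity): vqdmulh.s16 returns the saturated high half of 2*a*b, i.e.
 * (a * b * 2) >> 16, so multiplying by (c * 128) yields (a * c) >> 8,
 * which matches DESCALE(a * c, 8) with the 8-bit-scaled jfdctfst.c
 * constants 98, 139, 181 and 334.  The last constant is stored as
 * (334 - 256) * 128 because the code adds the unscaled input back in
 * separately, giving a * 334/256 overall.
 */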
asm_function jsimd_fdct_ifast_neon
DATA .req r0
TMP .req ip
vpush {d8-d15}
/* Load constants */
adr TMP, jsimd_fdct_ifast_neon_consts
vld1.16 {d0}, [TMP, :64]
/* Load all DATA into NEON registers with the following allocation:
 *       0 1 2 3 | 4 5 6 7
 *      ---------+--------
 *   0 | d16     | d17    | q8
 *   1 | d18     | d19    | q9
 *   2 | d20     | d21    | q10
 *   3 | d22     | d23    | q11
 *   4 | d24     | d25    | q12
 *   5 | d26     | d27    | q13
 *   6 | d28     | d29    | q14
 *   7 | d30     | d31    | q15
 */
vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
vld1.16 {d28, d29, d30, d31}, [DATA, :128]
sub DATA, DATA, #(128 - 32)
mov TMP, #2
1:
/* Transpose */
vtrn.16 q12, q13
vtrn.16 q10, q11
vtrn.16 q8, q9
vtrn.16 q14, q15
vtrn.32 q9, q11
vtrn.32 q13, q15
vtrn.32 q8, q10
vtrn.32 q12, q14
vswp d30, d23
vswp d24, d17
vswp d26, d19
/* 1-D FDCT */
vadd.s16 q2, q11, q12
vswp d28, d21
vsub.s16 q12, q11, q12
vsub.s16 q6, q10, q13
vadd.s16 q10, q10, q13
vsub.s16 q7, q9, q14
vadd.s16 q9, q9, q14
vsub.s16 q1, q8, q15
vadd.s16 q8, q8, q15
vsub.s16 q4, q9, q10
vsub.s16 q5, q8, q2
vadd.s16 q3, q9, q10
vadd.s16 q4, q4, q5
vadd.s16 q2, q8, q2
vqdmulh.s16 q4, q4, XFIX_0_707106781
vadd.s16 q11, q12, q6
vadd.s16 q8, q2, q3
vsub.s16 q12, q2, q3
vadd.s16 q3, q6, q7
vadd.s16 q7, q7, q1
vqdmulh.s16 q3, q3, XFIX_0_707106781
vsub.s16 q6, q11, q7
vadd.s16 q10, q5, q4
vqdmulh.s16 q6, q6, XFIX_0_382683433
vsub.s16 q14, q5, q4
vqdmulh.s16 q11, q11, XFIX_0_541196100
vqdmulh.s16 q5, q7, XFIX_1_306562965
vadd.s16 q4, q1, q3
vsub.s16 q3, q1, q3
vadd.s16 q7, q7, q6
vadd.s16 q11, q11, q6
vadd.s16 q7, q7, q5
vadd.s16 q13, q3, q11
vsub.s16 q11, q3, q11
vadd.s16 q9, q4, q7
vsub.s16 q15, q4, q7
subs TMP, TMP, #1
bne 1b
/* store results */
vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
vst1.16 {d28, d29, d30, d31}, [DATA, :128]
vpop {d8-d15}
bx lr
.unreq DATA
.unreq TMP
/*****************************************************************************/
/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
 *                      DCTELEM *workspace);
 *
 * Note: the code uses 2-stage pipelining in order to improve instruction
 * scheduling and eliminate stalls (this provides ~15% better performance
 * for this function on both ARM Cortex-A8 and ARM Cortex-A9 when compared
 * to the non-pipelined variant). The instructions which belong to the
 * second stage use different indentation for better readability.
 */
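/*
 * Reference sketch (plain C, not assembled; names are illustrative) of the
 * per-coefficient operation, assuming the usual libjpeg-turbo divisors
 * layout of four 64-entry tables (reciprocal, correction, scale, shift),
 * which is what the #(64 * 2) and #(64 * 6) byte offsets below address.
 *
 *   #include <stdint.h>
 *
 *   static int16_t quantize_one(int16_t coef, uint16_t recip,
 *                               uint16_t corr, uint16_t shift) {
 *       uint16_t sign = (uint16_t)(coef >> 15);              // vshr.s16 #15
 *       uint16_t mag  = (uint16_t)(coef < 0 ? -coef : coef); // vabs.s16
 *       uint32_t prod = (uint32_t)(uint16_t)(mag + corr) * recip; // vmull.u16
 *       uint16_t q    = (uint16_t)((prod >> 16) >> shift);   // vshrn + vshl
 *       return (int16_t)((q ^ sign) - sign);                 // restore sign
 *   }
 */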
asm_function jsimd_quantize_neon
COEF_BLOCK .req r0
DIVISORS .req r1
WORKSPACE .req r2
RECIPROCAL .req DIVISORS
CORRECTION .req r3
SHIFT .req ip
LOOP_COUNT .req r4
vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
vabs.s16 q12, q0
add CORRECTION, DIVISORS, #(64 * 2)
add SHIFT, DIVISORS, #(64 * 6)
vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
vabs.s16 q13, q1
vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
vadd.u16 q12, q12, q10 /* add correction */
vadd.u16 q13, q13, q11
vmull.u16 q10, d24, d16 /* multiply by reciprocal */
vmull.u16 q11, d25, d17
vmull.u16 q8, d26, d18
vmull.u16 q9, d27, d19
vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
vshrn.u32 d20, q10, #16
vshrn.u32 d21, q11, #16
vshrn.u32 d22, q8, #16
vshrn.u32 d23, q9, #16
vneg.s16 q12, q12
vneg.s16 q13, q13
vshr.s16 q2, q0, #15 /* extract sign */
vshr.s16 q3, q1, #15
vshl.u16 q14, q10, q12 /* shift */
vshl.u16 q15, q11, q13
push {r4, r5}
mov LOOP_COUNT, #3
1:
vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
veor.u16 q14, q14, q2 /* restore sign */
vabs.s16 q12, q0
vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
vabs.s16 q13, q1
veor.u16 q15, q15, q3
vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
vadd.u16 q12, q12, q10 /* add correction */
vadd.u16 q13, q13, q11
vmull.u16 q10, d24, d16 /* multiply by reciprocal */
vmull.u16 q11, d25, d17
vmull.u16 q8, d26, d18
vmull.u16 q9, d27, d19
vsub.u16 q14, q14, q2
vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
vsub.u16 q15, q15, q3
vshrn.u32 d20, q10, #16
vshrn.u32 d21, q11, #16
vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
vshrn.u32 d22, q8, #16
vshrn.u32 d23, q9, #16
vneg.s16 q12, q12
vneg.s16 q13, q13
vshr.s16 q2, q0, #15 /* extract sign */
vshr.s16 q3, q1, #15
vshl.u16 q14, q10, q12 /* shift */
vshl.u16 q15, q11, q13
subs LOOP_COUNT, LOOP_COUNT, #1
bne 1b
pop {r4, r5}
veor.u16 q14, q14, q2 /* restore sign */
veor.u16 q15, q15, q3
vsub.u16 q14, q14, q2
vsub.u16 q15, q15, q3
vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
bx lr /* return */
.unreq COEF_BLOCK
.unreq DIVISORS
.unreq WORKSPACE
.unreq RECIPROCAL
.unreq CORRECTION
.unreq SHIFT
.unreq LOOP_COUNT
/*****************************************************************************/
/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
 *                                 JDIMENSION downsampled_width,
 *                                 JSAMPARRAY input_data,
 *                                 JSAMPARRAY *output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 * this code, and resolving it could potentially yield a performance
 * improvement of up to tens of percent on Cortex-A8/Cortex-A9.
 */
/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3. Register q15 is used
 * for adding +1 bias.
 */
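/*
 * Reference sketch (plain C, not assembled; the function name is
 * illustrative only) of the row arithmetic the macros below vectorize,
 * intended to match the scalar h2v1 fancy upsampling in jdsample.c:
 *
 *   #include <stdint.h>
 *
 *   static void h2v1_fancy_upsample_row_ref(const uint8_t *in, uint8_t *out,
 *                                           unsigned width) {
 *       out[0] = in[0];                            // left edge: copy
 *       for (unsigned i = 0; i + 1 < width; i++) {
 *           out[2 * i + 1] = (uint8_t)((3 * in[i] + in[i + 1] + 2) >> 2);
 *           out[2 * i + 2] = (uint8_t)((3 * in[i + 1] + in[i] + 1) >> 2);
 *       }
 *       out[2 * width - 1] = in[width - 1];        // right edge: copy
 *   }
 *
 * The +1 and +2 biases correspond to the truncating vshrn (with the q15 = 1
 * pre-bias) and the rounding vrshrn by #2, respectively.
 */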
.macro upsample16 OUTPTR, INPTR
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vmov q1, q0 /* backup source pixels to q1 */
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
 * the even and odd groups of 16 pixels, so the "vmov q1, q0" instruction is
 * not needed. This unrolling also allows loads and stores to be reordered
 * to hide the multiplication latency and reduce stalls.
 */
.macro upsample32 OUTPTR, INPTR
/* even 16 pixels group */
vld1.8 {q0}, [\INPTR]!
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
/* odd 16 pixels group */
vld1.8 {q1}, [\INPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vmovl.u8 q8, d2
vext.8 q2, q0, q1, #15
vmovl.u8 q9, d3
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d2, d28
vmlal.u8 q11, d3, d28
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
vrshrn.u16 d6, q8, #2
vrshrn.u16 d7, q9, #2
vshrn.u16 d8, q10, #2
vshrn.u16 d9, q11, #2
vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
 */
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
/* special case for the first and last pixels */
sub \WIDTH, \WIDTH, #1
add \OUTPTR, \OUTPTR, #1
ldrb \TMP1, [\INPTR, \WIDTH]
strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
ldrb \TMP1, [\INPTR], #1
strb \TMP1, [\OUTPTR, #-1]
vmov.8 d3[7], \TMP1
subs \WIDTH, \WIDTH, #32
blt 5f
0: /* process 32 pixels per iteration */
upsample32 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #32
bge 0b
5:
adds \WIDTH, \WIDTH, #16
blt 1f
0: /* process 16 pixels if needed */
upsample16 \OUTPTR, \INPTR
subs \WIDTH, \WIDTH, #16
1:
adds \WIDTH, \WIDTH, #16
beq 9f
/* load the remaining 1-15 pixels */
add \INPTR, \INPTR, \WIDTH
tst \WIDTH, #1
beq 2f
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #2
beq 2f
vext.8 d0, d0, d0, #6
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #4
beq 2f
vrev64.32 d0, d0
sub \INPTR, \INPTR, #1
vld1.8 {d0[3]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[2]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[1]}, [\INPTR]
sub \INPTR, \INPTR, #1
vld1.8 {d0[0]}, [\INPTR]
2:
tst \WIDTH, #8
beq 2f
vmov d1, d0
sub \INPTR, \INPTR, #8
vld1.8 {d0}, [\INPTR]
2: /* upsample the remaining pixels */
vmovl.u8 q8, d0
vext.8 q2, q1, q0, #15
vmovl.u8 q9, d1
vaddw.u8 q10, q15, d4
vaddw.u8 q11, q15, d5
vmlal.u8 q8, d4, d28
vmlal.u8 q9, d5, d28
vmlal.u8 q10, d0, d28
vmlal.u8 q11, d1, d28
vrshrn.u16 d10, q8, #2
vrshrn.u16 d12, q9, #2
vshrn.u16 d11, q10, #2
vshrn.u16 d13, q11, #2
vzip.8 d10, d11
vzip.8 d12, d13
/* store the remaining pixels */
tst \WIDTH, #8
beq 2f
vst1.8 {d10, d11}, [\OUTPTR]!
vmov q5, q6
2:
tst \WIDTH, #4
beq 2f
vst1.8 {d10}, [\OUTPTR]!
vmov d10, d11
2:
tst \WIDTH, #2
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
vst1.8 {d10[2]}, [\OUTPTR]!
vst1.8 {d10[3]}, [\OUTPTR]!
vext.8 d10, d10, d10, #4
2:
tst \WIDTH, #1
beq 2f
vst1.8 {d10[0]}, [\OUTPTR]!
vst1.8 {d10[1]}, [\OUTPTR]!
2:
9:
.endm
asm_function jsimd_h2v1_fancy_upsample_neon
MAX_V_SAMP_FACTOR .req r0
DOWNSAMPLED_WIDTH .req r1
INPUT_DATA .req r2
OUTPUT_DATA_PTR .req r3
OUTPUT_DATA .req OUTPUT_DATA_PTR
OUTPTR .req r4
INPTR .req r5
WIDTH .req ip
TMP .req lr
push {r4, r5, r6, lr}
vpush {d8-d15}
ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
cmp MAX_V_SAMP_FACTOR, #0
ble 99f
/* initialize constants */
vmov.u8 d28, #3
vmov.u16 q15, #1
11:
ldr INPTR, [INPUT_DATA], #4
ldr OUTPTR, [OUTPUT_DATA], #4
mov WIDTH, DOWNSAMPLED_WIDTH
upsample_row OUTPTR, INPTR, WIDTH, TMP
subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
bgt 11b
99:
vpop {d8-d15}
pop {r4, r5, r6, pc}
.unreq MAX_V_SAMP_FACTOR
.unreq DOWNSAMPLED_WIDTH
.unreq INPUT_DATA
.unreq OUTPUT_DATA_PTR
.unreq OUTPUT_DATA
.unreq OUTPTR
.unreq INPTR
.unreq WIDTH
.unreq TMP
.purgem upsample16
.purgem upsample32
.purgem upsample_row
/*****************************************************************************/
/*
 * GLOBAL(JOCTET*)
 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
 *                              JCOEFPTR block, int last_dc_val,
 *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
 *
 */
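/*
 * Reference sketch (plain C, not assembled; the struct and names are
 * illustrative only) of the bit-packing discipline implemented by the
 * put_bits / emit_byte / checkbuf15 macros below.  Note that the assembly
 * keeps the output pointer one byte behind and stores with pre-increment
 * (r10 is initialized to buffer - 1).
 *
 *   #include <stdint.h>
 *
 *   typedef struct {
 *       uint32_t put_buffer;   // pending bits, LSB-aligned
 *       int put_bits;          // number of valid bits in put_buffer
 *       uint8_t *out;          // last byte written (pre-increment stores)
 *   } bitwriter;
 *
 *   static void put_bits_ref(bitwriter *bw, uint32_t code, int size) {
 *       bw->put_buffer = (bw->put_buffer << size) | code;
 *       bw->put_bits += size;
 *   }
 *
 *   static void emit_byte_ref(bitwriter *bw) {
 *       bw->put_bits -= 8;
 *       uint8_t c = (uint8_t)(bw->put_buffer >> bw->put_bits);
 *       *++bw->out = c;
 *       if (c == 0xff)         // JPEG byte stuffing: 0xFF is followed by 0x00
 *           *++bw->out = 0;
 *   }
 *
 *   static void checkbuf15_ref(bitwriter *bw) {
 *       if (bw->put_bits >= 16) {  // flush two bytes once 16+ bits pend
 *           emit_byte_ref(bw);
 *           emit_byte_ref(bw);
 *       }
 *   }
 */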
.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
sub \PUT_BITS, \PUT_BITS, #0x8
lsr \TMP, \PUT_BUFFER, \PUT_BITS
uxtb \TMP, \TMP
strb \TMP, [\BUFFER, #1]!
cmp \TMP, #0xff
/*it eq*/
strbeq \ZERO, [\BUFFER, #1]!
.endm
.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
/*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
add \PUT_BITS, \SIZE
/*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
.endm
.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
cmp \PUT_BITS, #0x10
blt 15f
eor \ZERO, \ZERO, \ZERO
emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
15:
.endm
.balign 16
jsimd_huff_encode_one_block_neon_consts:
.byte 0x01
.byte 0x02
.byte 0x04
.byte 0x08
.byte 0x10
.byte 0x20
.byte 0x40
.byte 0x80
asm_function jsimd_huff_encode_one_block_neon
push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
add r7, sp, #0x1c
sub r4, sp, #0x40
bfc r4, #0, #5
mov sp, r4 /* align sp on 32 bytes */
vst1.64 {d8, d9, d10, d11}, [r4, :128]!
vst1.64 {d12, d13, d14, d15}, [r4, :128]
sub sp, #0x140 /* reserve 320 bytes */
str r0, [sp, #0x18] /* working state -> sp + 0x18 */
add r4, sp, #0x20 /* r4 = t1 */
ldr lr, [r7, #0x8] /* lr = dctbl */
sub r10, r1, #0x1 /* r10=buffer-- */
ldrsh r1, [r2]
mov r9, #0x10
mov r8, #0x1
adr r5, jsimd_huff_encode_one_block_neon_consts
/* prepare data */
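/*
 * The scalar address arithmetic below gathers the 64 coefficients in
 * zigzag (jpeg_natural_order) order into q0-q3 and later q4-q7, with
 * d0[0] holding the DC difference.  For each coefficient the code then
 * stores nbits = 16 - clz(|value|) into the t1 scratch area and the value
 * bits (the value itself, or value - 1 for negatives, masked to nbits)
 * into t2, as required by JPEG Huffman coding.
 */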
vld1.8 {d26}, [r5, :64]
veor q8, q8, q8
veor q9, q9, q9
vdup.16 q14, r9
vdup.16 q15, r8
veor q10, q10, q10
veor q11, q11, q11
sub r1, r1, r3
add r9, r2, #0x22
add r8, r2, #0x18
add r3, r2, #0x36
vmov.16 d0[0], r1
vld1.16 {d2[0]}, [r9, :16]
vld1.16 {d4[0]}, [r8, :16]
vld1.16 {d6[0]}, [r3, :16]
add r1, r2, #0x2
add r9, r2, #0x30
add r8, r2, #0x26
add r3, r2, #0x28
vld1.16 {d0[1]}, [r1, :16]
vld1.16 {d2[1]}, [r9, :16]
vld1.16 {d4[1]}, [r8, :16]
vld1.16 {d6[1]}, [r3, :16]
add r1, r2, #0x10
add r9, r2, #0x40
add r8, r2, #0x34
add r3, r2, #0x1a
vld1.16 {d0[2]}, [r1, :16]
vld1.16 {d2[2]}, [r9, :16]
vld1.16 {d4[2]}, [r8, :16]
vld1.16 {d6[2]}, [r3, :16]
add r1, r2, #0x20
add r9, r2, #0x32
add r8, r2, #0x42
add r3, r2, #0xc
vld1.16 {d0[3]}, [r1, :16]
vld1.16 {d2[3]}, [r9, :16]
vld1.16 {d4[3]}, [r8, :16]
vld1.16 {d6[3]}, [r3, :16]
add r1, r2, #0x12
add r9, r2, #0x24
add r8, r2, #0x50
add r3, r2, #0xe
vld1.16 {d1[0]}, [r1, :16]
vld1.16 {d3[0]}, [r9, :16]
vld1.16 {d5[0]}, [r8, :16]
vld1.16 {d7[0]}, [r3, :16]
add r1, r2, #0x4
add r9, r2, #0x16
add r8, r2, #0x60
add r3, r2, #0x1c
vld1.16 {d1[1]}, [r1, :16]
vld1.16 {d3[1]}, [r9, :16]
vld1.16 {d5[1]}, [r8, :16]
vld1.16 {d7[1]}, [r3, :16]
add r1, r2, #0x6
add r9, r2, #0x8
add r8, r2, #0x52
add r3, r2, #0x2a
vld1.16 {d1[2]}, [r1, :16]
vld1.16 {d3[2]}, [r9, :16]
vld1.16 {d5[2]}, [r8, :16]
vld1.16 {d7[2]}, [r3, :16]
add r1, r2, #0x14
add r9, r2, #0xa
add r8, r2, #0x44
add r3, r2, #0x38
vld1.16 {d1[3]}, [r1, :16]
vld1.16 {d3[3]}, [r9, :16]
vld1.16 {d5[3]}, [r8, :16]
vld1.16 {d7[3]}, [r3, :16]
vcgt.s16 q8, q8, q0
vcgt.s16 q9, q9, q1
vcgt.s16 q10, q10, q2
vcgt.s16 q11, q11, q3
vabs.s16 q0, q0
vabs.s16 q1, q1
vabs.s16 q2, q2
vabs.s16 q3, q3
veor q8, q8, q0
veor q9, q9, q1
veor q10, q10, q2
veor q11, q11, q3
add r9, r4, #0x20
add r8, r4, #0x80
add r3, r4, #0xa0
vclz.i16 q0, q0
vclz.i16 q1, q1
vclz.i16 q2, q2
vclz.i16 q3, q3
vsub.i16 q0, q14, q0
vsub.i16 q1, q14, q1
vsub.i16 q2, q14, q2
vsub.i16 q3, q14, q3
vst1.16 {d0, d1, d2, d3}, [r4, :256]
vst1.16 {d4, d5, d6, d7}, [r9, :256]
vshl.s16 q0, q15, q0
vshl.s16 q1, q15, q1
vshl.s16 q2, q15, q2
vshl.s16 q3, q15, q3
vsub.i16 q0, q0, q15
vsub.i16 q1, q1, q15
vsub.i16 q2, q2, q15
vsub.i16 q3, q3, q15
vand q8, q8, q0
vand q9, q9, q1
vand q10, q10, q2
vand q11, q11, q3
vst1.16 {d16, d17, d18, d19}, [r8, :256]
vst1.16 {d20, d21, d22, d23}, [r3, :256]
add r1, r2, #0x46
add r9, r2, #0x3a
add r8, r2, #0x74
add r3, r2, #0x6a
vld1.16 {d8[0]}, [r1, :16]
vld1.16 {d10[0]}, [r9, :16]
vld1.16 {d12[0]}, [r8, :16]
vld1.16 {d14[0]}, [r3, :16]
veor q8, q8, q8
veor q9, q9, q9
veor q10, q10, q10
veor q11, q11, q11
add r1, r2, #0x54
add r9, r2, #0x2c
add r8, r2, #0x76
add r3, r2, #0x78
vld1.16 {d8[1]}, [r1, :16]
vld1.16 {d10[1]}, [r9, :16]
vld1.16 {d12[1]}, [r8, :16]
vld1.16 {d14[1]}, [r3, :16]
add r1, r2, #0x62
add r9, r2, #0x1e
add r8, r2, #0x68
add r3, r2, #0x7a
vld1.16 {d8[2]}, [r1, :16]
vld1.16 {d10[2]}, [r9, :16]
vld1.16 {d12[2]}, [r8, :16]
vld1.16 {d14[2]}, [r3, :16]
add r1, r2, #0x70
add r9, r2, #0x2e
add r8, r2, #0x5a
add r3, r2, #0x6c
vld1.16 {d8[3]}, [r1, :16]
vld1.16 {d10[3]}, [r9, :16]
vld1.16 {d12[3]}, [r8, :16]
vld1.16 {d14[3]}, [r3, :16]
add r1, r2, #0x72
add r9, r2, #0x3c
add r8, r2, #0x4c
add r3, r2, #0x5e
vld1.16 {d9[0]}, [r1, :16]
vld1.16 {d11[0]}, [r9, :16]
vld1.16 {d13[0]}, [r8, :16]
vld1.16 {d15[0]}, [r3, :16]
add r1, r2, #0x64
add r9, r2, #0x4a
add r8, r2, #0x3e
add r3, r2, #0x6e
vld1.16 {d9[1]}, [r1, :16]
vld1.16 {d11[1]}, [r9, :16]
vld1.16 {d13[1]}, [r8, :16]
vld1.16 {d15[1]}, [r3, :16]
add r1, r2, #0x56
add r9, r2, #0x58
add r8, r2, #0x4e
add r3, r2, #0x7c
vld1.16 {d9[2]}, [r1, :16]
vld1.16 {d11[2]}, [r9, :16]
vld1.16 {d13[2]}, [r8, :16]
vld1.16 {d15[2]}, [r3, :16]
add r1, r2, #0x48
add r9, r2, #0x66
add r8, r2, #0x5c
add r3, r2, #0x7e
vld1.16 {d9[3]}, [r1, :16]
vld1.16 {d11[3]}, [r9, :16]
vld1.16 {d13[3]}, [r8, :16]
vld1.16 {d15[3]}, [r3, :16]
vcgt.s16 q8, q8, q4
vcgt.s16 q9, q9, q5
vcgt.s16 q10, q10, q6
vcgt.s16 q11, q11, q7
vabs.s16 q4, q4
vabs.s16 q5, q5
vabs.s16 q6, q6
vabs.s16 q7, q7
veor q8, q8, q4
veor q9, q9, q5
veor q10, q10, q6
veor q11, q11, q7
add r1, r4, #0x40
add r9, r4, #0x60
add r8, r4, #0xc0
add r3, r4, #0xe0
vclz.i16 q4, q4
vclz.i16 q5, q5
vclz.i16 q6, q6
vclz.i16 q7, q7
vsub.i16 q4, q14, q4
vsub.i16 q5, q14, q5
vsub.i16 q6, q14, q6
vsub.i16 q7, q14, q7
vst1.16 {d8, d9, d10, d11}, [r1, :256]
vst1.16 {d12, d13, d14, d15}, [r9, :256]
vshl.s16 q4, q15, q4
vshl.s16 q5, q15, q5
vshl.s16 q6, q15, q6
vshl.s16 q7, q15, q7
vsub.i16 q4, q4, q15
vsub.i16 q5, q5, q15
vsub.i16 q6, q6, q15
vsub.i16 q7, q7, q15
vand q8, q8, q4
vand q9, q9, q5
vand q10, q10, q6
vand q11, q11, q7
vst1.16 {d16, d17, d18, d19}, [r8, :256]
vst1.16 {d20, d21, d22, d23}, [r3, :256]
ldr r12, [r7, #0xc] /* r12 = actbl */
add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
mov r9, r12 /* r9 = actbl */
add r6, r4, #0x80 /* r6 = t2 */
ldr r11, [r0, #0x8] /* r11 = put_buffer */
ldr r4, [r0, #0xc] /* r4 = put_bits */
ldrh r2, [r6, #-128] /* r2 = nbits */
ldrh r3, [r6] /* r3 = temp2 & (((JLONG) 1)<<nbits) - 1; */
ldr r0, [lr, r2, lsl #2]
ldrb r5, [r1, r2]
put_bits r11, r4, r0, r5
checkbuf15 r10, r11, r4, r5, r0
put_bits r11, r4, r3, r2
checkbuf15 r10, r11, r4, r5, r0
mov lr, r6 /* lr = t2 */
add r5, r9, #0x400 /* r5 = actbl->ehufsi */
ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
veor q8, q8, q8
vceq.i16 q0, q0, q8
vceq.i16 q1, q1, q8
vceq.i16 q2, q2, q8
vceq.i16 q3, q3, q8
vceq.i16 q4, q4, q8
vceq.i16 q5, q5, q8
vceq.i16 q6, q6, q8
vceq.i16 q7, q7, q8
vmovn.i16 d0, q0
vmovn.i16 d2, q1
vmovn.i16 d4, q2
vmovn.i16 d6, q3
vmovn.i16 d8, q4
vmovn.i16 d10, q5
vmovn.i16 d12, q6
vmovn.i16 d14, q7
vand d0, d0, d26
vand d2, d2, d26
vand d4, d4, d26
vand d6, d6, d26
vand d8, d8, d26
vand d10, d10, d26
vand d12, d12, d26
vand d14, d14, d26
vpadd.i8 d0, d0, d2
vpadd.i8 d4, d4, d6
vpadd.i8 d8, d8, d10
vpadd.i8 d12, d12, d14
vpadd.i8 d0, d0, d4
vpadd.i8 d8, d8, d12
vpadd.i8 d0, d0, d8
vmov.32 r1, d0[1]
vmov.32 r8, d0[0]
mvn r1, r1
mvn r8, r8
lsrs r1, r1, #0x1
rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
rbit r1, r1 /* r1 = index1 */
rbit r8, r8 /* r8 = index0 */
ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
str r1, [sp, #0x14] /* index1 -> sp + 0x14 */
cmp r8, #0x0
beq 6f
1:
clz r2, r8
add lr, lr, r2, lsl #1
lsl r8, r8, r2
ldrh r1, [lr, #-126]
2:
cmp r2, #0x10
blt 3f
sub r2, r2, #0x10
put_bits r11, r4, r0, r6
cmp r4, #0x10
blt 2b
eor r3, r3, r3
emit_byte r10, r11, r4, r3, r12
emit_byte r10, r11, r4, r3, r12
b 2b
3:
add r2, r1, r2, lsl #4
ldrh r3, [lr, #2]!
ldr r12, [r9, r2, lsl #2]
ldrb r2, [r5, r2]
put_bits r11, r4, r12, r2
checkbuf15 r10, r11, r4, r2, r12
put_bits r11, r4, r3, r1
checkbuf15 r10, r11, r4, r2, r12
lsls r8, r8, #0x1
bne 1b
6:
add r12, sp, #0x20 /* r12 = t1 */
ldr r8, [sp, #0x14] /* r8 = index1 */
adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
cmp r8, #0x0
beq 6f
clz r2, r8
sub r12, r12, lr
lsl r8, r8, r2
add r2, r2, r12, lsr #1
add lr, lr, r2, lsl #1
b 7f
1:
clz r2, r8
add lr, lr, r2, lsl #1
lsl r8, r8, r2
7:
ldrh r1, [lr, #-126]
2:
cmp r2, #0x10
blt 3f
sub r2, r2, #0x10
put_bits r11, r4, r0, r6
cmp r4, #0x10
blt 2b
eor r3, r3, r3
emit_byte r10, r11, r4, r3, r12
emit_byte r10, r11, r4, r3, r12
b 2b
3:
add r2, r1, r2, lsl #4
ldrh r3, [lr, #2]!
ldr r12, [r9, r2, lsl #2]
ldrb r2, [r5, r2]
put_bits r11, r4, r12, r2
checkbuf15 r10, r11, r4, r2, r12
put_bits r11, r4, r3, r1
checkbuf15 r10, r11, r4, r2, r12
lsls r8, r8, #0x1
bne 1b
6:
add r0, sp, #0x20
add r0, #0xfe
cmp lr, r0
bhs 1f
ldr r1, [r9]
ldrb r0, [r5]
put_bits r11, r4, r1, r0
checkbuf15 r10, r11, r4, r0, r1
1:
ldr r12, [sp, #0x18]
str r11, [r12, #0x8]
str r4, [r12, #0xc]
add r0, r10, #0x1
add r4, sp, #0x140
vld1.64 {d8, d9, d10, d11}, [r4, :128]!
vld1.64 {d12, d13, d14, d15}, [r4, :128]
sub r4, r7, #0x1c
mov sp, r4
pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
.purgem emit_byte
.purgem put_bits
.purgem checkbuf15
#endif // ESENTHEL CHANGED