/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
    MEMACCESS(0) \
    "vld1.8 {d0}, [%0]! \n" \
    MEMACCESS(1) \
    "vld1.32 {d2[0]}, [%1]! \n" \
    MEMACCESS(2) \
    "vld1.32 {d2[1]}, [%2]! \n"

// Read 8 Y, 2 U and 2 V from 411
#define READYUV411 \
    MEMACCESS(0) \
    "vld1.8 {d0}, [%0]! \n" \
    MEMACCESS(1) \
    "vld1.16 {d2[0]}, [%1]! \n" \
    MEMACCESS(2) \
    "vld1.16 {d2[1]}, [%2]! \n" \
    "vmov.u8 d3, d2 \n" \
    "vzip.u8 d2, d3 \n"

// Read 8 Y, 8 U and 8 V from 444, then average each pair of U and each
// pair of V down to the 4 U and 4 V that YUVTORGB expects.
#define READYUV444 \
    MEMACCESS(0) \
    "vld1.8 {d0}, [%0]! \n" \
    MEMACCESS(1) \
    "vld1.8 {d2}, [%1]! \n" \
    MEMACCESS(2) \
    "vld1.8 {d3}, [%2]! \n" \
    "vpaddl.u8 q1, q1 \n" \
    "vrshrn.u16 d2, q1, #1 \n"

// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
    MEMACCESS(0) \
    "vld1.8 {d0}, [%0]! \n" \
    "vmov.u8 d2, #128 \n"

// Read 8 Y and 4 UV from NV12
#define READNV12 \
    MEMACCESS(0) \
    "vld1.8 {d0}, [%0]! \n" \
    MEMACCESS(1) \
    "vld1.8 {d2}, [%1]! \n" \
    "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
    "vuzp.u8 d2, d3 \n" \
    "vtrn.u32 d2, d3 \n"

// Read 8 Y and 4 VU from NV21
#define READNV21 \
    MEMACCESS(0) \
    "vld1.8 {d0}, [%0]! \n" \
    MEMACCESS(1) \
    "vld1.8 {d2}, [%1]! \n" \
    "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
    "vuzp.u8 d3, d2 \n" \
    "vtrn.u32 d2, d3 \n"

// Read 8 YUY2
#define READYUY2 \
    MEMACCESS(0) \
    "vld2.8 {d0, d2}, [%0]! \n" \
    "vmov.u8 d3, d2 \n" \
    "vuzp.u8 d2, d3 \n" \
    "vtrn.u32 d2, d3 \n"

// Read 8 UYVY
#define READUYVY \
    MEMACCESS(0) \
    "vld2.8 {d2, d3}, [%0]! \n" \
    "vmov.u8 d0, d3 \n" \
    "vmov.u8 d3, d2 \n" \
    "vuzp.u8 d2, d3 \n" \
    "vtrn.u32 d2, d3 \n"

#define YUVTORGB_SETUP \
    MEMACCESS([kUVToRB]) \
    "vld1.8 {d24}, [%[kUVToRB]] \n" \
    MEMACCESS([kUVToG]) \
    "vld1.8 {d25}, [%[kUVToG]] \n" \
    MEMACCESS([kUVBiasBGR]) \
    "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
    MEMACCESS([kUVBiasBGR]) \
    "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
    MEMACCESS([kUVBiasBGR]) \
    "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
    MEMACCESS([kYToRgb]) \
    "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"

#define YUVTORGB \
    "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
    "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \
    "vmovl.u8 q0, d0 \n" /* Y */ \
    "vmovl.s16 q10, d1 \n" \
    "vmovl.s16 q0, d0 \n" \
    "vmul.s32 q10, q10, q15 \n" \
    "vmul.s32 q0, q0, q15 \n" \
    "vqshrun.s32 d0, q0, #16 \n" \
    "vqshrun.s32 d1, q10, #16 \n" /* Y */ \
    "vadd.s16 d18, d19 \n" \
    "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \
    "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \
    "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG) */ \
    "vaddw.u16 q1, q1, d16 \n" \
    "vaddw.u16 q10, q10, d17 \n" \
    "vaddw.u16 q3, q3, d18 \n" \
    "vqadd.s16 q8, q0, q13 \n" /* B */ \
    "vqadd.s16 q9, q0, q14 \n" /* R */ \
    "vqadd.s16 q0, q0, q4 \n" /* G */ \
    "vqadd.s16 q8, q8, q1 \n" /* B */ \
    "vqadd.s16 q9, q9, q10 \n" /* R */ \
    "vqsub.s16 q0, q0, q3 \n" /* G */ \
    "vqshrun.s16 d20, q8, #6 \n" /* B */ \
    "vqshrun.s16 d22, q9, #6 \n" /* R */ \
    "vqshrun.s16 d21, q0, #6 \n" /* G */

void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
  "1: \n"
    READYUV444
    YUVTORGB
    "subs %4, %4, #8 \n"
    MEMACCESS(3)
    "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
  "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    MEMACCESS(3)
    "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I422AlphaToARGBRow_NEON(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             const uint8* src_a,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1: \n"
    READYUV422
    YUVTORGB
    "subs %5, %5, #8 \n"
    MEMACCESS(3)
    "vld1.8 {d23}, [%3]! \n"
    MEMACCESS(4)
    "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(src_a),     // %3
      "+r"(dst_argb),  // %4
      "+r"(width)      // %5
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I411ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
  "1: \n"
    READYUV411
    YUVTORGB
    "subs %4, %4, #8 \n"
    MEMACCESS(3)
    "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB
    MEMACCESS(3)
    "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I422ToRGB24Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    MEMACCESS(3)
    "vst3.8 {d20, d21, d22}, [%3]! \n"
    "bgt 1b \n"
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_rgb24),  // %3
      "+r"(width)       // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#define ARGBTORGB565 \
    "vshll.u8 q0, d22, #8 \n" /* R */ \
    "vshll.u8 q8, d21, #8 \n" /* G */ \
    "vshll.u8 q9, d20, #8 \n" /* B */ \
    "vsri.16 q0, q8, #5 \n" /* RG */ \
    "vsri.16 q0, q9, #11 \n" /* RGB */

void I422ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    ARGBTORGB565
    MEMACCESS(3)
    "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(src_y),       // %0
      "+r"(src_u),       // %1
      "+r"(src_v),       // %2
      "+r"(dst_rgb565),  // %3
      "+r"(width)        // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#define ARGBTOARGB1555 \
    "vshll.u8 q0, d23, #8 \n" /* A */ \
    "vshll.u8 q8, d22, #8 \n" /* R */ \
    "vshll.u8 q9, d21, #8 \n" /* G */ \
    "vshll.u8 q10, d20, #8 \n" /* B */ \
    "vsri.16 q0, q8, #1 \n" /* AR */ \
    "vsri.16 q0, q9, #6 \n" /* ARG */ \
    "vsri.16 q0, q10, #11 \n" /* ARGB */

void I422ToARGB1555Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    "vmov.u8 d23, #255 \n"
    ARGBTOARGB1555
    MEMACCESS(3)
    "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
    "bgt 1b \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb1555),  // %3
      "+r"(width)          // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#define ARGBTOARGB4444 \
    "vshr.u8 d20, d20, #4 \n" /* B */ \
    "vbic.32 d21, d21, d4 \n" /* G */ \
    "vshr.u8 d22, d22, #4 \n" /* R */ \
    "vbic.32 d23, d23, d4 \n" /* A */ \
    "vorr d0, d20, d21 \n" /* BG */ \
    "vorr d1, d22, d23 \n" /* RA */ \
    "vzip.u8 d0, d1 \n" /* BGRA */

void I422ToARGB4444Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
  "1: \n"
    READYUV422
    YUVTORGB
    "subs %4, %4, #8 \n"
    "vmov.u8 d23, #255 \n"
    ARGBTOARGB4444
    MEMACCESS(3)
    "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
    "bgt 1b \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)          // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void I400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
  "1: \n"
    READYUV400
    YUVTORGB
    "subs %2, %2, #8 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void J400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vmov.u8 d23, #255 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {d20}, [%0]! \n"
    "vmov d21, d20 \n"
    "vmov d22, d20 \n"
    "subs %2, %2, #8 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d20", "d21", "d22", "d23"
  );
}

void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
  "1: \n"
    READNV12
    YUVTORGB
    "subs %3, %3, #8 \n"
    MEMACCESS(2)
    "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_vu,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
  "1: \n"
    READNV21
    YUVTORGB
    "subs %3, %3, #8 \n"
    MEMACCESS(2)
    "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1: \n"
    READNV12
    YUVTORGB
    "subs %3, %3, #8 \n"
    ARGBTORGB565
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(src_y),       // %0
      "+r"(src_uv),      // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)        // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
  "1: \n"
    READYUY2
    YUVTORGB
    "subs %2, %2, #8 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "vmov.u8 d23, #255 \n"
  "1: \n"
    READUYVY
    YUVTORGB
    "subs %2, %2, #8 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

// Reads 16 pairs of UV and writes the even-indexed bytes (U) to dst_u and
// the odd-indexed bytes (V) to dst_v.
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
    "subs %3, %3, #16 \n" // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store U
    MEMACCESS(2)
    "vst1.8 {q1}, [%2]! \n" // store V
    "bgt 1b \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load U
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n" // load V
    "subs %3, %3, #16 \n" // 16 processed per loop
    MEMACCESS(2)
    "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
    "bgt 1b \n"
    : "+r"(src_u),   // %0
      "+r"(src_v),   // %1
      "+r"(dst_uv),  // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

// Copy multiples of 32 bytes. A 4-register vld1.8 allows unaligned access
// and is fastest on A15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
    "subs %2, %2, #32 \n" // 32 processed per loop
    MEMACCESS(1)
    "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
    "bgt 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(count)   // %2  // Output registers
    :                      // Input registers
    : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

// SetRow writes 'count' bytes using an 8-bit value repeated.
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
  asm volatile (
    "vdup.8 q0, %2 \n" // duplicate 16 bytes
  "1: \n"
    "subs %1, %1, #16 \n" // 16 bytes per loop
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n" // store
    "bgt 1b \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v8)      // %2
    : "cc", "memory", "q0"
  );
}

// ARGBSetRow writes 'count' pixels using a 32-bit value repeated.
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
  asm volatile (
    "vdup.u32 q0, %2 \n" // duplicate 4 ints
  "1: \n"
    "subs %1, %1, #4 \n" // 4 pixels per loop
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n" // store
    "bgt 1b \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v32)     // %2
    : "cc", "memory", "q0"
  );
}

void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "mov r3, #-16 \n"
    "add %0, %0, %2 \n"
    "sub %0, #16 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0], r3 \n" // src -= 16
    "subs %2, #16 \n" // 16 pixels per loop.
    "vrev64.8 q0, q0 \n"
    MEMACCESS(1)
    "vst1.8 {d1}, [%1]! \n" // dst += 16
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    :
    : "cc", "memory", "r3", "q0"
  );
}

void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
  asm volatile (
    // Start at end of source row.
    "mov r12, #-16 \n"
    "add %0, %0, %3, lsl #1 \n"
    "sub %0, #16 \n"
  "1: \n"
    MEMACCESS(0)
    "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
    "subs %3, #8 \n" // 8 pixels per loop.
    "vrev64.8 q0, q0 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // dst += 8
    MEMACCESS(2)
    "vst1.8 {d1}, [%2]! \n"
    "bgt 1b \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3
    :
    : "cc", "memory", "r12", "q0"
  );
}

void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "mov r3, #-16 \n"
    "add %0, %0, %2, lsl #2 \n"
    "sub %0, #16 \n"
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0], r3 \n" // src -= 16
    "subs %2, #4 \n" // 4 pixels per loop.
    "vrev64.32 q0, q0 \n"
    MEMACCESS(1)
    "vst1.8 {d1}, [%1]! \n" // dst += 16
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    :
    : "cc", "memory", "r3", "q0"
  );
}

void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d4, #255 \n" // Alpha
  "1: \n"
    MEMACCESS(0)
    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    MEMACCESS(1)
    "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_rgb24),  // %0
      "+r"(dst_argb),   // %1
      "+r"(width)       // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d4, #255 \n" // Alpha
  "1: \n"
    MEMACCESS(0)
    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vswp.u8 d1, d3 \n" // swap R, B
    MEMACCESS(1)
    "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_raw),   // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vswp.u8 d1, d3 \n" // swap R, B
    MEMACCESS(1)
    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
    "bgt 1b \n"
    : "+r"(src_raw),    // %0
      "+r"(dst_rgb24),  // %1
      "+r"(width)       // %2
    :
    : "cc", "memory", "d1", "d2", "d3"  // Clobber List
  );
}

#define RGB565TOARGB \
    "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
    "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
    "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
    "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
    "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
    "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
    "vorr.u8 d0, d0, d4 \n" /* B */ \
    "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
    "vorr.u8 d2, d1, d5 \n" /* R */ \
    "vorr.u8 d1, d4, d6 \n" /* G */

void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    RGB565TOARGB
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_rgb565),  // %0
      "+r"(dst_argb),    // %1
      "+r"(width)        // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

#define ARGB1555TOARGB \
    "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
    "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
    "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
    "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
    "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
    "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
    "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
    "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
    "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
    "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
    "vorr.u8 q1, q1, q3 \n" /* R,A */ \
    "vorr.u8 q0, q0, q2 \n" /* B,G */

// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB \
    "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
    "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
    "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
    "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
    "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
    "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
    "vorr.u8 d0, d0, d4 \n" /* B */ \
    "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
    "vorr.u8 d2, d1, d5 \n" /* R */ \
    "vorr.u8 d1, d4, d6 \n" /* G */

void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB1555TOARGB
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_argb1555),  // %0
      "+r"(dst_argb),      // %1
      "+r"(width)          // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

#define ARGB4444TOARGB \
    "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
    "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
    "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
    "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
    "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
    "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
    "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
    "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */

void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB4444TOARGB
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_argb4444),  // %0
      "+r"(dst_argb),      // %1
      "+r"(width)          // %2
    :
    : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}

void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    MEMACCESS(1)
    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
    "bgt 1b \n"
    : "+r"(src_argb),   // %0
      "+r"(dst_rgb24),  // %1
      "+r"(width)       // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vswp.u8 d1, d3 \n" // swap R, B
    MEMACCESS(1)
    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
    "bgt 1b \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_raw),   // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %2, %2, #16 \n" // 16 processed per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
    "bgt 1b \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_y),     // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %2, %2, #16 \n" // 16 processed per loop.
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
    "bgt 1b \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_y),     // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "vst1.8 {d1}, [%1]! \n" // store 8 U.
    MEMACCESS(2)
    "vst1.8 {d3}, [%2]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_u),     // %1
      "+r"(dst_v),     // %2
      "+r"(width)      // %3
    :
    : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}

void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 U.
    MEMACCESS(2)
    "vst1.8 {d2}, [%2]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_u),     // %1
      "+r"(dst_v),     // %2
      "+r"(width)      // %3
    :
    : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}

void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // stride + src_yuy2
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
    "vrhadd.u8 d1, d1, d5 \n" // average rows of U
    "vrhadd.u8 d3, d3, d7 \n" // average rows of V
    MEMACCESS(2)
    "vst1.8 {d1}, [%2]! \n" // store 8 U.
    MEMACCESS(3)
    "vst1.8 {d3}, [%3]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_yuy2),     // %0
      "+r"(stride_yuy2),  // %1
      "+r"(dst_u),        // %2
      "+r"(dst_v),        // %3
      "+r"(width)         // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
  );
}

void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // stride + src_uyvy
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
    "vrhadd.u8 d0, d0, d4 \n" // average rows of U
    "vrhadd.u8 d2, d2, d6 \n" // average rows of V
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 U.
    MEMACCESS(3)
    "vst1.8 {d2}, [%3]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_uyvy),     // %0
      "+r"(stride_uyvy),  // %1
      "+r"(dst_u),        // %2
      "+r"(dst_v),        // %3
      "+r"(width)         // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
  );
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q2}, [%3] \n" // shuffler
  "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
    "subs %2, %2, #4 \n" // 4 processed per loop
    "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
    "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n" // store 4.
    "bgt 1b \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : "r"(shuffler)    // %3
    : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
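
// For example (an illustrative table, not one defined in this file), a
// shuffler of {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}
// reverses the byte order within each 32-bit pixel.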

void I422ToYUY2Row_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_yuy2, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
    MEMACCESS(1)
    "vld1.8 {d1}, [%1]! \n" // load 8 Us
    MEMACCESS(2)
    "vld1.8 {d3}, [%2]! \n" // load 8 Vs
    "subs %4, %4, #16 \n" // 16 pixels
    MEMACCESS(3)
    "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_yuy2),  // %3
      "+r"(width)      // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3"
  );
}

void I422ToUYVYRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_uyvy, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
    MEMACCESS(1)
    "vld1.8 {d0}, [%1]! \n" // load 8 Us
    MEMACCESS(2)
    "vld1.8 {d2}, [%2]! \n" // load 8 Vs
    "subs %4, %4, #16 \n" // 16 pixels
    MEMACCESS(3)
    "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
    "bgt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_uyvy),  // %3
      "+r"(width)      // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3"
  );
}

void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTORGB565
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(src_argb),    // %0
      "+r"(dst_rgb565),  // %1
      "+r"(width)        // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}

void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  asm volatile (
    "vdup.32 d2, %2 \n" // dither4
  "1: \n"
    MEMACCESS(1)
    "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 d20, d20, d2 \n"
    "vqadd.u8 d21, d21, d2 \n"
    "vqadd.u8 d22, d22, d2 \n"
    ARGBTORGB565
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(dst_rgb)    // %0
    : "r"(src_argb),   // %1
      "r"(dither4),    // %2
      "r"(width)       // %3
    : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
  );
}

void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                            int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTOARGB1555
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
    "bgt 1b \n"
    : "+r"(src_argb),      // %0
      "+r"(dst_argb1555),  // %1
      "+r"(width)          // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}

void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
                            int width) {
  asm volatile (
    "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTOARGB4444
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
    "bgt 1b \n"
    : "+r"(src_argb),      // %0
      "+r"(dst_argb4444),  // %1
      "+r"(width)          // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}
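
// In effect (a scalar sketch; clamp8 = saturate to 0..255):
//   y = clamp8(((13 * b + 65 * g + 33 * r + 64) >> 7) + 16)
// i.e. roughly the BT.601 studio-range luma weights scaled by 128.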
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_y),     // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}

void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
    "subs %2, %2, #16 \n" // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n" // store 16 A's.
    "bgt 1b \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_a),     // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
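
// In effect (a scalar sketch): y = clamp8((15 * b + 75 * g + 38 * r + 64) >> 7),
// full-range (JPeg) luma with no +16 offset; the weights sum to 128.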
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
    "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
    "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_y),     // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}

// 8x1 pixels.
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int width) {
  asm volatile (
    "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
    "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
    "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
    "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
    "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlsl.u8 q2, d1, d25 \n" // G
    "vmlsl.u8 q2, d2, d26 \n" // R
    "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
    "vmull.u8 q3, d2, d24 \n" // R
    "vmlsl.u8 q3, d1, d28 \n" // G
    "vmlsl.u8 q3, d0, d27 \n" // B
    "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
    MEMACCESS(2)
    "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_u),     // %1
      "+r"(dst_v),     // %2
      "+r"(width)      // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
  );
}

// 32x1 pixels -> 8x1. Width is the number of ARGB pixels, e.g. 32.
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int width) {
  asm volatile (
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
  "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(0)
    "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
    "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
    "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
    "vpadd.u16 d1, d8, d9 \n" // B
    "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
    "vpadd.u16 d3, d10, d11 \n" // G
    "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
    "vpadd.u16 d5, d12, d13 \n" // R
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %3, %3, #32 \n" // 32 processed per loop.
    "vmul.s16 q8, q0, q10 \n" // B
    "vmls.s16 q8, q1, q11 \n" // G
    "vmls.s16 q8, q2, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
    "vmul.s16 q9, q2, q10 \n" // R
    "vmls.s16 q9, q1, q14 \n" // G
    "vmls.s16 q9, q0, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
    MEMACCESS(2)
    "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_u),     // %1
      "+r"(dst_v),     // %2
      "+r"(width)      // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
    "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
    "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
    "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
    "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
    "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
    "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
    "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
    "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
    "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
    "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
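// Scalar model of RGBTOUV (hypothetical sketch, not used by the build).
// Inputs are the 2x-averaged channel values (0..510); with the halved
// constants UB=56, UG=37, UR=19, VB=9, VG=47 this mirrors the q10..q15
// setup used by the callers below.
static inline uint8 RGBToUSketch(uint16 b, uint16 g, uint16 r) {
  int u = (56 * b - 37 * g - 19 * r + 0x8080) >> 8;
  return (uint8)(u < 0 ? 0 : (u > 255 ? 255 : u));  // vqshrn.u16 saturates.
}
static inline uint8 RGBToVSketch(uint16 b, uint16 g, uint16 r) {
  int v = (56 * r - 47 * g - 9 * b + 0x8080) >> 8;
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}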
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_argb
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
    "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q0, q1, q2)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(src_stride_argb), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// TODO(fbarchard): Subsample match C code.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_argb
    "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
    "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
    "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
    "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
    "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
    "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q0, q1, q2)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(src_stride_argb), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_bgra
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
    "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
    "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q1, q1, #1 \n" // 2x average
    "vrshr.u16 q2, q2, #1 \n"
    "vrshr.u16 q3, q3, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q3, q2, q1)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_bgra), // %0
    "+r"(src_stride_bgra), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_abgr
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
    "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
    "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q2, q1, q0)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_abgr), // %0
    "+r"(src_stride_abgr), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_rgba
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
    "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
    MEMACCESS(1)
    "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
    "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q0, q1, q2)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_rgba), // %0
    "+r"(src_stride_rgba), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_rgb24
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
    MEMACCESS(0)
    "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
    MEMACCESS(1)
    "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
    "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q0, q1, q2)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_rgb24), // %0
    "+r"(src_stride_rgb24), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
                     uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_raw
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
    MEMACCESS(0)
    "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
    "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
    MEMACCESS(1)
    "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
    "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
    "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
    "vrshr.u16 q0, q0, #1 \n" // 2x average
    "vrshr.u16 q1, q1, #1 \n"
    "vrshr.u16 q2, q2, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    RGBTOUV(q2, q1, q0)
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_raw), // %0
    "+r"(src_stride_raw), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_rgb565
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
    RGB565TOARGB
    "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
    RGB565TOARGB
    "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
    RGB565TOARGB
    "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
    RGB565TOARGB
    "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    "vrshr.u16 q4, q4, #1 \n" // 2x average
    "vrshr.u16 q5, q5, #1 \n"
    "vrshr.u16 q6, q6, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    "vmul.s16 q8, q4, q10 \n" // B
    "vmls.s16 q8, q5, q11 \n" // G
    "vmls.s16 q8, q6, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
    "vmul.s16 q9, q6, q10 \n" // R
    "vmls.s16 q9, q5, q14 \n" // G
    "vmls.s16 q9, q4, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_rgb565), // %0
    "+r"(src_stride_rgb565), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_argb1555
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    "vrshr.u16 q4, q4, #1 \n" // 2x average
    "vrshr.u16 q5, q5, #1 \n"
    "vrshr.u16 q6, q6, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    "vmul.s16 q8, q4, q10 \n" // B
    "vmls.s16 q8, q5, q11 \n" // G
    "vmls.s16 q8, q6, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
    "vmul.s16 q9, q6, q10 \n" // R
    "vmls.s16 q9, q5, q14 \n" // G
    "vmls.s16 q9, q4, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_argb1555), // %0
    "+r"(src_stride_argb1555), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "add %1, %0, %1 \n" // src_stride + src_argb4444
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
    "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
    "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
    "vrshr.u16 q4, q4, #1 \n" // 2x average
    "vrshr.u16 q5, q5, #1 \n"
    "vrshr.u16 q6, q6, #1 \n"
    "subs %4, %4, #16 \n" // 16 processed per loop.
    "vmul.s16 q8, q4, q10 \n" // B
    "vmls.s16 q8, q5, q11 \n" // G
    "vmls.s16 q8, q6, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
    "vmul.s16 q9, q6, q10 \n" // R
    "vmls.s16 q9, q5, q14 \n" // G
    "vmls.s16 q9, q4, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
    MEMACCESS(3)
    "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
    "bgt 1b \n"
  : "+r"(src_argb4444), // %0
    "+r"(src_stride_argb4444), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    RGB565TOARGB
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
  : "+r"(src_rgb565), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
  );
}
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB1555TOARGB
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
  : "+r"(src_argb1555), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
  );
}
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB4444TOARGB
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
  : "+r"(src_argb4444), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
  );
}
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d1, d4 \n" // R
    "vmlal.u8 q8, d2, d5 \n" // G
    "vmlal.u8 q8, d3, d6 \n" // B
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
  : "+r"(src_bgra), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d0, d4 \n" // R
    "vmlal.u8 q8, d1, d5 \n" // G
    "vmlal.u8 q8, d2, d6 \n" // B
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
  : "+r"(src_abgr), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d1, d4 \n" // B
    "vmlal.u8 q8, d2, d5 \n" // G
    "vmlal.u8 q8, d3, d6 \n" // R
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
  : "+r"(src_rgba), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d0, d4 \n" // B
    "vmlal.u8 q8, d1, d5 \n" // G
    "vmlal.u8 q8, d2, d6 \n" // R
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
  : "+r"(src_rgb24), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
  asm volatile (
    "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d7, #16 \n" // Add 16 constant
    "1: \n"
    MEMACCESS(0)
    "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q8, d0, d4 \n" // R
    "vmlal.u8 q8, d1, d5 \n" // G
    "vmlal.u8 q8, d2, d6 \n" // B
    "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
  : "+r"(src_raw), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
  );
}
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8* dst_ptr,
                         const uint8* src_ptr, ptrdiff_t src_stride,
                         int dst_width, int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"
    // General purpose row blend.
    "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"
    // Blend 50 / 50.
    "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"
    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"
    "99: \n"
  : "+r"(dst_ptr), // %0
    "+r"(src_ptr), // %1
    "+r"(src_stride), // %2
    "+r"(dst_width), // %3
    "+r"(y1_fraction) // %4
  :
  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
  );
}
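// Scalar model of the general-purpose blend above (hypothetical sketch, not
// used by the build). For a fraction f in 1..255 (f != 128), each output
// byte is a rounded weighted average of the two source rows; f == 0 and
// f == 128 take the copy and vrhadd fast paths instead.
static inline uint8 InterpolateSketch(uint8 a, uint8 b, int f) {
  return (uint8)((a * (256 - f) + b * f + 128) >> 8);  // vmull/vmlal/vrshrn #8
}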
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "subs %3, #8 \n"
    "blt 89f \n"
    // Blend 8 pixels.
    "8: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vmull.u8 q10, d4, d3 \n" // db * a
    "vmull.u8 q11, d5, d3 \n" // dg * a
    "vmull.u8 q12, d6, d3 \n" // dr * a
    "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
    "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
    "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
    "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
    "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
    "vqadd.u8 q0, q0, q2 \n" // + sbg
    "vqadd.u8 d2, d2, d6 \n" // + sr
    "vmov.u8 d3, #255 \n" // a = 255
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
    "bge 8b \n"
    "89: \n"
    "adds %3, #8-1 \n"
    "blt 99f \n"
    // Blend 1 pixel.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
    MEMACCESS(1)
    "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
    "subs %3, %3, #1 \n" // 1 processed per loop.
    "vmull.u8 q10, d4, d3 \n" // db * a
    "vmull.u8 q11, d5, d3 \n" // dg * a
    "vmull.u8 q12, d6, d3 \n" // dr * a
    "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
    "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
    "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
    "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
    "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
    "vqadd.u8 q0, q0, q2 \n" // + sbg
    "vqadd.u8 d2, d2, d6 \n" // + sr
    "vmov.u8 d3, #255 \n" // a = 255
    MEMACCESS(2)
    "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
    "bge 1b \n"
    "99: \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
  );
}
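// Scalar model of one blended channel (hypothetical sketch, not used by the
// build): the identity in the comment above lets the blend use only the
// destination term's multiply. s/d are the source/destination channel and sa
// is the source alpha.
static inline uint8 BlendChannelSketch(uint8 s, uint8 d, uint8 sa) {
  int v = s + (d - ((d * sa + 128) >> 8));  // dr - dr * sa / 256 + sr, rounded
  return (uint8)(v > 255 ? 255 : v);        // vqadd.u8 saturates.
}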
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q10, d0, d3 \n" // b * a
    "vmull.u8 q11, d1, d3 \n" // g * a
    "vmull.u8 q12, d2, d3 \n" // r * a
    "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
    "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
    "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
  );
}
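// Scalar model (hypothetical sketch, not used by the build): each color
// channel is multiplied by its own alpha and narrowed vqrshrn-style.
static inline uint8 AttenuateSketch(uint8 c, uint8 a) {
  int v = (c * a + 128) >> 8;           // rounded c * a / 256
  return (uint8)(v > 255 ? 255 : v);    // saturation is a no-op here.
}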
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "vdup.u16 q8, %2 \n"
    "vshr.u16 q8, q8, #1 \n" // scale >>= 1
    "vdup.u16 q9, %3 \n" // interval multiply.
    "vdup.u16 q10, %4 \n" // interval add
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
    "subs %1, %1, #8 \n" // 8 processed per loop.
    "vmovl.u8 q0, d0 \n" // b (0 .. 255)
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q2, d4 \n"
    "vqdmulh.s16 q0, q0, q8 \n" // b * scale
    "vqdmulh.s16 q1, q1, q8 \n" // g
    "vqdmulh.s16 q2, q2, q8 \n" // r
    "vmul.u16 q0, q0, q9 \n" // b * interval_size
    "vmul.u16 q1, q1, q9 \n" // g
    "vmul.u16 q2, q2, q9 \n" // r
    "vadd.u16 q0, q0, q10 \n" // b + interval_offset
    "vadd.u16 q1, q1, q10 \n" // g
    "vadd.u16 q2, q2, q10 \n" // r
    "vqmovn.u16 d0, q0 \n"
    "vqmovn.u16 d2, q1 \n"
    "vqmovn.u16 d4, q2 \n"
    MEMACCESS(0)
    "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(width) // %1
  : "r"(scale), // %2
    "r"(interval_size), // %3
    "r"(interval_offset) // %4
  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
  );
}
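// Scalar model of the quantize step (hypothetical sketch, not used by the
// build). vqdmulh.s16 computes (x * y * 2) >> 16, so with the scale
// pre-halved above the net effect is x * scale >> 16, matching the formula
// in the comment; assumes the parameters keep the result in 0..255 (the
// NEON path saturates via vqmovn).
static inline uint8 QuantizeSketch(uint8 c, int scale, int interval_size,
                                   int interval_offset) {
  return (uint8)(((c * scale) >> 16) * interval_size + interval_offset);
}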
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register (d0 to d7).
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "vdup.u32 q0, %3 \n" // duplicate scale value.
    "vzip.u8 d0, d1 \n" // d0 aarrggbb.
    "vshr.u16 q0, q0, #1 \n" // scale / 2.
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmovl.u8 q10, d20 \n" // b (0 .. 255)
    "vmovl.u8 q11, d22 \n"
    "vmovl.u8 q12, d24 \n"
    "vmovl.u8 q13, d26 \n"
    "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
    "vqrdmulh.s16 q11, q11, d0[1] \n" // g
    "vqrdmulh.s16 q12, q12, d0[2] \n" // r
    "vqrdmulh.s16 q13, q13, d0[3] \n" // a
    "vqmovn.u16 d20, q10 \n"
    "vqmovn.u16 d22, q11 \n"
    "vqmovn.u16 d24, q12 \n"
    "vqmovn.u16 d26, q13 \n"
    MEMACCESS(1)
    "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(value) // %3
  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
  );
}
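// Scalar model (hypothetical sketch, not used by the build): after vzip each
// byte v of 'value' fills both halves of a u16 lane (v -> v * 257), which is
// then halved; vqrdmulh doubles, multiplies, rounds and takes the high half,
// so each channel is shaded by roughly c * v / 255.
static inline uint8 ShadeSketch(uint8 c, uint8 v) {
  int lane = (v * 257) >> 1;               // vdup/vzip/vshr setup above.
  int r = (2 * c * lane + 0x8000) >> 16;   // vqrdmulh.s16 rounding.
  return (uint8)(r > 255 ? 255 : r);       // vqmovn.u16 saturates.
}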
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
    "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
    "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
    "vmov d1, d0 \n" // G
    "vmov d2, d0 \n" // R
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d20, #17 \n" // BB coefficient
    "vmov.u8 d21, #68 \n" // BG coefficient
    "vmov.u8 d22, #35 \n" // BR coefficient
    "vmov.u8 d24, #22 \n" // GB coefficient
    "vmov.u8 d25, #88 \n" // GG coefficient
    "vmov.u8 d26, #45 \n" // GR coefficient
    "vmov.u8 d28, #24 \n" // RB coefficient
    "vmov.u8 d29, #98 \n" // RG coefficient
    "vmov.u8 d30, #50 \n" // RR coefficient
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
    "subs %1, %1, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d20 \n" // B to Sepia B
    "vmlal.u8 q2, d1, d21 \n" // G
    "vmlal.u8 q2, d2, d22 \n" // R
    "vmull.u8 q3, d0, d24 \n" // B to Sepia G
    "vmlal.u8 q3, d1, d25 \n" // G
    "vmlal.u8 q3, d2, d26 \n" // R
    "vmull.u8 q8, d0, d28 \n" // B to Sepia R
    "vmlal.u8 q8, d1, d29 \n" // G
    "vmlal.u8 q8, d2, d30 \n" // R
    "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
    "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
    "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
    MEMACCESS(0)
    "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(width) // %1
  :
  : "cc", "memory", "q0", "q1", "q2", "q3",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
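// Scalar model of one sepia channel (hypothetical sketch, not used by the
// build), matching the B output formula above:
static inline uint8 SepiaBSketch(uint8 b, uint8 g, uint8 r) {
  int v = (17 * b + 68 * g + 35 * r) >> 7;  // vmull/vmlal then vqshrn #7.
  return (uint8)(v > 255 ? 255 : v);
}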
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
                             const int8* matrix_argb, int width) {
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q2}, [%3] \n" // load color matrix (16 bytes).
    "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
    "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
    "vmovl.u8 q9, d18 \n" // g
    "vmovl.u8 q10, d20 \n" // r
    "vmovl.u8 q11, d22 \n" // a
    "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
    "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
    "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
    "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
    "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
    "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
    "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
    "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
    "vqadd.s16 q12, q12, q4 \n" // Accumulate B
    "vqadd.s16 q13, q13, q5 \n" // Accumulate G
    "vqadd.s16 q14, q14, q6 \n" // Accumulate R
    "vqadd.s16 q15, q15, q7 \n" // Accumulate A
    "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
    "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
    "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
    "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
    "vqadd.s16 q12, q12, q4 \n" // Accumulate B
    "vqadd.s16 q13, q13, q5 \n" // Accumulate G
    "vqadd.s16 q14, q14, q6 \n" // Accumulate R
    "vqadd.s16 q15, q15, q7 \n" // Accumulate A
    "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
    "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
    "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
    "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
    "vqadd.s16 q12, q12, q4 \n" // Accumulate B
    "vqadd.s16 q13, q13, q5 \n" // Accumulate G
    "vqadd.s16 q14, q14, q6 \n" // Accumulate R
    "vqadd.s16 q15, q15, q7 \n" // Accumulate A
    "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
    "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
    "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
    "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
    MEMACCESS(1)
    "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(matrix_argb) // %3
  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
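// Scalar model of the matrix transform (hypothetical sketch, not used by the
// build): each output channel is a dot product of (b, g, r, a) with one row
// of 4 s8 coefficients (m = matrix_argb + 4 * channel), shifted right 6.
// This simplification ignores the intermediate vqadd.s16 saturation.
static inline uint8 ColorMatrixSketch(const int8* m, uint8 b, uint8 g,
                                      uint8 r, uint8 a) {
  int v = (b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6;
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));  // vqshrun.s16 saturates.
}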
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vmull.u8 q0, d0, d1 \n" // multiply B
    "vmull.u8 q1, d2, d3 \n" // multiply G
    "vmull.u8 q2, d4, d5 \n" // multiply R
    "vmull.u8 q3, d6, d7 \n" // multiply A
    "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
    "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
    "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
    "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
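// Scalar model (hypothetical sketch, not used by the build): channels
// multiply as c0 * c1 / 256 with rounding (vrshrn.u16 #8), so 255 * 255
// narrows to 254 rather than 255.
static inline uint8 MultiplyChannelSketch(uint8 c0, uint8 c1) {
  return (uint8)((c0 * c1 + 128) >> 8);
}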
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 q0, q0, q2 \n" // add B, G
    "vqadd.u8 q1, q1, q3 \n" // add R, A
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqsub.u8 q0, q0, q2 \n" // subtract B, G
    "vqsub.u8 q1, q1, q3 \n" // subtract R, A
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // alpha
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
    MEMACCESS(1)
    "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 d0, d0, d1 \n" // add
    "vmov.u8 d1, d0 \n"
    "vmov.u8 d2, d0 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "cc", "memory", "q0", "q1"
  );
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    // 16 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
    "subs %3, %3, #16 \n" // 16 processed per loop.
    "vqadd.u8 q0, q0, q1 \n" // add
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
    "bgt 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_y), // %2
    "+r"(width) // %3
  :
  : "cc", "memory", "q0", "q1"
  );
}
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // alpha
    // 8 pixel loop.
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
    MEMACCESS(1)
    "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 d1, d0, d2 \n" // add
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "cc", "memory", "q0", "q1"
  );
}
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0],%5 \n" // top
    MEMACCESS(0)
    "vld1.8 {d1}, [%0],%6 \n"
    "vsubl.u8 q0, d0, d1 \n"
    MEMACCESS(1)
    "vld1.8 {d2}, [%1],%5 \n" // center * 2
    MEMACCESS(1)
    "vld1.8 {d3}, [%1],%6 \n"
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vadd.s16 q0, q0, q1 \n"
    MEMACCESS(2)
    "vld1.8 {d2}, [%2],%5 \n" // bottom
    MEMACCESS(2)
    "vld1.8 {d3}, [%2],%6 \n"
    "subs %4, %4, #8 \n" // 8 pixels
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vabs.s16 q0, q0 \n"
    "vqmovn.u16 d0, q0 \n"
    MEMACCESS(3)
    "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
    "bgt 1b \n"
  : "+r"(src_y0), // %0
    "+r"(src_y1), // %1
    "+r"(src_y2), // %2
    "+r"(dst_sobelx), // %3
    "+r"(width) // %4
  : "r"(2), // %5
    "r"(6) // %6
  : "cc", "memory", "q0", "q1" // Clobber List
  );
}
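// Scalar model of one SobelX output (hypothetical sketch, not used by the
// build): the two post-increments (%5 = 2, %6 = 6) make each d0/d1 pair read
// columns x and x+2 of a row, so the kernel reduces to column differences.
// SobelYRow_NEON below is the analogous computation with the kernel
// transposed, differencing the two input rows instead.
static inline uint8 SobelXSketch(const uint8* y0, const uint8* y1,
                                 const uint8* y2, int x) {
  int v = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) + (y2[x] - y2[x + 2]);
  v = v < 0 ? -v : v;                 // vabs.s16
  return (uint8)(v > 255 ? 255 : v);  // vqmovn.u16 saturates.
}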
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0],%4 \n" // left
    MEMACCESS(1)
    "vld1.8 {d1}, [%1],%4 \n"
    "vsubl.u8 q0, d0, d1 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0],%4 \n" // center * 2
    MEMACCESS(1)
    "vld1.8 {d3}, [%1],%4 \n"
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vadd.s16 q0, q0, q1 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0],%5 \n" // right
    MEMACCESS(1)
    "vld1.8 {d3}, [%1],%5 \n"
    "subs %3, %3, #8 \n" // 8 pixels
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vabs.s16 q0, q0 \n"
    "vqmovn.u16 d0, q0 \n"
    MEMACCESS(2)
    "vst1.8 {d0}, [%2]! \n" // store 8 sobely
    "bgt 1b \n"
  : "+r"(src_y0), // %0
    "+r"(src_y1), // %1
    "+r"(dst_sobely), // %2
    "+r"(width) // %3
  : "r"(1), // %4
    "r"(6) // %5
  : "cc", "memory", "q0", "q1" // Clobber List
  );
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif