// scale_gcc.cc

/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
// Rounding constant (+2 before the >>2) used by the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
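
// For readability: the loop above is a point-sampling 1/2 downscale. A scalar
// sketch of the same per-row result (illustrative only, not the dispatch path):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 2 + 1];  // psrlw $8 + packuswb keep the odd bytes
//   }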

void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pavgw %%xmm5,%%xmm0 \n"
    "pavgw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
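
// The kernel above averages horizontal pairs: pmaddubsw against the 0x01 byte
// constant in xmm4 sums each pair into a word, and pavgw with zero rounds and
// halves it. Scalar sketch (illustrative only):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = (src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1;
//   }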

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "psrlw $0x1,%%xmm0 \n"
    "psrlw $0x1,%%xmm1 \n"
    "pavgw %%xmm5,%%xmm0 \n"
    "pavgw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
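
// Here two rows are combined: each output is the rounded average of a 2x2 block,
// i.e. (s0 + s1 + t0 + t1 + 2) >> 2 with t the row at src_stride. The psrlw $1 /
// pavgw pair applies the /4 with rounding without overflowing 16 bits. Scalar
// sketch (illustrative only):
//   const uint8* t = src_ptr + src_stride;
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = (src_ptr[x * 2] + src_ptr[x * 2 + 1] +
//                   t[x * 2] + t[x * 2 + 1] + 2) >> 2;
//   }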

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)  // vmovdqu (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)  // vmovdqu 0x20(%0,%3,1),%%ymm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
    "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x18,%%xmm5 \n"
    "pslld $0x10,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
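
// Point-sampling 1/4 downscale: the pand mask (0x00FF0000 per dword) keeps byte 2
// of every 4-byte group and the two packs narrow 32 source bytes to 8 outputs.
// Scalar sketch (illustrative only):
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 4 + 2];
//   }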

void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrlw $0xf,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "packuswb %%xmm4,%%xmm4 \n"
    "psllw $0x3,%%xmm5 \n"
    "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)  // movdqu (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)  // movdqu 0x10(%0,%4,1),%%xmm3
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)  // movdqu (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)  // movdqu 0x10(%0,%4,2),%%xmm3
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "psrlw $0x4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "=&r"(stridex3)   // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
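
// 4x4 box filter: pmaddubsw folds each row into horizontal pair sums, four rows
// are accumulated, phaddw folds the pairs of pairs, then +8 and >>4 produce the
// rounded 16-pixel average. Scalar sketch (illustrative only):
//   for (int x = 0; x < dst_width; ++x) {
//     int sum = 0;
//     for (int r = 0; r < 4; ++r)
//       for (int c = 0; c < 4; ++c)
//         sum += src_ptr[r * src_stride + x * 4 + c];
//     dst_ptr[x] = (sum + 8) >> 4;
//   }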

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrld $0x18,%%ymm5,%%ymm5 \n"
    "vpslld $0x10,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpand %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
    "vpsllw $0x3,%%ymm4,%%ymm5 \n"
    "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)  // vmovdqu (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)  // vmovdqu 0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)  // vmovdqu (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)  // vmovdqu 0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)  // vmovdqu (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)  // vmovdqu 0x20(%0,%4,1),%%ymm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),     // %3
    "r"((intptr_t)(src_stride * 3))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm3 \n"
    "movdqa %1,%%xmm4 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "palignr $0x8,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
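
// 3/4 point sample: of every 4 source bytes, bytes 0, 1 and 3 are kept (byte 2 is
// dropped), so 32 input pixels become 24 outputs per iteration. Scalar sketch
// (illustrative only):
//   for (int x = 0; x < dst_width; x += 3) {
//     dst_ptr[x + 0] = src_ptr[(x / 3) * 4 + 0];
//     dst_ptr[x + 1] = src_ptr[(x / 3) * 4 + 1];
//     dst_ptr[x + 2] = src_ptr[(x / 3) * 4 + 3];
//   }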

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)  // movdqu (%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)  // movdqu 0x10(%0,%3),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)  // movdqu (%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)  // movdqu 0x10(%0,%3,1),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
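
// Both 3/4 box variants above share the same horizontal filter: within each group
// of 4 vertically blended pixels s0..s3, the three outputs are
//   d0 = (3 * s0 + 1 * s1 + 2) >> 2
//   d1 = (2 * s1 + 2 * s2 + 2) >> 2
//   d2 = (1 * s2 + 3 * s3 + 2) >> 2
// (kShuf* pairs the pixels, kMadd* holds the weights, kRound34 adds the +2).
// Vertically, the _1_ variant blends the two rows 1:1 with one pavgb, while the
// _0_ variant's double pavgb weights them roughly 3:1 toward the nearer row.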

void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movhlps %%xmm0,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
    "lea " MEMLEA(0xc,1) ",%1 \n"
    "sub $0xc,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "movdqa %3,%%xmm5 \n"
  :
  : "m"(kShufAb0),  // %0
    "m"(kShufAb1),  // %1
    "m"(kShufAb2),  // %2
    "m"(kScaleAb2)  // %3
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1)  // movdqu (%0,%3,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "pavgb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pshufb %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "paddusw %%xmm6,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "sub $0x6,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6)  // movdqu (%0,%3,1),%%xmm6
    "movhlps %%xmm0,%%xmm1 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6)  // movdqu (%0,%3,2),%%xmm6
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "pshufb %%xmm3,%%xmm7 \n"
    "paddusw %%xmm7,%%xmm6 \n"
    "pmulhuw %%xmm4,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "sub $0x6,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"  // src_ptr += 16
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpckhbw %%xmm5,%%xmm3 \n"
    "paddusw %%xmm2,%%xmm0 \n"
    "paddusw %%xmm3,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
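
// ScaleAddRow_* accumulate a row of bytes into a row of 16-bit sums with unsigned
// saturation, roughly:
//   for (int x = 0; x < src_width; ++x) dst_ptr[x] += src_ptr[x];
// so a caller can sum several source rows before dividing for a box average.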

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"  // src_ptr += 32
    "vpermq $0xd8,%%ymm3,%%ymm3 \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
    "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
    "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile (
    "movd %6,%%xmm2 \n"
    "movd %7,%%xmm3 \n"
    "movl $0x04040000,%k2 \n"
    "movd %k2,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"  // 0x007f007f
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $15,%%xmm7 \n"  // 0x00010001
    "pextrw $0x1,%%xmm2,%k3 \n"
    "subl $0x2,%5 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm1 \n"
    MEMOPARG(movzwl,0x00,1,4,1,k2)  // movzwl (%1,%4,1),%k2
    "movd %k2,%%xmm4 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "punpcklwd %%xmm4,%%xmm0 \n"
    "psubb %8,%%xmm0 \n"  // make pixels signed.
    "pxor %%xmm6,%%xmm1 \n"  // 128 - f = (f ^ 127) + 1
    "paddusb %%xmm7,%%xmm1 \n"
    "pmaddubsw %%xmm0,%%xmm1 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "paddw %9,%%xmm1 \n"  // make pixels unsigned.
    "psrlw $0x7,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movd %%xmm1,%k2 \n"
    "mov %w2," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x2,0) ",%0 \n"
    "subl $0x2,%5 \n"
    "jge 2b \n"
    LABELALIGN
    "29: \n"
    "addl $0x1,%5 \n"
    "jl 99f \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm2 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "psubb %8,%%xmm0 \n"  // make pixels signed.
    "pxor %%xmm6,%%xmm2 \n"
    "paddusb %%xmm7,%%xmm2 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "paddw %9,%%xmm2 \n"  // make pixels unsigned.
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "movd %%xmm2,%k2 \n"
    "mov %b2," MEMACCESS(0) " \n"
    "99: \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "=&a"(temp_pixel),  // %2
    "=&r"(x0),          // %3
    "=&r"(x1),          // %4
#if defined(__x86_64__)
    "+rm"(dst_width)    // %5
#else
    "+m"(dst_width)     // %5
#endif
  : "rm"(x),   // %6
    "rm"(dx),  // %7
#if defined(__x86_64__)
    "x"(kFsub80),  // %8
    "x"(kFadd40)   // %9
#else
    "m"(kFsub80),  // %8
    "m"(kFadd40)   // %9
#endif
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
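
// Per output pixel the loop above computes, with xi = x >> 16 and the top 7
// fraction bits f = (x >> 9) & 0x7f:
//   dst_ptr[i] = (src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7
// The psubb/paddw pair with kFsub80/kFadd40 biases the pixels into signed range so
// pmaddubsw cannot saturate, then removes the bias (128 * 128 = 0x4000) and adds
// the 0x40 rounding term in a single paddw.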

// Reads 16 pixels, duplicates them and writes 32 pixels.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
  : "+r"(dst_ptr),   // %0
    "+r"(src_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
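
// In other words, a 2x horizontal upsample by duplication: the unpack-with-self
// pair makes dst_ptr[2 * x] = dst_ptr[2 * x + 1] = src_ptr[x].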

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "shufps $0xdd,%%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2"
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
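
// The three ARGB /2 variants mirror the planar ones but move whole 4-byte pixels:
// plain _SSE2 keeps the odd pixel of each pair (shufps $0xdd), Linear averages
// horizontal pairs, and Box averages 2x2 blocks with three pavgb passes, which is
// the rounded per-channel average of the four pixels up to pavgb's nested rounding.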

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    LABELALIGN
    "1: \n"
    "movd " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)  // movd (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2)  // movd (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)  // movd (%0,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "punpckldq %%xmm3,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "=&r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)  // movhps (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)    // movq (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)  // movhps (%0,%4,1),%%xmm1
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "movq " MEMACCESS(5) ",%%xmm2 \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2)  // movhps (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)    // movq (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)  // movhps (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x11,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x5,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "cmp $0x0,%4 \n"
    "jl 99f \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"
    LABELALIGN
    "40: \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)  // movd (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)  // movd (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"
    "49: \n"
    "test $0x2,%4 \n"
    "je 29f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
    "29: \n"
    "test $0x1,%4 \n"
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "99: \n"
  : "=&a"(x0),       // %0
    "=&d"(x1),       // %1
    "+r"(dst_argb),  // %2
    "+r"(src_argb),  // %3
    "+r"(dst_width)  // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
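
// Nearest-neighbour ARGB column sampling in 16.16 fixed point; conceptually
// (illustrative only):
//   for (int i = 0; i < dst_width; ++i) {
//     memcpy(dst_argb + i * 4, src_argb + (x >> 16) * 4, 4);
//     x += dx;
//   }
// The asm keeps two x values in flight (pextrw of their integer parts) and
// writes four pixels per loop iteration.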

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"
    "punpckhdq %%xmm1,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(src_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear ARGB column filtering. SSSE3 version.
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm5 \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );
  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "sub $0x2,%2 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "psrlw $0x9,%%xmm1 \n"
    MEMOPREG(movhps,0x00,1,4,4,xmm0)  // movhps (%1,%4,4),%%xmm0
    "pshufb %%xmm5,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x2,%2 \n"
    "jge 2b \n"
    LABELALIGN
    "29: \n"
    "add $0x1,%2 \n"
    "jl 99f \n"
    "psrlw $0x9,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "pshufb %%xmm5,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(0) " \n"
    LABELALIGN
    "99: \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "=&r"(x0),         // %3
    "=&r"(x1)          // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}
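
// Equivalent C (ignoring overflow): return (int)(((int64)(num) << 16) / div);
// the shld/shl pair builds the 64-bit (num << 16) dividend in edx:eax for the
// 32-bit idiv.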

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "sub $0x10001,%%eax \n"
    "sbb $0x0,%%edx \n"
    "sub $0x1,%1 \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}
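
// Equivalent C (ignoring overflow):
//   return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
// i.e. the sub/sbb pair subtracts 0x10001 from the 64-bit dividend before the
// divide, matching the "num - 1 over div - 1" description above.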

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif