; chacha-x86-win32n.asm
  ; This file is generated from a similarly-named Perl script in the BoringSSL
  ; source tree. Do not edit by hand.
  ;
  ; Syntax: NASM (Intel operand order), 32-bit x86 code.

; Optional symbol-prefixing header, pulled in only when the build defines
; BORINGSSL_PREFIX.
%ifdef BORINGSSL_PREFIX
%include "boringssl_prefix_symbols_nasm.inc"
%endif

; Select the code section according to the NASM output format.
%ifidn __OUTPUT_FORMAT__,obj
section	code	use32 class=code align=64
%elifidn __OUTPUT_FORMAT__,win32
; Absolute symbol @feat.00 (the leading '$' makes NASM treat the token as an
; identifier). The NASM manual describes "$@feat.00 equ 1" as the way to mark
; a win32 object that has no exception handlers of its own as compatible with
; /safeseh linking.
$@feat.00 equ 1
section	.text	code align=64
%else
section	.text	code
%endif
  ;-----------------------------------------------------------------------
  ; _GFp_ChaCha20_ctr32
  ;
  ; 32-bit x86, all arguments on the stack. After the four register pushes
  ; in the prologue the arguments are found at:
  ;   [20+esp]  arg1: destination pointer (written)
  ;   [24+esp]  arg2: source pointer (read and XORed with the key stream)
  ;   [28+esp]  arg3: byte count; zero returns immediately
  ;   [32+esp]  arg4: pointer to 8 dwords, used as state words 4..11
  ;   [36+esp]  arg5: pointer to 4 dwords, used as state words 12..15
  ; NOTE(review): the names out/in/len/key/counter are the conventional
  ; ones for this routine; confirm against the C prototype.
  ;
  ; ebp, ebx, esi and edi are saved and restored; eax, ecx, edx and the
  ; flags are clobbered.
  ;
  ; Stack frame of the scalar path (after "sub esp,132"):
  ;   [0..60+esp]     16-dword working state / key-stream block
  ;   [80..108+esp]   copy of the 8 dwords from arg4
  ;   [112..124+esp]  copy of the 4 dwords from arg5; dword 0 is stored
  ;                   minus one and incremented once per 64-byte block
  ;   [128+esp]       remaining iterations of the round loop
  ;   [152+esp]       arg1 slot, reused as the running destination pointer
  ;   [156+esp]       arg2 slot, reused as the running source pointer
  ;   [160+esp]       arg3 slot, reused as the remaining byte count
  ;-----------------------------------------------------------------------
global	_GFp_ChaCha20_ctr32
align	16
_GFp_ChaCha20_ctr32:
L$_GFp_ChaCha20_ctr32_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	; Nothing to do when the byte count (arg3) is zero.
	xor	eax,eax
	cmp	eax,DWORD [28+esp]
	je	NEAR L$000no_data
	; call/pop leaves the run-time address of L$pic_point in eax; the
	; SSSE3 path uses it to locate L$ssse3_data.
	call	L$pic_point
L$pic_point:
	pop	eax
	; Take the SSSE3 path only when bit 24 of capability dword 0 and
	; bit 9 of capability dword 1 are both set.
	; NOTE(review): presumably FXSR and SSSE3 in the ia32cap layout - confirm.
	lea	ebp,[_GFp_ia32cap_P]
	test	DWORD [ebp],16777216
	jz	NEAR L$001x86
	test	DWORD [4+ebp],512
	jz	NEAR L$001x86
	jmp	NEAR L$ssse3_shortcut
L$001x86:
	; ---- scalar path: build the frame and copy arg4/arg5 into it ----
	mov	esi,DWORD [32+esp]
	mov	edi,DWORD [36+esp]
	sub	esp,132
	mov	eax,DWORD [esi]
	mov	ebx,DWORD [4+esi]
	mov	ecx,DWORD [8+esi]
	mov	edx,DWORD [12+esi]
	mov	DWORD [80+esp],eax
	mov	DWORD [84+esp],ebx
	mov	DWORD [88+esp],ecx
	mov	DWORD [92+esp],edx
	mov	eax,DWORD [16+esi]
	mov	ebx,DWORD [20+esi]
	mov	ecx,DWORD [24+esi]
	mov	edx,DWORD [28+esi]
	mov	DWORD [96+esp],eax
	mov	DWORD [100+esp],ebx
	mov	DWORD [104+esp],ecx
	mov	DWORD [108+esp],edx
	mov	eax,DWORD [edi]
	mov	ebx,DWORD [4+edi]
	mov	ecx,DWORD [8+edi]
	mov	edx,DWORD [12+edi]
	; Pre-decrement dword 0 of arg5: L$002entry adds one before each block.
	sub	eax,1
	mov	DWORD [112+esp],eax
	mov	DWORD [116+esp],ebx
	mov	DWORD [120+esp],ecx
	mov	DWORD [124+esp],edx
	jmp	NEAR L$002entry
align	16
L$003outer_loop:
	; Reached after a full 64-byte block: ebx = next source address,
	; eax = next destination address, ecx = bytes still to process.
	mov	DWORD [156+esp],ebx
	mov	DWORD [152+esp],eax
	mov	DWORD [160+esp],ecx
L$002entry:
	; Initialise the working state for one block. Words 0..3 are the
	; constants 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574; word 0
	; stays in eax, the others go to [4..12+esp].
	mov	eax,1634760805
	mov	DWORD [4+esp],857760878
	mov	DWORD [8+esp],2036477234
	mov	DWORD [12+esp],1797285236
	mov	ebx,DWORD [84+esp]
	mov	ebp,DWORD [88+esp]
	mov	ecx,DWORD [104+esp]
	mov	esi,DWORD [108+esp]
	mov	edx,DWORD [116+esp]
	mov	edi,DWORD [120+esp]
	mov	DWORD [20+esp],ebx
	mov	DWORD [24+esp],ebp
	mov	DWORD [40+esp],ecx
	mov	DWORD [44+esp],esi
	mov	DWORD [52+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebx,DWORD [92+esp]
	mov	edi,DWORD [124+esp]
	mov	edx,DWORD [112+esp]
	mov	ebp,DWORD [80+esp]
	mov	ecx,DWORD [96+esp]
	mov	esi,DWORD [100+esp]
	; Advance the block counter (state word 12) and keep the new value.
	add	edx,1
	mov	DWORD [28+esp],ebx
	mov	DWORD [60+esp],edi
	mov	DWORD [112+esp],edx
	; ebx = number of passes through L$004loop.
	mov	ebx,10
	jmp	NEAR L$004loop
align	16
L$004loop:
	; One pass performs eight add/xor/rotate sequences with rotate
	; counts 16, 12, 8 and 7. State words that are not in registers
	; are spilled to [0..60+esp]; the pass counter lives in [128+esp]
	; because ebx is needed as a working register.
	add	eax,ebp
	mov	DWORD [128+esp],ebx
	mov	ebx,ebp
	xor	edx,eax
	rol	edx,16
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [52+esp]
	rol	ebx,12
	mov	ebp,DWORD [20+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [48+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [32+esp],ecx
	rol	edi,16
	mov	DWORD [16+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [40+esp]
	xor	ebp,esi
	mov	edx,DWORD [56+esp]
	rol	ebp,12
	mov	ebx,DWORD [24+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [52+esp],edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [36+esp],esi
	rol	edx,16
	mov	DWORD [20+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [44+esp]
	xor	ebx,ecx
	mov	edi,DWORD [60+esp]
	rol	ebx,12
	mov	ebp,DWORD [28+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [56+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [24+esp],ebx
	add	esi,edi
	xor	ebp,esi
	rol	ebp,12
	mov	ebx,DWORD [20+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	edx,edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	rol	edx,16
	mov	DWORD [28+esp],ebp
	add	ecx,edx
	xor	ebx,ecx
	mov	edi,DWORD [48+esp]
	rol	ebx,12
	mov	ebp,DWORD [24+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [esp],eax
	rol	edx,8
	mov	eax,DWORD [4+esp]
	add	ecx,edx
	mov	DWORD [60+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	mov	DWORD [40+esp],ecx
	rol	edi,16
	mov	DWORD [20+esp],ebx
	add	esi,edi
	mov	ecx,DWORD [32+esp]
	xor	ebp,esi
	mov	edx,DWORD [52+esp]
	rol	ebp,12
	mov	ebx,DWORD [28+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [4+esp],eax
	rol	edi,8
	mov	eax,DWORD [8+esp]
	add	esi,edi
	mov	DWORD [48+esp],edi
	xor	ebp,esi
	add	eax,ebx
	rol	ebp,7
	xor	edx,eax
	mov	DWORD [44+esp],esi
	rol	edx,16
	mov	DWORD [24+esp],ebp
	add	ecx,edx
	mov	esi,DWORD [36+esp]
	xor	ebx,ecx
	mov	edi,DWORD [56+esp]
	rol	ebx,12
	mov	ebp,DWORD [16+esp]
	add	eax,ebx
	xor	edx,eax
	mov	DWORD [8+esp],eax
	rol	edx,8
	mov	eax,DWORD [12+esp]
	add	ecx,edx
	mov	DWORD [52+esp],edx
	xor	ebx,ecx
	add	eax,ebp
	rol	ebx,7
	xor	edi,eax
	rol	edi,16
	mov	DWORD [28+esp],ebx
	add	esi,edi
	xor	ebp,esi
	mov	edx,DWORD [48+esp]
	rol	ebp,12
	mov	ebx,DWORD [128+esp]
	add	eax,ebp
	xor	edi,eax
	mov	DWORD [12+esp],eax
	rol	edi,8
	mov	eax,DWORD [esp]
	add	esi,edi
	mov	DWORD [56+esp],edi
	xor	ebp,esi
	rol	ebp,7
	dec	ebx
	jnz	NEAR L$004loop
	; Rounds finished: add the initial state back to the working state.
	; ebx = remaining byte count; fewer than 64 bytes goes to the tail.
	mov	ebx,DWORD [160+esp]
	add	eax,1634760805
	add	ebp,DWORD [80+esp]
	add	ecx,DWORD [96+esp]
	add	esi,DWORD [100+esp]
	cmp	ebx,64
	jb	NEAR L$005tail
	; ---- full block: ebx = source, eax = destination; XOR 64 bytes ----
	mov	ebx,DWORD [156+esp]
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	xor	eax,DWORD [ebx]
	xor	ebp,DWORD [16+ebx]
	; Park output word 0 in [esp]; eax is needed for the destination pointer.
	mov	DWORD [esp],eax
	mov	eax,DWORD [152+esp]
	xor	ecx,DWORD [32+ebx]
	xor	esi,DWORD [36+ebx]
	xor	edx,DWORD [48+ebx]
	xor	edi,DWORD [56+ebx]
	mov	DWORD [16+eax],ebp
	mov	DWORD [32+eax],ecx
	mov	DWORD [36+eax],esi
	mov	DWORD [48+eax],edx
	mov	DWORD [56+eax],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	xor	ebp,DWORD [4+ebx]
	xor	ecx,DWORD [8+ebx]
	xor	esi,DWORD [12+ebx]
	xor	edx,DWORD [20+ebx]
	xor	edi,DWORD [24+ebx]
	mov	DWORD [4+eax],ebp
	mov	DWORD [8+eax],ecx
	mov	DWORD [12+eax],esi
	mov	DWORD [20+eax],edx
	mov	DWORD [24+eax],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	xor	ebp,DWORD [28+ebx]
	xor	ecx,DWORD [40+ebx]
	xor	esi,DWORD [44+ebx]
	xor	edx,DWORD [52+ebx]
	xor	edi,DWORD [60+ebx]
	; Advance the source pointer by one block.
	lea	ebx,[64+ebx]
	mov	DWORD [28+eax],ebp
	mov	ebp,DWORD [esp]
	mov	DWORD [40+eax],ecx
	mov	ecx,DWORD [160+esp]
	mov	DWORD [44+eax],esi
	mov	DWORD [52+eax],edx
	mov	DWORD [60+eax],edi
	mov	DWORD [eax],ebp
	; Advance the destination pointer and loop while bytes remain.
	lea	eax,[64+eax]
	sub	ecx,64
	jnz	NEAR L$003outer_loop
	jmp	NEAR L$006done
L$005tail:
	; ---- partial block: finish the key-stream block in [0..60+esp],
	; then XOR it into the output one byte at a time. ebx still holds
	; the remaining byte count (1..63). ----
	add	edx,DWORD [112+esp]
	add	edi,DWORD [120+esp]
	mov	DWORD [esp],eax
	mov	DWORD [16+esp],ebp
	mov	DWORD [32+esp],ecx
	mov	DWORD [36+esp],esi
	mov	DWORD [48+esp],edx
	mov	DWORD [56+esp],edi
	mov	ebp,DWORD [4+esp]
	mov	ecx,DWORD [8+esp]
	mov	esi,DWORD [12+esp]
	mov	edx,DWORD [20+esp]
	mov	edi,DWORD [24+esp]
	add	ebp,857760878
	add	ecx,2036477234
	add	esi,1797285236
	add	edx,DWORD [84+esp]
	add	edi,DWORD [88+esp]
	mov	DWORD [4+esp],ebp
	mov	DWORD [8+esp],ecx
	mov	DWORD [12+esp],esi
	mov	DWORD [20+esp],edx
	mov	DWORD [24+esp],edi
	mov	ebp,DWORD [28+esp]
	mov	ecx,DWORD [40+esp]
	mov	esi,DWORD [44+esp]
	mov	edx,DWORD [52+esp]
	mov	edi,DWORD [60+esp]
	add	ebp,DWORD [92+esp]
	add	ecx,DWORD [104+esp]
	add	esi,DWORD [108+esp]
	add	edx,DWORD [116+esp]
	add	edi,DWORD [124+esp]
	mov	DWORD [28+esp],ebp
	; ebp = source pointer, ecx = destination pointer, esi = byte index.
	mov	ebp,DWORD [156+esp]
	mov	DWORD [40+esp],ecx
	mov	ecx,DWORD [152+esp]
	mov	DWORD [44+esp],esi
	xor	esi,esi
	mov	DWORD [52+esp],edx
	mov	DWORD [60+esp],edi
	xor	eax,eax
	xor	edx,edx
L$007tail_loop:
	mov	al,BYTE [ebp*1+esi]
	mov	dl,BYTE [esi*1+esp]
	; esi is bumped before the store, hence the -1 in the store address.
	lea	esi,[1+esi]
	xor	al,dl
	mov	BYTE [esi*1+ecx-1],al
	dec	ebx
	jnz	NEAR L$007tail_loop
L$006done:
	add	esp,132
L$000no_data:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
  ;-----------------------------------------------------------------------
  ; __ChaCha20_ssse3
  ;
  ; SSSE3 variant with the same stack arguments as _GFp_ChaCha20_ctr32
  ; (after four pushes: [20+esp] destination, [24+esp] source, [28+esp]
  ; byte count, [32+esp] pointer to 32 bytes, [36+esp] pointer to 16 bytes).
  ;
  ; The code below L$ssse3_shortcut expects eax to hold the run-time
  ; address of L$pic_point; in the visible code only the jump from
  ; _GFp_ChaCha20_ctr32 sets that up. The label __ChaCha20_ssse3 itself
  ; is not declared global here.
  ;
  ; Frame (esp is lowered by 524 and aligned down to 64 bytes):
  ;   [0..255+esp]    working state of the 4-block path, addressed via
  ;                   ebx = esp+128; the 1-block path keeps its 64-byte
  ;                   input state in [0..63+esp]
  ;   [256..511+esp]  initial state of the 4-block path, one 16-byte
  ;                   vector per state word, addressed via ebp = esp+384
  ;   [512+esp]       caller's esp, restored at L$011done
  ;   [516+esp]       saved arg4 pointer
  ;   [520+esp]       saved arg5 pointer
  ;
  ; Offsets into L$ssse3_data (eax): 0 and 16 are pshufb masks that rotate
  ; each dword left by 16 and by 8 bits, 32 holds the four constants,
  ; 48 = {0,1,2,3}, 64 = {4,4,4,4}, 80 = {1,0,0,0}, 96 = {4,0,0,0},
  ; 112 = {0,-1,-1,-1}.
  ;-----------------------------------------------------------------------
align	16
__ChaCha20_ssse3:
	push	ebp
	push	ebx
	push	esi
	push	edi
L$ssse3_shortcut:
	; edi = destination, esi = source, ecx = byte count,
	; edx = arg4 pointer, ebx = arg5 pointer.
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [24+esp]
	mov	ecx,DWORD [28+esp]
	mov	edx,DWORD [32+esp]
	mov	ebx,DWORD [36+esp]
	mov	ebp,esp
	sub	esp,524
	and	esp,-64
	mov	DWORD [512+esp],ebp
	; eax: address of L$pic_point -> address of L$ssse3_data.
	lea	eax,[(L$ssse3_data-L$pic_point)+eax]
	; xmm3 = the 16 bytes at arg5 (state words 12..15).
	movdqu	xmm3,[ebx]
	; Fewer than 256 bytes: go straight to the one-block-at-a-time path.
	cmp	ecx,256
	jb	NEAR L$0081x
	; ---- 4-block path setup: broadcast every state word across a vector ----
	mov	DWORD [516+esp],edx
	mov	DWORD [520+esp],ebx
	sub	ecx,256
	lea	ebp,[384+esp]
	movdqu	xmm7,[edx]
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	; Lane i of word 12 gets counter+i; 4 is subtracted here because the
	; outer loop adds {4,4,4,4} before every pass, including the first.
	paddd	xmm0,[48+eax]
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	psubd	xmm0,[64+eax]
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[64+ebp],xmm0
	movdqa	[80+ebp],xmm1
	movdqa	[96+ebp],xmm2
	movdqa	[112+ebp],xmm3
	movdqu	xmm3,[16+edx]
	movdqa	[ebp-64],xmm4
	movdqa	[ebp-48],xmm5
	movdqa	[ebp-32],xmm6
	movdqa	[ebp-16],xmm7
	movdqa	xmm7,[32+eax]
	lea	ebx,[128+esp]
	pshufd	xmm0,xmm3,0
	pshufd	xmm1,xmm3,85
	pshufd	xmm2,xmm3,170
	pshufd	xmm3,xmm3,255
	pshufd	xmm4,xmm7,0
	pshufd	xmm5,xmm7,85
	pshufd	xmm6,xmm7,170
	pshufd	xmm7,xmm7,255
	movdqa	[ebp],xmm0
	movdqa	[16+ebp],xmm1
	movdqa	[32+ebp],xmm2
	movdqa	[48+ebp],xmm3
	movdqa	[ebp-128],xmm4
	movdqa	[ebp-112],xmm5
	movdqa	[ebp-96],xmm6
	movdqa	[ebp-80],xmm7
	; Bias source/destination by 128 so the four 64-byte blocks are
	; reachable at displacements -128, -64, 0 and +64.
	lea	esi,[128+esi]
	lea	edi,[128+edi]
	jmp	NEAR L$009outer_loop
align	16
L$009outer_loop:
	; Copy the initial state (ebp area) into the working area (ebx area),
	; keeping words 0, 4, 8, 9 and 12 in xmm0, xmm3, xmm4, xmm5 and xmm6.
	movdqa	xmm1,[ebp-112]
	movdqa	xmm2,[ebp-96]
	movdqa	xmm3,[ebp-80]
	movdqa	xmm5,[ebp-48]
	movdqa	xmm6,[ebp-32]
	movdqa	xmm7,[ebp-16]
	movdqa	[ebx-112],xmm1
	movdqa	[ebx-96],xmm2
	movdqa	[ebx-80],xmm3
	movdqa	[ebx-48],xmm5
	movdqa	[ebx-32],xmm6
	movdqa	[ebx-16],xmm7
	movdqa	xmm2,[32+ebp]
	movdqa	xmm3,[48+ebp]
	movdqa	xmm4,[64+ebp]
	movdqa	xmm5,[80+ebp]
	movdqa	xmm6,[96+ebp]
	movdqa	xmm7,[112+ebp]
	; Advance the four per-lane block counters by 4 and store them back.
	paddd	xmm4,[64+eax]
	movdqa	[32+ebx],xmm2
	movdqa	[48+ebx],xmm3
	movdqa	[64+ebx],xmm4
	movdqa	[80+ebx],xmm5
	movdqa	[96+ebx],xmm6
	movdqa	[112+ebx],xmm7
	movdqa	[64+ebp],xmm4
	movdqa	xmm0,[ebp-128]
	movdqa	xmm6,xmm4
	movdqa	xmm3,[ebp-64]
	movdqa	xmm4,[ebp]
	movdqa	xmm5,[16+ebp]
	; edx = number of passes through L$010loop.
	mov	edx,10
	nop
align	16
L$010loop:
	; One pass runs eight add/xor/rotate sequences on four blocks at once.
	; Rotates by 16 and 8 use pshufb with the masks at [eax] and [16+eax];
	; rotates by 12 and 7 use a pslld/psrld/por triple.
	paddd	xmm0,xmm3
	movdqa	xmm2,xmm3
	pxor	xmm6,xmm0
	pshufb	xmm6,[eax]
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-48]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[80+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[64+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-64],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[32+ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-32]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[96+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[80+ebx],xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[16+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-48],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[48+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-16]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[112+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[96+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-32],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-48]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	xmm6,xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	pshufb	xmm6,[eax]
	movdqa	[ebx-16],xmm3
	paddd	xmm4,xmm6
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-32]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-112]
	paddd	xmm0,xmm2
	movdqa	xmm7,[64+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-128],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[112+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	movdqa	[32+ebx],xmm4
	pshufb	xmm7,[eax]
	movdqa	[ebx-48],xmm2
	paddd	xmm5,xmm7
	movdqa	xmm4,[ebx]
	pxor	xmm3,xmm5
	movdqa	xmm2,[ebx-16]
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-96]
	paddd	xmm1,xmm3
	movdqa	xmm6,[80+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-112],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[64+ebx],xmm7
	pxor	xmm3,xmm5
	paddd	xmm0,xmm2
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	pxor	xmm6,xmm0
	por	xmm3,xmm1
	movdqa	[48+ebx],xmm5
	pshufb	xmm6,[eax]
	movdqa	[ebx-32],xmm3
	paddd	xmm4,xmm6
	movdqa	xmm5,[16+ebx]
	pxor	xmm2,xmm4
	movdqa	xmm3,[ebx-64]
	movdqa	xmm1,xmm2
	pslld	xmm2,12
	psrld	xmm1,20
	por	xmm2,xmm1
	movdqa	xmm1,[ebx-80]
	paddd	xmm0,xmm2
	movdqa	xmm7,[96+ebx]
	pxor	xmm6,xmm0
	movdqa	[ebx-96],xmm0
	pshufb	xmm6,[16+eax]
	paddd	xmm4,xmm6
	movdqa	[80+ebx],xmm6
	pxor	xmm2,xmm4
	paddd	xmm1,xmm3
	movdqa	xmm0,xmm2
	pslld	xmm2,7
	psrld	xmm0,25
	pxor	xmm7,xmm1
	por	xmm2,xmm0
	pshufb	xmm7,[eax]
	movdqa	[ebx-16],xmm2
	paddd	xmm5,xmm7
	pxor	xmm3,xmm5
	movdqa	xmm0,xmm3
	pslld	xmm3,12
	psrld	xmm0,20
	por	xmm3,xmm0
	movdqa	xmm0,[ebx-128]
	paddd	xmm1,xmm3
	movdqa	xmm6,[64+ebx]
	pxor	xmm7,xmm1
	movdqa	[ebx-80],xmm1
	pshufb	xmm7,[16+eax]
	paddd	xmm5,xmm7
	movdqa	[96+ebx],xmm7
	pxor	xmm3,xmm5
	movdqa	xmm1,xmm3
	pslld	xmm3,7
	psrld	xmm1,25
	por	xmm3,xmm1
	dec	edx
	jnz	NEAR L$010loop
	; Flush the words still held in registers to the working area.
	movdqa	[ebx-64],xmm3
	movdqa	[ebx],xmm4
	movdqa	[16+ebx],xmm5
	movdqa	[64+ebx],xmm6
	movdqa	[96+ebx],xmm7
	; ---- output, words 0..3: add the initial state, transpose the 4x4
	; dword tile so each register holds 16 consecutive bytes of one block,
	; XOR with the source and store ----
	movdqa	xmm1,[ebx-112]
	movdqa	xmm2,[ebx-96]
	movdqa	xmm3,[ebx-80]
	paddd	xmm0,[ebp-128]
	paddd	xmm1,[ebp-112]
	paddd	xmm2,[ebp-96]
	paddd	xmm3,[ebp-80]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx-64]
	pxor	xmm5,xmm1
	movdqa	xmm1,[ebx-48]
	pxor	xmm6,xmm2
	movdqa	xmm2,[ebx-32]
	pxor	xmm7,xmm3
	movdqa	xmm3,[ebx-16]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; ---- output, words 4..7 ----
	paddd	xmm0,[ebp-64]
	paddd	xmm1,[ebp-48]
	paddd	xmm2,[ebp-32]
	paddd	xmm3,[ebp-16]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[16+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[32+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[48+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; ---- output, words 8..11 ----
	paddd	xmm0,[ebp]
	paddd	xmm1,[16+ebp]
	paddd	xmm2,[32+ebp]
	paddd	xmm3,[48+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	lea	esi,[16+esi]
	pxor	xmm4,xmm0
	movdqa	xmm0,[64+ebx]
	pxor	xmm5,xmm1
	movdqa	xmm1,[80+ebx]
	pxor	xmm6,xmm2
	movdqa	xmm2,[96+ebx]
	pxor	xmm7,xmm3
	movdqa	xmm3,[112+ebx]
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[16+edi]
	; ---- output, words 12..15 ----
	paddd	xmm0,[64+ebp]
	paddd	xmm1,[80+ebp]
	paddd	xmm2,[96+ebp]
	paddd	xmm3,[112+ebp]
	movdqa	xmm6,xmm0
	punpckldq	xmm0,xmm1
	movdqa	xmm7,xmm2
	punpckldq	xmm2,xmm3
	punpckhdq	xmm6,xmm1
	punpckhdq	xmm7,xmm3
	movdqa	xmm1,xmm0
	punpcklqdq	xmm0,xmm2
	movdqa	xmm3,xmm6
	punpcklqdq	xmm6,xmm7
	punpckhqdq	xmm1,xmm2
	punpckhqdq	xmm3,xmm7
	movdqu	xmm4,[esi-128]
	movdqu	xmm5,[esi-64]
	movdqu	xmm2,[esi]
	movdqu	xmm7,[64+esi]
	; 3*16 + 208 = 256: the pointers end up one 4-block group further on.
	lea	esi,[208+esi]
	pxor	xmm4,xmm0
	pxor	xmm5,xmm1
	pxor	xmm6,xmm2
	pxor	xmm7,xmm3
	movdqu	[edi-128],xmm4
	movdqu	[edi-64],xmm5
	movdqu	[edi],xmm6
	movdqu	[64+edi],xmm7
	lea	edi,[208+edi]
	; ecx was lowered by 256 ahead of time; no borrow means at least
	; another 256 bytes remain.
	sub	ecx,256
	jnc	NEAR L$009outer_loop
	add	ecx,256
	jz	NEAR L$011done
	; ---- 1..255 bytes left: undo the 128-byte pointer bias and rebuild
	; the single-block counter vector in xmm3 ----
	mov	ebx,DWORD [520+esp]
	lea	esi,[esi-128]
	mov	edx,DWORD [516+esp]
	lea	edi,[edi-128]
	; Lane 0 of the stored 4-lane counter plus 4 is the next block number;
	; merge it into word 0 of the original 16 bytes from arg5.
	movd	xmm2,DWORD [64+ebp]
	movdqu	xmm3,[ebx]
	paddd	xmm2,[96+eax]
	pand	xmm3,[112+eax]
	por	xmm3,xmm2
L$0081x:
	; ---- 1-block path: xmm0..xmm3 hold the four state rows, xmm6/xmm7
	; the rotate-by-16 / rotate-by-8 pshufb masks ----
	movdqa	xmm0,[32+eax]
	movdqu	xmm1,[edx]
	movdqu	xmm2,[16+edx]
	movdqa	xmm6,[eax]
	movdqa	xmm7,[16+eax]
	; NOTE(review): this dword store is overwritten by the 16-byte store
	; to [48+esp] a few lines below.
	mov	DWORD [48+esp],ebp
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	mov	edx,10
	jmp	NEAR L$012loop1x
align	16
L$013outer1x:
	; Next block: reload the input state and add {1,0,0,0} to the last row.
	movdqa	xmm3,[80+eax]
	movdqa	xmm0,[esp]
	movdqa	xmm1,[16+esp]
	movdqa	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	mov	edx,10
	movdqa	[48+esp],xmm3
	jmp	NEAR L$012loop1x
align	16
L$012loop1x:
	; First half of the pass, applied to all four columns at once.
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	; pshufb xmm3,xmm6 (emitted as raw bytes)
	db	102,15,56,0,222
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	; pshufb xmm3,xmm7 (emitted as raw bytes)
	db	102,15,56,0,223
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	; Rotate the lanes of rows 1..3 so the second half works on diagonals.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,57
	pshufd	xmm3,xmm3,147
	nop
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	; pshufb xmm3,xmm6 (emitted as raw bytes)
	db	102,15,56,0,222
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,20
	pslld	xmm4,12
	por	xmm1,xmm4
	paddd	xmm0,xmm1
	pxor	xmm3,xmm0
	; pshufb xmm3,xmm7 (emitted as raw bytes)
	db	102,15,56,0,223
	paddd	xmm2,xmm3
	pxor	xmm1,xmm2
	movdqa	xmm4,xmm1
	psrld	xmm1,25
	pslld	xmm4,7
	por	xmm1,xmm4
	; Undo the lane rotation.
	pshufd	xmm2,xmm2,78
	pshufd	xmm1,xmm1,147
	pshufd	xmm3,xmm3,57
	dec	edx
	jnz	NEAR L$012loop1x
	; Add the input state to obtain the 64-byte key-stream block.
	paddd	xmm0,[esp]
	paddd	xmm1,[16+esp]
	paddd	xmm2,[32+esp]
	paddd	xmm3,[48+esp]
	cmp	ecx,64
	jb	NEAR L$014tail
	; Full block: XOR 64 source bytes and store them.
	movdqu	xmm4,[esi]
	movdqu	xmm5,[16+esi]
	pxor	xmm0,xmm4
	movdqu	xmm4,[32+esi]
	pxor	xmm1,xmm5
	movdqu	xmm5,[48+esi]
	pxor	xmm2,xmm4
	pxor	xmm3,xmm5
	lea	esi,[64+esi]
	movdqu	[edi],xmm0
	movdqu	[16+edi],xmm1
	movdqu	[32+edi],xmm2
	movdqu	[48+edi],xmm3
	lea	edi,[64+edi]
	sub	ecx,64
	jnz	NEAR L$013outer1x
	jmp	NEAR L$011done
L$014tail:
	; Partial block: spill the key stream to [0..63+esp] and XOR the
	; remaining ecx bytes one at a time (ebp = byte index).
	movdqa	[esp],xmm0
	movdqa	[16+esp],xmm1
	movdqa	[32+esp],xmm2
	movdqa	[48+esp],xmm3
	xor	eax,eax
	xor	edx,edx
	xor	ebp,ebp
L$015tail_loop:
	mov	al,BYTE [ebp*1+esp]
	mov	dl,BYTE [ebp*1+esi]
	; ebp is bumped before the store, hence the -1 in the store address.
	lea	ebp,[1+ebp]
	xor	al,dl
	mov	BYTE [ebp*1+edi-1],al
	dec	ecx
	jnz	NEAR L$015tail_loop
L$011done:
	; Restore the caller's stack pointer saved at frame setup.
	mov	esp,DWORD [512+esp]
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
  ; Constant table used by the SSSE3 code, addressed relative to eax.
align	64
L$ssse3_data:
; +0: pshufb mask that rotates every dword left by 16 bits
db	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
; +16: pshufb mask that rotates every dword left by 8 bits
db	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
; +32: state words 0..3 (0x61707865, 0x3320646e, 0x79622d32, 0x6b206574)
dd	1634760805,857760878,2036477234,1797285236
; +48: per-lane block-counter offsets for the 4-block path
dd	0,1,2,3
; +64: counter step for the 4-block path
dd	4,4,4,4
; +80: counter step for the 1-block path
dd	1,0,0,0
; +96: added to lane 0 of the 4-block counter when switching to the 1-block path
dd	4,0,0,0
; +112: mask that clears dword 0 and keeps dwords 1..3
dd	0,-1,-1,-1
align	64
; ASCII: "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
db	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
db	114,103,62,0
; 16-byte common symbol holding the CPU capability words tested at entry.
segment	.bss
common	_GFp_ia32cap_P 16