; ghash-x86_64-nasm.asm
  ; This file is generated from a similarly-named Perl script in the BoringSSL
  ; source tree. Do not edit by hand.
  ;
  ; Syntax: NASM, Intel operand order (op dst,src).

; Make every [label] memory operand RIP-relative.
default rel

; The generator emits MASM-style size keywords in front of memory operands
; (e.g. XMMWORD[rdx]).  Defining them as empty macros turns those operands
; into plain NASM [..] references.
%define XMMWORD
%define YMMWORD
%define ZMMWORD

section .text code align=64

; Referenced by GFp_gcm_ghash_clmul, which reads the dword at offset 4.
EXTERN GFp_ia32cap_P
  ;-----------------------------------------------------------------------
  ; GFp_gcm_init_clmul
  ;
  ; Builds a 96-byte table of key powers using PCLMULQDQ.
  ; Presumed C prototype (not visible in this file -- confirm with caller):
  ;     void GFp_gcm_init_clmul(u128 Htable[], const uint64_t H[2]);
  ;
  ; In:    rcx = destination table (written at offsets 0..95)
  ;        rdx = 16-byte key H (read unaligned)
  ; Out:   [rcx+ 0] = H' (H with qwords swapped, shifted left by one bit and
  ;                   conditionally XORed with $L$0x1c2_polynomial)
  ;        [rcx+16] = H'^2
  ;        [rcx+32] = { H'.lo^H'.hi , H'^2.lo^H'^2.hi }   (Karatsuba operands)
  ;        [rcx+48] = H'^3
  ;        [rcx+64] = H'^4
  ;        [rcx+80] = { H'^3.lo^H'^3.hi , H'^4.lo^H'^4.hi }
  ; Clobb: xmm0-xmm5, flags.  xmm6 is saved/restored on the stack.
  ; Stack: 24 bytes (matches $L$SEH_info_GFp_gcm_init_clmul).
  ;
  ; PCLMULQDQ / PALIGNR and the prologue are emitted as raw DB bytes; the
  ; decoded instruction is given in the trailing comment.
  ;-----------------------------------------------------------------------
global GFp_gcm_init_clmul
ALIGN 16
GFp_gcm_init_clmul:
$L$_init_clmul:
$L$SEH_begin_GFp_gcm_init_clmul:
        DB 0x48,0x83,0xec,0x18          ; sub rsp,0x18
        DB 0x0f,0x29,0x34,0x24          ; movaps [rsp],xmm6
        movdqu xmm2,XMMWORD[rdx]
        pshufd xmm2,xmm2,78             ; swap the two 64-bit halves

        ; 128-bit shift left by one; xmm5 becomes all-ones if the top bit was set
        pshufd xmm4,xmm2,255            ; broadcast the uppermost dword
        movdqa xmm3,xmm2
        psllq xmm2,1
        pxor xmm5,xmm5
        psrlq xmm3,63
        pcmpgtd xmm5,xmm4               ; 0 > top dword  <=>  sign bit set
        pslldq xmm3,8
        por xmm2,xmm3

        ; conditional XOR with the polynomial constant
        pand xmm5,XMMWORD[$L$0x1c2_polynomial]
        pxor xmm2,xmm5

        ; xmm2 = H', xmm6 = H'.lo^H'.hi in both halves; compute H'^2 in xmm0
        pshufd xmm6,xmm2,78
        movdqa xmm0,xmm2
        pxor xmm6,xmm2
        movdqa xmm1,xmm0
        pshufd xmm3,xmm0,78
        pxor xmm3,xmm0
        DB 102,15,58,68,194,0           ; pclmulqdq xmm0,xmm2,0x00
        DB 102,15,58,68,202,17          ; pclmulqdq xmm1,xmm2,0x11
        DB 102,15,58,68,222,0           ; pclmulqdq xmm3,xmm6,0x00
        pxor xmm3,xmm0
        pxor xmm3,xmm1
        movdqa xmm4,xmm3
        psrldq xmm3,8
        pslldq xmm4,8
        pxor xmm1,xmm3
        pxor xmm0,xmm4

        ; reduction, 1st phase
        movdqa xmm4,xmm0
        movdqa xmm3,xmm0
        psllq xmm0,5
        pxor xmm3,xmm0
        psllq xmm0,1
        pxor xmm0,xmm3
        psllq xmm0,57
        movdqa xmm3,xmm0
        pslldq xmm0,8
        psrldq xmm3,8
        pxor xmm0,xmm4
        pxor xmm1,xmm3

        ; reduction, 2nd phase
        movdqa xmm4,xmm0
        psrlq xmm0,1
        pxor xmm1,xmm4
        pxor xmm4,xmm0
        psrlq xmm0,5
        pxor xmm0,xmm4
        psrlq xmm0,1
        pxor xmm0,xmm1

        ; store H', H'^2 and their packed lo^hi halves
        pshufd xmm3,xmm2,78
        pshufd xmm4,xmm0,78
        pxor xmm3,xmm2
        movdqu XMMWORD[rcx],xmm2
        pxor xmm4,xmm0
        movdqu XMMWORD[16+rcx],xmm0
        DB 102,15,58,15,227,8           ; palignr xmm4,xmm3,8
        movdqu XMMWORD[32+rcx],xmm4

        ; xmm0 = H'^2 * H' = H'^3
        movdqa xmm1,xmm0
        pshufd xmm3,xmm0,78
        pxor xmm3,xmm0
        DB 102,15,58,68,194,0           ; pclmulqdq xmm0,xmm2,0x00
        DB 102,15,58,68,202,17          ; pclmulqdq xmm1,xmm2,0x11
        DB 102,15,58,68,222,0           ; pclmulqdq xmm3,xmm6,0x00
        pxor xmm3,xmm0
        pxor xmm3,xmm1
        movdqa xmm4,xmm3
        psrldq xmm3,8
        pslldq xmm4,8
        pxor xmm1,xmm3
        pxor xmm0,xmm4

        ; reduction, 1st phase
        movdqa xmm4,xmm0
        movdqa xmm3,xmm0
        psllq xmm0,5
        pxor xmm3,xmm0
        psllq xmm0,1
        pxor xmm0,xmm3
        psllq xmm0,57
        movdqa xmm3,xmm0
        pslldq xmm0,8
        psrldq xmm3,8
        pxor xmm0,xmm4
        pxor xmm1,xmm3

        ; reduction, 2nd phase
        movdqa xmm4,xmm0
        psrlq xmm0,1
        pxor xmm1,xmm4
        pxor xmm4,xmm0
        psrlq xmm0,5
        pxor xmm0,xmm4
        psrlq xmm0,1
        pxor xmm0,xmm1

        ; keep H'^3 in xmm5, then xmm0 = H'^3 * H' = H'^4
        movdqa xmm5,xmm0
        movdqa xmm1,xmm0
        pshufd xmm3,xmm0,78
        pxor xmm3,xmm0
        DB 102,15,58,68,194,0           ; pclmulqdq xmm0,xmm2,0x00
        DB 102,15,58,68,202,17          ; pclmulqdq xmm1,xmm2,0x11
        DB 102,15,58,68,222,0           ; pclmulqdq xmm3,xmm6,0x00
        pxor xmm3,xmm0
        pxor xmm3,xmm1
        movdqa xmm4,xmm3
        psrldq xmm3,8
        pslldq xmm4,8
        pxor xmm1,xmm3
        pxor xmm0,xmm4

        ; reduction, 1st phase
        movdqa xmm4,xmm0
        movdqa xmm3,xmm0
        psllq xmm0,5
        pxor xmm3,xmm0
        psllq xmm0,1
        pxor xmm0,xmm3
        psllq xmm0,57
        movdqa xmm3,xmm0
        pslldq xmm0,8
        psrldq xmm3,8
        pxor xmm0,xmm4
        pxor xmm1,xmm3

        ; reduction, 2nd phase
        movdqa xmm4,xmm0
        psrlq xmm0,1
        pxor xmm1,xmm4
        pxor xmm4,xmm0
        psrlq xmm0,5
        pxor xmm0,xmm4
        psrlq xmm0,1
        pxor xmm0,xmm1

        ; store H'^3, H'^4 and their packed lo^hi halves
        pshufd xmm3,xmm5,78
        pshufd xmm4,xmm0,78
        pxor xmm3,xmm5
        movdqu XMMWORD[48+rcx],xmm5
        pxor xmm4,xmm0
        movdqu XMMWORD[64+rcx],xmm0
        DB 102,15,58,15,227,8           ; palignr xmm4,xmm3,8
        movdqu XMMWORD[80+rcx],xmm4

        ; epilogue: restore xmm6 and release the 24-byte frame
        movaps xmm6,XMMWORD[rsp]
        lea rsp,[24+rsp]
$L$SEH_end_GFp_gcm_init_clmul:
        DB 0F3h,0C3h ;repret
  ;-----------------------------------------------------------------------
  ; GFp_gcm_gmult_clmul
  ;
  ; Multiplies the 16-byte value at [rcx] by the first table entry and
  ; writes the reduced product back to [rcx].
  ; Presumed C prototype (not visible in this file -- confirm with caller):
  ;     void GFp_gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[]);
  ;
  ; In:    rcx = 16-byte accumulator Xi (read and written, unaligned access)
  ;        rdx = key table; [rdx] and [rdx+32] are read
  ; Out:   [rcx] = product, in the same byte order as the input
  ; Clobb: xmm0-xmm5, flags.  No stack use, no non-volatile registers.
  ;-----------------------------------------------------------------------
global GFp_gcm_gmult_clmul
ALIGN 16
GFp_gcm_gmult_clmul:
$L$_gmult_clmul:
        movdqu xmm0,XMMWORD[rcx]
        movdqa xmm5,XMMWORD[$L$bswap_mask]
        movdqu xmm2,XMMWORD[rdx]
        movdqu xmm4,XMMWORD[32+rdx]
        DB 102,15,56,0,197              ; pshufb xmm0,xmm5  (reverse byte order)

        ; Karatsuba-style 128x128 carry-less multiply: xmm1:xmm0 = xmm0 * xmm2
        movdqa xmm1,xmm0
        pshufd xmm3,xmm0,78
        pxor xmm3,xmm0
        DB 102,15,58,68,194,0           ; pclmulqdq xmm0,xmm2,0x00
        DB 102,15,58,68,202,17          ; pclmulqdq xmm1,xmm2,0x11
        DB 102,15,58,68,220,0           ; pclmulqdq xmm3,xmm4,0x00
        pxor xmm3,xmm0
        pxor xmm3,xmm1
        movdqa xmm4,xmm3
        psrldq xmm3,8
        pslldq xmm4,8
        pxor xmm1,xmm3
        pxor xmm0,xmm4

        ; reduction, 1st phase
        movdqa xmm4,xmm0
        movdqa xmm3,xmm0
        psllq xmm0,5
        pxor xmm3,xmm0
        psllq xmm0,1
        pxor xmm0,xmm3
        psllq xmm0,57
        movdqa xmm3,xmm0
        pslldq xmm0,8
        psrldq xmm3,8
        pxor xmm0,xmm4
        pxor xmm1,xmm3

        ; reduction, 2nd phase
        movdqa xmm4,xmm0
        psrlq xmm0,1
        pxor xmm1,xmm4
        pxor xmm4,xmm0
        psrlq xmm0,5
        pxor xmm0,xmm4
        psrlq xmm0,1
        pxor xmm0,xmm1

        DB 102,15,56,0,197              ; pshufb xmm0,xmm5  (back to memory byte order)
        movdqu XMMWORD[rcx],xmm0
        DB 0F3h,0C3h ;repret
  ;-----------------------------------------------------------------------
  ; GFp_gcm_ghash_clmul
  ;
  ; Folds a buffer of 16-byte blocks into the accumulator at [rcx] using
  ; PCLMULQDQ and the key table at rdx.
  ; Presumed C prototype (not visible in this file -- confirm with caller):
  ;     void GFp_gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[],
  ;                              const uint8_t *inp, size_t len);
  ;
  ; In:    rcx = 16-byte accumulator Xi (read and written)
  ;        rdx = key table; offsets 0,16,32,48,64,80 are read
  ;        r8  = input pointer (unaligned loads)
  ;        r9  = input length in bytes
  ;              NOTE(review): the length arithmetic below only terminates
  ;              cleanly for a non-zero multiple of 16 -- confirm the caller
  ;              guarantees this.
  ; Out:   [rcx] = updated accumulator
  ; Clobb: rax, r8, r9, xmm0-xmm5, flags.  xmm6-xmm15 are saved/restored.
  ; Stack: 168 bytes (matches $L$SEH_info_GFp_gcm_ghash_clmul).
  ;
  ; Paths: 4 blocks per iteration ($L$mod4_loop) when at least 64 bytes are
  ; available and the CPU check below allows it, otherwise 2 blocks per
  ; iteration ($L$mod_loop), with a single-block $L$odd_tail.
  ;-----------------------------------------------------------------------
global GFp_gcm_ghash_clmul
ALIGN 32
GFp_gcm_ghash_clmul:
$L$_ghash_clmul:
        lea rax,[((-136))+rsp]
$L$SEH_begin_GFp_gcm_ghash_clmul:
        ; prologue: rsp -= 168, then spill xmm6..xmm15 at [rsp+0 .. rsp+144]
        DB 0x48,0x8d,0x60,0xe0          ; lea rsp,[rax-0x20]
        DB 0x0f,0x29,0x70,0xe0          ; movaps [rax-0x20],xmm6
        DB 0x0f,0x29,0x78,0xf0          ; movaps [rax-0x10],xmm7
        DB 0x44,0x0f,0x29,0x00          ; movaps [rax],xmm8
        DB 0x44,0x0f,0x29,0x48,0x10     ; movaps [rax+0x10],xmm9
        DB 0x44,0x0f,0x29,0x50,0x20     ; movaps [rax+0x20],xmm10
        DB 0x44,0x0f,0x29,0x58,0x30     ; movaps [rax+0x30],xmm11
        DB 0x44,0x0f,0x29,0x60,0x40     ; movaps [rax+0x40],xmm12
        DB 0x44,0x0f,0x29,0x68,0x50     ; movaps [rax+0x50],xmm13
        DB 0x44,0x0f,0x29,0x70,0x60     ; movaps [rax+0x60],xmm14
        DB 0x44,0x0f,0x29,0x78,0x70     ; movaps [rax+0x70],xmm15
        movdqa xmm10,XMMWORD[$L$bswap_mask]     ; xmm10 = byte-reversal mask for the whole function
        movdqu xmm0,XMMWORD[rcx]                ; xmm0 = Xi
        movdqu xmm2,XMMWORD[rdx]                ; xmm2 = table[0]
        movdqu xmm7,XMMWORD[32+rdx]             ; xmm7 = table[2]
        DB 102,65,15,56,0,194           ; pshufb xmm0,xmm10
        sub r9,0x10
        jz NEAR $L$odd_tail             ; exactly one block
        movdqu xmm6,XMMWORD[16+rdx]             ; xmm6 = table[1]
        lea rax,[GFp_ia32cap_P]
        mov eax,DWORD[4+rax]
        cmp r9,0x30
        jb NEAR $L$skip4x               ; fewer than 4 blocks in total
        ; Skip the 4-block path when bit 22 is set and bit 26 is clear in the
        ; capability dword.  NOTE(review): upstream describes this as a
        ; "MOVBE without XSAVE" CPU heuristic -- confirm against the generator.
        and eax,71303168                ; 0x04400000: bits 26 and 22
        cmp eax,4194304                 ; 0x00400000: bit 22 only
        je NEAR $L$skip4x
        sub r9,0x30
        mov rax,0xA040608020C0E000      ; byte table consumed via movq/pshufb in $L$mod4_loop
        movdqu xmm14,XMMWORD[48+rdx]            ; xmm14 = table[3]
        movdqu xmm15,XMMWORD[64+rdx]            ; xmm15 = table[4]

        ; blocks 3 and 2 of the first group of four
        movdqu xmm3,XMMWORD[48+r8]
        movdqu xmm11,XMMWORD[32+r8]
        DB 102,65,15,56,0,218           ; pshufb xmm3,xmm10
        DB 102,69,15,56,0,218           ; pshufb xmm11,xmm10
        movdqa xmm5,xmm3
        pshufd xmm4,xmm3,78
        pxor xmm4,xmm3
        DB 102,15,58,68,218,0           ; pclmulqdq xmm3,xmm2,0x00
        DB 102,15,58,68,234,17          ; pclmulqdq xmm5,xmm2,0x11
        DB 102,15,58,68,231,0           ; pclmulqdq xmm4,xmm7,0x00
        movdqa xmm13,xmm11
        pshufd xmm12,xmm11,78
        pxor xmm12,xmm11
        DB 102,68,15,58,68,222,0        ; pclmulqdq xmm11,xmm6,0x00
        DB 102,68,15,58,68,238,17       ; pclmulqdq xmm13,xmm6,0x11
        DB 102,68,15,58,68,231,16       ; pclmulqdq xmm12,xmm7,0x10
        xorps xmm3,xmm11
        xorps xmm5,xmm13
        movups xmm7,XMMWORD[80+rdx]             ; xmm7 = table[5]
        xorps xmm4,xmm12

        ; blocks 1 and 0; block 0 is XORed with the running accumulator
        movdqu xmm11,XMMWORD[16+r8]
        movdqu xmm8,XMMWORD[r8]
        DB 102,69,15,56,0,218           ; pshufb xmm11,xmm10
        DB 102,69,15,56,0,194           ; pshufb xmm8,xmm10
        movdqa xmm13,xmm11
        pshufd xmm12,xmm11,78
        pxor xmm0,xmm8
        pxor xmm12,xmm11
        DB 102,69,15,58,68,222,0        ; pclmulqdq xmm11,xmm14,0x00
        movdqa xmm1,xmm0
        pshufd xmm8,xmm0,78
        pxor xmm8,xmm0
        DB 102,69,15,58,68,238,17       ; pclmulqdq xmm13,xmm14,0x11
        DB 102,68,15,58,68,231,0        ; pclmulqdq xmm12,xmm7,0x00
        xorps xmm3,xmm11
        xorps xmm5,xmm13
        lea r8,[64+r8]
        sub r9,0x40
        jc NEAR $L$tail4x               ; no further full group of four
        jmp NEAR $L$mod4_loop
ALIGN 32
$L$mod4_loop:
        ; Finishes the previous group (multiply by table[4], reduce) while
        ; starting the multiplications for the next four blocks.
        DB 102,65,15,58,68,199,0        ; pclmulqdq xmm0,xmm15,0x00
        xorps xmm4,xmm12
        movdqu xmm11,XMMWORD[48+r8]
        DB 102,69,15,56,0,218           ; pshufb xmm11,xmm10
        DB 102,65,15,58,68,207,17       ; pclmulqdq xmm1,xmm15,0x11
        xorps xmm0,xmm3
        movdqu xmm3,XMMWORD[32+r8]
        movdqa xmm13,xmm11
        DB 102,68,15,58,68,199,16       ; pclmulqdq xmm8,xmm7,0x10
        pshufd xmm12,xmm11,78
        xorps xmm1,xmm5
        pxor xmm12,xmm11
        DB 102,65,15,56,0,218           ; pshufb xmm3,xmm10
        movups xmm7,XMMWORD[32+rdx]
        xorps xmm8,xmm4
        DB 102,68,15,58,68,218,0        ; pclmulqdq xmm11,xmm2,0x00
        pshufd xmm4,xmm3,78
        pxor xmm8,xmm0
        movdqa xmm5,xmm3
        pxor xmm8,xmm1
        pxor xmm4,xmm3
        movdqa xmm9,xmm8
        DB 102,68,15,58,68,234,17       ; pclmulqdq xmm13,xmm2,0x11
        pslldq xmm8,8
        psrldq xmm9,8
        pxor xmm0,xmm8
        movdqa xmm8,XMMWORD[$L$7_mask]
        pxor xmm1,xmm9
        DB 102,76,15,110,200            ; movq xmm9,rax
        pand xmm8,xmm0
        DB 102,69,15,56,0,200           ; pshufb xmm9,xmm8
        pxor xmm9,xmm0
        DB 102,68,15,58,68,231,0        ; pclmulqdq xmm12,xmm7,0x00
        psllq xmm9,57
        movdqa xmm8,xmm9
        pslldq xmm9,8
        DB 102,15,58,68,222,0           ; pclmulqdq xmm3,xmm6,0x00
        psrldq xmm8,8
        pxor xmm0,xmm9
        pxor xmm1,xmm8
        movdqu xmm8,XMMWORD[r8]
        movdqa xmm9,xmm0
        psrlq xmm0,1
        DB 102,15,58,68,238,17          ; pclmulqdq xmm5,xmm6,0x11
        xorps xmm3,xmm11
        movdqu xmm11,XMMWORD[16+r8]
        DB 102,69,15,56,0,218           ; pshufb xmm11,xmm10
        DB 102,15,58,68,231,16          ; pclmulqdq xmm4,xmm7,0x10
        xorps xmm5,xmm13
        movups xmm7,XMMWORD[80+rdx]
        DB 102,69,15,56,0,194           ; pshufb xmm8,xmm10
        pxor xmm1,xmm9
        pxor xmm9,xmm0
        psrlq xmm0,5
        movdqa xmm13,xmm11
        pxor xmm4,xmm12
        pshufd xmm12,xmm11,78
        pxor xmm0,xmm9
        pxor xmm1,xmm8
        pxor xmm12,xmm11
        DB 102,69,15,58,68,222,0        ; pclmulqdq xmm11,xmm14,0x00
        psrlq xmm0,1
        pxor xmm0,xmm1
        movdqa xmm1,xmm0
        DB 102,69,15,58,68,238,17       ; pclmulqdq xmm13,xmm14,0x11
        xorps xmm3,xmm11
        pshufd xmm8,xmm0,78
        pxor xmm8,xmm0
        DB 102,68,15,58,68,231,0        ; pclmulqdq xmm12,xmm7,0x00
        xorps xmm5,xmm13
        lea r8,[64+r8]
        sub r9,0x40
        jnc NEAR $L$mod4_loop
$L$tail4x:
        ; finish the last group of four: multiply, combine, reduce
        DB 102,65,15,58,68,199,0        ; pclmulqdq xmm0,xmm15,0x00
        DB 102,65,15,58,68,207,17       ; pclmulqdq xmm1,xmm15,0x11
        DB 102,68,15,58,68,199,16       ; pclmulqdq xmm8,xmm7,0x10
        xorps xmm4,xmm12
        xorps xmm0,xmm3
        xorps xmm1,xmm5
        pxor xmm1,xmm0
        pxor xmm8,xmm4
        pxor xmm8,xmm1
        pxor xmm1,xmm0
        movdqa xmm9,xmm8
        psrldq xmm8,8
        pslldq xmm9,8
        pxor xmm1,xmm8
        pxor xmm0,xmm9

        ; reduction, 1st phase
        movdqa xmm4,xmm0
        movdqa xmm3,xmm0
        psllq xmm0,5
        pxor xmm3,xmm0
        psllq xmm0,1
        pxor xmm0,xmm3
        psllq xmm0,57
        movdqa xmm3,xmm0
        pslldq xmm0,8
        psrldq xmm3,8
        pxor xmm0,xmm4
        pxor xmm1,xmm3

        ; reduction, 2nd phase
        movdqa xmm4,xmm0
        psrlq xmm0,1
        pxor xmm1,xmm4
        pxor xmm4,xmm0
        psrlq xmm0,5
        pxor xmm0,xmm4
        psrlq xmm0,1
        pxor xmm0,xmm1
        add r9,0x40                     ; undo the look-ahead subtraction
        jz NEAR $L$done
        movdqu xmm7,XMMWORD[32+rdx]             ; reload table[2] for the 1x/2x paths
        sub r9,0x10
        jz NEAR $L$odd_tail
$L$skip4x:
        ; 2-block path: start with blocks 0 (XORed into Xi) and 1
        movdqu xmm8,XMMWORD[r8]
        movdqu xmm3,XMMWORD[16+r8]
        DB 102,69,15,56,0,194           ; pshufb xmm8,xmm10
        DB 102,65,15,56,0,218           ; pshufb xmm3,xmm10
        pxor xmm0,xmm8
        movdqa xmm5,xmm3
        pshufd xmm4,xmm3,78
        pxor xmm4,xmm3
        DB 102,15,58,68,218,0           ; pclmulqdq xmm3,xmm2,0x00
        DB 102,15,58,68,234,17          ; pclmulqdq xmm5,xmm2,0x11
        DB 102,15,58,68,231,0           ; pclmulqdq xmm4,xmm7,0x00
        lea r8,[32+r8]
        nop
        sub r9,0x20
        jbe NEAR $L$even_tail
        nop
        jmp NEAR $L$mod_loop
ALIGN 32
$L$mod_loop:
        ; Finishes the previous pair (multiply by table[1], reduce) while
        ; loading and multiplying the next pair.
        movdqa xmm1,xmm0
        movdqa xmm8,xmm4
        pshufd xmm4,xmm0,78
        pxor xmm4,xmm0
        DB 102,15,58,68,198,0           ; pclmulqdq xmm0,xmm6,0x00
        DB 102,15,58,68,206,17          ; pclmulqdq xmm1,xmm6,0x11
        DB 102,15,58,68,231,16          ; pclmulqdq xmm4,xmm7,0x10
        pxor xmm0,xmm3
        pxor xmm1,xmm5
        movdqu xmm9,XMMWORD[r8]
        pxor xmm8,xmm0
        DB 102,69,15,56,0,202           ; pshufb xmm9,xmm10
        movdqu xmm3,XMMWORD[16+r8]
        pxor xmm8,xmm1
        pxor xmm1,xmm9
        pxor xmm4,xmm8
        DB 102,65,15,56,0,218           ; pshufb xmm3,xmm10
        movdqa xmm8,xmm4
        psrldq xmm8,8
        pslldq xmm4,8
        pxor xmm1,xmm8
        pxor xmm0,xmm4
        movdqa xmm5,xmm3
        movdqa xmm9,xmm0
        movdqa xmm8,xmm0
        psllq xmm0,5
        pxor xmm8,xmm0
        DB 102,15,58,68,218,0           ; pclmulqdq xmm3,xmm2,0x00
        psllq xmm0,1
        pxor xmm0,xmm8
        psllq xmm0,57
        movdqa xmm8,xmm0
        pslldq xmm0,8
        psrldq xmm8,8
        pxor xmm0,xmm9
        pshufd xmm4,xmm5,78
        pxor xmm1,xmm8
        pxor xmm4,xmm5
        movdqa xmm9,xmm0
        psrlq xmm0,1
        DB 102,15,58,68,234,17          ; pclmulqdq xmm5,xmm2,0x11
        pxor xmm1,xmm9
        pxor xmm9,xmm0
        psrlq xmm0,5
        pxor xmm0,xmm9
        lea r8,[32+r8]
        psrlq xmm0,1
        DB 102,15,58,68,231,0           ; pclmulqdq xmm4,xmm7,0x00
        pxor xmm0,xmm1
        sub r9,0x20
        ja NEAR $L$mod_loop
$L$even_tail:
        ; finish the last pair: multiply, combine, reduce
        movdqa xmm1,xmm0
        movdqa xmm8,xmm4
        pshufd xmm4,xmm0,78
        pxor xmm4,xmm0
        DB 102,15,58,68,198,0           ; pclmulqdq xmm0,xmm6,0x00
        DB 102,15,58,68,206,17          ; pclmulqdq xmm1,xmm6,0x11
        DB 102,15,58,68,231,16          ; pclmulqdq xmm4,xmm7,0x10
        pxor xmm0,xmm3
        pxor xmm1,xmm5
        pxor xmm8,xmm0
        pxor xmm8,xmm1
        pxor xmm4,xmm8
        movdqa xmm8,xmm4
        psrldq xmm8,8
        pslldq xmm4,8
        pxor xmm1,xmm8
        pxor xmm0,xmm4

        ; reduction, 1st phase
        movdqa xmm4,xmm0
        movdqa xmm3,xmm0
        psllq xmm0,5
        pxor xmm3,xmm0
        psllq xmm0,1
        pxor xmm0,xmm3
        psllq xmm0,57
        movdqa xmm3,xmm0
        pslldq xmm0,8
        psrldq xmm3,8
        pxor xmm0,xmm4
        pxor xmm1,xmm3

        ; reduction, 2nd phase
        movdqa xmm4,xmm0
        psrlq xmm0,1
        pxor xmm1,xmm4
        pxor xmm4,xmm0
        psrlq xmm0,5
        pxor xmm0,xmm4
        psrlq xmm0,1
        pxor xmm0,xmm1
        test r9,r9
        jnz NEAR $L$done                ; r9 == 0 here means one block is still pending
$L$odd_tail:
        ; single remaining block: Xi = (Xi ^ block) * table[0]
        movdqu xmm8,XMMWORD[r8]
        DB 102,69,15,56,0,194           ; pshufb xmm8,xmm10
        pxor xmm0,xmm8
        movdqa xmm1,xmm0
        pshufd xmm3,xmm0,78
        pxor xmm3,xmm0
        DB 102,15,58,68,194,0           ; pclmulqdq xmm0,xmm2,0x00
        DB 102,15,58,68,202,17          ; pclmulqdq xmm1,xmm2,0x11
        DB 102,15,58,68,223,0           ; pclmulqdq xmm3,xmm7,0x00
        pxor xmm3,xmm0
        pxor xmm3,xmm1
        movdqa xmm4,xmm3
        psrldq xmm3,8
        pslldq xmm4,8
        pxor xmm1,xmm3
        pxor xmm0,xmm4

        ; reduction, 1st phase
        movdqa xmm4,xmm0
        movdqa xmm3,xmm0
        psllq xmm0,5
        pxor xmm3,xmm0
        psllq xmm0,1
        pxor xmm0,xmm3
        psllq xmm0,57
        movdqa xmm3,xmm0
        pslldq xmm0,8
        psrldq xmm3,8
        pxor xmm0,xmm4
        pxor xmm1,xmm3

        ; reduction, 2nd phase
        movdqa xmm4,xmm0
        psrlq xmm0,1
        pxor xmm1,xmm4
        pxor xmm4,xmm0
        psrlq xmm0,5
        pxor xmm0,xmm4
        psrlq xmm0,1
        pxor xmm0,xmm1
$L$done:
        DB 102,65,15,56,0,194           ; pshufb xmm0,xmm10  (back to memory byte order)
        movdqu XMMWORD[rcx],xmm0

        ; epilogue: restore xmm6..xmm15 and release the 168-byte frame
        movaps xmm6,XMMWORD[rsp]
        movaps xmm7,XMMWORD[16+rsp]
        movaps xmm8,XMMWORD[32+rsp]
        movaps xmm9,XMMWORD[48+rsp]
        movaps xmm10,XMMWORD[64+rsp]
        movaps xmm11,XMMWORD[80+rsp]
        movaps xmm12,XMMWORD[96+rsp]
        movaps xmm13,XMMWORD[112+rsp]
        movaps xmm14,XMMWORD[128+rsp]
        movaps xmm15,XMMWORD[144+rsp]
        lea rsp,[168+rsp]
$L$SEH_end_GFp_gcm_ghash_clmul:
        DB 0F3h,0C3h ;repret
  ;-----------------------------------------------------------------------
  ; GFp_gcm_init_avx
  ;
  ; AVX variant of the key-table setup.  Runs four loop iterations; each
  ; iteration stores two consecutive key powers followed by one vector
  ; holding their lo^hi halves, so 12 x 16 = 192 bytes are written.
  ; Presumed C prototype (not visible in this file -- confirm with caller):
  ;     void GFp_gcm_init_avx(u128 Htable[], const uint64_t H[2]);
  ;
  ; In:    rcx = destination table (advanced by 48 per iteration)
  ;        rdx = 16-byte key H (read unaligned)
  ; Out:   table filled as described above
  ; Clobb: rcx, r10, xmm0-xmm5, flags.  xmm6 is saved/restored.
  ; Stack: 24 bytes; the prologue bytes are identical to GFp_gcm_init_clmul.
  ;-----------------------------------------------------------------------
global GFp_gcm_init_avx
ALIGN 32
GFp_gcm_init_avx:
$L$SEH_begin_GFp_gcm_init_avx:
        DB 0x48,0x83,0xec,0x18          ; sub rsp,0x18
        DB 0x0f,0x29,0x34,0x24          ; movaps [rsp],xmm6
        vzeroupper
        vmovdqu xmm2,XMMWORD[rdx]
        vpshufd xmm2,xmm2,78            ; swap the two 64-bit halves

        ; 128-bit shift left by one; xmm5 becomes all-ones if the top bit was set
        vpshufd xmm4,xmm2,255           ; broadcast the uppermost dword
        vpsrlq xmm3,xmm2,63
        vpsllq xmm2,xmm2,1
        vpxor xmm5,xmm5,xmm5
        vpcmpgtd xmm5,xmm5,xmm4         ; 0 > top dword  <=>  sign bit set
        vpslldq xmm3,xmm3,8
        vpor xmm2,xmm2,xmm3

        ; conditional XOR with the polynomial constant
        vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial]
        vpxor xmm2,xmm2,xmm5

        ; xmm2 = H', xmm6 = H'.lo^H'.hi, xmm0 = running power
        vpunpckhqdq xmm6,xmm2,xmm2
        vmovdqa xmm0,xmm2
        vpxor xmm6,xmm6,xmm2
        mov r10,4                       ; four iterations, two powers each
        jmp NEAR $L$init_start_avx
ALIGN 32
$L$init_loop_avx:
        ; store the lo^hi vector left pending by the previous iteration
        vpalignr xmm5,xmm4,xmm3,8
        vmovdqu XMMWORD[(-16)+rcx],xmm5

        ; xmm0 = xmm0 * H' (multiply + reduce): next odd power
        vpunpckhqdq xmm3,xmm0,xmm0
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm1,xmm0,xmm2,0x11
        vpclmulqdq xmm0,xmm0,xmm2,0x00
        vpclmulqdq xmm3,xmm3,xmm6,0x00
        vpxor xmm4,xmm1,xmm0
        vpxor xmm3,xmm3,xmm4
        vpslldq xmm4,xmm3,8
        vpsrldq xmm3,xmm3,8
        vpxor xmm0,xmm0,xmm4
        vpxor xmm1,xmm1,xmm3
        vpsllq xmm3,xmm0,57             ; reduction, 1st phase
        vpsllq xmm4,xmm0,62
        vpxor xmm4,xmm4,xmm3
        vpsllq xmm3,xmm0,63
        vpxor xmm4,xmm4,xmm3
        vpslldq xmm3,xmm4,8
        vpsrldq xmm4,xmm4,8
        vpxor xmm0,xmm0,xmm3
        vpxor xmm1,xmm1,xmm4
        vpsrlq xmm4,xmm0,1              ; reduction, 2nd phase
        vpxor xmm1,xmm1,xmm0
        vpxor xmm0,xmm0,xmm4
        vpsrlq xmm4,xmm4,5
        vpxor xmm0,xmm0,xmm4
        vpsrlq xmm0,xmm0,1
        vpxor xmm0,xmm0,xmm1
$L$init_start_avx:
        ; keep the odd power in xmm5, then xmm0 = xmm0 * H': next even power
        vmovdqa xmm5,xmm0
        vpunpckhqdq xmm3,xmm0,xmm0
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm1,xmm0,xmm2,0x11
        vpclmulqdq xmm0,xmm0,xmm2,0x00
        vpclmulqdq xmm3,xmm3,xmm6,0x00
        vpxor xmm4,xmm1,xmm0
        vpxor xmm3,xmm3,xmm4
        vpslldq xmm4,xmm3,8
        vpsrldq xmm3,xmm3,8
        vpxor xmm0,xmm0,xmm4
        vpxor xmm1,xmm1,xmm3
        vpsllq xmm3,xmm0,57             ; reduction, 1st phase
        vpsllq xmm4,xmm0,62
        vpxor xmm4,xmm4,xmm3
        vpsllq xmm3,xmm0,63
        vpxor xmm4,xmm4,xmm3
        vpslldq xmm3,xmm4,8
        vpsrldq xmm4,xmm4,8
        vpxor xmm0,xmm0,xmm3
        vpxor xmm1,xmm1,xmm4
        vpsrlq xmm4,xmm0,1              ; reduction, 2nd phase
        vpxor xmm1,xmm1,xmm0
        vpxor xmm0,xmm0,xmm4
        vpsrlq xmm4,xmm4,5
        vpxor xmm0,xmm0,xmm4
        vpsrlq xmm0,xmm0,1
        vpxor xmm0,xmm0,xmm1

        ; store the pair of powers; their lo^hi halves stay in xmm3/xmm4
        vpshufd xmm3,xmm5,78
        vpshufd xmm4,xmm0,78
        vpxor xmm3,xmm3,xmm5
        vmovdqu XMMWORD[rcx],xmm5
        vpxor xmm4,xmm4,xmm0
        vmovdqu XMMWORD[16+rcx],xmm0
        lea rcx,[48+rcx]
        sub r10,1
        jnz NEAR $L$init_loop_avx

        ; The final lo^hi vector is combined with the source operands in the
        ; opposite order from the in-loop store above.
        vpalignr xmm5,xmm3,xmm4,8
        vmovdqu XMMWORD[(-16)+rcx],xmm5
        vzeroupper

        ; epilogue: restore xmm6 and release the 24-byte frame
        movaps xmm6,XMMWORD[rsp]
        lea rsp,[24+rsp]
$L$SEH_end_GFp_gcm_init_avx:
        DB 0F3h,0C3h ;repret
  ;-----------------------------------------------------------------------
  ; GFp_gcm_ghash_avx
  ;
  ; AVX variant of the bulk hash: folds 16-byte blocks from r8 into the
  ; accumulator at [rcx], eight blocks per main-loop iteration.
  ; Presumed C prototype (not visible in this file -- confirm with caller):
  ;     void GFp_gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[],
  ;                            const uint8_t *inp, size_t len);
  ;
  ; In:    rcx = 16-byte accumulator Xi (read and written)
  ;        rdx = key table; biased by +64 on entry so that all table
  ;              references below use (offset-64)
  ;        r8  = input pointer (unaligned loads)
  ;        r9  = input length in bytes
  ;              NOTE(review): $L$short_avx reads [r8+r9-16] and counts down
  ;              in steps of 16 -- presumably the caller guarantees a
  ;              non-zero multiple of 16; confirm.
  ; Out:   [rcx] = updated accumulator
  ; Clobb: rax, rdx, r8, r9, r10, xmm0-xmm5, flags.
  ;        xmm6-xmm15 are saved/restored.
  ; Stack: 168 bytes; the prologue bytes are identical to GFp_gcm_ghash_clmul.
  ;
  ; Register roles: xmm10 = accumulator, xmm13 = byte-reversal mask,
  ; r10 = address of $L$0x1c2_polynomial, xmm6/xmm7 = current table entries.
  ;-----------------------------------------------------------------------
global GFp_gcm_ghash_avx
ALIGN 32
GFp_gcm_ghash_avx:
        lea rax,[((-136))+rsp]
$L$SEH_begin_GFp_gcm_ghash_avx:
        ; prologue: rsp -= 168, then spill xmm6..xmm15 at [rsp+0 .. rsp+144]
        DB 0x48,0x8d,0x60,0xe0          ; lea rsp,[rax-0x20]
        DB 0x0f,0x29,0x70,0xe0          ; movaps [rax-0x20],xmm6
        DB 0x0f,0x29,0x78,0xf0          ; movaps [rax-0x10],xmm7
        DB 0x44,0x0f,0x29,0x00          ; movaps [rax],xmm8
        DB 0x44,0x0f,0x29,0x48,0x10     ; movaps [rax+0x10],xmm9
        DB 0x44,0x0f,0x29,0x50,0x20     ; movaps [rax+0x20],xmm10
        DB 0x44,0x0f,0x29,0x58,0x30     ; movaps [rax+0x30],xmm11
        DB 0x44,0x0f,0x29,0x60,0x40     ; movaps [rax+0x40],xmm12
        DB 0x44,0x0f,0x29,0x68,0x50     ; movaps [rax+0x50],xmm13
        DB 0x44,0x0f,0x29,0x70,0x60     ; movaps [rax+0x60],xmm14
        DB 0x44,0x0f,0x29,0x78,0x70     ; movaps [rax+0x70],xmm15
        vzeroupper
        vmovdqu xmm10,XMMWORD[rcx]
        lea r10,[$L$0x1c2_polynomial]
        lea rdx,[64+rdx]                ; bias the table pointer
        vmovdqu xmm13,XMMWORD[$L$bswap_mask]
        vpshufb xmm10,xmm10,xmm13
        cmp r9,0x80
        jb NEAR $L$short_avx            ; fewer than eight blocks
        sub r9,0x80

        ; First group of eight: blocks are consumed from offset 112 down to 0,
        ; each multiplied by the next table entry; partial products
        ; accumulate in xmm0..xmm5.
        vmovdqu xmm14,XMMWORD[112+r8]
        vmovdqu xmm6,XMMWORD[((0-64))+rdx]
        vpshufb xmm14,xmm14,xmm13
        vmovdqu xmm7,XMMWORD[((32-64))+rdx]
        vpunpckhqdq xmm9,xmm14,xmm14
        vmovdqu xmm15,XMMWORD[96+r8]
        vpclmulqdq xmm0,xmm14,xmm6,0x00
        vpxor xmm9,xmm9,xmm14
        vpshufb xmm15,xmm15,xmm13
        vpclmulqdq xmm1,xmm14,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((16-64))+rdx]
        vpunpckhqdq xmm8,xmm15,xmm15
        vmovdqu xmm14,XMMWORD[80+r8]
        vpclmulqdq xmm2,xmm9,xmm7,0x00
        vpxor xmm8,xmm8,xmm15
        vpshufb xmm14,xmm14,xmm13
        vpclmulqdq xmm3,xmm15,xmm6,0x00
        vpunpckhqdq xmm9,xmm14,xmm14
        vpclmulqdq xmm4,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((48-64))+rdx]
        vpxor xmm9,xmm9,xmm14
        vmovdqu xmm15,XMMWORD[64+r8]
        vpclmulqdq xmm5,xmm8,xmm7,0x10
        vmovdqu xmm7,XMMWORD[((80-64))+rdx]
        vpshufb xmm15,xmm15,xmm13
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm14,xmm6,0x00
        vpxor xmm4,xmm4,xmm1
        vpunpckhqdq xmm8,xmm15,xmm15
        vpclmulqdq xmm1,xmm14,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((64-64))+rdx]
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm9,xmm7,0x00
        vpxor xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[48+r8]
        vpxor xmm0,xmm0,xmm3
        vpclmulqdq xmm3,xmm15,xmm6,0x00
        vpxor xmm1,xmm1,xmm4
        vpshufb xmm14,xmm14,xmm13
        vpclmulqdq xmm4,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((96-64))+rdx]
        vpxor xmm2,xmm2,xmm5
        vpunpckhqdq xmm9,xmm14,xmm14
        vpclmulqdq xmm5,xmm8,xmm7,0x10
        vmovdqu xmm7,XMMWORD[((128-64))+rdx]
        vpxor xmm9,xmm9,xmm14
        vmovdqu xmm15,XMMWORD[32+r8]
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm14,xmm6,0x00
        vpxor xmm4,xmm4,xmm1
        vpshufb xmm15,xmm15,xmm13
        vpclmulqdq xmm1,xmm14,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((112-64))+rdx]
        vpxor xmm5,xmm5,xmm2
        vpunpckhqdq xmm8,xmm15,xmm15
        vpclmulqdq xmm2,xmm9,xmm7,0x00
        vpxor xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[16+r8]
        vpxor xmm0,xmm0,xmm3
        vpclmulqdq xmm3,xmm15,xmm6,0x00
        vpxor xmm1,xmm1,xmm4
        vpshufb xmm14,xmm14,xmm13
        vpclmulqdq xmm4,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((144-64))+rdx]
        vpxor xmm2,xmm2,xmm5
        vpunpckhqdq xmm9,xmm14,xmm14
        vpclmulqdq xmm5,xmm8,xmm7,0x10
        vmovdqu xmm7,XMMWORD[((176-64))+rdx]
        vpxor xmm9,xmm9,xmm14
        vmovdqu xmm15,XMMWORD[r8]
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm14,xmm6,0x00
        vpxor xmm4,xmm4,xmm1
        vpshufb xmm15,xmm15,xmm13
        vpclmulqdq xmm1,xmm14,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((160-64))+rdx]
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm9,xmm7,0x10
        lea r8,[128+r8]
        cmp r9,0x80
        jb NEAR $L$tail_avx             ; no second full group
        vpxor xmm15,xmm15,xmm10         ; fold the accumulator into block 0
        sub r9,0x80
        jmp NEAR $L$oop8x_avx
ALIGN 32
$L$oop8x_avx:
        ; Finishes block 0 of the previous group and its two-step reduction
        ; (vpclmulqdq with [r10]) while multiplying the next eight blocks.
        vpunpckhqdq xmm8,xmm15,xmm15
        vmovdqu xmm14,XMMWORD[112+r8]
        vpxor xmm3,xmm3,xmm0
        vpxor xmm8,xmm8,xmm15
        vpclmulqdq xmm10,xmm15,xmm6,0x00
        vpshufb xmm14,xmm14,xmm13
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm11,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((0-64))+rdx]
        vpunpckhqdq xmm9,xmm14,xmm14
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm12,xmm8,xmm7,0x00
        vmovdqu xmm7,XMMWORD[((32-64))+rdx]
        vpxor xmm9,xmm9,xmm14
        vmovdqu xmm15,XMMWORD[96+r8]
        vpclmulqdq xmm0,xmm14,xmm6,0x00
        vpxor xmm10,xmm10,xmm3
        vpshufb xmm15,xmm15,xmm13
        vpclmulqdq xmm1,xmm14,xmm6,0x11
        vxorps xmm11,xmm11,xmm4
        vmovdqu xmm6,XMMWORD[((16-64))+rdx]
        vpunpckhqdq xmm8,xmm15,xmm15
        vpclmulqdq xmm2,xmm9,xmm7,0x00
        vpxor xmm12,xmm12,xmm5
        vxorps xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[80+r8]
        vpxor xmm12,xmm12,xmm10
        vpclmulqdq xmm3,xmm15,xmm6,0x00
        vpxor xmm12,xmm12,xmm11
        vpslldq xmm9,xmm12,8
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm4,xmm15,xmm6,0x11
        vpsrldq xmm12,xmm12,8
        vpxor xmm10,xmm10,xmm9
        vmovdqu xmm6,XMMWORD[((48-64))+rdx]
        vpshufb xmm14,xmm14,xmm13
        vxorps xmm11,xmm11,xmm12
        vpxor xmm4,xmm4,xmm1
        vpunpckhqdq xmm9,xmm14,xmm14
        vpclmulqdq xmm5,xmm8,xmm7,0x10
        vmovdqu xmm7,XMMWORD[((80-64))+rdx]
        vpxor xmm9,xmm9,xmm14
        vpxor xmm5,xmm5,xmm2
        vmovdqu xmm15,XMMWORD[64+r8]
        vpalignr xmm12,xmm10,xmm10,8
        vpclmulqdq xmm0,xmm14,xmm6,0x00
        vpshufb xmm15,xmm15,xmm13
        vpxor xmm0,xmm0,xmm3
        vpclmulqdq xmm1,xmm14,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((64-64))+rdx]
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm1,xmm1,xmm4
        vpclmulqdq xmm2,xmm9,xmm7,0x00
        vxorps xmm8,xmm8,xmm15
        vpxor xmm2,xmm2,xmm5
        vmovdqu xmm14,XMMWORD[48+r8]
        vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10        ; reduction step 1
        vpclmulqdq xmm3,xmm15,xmm6,0x00
        vpshufb xmm14,xmm14,xmm13
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm4,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((96-64))+rdx]
        vpunpckhqdq xmm9,xmm14,xmm14
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm5,xmm8,xmm7,0x10
        vmovdqu xmm7,XMMWORD[((128-64))+rdx]
        vpxor xmm9,xmm9,xmm14
        vpxor xmm5,xmm5,xmm2
        vmovdqu xmm15,XMMWORD[32+r8]
        vpclmulqdq xmm0,xmm14,xmm6,0x00
        vpshufb xmm15,xmm15,xmm13
        vpxor xmm0,xmm0,xmm3
        vpclmulqdq xmm1,xmm14,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((112-64))+rdx]
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm1,xmm1,xmm4
        vpclmulqdq xmm2,xmm9,xmm7,0x00
        vpxor xmm8,xmm8,xmm15
        vpxor xmm2,xmm2,xmm5
        vxorps xmm10,xmm10,xmm12
        vmovdqu xmm14,XMMWORD[16+r8]
        vpalignr xmm12,xmm10,xmm10,8
        vpclmulqdq xmm3,xmm15,xmm6,0x00
        vpshufb xmm14,xmm14,xmm13
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm4,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((144-64))+rdx]
        vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10        ; reduction step 2
        vxorps xmm12,xmm12,xmm11
        vpunpckhqdq xmm9,xmm14,xmm14
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm5,xmm8,xmm7,0x10
        vmovdqu xmm7,XMMWORD[((176-64))+rdx]
        vpxor xmm9,xmm9,xmm14
        vpxor xmm5,xmm5,xmm2
        vmovdqu xmm15,XMMWORD[r8]
        vpclmulqdq xmm0,xmm14,xmm6,0x00
        vpshufb xmm15,xmm15,xmm13
        vpclmulqdq xmm1,xmm14,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((160-64))+rdx]
        vpxor xmm15,xmm15,xmm12         ; fold the reduced accumulator into block 0
        vpclmulqdq xmm2,xmm9,xmm7,0x10
        vpxor xmm15,xmm15,xmm10
        lea r8,[128+r8]
        sub r9,0x80
        jnc NEAR $L$oop8x_avx
        add r9,0x80                     ; undo the look-ahead subtraction
        jmp NEAR $L$tail_no_xor_avx     ; accumulator already folded into xmm15
ALIGN 32
$L$short_avx:
        ; Up to seven blocks (plus the final one handled in $L$tail_avx),
        ; walked backwards from the last block; r8 is moved to end-of-input.
        vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8]
        lea r8,[r9*1+r8]
        vmovdqu xmm6,XMMWORD[((0-64))+rdx]
        vmovdqu xmm7,XMMWORD[((32-64))+rdx]
        vpshufb xmm15,xmm14,xmm13
        ; Copy xmm0..xmm2 into xmm3..xmm5: the XORs that follow then cancel,
        ; which zeroes the partial sums without dedicated zeroing instructions.
        vmovdqa xmm3,xmm0
        vmovdqa xmm4,xmm1
        vmovdqa xmm5,xmm2
        sub r9,0x10
        jz NEAR $L$tail_avx
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm15,xmm6,0x00
        vpxor xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[((-32))+r8]
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm1,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((16-64))+rdx]
        vpshufb xmm15,xmm14,xmm13
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm8,xmm7,0x00
        vpsrldq xmm7,xmm7,8             ; move the upper qword of xmm7 into place
        sub r9,0x10
        jz NEAR $L$tail_avx
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm15,xmm6,0x00
        vpxor xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[((-48))+r8]
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm1,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((48-64))+rdx]
        vpshufb xmm15,xmm14,xmm13
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm8,xmm7,0x00
        vmovdqu xmm7,XMMWORD[((80-64))+rdx]
        sub r9,0x10
        jz NEAR $L$tail_avx
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm15,xmm6,0x00
        vpxor xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[((-64))+r8]
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm1,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((64-64))+rdx]
        vpshufb xmm15,xmm14,xmm13
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm8,xmm7,0x00
        vpsrldq xmm7,xmm7,8
        sub r9,0x10
        jz NEAR $L$tail_avx
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm15,xmm6,0x00
        vpxor xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[((-80))+r8]
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm1,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((96-64))+rdx]
        vpshufb xmm15,xmm14,xmm13
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm8,xmm7,0x00
        vmovdqu xmm7,XMMWORD[((128-64))+rdx]
        sub r9,0x10
        jz NEAR $L$tail_avx
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm15,xmm6,0x00
        vpxor xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[((-96))+r8]
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm1,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((112-64))+rdx]
        vpshufb xmm15,xmm14,xmm13
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm8,xmm7,0x00
        vpsrldq xmm7,xmm7,8
        sub r9,0x10
        jz NEAR $L$tail_avx
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm15,xmm6,0x00
        vpxor xmm8,xmm8,xmm15
        vmovdqu xmm14,XMMWORD[((-112))+r8]
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm1,xmm15,xmm6,0x11
        vmovdqu xmm6,XMMWORD[((144-64))+rdx]
        vpshufb xmm15,xmm14,xmm13
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm8,xmm7,0x00
        vmovq xmm7,QWORD[((184-64))+rdx]        ; only the upper qword of that table entry
        sub r9,0x10
        jmp NEAR $L$tail_avx
ALIGN 32
$L$tail_avx:
        vpxor xmm15,xmm15,xmm10         ; fold the accumulator into the final block
$L$tail_no_xor_avx:
        ; last multiplication, then combine the partial sums
        vpunpckhqdq xmm8,xmm15,xmm15
        vpxor xmm3,xmm3,xmm0
        vpclmulqdq xmm0,xmm15,xmm6,0x00
        vpxor xmm8,xmm8,xmm15
        vpxor xmm4,xmm4,xmm1
        vpclmulqdq xmm1,xmm15,xmm6,0x11
        vpxor xmm5,xmm5,xmm2
        vpclmulqdq xmm2,xmm8,xmm7,0x00
        vmovdqu xmm12,XMMWORD[r10]              ; xmm12 = $L$0x1c2_polynomial
        vpxor xmm10,xmm3,xmm0
        vpxor xmm11,xmm4,xmm1
        vpxor xmm5,xmm5,xmm2
        vpxor xmm5,xmm5,xmm10
        vpxor xmm5,xmm5,xmm11
        vpslldq xmm9,xmm5,8
        vpsrldq xmm5,xmm5,8
        vpxor xmm10,xmm10,xmm9
        vpxor xmm11,xmm11,xmm5
        vpclmulqdq xmm9,xmm10,xmm12,0x10        ; reduction, 1st phase
        vpalignr xmm10,xmm10,xmm10,8
        vpxor xmm10,xmm10,xmm9
        vpclmulqdq xmm9,xmm10,xmm12,0x10        ; reduction, 2nd phase
        vpalignr xmm10,xmm10,xmm10,8
        vpxor xmm10,xmm10,xmm11
        vpxor xmm10,xmm10,xmm9
        cmp r9,0
        jne NEAR $L$short_avx           ; remainder after the 8x loop
        vpshufb xmm10,xmm10,xmm13       ; back to memory byte order
        vmovdqu XMMWORD[rcx],xmm10
        vzeroupper

        ; epilogue: restore xmm6..xmm15 and release the 168-byte frame
        movaps xmm6,XMMWORD[rsp]
        movaps xmm7,XMMWORD[16+rsp]
        movaps xmm8,XMMWORD[32+rsp]
        movaps xmm9,XMMWORD[48+rsp]
        movaps xmm10,XMMWORD[64+rsp]
        movaps xmm11,XMMWORD[80+rsp]
        movaps xmm12,XMMWORD[96+rsp]
        movaps xmm13,XMMWORD[112+rsp]
        movaps xmm14,XMMWORD[128+rsp]
        movaps xmm15,XMMWORD[144+rsp]
        lea rsp,[168+rsp]
$L$SEH_end_GFp_gcm_ghash_avx:
        DB 0F3h,0C3h ;repret
  ;-----------------------------------------------------------------------
  ; Read-only constants used by the routines above.  They are emitted in
  ; the current (code) section, 64-byte aligned.
  ;-----------------------------------------------------------------------
ALIGN 64
; pshufb control that reverses the 16 bytes of a vector
$L$bswap_mask:
        DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
; 128-bit reduction constant: byte 0 = 0x01, byte 15 = 0xc2
$L$0x1c2_polynomial:
        DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
; dwords 7,0,7,0: ANDed with the accumulator in $L$mod4_loop to form a
; pshufb index
$L$7_mask:
        DD 7,0,7,0
ALIGN 64
; NUL-terminated ASCII: "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>"
        DB 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52
        DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
        DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
        DB 114,103,62,0
ALIGN 64
  ;-----------------------------------------------------------------------
  ; Win64 function table: one (begin, end, unwind-info) triple of
  ; image-relative addresses per function that has a stack frame.
  ; GFp_gcm_gmult_clmul has no entry.  The AVX routines reuse the unwind
  ; info of their CLMUL counterparts; their prologue byte sequences are
  ; identical.
  ;-----------------------------------------------------------------------
section .pdata rdata align=4
ALIGN 4
        DD $L$SEH_begin_GFp_gcm_init_clmul wrt ..imagebase
        DD $L$SEH_end_GFp_gcm_init_clmul wrt ..imagebase
        DD $L$SEH_info_GFp_gcm_init_clmul wrt ..imagebase

        DD $L$SEH_begin_GFp_gcm_ghash_clmul wrt ..imagebase
        DD $L$SEH_end_GFp_gcm_ghash_clmul wrt ..imagebase
        DD $L$SEH_info_GFp_gcm_ghash_clmul wrt ..imagebase

        DD $L$SEH_begin_GFp_gcm_init_avx wrt ..imagebase
        DD $L$SEH_end_GFp_gcm_init_avx wrt ..imagebase
        DD $L$SEH_info_GFp_gcm_init_clmul wrt ..imagebase      ; shared with init_clmul

        DD $L$SEH_begin_GFp_gcm_ghash_avx wrt ..imagebase
        DD $L$SEH_end_GFp_gcm_ghash_avx wrt ..imagebase
        DD $L$SEH_info_GFp_gcm_ghash_clmul wrt ..imagebase     ; shared with ghash_clmul
  ;-----------------------------------------------------------------------
  ; Win64 UNWIND_INFO records referenced from .pdata.
  ; Header bytes: version/flags, prologue size, number of 16-bit code
  ; slots, frame register.  Each unwind code starts with
  ; (prologue offset, op | reg<<4); the SAVE_XMM128 and ALLOC_LARGE codes
  ; take a second 16-bit slot holding a scaled offset or size.
  ;-----------------------------------------------------------------------
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_GFp_gcm_init_clmul:
        DB 0x01,0x08,0x03,0x00          ; version 1, 8-byte prologue, 3 slots
        DB 0x08,0x68,0x00,0x00          ; @8: save xmm6 at [rsp+0*16]
        DB 0x04,0x22,0x00,0x00          ; @4: alloc small, 2*8+8 = 24 bytes; then padding
$L$SEH_info_GFp_gcm_ghash_clmul:
        DB 0x01,0x33,0x16,0x00          ; version 1, 0x33-byte prologue, 22 slots
        DB 0x33,0xf8,0x09,0x00          ; @0x33: save xmm15 at [rsp+9*16]
        DB 0x2e,0xe8,0x08,0x00          ; @0x2e: save xmm14 at [rsp+8*16]
        DB 0x29,0xd8,0x07,0x00          ; @0x29: save xmm13 at [rsp+7*16]
        DB 0x24,0xc8,0x06,0x00          ; @0x24: save xmm12 at [rsp+6*16]
        DB 0x1f,0xb8,0x05,0x00          ; @0x1f: save xmm11 at [rsp+5*16]
        DB 0x1a,0xa8,0x04,0x00          ; @0x1a: save xmm10 at [rsp+4*16]
        DB 0x15,0x98,0x03,0x00          ; @0x15: save xmm9  at [rsp+3*16]
        DB 0x10,0x88,0x02,0x00          ; @0x10: save xmm8  at [rsp+2*16]
        DB 0x0c,0x78,0x01,0x00          ; @0x0c: save xmm7  at [rsp+1*16]
        DB 0x08,0x68,0x00,0x00          ; @0x08: save xmm6  at [rsp+0*16]
        DB 0x04,0x01,0x15,0x00          ; @0x04: alloc large, 0x15*8 = 168 bytes