chacha-armv8-linux64.S 40 KB


  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  // With the fallback above, every __has_feature() query evaluates to 0 on
  // compilers that do not provide the builtin.
  // MemorySanitizer builds define OPENSSL_NO_ASM, which compiles out everything
  // guarded by the "#if !defined(OPENSSL_NO_ASM)" that follows.
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(__aarch64__)
  11. #include <GFp/arm_arch.h>
  12. .hidden GFp_armcap_P
  13. .section .rodata
  14. .align 5
  // .Lsigma: two 64-bit constants whose little-endian byte image is the ASCII
  // text "expand 32-byte k". They are loaded as the first four 32-bit state words.
  15. .Lsigma:
  16. .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
  // .Lone: the 32-bit vector {1,0,0,0}. It sits directly after the 16 bytes of
  // .Lsigma, so a pointer post-incremented by 16 from .Lsigma lands here.
  17. .Lone:
  18. .long 1,0,0,0
  // NUL-terminated ASCII string:
  // "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
  19. .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  20. .align 2
  21. .text
  22. .globl GFp_ChaCha20_ctr32
  23. .hidden GFp_ChaCha20_ctr32
  24. .type GFp_ChaCha20_ctr32,%function
  25. .align 5
  //-----------------------------------------------------------------------
  // GFp_ChaCha20_ctr32 - XOR a buffer with ChaCha20 keystream (scalar path
  // plus dispatch to the NEON code).
  // Register roles, as used by the code below:
  //   x0 = output pointer          x1 = input pointer
  //   x2 = length in bytes         x3 = pointer to the 32-byte key
  //   x4 = pointer to the 16-byte counter block
  // Returns immediately when x2 == 0. When x2 >= 192 and the ARMV7_NEON bit
  // is set in GFp_armcap_P, control transfers to ChaCha20_neon. Otherwise the
  // scalar code at .Lshort processes the buffer 64 bytes per outer iteration
  // and finishes a partial block byte by byte.
  // Saves and restores x19-x28, x29 and x30; x30 is used as a data register
  // in between. Stack use: 96-byte register save area plus 64 bytes of
  // scratch at sp (written only on the tail path and wiped before return).
  //-----------------------------------------------------------------------
  26. GFp_ChaCha20_ctr32:
  27. AARCH64_VALID_CALL_TARGET
  28. cbz x2,.Labort // zero length: nothing to do
  29. #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
  30. adrp x5,:pg_hi21_nc:GFp_armcap_P
  31. #else
  32. adrp x5,GFp_armcap_P
  33. #endif
  34. cmp x2,#192
  35. b.lo .Lshort // fewer than 192 bytes: stay on the scalar path
  36. ldr w17,[x5,:lo12:GFp_armcap_P]
  37. tst w17,#ARMV7_NEON
  38. b.ne ChaCha20_neon // capability bit set: hand over to the NEON code
  39. .Lshort:
  40. AARCH64_SIGN_LINK_REGISTER
  41. stp x29,x30,[sp,#-96]! // allocate the 96-byte save area
  42. add x29,sp,#0 // x29 = base of the save area; the epilogues restore through it
  43. adrp x5,.Lsigma
  44. add x5,x5,:lo12:.Lsigma
  45. stp x19,x20,[sp,#16]
  46. stp x21,x22,[sp,#32]
  47. stp x23,x24,[sp,#48]
  48. stp x25,x26,[sp,#64]
  49. stp x27,x28,[sp,#80]
  50. sub sp,sp,#64 // 64 bytes of scratch, written only on the tail path
  51. ldp x22,x23,[x5] // load sigma
  52. ldp x24,x25,[x3] // load key
  53. ldp x26,x27,[x3,#16]
  54. ldp x28,x30,[x4] // load counter
  // x4 is free from here on and is reused as the round counter.
  // On big-endian builds the two 32-bit halves of each loaded pair are swapped
  // so that the lower-addressed word ends up in the low half of the register.
  55. #ifdef __ARMEB__
  56. ror x24,x24,#32
  57. ror x25,x25,#32
  58. ror x26,x26,#32
  59. ror x27,x27,#32
  60. ror x28,x28,#32
  61. ror x30,x30,#32
  62. #endif
  // x22-x28 and x30 hold the packed input state (two 32-bit words each) for the
  // whole call. Each outer iteration unpacks a fresh working copy, one word per
  // register, into w5-w17 and w19-w21.
  63. .Loop_outer:
  64. mov w5,w22 // unpack key block
  65. lsr x6,x22,#32
  66. mov w7,w23
  67. lsr x8,x23,#32
  68. mov w9,w24
  69. lsr x10,x24,#32
  70. mov w11,w25
  71. lsr x12,x25,#32
  72. mov w13,w26
  73. lsr x14,x26,#32
  74. mov w15,w27
  75. lsr x16,x27,#32
  76. mov w17,w28
  77. lsr x19,x28,#32
  78. mov w20,w30
  79. lsr x21,x30,#32
  80. mov x4,#10 // 10 passes of .Loop, each a column round plus a diagonal round
  81. subs x2,x2,#64 // flags are consumed by b.lo/b.hi after the rounds; nothing in between writes NZCV
  // Naming used in the comments below: a = w5-w8, b = w9-w12, c = w13-w16,
  // d = w17,w19,w20,w21. "ror #n" on a 32-bit register is a rotate left by 32-n.
  82. .Loop:
  83. sub x4,x4,#1
  84. add w5,w5,w9 // column round: a += b
  85. add w6,w6,w10
  86. add w7,w7,w11
  87. add w8,w8,w12
  88. eor w17,w17,w5 // d ^= a
  89. eor w19,w19,w6
  90. eor w20,w20,w7
  91. eor w21,w21,w8
  92. ror w17,w17,#16 // d = rotl(d,16)
  93. ror w19,w19,#16
  94. ror w20,w20,#16
  95. ror w21,w21,#16
  96. add w13,w13,w17 // c += d
  97. add w14,w14,w19
  98. add w15,w15,w20
  99. add w16,w16,w21
  100. eor w9,w9,w13 // b ^= c
  101. eor w10,w10,w14
  102. eor w11,w11,w15
  103. eor w12,w12,w16
  104. ror w9,w9,#20 // b = rotl(b,12)
  105. ror w10,w10,#20
  106. ror w11,w11,#20
  107. ror w12,w12,#20
  108. add w5,w5,w9 // a += b
  109. add w6,w6,w10
  110. add w7,w7,w11
  111. add w8,w8,w12
  112. eor w17,w17,w5 // d ^= a
  113. eor w19,w19,w6
  114. eor w20,w20,w7
  115. eor w21,w21,w8
  116. ror w17,w17,#24 // d = rotl(d,8)
  117. ror w19,w19,#24
  118. ror w20,w20,#24
  119. ror w21,w21,#24
  120. add w13,w13,w17 // c += d
  121. add w14,w14,w19
  122. add w15,w15,w20
  123. add w16,w16,w21
  124. eor w9,w9,w13 // b ^= c
  125. eor w10,w10,w14
  126. eor w11,w11,w15
  127. eor w12,w12,w16
  128. ror w9,w9,#25 // b = rotl(b,7)
  129. ror w10,w10,#25
  130. ror w11,w11,#25
  131. ror w12,w12,#25
  132. add w5,w5,w10 // diagonal round: same steps, with b, c and d operands taken from rotated positions
  133. add w6,w6,w11
  134. add w7,w7,w12
  135. add w8,w8,w9
  136. eor w21,w21,w5
  137. eor w17,w17,w6
  138. eor w19,w19,w7
  139. eor w20,w20,w8
  140. ror w21,w21,#16
  141. ror w17,w17,#16
  142. ror w19,w19,#16
  143. ror w20,w20,#16
  144. add w15,w15,w21
  145. add w16,w16,w17
  146. add w13,w13,w19
  147. add w14,w14,w20
  148. eor w10,w10,w15
  149. eor w11,w11,w16
  150. eor w12,w12,w13
  151. eor w9,w9,w14
  152. ror w10,w10,#20
  153. ror w11,w11,#20
  154. ror w12,w12,#20
  155. ror w9,w9,#20
  156. add w5,w5,w10
  157. add w6,w6,w11
  158. add w7,w7,w12
  159. add w8,w8,w9
  160. eor w21,w21,w5
  161. eor w17,w17,w6
  162. eor w19,w19,w7
  163. eor w20,w20,w8
  164. ror w21,w21,#24
  165. ror w17,w17,#24
  166. ror w19,w19,#24
  167. ror w20,w20,#24
  168. add w15,w15,w21
  169. add w16,w16,w17
  170. add w13,w13,w19
  171. add w14,w14,w20
  172. eor w10,w10,w15
  173. eor w11,w11,w16
  174. eor w12,w12,w13
  175. eor w9,w9,w14
  176. ror w10,w10,#25
  177. ror w11,w11,#25
  178. ror w12,w12,#25
  179. ror w9,w9,#25
  180. cbnz x4,.Loop
  // Add the input state back in. The 64-bit adds on the odd words can carry
  // into bit 32; the "lsl#32" in the pack step below shifts that bit out.
  181. add w5,w5,w22 // accumulate key block
  182. add x6,x6,x22,lsr#32
  183. add w7,w7,w23
  184. add x8,x8,x23,lsr#32
  185. add w9,w9,w24
  186. add x10,x10,x24,lsr#32
  187. add w11,w11,w25
  188. add x12,x12,x25,lsr#32
  189. add w13,w13,w26
  190. add x14,x14,x26,lsr#32
  191. add w15,w15,w27
  192. add x16,x16,x27,lsr#32
  193. add w17,w17,w28
  194. add x19,x19,x28,lsr#32
  195. add w20,w20,w30
  196. add x21,x21,x30,lsr#32
  197. b.lo .Ltail // fewer than 64 bytes were left before the subs: partial block
  // Full block: pack word pairs into 64-bit keystream values while loading
  // 64 input bytes into the registers just freed (x6,x8,x10,...).
  198. add x5,x5,x6,lsl#32 // pack
  199. add x7,x7,x8,lsl#32
  200. ldp x6,x8,[x1,#0] // load input
  201. add x9,x9,x10,lsl#32
  202. add x11,x11,x12,lsl#32
  203. ldp x10,x12,[x1,#16]
  204. add x13,x13,x14,lsl#32
  205. add x15,x15,x16,lsl#32
  206. ldp x14,x16,[x1,#32]
  207. add x17,x17,x19,lsl#32
  208. add x20,x20,x21,lsl#32
  209. ldp x19,x21,[x1,#48]
  210. add x1,x1,#64
  211. #ifdef __ARMEB__
  212. rev x5,x5
  213. rev x7,x7
  214. rev x9,x9
  215. rev x11,x11
  216. rev x13,x13
  217. rev x15,x15
  218. rev x17,x17
  219. rev x20,x20
  220. #endif
  221. eor x5,x5,x6
  222. eor x7,x7,x8
  223. eor x9,x9,x10
  224. eor x11,x11,x12
  225. eor x13,x13,x14
  226. eor x15,x15,x16
  227. eor x17,x17,x19
  228. eor x20,x20,x21
  229. stp x5,x7,[x0,#0] // store output
  230. add x28,x28,#1 // increment counter
  231. stp x9,x11,[x0,#16]
  232. stp x13,x15,[x0,#32]
  233. stp x17,x20,[x0,#48]
  234. add x0,x0,#64
  235. b.hi .Loop_outer // x2 was above 64 before the subs: more input remains
  // Epilogue: restore callee-saved registers through x29, drop the scratch area.
  236. ldp x19,x20,[x29,#16]
  237. add sp,sp,#64
  238. ldp x21,x22,[x29,#32]
  239. ldp x23,x24,[x29,#48]
  240. ldp x25,x26,[x29,#64]
  241. ldp x27,x28,[x29,#80]
  242. ldp x29,x30,[sp],#96
  243. AARCH64_VALIDATE_LINK_REGISTER
  244. .Labort:
  245. ret
  246. .align 4
  247. .Ltail:
  248. add x2,x2,#64 // undo the subs: x2 = bytes left, between 1 and 63
  // .Less_than_64 is also entered from .Ltail_neon in ChaCha20_neon, with the
  // scalar block still unpacked in w5-w21 and x2 holding the bytes left.
  249. .Less_than_64:
  250. sub x0,x0,#1 // bias by -1: the store in .Loop_tail indexes with x2 after its increment
  251. add x1,x1,x2 // x1 = end of the remaining input
  252. add x0,x0,x2
  253. add x4,sp,x2 // x4 = end of the needed keystream bytes in the scratch block
  254. neg x2,x2 // x2 = -(bytes left); counts up to zero
  255. add x5,x5,x6,lsl#32 // pack
  256. add x7,x7,x8,lsl#32
  257. add x9,x9,x10,lsl#32
  258. add x11,x11,x12,lsl#32
  259. add x13,x13,x14,lsl#32
  260. add x15,x15,x16,lsl#32
  261. add x17,x17,x19,lsl#32
  262. add x20,x20,x21,lsl#32
  263. #ifdef __ARMEB__
  264. rev x5,x5
  265. rev x7,x7
  266. rev x9,x9
  267. rev x11,x11
  268. rev x13,x13
  269. rev x15,x15
  270. rev x17,x17
  271. rev x20,x20
  272. #endif
  273. stp x5,x7,[sp,#0] // spill the keystream block to the scratch area for byte-wise access
  274. stp x9,x11,[sp,#16]
  275. stp x13,x15,[sp,#32]
  276. stp x17,x20,[sp,#48]
  277. .Loop_tail:
  278. ldrb w10,[x1,x2] // input byte
  279. ldrb w11,[x4,x2] // keystream byte
  280. add x2,x2,#1
  281. eor w10,w10,w11
  282. strb w10,[x0,x2]
  283. cbnz x2,.Loop_tail
  284. stp xzr,xzr,[sp,#0] // wipe the keystream block from the stack before returning
  285. stp xzr,xzr,[sp,#16]
  286. stp xzr,xzr,[sp,#32]
  287. stp xzr,xzr,[sp,#48]
  288. ldp x19,x20,[x29,#16]
  289. add sp,sp,#64
  290. ldp x21,x22,[x29,#32]
  291. ldp x23,x24,[x29,#48]
  292. ldp x25,x26,[x29,#64]
  293. ldp x27,x28,[x29,#80]
  294. ldp x29,x30,[sp],#96
  295. AARCH64_VALIDATE_LINK_REGISTER
  296. ret
  297. .size GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32
  298. .type ChaCha20_neon,%function
  299. .align 5
  //-----------------------------------------------------------------------
  // ChaCha20_neon - NEON-assisted path, reached by branch from
  // GFp_ChaCha20_ctr32 with the same register roles (x0 = out, x1 = in,
  // x2 = length, x3 = key pointer, x4 = counter pointer).
  // Lengths of 512 bytes or more branch on to .L512_or_more_neon. Otherwise
  // each outer iteration produces 256 bytes of keystream: one block on the
  // general-purpose registers (same layout as the scalar path) interleaved
  // with three blocks held in v0-v3, v4-v7 and v16-v19.
  // Only v0-v7 and v16-v31 are written on this path, and no SIMD registers
  // are saved. The stack frame matches the scalar path (96-byte save area
  // addressed through x29, plus 64 bytes of scratch at sp), which is what
  // lets .Ltail_neon jump into .Less_than_64.
  //-----------------------------------------------------------------------
  300. ChaCha20_neon:
  301. AARCH64_SIGN_LINK_REGISTER
  302. stp x29,x30,[sp,#-96]!
  303. add x29,sp,#0
  304. adrp x5,.Lsigma
  305. add x5,x5,:lo12:.Lsigma
  306. stp x19,x20,[sp,#16]
  307. stp x21,x22,[sp,#32]
  308. stp x23,x24,[sp,#48]
  309. stp x25,x26,[sp,#64]
  310. stp x27,x28,[sp,#80]
  311. cmp x2,#512
  312. b.hs .L512_or_more_neon // 512 bytes or more: continue in the code after .L512_or_more_neon
  313. sub sp,sp,#64 // 64 bytes of scratch for a final partial block
  314. ldp x22,x23,[x5] // load sigma
  315. ld1 {v24.4s},[x5],#16 // v24 = sigma row; x5 now points at .Lone
  316. ldp x24,x25,[x3] // load key
  317. ldp x26,x27,[x3,#16]
  318. ld1 {v25.4s,v26.4s},[x3] // v25,v26 = key rows
  319. ldp x28,x30,[x4] // load counter
  320. ld1 {v27.4s},[x4] // v27 = counter row
  321. ld1 {v31.4s},[x5] // v31 = {1,0,0,0}
  322. #ifdef __ARMEB__
  323. rev64 v24.4s,v24.4s
  324. ror x24,x24,#32
  325. ror x25,x25,#32
  326. ror x26,x26,#32
  327. ror x27,x27,#32
  328. ror x28,x28,#32
  329. ror x30,x30,#32
  330. #endif
  // The general-purpose block uses the counter as loaded (x28); the three NEON
  // blocks use it with lane 0 advanced by 1, 2 and 3 (v27, v28, v29).
  331. add v27.4s,v27.4s,v31.4s // += 1
  332. add v28.4s,v27.4s,v31.4s
  333. add v29.4s,v28.4s,v31.4s
  334. shl v31.4s,v31.4s,#2 // 1 -> 4
  // Per-iteration setup: unpack the scalar working copy (w5-w17, w19-w21) and
  // copy the rows into the three NEON working states. Rows: v0/v4/v16 = sigma,
  // v1/v5/v17 and v2/v6/v18 = key, v3/v7/v19 = counter rows v27/v28/v29.
  335. .Loop_outer_neon:
  336. mov w5,w22 // unpack key block
  337. lsr x6,x22,#32
  338. mov v0.16b,v24.16b
  339. mov w7,w23
  340. lsr x8,x23,#32
  341. mov v4.16b,v24.16b
  342. mov w9,w24
  343. lsr x10,x24,#32
  344. mov v16.16b,v24.16b
  345. mov w11,w25
  346. mov v1.16b,v25.16b
  347. lsr x12,x25,#32
  348. mov v5.16b,v25.16b
  349. mov w13,w26
  350. mov v17.16b,v25.16b
  351. lsr x14,x26,#32
  352. mov v3.16b,v27.16b
  353. mov w15,w27
  354. mov v7.16b,v28.16b
  355. lsr x16,x27,#32
  356. mov v19.16b,v29.16b
  357. mov w17,w28
  358. mov v2.16b,v26.16b
  359. lsr x19,x28,#32
  360. mov v6.16b,v26.16b
  361. mov w20,w30
  362. mov v18.16b,v26.16b
  363. lsr x21,x30,#32
  364. mov x4,#10 // 10 passes of .Loop_neon, each a column round plus a diagonal round
  365. subs x2,x2,#256 // flags are consumed by b.lo .Ltail_neon / b.hi .Loop_outer_neon; nothing in between writes NZCV
  // Round loop: scalar and NEON instructions for the same round are interleaved.
  // NEON rotate idioms used below:
  //   rev32 on .8h lanes         = rotate each 32-bit lane by 16
  //   ushr #(32-n) then sli #n   = rotate each 32-bit lane left by n (n = 12, 8, 7)
  // The ext instructions rotate rows 1-3 by whole lanes: by 1, 2 and 3 lanes
  // after the first half (columns -> diagonals) and by 3, 2 and 1 lanes after
  // the second half (back to columns). v20-v22 are temporaries.
  366. .Loop_neon:
  367. sub x4,x4,#1
  368. add v0.4s,v0.4s,v1.4s
  369. add w5,w5,w9
  370. add v4.4s,v4.4s,v5.4s
  371. add w6,w6,w10
  372. add v16.4s,v16.4s,v17.4s
  373. add w7,w7,w11
  374. eor v3.16b,v3.16b,v0.16b
  375. add w8,w8,w12
  376. eor v7.16b,v7.16b,v4.16b
  377. eor w17,w17,w5
  378. eor v19.16b,v19.16b,v16.16b
  379. eor w19,w19,w6
  380. rev32 v3.8h,v3.8h
  381. eor w20,w20,w7
  382. rev32 v7.8h,v7.8h
  383. eor w21,w21,w8
  384. rev32 v19.8h,v19.8h
  385. ror w17,w17,#16
  386. add v2.4s,v2.4s,v3.4s
  387. ror w19,w19,#16
  388. add v6.4s,v6.4s,v7.4s
  389. ror w20,w20,#16
  390. add v18.4s,v18.4s,v19.4s
  391. ror w21,w21,#16
  392. eor v20.16b,v1.16b,v2.16b
  393. add w13,w13,w17
  394. eor v21.16b,v5.16b,v6.16b
  395. add w14,w14,w19
  396. eor v22.16b,v17.16b,v18.16b
  397. add w15,w15,w20
  398. ushr v1.4s,v20.4s,#20
  399. add w16,w16,w21
  400. ushr v5.4s,v21.4s,#20
  401. eor w9,w9,w13
  402. ushr v17.4s,v22.4s,#20
  403. eor w10,w10,w14
  404. sli v1.4s,v20.4s,#12
  405. eor w11,w11,w15
  406. sli v5.4s,v21.4s,#12
  407. eor w12,w12,w16
  408. sli v17.4s,v22.4s,#12
  409. ror w9,w9,#20
  410. add v0.4s,v0.4s,v1.4s
  411. ror w10,w10,#20
  412. add v4.4s,v4.4s,v5.4s
  413. ror w11,w11,#20
  414. add v16.4s,v16.4s,v17.4s
  415. ror w12,w12,#20
  416. eor v20.16b,v3.16b,v0.16b
  417. add w5,w5,w9
  418. eor v21.16b,v7.16b,v4.16b
  419. add w6,w6,w10
  420. eor v22.16b,v19.16b,v16.16b
  421. add w7,w7,w11
  422. ushr v3.4s,v20.4s,#24
  423. add w8,w8,w12
  424. ushr v7.4s,v21.4s,#24
  425. eor w17,w17,w5
  426. ushr v19.4s,v22.4s,#24
  427. eor w19,w19,w6
  428. sli v3.4s,v20.4s,#8
  429. eor w20,w20,w7
  430. sli v7.4s,v21.4s,#8
  431. eor w21,w21,w8
  432. sli v19.4s,v22.4s,#8
  433. ror w17,w17,#24
  434. add v2.4s,v2.4s,v3.4s
  435. ror w19,w19,#24
  436. add v6.4s,v6.4s,v7.4s
  437. ror w20,w20,#24
  438. add v18.4s,v18.4s,v19.4s
  439. ror w21,w21,#24
  440. eor v20.16b,v1.16b,v2.16b
  441. add w13,w13,w17
  442. eor v21.16b,v5.16b,v6.16b
  443. add w14,w14,w19
  444. eor v22.16b,v17.16b,v18.16b
  445. add w15,w15,w20
  446. ushr v1.4s,v20.4s,#25
  447. add w16,w16,w21
  448. ushr v5.4s,v21.4s,#25
  449. eor w9,w9,w13
  450. ushr v17.4s,v22.4s,#25
  451. eor w10,w10,w14
  452. sli v1.4s,v20.4s,#7
  453. eor w11,w11,w15
  454. sli v5.4s,v21.4s,#7
  455. eor w12,w12,w16
  456. sli v17.4s,v22.4s,#7
  457. ror w9,w9,#25
  458. ext v2.16b,v2.16b,v2.16b,#8
  459. ror w10,w10,#25
  460. ext v6.16b,v6.16b,v6.16b,#8
  461. ror w11,w11,#25
  462. ext v18.16b,v18.16b,v18.16b,#8
  463. ror w12,w12,#25
  464. ext v3.16b,v3.16b,v3.16b,#12
  465. ext v7.16b,v7.16b,v7.16b,#12
  466. ext v19.16b,v19.16b,v19.16b,#12
  467. ext v1.16b,v1.16b,v1.16b,#4
  468. ext v5.16b,v5.16b,v5.16b,#4
  469. ext v17.16b,v17.16b,v17.16b,#4
  // Second half of the pass: diagonal round.
  470. add v0.4s,v0.4s,v1.4s
  471. add w5,w5,w10
  472. add v4.4s,v4.4s,v5.4s
  473. add w6,w6,w11
  474. add v16.4s,v16.4s,v17.4s
  475. add w7,w7,w12
  476. eor v3.16b,v3.16b,v0.16b
  477. add w8,w8,w9
  478. eor v7.16b,v7.16b,v4.16b
  479. eor w21,w21,w5
  480. eor v19.16b,v19.16b,v16.16b
  481. eor w17,w17,w6
  482. rev32 v3.8h,v3.8h
  483. eor w19,w19,w7
  484. rev32 v7.8h,v7.8h
  485. eor w20,w20,w8
  486. rev32 v19.8h,v19.8h
  487. ror w21,w21,#16
  488. add v2.4s,v2.4s,v3.4s
  489. ror w17,w17,#16
  490. add v6.4s,v6.4s,v7.4s
  491. ror w19,w19,#16
  492. add v18.4s,v18.4s,v19.4s
  493. ror w20,w20,#16
  494. eor v20.16b,v1.16b,v2.16b
  495. add w15,w15,w21
  496. eor v21.16b,v5.16b,v6.16b
  497. add w16,w16,w17
  498. eor v22.16b,v17.16b,v18.16b
  499. add w13,w13,w19
  500. ushr v1.4s,v20.4s,#20
  501. add w14,w14,w20
  502. ushr v5.4s,v21.4s,#20
  503. eor w10,w10,w15
  504. ushr v17.4s,v22.4s,#20
  505. eor w11,w11,w16
  506. sli v1.4s,v20.4s,#12
  507. eor w12,w12,w13
  508. sli v5.4s,v21.4s,#12
  509. eor w9,w9,w14
  510. sli v17.4s,v22.4s,#12
  511. ror w10,w10,#20
  512. add v0.4s,v0.4s,v1.4s
  513. ror w11,w11,#20
  514. add v4.4s,v4.4s,v5.4s
  515. ror w12,w12,#20
  516. add v16.4s,v16.4s,v17.4s
  517. ror w9,w9,#20
  518. eor v20.16b,v3.16b,v0.16b
  519. add w5,w5,w10
  520. eor v21.16b,v7.16b,v4.16b
  521. add w6,w6,w11
  522. eor v22.16b,v19.16b,v16.16b
  523. add w7,w7,w12
  524. ushr v3.4s,v20.4s,#24
  525. add w8,w8,w9
  526. ushr v7.4s,v21.4s,#24
  527. eor w21,w21,w5
  528. ushr v19.4s,v22.4s,#24
  529. eor w17,w17,w6
  530. sli v3.4s,v20.4s,#8
  531. eor w19,w19,w7
  532. sli v7.4s,v21.4s,#8
  533. eor w20,w20,w8
  534. sli v19.4s,v22.4s,#8
  535. ror w21,w21,#24
  536. add v2.4s,v2.4s,v3.4s
  537. ror w17,w17,#24
  538. add v6.4s,v6.4s,v7.4s
  539. ror w19,w19,#24
  540. add v18.4s,v18.4s,v19.4s
  541. ror w20,w20,#24
  542. eor v20.16b,v1.16b,v2.16b
  543. add w15,w15,w21
  544. eor v21.16b,v5.16b,v6.16b
  545. add w16,w16,w17
  546. eor v22.16b,v17.16b,v18.16b
  547. add w13,w13,w19
  548. ushr v1.4s,v20.4s,#25
  549. add w14,w14,w20
  550. ushr v5.4s,v21.4s,#25
  551. eor w10,w10,w15
  552. ushr v17.4s,v22.4s,#25
  553. eor w11,w11,w16
  554. sli v1.4s,v20.4s,#7
  555. eor w12,w12,w13
  556. sli v5.4s,v21.4s,#7
  557. eor w9,w9,w14
  558. sli v17.4s,v22.4s,#7
  559. ror w10,w10,#25
  560. ext v2.16b,v2.16b,v2.16b,#8
  561. ror w11,w11,#25
  562. ext v6.16b,v6.16b,v6.16b,#8
  563. ror w12,w12,#25
  564. ext v18.16b,v18.16b,v18.16b,#8
  565. ror w9,w9,#25
  566. ext v3.16b,v3.16b,v3.16b,#4
  567. ext v7.16b,v7.16b,v7.16b,#4
  568. ext v19.16b,v19.16b,v19.16b,#4
  569. ext v1.16b,v1.16b,v1.16b,#12
  570. ext v5.16b,v5.16b,v5.16b,#12
  571. ext v17.16b,v17.16b,v17.16b,#12
  572. cbnz x4,.Loop_neon
  // Add the input state back into all four blocks. Each NEON block gets the
  // counter row it started from (v27, v28 or v29).
  573. add w5,w5,w22 // accumulate key block
  574. add v0.4s,v0.4s,v24.4s
  575. add x6,x6,x22,lsr#32
  576. add v4.4s,v4.4s,v24.4s
  577. add w7,w7,w23
  578. add v16.4s,v16.4s,v24.4s
  579. add x8,x8,x23,lsr#32
  580. add v2.4s,v2.4s,v26.4s
  581. add w9,w9,w24
  582. add v6.4s,v6.4s,v26.4s
  583. add x10,x10,x24,lsr#32
  584. add v18.4s,v18.4s,v26.4s
  585. add w11,w11,w25
  586. add v3.4s,v3.4s,v27.4s
  587. add x12,x12,x25,lsr#32
  588. add w13,w13,w26
  589. add v7.4s,v7.4s,v28.4s
  590. add x14,x14,x26,lsr#32
  591. add w15,w15,w27
  592. add v19.4s,v19.4s,v29.4s
  593. add x16,x16,x27,lsr#32
  594. add w17,w17,w28
  595. add v1.4s,v1.4s,v25.4s
  596. add x19,x19,x28,lsr#32
  597. add w20,w20,w30
  598. add v5.4s,v5.4s,v25.4s
  599. add x21,x21,x30,lsr#32
  600. add v17.4s,v17.4s,v25.4s
  601. b.lo .Ltail_neon // fewer than 256 bytes were left before the subs
  // Full 256 bytes: the scalar block covers input bytes 0-63, then the NEON
  // blocks v0-v3, v4-v7 and v16-v19 cover the next three 64-byte chunks.
  602. add x5,x5,x6,lsl#32 // pack
  603. add x7,x7,x8,lsl#32
  604. ldp x6,x8,[x1,#0] // load input
  605. add x9,x9,x10,lsl#32
  606. add x11,x11,x12,lsl#32
  607. ldp x10,x12,[x1,#16]
  608. add x13,x13,x14,lsl#32
  609. add x15,x15,x16,lsl#32
  610. ldp x14,x16,[x1,#32]
  611. add x17,x17,x19,lsl#32
  612. add x20,x20,x21,lsl#32
  613. ldp x19,x21,[x1,#48]
  614. add x1,x1,#64
  615. #ifdef __ARMEB__
  616. rev x5,x5
  617. rev x7,x7
  618. rev x9,x9
  619. rev x11,x11
  620. rev x13,x13
  621. rev x15,x15
  622. rev x17,x17
  623. rev x20,x20
  624. #endif
  625. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  626. eor x5,x5,x6
  627. eor x7,x7,x8
  628. eor x9,x9,x10
  629. eor x11,x11,x12
  630. eor x13,x13,x14
  631. eor v0.16b,v0.16b,v20.16b
  632. eor x15,x15,x16
  633. eor v1.16b,v1.16b,v21.16b
  634. eor x17,x17,x19
  635. eor v2.16b,v2.16b,v22.16b
  636. eor x20,x20,x21
  637. eor v3.16b,v3.16b,v23.16b
  638. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  639. stp x5,x7,[x0,#0] // store output
  640. add x28,x28,#4 // increment counter
  641. stp x9,x11,[x0,#16]
  642. add v27.4s,v27.4s,v31.4s // += 4
  643. stp x13,x15,[x0,#32]
  644. add v28.4s,v28.4s,v31.4s
  645. stp x17,x20,[x0,#48]
  646. add v29.4s,v29.4s,v31.4s
  647. add x0,x0,#64
  648. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  649. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // v0-v3 are free again: reuse them for the last input chunk
  650. eor v4.16b,v4.16b,v20.16b
  651. eor v5.16b,v5.16b,v21.16b
  652. eor v6.16b,v6.16b,v22.16b
  653. eor v7.16b,v7.16b,v23.16b
  654. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  655. eor v16.16b,v16.16b,v0.16b
  656. eor v17.16b,v17.16b,v1.16b
  657. eor v18.16b,v18.16b,v2.16b
  658. eor v19.16b,v19.16b,v3.16b
  659. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  660. b.hi .Loop_outer_neon // x2 was above 256 before the subs: more input remains
  661. ldp x19,x20,[x29,#16]
  662. add sp,sp,#64
  663. ldp x21,x22,[x29,#32]
  664. ldp x23,x24,[x29,#48]
  665. ldp x25,x26,[x29,#64]
  666. ldp x27,x28,[x29,#80]
  667. ldp x29,x30,[sp],#96
  668. AARCH64_VALIDATE_LINK_REGISTER
  669. ret
  // Tail: between 1 and 255 bytes remain. Whole 64-byte chunks are handled
  // block by block in the order scalar, v0-v3, v4-v7; the keystream block that
  // covers the final partial chunk is spilled to the scratch area at sp.
  670. .Ltail_neon:
  671. add x2,x2,#256 // undo the subs: x2 = bytes left
  672. cmp x2,#64 // these flags also feed the b.eq .Ldone_neon after the scalar block is stored
  673. b.lo .Less_than_64 // under one block: reuse the scalar tail inside GFp_ChaCha20_ctr32
  674. add x5,x5,x6,lsl#32 // pack
  675. add x7,x7,x8,lsl#32
  676. ldp x6,x8,[x1,#0] // load input
  677. add x9,x9,x10,lsl#32
  678. add x11,x11,x12,lsl#32
  679. ldp x10,x12,[x1,#16]
  680. add x13,x13,x14,lsl#32
  681. add x15,x15,x16,lsl#32
  682. ldp x14,x16,[x1,#32]
  683. add x17,x17,x19,lsl#32
  684. add x20,x20,x21,lsl#32
  685. ldp x19,x21,[x1,#48]
  686. add x1,x1,#64
  687. #ifdef __ARMEB__
  688. rev x5,x5
  689. rev x7,x7
  690. rev x9,x9
  691. rev x11,x11
  692. rev x13,x13
  693. rev x15,x15
  694. rev x17,x17
  695. rev x20,x20
  696. #endif
  697. eor x5,x5,x6
  698. eor x7,x7,x8
  699. eor x9,x9,x10
  700. eor x11,x11,x12
  701. eor x13,x13,x14
  702. eor x15,x15,x16
  703. eor x17,x17,x19
  704. eor x20,x20,x21
  705. stp x5,x7,[x0,#0] // store output
  706. add x28,x28,#4 // increment counter
  707. stp x9,x11,[x0,#16]
  708. stp x13,x15,[x0,#32]
  709. stp x17,x20,[x0,#48]
  710. add x0,x0,#64
  711. b.eq .Ldone_neon // exactly 64 bytes were left
  712. sub x2,x2,#64
  713. cmp x2,#64
  714. b.lo .Less_than_128 // partial second chunk: its keystream is in v0-v3
  715. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  716. eor v0.16b,v0.16b,v20.16b
  717. eor v1.16b,v1.16b,v21.16b
  718. eor v2.16b,v2.16b,v22.16b
  719. eor v3.16b,v3.16b,v23.16b
  720. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  721. b.eq .Ldone_neon // flags still from the cmp above: exactly 128 bytes were left
  722. sub x2,x2,#64
  723. cmp x2,#64
  724. b.lo .Less_than_192 // partial third chunk: its keystream is in v4-v7
  725. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  726. eor v4.16b,v4.16b,v20.16b
  727. eor v5.16b,v5.16b,v21.16b
  728. eor v6.16b,v6.16b,v22.16b
  729. eor v7.16b,v7.16b,v23.16b
  730. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  731. b.eq .Ldone_neon // exactly 192 bytes were left
  732. sub x2,x2,#64 // no compare needed: fewer than 256 bytes were left on entry, so at most 63 remain
  733. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] // keystream of the fourth block -> scratch
  734. b .Last_neon
  735. .Less_than_128:
  736. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
  737. b .Last_neon
  738. .Less_than_192:
  739. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
  740. b .Last_neon
  741. .align 4
  // Byte-wise XOR of the last x2 bytes against the keystream block at sp.
  // Same indexing scheme as .Loop_tail: pointers are moved to the end and a
  // negative index counts up to zero; x0 is biased by -1 because the store
  // uses the index after it has been incremented.
  742. .Last_neon:
  743. sub x0,x0,#1
  744. add x1,x1,x2
  745. add x0,x0,x2
  746. add x4,sp,x2
  747. neg x2,x2
  748. .Loop_tail_neon:
  749. ldrb w10,[x1,x2]
  750. ldrb w11,[x4,x2]
  751. add x2,x2,#1
  752. eor w10,w10,w11
  753. strb w10,[x0,x2]
  754. cbnz x2,.Loop_tail_neon
  755. stp xzr,xzr,[sp,#0] // wipe the keystream block from the stack before returning
  756. stp xzr,xzr,[sp,#16]
  757. stp xzr,xzr,[sp,#32]
  758. stp xzr,xzr,[sp,#48]
  759. .Ldone_neon:
  760. ldp x19,x20,[x29,#16]
  761. add sp,sp,#64
  762. ldp x21,x22,[x29,#32]
  763. ldp x23,x24,[x29,#48]
  764. ldp x25,x26,[x29,#64]
  765. ldp x27,x28,[x29,#80]
  766. ldp x29,x30,[sp],#96
  767. AARCH64_VALIDATE_LINK_REGISTER
  768. ret
  769. .size ChaCha20_neon,.-ChaCha20_neon
  770. .type ChaCha20_512_neon,%function
  771. .align 5
  772. ChaCha20_512_neon:
  773. AARCH64_SIGN_LINK_REGISTER
  774. stp x29,x30,[sp,#-96]!
  775. add x29,sp,#0
  776. adrp x5,.Lsigma
  777. add x5,x5,:lo12:.Lsigma
  778. stp x19,x20,[sp,#16]
  779. stp x21,x22,[sp,#32]
  780. stp x23,x24,[sp,#48]
  781. stp x25,x26,[sp,#64]
  782. stp x27,x28,[sp,#80]
  783. .L512_or_more_neon:
  784. sub sp,sp,#128+64
  785. ldp x22,x23,[x5] // load sigma
  786. ld1 {v24.4s},[x5],#16
  787. ldp x24,x25,[x3] // load key
  788. ldp x26,x27,[x3,#16]
  789. ld1 {v25.4s,v26.4s},[x3]
  790. ldp x28,x30,[x4] // load counter
  791. ld1 {v27.4s},[x4]
  792. ld1 {v31.4s},[x5]
  793. #ifdef __ARMEB__
  794. rev64 v24.4s,v24.4s
  795. ror x24,x24,#32
  796. ror x25,x25,#32
  797. ror x26,x26,#32
  798. ror x27,x27,#32
  799. ror x28,x28,#32
  800. ror x30,x30,#32
  801. #endif
  802. add v27.4s,v27.4s,v31.4s // += 1
  803. stp q24,q25,[sp,#0] // off-load key block, invariant part
  804. add v27.4s,v27.4s,v31.4s // not typo
  805. str q26,[sp,#32]
  806. add v28.4s,v27.4s,v31.4s
  807. add v29.4s,v28.4s,v31.4s
  808. add v30.4s,v29.4s,v31.4s
  809. shl v31.4s,v31.4s,#2 // 1 -> 4
  810. stp d8,d9,[sp,#128+0] // meet ABI requirements
  811. stp d10,d11,[sp,#128+16]
  812. stp d12,d13,[sp,#128+32]
  813. stp d14,d15,[sp,#128+48]
  814. sub x2,x2,#512 // not typo
  815. .Loop_outer_512_neon:
  816. mov v0.16b,v24.16b
  817. mov v4.16b,v24.16b
  818. mov v8.16b,v24.16b
  819. mov v12.16b,v24.16b
  820. mov v16.16b,v24.16b
  821. mov v20.16b,v24.16b
  822. mov v1.16b,v25.16b
  823. mov w5,w22 // unpack key block
  824. mov v5.16b,v25.16b
  825. lsr x6,x22,#32
  826. mov v9.16b,v25.16b
  827. mov w7,w23
  828. mov v13.16b,v25.16b
  829. lsr x8,x23,#32
  830. mov v17.16b,v25.16b
  831. mov w9,w24
  832. mov v21.16b,v25.16b
  833. lsr x10,x24,#32
  834. mov v3.16b,v27.16b
  835. mov w11,w25
  836. mov v7.16b,v28.16b
  837. lsr x12,x25,#32
  838. mov v11.16b,v29.16b
  839. mov w13,w26
  840. mov v15.16b,v30.16b
  841. lsr x14,x26,#32
  842. mov v2.16b,v26.16b
  843. mov w15,w27
  844. mov v6.16b,v26.16b
  845. lsr x16,x27,#32
  846. add v19.4s,v3.4s,v31.4s // +4
  847. mov w17,w28
  848. add v23.4s,v7.4s,v31.4s // +4
  849. lsr x19,x28,#32
  850. mov v10.16b,v26.16b
  851. mov w20,w30
  852. mov v14.16b,v26.16b
  853. lsr x21,x30,#32
  854. mov v18.16b,v26.16b
  855. stp q27,q28,[sp,#48] // off-load key block, variable part
  856. mov v22.16b,v26.16b
  857. str q29,[sp,#80]
  858. mov x4,#5
  859. subs x2,x2,#512
  860. .Loop_upper_neon:
  861. sub x4,x4,#1
  862. add v0.4s,v0.4s,v1.4s
  863. add w5,w5,w9
  864. add v4.4s,v4.4s,v5.4s
  865. add w6,w6,w10
  866. add v8.4s,v8.4s,v9.4s
  867. add w7,w7,w11
  868. add v12.4s,v12.4s,v13.4s
  869. add w8,w8,w12
  870. add v16.4s,v16.4s,v17.4s
  871. eor w17,w17,w5
  872. add v20.4s,v20.4s,v21.4s
  873. eor w19,w19,w6
  874. eor v3.16b,v3.16b,v0.16b
  875. eor w20,w20,w7
  876. eor v7.16b,v7.16b,v4.16b
  877. eor w21,w21,w8
  878. eor v11.16b,v11.16b,v8.16b
  879. ror w17,w17,#16
  880. eor v15.16b,v15.16b,v12.16b
  881. ror w19,w19,#16
  882. eor v19.16b,v19.16b,v16.16b
  883. ror w20,w20,#16
  884. eor v23.16b,v23.16b,v20.16b
  885. ror w21,w21,#16
  886. rev32 v3.8h,v3.8h
  887. add w13,w13,w17
  888. rev32 v7.8h,v7.8h
  889. add w14,w14,w19
  890. rev32 v11.8h,v11.8h
  891. add w15,w15,w20
  892. rev32 v15.8h,v15.8h
  893. add w16,w16,w21
  894. rev32 v19.8h,v19.8h
  895. eor w9,w9,w13
  896. rev32 v23.8h,v23.8h
  897. eor w10,w10,w14
  898. add v2.4s,v2.4s,v3.4s
  899. eor w11,w11,w15
  900. add v6.4s,v6.4s,v7.4s
  901. eor w12,w12,w16
  902. add v10.4s,v10.4s,v11.4s
  903. ror w9,w9,#20
  904. add v14.4s,v14.4s,v15.4s
  905. ror w10,w10,#20
  906. add v18.4s,v18.4s,v19.4s
  907. ror w11,w11,#20
  908. add v22.4s,v22.4s,v23.4s
  909. ror w12,w12,#20
  910. eor v24.16b,v1.16b,v2.16b
  911. add w5,w5,w9
  912. eor v25.16b,v5.16b,v6.16b
  913. add w6,w6,w10
  914. eor v26.16b,v9.16b,v10.16b
  915. add w7,w7,w11
  916. eor v27.16b,v13.16b,v14.16b
  917. add w8,w8,w12
  918. eor v28.16b,v17.16b,v18.16b
  919. eor w17,w17,w5
  920. eor v29.16b,v21.16b,v22.16b
  921. eor w19,w19,w6
  922. ushr v1.4s,v24.4s,#20
  923. eor w20,w20,w7
  924. ushr v5.4s,v25.4s,#20
  925. eor w21,w21,w8
  926. ushr v9.4s,v26.4s,#20
  927. ror w17,w17,#24
  928. ushr v13.4s,v27.4s,#20
  929. ror w19,w19,#24
  930. ushr v17.4s,v28.4s,#20
  931. ror w20,w20,#24
  932. ushr v21.4s,v29.4s,#20
  933. ror w21,w21,#24
  934. sli v1.4s,v24.4s,#12
  935. add w13,w13,w17
  936. sli v5.4s,v25.4s,#12
  937. add w14,w14,w19
  938. sli v9.4s,v26.4s,#12
  939. add w15,w15,w20
  940. sli v13.4s,v27.4s,#12
  941. add w16,w16,w21
  942. sli v17.4s,v28.4s,#12
  943. eor w9,w9,w13
  944. sli v21.4s,v29.4s,#12
  945. eor w10,w10,w14
  946. add v0.4s,v0.4s,v1.4s
  947. eor w11,w11,w15
  948. add v4.4s,v4.4s,v5.4s
  949. eor w12,w12,w16
  950. add v8.4s,v8.4s,v9.4s
  951. ror w9,w9,#25
  952. add v12.4s,v12.4s,v13.4s
  953. ror w10,w10,#25
  954. add v16.4s,v16.4s,v17.4s
  955. ror w11,w11,#25
  956. add v20.4s,v20.4s,v21.4s
  957. ror w12,w12,#25
  958. eor v24.16b,v3.16b,v0.16b
  959. add w5,w5,w10
  960. eor v25.16b,v7.16b,v4.16b
  961. add w6,w6,w11
  962. eor v26.16b,v11.16b,v8.16b
  963. add w7,w7,w12
  964. eor v27.16b,v15.16b,v12.16b
  965. add w8,w8,w9
  966. eor v28.16b,v19.16b,v16.16b
  967. eor w21,w21,w5
  968. eor v29.16b,v23.16b,v20.16b
  969. eor w17,w17,w6
  970. ushr v3.4s,v24.4s,#24
  971. eor w19,w19,w7
  972. ushr v7.4s,v25.4s,#24
  973. eor w20,w20,w8
  974. ushr v11.4s,v26.4s,#24
  975. ror w21,w21,#16
  976. ushr v15.4s,v27.4s,#24
  977. ror w17,w17,#16
  978. ushr v19.4s,v28.4s,#24
  979. ror w19,w19,#16
  980. ushr v23.4s,v29.4s,#24
  981. ror w20,w20,#16
  982. sli v3.4s,v24.4s,#8
  983. add w15,w15,w21
  984. sli v7.4s,v25.4s,#8
  985. add w16,w16,w17
  986. sli v11.4s,v26.4s,#8
  987. add w13,w13,w19
  988. sli v15.4s,v27.4s,#8
  989. add w14,w14,w20
  990. sli v19.4s,v28.4s,#8
  991. eor w10,w10,w15
  992. sli v23.4s,v29.4s,#8
  993. eor w11,w11,w16
  994. add v2.4s,v2.4s,v3.4s
  995. eor w12,w12,w13
  996. add v6.4s,v6.4s,v7.4s
  997. eor w9,w9,w14
  998. add v10.4s,v10.4s,v11.4s
  999. ror w10,w10,#20
  1000. add v14.4s,v14.4s,v15.4s
  1001. ror w11,w11,#20
  1002. add v18.4s,v18.4s,v19.4s
  1003. ror w12,w12,#20
  1004. add v22.4s,v22.4s,v23.4s
  1005. ror w9,w9,#20
  1006. eor v24.16b,v1.16b,v2.16b
  1007. add w5,w5,w10
  1008. eor v25.16b,v5.16b,v6.16b
  1009. add w6,w6,w11
  1010. eor v26.16b,v9.16b,v10.16b
  1011. add w7,w7,w12
  1012. eor v27.16b,v13.16b,v14.16b
  1013. add w8,w8,w9
  1014. eor v28.16b,v17.16b,v18.16b
  1015. eor w21,w21,w5
  1016. eor v29.16b,v21.16b,v22.16b
  1017. eor w17,w17,w6
  1018. ushr v1.4s,v24.4s,#25
  1019. eor w19,w19,w7
  1020. ushr v5.4s,v25.4s,#25
  1021. eor w20,w20,w8
  1022. ushr v9.4s,v26.4s,#25
  1023. ror w21,w21,#24
  1024. ushr v13.4s,v27.4s,#25
  1025. ror w17,w17,#24
  1026. ushr v17.4s,v28.4s,#25
  1027. ror w19,w19,#24
  1028. ushr v21.4s,v29.4s,#25
  1029. ror w20,w20,#24
  1030. sli v1.4s,v24.4s,#7
  1031. add w15,w15,w21
  1032. sli v5.4s,v25.4s,#7
  1033. add w16,w16,w17
  1034. sli v9.4s,v26.4s,#7
  1035. add w13,w13,w19
  1036. sli v13.4s,v27.4s,#7
  1037. add w14,w14,w20
  1038. sli v17.4s,v28.4s,#7
  1039. eor w10,w10,w15
  1040. sli v21.4s,v29.4s,#7
  1041. eor w11,w11,w16
  1042. ext v2.16b,v2.16b,v2.16b,#8
  1043. eor w12,w12,w13
  1044. ext v6.16b,v6.16b,v6.16b,#8
  1045. eor w9,w9,w14
  1046. ext v10.16b,v10.16b,v10.16b,#8
  1047. ror w10,w10,#25
  1048. ext v14.16b,v14.16b,v14.16b,#8
  1049. ror w11,w11,#25
  1050. ext v18.16b,v18.16b,v18.16b,#8
  1051. ror w12,w12,#25
  1052. ext v22.16b,v22.16b,v22.16b,#8
  1053. ror w9,w9,#25
  1054. ext v3.16b,v3.16b,v3.16b,#12
  1055. ext v7.16b,v7.16b,v7.16b,#12
  1056. ext v11.16b,v11.16b,v11.16b,#12
  1057. ext v15.16b,v15.16b,v15.16b,#12
  1058. ext v19.16b,v19.16b,v19.16b,#12
  1059. ext v23.16b,v23.16b,v23.16b,#12
  1060. ext v1.16b,v1.16b,v1.16b,#4
  1061. ext v5.16b,v5.16b,v5.16b,#4
  1062. ext v9.16b,v9.16b,v9.16b,#4
  1063. ext v13.16b,v13.16b,v13.16b,#4
  1064. ext v17.16b,v17.16b,v17.16b,#4
  1065. ext v21.16b,v21.16b,v21.16b,#4
  1066. add v0.4s,v0.4s,v1.4s
  1067. add w5,w5,w9
  1068. add v4.4s,v4.4s,v5.4s
  1069. add w6,w6,w10
  1070. add v8.4s,v8.4s,v9.4s
  1071. add w7,w7,w11
  1072. add v12.4s,v12.4s,v13.4s
  1073. add w8,w8,w12
  1074. add v16.4s,v16.4s,v17.4s
  1075. eor w17,w17,w5
  1076. add v20.4s,v20.4s,v21.4s
  1077. eor w19,w19,w6
  1078. eor v3.16b,v3.16b,v0.16b
  1079. eor w20,w20,w7
  1080. eor v7.16b,v7.16b,v4.16b
  1081. eor w21,w21,w8
  1082. eor v11.16b,v11.16b,v8.16b
  1083. ror w17,w17,#16
  1084. eor v15.16b,v15.16b,v12.16b
  1085. ror w19,w19,#16
  1086. eor v19.16b,v19.16b,v16.16b
  1087. ror w20,w20,#16
  1088. eor v23.16b,v23.16b,v20.16b
  1089. ror w21,w21,#16
  1090. rev32 v3.8h,v3.8h
  1091. add w13,w13,w17
  1092. rev32 v7.8h,v7.8h
  1093. add w14,w14,w19
  1094. rev32 v11.8h,v11.8h
  1095. add w15,w15,w20
  1096. rev32 v15.8h,v15.8h
  1097. add w16,w16,w21
  1098. rev32 v19.8h,v19.8h
  1099. eor w9,w9,w13
  1100. rev32 v23.8h,v23.8h
  1101. eor w10,w10,w14
  1102. add v2.4s,v2.4s,v3.4s
  1103. eor w11,w11,w15
  1104. add v6.4s,v6.4s,v7.4s
  1105. eor w12,w12,w16
  1106. add v10.4s,v10.4s,v11.4s
  1107. ror w9,w9,#20
  1108. add v14.4s,v14.4s,v15.4s
  1109. ror w10,w10,#20
  1110. add v18.4s,v18.4s,v19.4s
  1111. ror w11,w11,#20
  1112. add v22.4s,v22.4s,v23.4s
  1113. ror w12,w12,#20
  1114. eor v24.16b,v1.16b,v2.16b
  1115. add w5,w5,w9
  1116. eor v25.16b,v5.16b,v6.16b
  1117. add w6,w6,w10
  1118. eor v26.16b,v9.16b,v10.16b
  1119. add w7,w7,w11
  1120. eor v27.16b,v13.16b,v14.16b
  1121. add w8,w8,w12
  1122. eor v28.16b,v17.16b,v18.16b
  1123. eor w17,w17,w5
  1124. eor v29.16b,v21.16b,v22.16b
  1125. eor w19,w19,w6
  1126. ushr v1.4s,v24.4s,#20
  1127. eor w20,w20,w7
  1128. ushr v5.4s,v25.4s,#20
  1129. eor w21,w21,w8
  1130. ushr v9.4s,v26.4s,#20
  1131. ror w17,w17,#24
  1132. ushr v13.4s,v27.4s,#20
  1133. ror w19,w19,#24
  1134. ushr v17.4s,v28.4s,#20
  1135. ror w20,w20,#24
  1136. ushr v21.4s,v29.4s,#20
  1137. ror w21,w21,#24
  1138. sli v1.4s,v24.4s,#12
  1139. add w13,w13,w17
  1140. sli v5.4s,v25.4s,#12
  1141. add w14,w14,w19
  1142. sli v9.4s,v26.4s,#12
  1143. add w15,w15,w20
  1144. sli v13.4s,v27.4s,#12
  1145. add w16,w16,w21
  1146. sli v17.4s,v28.4s,#12
  1147. eor w9,w9,w13
  1148. sli v21.4s,v29.4s,#12
  1149. eor w10,w10,w14
  1150. add v0.4s,v0.4s,v1.4s
  1151. eor w11,w11,w15
  1152. add v4.4s,v4.4s,v5.4s
  1153. eor w12,w12,w16
  1154. add v8.4s,v8.4s,v9.4s
  1155. ror w9,w9,#25
  1156. add v12.4s,v12.4s,v13.4s
  1157. ror w10,w10,#25
  1158. add v16.4s,v16.4s,v17.4s
  1159. ror w11,w11,#25
  1160. add v20.4s,v20.4s,v21.4s
  1161. ror w12,w12,#25
  1162. eor v24.16b,v3.16b,v0.16b
  1163. add w5,w5,w10
  1164. eor v25.16b,v7.16b,v4.16b
  1165. add w6,w6,w11
  1166. eor v26.16b,v11.16b,v8.16b
  1167. add w7,w7,w12
  1168. eor v27.16b,v15.16b,v12.16b
  1169. add w8,w8,w9
  1170. eor v28.16b,v19.16b,v16.16b
  1171. eor w21,w21,w5
  1172. eor v29.16b,v23.16b,v20.16b
  1173. eor w17,w17,w6
  1174. ushr v3.4s,v24.4s,#24
  1175. eor w19,w19,w7
  1176. ushr v7.4s,v25.4s,#24
  1177. eor w20,w20,w8
  1178. ushr v11.4s,v26.4s,#24
  1179. ror w21,w21,#16
  1180. ushr v15.4s,v27.4s,#24
  1181. ror w17,w17,#16
  1182. ushr v19.4s,v28.4s,#24
  1183. ror w19,w19,#16
  1184. ushr v23.4s,v29.4s,#24
  1185. ror w20,w20,#16
  1186. sli v3.4s,v24.4s,#8
  1187. add w15,w15,w21
  1188. sli v7.4s,v25.4s,#8
  1189. add w16,w16,w17
  1190. sli v11.4s,v26.4s,#8
  1191. add w13,w13,w19
  1192. sli v15.4s,v27.4s,#8
  1193. add w14,w14,w20
  1194. sli v19.4s,v28.4s,#8
  1195. eor w10,w10,w15
  1196. sli v23.4s,v29.4s,#8
  1197. eor w11,w11,w16
  1198. add v2.4s,v2.4s,v3.4s
  1199. eor w12,w12,w13
  1200. add v6.4s,v6.4s,v7.4s
  1201. eor w9,w9,w14
  1202. add v10.4s,v10.4s,v11.4s
  1203. ror w10,w10,#20
  1204. add v14.4s,v14.4s,v15.4s
  1205. ror w11,w11,#20
  1206. add v18.4s,v18.4s,v19.4s
  1207. ror w12,w12,#20
  1208. add v22.4s,v22.4s,v23.4s
  1209. ror w9,w9,#20
  1210. eor v24.16b,v1.16b,v2.16b
  1211. add w5,w5,w10
  1212. eor v25.16b,v5.16b,v6.16b
  1213. add w6,w6,w11
  1214. eor v26.16b,v9.16b,v10.16b
  1215. add w7,w7,w12
  1216. eor v27.16b,v13.16b,v14.16b
  1217. add w8,w8,w9
  1218. eor v28.16b,v17.16b,v18.16b
  1219. eor w21,w21,w5
  1220. eor v29.16b,v21.16b,v22.16b
  1221. eor w17,w17,w6
  1222. ushr v1.4s,v24.4s,#25
  1223. eor w19,w19,w7
  1224. ushr v5.4s,v25.4s,#25
  1225. eor w20,w20,w8
  1226. ushr v9.4s,v26.4s,#25
  1227. ror w21,w21,#24
  1228. ushr v13.4s,v27.4s,#25
  1229. ror w17,w17,#24
  1230. ushr v17.4s,v28.4s,#25
  1231. ror w19,w19,#24
  1232. ushr v21.4s,v29.4s,#25
  1233. ror w20,w20,#24
  1234. sli v1.4s,v24.4s,#7
  1235. add w15,w15,w21
  1236. sli v5.4s,v25.4s,#7
  1237. add w16,w16,w17
  1238. sli v9.4s,v26.4s,#7
  1239. add w13,w13,w19
  1240. sli v13.4s,v27.4s,#7
  1241. add w14,w14,w20
  1242. sli v17.4s,v28.4s,#7
  1243. eor w10,w10,w15
  1244. sli v21.4s,v29.4s,#7
  1245. eor w11,w11,w16
  1246. ext v2.16b,v2.16b,v2.16b,#8
  1247. eor w12,w12,w13
  1248. ext v6.16b,v6.16b,v6.16b,#8
  1249. eor w9,w9,w14
  1250. ext v10.16b,v10.16b,v10.16b,#8
  1251. ror w10,w10,#25
  1252. ext v14.16b,v14.16b,v14.16b,#8
  1253. ror w11,w11,#25
  1254. ext v18.16b,v18.16b,v18.16b,#8
  1255. ror w12,w12,#25
  1256. ext v22.16b,v22.16b,v22.16b,#8
  1257. ror w9,w9,#25
  1258. ext v3.16b,v3.16b,v3.16b,#4
  1259. ext v7.16b,v7.16b,v7.16b,#4
  1260. ext v11.16b,v11.16b,v11.16b,#4
  1261. ext v15.16b,v15.16b,v15.16b,#4
  1262. ext v19.16b,v19.16b,v19.16b,#4
  1263. ext v23.16b,v23.16b,v23.16b,#4
  1264. ext v1.16b,v1.16b,v1.16b,#12
  1265. ext v5.16b,v5.16b,v5.16b,#12
  1266. ext v9.16b,v9.16b,v9.16b,#12
  1267. ext v13.16b,v13.16b,v13.16b,#12
  1268. ext v17.16b,v17.16b,v17.16b,#12
  1269. ext v21.16b,v21.16b,v21.16b,#12
  1270. cbnz x4,.Loop_upper_neon
  1271. add w5,w5,w22 // accumulate key block
  1272. add x6,x6,x22,lsr#32
  1273. add w7,w7,w23
  1274. add x8,x8,x23,lsr#32
  1275. add w9,w9,w24
  1276. add x10,x10,x24,lsr#32
  1277. add w11,w11,w25
  1278. add x12,x12,x25,lsr#32
  1279. add w13,w13,w26
  1280. add x14,x14,x26,lsr#32
  1281. add w15,w15,w27
  1282. add x16,x16,x27,lsr#32
  1283. add w17,w17,w28
  1284. add x19,x19,x28,lsr#32
  1285. add w20,w20,w30
  1286. add x21,x21,x30,lsr#32
  1287. add x5,x5,x6,lsl#32 // pack
  1288. add x7,x7,x8,lsl#32
  1289. ldp x6,x8,[x1,#0] // load input
  1290. add x9,x9,x10,lsl#32
  1291. add x11,x11,x12,lsl#32
  1292. ldp x10,x12,[x1,#16]
  1293. add x13,x13,x14,lsl#32
  1294. add x15,x15,x16,lsl#32
  1295. ldp x14,x16,[x1,#32]
  1296. add x17,x17,x19,lsl#32
  1297. add x20,x20,x21,lsl#32
  1298. ldp x19,x21,[x1,#48]
  1299. add x1,x1,#64
  1300. #ifdef __ARMEB__
  1301. rev x5,x5
  1302. rev x7,x7
  1303. rev x9,x9
  1304. rev x11,x11
  1305. rev x13,x13
  1306. rev x15,x15
  1307. rev x17,x17
  1308. rev x20,x20
  1309. #endif
  1310. eor x5,x5,x6
  1311. eor x7,x7,x8
  1312. eor x9,x9,x10
  1313. eor x11,x11,x12
  1314. eor x13,x13,x14
  1315. eor x15,x15,x16
  1316. eor x17,x17,x19
  1317. eor x20,x20,x21
  1318. stp x5,x7,[x0,#0] // store output
  1319. add x28,x28,#1 // increment counter
  1320. mov w5,w22 // unpack key block
  1321. lsr x6,x22,#32
  1322. stp x9,x11,[x0,#16]
  1323. mov w7,w23
  1324. lsr x8,x23,#32
  1325. stp x13,x15,[x0,#32]
  1326. mov w9,w24
  1327. lsr x10,x24,#32
  1328. stp x17,x20,[x0,#48]
  1329. add x0,x0,#64
  1330. mov w11,w25
  1331. lsr x12,x25,#32
  1332. mov w13,w26
  1333. lsr x14,x26,#32
  1334. mov w15,w27
  1335. lsr x16,x27,#32
  1336. mov w17,w28
  1337. lsr x19,x28,#32
  1338. mov w20,w30
  1339. lsr x21,x30,#32
  1340. mov x4,#5
  1341. .Loop_lower_neon:
  1342. sub x4,x4,#1
  1343. add v0.4s,v0.4s,v1.4s
  1344. add w5,w5,w9
  1345. add v4.4s,v4.4s,v5.4s
  1346. add w6,w6,w10
  1347. add v8.4s,v8.4s,v9.4s
  1348. add w7,w7,w11
  1349. add v12.4s,v12.4s,v13.4s
  1350. add w8,w8,w12
  1351. add v16.4s,v16.4s,v17.4s
  1352. eor w17,w17,w5
  1353. add v20.4s,v20.4s,v21.4s
  1354. eor w19,w19,w6
  1355. eor v3.16b,v3.16b,v0.16b
  1356. eor w20,w20,w7
  1357. eor v7.16b,v7.16b,v4.16b
  1358. eor w21,w21,w8
  1359. eor v11.16b,v11.16b,v8.16b
  1360. ror w17,w17,#16
  1361. eor v15.16b,v15.16b,v12.16b
  1362. ror w19,w19,#16
  1363. eor v19.16b,v19.16b,v16.16b
  1364. ror w20,w20,#16
  1365. eor v23.16b,v23.16b,v20.16b
  1366. ror w21,w21,#16
  1367. rev32 v3.8h,v3.8h
  1368. add w13,w13,w17
  1369. rev32 v7.8h,v7.8h
  1370. add w14,w14,w19
  1371. rev32 v11.8h,v11.8h
  1372. add w15,w15,w20
  1373. rev32 v15.8h,v15.8h
  1374. add w16,w16,w21
  1375. rev32 v19.8h,v19.8h
  1376. eor w9,w9,w13
  1377. rev32 v23.8h,v23.8h
  1378. eor w10,w10,w14
  1379. add v2.4s,v2.4s,v3.4s
  1380. eor w11,w11,w15
  1381. add v6.4s,v6.4s,v7.4s
  1382. eor w12,w12,w16
  1383. add v10.4s,v10.4s,v11.4s
  1384. ror w9,w9,#20
  1385. add v14.4s,v14.4s,v15.4s
  1386. ror w10,w10,#20
  1387. add v18.4s,v18.4s,v19.4s
  1388. ror w11,w11,#20
  1389. add v22.4s,v22.4s,v23.4s
  1390. ror w12,w12,#20
  1391. eor v24.16b,v1.16b,v2.16b
  1392. add w5,w5,w9
  1393. eor v25.16b,v5.16b,v6.16b
  1394. add w6,w6,w10
  1395. eor v26.16b,v9.16b,v10.16b
  1396. add w7,w7,w11
  1397. eor v27.16b,v13.16b,v14.16b
  1398. add w8,w8,w12
  1399. eor v28.16b,v17.16b,v18.16b
  1400. eor w17,w17,w5
  1401. eor v29.16b,v21.16b,v22.16b
  1402. eor w19,w19,w6
  1403. ushr v1.4s,v24.4s,#20
  1404. eor w20,w20,w7
  1405. ushr v5.4s,v25.4s,#20
  1406. eor w21,w21,w8
  1407. ushr v9.4s,v26.4s,#20
  1408. ror w17,w17,#24
  1409. ushr v13.4s,v27.4s,#20
  1410. ror w19,w19,#24
  1411. ushr v17.4s,v28.4s,#20
  1412. ror w20,w20,#24
  1413. ushr v21.4s,v29.4s,#20
  1414. ror w21,w21,#24
  1415. sli v1.4s,v24.4s,#12
  1416. add w13,w13,w17
  1417. sli v5.4s,v25.4s,#12
  1418. add w14,w14,w19
  1419. sli v9.4s,v26.4s,#12
  1420. add w15,w15,w20
  1421. sli v13.4s,v27.4s,#12
  1422. add w16,w16,w21
  1423. sli v17.4s,v28.4s,#12
  1424. eor w9,w9,w13
  1425. sli v21.4s,v29.4s,#12
  1426. eor w10,w10,w14
  1427. add v0.4s,v0.4s,v1.4s
  1428. eor w11,w11,w15
  1429. add v4.4s,v4.4s,v5.4s
  1430. eor w12,w12,w16
  1431. add v8.4s,v8.4s,v9.4s
  1432. ror w9,w9,#25
  1433. add v12.4s,v12.4s,v13.4s
  1434. ror w10,w10,#25
  1435. add v16.4s,v16.4s,v17.4s
  1436. ror w11,w11,#25
  1437. add v20.4s,v20.4s,v21.4s
  1438. ror w12,w12,#25
  1439. eor v24.16b,v3.16b,v0.16b
  1440. add w5,w5,w10
  1441. eor v25.16b,v7.16b,v4.16b
  1442. add w6,w6,w11
  1443. eor v26.16b,v11.16b,v8.16b
  1444. add w7,w7,w12
  1445. eor v27.16b,v15.16b,v12.16b
  1446. add w8,w8,w9
  1447. eor v28.16b,v19.16b,v16.16b
  1448. eor w21,w21,w5
  1449. eor v29.16b,v23.16b,v20.16b
  1450. eor w17,w17,w6
  1451. ushr v3.4s,v24.4s,#24
  1452. eor w19,w19,w7
  1453. ushr v7.4s,v25.4s,#24
  1454. eor w20,w20,w8
  1455. ushr v11.4s,v26.4s,#24
  1456. ror w21,w21,#16
  1457. ushr v15.4s,v27.4s,#24
  1458. ror w17,w17,#16
  1459. ushr v19.4s,v28.4s,#24
  1460. ror w19,w19,#16
  1461. ushr v23.4s,v29.4s,#24
  1462. ror w20,w20,#16
  1463. sli v3.4s,v24.4s,#8
  1464. add w15,w15,w21
  1465. sli v7.4s,v25.4s,#8
  1466. add w16,w16,w17
  1467. sli v11.4s,v26.4s,#8
  1468. add w13,w13,w19
  1469. sli v15.4s,v27.4s,#8
  1470. add w14,w14,w20
  1471. sli v19.4s,v28.4s,#8
  1472. eor w10,w10,w15
  1473. sli v23.4s,v29.4s,#8
  1474. eor w11,w11,w16
  1475. add v2.4s,v2.4s,v3.4s
  1476. eor w12,w12,w13
  1477. add v6.4s,v6.4s,v7.4s
  1478. eor w9,w9,w14
  1479. add v10.4s,v10.4s,v11.4s
  1480. ror w10,w10,#20
  1481. add v14.4s,v14.4s,v15.4s
  1482. ror w11,w11,#20
  1483. add v18.4s,v18.4s,v19.4s
  1484. ror w12,w12,#20
  1485. add v22.4s,v22.4s,v23.4s
  1486. ror w9,w9,#20
  1487. eor v24.16b,v1.16b,v2.16b
  1488. add w5,w5,w10
  1489. eor v25.16b,v5.16b,v6.16b
  1490. add w6,w6,w11
  1491. eor v26.16b,v9.16b,v10.16b
  1492. add w7,w7,w12
  1493. eor v27.16b,v13.16b,v14.16b
  1494. add w8,w8,w9
  1495. eor v28.16b,v17.16b,v18.16b
  1496. eor w21,w21,w5
  1497. eor v29.16b,v21.16b,v22.16b
  1498. eor w17,w17,w6
  1499. ushr v1.4s,v24.4s,#25
  1500. eor w19,w19,w7
  1501. ushr v5.4s,v25.4s,#25
  1502. eor w20,w20,w8
  1503. ushr v9.4s,v26.4s,#25
  1504. ror w21,w21,#24
  1505. ushr v13.4s,v27.4s,#25
  1506. ror w17,w17,#24
  1507. ushr v17.4s,v28.4s,#25
  1508. ror w19,w19,#24
  1509. ushr v21.4s,v29.4s,#25
  1510. ror w20,w20,#24
  1511. sli v1.4s,v24.4s,#7
  1512. add w15,w15,w21
  1513. sli v5.4s,v25.4s,#7
  1514. add w16,w16,w17
  1515. sli v9.4s,v26.4s,#7
  1516. add w13,w13,w19
  1517. sli v13.4s,v27.4s,#7
  1518. add w14,w14,w20
  1519. sli v17.4s,v28.4s,#7
  1520. eor w10,w10,w15
  1521. sli v21.4s,v29.4s,#7
  1522. eor w11,w11,w16
  1523. ext v2.16b,v2.16b,v2.16b,#8
  1524. eor w12,w12,w13
  1525. ext v6.16b,v6.16b,v6.16b,#8
  1526. eor w9,w9,w14
  1527. ext v10.16b,v10.16b,v10.16b,#8
  1528. ror w10,w10,#25
  1529. ext v14.16b,v14.16b,v14.16b,#8
  1530. ror w11,w11,#25
  1531. ext v18.16b,v18.16b,v18.16b,#8
  1532. ror w12,w12,#25
  1533. ext v22.16b,v22.16b,v22.16b,#8
  1534. ror w9,w9,#25
  1535. ext v3.16b,v3.16b,v3.16b,#12
  1536. ext v7.16b,v7.16b,v7.16b,#12
  1537. ext v11.16b,v11.16b,v11.16b,#12
  1538. ext v15.16b,v15.16b,v15.16b,#12
  1539. ext v19.16b,v19.16b,v19.16b,#12
  1540. ext v23.16b,v23.16b,v23.16b,#12
  1541. ext v1.16b,v1.16b,v1.16b,#4
  1542. ext v5.16b,v5.16b,v5.16b,#4
  1543. ext v9.16b,v9.16b,v9.16b,#4
  1544. ext v13.16b,v13.16b,v13.16b,#4
  1545. ext v17.16b,v17.16b,v17.16b,#4
  1546. ext v21.16b,v21.16b,v21.16b,#4
  1547. add v0.4s,v0.4s,v1.4s
  1548. add w5,w5,w9
  1549. add v4.4s,v4.4s,v5.4s
  1550. add w6,w6,w10
  1551. add v8.4s,v8.4s,v9.4s
  1552. add w7,w7,w11
  1553. add v12.4s,v12.4s,v13.4s
  1554. add w8,w8,w12
  1555. add v16.4s,v16.4s,v17.4s
  1556. eor w17,w17,w5
  1557. add v20.4s,v20.4s,v21.4s
  1558. eor w19,w19,w6
  1559. eor v3.16b,v3.16b,v0.16b
  1560. eor w20,w20,w7
  1561. eor v7.16b,v7.16b,v4.16b
  1562. eor w21,w21,w8
  1563. eor v11.16b,v11.16b,v8.16b
  1564. ror w17,w17,#16
  1565. eor v15.16b,v15.16b,v12.16b
  1566. ror w19,w19,#16
  1567. eor v19.16b,v19.16b,v16.16b
  1568. ror w20,w20,#16
  1569. eor v23.16b,v23.16b,v20.16b
  1570. ror w21,w21,#16
  1571. rev32 v3.8h,v3.8h
  1572. add w13,w13,w17
  1573. rev32 v7.8h,v7.8h
  1574. add w14,w14,w19
  1575. rev32 v11.8h,v11.8h
  1576. add w15,w15,w20
  1577. rev32 v15.8h,v15.8h
  1578. add w16,w16,w21
  1579. rev32 v19.8h,v19.8h
  1580. eor w9,w9,w13
  1581. rev32 v23.8h,v23.8h
  1582. eor w10,w10,w14
  1583. add v2.4s,v2.4s,v3.4s
  1584. eor w11,w11,w15
  1585. add v6.4s,v6.4s,v7.4s
  1586. eor w12,w12,w16
  1587. add v10.4s,v10.4s,v11.4s
  1588. ror w9,w9,#20
  1589. add v14.4s,v14.4s,v15.4s
  1590. ror w10,w10,#20
  1591. add v18.4s,v18.4s,v19.4s
  1592. ror w11,w11,#20
  1593. add v22.4s,v22.4s,v23.4s
  1594. ror w12,w12,#20
  1595. eor v24.16b,v1.16b,v2.16b
  1596. add w5,w5,w9
  1597. eor v25.16b,v5.16b,v6.16b
  1598. add w6,w6,w10
  1599. eor v26.16b,v9.16b,v10.16b
  1600. add w7,w7,w11
  1601. eor v27.16b,v13.16b,v14.16b
  1602. add w8,w8,w12
  1603. eor v28.16b,v17.16b,v18.16b
  1604. eor w17,w17,w5
  1605. eor v29.16b,v21.16b,v22.16b
  1606. eor w19,w19,w6
  1607. ushr v1.4s,v24.4s,#20
  1608. eor w20,w20,w7
  1609. ushr v5.4s,v25.4s,#20
  1610. eor w21,w21,w8
  1611. ushr v9.4s,v26.4s,#20
  1612. ror w17,w17,#24
  1613. ushr v13.4s,v27.4s,#20
  1614. ror w19,w19,#24
  1615. ushr v17.4s,v28.4s,#20
  1616. ror w20,w20,#24
  1617. ushr v21.4s,v29.4s,#20
  1618. ror w21,w21,#24
  1619. sli v1.4s,v24.4s,#12
  1620. add w13,w13,w17
  1621. sli v5.4s,v25.4s,#12
  1622. add w14,w14,w19
  1623. sli v9.4s,v26.4s,#12
  1624. add w15,w15,w20
  1625. sli v13.4s,v27.4s,#12
  1626. add w16,w16,w21
  1627. sli v17.4s,v28.4s,#12
  1628. eor w9,w9,w13
  1629. sli v21.4s,v29.4s,#12
  1630. eor w10,w10,w14
  1631. add v0.4s,v0.4s,v1.4s
  1632. eor w11,w11,w15
  1633. add v4.4s,v4.4s,v5.4s
  1634. eor w12,w12,w16
  1635. add v8.4s,v8.4s,v9.4s
  1636. ror w9,w9,#25
  1637. add v12.4s,v12.4s,v13.4s
  1638. ror w10,w10,#25
  1639. add v16.4s,v16.4s,v17.4s
  1640. ror w11,w11,#25
  1641. add v20.4s,v20.4s,v21.4s
  1642. ror w12,w12,#25
  1643. eor v24.16b,v3.16b,v0.16b
  1644. add w5,w5,w10
  1645. eor v25.16b,v7.16b,v4.16b
  1646. add w6,w6,w11
  1647. eor v26.16b,v11.16b,v8.16b
  1648. add w7,w7,w12
  1649. eor v27.16b,v15.16b,v12.16b
  1650. add w8,w8,w9
  1651. eor v28.16b,v19.16b,v16.16b
  1652. eor w21,w21,w5
  1653. eor v29.16b,v23.16b,v20.16b
  1654. eor w17,w17,w6
  1655. ushr v3.4s,v24.4s,#24
  1656. eor w19,w19,w7
  1657. ushr v7.4s,v25.4s,#24
  1658. eor w20,w20,w8
  1659. ushr v11.4s,v26.4s,#24
  1660. ror w21,w21,#16
  1661. ushr v15.4s,v27.4s,#24
  1662. ror w17,w17,#16
  1663. ushr v19.4s,v28.4s,#24
  1664. ror w19,w19,#16
  1665. ushr v23.4s,v29.4s,#24
  1666. ror w20,w20,#16
  1667. sli v3.4s,v24.4s,#8
  1668. add w15,w15,w21
  1669. sli v7.4s,v25.4s,#8
  1670. add w16,w16,w17
  1671. sli v11.4s,v26.4s,#8
  1672. add w13,w13,w19
  1673. sli v15.4s,v27.4s,#8
  1674. add w14,w14,w20
  1675. sli v19.4s,v28.4s,#8
  1676. eor w10,w10,w15
  1677. sli v23.4s,v29.4s,#8
  1678. eor w11,w11,w16
  1679. add v2.4s,v2.4s,v3.4s
  1680. eor w12,w12,w13
  1681. add v6.4s,v6.4s,v7.4s
  1682. eor w9,w9,w14
  1683. add v10.4s,v10.4s,v11.4s
  1684. ror w10,w10,#20
  1685. add v14.4s,v14.4s,v15.4s
  1686. ror w11,w11,#20
  1687. add v18.4s,v18.4s,v19.4s
  1688. ror w12,w12,#20
  1689. add v22.4s,v22.4s,v23.4s
  1690. ror w9,w9,#20
  1691. eor v24.16b,v1.16b,v2.16b
  1692. add w5,w5,w10
  1693. eor v25.16b,v5.16b,v6.16b
  1694. add w6,w6,w11
  1695. eor v26.16b,v9.16b,v10.16b
  1696. add w7,w7,w12
  1697. eor v27.16b,v13.16b,v14.16b
  1698. add w8,w8,w9
  1699. eor v28.16b,v17.16b,v18.16b
  1700. eor w21,w21,w5
  1701. eor v29.16b,v21.16b,v22.16b
  1702. eor w17,w17,w6
  1703. ushr v1.4s,v24.4s,#25
  1704. eor w19,w19,w7
  1705. ushr v5.4s,v25.4s,#25
  1706. eor w20,w20,w8
  1707. ushr v9.4s,v26.4s,#25
  1708. ror w21,w21,#24
  1709. ushr v13.4s,v27.4s,#25
  1710. ror w17,w17,#24
  1711. ushr v17.4s,v28.4s,#25
  1712. ror w19,w19,#24
  1713. ushr v21.4s,v29.4s,#25
  1714. ror w20,w20,#24
  1715. sli v1.4s,v24.4s,#7
  1716. add w15,w15,w21
  1717. sli v5.4s,v25.4s,#7
  1718. add w16,w16,w17
  1719. sli v9.4s,v26.4s,#7
  1720. add w13,w13,w19
  1721. sli v13.4s,v27.4s,#7
  1722. add w14,w14,w20
  1723. sli v17.4s,v28.4s,#7
  1724. eor w10,w10,w15
  1725. sli v21.4s,v29.4s,#7
  1726. eor w11,w11,w16
  1727. ext v2.16b,v2.16b,v2.16b,#8
  1728. eor w12,w12,w13
  1729. ext v6.16b,v6.16b,v6.16b,#8
  1730. eor w9,w9,w14
  1731. ext v10.16b,v10.16b,v10.16b,#8
  1732. ror w10,w10,#25
  1733. ext v14.16b,v14.16b,v14.16b,#8
  1734. ror w11,w11,#25
  1735. ext v18.16b,v18.16b,v18.16b,#8
  1736. ror w12,w12,#25
  1737. ext v22.16b,v22.16b,v22.16b,#8
  1738. ror w9,w9,#25
  1739. ext v3.16b,v3.16b,v3.16b,#4
  1740. ext v7.16b,v7.16b,v7.16b,#4
  1741. ext v11.16b,v11.16b,v11.16b,#4
  1742. ext v15.16b,v15.16b,v15.16b,#4
  1743. ext v19.16b,v19.16b,v19.16b,#4
  1744. ext v23.16b,v23.16b,v23.16b,#4
  1745. ext v1.16b,v1.16b,v1.16b,#12
  1746. ext v5.16b,v5.16b,v5.16b,#12
  1747. ext v9.16b,v9.16b,v9.16b,#12
  1748. ext v13.16b,v13.16b,v13.16b,#12
  1749. ext v17.16b,v17.16b,v17.16b,#12
  1750. ext v21.16b,v21.16b,v21.16b,#12
  1751. cbnz x4,.Loop_lower_neon
  1752. add w5,w5,w22 // accumulate key block
  1753. ldp q24,q25,[sp,#0]
  1754. add x6,x6,x22,lsr#32
  1755. ldp q26,q27,[sp,#32]
  1756. add w7,w7,w23
  1757. ldp q28,q29,[sp,#64]
  1758. add x8,x8,x23,lsr#32
  1759. add v0.4s,v0.4s,v24.4s
  1760. add w9,w9,w24
  1761. add v4.4s,v4.4s,v24.4s
  1762. add x10,x10,x24,lsr#32
  1763. add v8.4s,v8.4s,v24.4s
  1764. add w11,w11,w25
  1765. add v12.4s,v12.4s,v24.4s
  1766. add x12,x12,x25,lsr#32
  1767. add v16.4s,v16.4s,v24.4s
  1768. add w13,w13,w26
  1769. add v20.4s,v20.4s,v24.4s
  1770. add x14,x14,x26,lsr#32
  1771. add v2.4s,v2.4s,v26.4s
  1772. add w15,w15,w27
  1773. add v6.4s,v6.4s,v26.4s
  1774. add x16,x16,x27,lsr#32
  1775. add v10.4s,v10.4s,v26.4s
  1776. add w17,w17,w28
  1777. add v14.4s,v14.4s,v26.4s
  1778. add x19,x19,x28,lsr#32
  1779. add v18.4s,v18.4s,v26.4s
  1780. add w20,w20,w30
  1781. add v22.4s,v22.4s,v26.4s
  1782. add x21,x21,x30,lsr#32
  1783. add v19.4s,v19.4s,v31.4s // +4
  1784. add x5,x5,x6,lsl#32 // pack
  1785. add v23.4s,v23.4s,v31.4s // +4
  1786. add x7,x7,x8,lsl#32
  1787. add v3.4s,v3.4s,v27.4s
  1788. ldp x6,x8,[x1,#0] // load input
  1789. add v7.4s,v7.4s,v28.4s
  1790. add x9,x9,x10,lsl#32
  1791. add v11.4s,v11.4s,v29.4s
  1792. add x11,x11,x12,lsl#32
  1793. add v15.4s,v15.4s,v30.4s
  1794. ldp x10,x12,[x1,#16]
  1795. add v19.4s,v19.4s,v27.4s
  1796. add x13,x13,x14,lsl#32
  1797. add v23.4s,v23.4s,v28.4s
  1798. add x15,x15,x16,lsl#32
  1799. add v1.4s,v1.4s,v25.4s
  1800. ldp x14,x16,[x1,#32]
  1801. add v5.4s,v5.4s,v25.4s
  1802. add x17,x17,x19,lsl#32
  1803. add v9.4s,v9.4s,v25.4s
  1804. add x20,x20,x21,lsl#32
  1805. add v13.4s,v13.4s,v25.4s
  1806. ldp x19,x21,[x1,#48]
  1807. add v17.4s,v17.4s,v25.4s
  1808. add x1,x1,#64
  1809. add v21.4s,v21.4s,v25.4s
  1810. #ifdef __ARMEB__
  1811. rev x5,x5
  1812. rev x7,x7
  1813. rev x9,x9
  1814. rev x11,x11
  1815. rev x13,x13
  1816. rev x15,x15
  1817. rev x17,x17
  1818. rev x20,x20
  1819. #endif
  1820. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
  1821. eor x5,x5,x6
  1822. eor x7,x7,x8
  1823. eor x9,x9,x10
  1824. eor x11,x11,x12
  1825. eor x13,x13,x14
  1826. eor v0.16b,v0.16b,v24.16b
  1827. eor x15,x15,x16
  1828. eor v1.16b,v1.16b,v25.16b
  1829. eor x17,x17,x19
  1830. eor v2.16b,v2.16b,v26.16b
  1831. eor x20,x20,x21
  1832. eor v3.16b,v3.16b,v27.16b
  1833. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
  1834. stp x5,x7,[x0,#0] // store output
  1835. add x28,x28,#7 // increment counter
  1836. stp x9,x11,[x0,#16]
  1837. stp x13,x15,[x0,#32]
  1838. stp x17,x20,[x0,#48]
  1839. add x0,x0,#64
  1840. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  1841. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  1842. eor v4.16b,v4.16b,v24.16b
  1843. eor v5.16b,v5.16b,v25.16b
  1844. eor v6.16b,v6.16b,v26.16b
  1845. eor v7.16b,v7.16b,v27.16b
  1846. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  1847. ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
  1848. eor v8.16b,v8.16b,v0.16b
  1849. ldp q24,q25,[sp,#0]
  1850. eor v9.16b,v9.16b,v1.16b
  1851. ldp q26,q27,[sp,#32]
  1852. eor v10.16b,v10.16b,v2.16b
  1853. eor v11.16b,v11.16b,v3.16b
  1854. st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
  1855. ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
  1856. eor v12.16b,v12.16b,v4.16b
  1857. eor v13.16b,v13.16b,v5.16b
  1858. eor v14.16b,v14.16b,v6.16b
  1859. eor v15.16b,v15.16b,v7.16b
  1860. st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
  1861. ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
  1862. eor v16.16b,v16.16b,v8.16b
  1863. eor v17.16b,v17.16b,v9.16b
  1864. eor v18.16b,v18.16b,v10.16b
  1865. eor v19.16b,v19.16b,v11.16b
  1866. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  1867. shl v0.4s,v31.4s,#1 // 4 -> 8
  1868. eor v20.16b,v20.16b,v12.16b
  1869. eor v21.16b,v21.16b,v13.16b
  1870. eor v22.16b,v22.16b,v14.16b
  1871. eor v23.16b,v23.16b,v15.16b
  1872. st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
  1873. add v27.4s,v27.4s,v0.4s // += 8
  1874. add v28.4s,v28.4s,v0.4s
  1875. add v29.4s,v29.4s,v0.4s
  1876. add v30.4s,v30.4s,v0.4s
  1877. b.hs .Loop_outer_512_neon
  1878. adds x2,x2,#512
  1879. ushr v0.4s,v31.4s,#2 // 4 -> 1
  1880. ldp d8,d9,[sp,#128+0] // meet ABI requirements
  1881. ldp d10,d11,[sp,#128+16]
  1882. ldp d12,d13,[sp,#128+32]
  1883. ldp d14,d15,[sp,#128+48]
  1884. stp q24,q31,[sp,#0] // wipe off-load area
  1885. stp q24,q31,[sp,#32]
  1886. stp q24,q31,[sp,#64]
  1887. b.eq .Ldone_512_neon
  1888. cmp x2,#192
  1889. sub v27.4s,v27.4s,v0.4s // -= 1
  1890. sub v28.4s,v28.4s,v0.4s
  1891. sub v29.4s,v29.4s,v0.4s
  1892. add sp,sp,#128
  1893. b.hs .Loop_outer_neon
  1894. eor v25.16b,v25.16b,v25.16b
  1895. eor v26.16b,v26.16b,v26.16b
  1896. eor v27.16b,v27.16b,v27.16b
  1897. eor v28.16b,v28.16b,v28.16b
  1898. eor v29.16b,v29.16b,v29.16b
  1899. eor v30.16b,v30.16b,v30.16b
  1900. b .Loop_outer
  1901. .Ldone_512_neon: // exit path: taken by the b.eq above when "adds x2,x2,#512" produced zero
  1902. ldp x19,x20,[x29,#16] // reload x19/x20 from offset 16 of the frame addressed by x29
  1903. add sp,sp,#128+64 // drop 192 bytes of stack; the d8-d15 reloads and the wiped off-load area above live at sp+0..sp+191
  1904. ldp x21,x22,[x29,#32] // reload x21/x22
  1905. ldp x23,x24,[x29,#48] // reload x23/x24
  1906. ldp x25,x26,[x29,#64] // reload x25/x26
  1907. ldp x27,x28,[x29,#80] // reload x27/x28
  1908. ldp x29,x30,[sp],#96 // restore x29/x30, then post-increment sp by 96 (NOTE(review): presumably matches a 96-byte frame pushed by the prologue, which is outside this view - confirm)
  1909. AARCH64_VALIDATE_LINK_REGISTER
  1910. ret // return through x30
  1911. .size ChaCha20_512_neon,.-ChaCha20_512_neon // symbol size = distance from the ChaCha20_512_neon label to this point
  1912. #endif
  1913. #endif // !OPENSSL_NO_ASM
  1914. .section .note.GNU-stack,"",%progbits