chacha20_ref.odin

package chacha20_ref

import "core:crypto/_chacha20"
import "core:encoding/endian"
import "core:math/bits"
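
// stream_blocks generates nr_blocks of ChaCha20 keystream. When src is
// non-nil, the keystream is XORed with src into dst (encrypt/decrypt);
// when src is nil, the raw keystream is written to dst.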
stream_blocks :: proc(ctx: ^_chacha20.Context, dst, src: []byte, nr_blocks: int) {
	// Enforce the maximum consumed keystream per IV.
	_chacha20.check_counter_limit(ctx, nr_blocks)

	dst, src := dst, src

	x := &ctx._s
	for n := 0; n < nr_blocks; n = n + 1 {
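		// Working state: words 0..3 are the ChaCha constant (SIGMA), and
		// words 4..15 come from the context (key, block counter, and IV).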
		x0, x1, x2, x3 :=
			_chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 :=
			x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
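
		// Each loop iteration performs one column round followed by one
		// diagonal round, i.e. 2 of the ROUNDS ChaCha rounds per iteration.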
		for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
			// Even when forcing inlining, manually inlining all of
			// these is decently faster.
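
			// Each quarterround(a, b, c, d) below expands to:
			//   a += b; d ~= a; d <<<= 16;  c += d; b ~= c; b <<<= 12;
			//   a += b; d ~= a; d <<<= 8;   c += d; b ~= c; b <<<= 7;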
			// quarterround(x, 0, 4, 8, 12)
			x0 += x4
			x12 ~= x0
			x12 = bits.rotate_left32(x12, 16)
			x8 += x12
			x4 ~= x8
			x4 = bits.rotate_left32(x4, 12)
			x0 += x4
			x12 ~= x0
			x12 = bits.rotate_left32(x12, 8)
			x8 += x12
			x4 ~= x8
			x4 = bits.rotate_left32(x4, 7)

			// quarterround(x, 1, 5, 9, 13)
			x1 += x5
			x13 ~= x1
			x13 = bits.rotate_left32(x13, 16)
			x9 += x13
			x5 ~= x9
			x5 = bits.rotate_left32(x5, 12)
			x1 += x5
			x13 ~= x1
			x13 = bits.rotate_left32(x13, 8)
			x9 += x13
			x5 ~= x9
			x5 = bits.rotate_left32(x5, 7)

			// quarterround(x, 2, 6, 10, 14)
			x2 += x6
			x14 ~= x2
			x14 = bits.rotate_left32(x14, 16)
			x10 += x14
			x6 ~= x10
			x6 = bits.rotate_left32(x6, 12)
			x2 += x6
			x14 ~= x2
			x14 = bits.rotate_left32(x14, 8)
			x10 += x14
			x6 ~= x10
			x6 = bits.rotate_left32(x6, 7)

			// quarterround(x, 3, 7, 11, 15)
			x3 += x7
			x15 ~= x3
			x15 = bits.rotate_left32(x15, 16)
			x11 += x15
			x7 ~= x11
			x7 = bits.rotate_left32(x7, 12)
			x3 += x7
			x15 ~= x3
			x15 = bits.rotate_left32(x15, 8)
			x11 += x15
			x7 ~= x11
			x7 = bits.rotate_left32(x7, 7)

			// quarterround(x, 0, 5, 10, 15)
			x0 += x5
			x15 ~= x0
			x15 = bits.rotate_left32(x15, 16)
			x10 += x15
			x5 ~= x10
			x5 = bits.rotate_left32(x5, 12)
			x0 += x5
			x15 ~= x0
			x15 = bits.rotate_left32(x15, 8)
			x10 += x15
			x5 ~= x10
			x5 = bits.rotate_left32(x5, 7)

			// quarterround(x, 1, 6, 11, 12)
			x1 += x6
			x12 ~= x1
			x12 = bits.rotate_left32(x12, 16)
			x11 += x12
			x6 ~= x11
			x6 = bits.rotate_left32(x6, 12)
			x1 += x6
			x12 ~= x1
			x12 = bits.rotate_left32(x12, 8)
			x11 += x12
			x6 ~= x11
			x6 = bits.rotate_left32(x6, 7)

			// quarterround(x, 2, 7, 8, 13)
			x2 += x7
			x13 ~= x2
			x13 = bits.rotate_left32(x13, 16)
			x8 += x13
			x7 ~= x8
			x7 = bits.rotate_left32(x7, 12)
			x2 += x7
			x13 ~= x2
			x13 = bits.rotate_left32(x13, 8)
			x8 += x13
			x7 ~= x8
			x7 = bits.rotate_left32(x7, 7)

			// quarterround(x, 3, 4, 9, 14)
			x3 += x4
			x14 ~= x3
			x14 = bits.rotate_left32(x14, 16)
			x9 += x14
			x4 ~= x9
			x4 = bits.rotate_left32(x4, 12)
			x3 += x4
			x14 ~= x3
			x14 = bits.rotate_left32(x14, 8)
			x9 += x14
			x4 ~= x9
			x4 = bits.rotate_left32(x4, 7)
		}
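
		// Add the initial state to the working state (the feed-forward),
		// which makes the block function non-invertible.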
		x0 += _chacha20.SIGMA_0
		x1 += _chacha20.SIGMA_1
		x2 += _chacha20.SIGMA_2
		x3 += _chacha20.SIGMA_3
		x4 += x[4]
		x5 += x[5]
		x6 += x[6]
		x7 += x[7]
		x8 += x[8]
		x9 += x[9]
		x10 += x[10]
		x11 += x[11]
		x12 += x[12]
		x13 += x[13]
		x14 += x[14]
		x15 += x[15]

		// Skipping bounds checks (and using the unchecked endian helpers)
		// is fine here:
		// - The caller(s) ensure that src/dst are valid.
		// - The compiler knows if the target is picky about alignment.
		#no_bounds_check {
			if src != nil {
				endian.unchecked_put_u32le(dst[0:4], endian.unchecked_get_u32le(src[0:4]) ~ x0)
				endian.unchecked_put_u32le(dst[4:8], endian.unchecked_get_u32le(src[4:8]) ~ x1)
				endian.unchecked_put_u32le(dst[8:12], endian.unchecked_get_u32le(src[8:12]) ~ x2)
				endian.unchecked_put_u32le(dst[12:16], endian.unchecked_get_u32le(src[12:16]) ~ x3)
				endian.unchecked_put_u32le(dst[16:20], endian.unchecked_get_u32le(src[16:20]) ~ x4)
				endian.unchecked_put_u32le(dst[20:24], endian.unchecked_get_u32le(src[20:24]) ~ x5)
				endian.unchecked_put_u32le(dst[24:28], endian.unchecked_get_u32le(src[24:28]) ~ x6)
				endian.unchecked_put_u32le(dst[28:32], endian.unchecked_get_u32le(src[28:32]) ~ x7)
				endian.unchecked_put_u32le(dst[32:36], endian.unchecked_get_u32le(src[32:36]) ~ x8)
				endian.unchecked_put_u32le(dst[36:40], endian.unchecked_get_u32le(src[36:40]) ~ x9)
				endian.unchecked_put_u32le(
					dst[40:44],
					endian.unchecked_get_u32le(src[40:44]) ~ x10,
				)
				endian.unchecked_put_u32le(
					dst[44:48],
					endian.unchecked_get_u32le(src[44:48]) ~ x11,
				)
				endian.unchecked_put_u32le(
					dst[48:52],
					endian.unchecked_get_u32le(src[48:52]) ~ x12,
				)
				endian.unchecked_put_u32le(
					dst[52:56],
					endian.unchecked_get_u32le(src[52:56]) ~ x13,
				)
				endian.unchecked_put_u32le(
					dst[56:60],
					endian.unchecked_get_u32le(src[56:60]) ~ x14,
				)
				endian.unchecked_put_u32le(
					dst[60:64],
					endian.unchecked_get_u32le(src[60:64]) ~ x15,
				)

				src = src[_chacha20.BLOCK_SIZE:]
			} else {
				endian.unchecked_put_u32le(dst[0:4], x0)
				endian.unchecked_put_u32le(dst[4:8], x1)
				endian.unchecked_put_u32le(dst[8:12], x2)
				endian.unchecked_put_u32le(dst[12:16], x3)
				endian.unchecked_put_u32le(dst[16:20], x4)
				endian.unchecked_put_u32le(dst[20:24], x5)
				endian.unchecked_put_u32le(dst[24:28], x6)
				endian.unchecked_put_u32le(dst[28:32], x7)
				endian.unchecked_put_u32le(dst[32:36], x8)
				endian.unchecked_put_u32le(dst[36:40], x9)
				endian.unchecked_put_u32le(dst[40:44], x10)
				endian.unchecked_put_u32le(dst[44:48], x11)
				endian.unchecked_put_u32le(dst[48:52], x12)
				endian.unchecked_put_u32le(dst[52:56], x13)
				endian.unchecked_put_u32le(dst[56:60], x14)
				endian.unchecked_put_u32le(dst[60:64], x15)
			}
			dst = dst[_chacha20.BLOCK_SIZE:]
		}

		// Increment the counter. Overflow checking is done upon
		// entry into the routine, so a 64-bit increment safely
		// covers both cases.
		new_ctr := ((u64(ctx._s[13]) << 32) | u64(ctx._s[12])) + 1
		x[12] = u32(new_ctr)
		x[13] = u32(new_ctr >> 32)
	}
}
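
// hchacha20 implements the HChaCha20 function: the ChaCha rounds are run
// over a state built from the 32-byte key and a 16-byte IV, and words
// 0..3 and 12..15 of the result are written to dst as a derived sub-key
// (e.g. for XChaCha20).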
hchacha20 :: proc "contextless" (dst, key, iv: []byte) {
	x0, x1, x2, x3 := _chacha20.SIGMA_0, _chacha20.SIGMA_1, _chacha20.SIGMA_2, _chacha20.SIGMA_3
	x4 := endian.unchecked_get_u32le(key[0:4])
	x5 := endian.unchecked_get_u32le(key[4:8])
	x6 := endian.unchecked_get_u32le(key[8:12])
	x7 := endian.unchecked_get_u32le(key[12:16])
	x8 := endian.unchecked_get_u32le(key[16:20])
	x9 := endian.unchecked_get_u32le(key[20:24])
	x10 := endian.unchecked_get_u32le(key[24:28])
	x11 := endian.unchecked_get_u32le(key[28:32])
	x12 := endian.unchecked_get_u32le(iv[0:4])
	x13 := endian.unchecked_get_u32le(iv[4:8])
	x14 := endian.unchecked_get_u32le(iv[8:12])
	x15 := endian.unchecked_get_u32le(iv[12:16])
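
	// Same round structure as stream_blocks: one column round and one
	// diagonal round per loop iteration.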
	for i := _chacha20.ROUNDS; i > 0; i = i - 2 {
		// quarterround(x, 0, 4, 8, 12)
		x0 += x4
		x12 ~= x0
		x12 = bits.rotate_left32(x12, 16)
		x8 += x12
		x4 ~= x8
		x4 = bits.rotate_left32(x4, 12)
		x0 += x4
		x12 ~= x0
		x12 = bits.rotate_left32(x12, 8)
		x8 += x12
		x4 ~= x8
		x4 = bits.rotate_left32(x4, 7)

		// quarterround(x, 1, 5, 9, 13)
		x1 += x5
		x13 ~= x1
		x13 = bits.rotate_left32(x13, 16)
		x9 += x13
		x5 ~= x9
		x5 = bits.rotate_left32(x5, 12)
		x1 += x5
		x13 ~= x1
		x13 = bits.rotate_left32(x13, 8)
		x9 += x13
		x5 ~= x9
		x5 = bits.rotate_left32(x5, 7)

		// quarterround(x, 2, 6, 10, 14)
		x2 += x6
		x14 ~= x2
		x14 = bits.rotate_left32(x14, 16)
		x10 += x14
		x6 ~= x10
		x6 = bits.rotate_left32(x6, 12)
		x2 += x6
		x14 ~= x2
		x14 = bits.rotate_left32(x14, 8)
		x10 += x14
		x6 ~= x10
		x6 = bits.rotate_left32(x6, 7)

		// quarterround(x, 3, 7, 11, 15)
		x3 += x7
		x15 ~= x3
		x15 = bits.rotate_left32(x15, 16)
		x11 += x15
		x7 ~= x11
		x7 = bits.rotate_left32(x7, 12)
		x3 += x7
		x15 ~= x3
		x15 = bits.rotate_left32(x15, 8)
		x11 += x15
		x7 ~= x11
		x7 = bits.rotate_left32(x7, 7)

		// quarterround(x, 0, 5, 10, 15)
		x0 += x5
		x15 ~= x0
		x15 = bits.rotate_left32(x15, 16)
		x10 += x15
		x5 ~= x10
		x5 = bits.rotate_left32(x5, 12)
		x0 += x5
		x15 ~= x0
		x15 = bits.rotate_left32(x15, 8)
		x10 += x15
		x5 ~= x10
		x5 = bits.rotate_left32(x5, 7)

		// quarterround(x, 1, 6, 11, 12)
		x1 += x6
		x12 ~= x1
		x12 = bits.rotate_left32(x12, 16)
		x11 += x12
		x6 ~= x11
		x6 = bits.rotate_left32(x6, 12)
		x1 += x6
		x12 ~= x1
		x12 = bits.rotate_left32(x12, 8)
		x11 += x12
		x6 ~= x11
		x6 = bits.rotate_left32(x6, 7)

		// quarterround(x, 2, 7, 8, 13)
		x2 += x7
		x13 ~= x2
		x13 = bits.rotate_left32(x13, 16)
		x8 += x13
		x7 ~= x8
		x7 = bits.rotate_left32(x7, 12)
		x2 += x7
		x13 ~= x2
		x13 = bits.rotate_left32(x13, 8)
		x8 += x13
		x7 ~= x8
		x7 = bits.rotate_left32(x7, 7)

		// quarterround(x, 3, 4, 9, 14)
		x3 += x4
		x14 ~= x3
		x14 = bits.rotate_left32(x14, 16)
		x9 += x14
		x4 ~= x9
		x4 = bits.rotate_left32(x4, 12)
		x3 += x4
		x14 ~= x3
		x14 = bits.rotate_left32(x14, 8)
		x9 += x14
		x4 ~= x9
		x4 = bits.rotate_left32(x4, 7)
	}
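
	// Unlike the block function, HChaCha20 omits the feed-forward addition
	// and outputs only the first and last rows of the final state.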
	endian.unchecked_put_u32le(dst[0:4], x0)
	endian.unchecked_put_u32le(dst[4:8], x1)
	endian.unchecked_put_u32le(dst[8:12], x2)
	endian.unchecked_put_u32le(dst[12:16], x3)
	endian.unchecked_put_u32le(dst[16:20], x12)
	endian.unchecked_put_u32le(dst[20:24], x13)
	endian.unchecked_put_u32le(dst[24:28], x14)
	endian.unchecked_put_u32le(dst[28:32], x15)
}