// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================
// The Perl script that generates this ppc64 assembly can be found in the
// cryptogams repository at the link below. It is based on the original
// from OpenSSL.
// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
// The differences between this and the original implementation are
// due to the calling conventions and the initialization of constants.
// +build !gccgo,!appengine

#include "textflag.h"
#define OUT  R3
#define INP  R4
#define LEN  R5
#define KEY  R6
#define CNT  R7
#define TMP  R15

#define CONSTBASE  R16
#define BLOCKS R17
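
// The constant table below holds, top to bottom: the ChaCha sigma constant
// "expand 32-byte k" in little-endian byte order; the vectors {1, 0, 0, 0}
// and {4, 0, 0, 0}; what appear to be byte-permutation masks for 16- and
// 8-bit rotates, carried over from the CRYPTOGAMS original; the four sigma
// words, each splatted across a full vector, forming the first row of the
// four parallel states; and the per-lane block counter offsets {0, 1, 2, 3}.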
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0
// func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB), NOSPLIT, $64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD  $6, LEN, BLOCKS
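	// BLOCKS = LEN / 64: the number of whole 64-byte blocks, used at the
	// end to advance the caller's counter.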
	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	ADD    $80, CONSTBASE

	// Load key into V17, V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28
	LXVW4X (CONSTBASE)(R11), VS60

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	VADDUWM V26, V28, V26

	MOVD $10, R14
	MOVD R14, CTR
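	// V26 now holds the per-lane block counters {n, n+1, n+2, n+3}, and
	// CTR counts the ten double-rounds of the inner loop (20 rounds total).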
loop_outer_vsx:
	// V0, V1, V2, V3
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// VOR
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
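	// V27-V30 hold the quarter-round rotate amounts 16, 12, 8 and 7:
	// VRLW only uses the low 5 bits of each element, so the -16 splatted
	// by VSPLTISW rotates by 16.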
loop_vsx:
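	// Column round: a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12;
	// a += b; d ^= a; d <<<= 8; c += d; b ^= c; b <<<= 7, applied to the
	// columns (V0,V4,V8,V12) ... (V3,V7,V11,V15), four blocks per lane.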
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
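	// Diagonal round: the same quarter-round applied to the diagonals
	// (V0,V5,V10,V15), (V1,V6,V11,V12), (V2,V7,V8,V13), (V3,V4,V9,V14).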
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4

	BC 16, LT, loop_vsx
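	// 20 rounds done. Add the block counters back into V12, then
	// transpose: the VMRGEW/VMRGOW pairs (emitted as WORD, apparently
	// because the assembler did not know these mnemonics) and the
	// XXPERMDIs regroup the sixteen vectors so each 64-byte block is
	// contiguous in V0/V4/V8/V12, V1/V5/V9/V13, V2/V6/V10/V14 and
	// V3/V7/V11/V15.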
	VADDUWM V12, V26, V12

	WORD $0x13600F8C // VMRGEW V0, V1, V27
	WORD $0x13821F8C // VMRGEW V2, V3, V28

	WORD $0x10000E8C // VMRGOW V0, V1, V0
	WORD $0x10421E8C // VMRGOW V2, V3, V2

	WORD $0x13A42F8C // VMRGEW V4, V5, V29
	WORD $0x13C63F8C // VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C // VMRGOW V4, V5, V4
	WORD $0x10C63E8C // VMRGOW V6, V7, V6

	WORD $0x13684F8C // VMRGEW V8, V9, V27
	WORD $0x138A5F8C // VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C // VMRGOW V8, V9, V8
	WORD $0x114A5E8C // VMRGOW V10, V11, V10

	WORD $0x13AC6F8C // VMRGEW V12, V13, V29
	WORD $0x13CE7F8C // VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C // VMRGOW V12, V13, V12
	WORD $0x11CE7E8C // VMRGOW V14, V15, V14

	VSPLTISW $4, V27
	VADDUWM  V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46
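	// Keystream for the first block: add the input state
	// (sigma, key, counter/nonce) back into the round output.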
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
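	// The EQ bit of this compare is reused after the stores below: if LEN
	// was exactly 64, this block is the last one.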
	BLT tail_vsx

	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

	BEQ done_vsx
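	// Second block: keystream from lane 1 (V1, V5, V9, V13).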
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	BEQ done_vsx
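	// Third block: keystream from lane 2 (V2, V6, V10, V14).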
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

	BEQ done_vsx
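	// Fourth block: keystream from lane 3 (V3, V7, V11, V15).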
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
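	// Reset the round counter and, while LEN is still nonzero, run
	// another four-block outer iteration.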
	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx
done_vsx:
	// Increment the counter by the number of 64-byte blocks processed.
	MOVD (CNT), R14
	ADD  BLOCKS, R14
	MOVD R14, (CNT)
	RET
tail_vsx:
	ADD  $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)

	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
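	// The pointers are pre-decremented because MOVBZU/MOVBU below use
	// update (pre-increment) addressing.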
looptail_vsx:
	// Copy the result to OUT, one byte at a time.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BC     16, LT, looptail_vsx
	// Clear the stack values
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)

	BR done_vsx