|
|
- // Copyright 2019 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
-
- // Based on CRYPTOGAMS code with the following comment:
- // # ====================================================================
- // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- // # project. The module is, however, dual licensed under OpenSSL and
- // # CRYPTOGAMS licenses depending on where you obtain it. For further
- // # details see http://www.openssl.org/~appro/cryptogams/.
- // # ====================================================================
-
- // Code for the perl script that generates the ppc64 assembler
- // can be found in the cryptogams repository at the link below. It is based on
- // the original from openssl.
-
- // https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
-
- // The differences in this and the original implementation are
- // due to the calling conventions and initialization of constants.
-
- // +build !gccgo,!appengine
-
- #include "textflag.h"
-
- #define OUT R3
- #define INP R4
- #define LEN R5
- #define KEY R6
- #define CNT R7
- #define TMP R15
-
- #define CONSTBASE R16
- #define BLOCKS R17
-
// Constant table for the VSX ChaCha20 code, 0xa0 bytes:
//   0x00-0x0f: the ChaCha sigma constant "expand 32-byte k",
//              little-endian ("expa nd 3" / "2-by te k")
//   0x10-0x1f: {1, 0, 0, 0} word vector (not referenced by the
//              visible routine; presumably for a companion path)
//   0x20-0x2f: {4, 0, 0, 0} word vector (likewise unreferenced here)
//   0x30-0x4f: byte-permutation masks — presumably endianness/lane
//              shuffles for another build; not loaded below (the code
//              rebases CONSTBASE past them) — TODO confirm
//   0x50-0x8f: the four sigma words, each splatted across all four
//              32-bit lanes (loaded as V0-V3 each outer iteration)
//   0x90-0x9f: {0, 1, 2, 3} per-lane counter offsets for the
//              four-blocks-at-a-time path (loaded into V28)
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
DATA consts<>+0x18(SB)/8, $0x0000000000000000
DATA consts<>+0x20(SB)/8, $0x0000000000000004
DATA consts<>+0x28(SB)/8, $0x0000000000000000
DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
DATA consts<>+0x38(SB)/8, $0x0203000106070405
DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
DATA consts<>+0x48(SB)/8, $0x0102030005060704
DATA consts<>+0x50(SB)/8, $0x6170786561707865
DATA consts<>+0x58(SB)/8, $0x6170786561707865
DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
GLOBL consts<>(SB), RODATA, $0xa0
-
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
//
// XORs len bytes of keystream into inp, writing to out, generating four
// 64-byte ChaCha20 blocks per outer iteration with the state transposed
// across vector lanes (one block per 32-bit lane of V0-V15). On exit the
// 32-bit block counter at *counter is advanced by len/64.
//
// Register roles (see #defines above): OUT/INP/LEN/KEY/CNT hold the Go
// arguments, CONSTBASE points into consts<>, BLOCKS = len/64,
// R8/R9/R10/R11 hold the fixed offsets 16/32/48/64 used for indexed
// vector loads/stores.
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS // BLOCKS = len / 64
	// V16: sigma "expand 32-byte k"
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80,CONSTBASE // rebase to the splatted-sigma rows at 0x50

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	// V28: per-lane counter offsets {0,1,2,3} (consts<>+0x90)
	LXVW4X (CONSTBASE)(R11), VS60

	// splat counter slot from V19 -> V26
	VSPLTW $0, V19, V26

	// Shift the counter word out of V19, leaving the nonce words.
	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

	// V26 = counter + {0,1,2,3}: one counter value per lane/block.
	VADDUWM V26, V28, V26

	// CTR = 10 double rounds (20 ChaCha rounds) for the inner loop.
	MOVD $10, R14
	MOVD R14, CTR

loop_outer_vsx:
	// V0, V1, V2, V3: sigma words splatted across lanes
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat key words from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// V12 = per-lane counters (VOR used as vector move)
	VOR V26, V26, V12

	// splat nonce words from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// Rotation amounts for the quarter rounds: V27=16 (from -16&31),
	// V28=12, V29=8, V30=7.
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30

loop_vsx:
	// Column quarter rounds: a+=b; d^=a; d<<<=16; ...
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14
	VRLW V15, V27, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VXOR V12, V0, V12
	VXOR V13, V1, V13
	VXOR V14, V2, V14
	VXOR V15, V3, V15

	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14
	VRLW V15, V29, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

	// Diagonal quarter rounds (operand rotation replaces the shuffle).
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V27, V15
	VRLW V12, V27, V12
	VRLW V13, V27, V13
	VRLW V14, V27, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VXOR V15, V0, V15
	VXOR V12, V1, V12
	VXOR V13, V2, V13
	VXOR V14, V3, V14

	VRLW V15, V29, V15
	VRLW V12, V29, V12
	VRLW V13, V29, V13
	VRLW V14, V29, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BC 16, LT, loop_vsx // bdnz: loop for 10 double rounds

	// Add the per-lane counters into the counter row of the state.
	VADDUWM V12, V26, V12

	// Transpose the lane-sliced state back into four contiguous
	// 64-byte blocks using VMRGEW/VMRGOW (WORD-encoded: not in the
	// assembler at the time) and XXPERMDI.
	WORD $0x13600F8C // VMRGEW V0, V1, V27
	WORD $0x13821F8C // VMRGEW V2, V3, V28

	WORD $0x10000E8C // VMRGOW V0, V1, V0
	WORD $0x10421E8C // VMRGOW V2, V3, V2

	WORD $0x13A42F8C // VMRGEW V4, V5, V29
	WORD $0x13C63F8C // VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	WORD $0x10842E8C // VMRGOW V4, V5, V4
	WORD $0x10C63E8C // VMRGOW V6, V7, V6

	WORD $0x13684F8C // VMRGEW V8, V9, V27
	WORD $0x138A5F8C // VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	WORD $0x11084E8C // VMRGOW V8, V9, V8
	WORD $0x114A5E8C // VMRGOW V10, V11, V10

	WORD $0x13AC6F8C // VMRGEW V12, V13, V29
	WORD $0x13CE7F8C // VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	WORD $0x118C6E8C // VMRGOW V12, V13, V12
	WORD $0x11CE7E8C // VMRGOW V14, V15, V14

	// Advance the per-lane counters by 4 for the next outer iteration.
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	// Block 1: add the input state (feed-forward).
	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	// Bottom of loop: XOR 64 bytes of input with the keystream block.
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx // LEN was exactly 64 before the decrement

	// Block 2
	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62
	VXOR V27, V0, V27

	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	// FIX: index register must be the GPR R10 (offset 48), matching the
	// other three store blocks; V10 is a vector register and is not a
	// valid index here.
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Block 3
	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT
	BEQ done_vsx

	// Block 4
	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	CMPU LEN, $64
	BLT tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD $64, OUT

	// Reload the round count and go again if data remains.
	MOVD $10, R14
	MOVD R14, CTR
	BNE loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks
	MOVD (CNT), R14
	ADD BLOCKS, R14
	MOVD R14, (CNT)
	RET

tail_vsx:
	// Fewer than 64 bytes remain: spill the current keystream block to
	// the stack frame and XOR byte by byte.
	ADD $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	// Pre-decrement pointers so MOVBZU/MOVBU pre-increment addressing
	// starts at byte 0.
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT

looptail_vsx:
	// Copying the result to OUT
	// in bytes.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR KEY, TMP, KEY
	MOVBU KEY, 1(OUT)
	BC 16, LT, looptail_vsx // bdnz: LEN bytes total

	// Clear the keystream values from the stack
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR done_vsx
|