From 2e68e95a928a921db1cb97d4433f6a4ff09fcac8 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 18 Nov 2018 14:43:48 -0500 Subject: [PATCH] Add BLAKE2s and ChaCha CORE SIMD function (GH #656) The CORE function provides the implementation for ChaCha_OperateKeystream_ALTIVEC, ChaCha_OperateKeystream_POWER7, BLAKE2_Compress32_ALTIVEC and BLAKE2_Compress32_POWER7. Depending on the options used to compile the source files, either POWER7 or ALTIVEC will be used. This is needed to support the "new toolchain, ancient hardware" use case. --- GNUmakefile | 1 + blake2.cpp | 10 ++++------ blake2s_simd.cpp | 22 +++++++++++++++++++--- chacha.cpp | 10 ++++------ chacha_simd.cpp | 25 ++++++++++++++++++++++--- 5 files changed, 50 insertions(+), 18 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index ee8ef33f..5d72a471 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -714,6 +714,7 @@ ifeq ($(DETECT_FEATURES),1) # Drop to Power4 if Power7 not available ifeq ($(POWER7_FLAG),) + BLAKE2S_FLAG = $(ALTIVEC_FLAG) CHACHA_FLAG = $(ALTIVEC_FLAG) SIMON64_FLAG = $(ALTIVEC_FLAG) SPECK64_FLAG = $(ALTIVEC_FLAG) diff --git a/blake2.cpp b/blake2.cpp index 79b79c82..b0c54af2 100644 --- a/blake2.cpp +++ b/blake2.cpp @@ -162,10 +162,10 @@ extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state); extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state); #endif -#if CRYPTOPP_ALTIVEC_AVAILABLE -// BLAKE2_Compress32_POWER7 may be compiled with either -mcpu=power7 or -// -mcpu=power4. The makefile drops to POWER4 if POWER7 is not available. +#if CRYPTOPP_POWER7_AVAILABLE extern void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state); +#elif CRYPTOPP_ALTIVEC_AVAILABLE +extern void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state); #endif #if CRYPTOPP_POWER8_AVAILABLE @@ -670,14 +670,12 @@ void BLAKE2s::Compress(const byte *input) #if CRYPTOPP_POWER7_AVAILABLE if(HasPower7()) { - // BLAKE2_Compress32_POWER7 compiled with -mcpu=power7 and -DCRYPTOPP_POWER7_AVAILABLE return BLAKE2_Compress32_POWER7(input, *m_state.data()); } #elif CRYPTOPP_ALTIVEC_AVAILABLE if(HasAltivec()) { - // BLAKE2_Compress32_POWER7 compiled with -mcpu=power4 and -DCRYPTOPP_ALTIVEC_AVAILABLE - return BLAKE2_Compress32_POWER7(input, *m_state.data()); + return BLAKE2_Compress32_ALTIVEC(input, *m_state.data()); } #endif return BLAKE2_Compress32_CXX(input, *m_state.data()); diff --git a/blake2s_simd.cpp b/blake2s_simd.cpp index 1456ad23..1d756f9e 100644 --- a/blake2s_simd.cpp +++ b/blake2s_simd.cpp @@ -681,7 +681,7 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state) } #endif // CRYPTOPP_ARM_NEON_AVAILABLE -#if (CRYPTOPP_ALTIVEC_AVAILABLE) +#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE) inline uint32x4_p VecLoad32(const void* p) { @@ -847,7 +847,7 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b, return VecPermute(a, c, mask); } -void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) +void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state) { # define m1 m0 # define m2 m0 @@ -994,6 +994,22 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) VecStore32LE(&state.h[0], VecXor(ff0, VecXor(row1, row3))); VecStore32LE(&state.h[4], VecXor(ff1, VecXor(row2, row4))); } -#endif // CRYPTOPP_ALTIVEC_AVAILABLE +#endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE + +#if (CRYPTOPP_POWER7_AVAILABLE) + +void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) +{ + BLAKE2_Compress32_CORE(input, state); +} + +#elif (CRYPTOPP_ALTIVEC_AVAILABLE) + +void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state) +{ + BLAKE2_Compress32_CORE(input, state); +} + +#endif NAMESPACE_END diff --git a/chacha.cpp b/chacha.cpp index 05075b17..43721b7b 100644 --- a/chacha.cpp +++ b/chacha.cpp @@ -24,10 +24,10 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif -#if (CRYPTOPP_ALTIVEC_AVAILABLE) -// ChaCha_OperateKeystream_POWER7 may be compiled with either -mcpu=power7 or -// -mcpu=power4. The makefile drops to POWER4 if POWER7 is not available. +#if (CRYPTOPP_POWER7_AVAILABLE) extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds); +#elif (CRYPTOPP_ALTIVEC_AVAILABLE) +extern void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif #define CHACHA_QUARTER_ROUND(a,b,c,d) \ @@ -256,7 +256,6 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, { while (iterationCount >= 4 && MultiBlockSafe(4)) { - // ChaCha_OperateKeystream_POWER7 compiled with -mcpu=power7 and -DCRYPTOPP_POWER7_AVAILABLE const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds); @@ -275,9 +274,8 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, { while (iterationCount >= 4 && MultiBlockSafe(4)) { - // ChaCha_OperateKeystream_POWER7 compiled with -mcpu=power4 and -DCRYPTOPP_ALTIVEC_AVAILABLE const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; - ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds); + ChaCha_OperateKeystream_ALTIVEC(m_state, xorInput ? input : NULLPTR, output, m_rounds); // MultiBlockSafe avoids overflow on the counter words m_state[12] += 4; diff --git a/chacha_simd.cpp b/chacha_simd.cpp index 9a0bd6c3..0945c597 100644 --- a/chacha_simd.cpp +++ b/chacha_simd.cpp @@ -822,9 +822,12 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte * #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE -#if (CRYPTOPP_ALTIVEC_AVAILABLE) +#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE) -void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds) +// ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC, +// depending on the flags used to compile this source file. The +// abstractions are handled in VecLoad, VecStore and friends. +inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds) { const uint32x4_p state0 = VecLoad(state + 0*4); const uint32x4_p state1 = VecLoad(state + 1*4); @@ -1086,6 +1089,22 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte VecStore32LE(output + 15*16, r3_3); } -#endif // CRYPTOPP_ALTIVEC_AVAILABLE +#endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE + +#if (CRYPTOPP_POWER7_AVAILABLE) + +void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds) +{ + ChaCha_OperateKeystream_CORE(state, input, output, rounds); +} + +#elif (CRYPTOPP_ALTIVEC_AVAILABLE) + +void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds) +{ + ChaCha_OperateKeystream_CORE(state, input, output, rounds); +} + +#endif NAMESPACE_END