Add BLAKE2s and ChaCha CORE SIMD function (GH #656)

The CORE functions (BLAKE2_Compress32_CORE and ChaCha_OperateKeystream_CORE) provide the shared implementation for ChaCha_OperateKeystream_ALTIVEC, ChaCha_OperateKeystream_POWER7, BLAKE2_Compress32_ALTIVEC and BLAKE2_Compress32_POWER7. Depending on the options used to compile the source files, either the POWER7 or the ALTIVEC variant will be built and used.
This is needed to support the "new toolchain, ancient hardware" use case.
pull/748/head
Jeffrey Walton 2018-11-18 14:43:48 -05:00
parent e28b2e0f02
commit 2e68e95a92
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
5 changed files with 50 additions and 18 deletions

View File

@ -714,6 +714,7 @@ ifeq ($(DETECT_FEATURES),1)
# Drop to Power4 if Power7 not available # Drop to Power4 if Power7 not available
ifeq ($(POWER7_FLAG),) ifeq ($(POWER7_FLAG),)
BLAKE2S_FLAG = $(ALTIVEC_FLAG)
CHACHA_FLAG = $(ALTIVEC_FLAG) CHACHA_FLAG = $(ALTIVEC_FLAG)
SIMON64_FLAG = $(ALTIVEC_FLAG) SIMON64_FLAG = $(ALTIVEC_FLAG)
SPECK64_FLAG = $(ALTIVEC_FLAG) SPECK64_FLAG = $(ALTIVEC_FLAG)

View File

@ -162,10 +162,10 @@ extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state);
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state); extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state);
#endif #endif
#if CRYPTOPP_ALTIVEC_AVAILABLE #if CRYPTOPP_POWER7_AVAILABLE
// BLAKE2_Compress32_POWER7 may be compiled with either -mcpu=power7 or
// -mcpu=power4. The makefile drops to POWER4 if POWER7 is not available.
extern void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state); extern void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state);
#elif CRYPTOPP_ALTIVEC_AVAILABLE
extern void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state);
#endif #endif
#if CRYPTOPP_POWER8_AVAILABLE #if CRYPTOPP_POWER8_AVAILABLE
@ -670,14 +670,12 @@ void BLAKE2s::Compress(const byte *input)
#if CRYPTOPP_POWER7_AVAILABLE #if CRYPTOPP_POWER7_AVAILABLE
if(HasPower7()) if(HasPower7())
{ {
// BLAKE2_Compress32_POWER7 compiled with -mcpu=power7 and -DCRYPTOPP_POWER7_AVAILABLE
return BLAKE2_Compress32_POWER7(input, *m_state.data()); return BLAKE2_Compress32_POWER7(input, *m_state.data());
} }
#elif CRYPTOPP_ALTIVEC_AVAILABLE #elif CRYPTOPP_ALTIVEC_AVAILABLE
if(HasAltivec()) if(HasAltivec())
{ {
// BLAKE2_Compress32_POWER7 compiled with -mcpu=power4 and -DCRYPTOPP_ALTIVEC_AVAILABLE return BLAKE2_Compress32_ALTIVEC(input, *m_state.data());
return BLAKE2_Compress32_POWER7(input, *m_state.data());
} }
#endif #endif
return BLAKE2_Compress32_CXX(input, *m_state.data()); return BLAKE2_Compress32_CXX(input, *m_state.data());

View File

@ -681,7 +681,7 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
} }
#endif // CRYPTOPP_ARM_NEON_AVAILABLE #endif // CRYPTOPP_ARM_NEON_AVAILABLE
#if (CRYPTOPP_ALTIVEC_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
inline uint32x4_p VecLoad32(const void* p) inline uint32x4_p VecLoad32(const void* p)
{ {
@ -847,7 +847,7 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
return VecPermute(a, c, mask); return VecPermute(a, c, mask);
} }
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state) void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
{ {
# define m1 m0 # define m1 m0
# define m2 m0 # define m2 m0
@ -994,6 +994,22 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
VecStore32LE(&state.h[0], VecXor(ff0, VecXor(row1, row3))); VecStore32LE(&state.h[0], VecXor(ff0, VecXor(row1, row3)));
VecStore32LE(&state.h[4], VecXor(ff1, VecXor(row2, row4))); VecStore32LE(&state.h[4], VecXor(ff1, VecXor(row2, row4)));
} }
#endif // CRYPTOPP_ALTIVEC_AVAILABLE #endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
#if (CRYPTOPP_POWER7_AVAILABLE)
// POWER7 entry point for BLAKE2s compression. It is only defined when
// CRYPTOPP_POWER7_AVAILABLE is set (see the enclosing #if). The body is a
// thin wrapper that forwards both arguments unchanged to
// BLAKE2_Compress32_CORE, which holds the implementation shared with the
// Altivec entry point.
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
{
BLAKE2_Compress32_CORE(input, state);
}
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
// Altivec entry point for BLAKE2s compression. It is only defined when
// CRYPTOPP_POWER7_AVAILABLE is not set but CRYPTOPP_ALTIVEC_AVAILABLE is
// (the #elif branch). The body is a thin wrapper that forwards both
// arguments unchanged to BLAKE2_Compress32_CORE, which holds the
// implementation shared with the POWER7 entry point.
void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
{
BLAKE2_Compress32_CORE(input, state);
}
#endif
NAMESPACE_END NAMESPACE_END

View File

@ -24,10 +24,10 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input,
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds); extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif #endif
#if (CRYPTOPP_ALTIVEC_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE)
// ChaCha_OperateKeystream_POWER7 may be compiled with either -mcpu=power7 or
// -mcpu=power4. The makefile drops to POWER4 if POWER7 is not available.
extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds); extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
extern void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif #endif
#define CHACHA_QUARTER_ROUND(a,b,c,d) \ #define CHACHA_QUARTER_ROUND(a,b,c,d) \
@ -256,7 +256,6 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
{ {
while (iterationCount >= 4 && MultiBlockSafe(4)) while (iterationCount >= 4 && MultiBlockSafe(4))
{ {
// ChaCha_OperateKeystream_POWER7 compiled with -mcpu=power7 and -DCRYPTOPP_POWER7_AVAILABLE
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds); ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds);
@ -275,9 +274,8 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
{ {
while (iterationCount >= 4 && MultiBlockSafe(4)) while (iterationCount >= 4 && MultiBlockSafe(4))
{ {
// ChaCha_OperateKeystream_POWER7 compiled with -mcpu=power4 and -DCRYPTOPP_ALTIVEC_AVAILABLE
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds); ChaCha_OperateKeystream_ALTIVEC(m_state, xorInput ? input : NULLPTR, output, m_rounds);
// MultiBlockSafe avoids overflow on the counter words // MultiBlockSafe avoids overflow on the counter words
m_state[12] += 4; m_state[12] += 4;

View File

@ -822,9 +822,12 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
#if (CRYPTOPP_ALTIVEC_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds) // ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC,
// depending on the flags used to compile this source file. The
// abstractions are handled in VecLoad, VecStore and friends.
inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{ {
const uint32x4_p state0 = VecLoad(state + 0*4); const uint32x4_p state0 = VecLoad(state + 0*4);
const uint32x4_p state1 = VecLoad(state + 1*4); const uint32x4_p state1 = VecLoad(state + 1*4);
@ -1086,6 +1089,22 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
VecStore32LE(output + 15*16, r3_3); VecStore32LE(output + 15*16, r3_3);
} }
#endif // CRYPTOPP_ALTIVEC_AVAILABLE #endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
#if (CRYPTOPP_POWER7_AVAILABLE)
// POWER7 entry point for the ChaCha keystream routine. It is only defined
// when CRYPTOPP_POWER7_AVAILABLE is set (see the enclosing #if). The body
// is a thin wrapper that forwards all four arguments unchanged to
// ChaCha_OperateKeystream_CORE, which holds the implementation shared with
// the Altivec entry point.
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
ChaCha_OperateKeystream_CORE(state, input, output, rounds);
}
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
// Altivec entry point for the ChaCha keystream routine. It is only defined
// when CRYPTOPP_POWER7_AVAILABLE is not set but CRYPTOPP_ALTIVEC_AVAILABLE
// is (the #elif branch). The body is a thin wrapper that forwards all four
// arguments unchanged to ChaCha_OperateKeystream_CORE, which holds the
// implementation shared with the POWER7 entry point.
void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
ChaCha_OperateKeystream_CORE(state, input, output, rounds);
}
#endif
NAMESPACE_END NAMESPACE_END