Add BLAKE2s and ChaCha CORE SIMD functions (GH #656)
The CORE functions provide the implementation for ChaCha_OperateKeystream_ALTIVEC, ChaCha_OperateKeystream_POWER7, BLAKE2_Compress32_ALTIVEC and BLAKE2_Compress32_POWER7. Depending on the options used to compile the source files, either POWER7 or ALTIVEC will be used. This is needed to support the "new toolchain, ancient hardware" use case.
pull/748/head
parent
e28b2e0f02
commit
2e68e95a92
|
|
@ -714,6 +714,7 @@ ifeq ($(DETECT_FEATURES),1)
|
||||||
|
|
||||||
# Drop to Power4 if Power7 not available
|
# Drop to Power4 if Power7 not available
|
||||||
ifeq ($(POWER7_FLAG),)
|
ifeq ($(POWER7_FLAG),)
|
||||||
|
BLAKE2S_FLAG = $(ALTIVEC_FLAG)
|
||||||
CHACHA_FLAG = $(ALTIVEC_FLAG)
|
CHACHA_FLAG = $(ALTIVEC_FLAG)
|
||||||
SIMON64_FLAG = $(ALTIVEC_FLAG)
|
SIMON64_FLAG = $(ALTIVEC_FLAG)
|
||||||
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
SPECK64_FLAG = $(ALTIVEC_FLAG)
|
||||||
|
|
|
||||||
10
blake2.cpp
10
blake2.cpp
|
|
@ -162,10 +162,10 @@ extern void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state);
|
||||||
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state);
|
extern void BLAKE2_Compress64_NEON(const byte* input, BLAKE2b_State& state);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if CRYPTOPP_ALTIVEC_AVAILABLE
|
#if CRYPTOPP_POWER7_AVAILABLE
|
||||||
// BLAKE2_Compress32_POWER7 may be compiled with either -mcpu=power7 or
|
|
||||||
// -mcpu=power4. The makefile drops to POWER4 if POWER7 is not available.
|
|
||||||
extern void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state);
|
extern void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state);
|
||||||
|
#elif CRYPTOPP_ALTIVEC_AVAILABLE
|
||||||
|
extern void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if CRYPTOPP_POWER8_AVAILABLE
|
#if CRYPTOPP_POWER8_AVAILABLE
|
||||||
|
|
@ -670,14 +670,12 @@ void BLAKE2s::Compress(const byte *input)
|
||||||
#if CRYPTOPP_POWER7_AVAILABLE
|
#if CRYPTOPP_POWER7_AVAILABLE
|
||||||
if(HasPower7())
|
if(HasPower7())
|
||||||
{
|
{
|
||||||
// BLAKE2_Compress32_POWER7 compiled with -mcpu=power7 and -DCRYPTOPP_POWER7_AVAILABLE
|
|
||||||
return BLAKE2_Compress32_POWER7(input, *m_state.data());
|
return BLAKE2_Compress32_POWER7(input, *m_state.data());
|
||||||
}
|
}
|
||||||
#elif CRYPTOPP_ALTIVEC_AVAILABLE
|
#elif CRYPTOPP_ALTIVEC_AVAILABLE
|
||||||
if(HasAltivec())
|
if(HasAltivec())
|
||||||
{
|
{
|
||||||
// BLAKE2_Compress32_POWER7 compiled with -mcpu=power4 and -DCRYPTOPP_ALTIVEC_AVAILABLE
|
return BLAKE2_Compress32_ALTIVEC(input, *m_state.data());
|
||||||
return BLAKE2_Compress32_POWER7(input, *m_state.data());
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
return BLAKE2_Compress32_CXX(input, *m_state.data());
|
return BLAKE2_Compress32_CXX(input, *m_state.data());
|
||||||
|
|
|
||||||
|
|
@ -681,7 +681,7 @@ void BLAKE2_Compress32_NEON(const byte* input, BLAKE2s_State& state)
|
||||||
}
|
}
|
||||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||||
|
|
||||||
inline uint32x4_p VecLoad32(const void* p)
|
inline uint32x4_p VecLoad32(const void* p)
|
||||||
{
|
{
|
||||||
|
|
@ -847,7 +847,7 @@ uint32x4_p VectorSet32<3,1,3,1>(const uint32x4_p a, const uint32x4_p b,
|
||||||
return VecPermute(a, c, mask);
|
return VecPermute(a, c, mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
|
void BLAKE2_Compress32_CORE(const byte* input, BLAKE2s_State& state)
|
||||||
{
|
{
|
||||||
# define m1 m0
|
# define m1 m0
|
||||||
# define m2 m0
|
# define m2 m0
|
||||||
|
|
@ -994,6 +994,22 @@ void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
|
||||||
VecStore32LE(&state.h[0], VecXor(ff0, VecXor(row1, row3)));
|
VecStore32LE(&state.h[0], VecXor(ff0, VecXor(row1, row3)));
|
||||||
VecStore32LE(&state.h[4], VecXor(ff1, VecXor(row2, row4)));
|
VecStore32LE(&state.h[4], VecXor(ff1, VecXor(row2, row4)));
|
||||||
}
|
}
|
||||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
#endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
|
||||||
|
|
||||||
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
|
|
||||||
|
void BLAKE2_Compress32_POWER7(const byte* input, BLAKE2s_State& state)
|
||||||
|
{
|
||||||
|
BLAKE2_Compress32_CORE(input, state);
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||||
|
|
||||||
|
void BLAKE2_Compress32_ALTIVEC(const byte* input, BLAKE2s_State& state)
|
||||||
|
{
|
||||||
|
BLAKE2_Compress32_CORE(input, state);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_END
|
NAMESPACE_END
|
||||||
|
|
|
||||||
10
chacha.cpp
10
chacha.cpp
|
|
@ -24,10 +24,10 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input,
|
||||||
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
// ChaCha_OperateKeystream_POWER7 may be compiled with either -mcpu=power7 or
|
|
||||||
// -mcpu=power4. The makefile drops to POWER4 if POWER7 is not available.
|
|
||||||
extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||||
|
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||||
|
extern void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define CHACHA_QUARTER_ROUND(a,b,c,d) \
|
#define CHACHA_QUARTER_ROUND(a,b,c,d) \
|
||||||
|
|
@ -256,7 +256,6 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
|
||||||
{
|
{
|
||||||
while (iterationCount >= 4 && MultiBlockSafe(4))
|
while (iterationCount >= 4 && MultiBlockSafe(4))
|
||||||
{
|
{
|
||||||
// ChaCha_OperateKeystream_POWER7 compiled with -mcpu=power7 and -DCRYPTOPP_POWER7_AVAILABLE
|
|
||||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||||
ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||||
|
|
||||||
|
|
@ -275,9 +274,8 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
|
||||||
{
|
{
|
||||||
while (iterationCount >= 4 && MultiBlockSafe(4))
|
while (iterationCount >= 4 && MultiBlockSafe(4))
|
||||||
{
|
{
|
||||||
// ChaCha_OperateKeystream_POWER7 compiled with -mcpu=power4 and -DCRYPTOPP_ALTIVEC_AVAILABLE
|
|
||||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||||
ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
ChaCha_OperateKeystream_ALTIVEC(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||||
|
|
||||||
// MultiBlockSafe avoids overflow on the counter words
|
// MultiBlockSafe avoids overflow on the counter words
|
||||||
m_state[12] += 4;
|
m_state[12] += 4;
|
||||||
|
|
|
||||||
|
|
@ -822,9 +822,12 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
|
||||||
|
|
||||||
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
|
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
|
||||||
|
|
||||||
#if (CRYPTOPP_ALTIVEC_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||||
|
|
||||||
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
// ChaCha_OperateKeystream_CORE will use either POWER7 or ALTIVEC,
|
||||||
|
// depending on the flags used to compile this source file. The
|
||||||
|
// abstractions are handled in VecLoad, VecStore and friends.
|
||||||
|
inline void ChaCha_OperateKeystream_CORE(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
||||||
{
|
{
|
||||||
const uint32x4_p state0 = VecLoad(state + 0*4);
|
const uint32x4_p state0 = VecLoad(state + 0*4);
|
||||||
const uint32x4_p state1 = VecLoad(state + 1*4);
|
const uint32x4_p state1 = VecLoad(state + 1*4);
|
||||||
|
|
@ -1086,6 +1089,22 @@ void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte
|
||||||
VecStore32LE(output + 15*16, r3_3);
|
VecStore32LE(output + 15*16, r3_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // CRYPTOPP_ALTIVEC_AVAILABLE
|
#endif // CRYPTOPP_POWER7_AVAILABLE || CRYPTOPP_ALTIVEC_AVAILABLE
|
||||||
|
|
||||||
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
|
|
||||||
|
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
||||||
|
{
|
||||||
|
ChaCha_OperateKeystream_CORE(state, input, output, rounds);
|
||||||
|
}
|
||||||
|
|
||||||
|
#elif (CRYPTOPP_ALTIVEC_AVAILABLE)
|
||||||
|
|
||||||
|
void ChaCha_OperateKeystream_ALTIVEC(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
||||||
|
{
|
||||||
|
ChaCha_OperateKeystream_CORE(state, input, output, rounds);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_END
|
NAMESPACE_END
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue