Drop ChaCha requirements to POWER7
This costs about 0.6 cpb (700 MB/s on GCC112), but it makes the faster algorithm available to more machines. In the future we may want to provide both POWER7 and POWER8.
pull/748/head
parent
b3941a433d
commit
225ab6cb7b
24
chacha.cpp
24
chacha.cpp
|
|
@ -24,8 +24,8 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input,
|
||||||
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define CHACHA_QUARTER_ROUND(a,b,c,d) \
|
#define CHACHA_QUARTER_ROUND(a,b,c,d) \
|
||||||
|
|
@ -81,9 +81,9 @@ std::string ChaCha_Policy::AlgorithmProvider() const
|
||||||
return "NEON";
|
return "NEON";
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
if (HasPower8())
|
if (HasPower7())
|
||||||
return "Power8";
|
return "Power7";
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
return "C++";
|
return "C++";
|
||||||
|
|
@ -139,8 +139,8 @@ unsigned int ChaCha_Policy::GetAlignment() const
|
||||||
return 16;
|
return 16;
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
if (HasPower8())
|
if (HasPower7())
|
||||||
return 16;
|
return 16;
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -164,8 +164,8 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
|
||||||
return 4*BYTES_PER_ITERATION;
|
return 4*BYTES_PER_ITERATION;
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
if (HasPower8())
|
if (HasPower7())
|
||||||
return 4*BYTES_PER_ITERATION;
|
return 4*BYTES_PER_ITERATION;
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -245,13 +245,13 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
if (HasPower8())
|
if (HasPower7())
|
||||||
{
|
{
|
||||||
while (iterationCount >= 4 && MultiBlockSafe(4))
|
while (iterationCount >= 4 && MultiBlockSafe(4))
|
||||||
{
|
{
|
||||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||||
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||||
|
|
||||||
// MultiBlockSafe avoids overflow on the counter words
|
// MultiBlockSafe avoids overflow on the counter words
|
||||||
m_state[12] += 4;
|
m_state[12] += 4;
|
||||||
|
|
|
||||||
|
|
@ -54,7 +54,7 @@
|
||||||
# include <arm_acle.h>
|
# include <arm_acle.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CRYPTOPP_POWER8_AVAILABLE)
|
#if defined(CRYPTOPP_POWER7_AVAILABLE)
|
||||||
# include "ppc_simd.h"
|
# include "ppc_simd.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
@ -201,16 +201,25 @@ inline __m128i RotateLeft<16>(const __m128i val)
|
||||||
|
|
||||||
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
|
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
|
||||||
|
|
||||||
// **************************** POWER8 **************************** //
|
// **************************** POWER7 **************************** //
|
||||||
|
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
|
|
||||||
|
// POWER8 is optional and runs about 0.6 cpb faster because
|
||||||
|
// of the native 64-bit vector add. That's about 700 MB/s on
|
||||||
|
// GCC112 from the compile farm. Use -mcpu=power8 to engage
|
||||||
|
// POWER8. POWER7 lacks 64-bit element support, so code built
|
||||||
|
// with -mcpu=power8 will SIGILL on POWER7 machines.
|
||||||
|
|
||||||
using CryptoPP::uint8x16_p;
|
using CryptoPP::uint8x16_p;
|
||||||
using CryptoPP::uint32x4_p;
|
using CryptoPP::uint32x4_p;
|
||||||
using CryptoPP::uint64x2_p;
|
|
||||||
using CryptoPP::VectorLoad;
|
using CryptoPP::VectorLoad;
|
||||||
using CryptoPP::VectorStore;
|
using CryptoPP::VectorStore;
|
||||||
|
|
||||||
|
#if (_ARCH_PWR8 || _ARCH_PWR9)
|
||||||
|
using CryptoPP::uint64x2_p;
|
||||||
|
#endif
|
||||||
|
|
||||||
// Permutes bytes in packed 32-bit words to little endian.
|
// Permutes bytes in packed 32-bit words to little endian.
|
||||||
// State is already in proper endian order. Input and
|
// State is already in proper endian order. Input and
|
||||||
// output must be permuted during load and save.
|
// output must be permuted during load and save.
|
||||||
|
|
@ -290,10 +299,22 @@ inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
|
||||||
// Helper to perform 64-bit addition across two elements of 32-bit vectors
|
// Helper to perform 64-bit addition across two elements of 32-bit vectors
|
||||||
inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b)
|
inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b)
|
||||||
{
|
{
|
||||||
|
#if (_ARCH_PWR8 || _ARCH_PWR9)
|
||||||
return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
|
return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
|
||||||
|
#else
|
||||||
|
// The carry mask selects carries from elements 1 and 3 and sets remaining
|
||||||
|
// elements to 0. The mask also shifts the carried values left by 4 bytes
|
||||||
|
// so the carries are added to elements 0 and 2.
|
||||||
|
const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
|
||||||
|
const uint32x4_p zero = {0, 0, 0, 0};
|
||||||
|
|
||||||
|
uint32x4_p cy = vec_addc(a, b);
|
||||||
|
cy = vec_perm(cy, zero, cmask);
|
||||||
|
return vec_add(vec_add(a, b), cy);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // CRYPTOPP_POWER8_AVAILABLE
|
#endif // CRYPTOPP_POWER7_AVAILABLE
|
||||||
|
|
||||||
ANONYMOUS_NAMESPACE_END
|
ANONYMOUS_NAMESPACE_END
|
||||||
|
|
||||||
|
|
@ -835,9 +856,9 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
|
||||||
|
|
||||||
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
|
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
|
||||||
|
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER7_AVAILABLE)
|
||||||
|
|
||||||
void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
||||||
{
|
{
|
||||||
const uint32x4_p state0 = VectorLoad(state + 0*4);
|
const uint32x4_p state0 = VectorLoad(state + 0*4);
|
||||||
const uint32x4_p state1 = VectorLoad(state + 1*4);
|
const uint32x4_p state1 = VectorLoad(state + 1*4);
|
||||||
|
|
@ -1099,6 +1120,6 @@ void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte
|
||||||
VectorStore32LE(output + 15*16, r3_3);
|
VectorStore32LE(output + 15*16, r3_3);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // CRYPTOPP_POWER8_AVAILABLE
|
#endif // CRYPTOPP_POWER7_AVAILABLE
|
||||||
|
|
||||||
NAMESPACE_END
|
NAMESPACE_END
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue