Drop ChaCha requirements to POWER7

This costs about 0.6 cpb (700 MB/s on GCC112), but it makes the faster algorithm available to more machines. In the future we may want to provide both POWER7 and POWER8
pull/748/head
Jeffrey Walton 2018-11-14 08:19:13 -05:00
parent b3941a433d
commit 225ab6cb7b
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 41 additions and 20 deletions

View File

@ -24,8 +24,8 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input,
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds); extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif #endif
#if (CRYPTOPP_POWER8_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE)
extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds); extern void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif #endif
#define CHACHA_QUARTER_ROUND(a,b,c,d) \ #define CHACHA_QUARTER_ROUND(a,b,c,d) \
@ -81,9 +81,9 @@ std::string ChaCha_Policy::AlgorithmProvider() const
return "NEON"; return "NEON";
else else
#endif #endif
#if (CRYPTOPP_POWER8_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower8()) if (HasPower7())
return "Power8"; return "Power7";
else else
#endif #endif
return "C++"; return "C++";
@ -139,8 +139,8 @@ unsigned int ChaCha_Policy::GetAlignment() const
return 16; return 16;
else else
#endif #endif
#if (CRYPTOPP_POWER8_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower8()) if (HasPower7())
return 16; return 16;
else else
#endif #endif
@ -164,8 +164,8 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
return 4*BYTES_PER_ITERATION; return 4*BYTES_PER_ITERATION;
else else
#endif #endif
#if (CRYPTOPP_POWER8_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower8()) if (HasPower7())
return 4*BYTES_PER_ITERATION; return 4*BYTES_PER_ITERATION;
else else
#endif #endif
@ -245,13 +245,13 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
} }
#endif #endif
#if (CRYPTOPP_POWER8_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE)
if (HasPower8()) if (HasPower7())
{ {
while (iterationCount >= 4 && MultiBlockSafe(4)) while (iterationCount >= 4 && MultiBlockSafe(4))
{ {
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds); ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds);
// MultiBlockSafe avoids overflow on the counter words // MultiBlockSafe avoids overflow on the counter words
m_state[12] += 4; m_state[12] += 4;

View File

@ -54,7 +54,7 @@
# include <arm_acle.h> # include <arm_acle.h>
#endif #endif
#if defined(CRYPTOPP_POWER8_AVAILABLE) #if defined(CRYPTOPP_POWER7_AVAILABLE)
# include "ppc_simd.h" # include "ppc_simd.h"
#endif #endif
@ -201,16 +201,25 @@ inline __m128i RotateLeft<16>(const __m128i val)
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
// **************************** POWER8 **************************** // // **************************** POWER7 **************************** //
#if (CRYPTOPP_POWER8_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE)
// POWER8 is optional and runs about 0.6 cpb faster because
// of the native 64-bit vector add. That's about 700 MB/s on
// GCC112 from the compile farm. Use -mcpu=power8 to engage
// POWER8. POWER7 lacks 64-bit element support, so code built
// with -mcpu=power8 will SIGILL on POWER7 machines.
using CryptoPP::uint8x16_p; using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p; using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;
using CryptoPP::VectorLoad; using CryptoPP::VectorLoad;
using CryptoPP::VectorStore; using CryptoPP::VectorStore;
#if (_ARCH_PWR8 || _ARCH_PWR9)
using CryptoPP::uint64x2_p;
#endif
// Permutes bytes in packed 32-bit words to little endian. // Permutes bytes in packed 32-bit words to little endian.
// State is already in proper endian order. Input and // State is already in proper endian order. Input and
// output must be permuted during load and save. // output must be permuted during load and save.
@ -290,10 +299,22 @@ inline uint32x4_p Shuffle<3>(const uint32x4_p& val)
// Helper to perform 64-bit addition across two elements of 32-bit vectors // Helper to perform 64-bit addition across two elements of 32-bit vectors
inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b) inline uint32x4_p VectorAdd64(const uint32x4_p& a, const uint32x4_p& b)
{ {
#if (_ARCH_PWR8 || _ARCH_PWR9)
return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b); return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
#else
// The carry mask selects carries from elements 1 and 3 and sets remaining
// elements to 0. The mask also shifts the carried values left by 4 bytes
// so the carries are added to elements 0 and 2.
const uint8x16_p cmask = {4,5,6,7, 16,16,16,16, 12,13,14,15, 16,16,16,16};
const uint32x4_p zero = {0, 0, 0, 0};
uint32x4_p cy = vec_addc(a, b);
cy = vec_perm(cy, zero, cmask);
return vec_add(vec_add(a, b), cy);
#endif
} }
#endif // CRYPTOPP_POWER8_AVAILABLE #endif // CRYPTOPP_POWER7_AVAILABLE
ANONYMOUS_NAMESPACE_END ANONYMOUS_NAMESPACE_END
@ -835,9 +856,9 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
#if (CRYPTOPP_POWER8_AVAILABLE) #if (CRYPTOPP_POWER7_AVAILABLE)
void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds) void ChaCha_OperateKeystream_POWER7(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{ {
const uint32x4_p state0 = VectorLoad(state + 0*4); const uint32x4_p state0 = VectorLoad(state + 0*4);
const uint32x4_p state1 = VectorLoad(state + 1*4); const uint32x4_p state1 = VectorLoad(state + 1*4);
@ -1099,6 +1120,6 @@ void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte
VectorStore32LE(output + 15*16, r3_3); VectorStore32LE(output + 15*16, r3_3);
} }
#endif // CRYPTOPP_POWER8_AVAILABLE #endif // CRYPTOPP_POWER7_AVAILABLE
NAMESPACE_END NAMESPACE_END