Add ChaCha Power8 implementation

pull/730/head
Jeffrey Walton 2018-10-27 08:40:07 -04:00
parent 04306f86ac
commit d7d76fa5f7
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
3 changed files with 399 additions and 2 deletions

View File

@ -432,6 +432,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
ifneq ($(HAVE_POWER8),0)
POWER8_FLAG = -mcpu=power8 -maltivec
AES_FLAG = $(POWER8_FLAG)
CHACHA_FLAG = $(POWER8_FLAG)
GCM_FLAG = $(POWER8_FLAG)
SHA_FLAG = $(POWER8_FLAG)
SM4_FLAG = $(POWER8_FLAG)
@ -465,6 +466,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
ifneq ($(HAVE_POWER8),0)
POWER8_FLAG = -qarch=pwr8 -qaltivec
AES_FLAG = $(POWER8_FLAG)
CHACHA_FLAG = $(POWER8_FLAG)
GCM_FLAG = $(POWER8_FLAG)
SHA_FLAG = $(POWER8_FLAG)
SM4_FLAG = $(POWER8_FLAG)
@ -501,6 +503,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00)
POWER7_FLAG = $(POWER8_FLAG)
ARIA_FLAG = $(POWER8_FLAG)
BLAKE2_FLAG = $(POWER8_FLAG)
CHACHA_FLAG = $(POWER8_FLAG)
CHAM_FLAG = $(POWER8_FLAG)
LEA_FLAG = $(POWER8_FLAG)
SIMECK_FLAG = $(POWER8_FLAG)

View File

@ -23,8 +23,6 @@
// Here are some relative numbers for ChaCha8:
// * Intel Skylake, 3.0 GHz: SSE2 at 2160 MB/s; SSSE3 at 2310 MB/s.
// * AMD Bulldozer, 3.3 GHz: SSE2 at 1680 MB/s; XOP at 2510 MB/s.
//
// Power8 is upcoming.
#include "pch.h"
#include "config.h"
@ -56,6 +54,10 @@
# include <arm_acle.h>
#endif
#if defined(CRYPTOPP_POWER8_AVAILABLE)
# include "ppc-simd.h"
#endif
// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_SIMD_FNAME[] = __FILE__;
@ -199,6 +201,100 @@ inline __m128i RotateLeft<16>(const __m128i val)
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
// **************************** POWER8 **************************** //
#if (CRYPTOPP_POWER8_AVAILABLE)
using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
using CryptoPP::uint64x2_p;
using CryptoPP::VectorLoad;
using CryptoPP::VectorStore;
// Permutes bytes in 32-bit words to little endian.
// State is already in proper endian order. Input and
// output must be permuted during load and save.
inline uint32x4_p VectorLoad32LE(const uint8_t src[16])
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
const uint32x4_p val = VectorLoad(src);
return vec_perm(val, val, mask);
#else
return VectorLoad(src);
#endif
}
// Permutes bytes in 32-bit words to little endian.
// State is already in proper endian order. Input and
// output must be permuted during load and save.
inline void VectorStore32LE(uint8_t dest[16], const uint32x4_p& val)
{
#if (CRYPTOPP_BIG_ENDIAN)
const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
VectorStore(dest, vec_perm(val, val, mask));
#else
return VectorStore(dest, val);
#endif
}
// Rotate left by bit count
template<unsigned int C>
inline uint32x4_p RotateLeft(const uint32x4_p val)
{
const uint32x4_p m = {C, C, C, C};
return vec_rl(val, m);
}
// Rotate right by bit count
template<unsigned int C>
inline uint32x4_p RotateRight(const uint32x4_p val)
{
const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
return vec_rl(val, m);
}
// ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte
// rotation on the 128-bit vector word:
// * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x)
// * [3,2,1,0] => [1,0,3,2] is Shuffle<2>(x)
// * [3,2,1,0] => [2,1,0,3] is Shuffle<3>(x)
template <unsigned int S>
inline uint32x4_p Shuffle(const uint32x4_p& val)
{
switch (S%4)
{
case 1:
{
const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3};
return vec_perm(val, val, mask);
}
case 2:
{
const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7};
return vec_perm(val, val, mask);
}
case 3:
{
const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11};
return vec_perm(val, val, mask);
}
default:
{
CRYPTOPP_ASSERT(0);
return val;
}
}
}
// Helper to perform 64-bit addition across two elements of 32-bit vectors
inline uint32x4_p Add64(const uint32x4_p& a, const uint32x4_p& b)
{
return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b);
}
#endif // CRYPTOPP_POWER8_AVAILABLE
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
@ -739,4 +835,270 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *
#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
#if (CRYPTOPP_POWER8_AVAILABLE)
void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
const uint32x4_p state0 = VectorLoad(state + 0*4);
const uint32x4_p state1 = VectorLoad(state + 1*4);
const uint32x4_p state2 = VectorLoad(state + 2*4);
const uint32x4_p state3 = VectorLoad(state + 3*4);
const uint32x4_p CTRS[3] = {
{1,0,0,0}, {2,0,0,0}, {3,0,0,0}
};
uint32x4_p r0_0 = state0;
uint32x4_p r0_1 = state1;
uint32x4_p r0_2 = state2;
uint32x4_p r0_3 = state3;
uint32x4_p r1_0 = state0;
uint32x4_p r1_1 = state1;
uint32x4_p r1_2 = state2;
uint32x4_p r1_3 = Add64(r0_3, CTRS[0]);
uint32x4_p r2_0 = state0;
uint32x4_p r2_1 = state1;
uint32x4_p r2_2 = state2;
uint32x4_p r2_3 = Add64(r0_3, CTRS[1]);
uint32x4_p r3_0 = state0;
uint32x4_p r3_1 = state1;
uint32x4_p r3_2 = state2;
uint32x4_p r3_3 = Add64(r0_3, CTRS[2]);
for (int i = static_cast<int>(rounds); i > 0; i -= 2)
{
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
r2_0 = VectorAdd(r2_0, r2_1);
r3_0 = VectorAdd(r3_0, r3_1);
r0_3 = VectorXor(r0_3, r0_0);
r1_3 = VectorXor(r1_3, r1_0);
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = RotateLeft<16>(r0_3);
r1_3 = RotateLeft<16>(r1_3);
r2_3 = RotateLeft<16>(r2_3);
r3_3 = RotateLeft<16>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
r2_2 = VectorAdd(r2_2, r2_3);
r3_2 = VectorAdd(r3_2, r3_3);
r0_1 = VectorXor(r0_1, r0_2);
r1_1 = VectorXor(r1_1, r1_2);
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = RotateLeft<12>(r0_1);
r1_1 = RotateLeft<12>(r1_1);
r2_1 = RotateLeft<12>(r2_1);
r3_1 = RotateLeft<12>(r3_1);
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
r2_0 = VectorAdd(r2_0, r2_1);
r3_0 = VectorAdd(r3_0, r3_1);
r0_3 = VectorXor(r0_3, r0_0);
r1_3 = VectorXor(r1_3, r1_0);
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = RotateLeft<8>(r0_3);
r1_3 = RotateLeft<8>(r1_3);
r2_3 = RotateLeft<8>(r2_3);
r3_3 = RotateLeft<8>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
r2_2 = VectorAdd(r2_2, r2_3);
r3_2 = VectorAdd(r3_2, r3_3);
r0_1 = VectorXor(r0_1, r0_2);
r1_1 = VectorXor(r1_1, r1_2);
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = RotateLeft<7>(r0_1);
r1_1 = RotateLeft<7>(r1_1);
r2_1 = RotateLeft<7>(r2_1);
r3_1 = RotateLeft<7>(r3_1);
r0_1 = Shuffle<1>(r0_1);
r0_2 = Shuffle<2>(r0_2);
r0_3 = Shuffle<3>(r0_3);
r1_1 = Shuffle<1>(r1_1);
r1_2 = Shuffle<2>(r1_2);
r1_3 = Shuffle<3>(r1_3);
r2_1 = Shuffle<1>(r2_1);
r2_2 = Shuffle<2>(r2_2);
r2_3 = Shuffle<3>(r2_3);
r3_1 = Shuffle<1>(r3_1);
r3_2 = Shuffle<2>(r3_2);
r3_3 = Shuffle<3>(r3_3);
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
r2_0 = VectorAdd(r2_0, r2_1);
r3_0 = VectorAdd(r3_0, r3_1);
r0_3 = VectorXor(r0_3, r0_0);
r1_3 = VectorXor(r1_3, r1_0);
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = RotateLeft<16>(r0_3);
r1_3 = RotateLeft<16>(r1_3);
r2_3 = RotateLeft<16>(r2_3);
r3_3 = RotateLeft<16>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
r2_2 = VectorAdd(r2_2, r2_3);
r3_2 = VectorAdd(r3_2, r3_3);
r0_1 = VectorXor(r0_1, r0_2);
r1_1 = VectorXor(r1_1, r1_2);
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = RotateLeft<12>(r0_1);
r1_1 = RotateLeft<12>(r1_1);
r2_1 = RotateLeft<12>(r2_1);
r3_1 = RotateLeft<12>(r3_1);
r0_0 = VectorAdd(r0_0, r0_1);
r1_0 = VectorAdd(r1_0, r1_1);
r2_0 = VectorAdd(r2_0, r2_1);
r3_0 = VectorAdd(r3_0, r3_1);
r0_3 = VectorXor(r0_3, r0_0);
r1_3 = VectorXor(r1_3, r1_0);
r2_3 = VectorXor(r2_3, r2_0);
r3_3 = VectorXor(r3_3, r3_0);
r0_3 = RotateLeft<8>(r0_3);
r1_3 = RotateLeft<8>(r1_3);
r2_3 = RotateLeft<8>(r2_3);
r3_3 = RotateLeft<8>(r3_3);
r0_2 = VectorAdd(r0_2, r0_3);
r1_2 = VectorAdd(r1_2, r1_3);
r2_2 = VectorAdd(r2_2, r2_3);
r3_2 = VectorAdd(r3_2, r3_3);
r0_1 = VectorXor(r0_1, r0_2);
r1_1 = VectorXor(r1_1, r1_2);
r2_1 = VectorXor(r2_1, r2_2);
r3_1 = VectorXor(r3_1, r3_2);
r0_1 = RotateLeft<7>(r0_1);
r1_1 = RotateLeft<7>(r1_1);
r2_1 = RotateLeft<7>(r2_1);
r3_1 = RotateLeft<7>(r3_1);
r0_1 = Shuffle<3>(r0_1);
r0_2 = Shuffle<2>(r0_2);
r0_3 = Shuffle<1>(r0_3);
r1_1 = Shuffle<3>(r1_1);
r1_2 = Shuffle<2>(r1_2);
r1_3 = Shuffle<1>(r1_3);
r2_1 = Shuffle<3>(r2_1);
r2_2 = Shuffle<2>(r2_2);
r2_3 = Shuffle<1>(r2_3);
r3_1 = Shuffle<3>(r3_1);
r3_2 = Shuffle<2>(r3_2);
r3_3 = Shuffle<1>(r3_3);
}
r0_0 = VectorAdd(r0_0, state0);
r0_1 = VectorAdd(r0_1, state1);
r0_2 = VectorAdd(r0_2, state2);
r0_3 = VectorAdd(r0_3, state3);
r1_0 = VectorAdd(r1_0, state0);
r1_1 = VectorAdd(r1_1, state1);
r1_2 = VectorAdd(r1_2, state2);
r1_3 = VectorAdd(r1_3, state3);
r1_3 = Add64(r1_3, CTRS[0]);
r2_0 = VectorAdd(r2_0, state0);
r2_1 = VectorAdd(r2_1, state1);
r2_2 = VectorAdd(r2_2, state2);
r2_3 = VectorAdd(r2_3, state3);
r2_3 = Add64(r2_3, CTRS[1]);
r3_0 = VectorAdd(r3_0, state0);
r3_1 = VectorAdd(r3_1, state1);
r3_2 = VectorAdd(r3_2, state2);
r3_3 = VectorAdd(r3_3, state3);
r3_3 = Add64(r3_3, CTRS[2]);
if (input)
{
r0_0 = VectorXor(VectorLoad32LE(input + 0*16), r0_0);
r0_1 = VectorXor(VectorLoad32LE(input + 1*16), r0_1);
r0_2 = VectorXor(VectorLoad32LE(input + 2*16), r0_2);
r0_3 = VectorXor(VectorLoad32LE(input + 3*16), r0_3);
}
VectorStore32LE(output + 0*16, r0_0);
VectorStore32LE(output + 1*16, r0_1);
VectorStore32LE(output + 2*16, r0_2);
VectorStore32LE(output + 3*16, r0_3);
if (input)
{
r1_0 = VectorXor(VectorLoad32LE(input + 4*16), r1_0);
r1_1 = VectorXor(VectorLoad32LE(input + 5*16), r1_1);
r1_2 = VectorXor(VectorLoad32LE(input + 6*16), r1_2);
r1_3 = VectorXor(VectorLoad32LE(input + 7*16), r1_3);
}
VectorStore32LE(output + 4*16, r1_0);
VectorStore32LE(output + 5*16, r1_1);
VectorStore32LE(output + 6*16, r1_2);
VectorStore32LE(output + 7*16, r1_3);
if (input)
{
r2_0 = VectorXor(VectorLoad32LE(input + 8*16), r2_0);
r2_1 = VectorXor(VectorLoad32LE(input + 9*16), r2_1);
r2_2 = VectorXor(VectorLoad32LE(input + 10*16), r2_2);
r2_3 = VectorXor(VectorLoad32LE(input + 11*16), r2_3);
}
VectorStore32LE(output + 8*16, r2_0);
VectorStore32LE(output + 9*16, r2_1);
VectorStore32LE(output + 10*16, r2_2);
VectorStore32LE(output + 11*16, r2_3);
if (input)
{
r3_0 = VectorXor(VectorLoad32LE(input + 12*16), r3_0);
r3_1 = VectorXor(VectorLoad32LE(input + 13*16), r3_1);
r3_2 = VectorXor(VectorLoad32LE(input + 14*16), r3_2);
r3_3 = VectorXor(VectorLoad32LE(input + 15*16), r3_3);
}
VectorStore32LE(output + 12*16, r3_0);
VectorStore32LE(output + 13*16, r3_1);
VectorStore32LE(output + 14*16, r3_2);
VectorStore32LE(output + 15*16, r3_3);
}
#endif // CRYPTOPP_POWER8_AVAILABLE
NAMESPACE_END

View File

@ -19,6 +19,10 @@ extern void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input,
extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
#define CHACHA_QUARTER_ROUND(a,b,c,d) \
a += b; d ^= a; d = rotlConstant<16,word32>(d); \
c += d; b ^= c; b = rotlConstant<12,word32>(b); \
@ -46,6 +50,10 @@ std::string ChaCha_Policy::AlgorithmProvider() const
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return "NEON";
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return "Power8";
#endif
return "C++";
}
@ -109,6 +117,11 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
if (HasNEON())
return 4*BYTES_PER_ITERATION;
else
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return 4*BYTES_PER_ITERATION;
else
#endif
return BYTES_PER_ITERATION;
}
@ -157,6 +170,25 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
}
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
{
while (iterationCount >= 4)
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
m_state[12] += 4;
if (m_state[12] < 4)
m_state[13]++;
input += (!!xorInput)*4*BYTES_PER_ITERATION;
output += 4*BYTES_PER_ITERATION;
iterationCount -= 4;
}
}
#endif
while (iterationCount--)
{
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;