diff --git a/GNUmakefile b/GNUmakefile index a451be95..5f5d58d2 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -432,6 +432,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00) ifneq ($(HAVE_POWER8),0) POWER8_FLAG = -mcpu=power8 -maltivec AES_FLAG = $(POWER8_FLAG) + CHACHA_FLAG = $(POWER8_FLAG) GCM_FLAG = $(POWER8_FLAG) SHA_FLAG = $(POWER8_FLAG) SM4_FLAG = $(POWER8_FLAG) @@ -465,6 +466,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00) ifneq ($(HAVE_POWER8),0) POWER8_FLAG = -qarch=pwr8 -qaltivec AES_FLAG = $(POWER8_FLAG) + CHACHA_FLAG = $(POWER8_FLAG) GCM_FLAG = $(POWER8_FLAG) SHA_FLAG = $(POWER8_FLAG) SM4_FLAG = $(POWER8_FLAG) @@ -501,6 +503,7 @@ ifneq ($(IS_PPC32)$(IS_PPC64),00) POWER7_FLAG = $(POWER8_FLAG) ARIA_FLAG = $(POWER8_FLAG) BLAKE2_FLAG = $(POWER8_FLAG) + CHACHA_FLAG = $(POWER8_FLAG) CHAM_FLAG = $(POWER8_FLAG) LEA_FLAG = $(POWER8_FLAG) SIMECK_FLAG = $(POWER8_FLAG) diff --git a/chacha-simd.cpp b/chacha-simd.cpp index dbaf1689..4b2adb4d 100644 --- a/chacha-simd.cpp +++ b/chacha-simd.cpp @@ -23,8 +23,6 @@ // Here are some relative numbers for ChaCha8: // * Intel Skylake, 3.0 GHz: SSE2 at 2160 MB/s; SSSE3 at 2310 MB/s. // * AMD Bulldozer, 3.3 GHz: SSE2 at 1680 MB/s; XOP at 2510 MB/s. -// -// Power8 is upcoming. #include "pch.h" #include "config.h" @@ -56,6 +54,10 @@ # include #endif +#if defined(CRYPTOPP_POWER8_AVAILABLE) +# include "ppc-simd.h" +#endif + // Squash MS LNK4221 and libtool warnings extern const char CHACHA_SIMD_FNAME[] = __FILE__; @@ -199,6 +201,100 @@ inline __m128i RotateLeft<16>(const __m128i val) #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE +// **************************** POWER8 **************************** // + +#if (CRYPTOPP_POWER8_AVAILABLE) + +using CryptoPP::uint8x16_p; +using CryptoPP::uint32x4_p; +using CryptoPP::uint64x2_p; +using CryptoPP::VectorLoad; +using CryptoPP::VectorStore; + +// Permutes bytes in 32-bit words to little endian. +// State is already in proper endian order. Input and +// output must be permuted during load and save. +inline uint32x4_p VectorLoad32LE(const uint8_t src[16]) +{ +#if (CRYPTOPP_BIG_ENDIAN) + const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; + const uint32x4_p val = VectorLoad(src); + return vec_perm(val, val, mask); +#else + return VectorLoad(src); +#endif +} + +// Permutes bytes in 32-bit words to little endian. +// State is already in proper endian order. Input and +// output must be permuted during load and save. +inline void VectorStore32LE(uint8_t dest[16], const uint32x4_p& val) +{ +#if (CRYPTOPP_BIG_ENDIAN) + const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; + VectorStore(dest, vec_perm(val, val, mask)); +#else + return VectorStore(dest, val); +#endif +} + +// Rotate left by bit count +template +inline uint32x4_p RotateLeft(const uint32x4_p val) +{ + const uint32x4_p m = {C, C, C, C}; + return vec_rl(val, m); +} + +// Rotate right by bit count +template +inline uint32x4_p RotateRight(const uint32x4_p val) +{ + const uint32x4_p m = {32-C, 32-C, 32-C, 32-C}; + return vec_rl(val, m); +} + +// ChaCha's use of x86 shuffle is really a 4, 8, or 12 byte +// rotation on the 128-bit vector word: +// * [3,2,1,0] => [0,3,2,1] is Shuffle<1>(x) +// * [3,2,1,0] => [1,0,3,2] is Shuffle<2>(x) +// * [3,2,1,0] => [2,1,0,3] is Shuffle<3>(x) +template +inline uint32x4_p Shuffle(const uint32x4_p& val) +{ + switch (S%4) + { + case 1: + { + const uint8x16_p mask = {4,5,6,7, 8,9,10,11, 12,13,14,15, 0,1,2,3}; + return vec_perm(val, val, mask); + } + case 2: + { + const uint8x16_p mask = {8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7}; + return vec_perm(val, val, mask); + } + case 3: + { + const uint8x16_p mask = {12,13,14,15, 0,1,2,3, 4,5,6,7, 8,9,10,11}; + return vec_perm(val, val, mask); + } + default: + { + CRYPTOPP_ASSERT(0); + return val; + } + } +} + +// Helper to perform 64-bit addition across two elements of 32-bit vectors +inline uint32x4_p Add64(const uint32x4_p& a, const uint32x4_p& b) +{ + return (uint32x4_p)vec_add((uint64x2_p)a, (uint64x2_p)b); +} + +#endif // CRYPTOPP_POWER8_AVAILABLE + ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) @@ -739,4 +835,270 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte * #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE +#if (CRYPTOPP_POWER8_AVAILABLE) + +void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds) +{ + const uint32x4_p state0 = VectorLoad(state + 0*4); + const uint32x4_p state1 = VectorLoad(state + 1*4); + const uint32x4_p state2 = VectorLoad(state + 2*4); + const uint32x4_p state3 = VectorLoad(state + 3*4); + + const uint32x4_p CTRS[3] = { + {1,0,0,0}, {2,0,0,0}, {3,0,0,0} + }; + + uint32x4_p r0_0 = state0; + uint32x4_p r0_1 = state1; + uint32x4_p r0_2 = state2; + uint32x4_p r0_3 = state3; + + uint32x4_p r1_0 = state0; + uint32x4_p r1_1 = state1; + uint32x4_p r1_2 = state2; + uint32x4_p r1_3 = Add64(r0_3, CTRS[0]); + + uint32x4_p r2_0 = state0; + uint32x4_p r2_1 = state1; + uint32x4_p r2_2 = state2; + uint32x4_p r2_3 = Add64(r0_3, CTRS[1]); + + uint32x4_p r3_0 = state0; + uint32x4_p r3_1 = state1; + uint32x4_p r3_2 = state2; + uint32x4_p r3_3 = Add64(r0_3, CTRS[2]); + + for (int i = static_cast(rounds); i > 0; i -= 2) + { + r0_0 = VectorAdd(r0_0, r0_1); + r1_0 = VectorAdd(r1_0, r1_1); + r2_0 = VectorAdd(r2_0, r2_1); + r3_0 = VectorAdd(r3_0, r3_1); + + r0_3 = VectorXor(r0_3, r0_0); + r1_3 = VectorXor(r1_3, r1_0); + r2_3 = VectorXor(r2_3, r2_0); + r3_3 = VectorXor(r3_3, r3_0); + + r0_3 = RotateLeft<16>(r0_3); + r1_3 = RotateLeft<16>(r1_3); + r2_3 = RotateLeft<16>(r2_3); + r3_3 = RotateLeft<16>(r3_3); + + r0_2 = VectorAdd(r0_2, r0_3); + r1_2 = VectorAdd(r1_2, r1_3); + r2_2 = VectorAdd(r2_2, r2_3); + r3_2 = VectorAdd(r3_2, r3_3); + + r0_1 = VectorXor(r0_1, r0_2); + r1_1 = VectorXor(r1_1, r1_2); + r2_1 = VectorXor(r2_1, r2_2); + r3_1 = VectorXor(r3_1, r3_2); + + r0_1 = RotateLeft<12>(r0_1); + r1_1 = RotateLeft<12>(r1_1); + r2_1 = RotateLeft<12>(r2_1); + r3_1 = RotateLeft<12>(r3_1); + + r0_0 = VectorAdd(r0_0, r0_1); + r1_0 = VectorAdd(r1_0, r1_1); + r2_0 = VectorAdd(r2_0, r2_1); + r3_0 = VectorAdd(r3_0, r3_1); + + r0_3 = VectorXor(r0_3, r0_0); + r1_3 = VectorXor(r1_3, r1_0); + r2_3 = VectorXor(r2_3, r2_0); + r3_3 = VectorXor(r3_3, r3_0); + + r0_3 = RotateLeft<8>(r0_3); + r1_3 = RotateLeft<8>(r1_3); + r2_3 = RotateLeft<8>(r2_3); + r3_3 = RotateLeft<8>(r3_3); + + r0_2 = VectorAdd(r0_2, r0_3); + r1_2 = VectorAdd(r1_2, r1_3); + r2_2 = VectorAdd(r2_2, r2_3); + r3_2 = VectorAdd(r3_2, r3_3); + + r0_1 = VectorXor(r0_1, r0_2); + r1_1 = VectorXor(r1_1, r1_2); + r2_1 = VectorXor(r2_1, r2_2); + r3_1 = VectorXor(r3_1, r3_2); + + r0_1 = RotateLeft<7>(r0_1); + r1_1 = RotateLeft<7>(r1_1); + r2_1 = RotateLeft<7>(r2_1); + r3_1 = RotateLeft<7>(r3_1); + + r0_1 = Shuffle<1>(r0_1); + r0_2 = Shuffle<2>(r0_2); + r0_3 = Shuffle<3>(r0_3); + + r1_1 = Shuffle<1>(r1_1); + r1_2 = Shuffle<2>(r1_2); + r1_3 = Shuffle<3>(r1_3); + + r2_1 = Shuffle<1>(r2_1); + r2_2 = Shuffle<2>(r2_2); + r2_3 = Shuffle<3>(r2_3); + + r3_1 = Shuffle<1>(r3_1); + r3_2 = Shuffle<2>(r3_2); + r3_3 = Shuffle<3>(r3_3); + + r0_0 = VectorAdd(r0_0, r0_1); + r1_0 = VectorAdd(r1_0, r1_1); + r2_0 = VectorAdd(r2_0, r2_1); + r3_0 = VectorAdd(r3_0, r3_1); + + r0_3 = VectorXor(r0_3, r0_0); + r1_3 = VectorXor(r1_3, r1_0); + r2_3 = VectorXor(r2_3, r2_0); + r3_3 = VectorXor(r3_3, r3_0); + + r0_3 = RotateLeft<16>(r0_3); + r1_3 = RotateLeft<16>(r1_3); + r2_3 = RotateLeft<16>(r2_3); + r3_3 = RotateLeft<16>(r3_3); + + r0_2 = VectorAdd(r0_2, r0_3); + r1_2 = VectorAdd(r1_2, r1_3); + r2_2 = VectorAdd(r2_2, r2_3); + r3_2 = VectorAdd(r3_2, r3_3); + + r0_1 = VectorXor(r0_1, r0_2); + r1_1 = VectorXor(r1_1, r1_2); + r2_1 = VectorXor(r2_1, r2_2); + r3_1 = VectorXor(r3_1, r3_2); + + r0_1 = RotateLeft<12>(r0_1); + r1_1 = RotateLeft<12>(r1_1); + r2_1 = RotateLeft<12>(r2_1); + r3_1 = RotateLeft<12>(r3_1); + + r0_0 = VectorAdd(r0_0, r0_1); + r1_0 = VectorAdd(r1_0, r1_1); + r2_0 = VectorAdd(r2_0, r2_1); + r3_0 = VectorAdd(r3_0, r3_1); + + r0_3 = VectorXor(r0_3, r0_0); + r1_3 = VectorXor(r1_3, r1_0); + r2_3 = VectorXor(r2_3, r2_0); + r3_3 = VectorXor(r3_3, r3_0); + + r0_3 = RotateLeft<8>(r0_3); + r1_3 = RotateLeft<8>(r1_3); + r2_3 = RotateLeft<8>(r2_3); + r3_3 = RotateLeft<8>(r3_3); + + r0_2 = VectorAdd(r0_2, r0_3); + r1_2 = VectorAdd(r1_2, r1_3); + r2_2 = VectorAdd(r2_2, r2_3); + r3_2 = VectorAdd(r3_2, r3_3); + + r0_1 = VectorXor(r0_1, r0_2); + r1_1 = VectorXor(r1_1, r1_2); + r2_1 = VectorXor(r2_1, r2_2); + r3_1 = VectorXor(r3_1, r3_2); + + r0_1 = RotateLeft<7>(r0_1); + r1_1 = RotateLeft<7>(r1_1); + r2_1 = RotateLeft<7>(r2_1); + r3_1 = RotateLeft<7>(r3_1); + + r0_1 = Shuffle<3>(r0_1); + r0_2 = Shuffle<2>(r0_2); + r0_3 = Shuffle<1>(r0_3); + + r1_1 = Shuffle<3>(r1_1); + r1_2 = Shuffle<2>(r1_2); + r1_3 = Shuffle<1>(r1_3); + + r2_1 = Shuffle<3>(r2_1); + r2_2 = Shuffle<2>(r2_2); + r2_3 = Shuffle<1>(r2_3); + + r3_1 = Shuffle<3>(r3_1); + r3_2 = Shuffle<2>(r3_2); + r3_3 = Shuffle<1>(r3_3); + } + + r0_0 = VectorAdd(r0_0, state0); + r0_1 = VectorAdd(r0_1, state1); + r0_2 = VectorAdd(r0_2, state2); + r0_3 = VectorAdd(r0_3, state3); + + r1_0 = VectorAdd(r1_0, state0); + r1_1 = VectorAdd(r1_1, state1); + r1_2 = VectorAdd(r1_2, state2); + r1_3 = VectorAdd(r1_3, state3); + r1_3 = Add64(r1_3, CTRS[0]); + + r2_0 = VectorAdd(r2_0, state0); + r2_1 = VectorAdd(r2_1, state1); + r2_2 = VectorAdd(r2_2, state2); + r2_3 = VectorAdd(r2_3, state3); + r2_3 = Add64(r2_3, CTRS[1]); + + r3_0 = VectorAdd(r3_0, state0); + r3_1 = VectorAdd(r3_1, state1); + r3_2 = VectorAdd(r3_2, state2); + r3_3 = VectorAdd(r3_3, state3); + r3_3 = Add64(r3_3, CTRS[2]); + + if (input) + { + r0_0 = VectorXor(VectorLoad32LE(input + 0*16), r0_0); + r0_1 = VectorXor(VectorLoad32LE(input + 1*16), r0_1); + r0_2 = VectorXor(VectorLoad32LE(input + 2*16), r0_2); + r0_3 = VectorXor(VectorLoad32LE(input + 3*16), r0_3); + } + + VectorStore32LE(output + 0*16, r0_0); + VectorStore32LE(output + 1*16, r0_1); + VectorStore32LE(output + 2*16, r0_2); + VectorStore32LE(output + 3*16, r0_3); + + if (input) + { + r1_0 = VectorXor(VectorLoad32LE(input + 4*16), r1_0); + r1_1 = VectorXor(VectorLoad32LE(input + 5*16), r1_1); + r1_2 = VectorXor(VectorLoad32LE(input + 6*16), r1_2); + r1_3 = VectorXor(VectorLoad32LE(input + 7*16), r1_3); + } + + VectorStore32LE(output + 4*16, r1_0); + VectorStore32LE(output + 5*16, r1_1); + VectorStore32LE(output + 6*16, r1_2); + VectorStore32LE(output + 7*16, r1_3); + + if (input) + { + r2_0 = VectorXor(VectorLoad32LE(input + 8*16), r2_0); + r2_1 = VectorXor(VectorLoad32LE(input + 9*16), r2_1); + r2_2 = VectorXor(VectorLoad32LE(input + 10*16), r2_2); + r2_3 = VectorXor(VectorLoad32LE(input + 11*16), r2_3); + } + + VectorStore32LE(output + 8*16, r2_0); + VectorStore32LE(output + 9*16, r2_1); + VectorStore32LE(output + 10*16, r2_2); + VectorStore32LE(output + 11*16, r2_3); + + if (input) + { + r3_0 = VectorXor(VectorLoad32LE(input + 12*16), r3_0); + r3_1 = VectorXor(VectorLoad32LE(input + 13*16), r3_1); + r3_2 = VectorXor(VectorLoad32LE(input + 14*16), r3_2); + r3_3 = VectorXor(VectorLoad32LE(input + 15*16), r3_3); + } + + VectorStore32LE(output + 12*16, r3_0); + VectorStore32LE(output + 13*16, r3_1); + VectorStore32LE(output + 14*16, r3_2); + VectorStore32LE(output + 15*16, r3_3); +} + +#endif // CRYPTOPP_POWER8_AVAILABLE + NAMESPACE_END diff --git a/chacha.cpp b/chacha.cpp index 63b24845..1de1e89e 100644 --- a/chacha.cpp +++ b/chacha.cpp @@ -19,6 +19,10 @@ extern void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif +#if (CRYPTOPP_POWER8_AVAILABLE) +extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds); +#endif + #define CHACHA_QUARTER_ROUND(a,b,c,d) \ a += b; d ^= a; d = rotlConstant<16,word32>(d); \ c += d; b ^= c; b = rotlConstant<12,word32>(b); \ @@ -46,6 +50,10 @@ std::string ChaCha_Policy::AlgorithmProvider() const #if (CRYPTOPP_ARM_NEON_AVAILABLE) if (HasNEON()) return "NEON"; +#endif +#if (CRYPTOPP_POWER8_AVAILABLE) + if (HasPower8()) + return "Power8"; #endif return "C++"; } @@ -109,6 +117,11 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const if (HasNEON()) return 4*BYTES_PER_ITERATION; else +#endif +#if (CRYPTOPP_POWER8_AVAILABLE) + if (HasPower8()) + return 4*BYTES_PER_ITERATION; + else #endif return BYTES_PER_ITERATION; } @@ -157,6 +170,25 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, } #endif +#if (CRYPTOPP_POWER8_AVAILABLE) + if (HasPower8()) + { + while (iterationCount >= 4) + { + const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; + ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds); + + m_state[12] += 4; + if (m_state[12] < 4) + m_state[13]++; + + input += (!!xorInput)*4*BYTES_PER_ITERATION; + output += 4*BYTES_PER_ITERATION; + iterationCount -= 4; + } + } +#endif + while (iterationCount--) { word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;