diff --git a/config.h b/config.h
index 3fa1c01d..95aeece3 100644
--- a/config.h
+++ b/config.h
@@ -685,7 +685,7 @@ NAMESPACE_END
 #if !defined(CRYPTOPP_POWER8_AES_AVAILABLE) && !defined(CRYPTOPP_DISABLE_POWER8_AES) && defined(CRYPTOPP_POWER8_AVAILABLE)
 # if defined(__CRYPTO__) || defined(_ARCH_PWR8) || (CRYPTOPP_XLC_VERSION >= 130000) || (CRYPTOPP_GCC_VERSION >= 40800)
 #  define CRYPTOPP_POWER8_AES_AVAILABLE 1
-//#  define CRYPTOPP_POWER8_SHA_AVAILABLE 1
+#  define CRYPTOPP_POWER8_SHA_AVAILABLE 1
 //#  define CRYPTOPP_POWER8_CRC_AVAILABLE 1
 # endif
 #endif
diff --git a/sha-simd.cpp b/sha-simd.cpp
index e3a45d17..454916ed 100644
--- a/sha-simd.cpp
+++ b/sha-simd.cpp
@@ -65,7 +65,7 @@ extern "C" {
 bool CPU_ProbeSHA1()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
-    return false;
+    return false;
 #elif (CRYPTOPP_ARM_SHA_AVAILABLE)
 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
     volatile bool result = true;
@@ -127,7 +127,7 @@ bool CPU_ProbeSHA1()
 bool CPU_ProbeSHA2()
 {
 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
-    return false;
+    return false;
 #elif (CRYPTOPP_ARM_SHA_AVAILABLE)
 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
     volatile bool result = true;
@@ -189,10 +189,11 @@ bool CPU_ProbeSHA2()
 // provided by sha.cpp
 extern const word32 SHA256_K[64];
+extern const word64 SHA512_K[80];
 
-///////////////////////////////////
-// start of Walton/Gulley's code //
-///////////////////////////////////
+/////////////////////////////////////
+// start of Walton and Gulley code //
+/////////////////////////////////////
 
 #if CRYPTOPP_SHANI_AVAILABLE
 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
@@ -598,15 +599,15 @@ void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t l
 }
 #endif // CRYPTOPP_SHANI_AVAILABLE
 
-/////////////////////////////////
-// end of Walton/Gulley's code //
-/////////////////////////////////
+///////////////////////////////////
+// end of Walton and Gulley code //
+///////////////////////////////////
 
 // ***************** ARMV8 SHA ********************
 
-/////////////////////////////////////////////////////////
-// start of Walton/Schneiders/O'Rourke/Hovsmith's code //
-/////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////
+// start of Walton, Schneiders, O'Rourke and Hovsmith code //
+/////////////////////////////////////////////////////////////
 
 #if CRYPTOPP_ARM_SHA_AVAILABLE
 void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
@@ -965,24 +966,499 @@ void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t l
 }
 #endif // CRYPTOPP_ARM_SHA_AVAILABLE
 
-///////////////////////////////////////////////////////
-// end of Walton/Schneiders/O'Rourke/Hovsmith's code //
-///////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////
+// end of Walton, Schneiders, O'Rourke and Hovsmith code //
+///////////////////////////////////////////////////////////
 
 // ***************** Power8 SHA ********************
 
-////////////////////////////////////////////////
-// Begin Gustavo Serra Scalet and Walton code //
-////////////////////////////////////////////////
+//////////////////////////////////////////////////
+// start Gustavo, Serra, Scalet and Walton code //
+//////////////////////////////////////////////////
 
 #if CRYPTOPP_POWER8_SHA_AVAILABLE
+
+// Indexes into the S[] array
+enum {A=0, B=1, C, D, E, F, G, H};
+
+typedef __vector unsigned char uint8x16_p8;
+typedef __vector unsigned int uint32x4_p8;
+typedef __vector unsigned long long uint64x2_p8;
+
+#endif
+
+#if CRYPTOPP_POWER8_SHA_AVAILABLE
+
+// Aligned load
+template <class T> static inline
+uint32x4_p8 VectorLoad32x4(const T* data, int offset)
+{
+    return (uint32x4_p8)vec_ld(offset, (uint8_t*)data);
+}
+
+// Unaligned load
+template <class T> static inline
+uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (uint32x4_p8)vec_xl(offset, (uint8_t*)data);
+#else
+    return (uint32x4_p8)vec_vsx_ld(offset, (uint8_t*)data);
+#endif
+}
+
+// Unaligned load, big-endian
+template <class T> static inline
+uint32x4_p8 VectorLoad32x4ube(const T* data, int offset)
+{
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+    const uint8x16_p8 mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
+    const uint32x4_p8 r = VectorLoad32x4u(data, offset);
+    return (uint32x4_p8)vec_perm(r, r, mask);
+#else
+    return VectorLoad32x4u(data, offset);
+#endif
+}
+
+// Aligned store
+template <class T> static inline
+void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
+{
+    vec_st((uint8x16_p8)val, offset, (uint8_t*)data);
+}
+
+// Unaligned store
+template <class T> static inline
+void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
+#else
+    vec_vsx_st((uint8x16_p8)val, offset, (uint8_t*)data);
+#endif
+}
+
+static inline
+uint32x4_p8 VectorCh(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z)
+{
+    // The trick below is due to Andy Polyakov and Jack Lloyd
+    return vec_sel(z,y,x);
+}
+
+static inline
+uint32x4_p8 VectorMaj(const uint32x4_p8 x, const uint32x4_p8 y, const uint32x4_p8 z)
+{
+    // The trick below is due to Andy Polyakov and Jack Lloyd
+    const uint32x4_p8 xy = vec_xor(x, y);
+    return vec_sel(y, z, xy);
+}
+
+static inline
+uint32x4_p8 Vector_sigma0(const uint32x4_p8 val)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return __vshasigmaw(val, 0, 0);
+#else
+    return __builtin_crypto_vshasigmaw(val, 0, 0);
+#endif
+}
+
+static inline
+uint32x4_p8 Vector_sigma1(const uint32x4_p8 val)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return __vshasigmaw(val, 0, 0xf);
+#else
+    return __builtin_crypto_vshasigmaw(val, 0, 0xf);
+#endif
+}
+
+static inline
+uint32x4_p8 VectorSigma0(const uint32x4_p8 val)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return __vshasigmaw(val, 1, 0);
+#else
+    return __builtin_crypto_vshasigmaw(val, 1, 0);
+#endif
+}
+
+static inline
+uint32x4_p8 VectorSigma1(const uint32x4_p8 val)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return __vshasigmaw(val, 1, 0xf);
+#else
+    return __builtin_crypto_vshasigmaw(val, 1, 0xf);
+#endif
+}
+
+static inline
+uint32x4_p8 VectorPack(const uint32x4_p8 a, const uint32x4_p8 b,
+                       const uint32x4_p8 c, const uint32x4_p8 d)
+{
+    const uint8x16_p8 m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
+    const uint8x16_p8 m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 0,0,0,0};
+    const uint8x16_p8 m3 = {0,1,2,3, 4,5,6,7, 8,9,10,11, 16,17,18,19};
+
+    return vec_perm(vec_perm(vec_perm(a,b,m1),c,m2),d,m3);
+}
+
+template <unsigned int L> static inline
+uint32x4_p8 VectorShiftLeft(const uint32x4_p8 val)
+{
+#if (defined(CRYPTOPP_LITTLE_ENDIAN))
+    return (uint32x4_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, (16-L)&0xf);
+#else
+    return (uint32x4_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, L&0xf);
+#endif
+}
+
+template <>
+uint32x4_p8 VectorShiftLeft<0>(const uint32x4_p8 val) { return val; }
+
+template <>
+uint32x4_p8 VectorShiftLeft<16>(const uint32x4_p8 val) { return val; }
+
+template <unsigned int R> static inline
+void SHA256_ROUND1(uint32x4_p8 W[16], uint32x4_p8 S[8], const uint32x4_p8 K, const uint32x4_p8 M)
+{
+    uint32x4_p8 T1, T2;
+
+    W[R] = M;
+    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
+    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
+
+    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
+    S[E] = S[D] + T1;
+    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
+    S[A] = T1 + T2;
+}
+
+template <unsigned int R> static inline
+void SHA256_ROUND2(uint32x4_p8 W[16], uint32x4_p8 S[8], const uint32x4_p8 K)
+{
+    // Indexes into the W[] array
+    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
+
+    const uint32x4_p8 s0 = Vector_sigma0(W[IDX1]);
+    const uint32x4_p8 s1 = Vector_sigma1(W[IDX14]);
+
+    uint32x4_p8 T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
+    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
+    uint32x4_p8 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
+
+    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
+    S[E] = S[D] + T1;
+    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
+    S[A] = T1 + T2;
+}
+
 void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
 {
     CRYPTOPP_ASSERT(state);
     CRYPTOPP_ASSERT(data);
     CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
 
-    CRYPTOPP_ASSERT(0);
+    const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
+    const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
+
+    uint32x4_p8 abcd = VectorLoad32x4u(state+0, 0);
+    uint32x4_p8 efgh = VectorLoad32x4u(state+4, 0);
+
+    uint32x4_p8 W[16], S[8], vm, vk;
+
+    while (length >= SHA256::BLOCKSIZE)
+    {
+        S[A] = abcd; S[E] = efgh;
+        S[B] = VectorShiftLeft<4>(S[A]);
+        S[F] = VectorShiftLeft<4>(S[E]);
+        S[C] = VectorShiftLeft<4>(S[B]);
+        S[G] = VectorShiftLeft<4>(S[F]);
+        S[D] = VectorShiftLeft<4>(S[C]);
+        S[H] = VectorShiftLeft<4>(S[G]);
+
+        k = reinterpret_cast<const uint32_t*>(SHA256_K);
+        unsigned int i, offset=0;
+
+        // Unroll the loop to get the constexpr of the round number
+        // for (unsigned int i=0; i<16; ++i, m+=4, offset+=16)
+        {
+            vk = VectorLoad32x4(k, offset);
+            vm = VectorLoad32x4ube(m, offset);
+            SHA256_ROUND1<0>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<1>(W,S, vk,vm);
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<2>(W,S, vk,vm);
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<3>(W,S, vk,vm);
+
+            vk = VectorLoad32x4(k, offset);
+            vm = VectorLoad32x4ube(m, offset);
+            SHA256_ROUND1<4>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<5>(W,S, vk,vm);
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<6>(W,S, vk,vm);
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<7>(W,S, vk,vm);
+
+            vk = VectorLoad32x4(k, offset);
+            vm = VectorLoad32x4ube(m, offset);
+            SHA256_ROUND1<8>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<9>(W,S, vk,vm);
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<10>(W,S, vk,vm);
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<11>(W,S, vk,vm);
+
+            vk = VectorLoad32x4(k, offset);
+            vm = VectorLoad32x4ube(m, offset);
+            SHA256_ROUND1<12>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<13>(W,S, vk,vm);
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<14>(W,S, vk,vm);
+
+            vk = VectorShiftLeft<4>(vk);
+            vm = VectorShiftLeft<4>(vm);
+            SHA256_ROUND1<15>(W,S, vk,vm);
+        }
+
+        m += 16; // 32-bit words, not bytes
+        length -= SHA256::BLOCKSIZE;
+
+        for (i=16; i<64; i+=16)
+        {
+            vk = VectorLoad32x4(k, offset);
+            SHA256_ROUND2<0>(W,S, vk);
+            SHA256_ROUND2<1>(W,S, VectorShiftLeft<4>(vk));
+            SHA256_ROUND2<2>(W,S, VectorShiftLeft<8>(vk));
+            SHA256_ROUND2<3>(W,S, VectorShiftLeft<12>(vk));
+            offset+=16;
+
+            vk = VectorLoad32x4(k, offset);
+            SHA256_ROUND2<4>(W,S, vk);
+            SHA256_ROUND2<5>(W,S, VectorShiftLeft<4>(vk));
+            SHA256_ROUND2<6>(W,S, VectorShiftLeft<8>(vk));
+            SHA256_ROUND2<7>(W,S, VectorShiftLeft<12>(vk));
+            offset+=16;
+
+            vk = VectorLoad32x4(k, offset);
+            SHA256_ROUND2<8>(W,S, vk);
+            SHA256_ROUND2<9>(W,S, VectorShiftLeft<4>(vk));
+            SHA256_ROUND2<10>(W,S, VectorShiftLeft<8>(vk));
+            SHA256_ROUND2<11>(W,S, VectorShiftLeft<12>(vk));
+            offset+=16;
+
+            vk = VectorLoad32x4(k, offset);
+            SHA256_ROUND2<12>(W,S, vk);
+            SHA256_ROUND2<13>(W,S, VectorShiftLeft<4>(vk));
+            SHA256_ROUND2<14>(W,S, VectorShiftLeft<8>(vk));
+            SHA256_ROUND2<15>(W,S, VectorShiftLeft<12>(vk));
+            offset+=16;
+        }
+
+        abcd += VectorPack(S[A],S[B],S[C],S[D]);
+        efgh += VectorPack(S[E],S[F],S[G],S[H]);
+    }
+
+    VectorStore32x4u(abcd, state, 0);
+    VectorStore32x4u(efgh, state, 16);
+}
+
+static inline
+uint64x2_p8 VectorPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask)
+{
+    return (uint64x2_p8)vec_perm(val, val, mask);
+}
+
+// Aligned load
+template <class T> static inline
+uint64x2_p8 VectorLoad64x2(const T* data, int offset)
+{
+    return (uint64x2_p8)vec_ld(offset, (uint8_t*)data);
+}
+
+// Unaligned load
+template <class T> static inline
+uint64x2_p8 VectorLoad64x2u(const T* data, int offset)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return (uint64x2_p8)vec_xl(offset, (uint8_t*)data);
+#else
+    return (uint64x2_p8)vec_vsx_ld(offset, (uint8_t*)data);
+#endif
+}
+
+// Unaligned load, big-endian
+template <class T> static inline
+uint64x2_p8 VectorLoad64x2ube(const T* data, int offset)
+{
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+    const uint8x16_p8 mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
+    return VectorPermute64x2(VectorLoad64x2u(data, offset), mask);
+#else
+    return VectorLoad64x2u(data, offset);
+#endif
+}
+
+// Aligned store
+template <class T> static inline
+void VectorStore64x2(const uint64x2_p8 val, T* data, int offset)
+{
+    vec_st((uint8x16_p8)val, offset, (uint8_t*)data);
+}
+
+// Unaligned store
+template <class T> static inline
+void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    vec_xst((uint8x16_p8)val, offset, (uint8_t*)data);
+#else
+    vec_vsx_st((uint8x16_p8)val, offset, (uint8_t*)data);
+#endif
+}
+
+static inline
+uint64x2_p8 VectorCh(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z)
+{
+    // The trick below is due to Andy Polyakov and Jack Lloyd
+    return vec_sel(z,y,x);
+}
+
+static inline
+uint64x2_p8 VectorMaj(const uint64x2_p8 x, const uint64x2_p8 y, const uint64x2_p8 z)
+{
+    // The trick below is due to Andy Polyakov and Jack Lloyd
+    const uint64x2_p8 xy = vec_xor(x, y);
+    return vec_sel(y, z, xy);
+}
+
+static inline
+uint64x2_p8 Vector_sigma0(const uint64x2_p8 val)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return __vshasigmad(val, 0, 0);
+#else
+    return __builtin_crypto_vshasigmad(val, 0, 0);
+#endif
+}
+
+static inline
+uint64x2_p8 Vector_sigma1(const uint64x2_p8 val)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return __vshasigmad(val, 0, 0xf);
+#else
+    return __builtin_crypto_vshasigmad(val, 0, 0xf);
+#endif
+}
+
+static inline
+uint64x2_p8 VectorSigma0(const uint64x2_p8 val)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return __vshasigmad(val, 1, 0);
+#else
+    return __builtin_crypto_vshasigmad(val, 1, 0);
+#endif
+}
+
+static inline
+uint64x2_p8 VectorSigma1(const uint64x2_p8 val)
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+    return __vshasigmad(val, 1, 0xf);
+#else
+    return __builtin_crypto_vshasigmad(val, 1, 0xf);
+#endif
+}
+
+static inline
+uint64x2_p8 VectorPack(const uint64x2_p8 x, const uint64x2_p8 y)
+{
+    const uint8x16_p8 m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+    return vec_perm(x,y,m);
+}
+
+template <unsigned int L> static inline
+uint64x2_p8 VectorShiftLeft(const uint64x2_p8 val)
+{
+#if (defined(CRYPTOPP_LITTLE_ENDIAN))
+    return (uint64x2_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, (16-L)&0xf);
+#else
+    return (uint64x2_p8)vec_sld((uint8x16_p8)val, (uint8x16_p8)val, L&0xf);
+#endif
+}
+
+template <>
+uint64x2_p8 VectorShiftLeft<0>(const uint64x2_p8 val) { return val; }
+
+template <>
+uint64x2_p8 VectorShiftLeft<16>(const uint64x2_p8 val) { return val; }
+
+template <unsigned int R> static inline
+void SHA512_ROUND1(uint64x2_p8 W[16], uint64x2_p8 S[8], const uint64x2_p8 K, const uint64x2_p8 M)
+{
+    uint64x2_p8 T1, T2;
+
+    W[R] = M;
+    T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
+    T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
+
+    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
+    S[E] = S[D] + T1;
+    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
+    S[A] = T1 + T2;
+}
+
+template <unsigned int R> static inline
+void SHA512_ROUND2(uint64x2_p8 W[16], uint64x2_p8 S[8], const uint64x2_p8 K)
+{
+    // Indexes into the W[] array
+    enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
+
+    const uint64x2_p8 s0 = Vector_sigma0(W[IDX1]);
+    const uint64x2_p8 s1 = Vector_sigma1(W[IDX14]);
+
+    uint64x2_p8 T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
+    T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
+    uint64x2_p8 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
+
+    S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
+    S[E] = S[D] + T1;
+    S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
+    S[A] = T1 + T2;
 }
 
 void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
@@ -991,13 +1467,166 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
     CRYPTOPP_ASSERT(data);
     CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
 
-    CRYPTOPP_ASSERT(0);
+    const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
+    const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
+
+    uint64x2_p8 ab = VectorLoad64x2u(state+0, 0);
+    uint64x2_p8 cd = VectorLoad64x2u(state+2, 0);
+    uint64x2_p8 ef = VectorLoad64x2u(state+4, 0);
+    uint64x2_p8 gh = VectorLoad64x2u(state+6, 0);
+
+    uint64x2_p8 W[16], S[8], vm, vk;
+
+    while (length >= SHA512::BLOCKSIZE)
+    {
+        S[A] = ab; S[C] = cd;
+        S[E] = ef; S[G] = gh;
+        S[B] = VectorShiftLeft<8>(S[A]);
+        S[D] = VectorShiftLeft<8>(S[C]);
+        S[F] = VectorShiftLeft<8>(S[E]);
+        S[H] = VectorShiftLeft<8>(S[G]);
+
+        k = reinterpret_cast<const uint64_t*>(SHA512_K);
+        unsigned int i, offset=0;
+
+        // Unroll the loop to get the constexpr of the round number
+        // for (unsigned int i=0; i<16; ++i, offset+=16)
+        {
+            vk = VectorLoad64x2(k, offset);
+            vm = VectorLoad64x2ube(m, offset);
+            SHA512_ROUND1<0>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<8>(vk);
+            vm = VectorShiftLeft<8>(vm);
+            SHA512_ROUND1<1>(W,S, vk,vm);
+
+            vk = VectorLoad64x2(k, offset);
+            vm = VectorLoad64x2ube(m, offset);
+            SHA512_ROUND1<2>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<8>(vk);
+            vm = VectorShiftLeft<8>(vm);
+            SHA512_ROUND1<3>(W,S, vk,vm);
+
+            vk = VectorLoad64x2(k, offset);
+            vm = VectorLoad64x2ube(m, offset);
+            SHA512_ROUND1<4>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<8>(vk);
+            vm = VectorShiftLeft<8>(vm);
+            SHA512_ROUND1<5>(W,S, vk,vm);
+
+            vk = VectorLoad64x2(k, offset);
+            vm = VectorLoad64x2ube(m, offset);
+            SHA512_ROUND1<6>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<8>(vk);
+            vm = VectorShiftLeft<8>(vm);
+            SHA512_ROUND1<7>(W,S, vk,vm);
+
+            vk = VectorLoad64x2(k, offset);
+            vm = VectorLoad64x2ube(m, offset);
+            SHA512_ROUND1<8>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<8>(vk);
+            vm = VectorShiftLeft<8>(vm);
+            SHA512_ROUND1<9>(W,S, vk,vm);
+
+            vk = VectorLoad64x2(k, offset);
+            vm = VectorLoad64x2ube(m, offset);
+            SHA512_ROUND1<10>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<8>(vk);
+            vm = VectorShiftLeft<8>(vm);
+            SHA512_ROUND1<11>(W,S, vk,vm);
+
+            vk = VectorLoad64x2(k, offset);
+            vm = VectorLoad64x2ube(m, offset);
+            SHA512_ROUND1<12>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<8>(vk);
+            vm = VectorShiftLeft<8>(vm);
+            SHA512_ROUND1<13>(W,S, vk,vm);
+
+            vk = VectorLoad64x2(k, offset);
+            vm = VectorLoad64x2ube(m, offset);
+            SHA512_ROUND1<14>(W,S, vk,vm);
+            offset+=16;
+
+            vk = VectorShiftLeft<8>(vk);
+            vm = VectorShiftLeft<8>(vm);
+            SHA512_ROUND1<15>(W,S, vk,vm);
+        }
+
+        m += 16; // 64-bit words, not bytes
+        length -= SHA512::BLOCKSIZE;
+
+        for (i=16; i<80; i+=16)
+        {
+            vk = VectorLoad64x2(k, offset);
+            SHA512_ROUND2<0>(W,S, vk);
+            SHA512_ROUND2<1>(W,S, VectorShiftLeft<8>(vk));
+            offset+=16;
+
+            vk = VectorLoad64x2(k, offset);
+            SHA512_ROUND2<2>(W,S, vk);
+            SHA512_ROUND2<3>(W,S, VectorShiftLeft<8>(vk));
+            offset+=16;
+
+            vk = VectorLoad64x2(k, offset);
+            SHA512_ROUND2<4>(W,S, vk);
+            SHA512_ROUND2<5>(W,S, VectorShiftLeft<8>(vk));
+            offset+=16;
+
+            vk = VectorLoad64x2(k, offset);
+            SHA512_ROUND2<6>(W,S, vk);
+            SHA512_ROUND2<7>(W,S, VectorShiftLeft<8>(vk));
+            offset+=16;
+
+            vk = VectorLoad64x2(k, offset);
+            SHA512_ROUND2<8>(W,S, vk);
+            SHA512_ROUND2<9>(W,S, VectorShiftLeft<8>(vk));
+            offset+=16;
+
+            vk = VectorLoad64x2(k, offset);
+            SHA512_ROUND2<10>(W,S, vk);
+            SHA512_ROUND2<11>(W,S, VectorShiftLeft<8>(vk));
+            offset+=16;
+
+            vk = VectorLoad64x2(k, offset);
+            SHA512_ROUND2<12>(W,S, vk);
+            SHA512_ROUND2<13>(W,S, VectorShiftLeft<8>(vk));
+            offset+=16;
+
+            vk = VectorLoad64x2(k, offset);
+            SHA512_ROUND2<14>(W,S, vk);
+            SHA512_ROUND2<15>(W,S, VectorShiftLeft<8>(vk));
+            offset+=16;
+        }
+
+        ab += VectorPack(S[A],S[B]);
+        cd += VectorPack(S[C],S[D]);
+        ef += VectorPack(S[E],S[F]);
+        gh += VectorPack(S[G],S[H]);
+    }
+
+    VectorStore64x2u(ab, state+0, 0);
+    VectorStore64x2u(cd, state+2, 0);
+    VectorStore64x2u(ef, state+4, 0);
+    VectorStore64x2u(gh, state+6, 0);
 }
 
 #endif // CRYPTOPP_POWER8_SHA_AVAILABLE
 
-//////////////////////////////////////////////
-// End Gustavo Serra Scalet and Walton code //
-//////////////////////////////////////////////
+////////////////////////////////////////////////
+// end Gustavo, Serra, Scalet and Walton code //
+////////////////////////////////////////////////
 
 NAMESPACE_END
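
A note on VectorLoad32x4ube in the patch: SHA-256 message words are big-endian in memory, so on little-endian targets the loaded bytes are permuted to reverse each 4-byte group before the vector is treated as 32-bit lanes. The standalone sketch below (not part of the patch) models that permute with the same mask values on a plain byte array; it assumes a little-endian host and ignores vec_perm's little-endian element-numbering subtleties, so it illustrates the intent rather than the exact VSX semantics.

#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    // Big-endian encoding of the words 0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10
    const uint8_t be[16]   = { 1,2,3,4, 5,6,7,8, 9,10,11,12, 13,14,15,16 };
    const uint8_t mask[16] = { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 };

    uint8_t permuted[16];
    for (int i = 0; i < 16; ++i)
        permuted[i] = be[mask[i]];     // models vec_perm(r, r, mask)

    uint32_t w[4];
    std::memcpy(w, permuted, 16);      // reinterpret as four native (little-endian) words

    // These checks hold on a little-endian host only
    assert(w[0] == 0x01020304 && w[1] == 0x05060708);
    assert(w[2] == 0x090a0b0c && w[3] == 0x0d0e0f10);
    return 0;
}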
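
A note on VectorCh and VectorMaj: both replace the textbook SHA-2 formulas with a single bitwise select. The standalone sketch below (plain scalar C++, not part of the patch) checks the two identities: Ch(x,y,z) selects y where x has a 1 bit and z elsewhere, which is vec_sel(z, y, x); for Maj(x,y,z), wherever x and y agree the majority is that common value, and wherever they differ z casts the deciding vote, which is vec_sel(y, z, x ^ y).

#include <cassert>
#include <cstdint>

// sel(mask, a, b) models vec_sel(b, a, mask): take bits of 'a' where mask is 1, of 'b' where it is 0
static uint32_t sel(uint32_t mask, uint32_t a, uint32_t b) { return (a & mask) | (b & ~mask); }

static uint32_t Ch (uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (~x & z); }
static uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }

int main()
{
    const uint32_t v[] = {0x00000000, 0xffffffff, 0xdeadbeef, 0x01234567, 0x89abcdef};
    for (uint32_t x : v) for (uint32_t y : v) for (uint32_t z : v)
    {
        assert(Ch(x, y, z)  == sel(x, y, z));        // vec_sel(z, y, x)
        assert(Maj(x, y, z) == sel(x ^ y, z, y));    // vec_sel(y, z, x ^ y)
    }
    return 0;
}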
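
The Vector_sigma* and VectorSigma* wrappers reach the POWER8 vshasigmaw/vshasigmad instructions, which evaluate the SHA-2 sigma functions on every lane at once; as used in the patch, the immediate pairs (0,0), (0,0xf), (1,0) and (1,0xf) select the lower-case sigma0/sigma1 (message schedule) and upper-case Sigma0/Sigma1 (compression) variants. For reference, a scalar C++ rendering of the four SHA-256 functions from FIPS 180-4 is sketched below; this is what the 32-bit form computes per lane, and the SHA-512 form is analogous with 64-bit rotates.

#include <cassert>
#include <cstdint>

static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

// Message-schedule functions (lower-case sigma)
static uint32_t sigma0(uint32_t x) { return rotr32(x,  7) ^ rotr32(x, 18) ^ (x >>  3); }
static uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

// Compression functions (upper-case Sigma)
static uint32_t Sigma0(uint32_t x) { return rotr32(x,  2) ^ rotr32(x, 13) ^ rotr32(x, 22); }
static uint32_t Sigma1(uint32_t x) { return rotr32(x,  6) ^ rotr32(x, 11) ^ rotr32(x, 25); }

int main()
{
    // Trivial sanity checks: all-zero and all-one inputs are fixed points of these functions
    assert(sigma0(0) == 0 && sigma1(0) == 0);
    assert(Sigma0(0xffffffffu) == 0xffffffffu && Sigma1(0xffffffffu) == 0xffffffffu);
    return 0;
}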
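
In the first sixteen rounds, one aligned 16-byte load brings in four consecutive round constants (and one unaligned, byte-swapped load brings in four message words); VectorShiftLeft<4> then rotates the vector by one 32-bit lane between rounds so that each value in turn reaches the lane the round code consumes, and the (16-L)&0xf adjustment compensates for vec_sld's big-endian byte numbering on little-endian targets. A standalone scalar model of that rotation is sketched below, using the first four SHA256_K constants purely for illustration; it treats element 0 as the live lane, which glosses over the endian-dependent element numbering of the real vectors.

#include <cassert>
#include <cstdint>

struct Vec4 { uint32_t w[4]; };             // stand-in for uint32x4_p8

static Vec4 ShiftLeftOneLane(const Vec4& v) // models VectorShiftLeft<4>
{
    Vec4 r;
    r.w[0] = v.w[1]; r.w[1] = v.w[2]; r.w[2] = v.w[3]; r.w[3] = v.w[0];
    return r;
}

int main()
{
    Vec4 vk = { {0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5} };  // SHA256_K[0..3]

    uint32_t seen[4];
    for (int r = 0; r < 4; ++r)
    {
        seen[r] = vk.w[0];            // the constant a round in this lane would consume
        vk = ShiftLeftOneLane(vk);    // vk = VectorShiftLeft<4>(vk);
    }

    // One load walks K[0], K[1], K[2], K[3] through the live lane over four rounds
    assert(seen[0] == 0x428a2f98 && seen[1] == 0x71374491);
    assert(seen[2] == 0xb5c0fbcf && seen[3] == 0xe9b5dba5);
    return 0;
}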
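
After the rounds, only one word of each state vector S[A]..S[H] carries the live hash value, so VectorPack gathers the first word of four vectors back into a single vector before it is added to the packed abcd/efgh chaining values; the three permute masks in the patch splice word 0 of each successive input into the next free slot. A standalone scalar model (again treating element 0 as the live lane and ignoring endian element numbering) is sketched below.

#include <cassert>
#include <cstdint>

struct Vec4 { uint32_t w[4]; };   // stand-in for uint32x4_p8

static Vec4 Pack(const Vec4& a, const Vec4& b, const Vec4& c, const Vec4& d)
{
    // Gather word 0 of each input, as the chained vec_perm calls in VectorPack do
    Vec4 r;
    r.w[0] = a.w[0]; r.w[1] = b.w[0]; r.w[2] = c.w[0]; r.w[3] = d.w[0];
    return r;
}

int main()
{
    const Vec4 sa = { {0xaaaa0001, 1, 2, 3} },  sb = { {0xbbbb0002, 4, 5, 6} };
    const Vec4 sc = { {0xcccc0003, 7, 8, 9} },  sd = { {0xdddd0004, 10, 11, 12} };

    const Vec4 packed = Pack(sa, sb, sc, sd);
    assert(packed.w[0] == 0xaaaa0001 && packed.w[1] == 0xbbbb0002);
    assert(packed.w[2] == 0xcccc0003 && packed.w[3] == 0xdddd0004);
    return 0;
}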