From d563c5da94454919e24fd616db81e33ce3db8122 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Mon, 30 Jul 2018 19:53:39 -0400
Subject: [PATCH] Fix SHA-256 on AIX using IBM XL C/C++ and POWER8 crypto

We were using aligned loads of the key table SHA256_K. The key table
was declared as 16-byte aligned, but it appears the table was not
aligned in memory.
---
 sha-simd.cpp | 91 +++++++++++++++------------------------------------
 1 file changed, 25 insertions(+), 66 deletions(-)

diff --git a/sha-simd.cpp b/sha-simd.cpp
index 30d6068d..09442279 100644
--- a/sha-simd.cpp
+++ b/sha-simd.cpp
@@ -187,7 +187,7 @@ bool CPU_ProbeSHA2()
 
 // ***************** Intel x86 SHA ********************
 
-// provided by sha.cpp
+// provided by sha.cpp, 16-byte aligned
 extern const word32 SHA256_K[64];
 extern const word64 SHA512_K[80];
 
@@ -987,30 +987,10 @@ typedef __vector unsigned char uint8x16_p8;
 typedef __vector unsigned int uint32x4_p8;
 typedef __vector unsigned long long uint64x2_p8;
 
-uint32x4_p8 VEC_XL_BE(int offset, const uint8_t* data)
-{
-#if defined(CRYPTOPP_XLC_VERSION)
-    return (uint32x4_p8)vec_xl_be(offset, (uint8_t*)data);
-#else
-    uint32x4_p8 res;
-    __asm(" lxvd2x %x0, %1, %2    \n\t"
-          : "=wa" (res)
-          : "b" (data), "r" (offset));
-    return res;
-#endif
-}
-
 #endif  // CRYPTOPP_POWER8_SHA_AVAILABLE
 
 #if CRYPTOPP_POWER8_SHA_AVAILABLE
 
-// Aligned load
-template <class T> static inline
-uint32x4_p8 VectorLoad32x4(const T* data, int offset)
-{
-    return (uint32x4_p8)vec_ld(offset, data);
-}
-
 // Unaligned load
 template <class T> static inline
 uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
@@ -1022,13 +1002,6 @@ uint32x4_p8 VectorLoad32x4u(const T* data, int offset)
 #endif
 }
 
-// Aligned store
-template <class T> static inline
-void VectorStore32x4(const uint32x4_p8 val, T* data, int offset)
-{
-    vec_st((uint8x16_p8)val, offset, data);
-}
-
 // Unaligned store
 template <class T> static inline
 void VectorStore32x4u(const uint32x4_p8 val, T* data, int offset)
@@ -1196,7 +1169,7 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
     // Unroll the loop to provide the round number as a constexpr
     // for (unsigned int i=0; i<16; ++i)
     {
-        vk = VectorLoad32x4(k, offset);
+        vk = VectorLoad32x4u(k, offset);
         vm = VectorLoadMsg32x4(m, offset);
         SHA256_ROUND1<0>(W,S, vk,vm);
         offset+=16;
@@ -1213,7 +1186,7 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
         vm = VectorShiftLeft<4>(vm);
         SHA256_ROUND1<3>(W,S, vk,vm);
 
-        vk = VectorLoad32x4(k, offset);
+        vk = VectorLoad32x4u(k, offset);
         vm = VectorLoadMsg32x4(m, offset);
         SHA256_ROUND1<4>(W,S, vk,vm);
         offset+=16;
@@ -1230,7 +1203,7 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
         vm = VectorShiftLeft<4>(vm);
         SHA256_ROUND1<7>(W,S, vk,vm);
 
-        vk = VectorLoad32x4(k, offset);
+        vk = VectorLoad32x4u(k, offset);
         vm = VectorLoadMsg32x4(m, offset);
         SHA256_ROUND1<8>(W,S, vk,vm);
         offset+=16;
@@ -1247,7 +1220,7 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
         vm = VectorShiftLeft<4>(vm);
         SHA256_ROUND1<11>(W,S, vk,vm);
 
-        vk = VectorLoad32x4(k, offset);
+        vk = VectorLoad32x4u(k, offset);
         vm = VectorLoadMsg32x4(m, offset);
         SHA256_ROUND1<12>(W,S, vk,vm);
         offset+=16;
@@ -1269,28 +1242,28 @@ void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t
 
     for (i=16; i<64; i+=16)
     {
-        vk = VectorLoad32x4(k, offset);
+        vk = VectorLoad32x4u(k, offset);
         SHA256_ROUND2<0>(W,S, vk);
         SHA256_ROUND2<1>(W,S, VectorShiftLeft<4>(vk));
         SHA256_ROUND2<2>(W,S, VectorShiftLeft<8>(vk));
         SHA256_ROUND2<3>(W,S, VectorShiftLeft<12>(vk));
         offset+=16;
 
-        vk = VectorLoad32x4(k, offset);
+        vk = VectorLoad32x4u(k, offset);
         SHA256_ROUND2<4>(W,S, vk);
         SHA256_ROUND2<5>(W,S, VectorShiftLeft<4>(vk));
         SHA256_ROUND2<6>(W,S, VectorShiftLeft<8>(vk));
         SHA256_ROUND2<7>(W,S, VectorShiftLeft<12>(vk));
         offset+=16;
 
-        vk = VectorLoad32x4(k, offset);
+        vk = VectorLoad32x4u(k, offset);
         SHA256_ROUND2<8>(W,S, vk);
         SHA256_ROUND2<9>(W,S, VectorShiftLeft<4>(vk));
         SHA256_ROUND2<10>(W,S, VectorShiftLeft<8>(vk));
         SHA256_ROUND2<11>(W,S, VectorShiftLeft<12>(vk));
         offset+=16;
 
-        vk = VectorLoad32x4(k, offset);
+        vk = VectorLoad32x4u(k, offset);
         SHA256_ROUND2<12>(W,S, vk);
         SHA256_ROUND2<13>(W,S, VectorShiftLeft<4>(vk));
         SHA256_ROUND2<14>(W,S, VectorShiftLeft<8>(vk));
@@ -1312,13 +1285,6 @@ uint64x2_p8 VectorPermute64x2(const uint64x2_p8 val, const uint8x16_p8 mask)
     return (uint64x2_p8)vec_perm(val, val, mask);
 }
 
-// Aligned load
-template <class T> static inline
-uint64x2_p8 VectorLoad64x2(const T* data, int offset)
-{
-    return (uint64x2_p8)vec_ld(offset, (const uint8_t*)data);
-}
-
 // Unaligned load
 template <class T> static inline
 uint64x2_p8 VectorLoad64x2u(const T* data, int offset)
@@ -1330,13 +1296,6 @@ uint64x2_p8 VectorLoad64x2u(const T* data, int offset)
 #endif
 }
 
-// Aligned store
-template <class T> static inline
-void VectorStore64x2(const uint64x2_p8 val, T* data, int offset)
-{
-    vec_st((uint8x16_p8)val, offset, (uint8_t*)data);
-}
-
 // Unaligned store
 template <class T> static inline
 void VectorStore64x2u(const uint64x2_p8 val, T* data, int offset)
@@ -1502,7 +1461,7 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
     // Unroll the loop to provide the round number as a constexpr
    // for (unsigned int i=0; i<16; ++i)
     {
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
        vm = VectorLoadMsg64x2(m, offset);
         SHA512_ROUND1<0>(W,S, vk,vm);
         offset+=16;
@@ -1511,7 +1470,7 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
         vm = VectorShiftLeft<8>(vm);
         SHA512_ROUND1<1>(W,S, vk,vm);
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         vm = VectorLoadMsg64x2(m, offset);
         SHA512_ROUND1<2>(W,S, vk,vm);
         offset+=16;
@@ -1520,7 +1479,7 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
         vm = VectorShiftLeft<8>(vm);
         SHA512_ROUND1<3>(W,S, vk,vm);
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         vm = VectorLoadMsg64x2(m, offset);
         SHA512_ROUND1<4>(W,S, vk,vm);
         offset+=16;
@@ -1529,7 +1488,7 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
         vm = VectorShiftLeft<8>(vm);
         SHA512_ROUND1<5>(W,S, vk,vm);
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         vm = VectorLoadMsg64x2(m, offset);
         SHA512_ROUND1<6>(W,S, vk,vm);
         offset+=16;
@@ -1538,7 +1497,7 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
         vm = VectorShiftLeft<8>(vm);
         SHA512_ROUND1<7>(W,S, vk,vm);
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         vm = VectorLoadMsg64x2(m, offset);
         SHA512_ROUND1<8>(W,S, vk,vm);
         offset+=16;
@@ -1547,7 +1506,7 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
         vm = VectorShiftLeft<8>(vm);
         SHA512_ROUND1<9>(W,S, vk,vm);
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         vm = VectorLoadMsg64x2(m, offset);
         SHA512_ROUND1<10>(W,S, vk,vm);
         offset+=16;
@@ -1556,7 +1515,7 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
         vm = VectorShiftLeft<8>(vm);
         SHA512_ROUND1<11>(W,S, vk,vm);
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         vm = VectorLoadMsg64x2(m, offset);
         SHA512_ROUND1<12>(W,S, vk,vm);
         offset+=16;
@@ -1565,7 +1524,7 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
         vm = VectorShiftLeft<8>(vm);
         SHA512_ROUND1<13>(W,S, vk,vm);
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         vm = VectorLoadMsg64x2(m, offset);
         SHA512_ROUND1<14>(W,S, vk,vm);
         offset+=16;
@@ -1579,42 +1538,42 @@ void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t
 
     for (i=16 ; i<80; i+=16)
     {
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         SHA512_ROUND2<0>(W,S, vk);
         SHA512_ROUND2<1>(W,S, VectorShiftLeft<8>(vk));
         offset+=16;
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         SHA512_ROUND2<2>(W,S, vk);
         SHA512_ROUND2<3>(W,S, VectorShiftLeft<8>(vk));
         offset+=16;
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         SHA512_ROUND2<4>(W,S, vk);
         SHA512_ROUND2<5>(W,S, VectorShiftLeft<8>(vk));
         offset+=16;
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         SHA512_ROUND2<6>(W,S, vk);
         SHA512_ROUND2<7>(W,S, VectorShiftLeft<8>(vk));
         offset+=16;
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         SHA512_ROUND2<8>(W,S, vk);
         SHA512_ROUND2<9>(W,S, VectorShiftLeft<8>(vk));
         offset+=16;
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         SHA512_ROUND2<10>(W,S, vk);
         SHA512_ROUND2<11>(W,S, VectorShiftLeft<8>(vk));
         offset+=16;
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         SHA512_ROUND2<12>(W,S, vk);
         SHA512_ROUND2<13>(W,S, VectorShiftLeft<8>(vk));
         offset+=16;
 
-        vk = VectorLoad64x2(k, offset);
+        vk = VectorLoad64x2u(k, offset);
         SHA512_ROUND2<14>(W,S, vk);
         SHA512_ROUND2<15>(W,S, VectorShiftLeft<8>(vk));
         offset+=16;
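
Why the aligned loads fail quietly instead of crashing: vec_ld maps to the lvx
instruction, which ignores the low four bits of the effective address rather
than faulting. If SHA256_K is not actually on a 16-byte boundary, vec_ld rounds
the address down and returns the wrong 16 bytes of the key table, and the digest
silently comes out wrong. vec_xl (lxvw4x/lxvd2x underneath) performs a true
unaligned load. A minimal standalone sketch of the difference, assuming a POWER
machine with VSX and a compiler accepting GCC-style attributes (e.g. GCC with
-mcpu=power8 -maltivec); the array and variable names are illustrative, not
taken from sha-simd.cpp:

    #include <altivec.h>
    #include <stdio.h>

    int main()
    {
        // 16-byte aligned backing store; table+1 is then 4-byte aligned
        // but deliberately not 16-byte aligned.
        __attribute__((aligned(16))) unsigned int table[20];
        for (unsigned int i = 0; i < 20; ++i)
            table[i] = i;

        const unsigned int* misaligned = table + 1;

        // vec_ld (lvx) ignores the low four bits of the address, so this
        // load silently starts back at table[0], not table[1].
        __vector unsigned int a = vec_ld(0, misaligned);

        // vec_xl performs a true unaligned load starting at table[1].
        __vector unsigned int u = vec_xl(0, misaligned);

        // On big-endian AIX the expected output is "0 1 2 3" then "1 2 3 4".
        printf("vec_ld: %u %u %u %u\n", a[0], a[1], a[2], a[3]);
        printf("vec_xl: %u %u %u %u\n", u[0], u[1], u[2], u[3]);
        return 0;
    }

This is the failure mode the patch addresses: every VectorLoad32x4/VectorLoad64x2
call on the key tables is switched to its unaligned counterpart, which costs
little on POWER8 and is correct regardless of where the linker places the tables.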
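The commit message says the table "was declared as 16-byte aligned but it
appears the table was not aligned in memory." When chasing this kind of bug it
is worth confirming the mismatch directly before changing the loads. A
hypothetical diagnostic, not part of the patch; it declares the table as plain
unsigned int, which is what Crypto++'s word32 resolves to on common platforms:

    #include <cstdint>
    #include <cstdio>

    extern const unsigned int SHA256_K[64];  // defined in sha.cpp

    void PrintKeyTableAlignment()
    {
        const std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(SHA256_K);
        // Non-zero means vec_ld would round the address down and fetch
        // the wrong round constants; the unaligned loads are immune.
        std::printf("SHA256_K %% 16 = %u\n",
                    static_cast<unsigned int>(addr & 15));
    }

On a toolchain where the alignment attribute on an extern const table is not
honored across translation units, as apparently happened here with IBM XL C/C++
on AIX, this prints a non-zero remainder.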