diff --git a/lea-simd.cpp b/lea-simd.cpp index 7ca3802e..e1529c32 100644 --- a/lea-simd.cpp +++ b/lea-simd.cpp @@ -1,7 +1,7 @@ // lea-simd.cpp - written and placed in the public domain by Jeffrey Walton // // This source file uses intrinsics and built-ins to gain access to -// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate +// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate // source file is needed because additional CXXFLAGS are required to enable // the appropriate instructions sets in some build configurations. @@ -38,6 +38,25 @@ # include #endif +// Do not port this to POWER architecture. Naively we hoped +// for a 2x speedup. The result was a 5x slow down because +// of the rotates and scattered loads. +// +// C++: +// LEA-128(128)/CTR (128-bit key)C++20715.640.5932015 +// LEA-128(192)/CTR (192-bit key)C++18617.480.6992378 +// LEA-128(256)/CTR (256-bit key)C++12426.20.8422861 +// +// Power8: +// LEA-128(128)/CTR (128-bit key)Power83788.70.5952023 +// LEA-128(192)/CTR (192-bit key)Power84082.10.6992375 +// LEA-128(256)/CTR (256-bit key)Power828116.01.0063419 + +#undef CRYPTOPP_POWER8_AVAILABLE +#if defined(CRYPTOPP_POWER8_AVAILABLE) +# include "ppc-simd.h" +#endif + // Squash MS LNK4221 and libtool warnings extern const char LEA_SIMD_FNAME[] = __FILE__; @@ -45,6 +64,8 @@ ANONYMOUS_NAMESPACE_BEGIN using CryptoPP::word32; +// *************************** ARM NEON ***************************// + #if (CRYPTOPP_ARM_NEON_AVAILABLE) inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b) @@ -258,7 +279,305 @@ inline uint32x4_t RepackNEON(const uint32x4_t& v) return UnpackNEON(v); } -inline void LEA_Encryption(uint32x4_t temp[4], const word32 *subkeys, unsigned int rounds) +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +// *************************** IA-32 ***************************// + +#if (CRYPTOPP_SSSE3_AVAILABLE) + +inline __m128i Xor(const __m128i& a, const __m128i& b) +{ + return _mm_xor_si128(a, b); +} + +inline __m128i Add(const __m128i& a, const __m128i& b) +{ + return _mm_add_epi32(a, b); +} + +inline __m128i Sub(const __m128i& a, const __m128i& b) +{ + return _mm_sub_epi32(a, b); +} + +template +inline __m128i RotateLeft(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); +} + +template +inline __m128i RotateRight(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R)); +} + +// Faster than two Shifts and an Or. +template <> +inline __m128i RotateLeft<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or. +template <> +inline __m128i RotateRight<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); + return _mm_shuffle_epi8(val, mask); +} + +template +inline __m128i LoadKey(const word32 rkey[]) +{ + float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); + return _mm_castps_si128(_mm_load_ps1(&rk)); +} + +template +inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) +{ + // Should not be instantiated + CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); + CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); + CRYPTOPP_ASSERT(0); + return _mm_setzero_si128(); +} + +template <> +inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) +{ + // LEA is little-endian oriented, so there is no need for a separate shuffle. + const __m128i r1 = _mm_unpacklo_epi32(a, b); + const __m128i r2 = _mm_unpacklo_epi32(c, d); + return _mm_unpacklo_epi64(r1, r2); +} + +template <> +inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) +{ + // LEA is little-endian oriented, so there is no need for a separate shuffle. + const __m128i r1 = _mm_unpacklo_epi32(a, b); + const __m128i r2 = _mm_unpacklo_epi32(c, d); + return _mm_unpackhi_epi64(r1, r2); +} + +template <> +inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) +{ + // LEA is little-endian oriented, so there is no need for a separate shuffle. + const __m128i r1 = _mm_unpackhi_epi32(a, b); + const __m128i r2 = _mm_unpackhi_epi32(c, d); + return _mm_unpacklo_epi64(r1, r2); +} + +template <> +inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) +{ + // LEA is little-endian oriented, so there is no need for a separate shuffle. + const __m128i r1 = _mm_unpackhi_epi32(a, b); + const __m128i r2 = _mm_unpackhi_epi32(c, d); + return _mm_unpackhi_epi64(r1, r2); +} + +template +inline __m128i UnpackXMM(const __m128i& v) +{ + // Should not be instantiated + CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0); + return _mm_setzero_si128(); +} + +template <> +inline __m128i UnpackXMM<0>(const __m128i& v) +{ + // Splat to all lanes + return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0)); +} + +template <> +inline __m128i UnpackXMM<1>(const __m128i& v) +{ + // Splat to all lanes + return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4)); +} + +template <> +inline __m128i UnpackXMM<2>(const __m128i& v) +{ + // Splat to all lanes + return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8)); +} + +template <> +inline __m128i UnpackXMM<3>(const __m128i& v) +{ + // Splat to all lanes + return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12)); +} + +template +inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) +{ + return UnpackXMM(a, b, c, d); +} + +template +inline __m128i RepackXMM(const __m128i& v) +{ + return UnpackXMM(v); +} + +#endif // CRYPTOPP_SSSE3_AVAILABLE + +// *************************** Power8 ***************************// + +#if (CRYPTOPP_POWER8_AVAILABLE) + +using CryptoPP::uint8x16_p; +using CryptoPP::uint32x4_p; +using CryptoPP::uint64x2_p; + +inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b) +{ + return vec_xor(a, b); +} + +inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b) +{ + return vec_add(a, b); +} + +inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b) +{ + return vec_sub(a, b); +} + +template +inline uint32x4_p RotateLeft(const uint32x4_p& val) +{ + const uint32x4_p m = {R, R, R, R}; + return vec_rl(val, m); +} + +template +inline uint32x4_p RotateRight(const uint32x4_p& val) +{ + const uint32x4_p m = {32-R, 32-R, 32-R, 32-R}; + return vec_rl(val, m); +} + +template +inline uint32x4_p LoadKey(const word32 rkey[]) +{ + return vec_splats(rkey[IDX]); +} + +template +inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) +{ + // Should not be instantiated + CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); + CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); + CRYPTOPP_ASSERT(0); + return vec_xor(a, a); +} + +template <> +inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) +{ + const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b); + const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d); + return (uint32x4_p)vec_mergel(r1, r2); +} + +template <> +inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) +{ + const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b); + const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d); + return (uint32x4_p)vec_mergeh(r1, r2); +} + +template <> +inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) +{ + const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b); + const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d); + return (uint32x4_p)vec_mergel(r1, r2); +} + +template <> +inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) +{ + const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b); + const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d); + return (uint32x4_p)vec_mergeh(r1, r2); +} + +template +inline uint32x4_p UnpackSIMD(const uint32x4_p& v) +{ + // Should not be instantiated + CRYPTOPP_ASSERT(0); + return vec_xor(v, v); +} + +template <> +inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v) +{ + // Splat to all lanes + const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0}; + return (uint32x4_p)vec_perm(v, v, m); +} + +template <> +inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v) +{ + // Splat to all lanes + const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4}; + return (uint32x4_p)vec_perm(v, v, m); +} + +template <> +inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v) +{ + // Splat to all lanes + const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8}; + return (uint32x4_p)vec_perm(v, v, m); +} + +template <> +inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v) +{ + // Splat to all lanes + const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12}; + return (uint32x4_p)vec_perm(v, v, m); +} + +template +inline uint32x4_p RepackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d) +{ + return UnpackSIMD(a, b, c, d); +} + +template +inline uint32x4_p RepackSIMD(const uint32x4_p& v) +{ + return UnpackSIMD(v); +} + +#endif // CRYPTOPP_POWER8_AVAILABLE + +// *************************** LEA Encryption ***************************// + +#if (CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_SSSE3_AVAILABLE) + +template +inline void LEA_Encryption(W temp[4], const word32 *subkeys, unsigned int rounds) { temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys)))); temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys)))); @@ -371,7 +690,10 @@ inline void LEA_Encryption(uint32x4_t temp[4], const word32 *subkeys, unsigned i } } -inline void LEA_Decryption(uint32x4_t temp[4], const word32 *subkeys, unsigned int rounds) +// *************************** LEA Decryption ***************************// + +template +inline void LEA_Decryption(W temp[4], const word32 *subkeys, unsigned int rounds) { if(rounds > 28) { @@ -484,6 +806,12 @@ inline void LEA_Decryption(uint32x4_t temp[4], const word32 *subkeys, unsigned i temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys)); } +#endif // LEA Encryption and Decryption + +// *************************** ARM NEON ***************************// + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) + inline void LEA_Enc_Block(uint32x4_t &block0, const word32 *subkeys, unsigned int rounds) { @@ -548,393 +876,10 @@ inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1, #endif // CRYPTOPP_ARM_NEON_AVAILABLE +// *************************** IA-32 ***************************// + #if (CRYPTOPP_SSSE3_AVAILABLE) -inline __m128i Xor(const __m128i& a, const __m128i& b) -{ - return _mm_xor_si128(a, b); -} - -inline __m128i Add(const __m128i& a, const __m128i& b) -{ - return _mm_add_epi32(a, b); -} - -inline __m128i Sub(const __m128i& a, const __m128i& b) -{ - return _mm_sub_epi32(a, b); -} - -template -inline __m128i RotateLeft(const __m128i& val) -{ - return _mm_or_si128( - _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); -} - -template -inline __m128i RotateRight(const __m128i& val) -{ - return _mm_or_si128( - _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R)); -} - -// Faster than two Shifts and an Or. -template <> -inline __m128i RotateLeft<8>(const __m128i& val) -{ - const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); - return _mm_shuffle_epi8(val, mask); -} - -// Faster than two Shifts and an Or. -template <> -inline __m128i RotateRight<8>(const __m128i& val) -{ - const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); - return _mm_shuffle_epi8(val, mask); -} - -template -inline __m128i LoadKey(const word32 rkey[]) -{ - float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk)); - return _mm_castps_si128(_mm_load_ps1(&rk)); -} - -/// \brief Unpack XMM words -/// \tparam IDX the element from each XMM word -/// \param a the first XMM word -/// \param b the second XMM word -/// \param c the third XMM word -/// \param d the fourth XMM word -/// \details UnpackXMM selects the IDX element from a, b, c, d and returns a concatenation -/// equivalent to a[IDX] || b[IDX] || c[IDX] || d[IDX]. -template -inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) -{ - // Should not be instantiated - CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b); - CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d); - CRYPTOPP_ASSERT(0); - return _mm_setzero_si128(); -} - -template <> -inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) -{ - // LEA is little-endian oriented, so there is no need for a separate shuffle. - const __m128i r1 = _mm_unpacklo_epi32(a, b); - const __m128i r2 = _mm_unpacklo_epi32(c, d); - return _mm_unpacklo_epi64(r1, r2); -} - -template <> -inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) -{ - // LEA is little-endian oriented, so there is no need for a separate shuffle. - const __m128i r1 = _mm_unpacklo_epi32(a, b); - const __m128i r2 = _mm_unpacklo_epi32(c, d); - return _mm_unpackhi_epi64(r1, r2); -} - -template <> -inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) -{ - // LEA is little-endian oriented, so there is no need for a separate shuffle. - const __m128i r1 = _mm_unpackhi_epi32(a, b); - const __m128i r2 = _mm_unpackhi_epi32(c, d); - return _mm_unpacklo_epi64(r1, r2); -} - -template <> -inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) -{ - // LEA is little-endian oriented, so there is no need for a separate shuffle. - const __m128i r1 = _mm_unpackhi_epi32(a, b); - const __m128i r2 = _mm_unpackhi_epi32(c, d); - return _mm_unpackhi_epi64(r1, r2); -} - -/// \brief Unpack a XMM word -/// \tparam IDX the element from each XMM word -/// \param v the first XMM word -/// \details UnpackXMM selects the IDX element from v and returns a concatenation -/// equivalent to v[IDX] || v[IDX] || v[IDX] || v[IDX]. -template -inline __m128i UnpackXMM(const __m128i& v) -{ - // Should not be instantiated - CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0); - return _mm_setzero_si128(); -} - -template <> -inline __m128i UnpackXMM<0>(const __m128i& v) -{ - // Splat to all lanes - return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0)); -} - -template <> -inline __m128i UnpackXMM<1>(const __m128i& v) -{ - // Splat to all lanes - return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4)); -} - -template <> -inline __m128i UnpackXMM<2>(const __m128i& v) -{ - // Splat to all lanes - return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8)); -} - -template <> -inline __m128i UnpackXMM<3>(const __m128i& v) -{ - // Splat to all lanes - return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12)); -} - -template -inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) -{ - return UnpackXMM(a, b, c, d); -} - -template -inline __m128i RepackXMM(const __m128i& v) -{ - return UnpackXMM(v); -} - -inline void LEA_Encryption(__m128i temp[4], const word32 *subkeys, unsigned int rounds) -{ - temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys)))); - temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys)))); - temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys)))); - temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys)))); - temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys)))); - temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys)))); - temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys)))); - temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys)))); - temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys)))); - temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys)))); - temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys)))); - temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys)))); - - temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys)))); - temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys)))); - temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys)))); - temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys)))); - temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys)))); - temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys)))); - temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys)))); - temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys)))); - temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys)))); - temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys)))); - temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys)))); - temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys)))); - - temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys)))); - temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys)))); - temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys)))); - temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys)))); - temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys)))); - temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys)))); - temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys)))); - temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys)))); - temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys)))); - temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys)))); - temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys)))); - temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys)))); - - temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys)))); - temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys)))); - temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys)))); - temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys)))); - temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys)))); - temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys)))); - temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys)))); - temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys)))); - temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys)))); - temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys)))); - temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys)))); - temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys)))); - - temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys)))); - temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys)))); - temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys)))); - temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys)))); - temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys)))); - temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys)))); - temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys)))); - temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys)))); - temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys)))); - temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys)))); - temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys)))); - temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys)))); - - temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys)))); - temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys)))); - temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys)))); - temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys)))); - temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys)))); - temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys)))); - temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys)))); - temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys)))); - temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys)))); - temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys)))); - temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys)))); - temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys)))); - - if(rounds > 24) - { - temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys)))); - temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys)))); - temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys)))); - temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys)))); - temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys)))); - temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys)))); - temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys)))); - temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys)))); - temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys)))); - temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys)))); - temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys)))); - temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys)))); - } - - if(rounds > 28) - { - temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys)))); - temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys)))); - temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys)))); - temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys)))); - temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys)))); - temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys)))); - temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys)))); - temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys)))); - temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys)))); - temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys)))); - temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys)))); - temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys)))); - } -} - -inline void LEA_Decryption(__m128i temp[4], const word32 *subkeys, unsigned int rounds) -{ - if(rounds > 28) - { - temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys)); - temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys)); - temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys)); - temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys)); - } - - if(rounds > 24) - { - temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys)); - temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys)); - temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys)); - temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys)); - } - - temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys)); - temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys)); - temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys)); - temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys)); - - temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys)); - temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys)); - temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys)); - temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys)); - - temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys)); - temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys)); - temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys)); - temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys)); - - temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys)); - temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys)); - temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys)); - temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys)); - - temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys)); - temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys)); - temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys)); - temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys)); - - temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys)); - temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys)); - temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys)); - temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys)); - temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys)); - temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys)); - temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys)); - temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys)); -} - inline void LEA_Enc_Block(__m128i &block0, const word32 *subkeys, unsigned int rounds) { @@ -999,8 +944,78 @@ inline void LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1, #endif // CRYPTOPP_SSSE3_AVAILABLE +// *************************** Power8 ***************************// + +#if (CRYPTOPP_POWER8_AVAILABLE) + +inline void LEA_Enc_Block(uint32x4_p &block0, + const word32 *subkeys, unsigned int rounds) +{ + uint32x4_p temp[4]; + temp[0] = UnpackSIMD<0>(block0); + temp[1] = UnpackSIMD<1>(block0); + temp[2] = UnpackSIMD<2>(block0); + temp[3] = UnpackSIMD<3>(block0); + + LEA_Encryption(temp, subkeys, rounds); + + block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]); +} + +inline void LEA_Dec_Block(uint32x4_p &block0, + const word32 *subkeys, unsigned int rounds) +{ + uint32x4_p temp[4]; + temp[0] = UnpackSIMD<0>(block0); + temp[1] = UnpackSIMD<1>(block0); + temp[2] = UnpackSIMD<2>(block0); + temp[3] = UnpackSIMD<3>(block0); + + LEA_Decryption(temp, subkeys, rounds); + + block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]); +} + +inline void LEA_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1, + uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds) +{ + uint32x4_p temp[4]; + temp[0] = UnpackSIMD<0>(block0, block1, block2, block3); + temp[1] = UnpackSIMD<1>(block0, block1, block2, block3); + temp[2] = UnpackSIMD<2>(block0, block1, block2, block3); + temp[3] = UnpackSIMD<3>(block0, block1, block2, block3); + + LEA_Encryption(temp, subkeys, rounds); + + block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]); + block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]); + block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]); + block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]); +} + +inline void LEA_Dec_4_Blocks(uint32x4_p &block0, uint32x4_p &block1, + uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds) +{ + uint32x4_p temp[4]; + temp[0] = UnpackSIMD<0>(block0, block1, block2, block3); + temp[1] = UnpackSIMD<1>(block0, block1, block2, block3); + temp[2] = UnpackSIMD<2>(block0, block1, block2, block3); + temp[3] = UnpackSIMD<3>(block0, block1, block2, block3); + + LEA_Decryption(temp, subkeys, rounds); + + block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]); + block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]); + block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]); + block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]); +} + +#endif // CRYPTOPP_POWER8_AVAILABLE + ANONYMOUS_NAMESPACE_END +// *************************** SIMD Templates ***************************// + NAMESPACE_BEGIN(CryptoPP) #if defined(CRYPTOPP_SSSE3_AVAILABLE) @@ -1037,4 +1052,20 @@ size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, } #endif // CRYPTOPP_ARM_NEON_AVAILABLE +#if defined(CRYPTOPP_POWER8_AVAILABLE) +size_t LEA_Enc_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Enc_Block, LEA_Enc_4_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t LEA_Dec_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Dec_Block, LEA_Dec_4_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif // CRYPTOPP_POWER8_AVAILABLE + NAMESPACE_END