diff --git a/GNUmakefile b/GNUmakefile index 263eaca7..009feb03 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -250,6 +250,7 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_SSSE3,$(CXXFLAGS)),) ifeq ($(HAVE_SSSE3),1) ARIA_FLAG = -mssse3 CHAM_FLAG = -mssse3 + LEA_FLAG = -mssse3 SSSE3_FLAG = -mssse3 SIMON_FLAG = -mssse3 SPECK_FLAG = -mssse3 @@ -291,6 +292,7 @@ ifeq ($(SUN_COMPILER),1) SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1 ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1 CHAM_FLAG = -xarch=ssse3 -D__SSSE3__=1 + LEA_FLAG = -xarch=ssse3 -D__SSSE3__=1 SIMON_FLAG = -xarch=ssse3 -D__SSSE3__=1 SPECK_FLAG = -xarch=ssse3 -D__SSSE3__=1 LDFLAGS += -xarch=ssse3 @@ -1068,6 +1070,10 @@ crc-simd.o : crc-simd.cpp gcm-simd.o : gcm-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $< +# SSSE3 available +lea-simd.o : lea-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $< + # NEON available neon-simd.o : neon-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(NEON_FLAG) -c) $< diff --git a/GNUmakefile-cross b/GNUmakefile-cross index b66b3874..820e3fd0 100755 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -277,6 +277,7 @@ ifneq ($(IS_i686)$(IS_x86_64),00) ifeq ($(HAVE_SSSE3),1) ARIA_FLAG = -mssse3 CHAM_FLAG = -mssse3 + LEA_FLAG = -mssse3 SSSE3_FLAG = -mssse3 SIMON_FLAG = -mssse3 SPECK_FLAG = -mssse3 @@ -504,6 +505,10 @@ crc-simd.o : crc-simd.cpp gcm-simd.o : gcm-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $< +# SSSE3 available +lea-simd.o : lea-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $< + # NEON available neon-simd.o : neon-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(NEON_FLAG) -c) $< diff --git a/adv-simd.h b/adv-simd.h index cdb0311a..1609faeb 100644 --- a/adv-simd.h +++ b/adv-simd.h @@ -6,7 +6,7 @@ // acceleration. After several implementations we noticed a lot of copy and // paste occuring. adv-simd.h provides a template to avoid the copy and paste. // -// There are 8 templates provided in this file. The number following the +// There are 9 templates provided in this file. 
The number following the // function name is the block size of the cipher. The name following that // is the acceleration and arrangement. For example 4x1_SSE means Intel SSE // using two encrypt (or decrypt) functions: one that operates on 4 blocks, @@ -22,6 +22,14 @@ // * AdvancedProcessBlocks64_6x2_ALTIVEC // * AdvancedProcessBlocks128_6x2_ALTIVEC // +// If an arrangement ends in 2, like 6x2, then the template will handle the +// single block case by padding with 0's and using the two block function. +// This happens at most one time when processing multiple blocks. The extra +// processing of a zero block is trivial and worth the tradeoff. +// +// The MAYBE_CONST macro present on x86 is a SunCC workaround. Some versions +// of SunCC lose/drop the const-ness in the F1 and F4 functions. It eventually +// results in a failed link due to the const/non-const mismatch. #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES @@ -323,7 +331,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, } /// \brief AdvancedProcessBlocks for 1 and 6 blocks -/// \tparam F1 function to process 1 128-bit blocks +/// \tparam F1 function to process 1 128-bit block /// \tparam F6 function to process 6 128-bit blocks /// \tparam W word type of the subkey table /// \details AdvancedProcessBlocks128_NEON1x6 processes 6 and 2 NEON SIMD words @@ -721,7 +729,7 @@ NAMESPACE_BEGIN(CryptoPP) template inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2, - const W *subKeys, size_t rounds, const byte *inBlocks, + MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { CRYPTOPP_ASSERT(subKeys); @@ -878,7 +886,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_2x1_SSE(F1 func1, F2 func2, /// same word type. 
template inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, - const W *subKeys, size_t rounds, const byte *inBlocks, + MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { CRYPTOPP_ASSERT(subKeys); @@ -1125,7 +1133,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, /// same word type. template inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, - const W *subKeys, size_t rounds, const byte *inBlocks, + MAYBE_CONST W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { CRYPTOPP_ASSERT(subKeys); @@ -1312,12 +1320,12 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, } /// \brief AdvancedProcessBlocks for 1 and 4 blocks -/// \tparam F1 function to process 1 128-bit blocks +/// \tparam F1 function to process 1 128-bit block /// \tparam F4 function to process 4 128-bit blocks /// \tparam W word type of the subkey table /// \details AdvancedProcessBlocks128_4x1_SSE processes 4 and 1 SSE SIMD words /// at a time. -/// \details The subkey type is usually word32 or word64. F1 and F6 must use the +/// \details The subkey type is usually word32 or word64. F1 and F4 must use the /// same word type. 
template inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, @@ -1455,7 +1463,7 @@ NAMESPACE_END // CryptoPP NAMESPACE_BEGIN(CryptoPP) /// \brief AdvancedProcessBlocks for 1 and 6 blocks -/// \tparam F1 function to process 1 128-bit blocks +/// \tparam F1 function to process 1 128-bit block /// \tparam F6 function to process 6 128-bit blocks /// \tparam W word type of the subkey table /// \details AdvancedProcessBlocks128_6x1_ALTIVEC processes 6 and 1 Altivec SIMD words diff --git a/cryptest.nmake b/cryptest.nmake index 3c9e7c99..aa0477c7 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -47,9 +47,9 @@ # If you use 'make sources' from Linux makefile, then add 'winpipes.cpp' to the list below. -LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp cham.cpp cham-simd.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp lea.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp 
simon.cpp simon-simd.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp speck.cpp speck-simd.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp +LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp cham.cpp cham-simd.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp lea.cpp lea-simd.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp simon.cpp simon-simd.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp speck.cpp speck-simd.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp 
zdeflate.cpp zinflate.cpp zlib.cpp -LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj cham.obj cham-simd.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj lea.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj simon.obj simon-simd.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj speck.obj speck-simd.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj +LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj 
cham.obj cham-simd.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj lea.obj lea-simd.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj simon.obj simon-simd.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj speck.obj speck-simd.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj TEST_SRCS = bench1.cpp bench2.cpp test.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp validat4.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp fipstest.cpp diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index a2f7e0be..1bce1d1d 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -242,6 +242,7 @@ + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index 9867d7ba..d91ce2d2 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -224,6 +224,9 @@ Source Files + + Source Files + Source Files diff --git a/lea-simd.cpp b/lea-simd.cpp new file mode 100644 index 
00000000..0076926d
--- /dev/null
+++ b/lea-simd.cpp
@@ -0,0 +1,495 @@
+// lea-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+// source file is needed because additional CXXFLAGS are required to enable
+// the appropriate instruction sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "lea.h"
+#include "misc.h"
+#include "adv-simd.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both lea.cpp and lea-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <pmmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::word32;
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+
+inline __m128i Xor(const __m128i& a, const __m128i& b)  // bitwise XOR of two 128-bit vectors
+{
+    return _mm_xor_si128(a, b);
+}
+
+inline __m128i Add(const __m128i& a, const __m128i& b)  // addition in each 32-bit lane
+{
+    return _mm_add_epi32(a, b);
+}
+
+inline __m128i Sub(const __m128i& a, const __m128i& b)  // subtraction (a - b) in each 32-bit lane
+{
+    return _mm_sub_epi32(a, b);
+}
+
+template <unsigned int R>  // R: left-rotation amount in bits, applied to each 32-bit lane
+inline __m128i RotateLeft(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+}
+
+template <unsigned int R>  // R: right-rotation amount in bits, applied to each 32-bit lane
+inline __m128i RotateRight(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+}
+
+// Faster than two Shifts and an Or: a rotate by 8 is a byte permutation, done with one shuffle.
+template <>
+inline __m128i RotateLeft<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+// Faster than two Shifts and an Or.
+template <>
+inline __m128i RotateRight<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+template <unsigned int IDX>  // IDX: subkey index; one unaligned 128-bit load starting at rkey[IDX*4]
+inline __m128i LoadKey(const word32 rkey[])
+{
+    return _mm_loadu_si128((const __m128i*) &rkey[IDX*4]);  // NOTE(review): presumes four word32 per IDX in rkey - confirm against the key schedule
+}
+
+template <unsigned int IDX>  // primary template; only the IDX = 0..3 specializations are meant to be used
+inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+    // Should not be instantiated
+    CRYPTOPP_ASSERT(0);
+    return _mm_setzero_si128();
+}
+
+template <>
+inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+    // LEA is little-endian oriented, so there is no need for a separate shuffle. Result lanes: a0, b0, c0, d0.
+    const __m128i r1 = _mm_unpacklo_epi32(a, b);
+    const __m128i r2 = _mm_unpacklo_epi32(c, d);
+    return _mm_unpacklo_epi64(r1, r2);
+}
+
+template <>
+inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+    // LEA is little-endian oriented, so there is no need for a separate shuffle. Result lanes: a1, b1, c1, d1.
+    const __m128i r1 = _mm_unpacklo_epi32(a, b);
+    const __m128i r2 = _mm_unpacklo_epi32(c, d);
+    return _mm_unpackhi_epi64(r1, r2);
+}
+
+template <>
+inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+    // LEA is little-endian oriented, so there is no need for a separate shuffle. Result lanes: a2, b2, c2, d2.
+    const __m128i r1 = _mm_unpackhi_epi32(a, b);
+    const __m128i r2 = _mm_unpackhi_epi32(c, d);
+    return _mm_unpacklo_epi64(r1, r2);
+}
+
+template <>
+inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+    // LEA is little-endian oriented, so there is no need for a separate shuffle.
+    const __m128i r1 = _mm_unpackhi_epi32(a, b);
+    const __m128i r2 = _mm_unpackhi_epi32(c, d);
+    return _mm_unpackhi_epi64(r1, r2);
+}
+
+template <unsigned int IDX>  // primary template; only the IDX = 0..3 specializations are meant to be used
+inline __m128i UnpackXMM(const __m128i& v)
+{
+    // Should not be instantiated
+    CRYPTOPP_ASSERT(0);
+    return _mm_setzero_si128();
+}
+
+template <>
+inline __m128i UnpackXMM<0>(const __m128i& v)
+{
+    // Splat to all lanes (bytes 0..3 of v, i.e. 32-bit word 0)
+    return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
+}
+
+template <>
+inline __m128i UnpackXMM<1>(const __m128i& v)
+{
+    // Splat to all lanes (bytes 4..7 of v, i.e. 32-bit word 1)
+    return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
+}
+
+template <>
+inline __m128i UnpackXMM<2>(const __m128i& v)
+{
+    // Splat to all lanes (bytes 8..11 of v, i.e. 32-bit word 2)
+    return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
+}
+
+template <>
+inline __m128i UnpackXMM<3>(const __m128i& v)
+{
+    // Splat to all lanes (bytes 12..15 of v, i.e. 32-bit word 3)
+    return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
+}
+
+template <unsigned int IDX>  // forwards to the four-vector UnpackXMM<IDX>
+inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
+{
+    return UnpackXMM<IDX>(a, b, c, d);
+}
+
+template <unsigned int IDX>  // forwards to the single-vector UnpackXMM<IDX>
+inline __m128i RepackXMM(const __m128i& v)
+{
+    return UnpackXMM<IDX>(v);
+}
+
+inline void LEA_Encryption(__m128i temp[4], const word32 *subkeys, unsigned int rounds)  // updates temp[0..3] in place
+{
+    temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
+    temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
+    temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
+    temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
+    temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
+    temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
+    temp[1] = RotateRight<3>(Add(Xor(temp[0],
LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
+    temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
+    temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
+    temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
+    temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
+    temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
+    // Rounds 5-8: LoadKey<24> .. LoadKey<47> (three statements and six subkey entries per round).
+    temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
+    temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
+    temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
+    temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
+    temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
+    temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
+    temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
+    temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
+    temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
+    temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
+    temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
+    temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
+    // Rounds 9-12: LoadKey<48> .. LoadKey<71>.
+    temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
+    temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
+    temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
+    temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
+    temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
+    temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
+    temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
+    temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
+    temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
+    temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
+    temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
+    temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
+    // Rounds 13-16: LoadKey<72> .. LoadKey<95>.
+    temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
+    temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
+    temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
+    temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
+    temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
+    temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
+    temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
+    temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
+    temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
+    temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
+    temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
+    temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
+    // Rounds 17-20: LoadKey<96> .. LoadKey<119>.
+    temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
+    temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
+    temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
+    temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
+    temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
+    temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
+    temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
+    temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
+    temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
+    temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
+    temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
+    temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
+    // Rounds 21-24: LoadKey<120> .. LoadKey<143>. Last group that runs unconditionally.
+    temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
+    temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
+    temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
+    temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
+    temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
+    temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
+    temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
+    temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
+    temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
+    temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
+    temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
+    temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
+
+    if(rounds > 24)  // rounds 25-28 (LoadKey<144> .. LoadKey<167>); skipped when rounds <= 24
+    {
+        temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
+        temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
+        temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
+        temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
+        temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
+        temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
+        temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
+        temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
+        temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
+        temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
+        temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
+        temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
+    }
+
+    if(rounds > 28)  // rounds 29-32 (LoadKey<168> .. LoadKey<191>); skipped when rounds <= 28
+    {
+        temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
+        temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
+        temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
+        temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
+        temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
+        temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
+        temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
+        temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
+        temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
+        temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
+        temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
+        temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
+    }
+}
+
+inline void LEA_Decryption(__m128i temp[4], const word32 *subkeys, unsigned int rounds)  // updates temp[0..3] in place, starting from the highest subkey indices
+{
+    if(rounds > 28)  // handles LoadKey<168> .. LoadKey<191> first; skipped when rounds <= 28
+    {
+        temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
+        temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
+        temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))),
LoadKey<191>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys)); + } + + if(rounds > 24) + { + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys)); + temp[1] = 
Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys)); + } + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), 
LoadKey<109>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), 
LoadKey<75>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), 
LoadKey<41>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys)); + + temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys)); + temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys)); + temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys)); + temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys)); + temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys)); + temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys)); + temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys)); + temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys)); +} +/* LEA_Enc_Block: encrypts a single block in place. block0 is split into temp[0..3] with UnpackXMM<0..3>, LEA_Encryption runs over temp using subkeys and rounds, and RepackXMM<0> writes the result back to block0. */ +inline void GCC_NO_UBSAN LEA_Enc_Block(__m128i &block0, + const word32 *subkeys, unsigned int 
rounds) +{ + __m128i temp[4]; + temp[0] = UnpackXMM<0>(block0); + temp[1] = UnpackXMM<1>(block0); + temp[2] = UnpackXMM<2>(block0); + temp[3] = UnpackXMM<3>(block0); + + LEA_Encryption(temp, subkeys, rounds); + + block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]); +} +/* LEA_Dec_Block: decrypts a single block in place. Same unpack and repack steps as LEA_Enc_Block, with LEA_Decryption as the transform over temp[0..3]. */ +inline void GCC_NO_UBSAN LEA_Dec_Block(__m128i &block0, + const word32 *subkeys, unsigned int rounds) +{ + __m128i temp[4]; + temp[0] = UnpackXMM<0>(block0); + temp[1] = UnpackXMM<1>(block0); + temp[2] = UnpackXMM<2>(block0); + temp[3] = UnpackXMM<3>(block0); + + LEA_Decryption(temp, subkeys, rounds); + + block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]); +} +/* LEA_Enc_4_Blocks: encrypts four blocks in place with one LEA_Encryption call. temp[i] is UnpackXMM<i> of all four inputs (presumably word i of each block; confirm against UnpackXMM), and RepackXMM<0..3> rebuilds block0..block3 from temp. */ +inline void GCC_NO_UBSAN LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds) +{ + __m128i temp[4]; + temp[0] = UnpackXMM<0>(block0, block1, block2, block3); + temp[1] = UnpackXMM<1>(block0, block1, block2, block3); + temp[2] = UnpackXMM<2>(block0, block1, block2, block3); + temp[3] = UnpackXMM<3>(block0, block1, block2, block3); + + LEA_Encryption(temp, subkeys, rounds); + + block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]); + block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]); + block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]); + block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]); +} +/* LEA_Dec_4_Blocks: decrypts four blocks in place with one LEA_Decryption call. Same unpack and repack steps as LEA_Enc_4_Blocks; RepackXMM<0..3> rebuilds block0..block3 from temp. */ +inline void GCC_NO_UBSAN LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds) +{ + __m128i temp[4]; + temp[0] = UnpackXMM<0>(block0, block1, block2, block3); + temp[1] = UnpackXMM<1>(block0, block1, block2, block3); + temp[2] = UnpackXMM<2>(block0, block1, block2, block3); + temp[3] = UnpackXMM<3>(block0, block1, block2, block3); + + LEA_Decryption(temp, subkeys, rounds); + + block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]); + block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]); + block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]); + 
block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]); +} + +#endif // CRYPTOPP_SSSE3_AVAILABLE + +ANONYMOUS_NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) + +#if defined(CRYPTOPP_SSSE3_AVAILABLE) +void LEA_SplatKeys_SSSE3(SecBlock& rkeys) +{ + SecBlock temp(rkeys.size() * 4); + for (size_t i=0, j=0; i& rkeys); + +extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); + +extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs ¶ms) { CRYPTOPP_UNUSED(params); @@ -584,6 +595,15 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con default: CRYPTOPP_ASSERT(0);; } + +#if (CRYPTOPP_SSSE3_AVAILABLE) + if (HasSSSE3()) + { + // If we pre-splat the round keys at setup then we avoid a shuffle + // at runtime for each subkey used during encryption and decryption. 
+ LEA_SplatKeys_SSSE3(m_rkey); + } +#endif } void LEA::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const @@ -826,4 +846,26 @@ void LEA::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byt oblock(m_temp[0])(m_temp[1])(m_temp[2])(m_temp[3]); } +#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS /* LEA::Enc::AdvancedProcessBlocks: when HasSSSE3() is true, forwards to LEA_Enc_AdvancedProcessBlocks_SSSE3 with m_rkey and m_rounds; otherwise falls back to BlockTransformation::AdvancedProcessBlocks. NOTE(review): this section is gated by CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS while the key splat in UncheckedSetKey is gated by CRYPTOPP_SSSE3_AVAILABLE; confirm the SSSE3 externs are declared whenever this section compiles. */ +size_t LEA::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, + byte *outBlocks, size_t length, word32 flags) const +{ + if (HasSSSE3()) { + return LEA_Enc_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds, + inBlocks, xorBlocks, outBlocks, length, flags); + } + return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); +} +/* LEA::Dec::AdvancedProcessBlocks: decryption counterpart. When HasSSSE3() is true, forwards to LEA_Dec_AdvancedProcessBlocks_SSSE3 with m_rkey and m_rounds; otherwise falls back to BlockTransformation::AdvancedProcessBlocks. */ +size_t LEA::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, + byte *outBlocks, size_t length, word32 flags) const +{ + if (HasSSSE3()) { + return LEA_Dec_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds, + inBlocks, xorBlocks, outBlocks, length, flags); + } + return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif // CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS + NAMESPACE_END diff --git a/lea.h b/lea.h index e5a6dc4e..7b34724d 100644 --- a/lea.h +++ b/lea.h @@ -15,6 +15,10 @@ #include "secblock.h" #include "algparam.h" +#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86) +# define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1 +#endif + NAMESPACE_BEGIN(CryptoPP) /// \brief LEA block cipher information @@ -59,6 +63,10 @@ public: { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; + +#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS + size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; +#endif }; /// \brief Provides implementation for encryption transformation @@ -69,6 +77,10 @@ public: { public: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte 
*outBlock) const; + +#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS + size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; +#endif }; typedef BlockCipherFinal Encryption;