From e416b243d37d0904c6dfdf1ecce491f458fcecbb Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 18 Feb 2018 23:23:50 -0500 Subject: [PATCH] Re-add Simon and Speck, enable SSE (GH #585) This commit re-adds Simon and Speck. The commit includes C++, SSSE3 and SSE4. NEON, Aarch32 and Aarch64 are disabled at the moment. --- Filelist.txt | 8 + GNUmakefile | 28 + GNUmakefile-cross | 18 + bench1.cpp | 12 + cryptest.nmake | 4 +- cryptlib.vcxproj | 6 + cryptlib.vcxproj.filters | 18 + regtest2.cpp | 15 + simon-simd.cpp | 1141 ++++++++++++++++++++++++++++++++++++++ simon.cpp | 463 ++++++++++++++++ simon.h | 180 ++++++ speck-simd.cpp | 1031 ++++++++++++++++++++++++++++++++++ speck.cpp | 438 +++++++++++++++ speck.h | 180 ++++++ 14 files changed, 3540 insertions(+), 2 deletions(-) create mode 100644 simon-simd.cpp create mode 100644 simon.cpp create mode 100644 simon.h create mode 100644 speck-simd.cpp create mode 100644 speck.cpp create mode 100644 speck.h diff --git a/Filelist.txt b/Filelist.txt index 24842d31..b679a6cf 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -276,6 +276,9 @@ sharkbox.cpp simple.cpp simple.h siphash.h +simon.cpp +simon-simd.cpp +simon.h skipjack.cpp skipjack.h sm3.cpp @@ -287,6 +290,9 @@ socketft.cpp socketft.h sosemanuk.cpp sosemanuk.h +speck.cpp +speck-simd.cpp +speck.h square.cpp square.h squaretb.cpp @@ -466,10 +472,12 @@ TestVectors/sha3_256_fips_202.txt TestVectors/sha3_384_fips_202.txt TestVectors/sha3_512_fips_202.txt TestVectors/shacal2.txt +TestVectors/simon.txt TestVectors/siphash.txt TestVectors/sm3.txt TestVectors/sm4.txt TestVectors/sosemanuk.txt +TestVectors/speck.txt TestVectors/tea.txt TestVectors/threefish.txt TestVectors/ttmac.txt diff --git a/GNUmakefile b/GNUmakefile index 6fadc15f..7de31b00 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -250,11 +250,15 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_SSSE3,$(CXXFLAGS)),) ifeq ($(HAVE_SSSE3),1) ARIA_FLAG = -mssse3 SSSE3_FLAG = -mssse3 + SIMON_FLAG = -mssse3 + SPECK_FLAG = -mssse3 endif ifeq ($(findstring -DCRYPTOPP_DISABLE_SSE4,$(CXXFLAGS)),) HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.1 -dM -E - 2>/dev/null | $(GREP) -i -c __SSE4_1__) ifeq ($(HAVE_SSE4),1) BLAKE2_FLAG = -msse4.1 + SIMON_FLAG = -msse4.1 + SPECK_FLAG = -msse4.1 endif HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.2 -dM -E - 2>/dev/null | $(GREP) -i -c __SSE4_2__) ifeq ($(HAVE_SSE4),1) @@ -285,11 +289,15 @@ ifeq ($(SUN_COMPILER),1) ifeq ($(COUNT),0) SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1 ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1 + SIMON_FLAG = -xarch=ssse3 -D__SSSE3__=1 + SPECK_FLAG = -xarch=ssse3 -D__SSSE3__=1 LDFLAGS += -xarch=ssse3 endif COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) BLAKE2_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + SIMON_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + SPECK_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 LDFLAGS += -xarch=sse4_1 endif COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") @@ -366,6 +374,8 @@ ifeq ($(IS_NEON),1) GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + SIMON_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon endif endif @@ -375,6 +385,8 @@ ifeq ($(IS_ARMV8),1) ARIA_FLAG = -march=armv8-a BLAKE2_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a + SIMON_FLAG = 
-march=armv8-a + SPECK_FLAG = -march=armv8-a endif HAVE_CRC = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(GREP) -i -c __ARM_FEATURE_CRC32) ifeq ($(HAVE_CRC),1) @@ -397,6 +409,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) ALTIVEC_FLAG = -mcpu=power4 -maltivec ARIA_FLAG = -mcpu=power4 -maltivec BLAKE2_FLAG = -mcpu=power4 -maltivec + SIMON_FLAG = -mcpu=power4 -maltivec + SPECK_FLAG = -mcpu=power4 -maltivec endif # GCC and some compatibles HAVE_CRYPTO = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -mcpu=power8 -maltivec -dM -E - 2>/dev/null | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO') @@ -405,6 +419,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) AES_FLAG = -mcpu=power8 -maltivec GCM_FLAG = -mcpu=power8 -maltivec SHA_FLAG = -mcpu=power8 -maltivec + SIMON_FLAG = -mcpu=power8 -maltivec + SPECK_FLAG = -mcpu=power8 -maltivec endif # IBM XL C/C++ HAVE_ALTIVEC = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr7 -qaltivec -E adhoc.cpp.proto 2>/dev/null | $(GREP) -i -c '__ALTIVEC__') @@ -412,6 +428,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) ALTIVEC_FLAG = -qarch=pwr7 -qaltivec ARIA_FLAG = -qarch=pwr7 -qaltivec BLAKE2_FLAG = -qarch=pwr7 -qaltivec + SIMON_FLAG = -qarch=pwr7 -qaltivec + SPECK_FLAG = -qarch=pwr7 -qaltivec endif # IBM XL C/C++ HAVE_CRYPTO = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr8 -qaltivec -E adhoc.cpp.proto 2>/dev/null | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO') @@ -422,6 +440,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) SHA_FLAG = -qarch=pwr8 -qaltivec ARIA_FLAG = -qarch=pwr8 -qaltivec BLAKE2_FLAG = -qarch=pwr8 -qaltivec + SIMON_FLAG = -qarch=pwr8 -qaltivec + SPECK_FLAG = -qarch=pwr8 -qaltivec endif endif @@ -1057,6 +1077,14 @@ sha-simd.o : sha-simd.cpp shacal2-simd.o : shacal2-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< +# SSSE3 or NEON available +simon-simd.o : simon-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SIMON_FLAG) -c) $< + +# SSSE3 or NEON available +speck-simd.o : speck-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SPECK_FLAG) -c) $< + # Don't build Rijndael with UBsan. Too much noise due to unaligned data accesses. 
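Only simon-simd.cpp and speck-simd.cpp receive the SSSE3/SSE4.1 (or NEON/Altivec) flags above; every other object keeps the baseline flags, and the vectorized code is entered only after a runtime CPU probe. A minimal sketch of that split-TU dispatch pattern, with hypothetical EncryptBlocks* names (Crypto++ itself gates on its own HasSSSE3()/HasSSE41() probes from cpu.h):

    #include <cstddef>

    // kernel-simd.cpp -- the one TU built with -msse4.1, like simon-simd.cpp.
    std::size_t EncryptBlocks_SSE41(const unsigned char* in, unsigned char* out, std::size_t n);

    // kernel.cpp -- built with baseline flags, safe on any x86/x64 CPU.
    std::size_t EncryptBlocks_CXX(const unsigned char* in, unsigned char* out, std::size_t n);

    std::size_t EncryptBlocks(const unsigned char* in, unsigned char* out, std::size_t n)
    {
        // Runtime probe (GCC/Clang builtin here; the library uses its own
        // CPUID wrappers) keeps SSE4.1 code off hardware that lacks it.
        if (__builtin_cpu_supports("sse4.1"))
            return EncryptBlocks_SSE41(in, out, n);
        return EncryptBlocks_CXX(in, out, n);
    }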
ifneq ($(findstring -fsanitize=undefined,$(CXXFLAGS)),) rijndael.o : rijndael.cpp diff --git a/GNUmakefile-cross b/GNUmakefile-cross index ff5d4bc3..79cdf31f 100755 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -234,12 +234,16 @@ ifeq ($(IS_NEON),1) GCM_FLAG += -mfpu=neon ARIA_FLAG += -mfpu=neon BLAKE2_FLAG += -mfpu=neon + SIMON_FLAG += -mfpu=neon + SPECK_FLAG += -mfpu=neon ifeq ($(IS_ANDROID),1) ifeq ($(findstring -mfloat-abi=softfp,$(CXXFLAGS)),) NEON_FLAG += -mfloat-abi=softfp GCM_FLAG += -mfloat-abi=softfp ARIA_FLAG += -mfloat-abi=softfp BLAKE2_FLAG += -mfloat-abi=softfp + SIMON_FLAG += -mfloat-abi=softfp + SPECK_FLAG += -mfloat-abi=softfp endif endif endif @@ -252,6 +256,8 @@ ifneq ($(IS_ARMv8),0) ARIA_FLAG = -march=armv8-a BLAKE2_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a + SIMON_FLAG = -march=armv8-a + SPECK_FLAG = -march=armv8-a endif HAVE_CRC := $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(EGREP) -i -c __ARM_FEATURE_CRC32) ifeq ($(HAVE_CRC),1) @@ -271,9 +277,13 @@ ifneq ($(IS_i686)$(IS_x86_64),00) ifeq ($(HAVE_SSSE3),1) ARIA_FLAG = -mssse3 SSSE3_FLAG = -mssse3 + SIMON_FLAG = -mssse3 + SPECK_FLAG = -mssse3 endif HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.1 -dM -E - 2>/dev/null | $(EGREP) -i -c __SSE4_1__) ifeq ($(HAVE_SSE4),1) + SIMON_FLAG = -msse4.1 + SPECK_FLAG = -msse4.1 endif HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.2 -dM -E - 2>/dev/null | $(EGREP) -i -c __SSE4_2__) ifeq ($(HAVE_SSE4),1) @@ -500,6 +510,14 @@ sha-simd.o : sha-simd.cpp shacal2-simd.o : shacal2-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< +# SSSE3 or NEON available +simon-simd.o : simon-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SIMON_FLAG) -c) $< + +# SSSE3 or NEON available +speck-simd.o : speck-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SPECK_FLAG) -c) $< + %.o : %.cpp $(CXX) $(strip $(CXXFLAGS) -c) $< diff --git a/bench1.cpp b/bench1.cpp index 06580086..153428ff 100644 --- a/bench1.cpp +++ b/bench1.cpp @@ -614,6 +614,18 @@ void Benchmark2(double t, double hertz) BenchMarkByName("Kalyna-256/CTR", 32, "Kalyna-256(256)/CTR (256-bit key)"); BenchMarkByName("Kalyna-256/CTR", 64, "Kalyna-256(512)/CTR (512-bit key)"); BenchMarkByName("Kalyna-512/CTR", 64, "Kalyna-512(512)/CTR (512-bit key)"); + + BenchMarkByName("SIMON-64/CTR", 12, "SIMON-64(96)/CTR (96-bit key)"); + BenchMarkByName("SIMON-64/CTR", 16, "SIMON-64(128)/CTR (128-bit key)"); + BenchMarkByName("SIMON-128/CTR", 16, "SIMON-128(128)/CTR (128-bit key)"); + BenchMarkByName("SIMON-128/CTR", 24, "SIMON-128(192)/CTR (192-bit key)"); + BenchMarkByName("SIMON-128/CTR", 32, "SIMON-128(256)/CTR (256-bit key)"); + + BenchMarkByName("SPECK-64/CTR", 12, "SPECK-64(96)/CTR (96-bit key)"); + BenchMarkByName("SPECK-64/CTR", 16, "SPECK-64(128)/CTR (128-bit key)"); + BenchMarkByName("SPECK-128/CTR", 16, "SPECK-128(128)/CTR (128-bit key)"); + BenchMarkByName("SPECK-128/CTR", 24, "SPECK-128(192)/CTR (192-bit key)"); + BenchMarkByName("SPECK-128/CTR", 32, "SPECK-128(256)/CTR (256-bit key)"); } std::cout << "\n"; diff --git a/cryptest.nmake b/cryptest.nmake index 7f1077d7..f329d3fd 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -47,9 +47,9 @@ # If you use 'make sources' from Linux makefile, then add 'winpipes.cpp' to the list below. 
-LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp +LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp simon.cpp simon-simd.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp speck.cpp speck-simd.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp -LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj 
dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj +LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj simon.obj simon-simd.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj speck.obj speck-simd.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj TEST_SRCS = bench1.cpp bench2.cpp test.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp validat4.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp fipstest.cpp diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index 9bc4a9a3..0d4ab1cd 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -288,12 +288,16 @@ + + + + @@ -474,6 +478,7 @@ + @@ -481,6 +486,7 @@ + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index c43131e7..04dec87d 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -359,6 +359,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -371,6 +377,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -816,6 +828,9 @@ Header Files + + Header Files + 
Header Files
@@ -834,6 +849,9 @@
Header Files
+
+ Header Files
+
Header Files
diff --git a/regtest2.cpp b/regtest2.cpp
index 4b3732e9..b5bf18b0 100644
--- a/regtest2.cpp
+++ b/regtest2.cpp
@@ -32,6 +32,8 @@
#include "mars.h"
#include "kalyna.h"
#include "threefish.h"
+#include "simon.h"
+#include "speck.h"
#include "sm4.h"
#include "des.h"
#include "idea.h"
@@ -161,6 +163,19 @@ void RegisterFactories2()
RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
+ RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
+
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
+ RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
diff --git a/simon-simd.cpp b/simon-simd.cpp
new file mode 100644
index 00000000..ad591321
--- /dev/null
+++ b/simon-simd.cpp
@@ -0,0 +1,1141 @@
+// simon-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+// source file is needed because additional CXXFLAGS are required to enable
+// the appropriate instruction sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "simon.h"
+#include "misc.h"
+#include "adv-simd.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both simon.cpp and simon-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <pmmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+#if (CRYPTOPP_SSE41_AVAILABLE)
+# include <smmintrin.h>
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__)
+# define CRYPTOPP_AVX512_ROTATE 1
+# include <immintrin.h>
+#endif
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include <arm_neon.h>
+#endif
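One preamble subtlety: when this TU is itself compiled with AVX-512VL, CRYPTOPP_AVX512_ROTATE is defined and the 64-bit vector rotates below become single _mm_rol_epi64/_mm_ror_epi64 instructions; otherwise they are emulated with two shifts and an OR. The two forms side by side (an illustrative re-statement of the RotateLeft64 templates later in this file):

    #include <emmintrin.h>       // SSE2 shifts and OR
    #if defined(__AVX512F__) && defined(__AVX512VL__)
    # include <immintrin.h>      // _mm_rol_epi64
    #endif

    template <unsigned int R>
    inline __m128i RotL64(const __m128i v)
    {
    #if defined(__AVX512F__) && defined(__AVX512VL__)
        return _mm_rol_epi64(v, R);                      // one instruction
    #else
        return _mm_or_si128(_mm_slli_epi64(v, R),        // two shifts plus
                            _mm_srli_epi64(v, 64 - R));  // an OR, portable SSE2
    #endif
    }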
+// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
+// compilers don't follow ACLE conventions for the include.
+#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
+# include <stdint.h>
+# include <arm_acle.h>
+#endif
+
+// https://www.spinics.net/lists/gcchelp/msg47735.html and
+// https://www.spinics.net/lists/gcchelp/msg47749.html
+#if (CRYPTOPP_GCC_VERSION >= 40900)
+# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
+#else
+# define GCC_NO_UBSAN
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::byte;
+using CryptoPP::word32;
+using CryptoPP::word64;
+using CryptoPP::rotlFixed;
+using CryptoPP::rotrFixed;
+using CryptoPP::vec_swap; // SunCC
+
+// *************************** ARM NEON ************************** //
+
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+
+template <unsigned int R>
+inline uint32x4_t RotateLeft32(const uint32x4_t& val)
+{
+    const uint32x4_t a(vshlq_n_u32(val, R));
+    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
+    return vorrq_u32(a, b);
+}
+
+template <unsigned int R>
+inline uint32x4_t RotateRight32(const uint32x4_t& val)
+{
+    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
+    const uint32x4_t b(vshrq_n_u32(val, R));
+    return vorrq_u32(a, b);
+}
+
+#if defined(__aarch32__) || defined(__aarch64__)
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
+{
+#if defined(CRYPTOPP_BIG_ENDIAN)
+    const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+#else
+    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+#endif
+
+    return vreinterpretq_u32_u8(
+        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
+{
+#if defined(CRYPTOPP_BIG_ENDIAN)
+    const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+#else
+    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+#endif
+
+    return vreinterpretq_u32_u8(
+        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
+}
+#endif
+
+inline uint32x4_t Shuffle32(const uint32x4_t& val)
+{
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+    return vreinterpretq_u32_u8(
+        vrev32q_u8(vreinterpretq_u8_u32(val)));
+#else
+    return val;
+#endif
+}
+
+inline uint32x4_t SIMON64_f(const uint32x4_t& val)
+{
+    return veorq_u32(RotateLeft32<2>(val),
+        vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
+}
+
+inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
+    const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. The incoming data was read from
+    // a big-endian byte array. Depending on the number of blocks it needs to
+    // be permuted to the following. If only a single block is available then
+    // a Zero block is provided to promote vectorizations.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+ uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i); + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1); + + const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1); + x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2); + } + + if (rounds & 1) + { + const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1); + + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk); + std::swap(x1, y1); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; +} + +inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + if (rounds & 1) + { + std::swap(x1, y1); + const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1); + + y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1); + x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1); + + const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i); + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; +} + +inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
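+    // Three (x,y) register pairs advance in lockstep below. Each round's
+    // shift/AND/XOR chain is serially dependent within a single pair, so
+    // interleaving independent pairs gives an out-of-order core parallel
+    // work to hide that latency; the broadcast subkeys rk1/rk2 are shared
+    // across all three pairs.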
+ uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + uint32x4_t x2 = vuzpq_u32(block2, block3).val[0]; + uint32x4_t y2 = vuzpq_u32(block2, block3).val[1]; + uint32x4_t x3 = vuzpq_u32(block4, block5).val[0]; + uint32x4_t y3 = vuzpq_u32(block4, block5).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + for (int i = 0; i < static_cast(rounds & ~1) - 1; i += 2) + { + const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i); + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1); + y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1); + y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1); + + const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1); + x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2); + x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2); + x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2); + } + + if (rounds & 1) + { + const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1); + + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk); + y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk); + y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk); + std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; + block2 = vzipq_u32(x2, y2).val[0]; + block3 = vzipq_u32(x2, y2).val[1]; + block4 = vzipq_u32(x3, y3).val[0]; + block5 = vzipq_u32(x3, y3).val[1]; +} + +inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
+ uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + uint32x4_t x2 = vuzpq_u32(block2, block3).val[0]; + uint32x4_t y2 = vuzpq_u32(block2, block3).val[1]; + uint32x4_t x3 = vuzpq_u32(block4, block5).val[0]; + uint32x4_t y3 = vuzpq_u32(block4, block5).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + if (rounds & 1) + { + std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1); + + y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1)); + y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2)); + y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1); + x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1); + x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1); + x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1); + + const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i); + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2); + y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2); + y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; + block2 = vzipq_u32(x2, y2).val[0]; + block3 = vzipq_u32(x2, y2).val[1]; + block4 = vzipq_u32(x3, y3).val[0]; + block5 = vzipq_u32(x3, y3).val[1]; +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) + +template +inline T UnpackHigh64(const T& a, const T& b) +{ + const uint64x1_t x(vget_high_u64((uint64x2_t)a)); + const uint64x1_t y(vget_high_u64((uint64x2_t)b)); + return (T)vcombine_u64(x, y); +} + +template +inline T UnpackLow64(const T& a, const T& b) +{ + const uint64x1_t x(vget_low_u64((uint64x2_t)a)); + const uint64x1_t y(vget_low_u64((uint64x2_t)b)); + return (T)vcombine_u64(x, y); +} + +template +inline uint64x2_t RotateLeft64(const uint64x2_t& val) +{ + const uint64x2_t a(vshlq_n_u64(val, R)); + const uint64x2_t b(vshrq_n_u64(val, 64 - R)); + return vorrq_u64(a, b); +} + +template +inline uint64x2_t RotateRight64(const uint64x2_t& val) +{ + const uint64x2_t a(vshlq_n_u64(val, 64 - R)); + const uint64x2_t b(vshrq_n_u64(val, R)); + return vorrq_u64(a, b); +} + +#if defined(__aarch32__) || defined(__aarch64__) +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u64_u8( + vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. 
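+// (Why the table lookup works: with little-endian lanes, rotating a 64-bit
+// lane left by 8 bits moves source byte i to position (i+1) mod 8, so the
+// result's byte i must fetch source byte (i-1) mod 8 -- exactly the
+// { 7,0,1,2, 3,4,5,6, ... } mask above. The RotateRight64<8> mask below is
+// the inverse permutation, { 1,2,3,4, 5,6,7,0, ... }, and one vqtbl1q_u8
+// replaces the vshlq/vshrq/vorrq triple.)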
+template <> +inline uint64x2_t RotateRight64<8>(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u64_u8( + vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); +} +#endif + +inline uint64x2_t Shuffle64(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_LITTLE_ENDIAN) + return vreinterpretq_u64_u8( + vrev64q_u8(vreinterpretq_u8_u64(val))); +#else + return val; +#endif +} + +inline uint64x2_t SIMON128_f(const uint64x2_t& val) +{ + return veorq_u64(RotateLeft64<2>(val), + vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val))); +} + +inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i); + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1); + + const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1); + x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2); + } + + if (rounds & 1) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1); + + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk); + std::swap(x1, y1); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); +} + +inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, + uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
+ uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + uint64x2_t x2 = UnpackLow64(block2, block3); + uint64x2_t y2 = UnpackHigh64(block2, block3); + uint64x2_t x3 = UnpackLow64(block4, block5); + uint64x2_t y3 = UnpackHigh64(block4, block5); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + for (int i = 0; i < static_cast(rounds & ~1) - 1; i += 2) + { + const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i); + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1); + y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1); + y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1); + + const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1); + x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2); + x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2); + x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2); + } + + if (rounds & 1) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); + + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk); + y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk); + y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk); + std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); + block2 = UnpackLow64(x2, y2); + block3 = UnpackHigh64(x2, y2); + block4 = UnpackLow64(x3, y3); + block5 = UnpackHigh64(x3, y3); +} + +inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + if (rounds & 1) + { + std::swap(x1, y1); + const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); + + y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1); + x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1); + + const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i); + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); +} + +inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, + uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
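+    // Decryption walks the schedule backwards: an odd trailing round is
+    // undone first (note the swap happens before the XORs here, where
+    // encryption swapped after), then the main loop consumes subkeys in
+    // reverse pairs, k[i+1] inverting the second half-round before k[i]
+    // inverts the first.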
+ uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + uint64x2_t x2 = UnpackLow64(block2, block3); + uint64x2_t y2 = UnpackHigh64(block2, block3); + uint64x2_t x3 = UnpackLow64(block4, block5); + uint64x2_t y3 = UnpackHigh64(block4, block5); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + if (rounds & 1) + { + std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); + + y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1)); + y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2)); + y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1); + x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1); + x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1); + x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1); + + const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i); + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2); + y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2); + y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); + block2 = UnpackLow64(x2, y2); + block3 = UnpackHigh64(x2, y2); + block4 = UnpackLow64(x3, y3); + block5 = UnpackHigh64(x3, y3); +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +// ***************************** IA-32 ***************************** // + +#if defined(CRYPTOPP_SSSE3_AVAILABLE) + +// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670 +#ifndef M128_CAST +# define M128_CAST(x) ((__m128i *)(void *)(x)) +#endif +#ifndef CONST_M128_CAST +# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) +#endif + +// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html +#ifndef DOUBLE_CAST +# define DOUBLE_CAST(x) ((double *)(void *)(x)) +#endif +#ifndef CONST_DOUBLE_CAST +# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x)) +#endif + +inline void Swap128(__m128i& a,__m128i& b) +{ +#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) + // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11. + // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11. + vec_swap(a, b); +#else + std::swap(a, b); +#endif +} + +#if defined(CRYPTOPP_AVX512_ROTATE) +template +inline __m128i RotateLeft64(const __m128i& val) +{ + return _mm_rol_epi64(val, R); +} + +template +inline __m128i RotateRight64(const __m128i& val) +{ + return _mm_ror_epi64(val, R); +} +#else +template +inline __m128i RotateLeft64(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); +} + +template +inline __m128i RotateRight64(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateLeft64<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. 
+template <> +inline __m128i RotateRight64<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); + return _mm_shuffle_epi8(val, mask); +} +#endif // CRYPTOPP_AVX512_ROTATE + +inline __m128i SIMON128_f(const __m128i& v) +{ + return _mm_xor_si128(RotateLeft64<2>(v), + _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v))); +} + +inline void GCC_NO_UBSAN SIMON128_Enc_Block(__m128i &block0, __m128i &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const __m128i rk1 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1); + + const __m128i rk2 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1))); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2); + } + + if (rounds & 1) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1))); + + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk); + Swap128(x1, y1); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); +} + +inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + __m128i x2 = _mm_unpackhi_epi64(block2, block3); + __m128i y2 = _mm_unpacklo_epi64(block2, block3); + __m128i x3 = _mm_unpackhi_epi64(block4, block5); + __m128i y3 = _mm_unpacklo_epi64(block4, block5); + + for (int i = 0; i < static_cast(rounds & ~1) - 1; i += 2) + { + const __m128i rk1 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1); + + const __m128i rk2 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1))); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2); + x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2); + } + + if (rounds & 1) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk); + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
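+    // (On the rk1/rk2 broadcasts above: _mm_loaddup_pd is a single movddup
+    // load that duplicates the 64-bit subkey into both lanes, and the
+    // _mm_castpd_si128 is free. The more direct _mm_set1_epi64x was long
+    // unavailable in 32-bit MSVC builds, hence the detour through the
+    // double domain.)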
+ block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); + block2 = _mm_unpacklo_epi64(y2, x2); + block3 = _mm_unpackhi_epi64(y2, x2); + block4 = _mm_unpacklo_epi64(y3, x3); + block5 = _mm_unpackhi_epi64(y3, x3); +} + +inline void GCC_NO_UBSAN SIMON128_Dec_Block(__m128i &block0, __m128i &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + + if (rounds & 1) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1))); + + Swap128(x1, y1); + y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const __m128i rk1 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1))); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1); + + const __m128i rk2 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); +} + +inline void GCC_NO_UBSAN SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + __m128i x2 = _mm_unpackhi_epi64(block2, block3); + __m128i y2 = _mm_unpacklo_epi64(block2, block3); + __m128i x3 = _mm_unpackhi_epi64(block4, block5); + __m128i y3 = _mm_unpacklo_epi64(block4, block5); + + if (rounds & 1) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1))); + + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); + y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1)); + y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2)); + y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const __m128i rk1 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1))); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1); + x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1); + + const __m128i rk2 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
+ block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); + block2 = _mm_unpacklo_epi64(y2, x2); + block3 = _mm_unpackhi_epi64(y2, x2); + block4 = _mm_unpacklo_epi64(y3, x3); + block5 = _mm_unpackhi_epi64(y3, x3); +} + +#endif // CRYPTOPP_SSSE3_AVAILABLE + +#if defined(CRYPTOPP_SSE41_AVAILABLE) + +template +inline __m128i RotateLeft32(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); +} + +template +inline __m128i RotateRight32(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateLeft32<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateRight32<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); + return _mm_shuffle_epi8(val, mask); +} + +inline __m128i SIMON64_f(const __m128i& v) +{ + return _mm_xor_si128(RotateLeft32<2>(v), + _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v))); +} + +inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const __m128i rk1 = _mm_set1_epi32(subkeys[i]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1); + + const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2); + } + + if (rounds & 1) + { + const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk); + Swap128(x1, y1); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); +} + +inline void GCC_NO_UBSAN SIMON64_Dec_Block(__m128i &block0, __m128i &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
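+    // _mm_shuffle_ps does the deinterleave: it is the only flexible
+    // two-input 32-bit shuffle below AVX, so the blocks are bit-cast to
+    // float, _MM_SHUFFLE(3,1,3,1) gathers the odd-indexed words of both
+    // inputs and _MM_SHUFFLE(2,0,2,0) the even-indexed ones. No arithmetic
+    // happens in the float domain, so the round-trip through __m128 is safe.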
+ const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + if (rounds & 1) + { + Swap128(x1, y1); + const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1); + + const __m128i rk2 = _mm_set1_epi32(subkeys[i]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); +} + +inline void GCC_NO_UBSAN SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t2 = _mm_castsi128_ps(block2); + const __m128 t3 = _mm_castsi128_ps(block3); + __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const __m128i rk1 = _mm_set1_epi32(subkeys[i]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1); + + const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2); + x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2); + } + + if (rounds & 1) + { + const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk); + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); + block2 = _mm_unpacklo_epi32(y2, x2); + block3 = _mm_unpackhi_epi32(y2, x2); + block4 = _mm_unpacklo_epi32(y3, x3); + block5 = _mm_unpackhi_epi32(y3, x3); +} + +inline void GCC_NO_UBSAN SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + 
const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t2 = _mm_castsi128_ps(block2); + const __m128 t3 = _mm_castsi128_ps(block3); + __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + + if (rounds & 1) + { + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); + const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1)); + y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2)); + y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1); + x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1); + + const __m128i rk2 = _mm_set1_epi32(subkeys[i]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); + block2 = _mm_unpacklo_epi32(y2, x2); + block3 = _mm_unpackhi_epi32(y2, x2); + block4 = _mm_unpacklo_epi32(y3, x3); + block5 = _mm_unpackhi_epi32(y3, x3); +} + +#endif // CRYPTOPP_SSE41_AVAILABLE + +ANONYMOUS_NAMESPACE_END + +/////////////////////////////////////////////////////////////////////// + +NAMESPACE_BEGIN(CryptoPP) + +// *************************** ARM NEON **************************** // + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + 
+    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
+// ***************************** IA-32 ***************************** //
+
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+NAMESPACE_END
diff --git a/simon.cpp b/simon.cpp
new file mode 100644
index 00000000..438e9197
--- /dev/null
+++ b/simon.cpp
@@ -0,0 +1,463 @@
+// simon.cpp - written and placed in the public domain by Jeffrey Walton
+
+#include "pch.h"
+#include "config.h"
+
+#include "simon.h"
+#include "misc.h"
+#include "cpu.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both simon.cpp and simon-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::word32;
+using CryptoPP::word64;
+using CryptoPP::rotlConstant;
+using CryptoPP::rotrConstant;
+
+/// \brief Round transformation helper
+/// \tparam W word type
+/// \param v value
+template <class W>
+inline W f(const W v)
+{
+    return (rotlConstant<1>(v) & rotlConstant<8>(v)) ^ rotlConstant<2>(v);
+}
+
+/// \brief Round transformation
+/// \tparam W word type
+/// \param x value
+/// \param y value
+/// \param k value
+/// \param l value
+template <class W>
+inline void R2(W& x, W& y, const W k, const W l)
+{
+    y ^= f(x); y ^= k;
+    x ^= f(y); x ^= l;
+}
+
+/// \brief Forward transformation
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param c output array
+/// \param p input array
+/// \param k subkey array
+template <class W, unsigned int R>
+inline void SIMON_Encrypt(W c[2], const W p[2], const W k[R])
+{
+    c[0]=p[0]; c[1]=p[1];
+
+    for (int i = 0; i < static_cast<int>(R-1); i += 2)
+        R2(c[0], c[1], k[i], k[i + 1]);
+
+    if (R & 1)
+    {
+        c[1] ^= f(c[0]); c[1] ^= k[R-1];
+        W t = c[0]; c[0] = c[1]; c[1] = t;
+    }
+}
+
+/// \brief Reverse transformation
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param p output array
+/// \param c input array
+/// \param k subkey array
+template <class W, unsigned int R>
+inline void SIMON_Decrypt(W p[2], const W c[2], const W k[R])
+{
+    p[0]=c[0]; p[1]=c[1];
+    unsigned int rounds = R;
+
+    if (R & 1)
+    {
+        const W t = p[1]; p[1] = p[0]; p[0] = t;
+        p[1] ^= k[R - 1]; p[1] ^= f(p[0]);
+        rounds--;
+    }
+
+    for (int i = static_cast<int>(rounds - 2); i >= 0; i -= 2)
+        R2(p[1], p[0], k[i + 1], k[i]);
+}
+
+/// \brief Subkey generation function
+/// \details Used for SIMON-64 with 96-bit key and 42 rounds. A template was
+/// not worthwhile because all instantiations would need specialization.
+/// \param key empty subkey array
+/// \param k user key array
+inline void SIMON64_ExpandKey_3W(word32 key[42], const word32 k[3])
+{
+    const word32 c = 0xfffffffc;
+    word64 z = W64LIT(0x7369f885192c0ef5);
+
+    key[0] = k[2]; key[1] = k[1]; key[2] = k[0];
+    for (size_t i = 3; i<42; ++i)
+    {
+        key[i] = c ^ (z & 1) ^ key[i - 3] ^ rotrConstant<3>(key[i - 1]) ^ rotrConstant<4>(key[i - 1]);
+        z >>= 1;
+    }
+}
+
+/// \brief Subkey generation function
+/// \details Used for SIMON-64 with 128-bit key and 44 rounds. A template was
+/// not worthwhile because all instantiations would need specialization.
+/// \param key empty subkey array
+/// \param k user key array
+inline void SIMON64_ExpandKey_4W(word32 key[44], const word32 k[4])
+{
+    const word32 c = 0xfffffffc;
+    word64 z = W64LIT(0xfc2ce51207a635db);
+
+    key[0] = k[3]; key[1] = k[2]; key[2] = k[1]; key[3] = k[0];
+    for (size_t i = 4; i<44; ++i)
+    {
+        key[i] = c ^ (z & 1) ^ key[i - 4] ^ rotrConstant<3>(key[i - 1]) ^ key[i - 3] ^ rotrConstant<4>(key[i - 1]) ^ rotrConstant<1>(key[i - 3]);
+        z >>= 1;
+    }
+}
+
+/// \brief Subkey generation function
+/// \details Used for SIMON-128 with 128-bit key and 68 rounds. A template was
+/// not worthwhile because all instantiations would need specialization.
+/// \param key empty subkey array +/// \param k user key array +inline void SIMON128_ExpandKey_2W(word64 key[68], const word64 k[2]) +{ + const word64 c = W64LIT(0xfffffffffffffffc); + word64 z = W64LIT(0x7369f885192c0ef5); + + key[0] = k[1]; key[1] = k[0]; + for (size_t i=2; i<66; ++i) + { + key[i] = c ^ (z & 1) ^ key[i - 2] ^ rotrConstant<3>(key[i - 1]) ^ rotrConstant<4>(key[i - 1]); + z>>=1; + } + + key[66] = c ^ 1 ^ key[64] ^ rotrConstant<3>(key[65]) ^ rotrConstant<4>(key[65]); + key[67] = c^key[65] ^ rotrConstant<3>(key[66]) ^ rotrConstant<4>(key[66]); +} + +/// \brief Subkey generation function +/// \details Used for SIMON-128 with 192-bit key and 69 rounds. A template was +/// not worthwhile because all instantiations would need specialization. +/// \param key empty subkey array +/// \param k user key array +inline void SIMON128_ExpandKey_3W(word64 key[69], const word64 k[3]) +{ + const word64 c = W64LIT(0xfffffffffffffffc); + word64 z = W64LIT(0xfc2ce51207a635db); + + key[0]=k[2]; key[1]=k[1]; key[2]=k[0]; + for (size_t i=3; i<67; ++i) + { + key[i] = c ^ (z & 1) ^ key[i - 3] ^ rotrConstant<3>(key[i - 1]) ^ rotrConstant<4>(key[i - 1]); + z>>=1; + } + + key[67] = c^key[64] ^ rotrConstant<3>(key[66]) ^ rotrConstant<4>(key[66]); + key[68] = c ^ 1 ^ key[65] ^ rotrConstant<3>(key[67]) ^ rotrConstant<4>(key[67]); +} + +/// \brief Subkey generation function +/// \details Used for SIMON-128 with 256-bit key and 72 rounds. A template was +/// not worthwhile because all instantiations would need specialization. +/// \param key empty subkey array +/// \param k user key array +inline void SIMON128_ExpandKey_4W(word64 key[72], const word64 k[4]) +{ + const word64 c = W64LIT(0xfffffffffffffffc); + word64 z = W64LIT(0xfdc94c3a046d678b); + + key[0]=k[3]; key[1]=k[2]; key[2]=k[1]; key[3]=k[0]; + for (size_t i=4; i<68; ++i) + { + key[i] = c ^ (z & 1) ^ key[i - 4] ^ rotrConstant<3>(key[i - 1]) ^ key[i - 3] ^ rotrConstant<4>(key[i - 1]) ^ rotrConstant<1>(key[i - 3]); + z>>=1; + } + + key[68] = c^key[64] ^ rotrConstant<3>(key[67]) ^ key[65] ^ rotrConstant<4>(key[67]) ^ rotrConstant<1>(key[65]); + key[69] = c ^ 1 ^ key[65] ^ rotrConstant<3>(key[68]) ^ key[66] ^ rotrConstant<4>(key[68]) ^ rotrConstant<1>(key[66]); + key[70] = c^key[66] ^ rotrConstant<3>(key[69]) ^ key[67] ^ rotrConstant<4>(key[69]) ^ rotrConstant<1>(key[67]); + key[71] = c^key[67] ^ rotrConstant<3>(key[70]) ^ key[68] ^ rotrConstant<4>(key[70]) ^ rotrConstant<1>(key[68]); +} + +ANONYMOUS_NAMESPACE_END + +/////////////////////////////////////////////////////////// + +NAMESPACE_BEGIN(CryptoPP) + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +extern size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); + +extern size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +extern size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); + +extern size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + +#if defined(CRYPTOPP_SSE41_AVAILABLE) +extern size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* 
subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+extern size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
+{
+    CRYPTOPP_ASSERT(keyLength == 12 || keyLength == 16);
+    CRYPTOPP_UNUSED(params);
+
+    // Building the key schedule table requires {3,4} words workspace.
+    // Encrypting and decrypting requires 4 words workspace.
+    m_kwords = keyLength/sizeof(word32);
+    m_wspace.New(4U);
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian, false> KeyBlock;
+    KeyBlock kblk(userKey);
+
+    switch (m_kwords)
+    {
+    case 3:
+        m_rkeys.New((m_rounds = 42));
+        kblk(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SIMON64_ExpandKey_3W(m_rkeys, m_wspace);
+        break;
+    case 4:
+        m_rkeys.New((m_rounds = 44));
+        kblk(m_wspace[3])(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SIMON64_ExpandKey_4W(m_rkeys, m_wspace);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+}
+
+void SIMON64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian, false> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 42:
+        SIMON_Encrypt<word32, 42>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 44:
+        SIMON_Encrypt<word32, 44>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word32, BigEndian, false> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+void SIMON64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian, false> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 42:
+        SIMON_Decrypt<word32, 42>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 44:
+        SIMON_Decrypt<word32, 44>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word32, BigEndian, false> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+///////////////////////////////////////////////////////////
+
+void SIMON128::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
+{
+    CRYPTOPP_ASSERT(keyLength == 16 || keyLength == 24 || keyLength == 32);
+    CRYPTOPP_UNUSED(params);
+
+    // Building the key schedule table requires {2,3,4} words workspace.
+    // Encrypting and decrypting requires 4 words workspace.
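+    // For example, SIMON-128(192) arrives with keyLength=24 so m_kwords=3;
+    // the key words land reversed in m_wspace[0..2] per the paper's ordering,
+    // and SIMON128_ExpandKey_3W() below then fills m_rkeys with the 69 round
+    // keys.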
+    m_kwords = keyLength/sizeof(word64);
+    m_wspace.New(4U);
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian, false> KeyBlock;
+    KeyBlock kblk(userKey);
+
+    switch (m_kwords)
+    {
+    case 2:
+        m_rkeys.New((m_rounds = 68));
+        kblk(m_wspace[1])(m_wspace[0]);
+        SIMON128_ExpandKey_2W(m_rkeys, m_wspace);
+        break;
+    case 3:
+        m_rkeys.New((m_rounds = 69));
+        kblk(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SIMON128_ExpandKey_3W(m_rkeys, m_wspace);
+        break;
+    case 4:
+        m_rkeys.New((m_rounds = 72));
+        kblk(m_wspace[3])(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SIMON128_ExpandKey_4W(m_rkeys, m_wspace);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+}
+
+void SIMON128::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian, false> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 68:
+        SIMON_Encrypt<word64, 68>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 69:
+        SIMON_Encrypt<word64, 69>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 72:
+        SIMON_Encrypt<word64, 72>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word64, BigEndian, false> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+void SIMON128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian, false> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 68:
+        SIMON_Decrypt<word64, 68>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 69:
+        SIMON_Decrypt<word64, 69>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 72:
+        SIMON_Decrypt<word64, 72>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word64, BigEndian, false> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+#if defined(CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
+size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SIMON64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SIMON64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
+
+#if defined(CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS)
+size_t SIMON128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+    if (HasSSSE3())
+        return SIMON128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+    if (HasSSSE3())
+        return SIMON128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON128_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
+
+NAMESPACE_END
diff --git a/simon.h b/simon.h
new file mode 100644
index 00000000..19f52e3c
--- /dev/null
+++ b/simon.h
@@ -0,0 +1,180 @@
+// simon.h - written and placed in the public domain by Jeffrey Walton
+
+/// \file simon.h
+/// \brief Classes for the Simon block cipher
+/// \details Simon is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \sa The SIMON and SPECK Families of
+/// Lightweight Block Ciphers,
+/// The Simon and Speck GitHub and
+/// SIMON on the Crypto++ wiki.
+/// \since Crypto++ 6.0
+
+#ifndef CRYPTOPP_SIMON_H
+#define CRYPTOPP_SIMON_H
+
+#include "config.h"
+#include "seckey.h"
+#include "secblock.h"
+
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+# define CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+# define CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
+NAMESPACE_BEGIN(CryptoPP)
+
+/// \brief SIMON block cipher information
+/// \tparam L block size of the cipher, in bytes
+/// \tparam D default key length, in bytes
+/// \tparam N minimum key length, in bytes
+/// \tparam M maximum key length, in bytes
+/// \since Crypto++ 6.0
+template <unsigned int L, unsigned int D, unsigned int N, unsigned int M>
+struct SIMON_Info : public FixedBlockSize<L>, VariableKeyLength<D, N, M>
+{
+    static const std::string StaticAlgorithmName()
+    {
+        // Format is Cipher-Blocksize(Keylength)
+        return "SIMON-" + IntToString(L*8);
+    }
+};
+
+/// \brief SIMON block cipher base class
+/// \tparam W the word type
+/// \details User code should use SIMON64 or SIMON128
+/// \sa SIMON64, SIMON128, SIMON on the Crypto++ wiki
+/// \since Crypto++ 6.0
+template <class W>
+struct SIMON_Base
+{
+    virtual ~SIMON_Base() {}
+    SIMON_Base() : m_kwords(0), m_rounds(0) {}
+
+    typedef SecBlock<W, AllocatorWithCleanup<W, true> > AlignedSecBlock;
+    mutable AlignedSecBlock m_wspace;  // workspace
+    AlignedSecBlock m_rkeys;           // round keys
+    unsigned int m_kwords;             // number of key words
+    unsigned int m_rounds;             // number of rounds
+};
+
+/// \brief SIMON 64-bit block cipher
+/// \details Simon is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \details SIMON64 provides 64-bit block size.
+/// The valid key sizes are 96-bit and 128-bit.
+/// \sa SIMON64, SIMON128, The SIMON and SPECK
+/// Families of Lightweight Block Ciphers,
+/// The Simon and Speck GitHub, SIMON on the
+/// Crypto++ wiki
+/// \since Crypto++ 6.0
+class CRYPTOPP_NO_VTABLE SIMON64 : public SIMON_Info<8, 12, 12, 16>, public BlockCipherDocumentation
+{
+public:
+    /// \brief SIMON block cipher transformation functions
+    /// \details Provides implementation common to encryption and decryption
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Base : protected SIMON_Base<word32>, public BlockCipherImpl<SIMON_Info<8, 12, 12, 16> >
+    {
+    public:
+        std::string AlgorithmName() const {
+            return StaticAlgorithmName() + (m_kwords == 0 ? "" :
+                "(" + IntToString(m_kwords*sizeof(word32)*8) + ")");
+        }
+
+    protected:
+        void UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params);
+    };
+
+    /// \brief Provides implementation for encryption transformation
+    /// \details Enc provides implementation for encryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Enc : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    /// \brief Provides implementation for decryption transformation
+    /// \details Dec provides implementation for decryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Dec : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
+    typedef BlockCipherFinal<DECRYPTION, Dec> Decryption;
+};
+
+/// \brief SIMON 128-bit block cipher
+/// \details Simon is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \details SIMON128 provides 128-bit block size. The valid key sizes are 128-bit, 192-bit and 256-bit.
+/// \sa SIMON64, SIMON128, The SIMON and SPECK
+/// Families of Lightweight Block Ciphers,
+/// The Simon and Speck GitHub, SIMON on the
+/// Crypto++ wiki
+/// \since Crypto++ 6.0
+class CRYPTOPP_NO_VTABLE SIMON128 : public SIMON_Info<16, 16, 16, 32>, public BlockCipherDocumentation
+{
+public:
+    /// \brief SIMON block cipher transformation functions
+    /// \details Provides implementation common to encryption and decryption
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Base : protected SIMON_Base<word64>, public BlockCipherImpl<SIMON_Info<16, 16, 16, 32> >
+    {
+    public:
+        std::string AlgorithmName() const {
+            return StaticAlgorithmName() + (m_kwords == 0 ? "" :
+                "(" + IntToString(m_kwords*sizeof(word64)*8) + ")");
+        }
+
+    protected:
+        void UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params);
+    };
+
+    /// \brief Provides implementation for encryption transformation
+    /// \details Enc provides implementation for encryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Enc : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    /// \brief Provides implementation for decryption transformation
+    /// \details Dec provides implementation for decryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Dec : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
+    typedef BlockCipherFinal<DECRYPTION, Dec> Decryption;
+};
+
+NAMESPACE_END
+
+#endif // CRYPTOPP_SIMON_H
diff --git a/speck-simd.cpp b/speck-simd.cpp
new file mode 100644
index 00000000..e60ad2f8
--- /dev/null
+++ b/speck-simd.cpp
@@ -0,0 +1,1031 @@
+// speck-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+// source file is needed because additional CXXFLAGS are required to enable
+// the appropriate instruction sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "speck.h"
+#include "misc.h"
+#include "adv-simd.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both speck.cpp and speck-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <pmmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+#if (CRYPTOPP_SSE41_AVAILABLE)
+# include <smmintrin.h>
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__)
+# define CRYPTOPP_AVX512_ROTATE 1
+# include <immintrin.h>
+#endif
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include <arm_neon.h>
+#endif
+
+// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
+// compilers don't follow ACLE conventions for the include.
+#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
+# include <stdint.h>
+# include <arm_acle.h>
+#endif
+
+// https://www.spinics.net/lists/gcchelp/msg47735.html and
+// https://www.spinics.net/lists/gcchelp/msg47749.html
+#if (CRYPTOPP_GCC_VERSION >= 40900)
+# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
+#else
+# define GCC_NO_UBSAN
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::byte;
+using CryptoPP::word32;
+using CryptoPP::word64;
+
+// *************************** ARM NEON ************************** //
+
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+
+template <unsigned int R>
+inline uint32x4_t RotateLeft32(const uint32x4_t& val)
+{
+    const uint32x4_t a(vshlq_n_u32(val, R));
+    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
+    return vorrq_u32(a, b);
+}
+
+template <unsigned int R>
+inline uint32x4_t RotateRight32(const uint32x4_t& val)
+{
+    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
+    const uint32x4_t b(vshrq_n_u32(val, R));
+    return vorrq_u32(a, b);
+}
+
+#if defined(__aarch32__) || defined(__aarch64__)
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <> +inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u32_u8( + vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline uint32x4_t RotateRight32<8>(const uint32x4_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u32_u8( + vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); +} +#endif // Aarch32 or Aarch64 + +inline uint32x4_t Shuffle32(const uint32x4_t& val) +{ +#if defined(CRYPTOPP_LITTLE_ENDIAN) + return vreinterpretq_u32_u8( + vrev32q_u8(vreinterpretq_u8_u32(val))); +#else + return val; +#endif +} + +inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + for (int i=0; i < static_cast(rounds); ++i) + { + const uint32x4_t rk = vdupq_n_u32(subkeys[i]); + + x1 = RotateRight32<8>(x1); + x1 = vaddq_u32(x1, y1); + x1 = veorq_u32(x1, rk); + y1 = RotateLeft32<3>(y1); + y1 = veorq_u32(y1, x1); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; +} + +inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const uint32x4_t rk = vdupq_n_u32(subkeys[i]); + + y1 = veorq_u32(y1, x1); + y1 = RotateRight32<3>(y1); + x1 = veorq_u32(x1, rk); + x1 = vsubq_u32(x1, y1); + x1 = RotateLeft32<8>(x1); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; +} + +inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. 
If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + uint32x4_t x2 = vuzpq_u32(block2, block3).val[0]; + uint32x4_t y2 = vuzpq_u32(block2, block3).val[1]; + uint32x4_t x3 = vuzpq_u32(block4, block5).val[0]; + uint32x4_t y3 = vuzpq_u32(block4, block5).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + for (int i=0; i < static_cast(rounds); ++i) + { + const uint32x4_t rk = vdupq_n_u32(subkeys[i]); + + x1 = RotateRight32<8>(x1); + x2 = RotateRight32<8>(x2); + x3 = RotateRight32<8>(x3); + x1 = vaddq_u32(x1, y1); + x2 = vaddq_u32(x2, y2); + x3 = vaddq_u32(x3, y3); + x1 = veorq_u32(x1, rk); + x2 = veorq_u32(x2, rk); + x3 = veorq_u32(x3, rk); + y1 = RotateLeft32<3>(y1); + y2 = RotateLeft32<3>(y2); + y3 = RotateLeft32<3>(y3); + y1 = veorq_u32(y1, x1); + y2 = veorq_u32(y2, x2); + y3 = veorq_u32(y3, x3); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; + block2 = vzipq_u32(x2, y2).val[0]; + block3 = vzipq_u32(x2, y2).val[1]; + block4 = vzipq_u32(x3, y3).val[0]; + block5 = vzipq_u32(x3, y3).val[1]; +} + +inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
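+    // vuzpq_u32 de-interleaves even and odd lanes; e.g.
+    // vuzpq_u32([A1 A2 A3 A4], [B1 B2 B3 B4]).val[0] = [A1 A3 B1 B3] and
+    // .val[1] = [A2 A4 B2 B4], which is exactly the x/y word split the
+    // round function expects.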
+ uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + uint32x4_t x2 = vuzpq_u32(block2, block3).val[0]; + uint32x4_t y2 = vuzpq_u32(block2, block3).val[1]; + uint32x4_t x3 = vuzpq_u32(block4, block5).val[0]; + uint32x4_t y3 = vuzpq_u32(block4, block5).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const uint32x4_t rk = vdupq_n_u32(subkeys[i]); + + y1 = veorq_u32(y1, x1); + y2 = veorq_u32(y2, x2); + y3 = veorq_u32(y3, x3); + y1 = RotateRight32<3>(y1); + y2 = RotateRight32<3>(y2); + y3 = RotateRight32<3>(y3); + x1 = veorq_u32(x1, rk); + x2 = veorq_u32(x2, rk); + x3 = veorq_u32(x3, rk); + x1 = vsubq_u32(x1, y1); + x2 = vsubq_u32(x2, y2); + x3 = vsubq_u32(x3, y3); + x1 = RotateLeft32<8>(x1); + x2 = RotateLeft32<8>(x2); + x3 = RotateLeft32<8>(x3); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; + block2 = vzipq_u32(x2, y2).val[0]; + block3 = vzipq_u32(x2, y2).val[1]; + block4 = vzipq_u32(x3, y3).val[0]; + block5 = vzipq_u32(x3, y3).val[1]; +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) + +template +inline T UnpackHigh64(const T& a, const T& b) +{ + const uint64x1_t x(vget_high_u64((uint64x2_t)a)); + const uint64x1_t y(vget_high_u64((uint64x2_t)b)); + return (T)vcombine_u64(x, y); +} + +template +inline T UnpackLow64(const T& a, const T& b) +{ + const uint64x1_t x(vget_low_u64((uint64x2_t)a)); + const uint64x1_t y(vget_low_u64((uint64x2_t)b)); + return (T)vcombine_u64(x, y); +} + +template +inline uint64x2_t RotateLeft64(const uint64x2_t& val) +{ + const uint64x2_t a(vshlq_n_u64(val, R)); + const uint64x2_t b(vshrq_n_u64(val, 64 - R)); + return vorrq_u64(a, b); +} + +template +inline uint64x2_t RotateRight64(const uint64x2_t& val) +{ + const uint64x2_t a(vshlq_n_u64(val, 64 - R)); + const uint64x2_t b(vshrq_n_u64(val, R)); + return vorrq_u64(a, b); +} + +#if defined(__aarch32__) || defined(__aarch64__) +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u64_u8( + vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. 
+template <> +inline uint64x2_t RotateRight64<8>(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u64_u8( + vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); +} +#endif + +inline uint64x2_t Shuffle64(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_LITTLE_ENDIAN) + return vreinterpretq_u64_u8( + vrev64q_u8(vreinterpretq_u8_u64(val))); +#else + return val; +#endif +} + +inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + for (int i=0; i < static_cast(rounds); ++i) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+i); + + x1 = RotateRight64<8>(x1); + x1 = vaddq_u64(x1, y1); + x1 = veorq_u64(x1, rk); + y1 = RotateLeft64<3>(y1); + y1 = veorq_u64(y1, x1); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); +} + +inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, + uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + uint64x2_t x2 = UnpackLow64(block2, block3); + uint64x2_t y2 = UnpackHigh64(block2, block3); + uint64x2_t x3 = UnpackLow64(block4, block5); + uint64x2_t y3 = UnpackHigh64(block4, block5); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + for (int i=0; i < static_cast(rounds); ++i) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+i); + + x1 = RotateRight64<8>(x1); + x2 = RotateRight64<8>(x2); + x3 = RotateRight64<8>(x3); + x1 = vaddq_u64(x1, y1); + x2 = vaddq_u64(x2, y2); + x3 = vaddq_u64(x3, y3); + x1 = veorq_u64(x1, rk); + x2 = veorq_u64(x2, rk); + x3 = veorq_u64(x3, rk); + y1 = RotateLeft64<3>(y1); + y2 = RotateLeft64<3>(y2); + y3 = RotateLeft64<3>(y3); + y1 = veorq_u64(y1, x1); + y2 = veorq_u64(y2, x2); + y3 = veorq_u64(y3, x3); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); + block2 = UnpackLow64(x2, y2); + block3 = UnpackHigh64(x2, y2); + block4 = UnpackLow64(x3, y3); + block5 = UnpackHigh64(x3, y3); +} + +inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. 
Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+i); + + y1 = veorq_u64(y1, x1); + y1 = RotateRight64<3>(y1); + x1 = veorq_u64(x1, rk); + x1 = vsubq_u64(x1, y1); + x1 = RotateLeft64<8>(x1); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); +} + +inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, + uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + uint64x2_t x2 = UnpackLow64(block2, block3); + uint64x2_t y2 = UnpackHigh64(block2, block3); + uint64x2_t x3 = UnpackLow64(block4, block5); + uint64x2_t y3 = UnpackHigh64(block4, block5); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+i); + + y1 = veorq_u64(y1, x1); + y2 = veorq_u64(y2, x2); + y3 = veorq_u64(y3, x3); + y1 = RotateRight64<3>(y1); + y2 = RotateRight64<3>(y2); + y3 = RotateRight64<3>(y3); + x1 = veorq_u64(x1, rk); + x2 = veorq_u64(x2, rk); + x3 = veorq_u64(x3, rk); + x1 = vsubq_u64(x1, y1); + x2 = vsubq_u64(x2, y2); + x3 = vsubq_u64(x3, y3); + x1 = RotateLeft64<8>(x1); + x2 = RotateLeft64<8>(x2); + x3 = RotateLeft64<8>(x3); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
+ block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); + block2 = UnpackLow64(x2, y2); + block3 = UnpackHigh64(x2, y2); + block4 = UnpackLow64(x3, y3); + block5 = UnpackHigh64(x3, y3); +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +// ***************************** IA-32 ***************************** // + +#if defined(CRYPTOPP_SSSE3_AVAILABLE) + +// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670 +#ifndef M128_CAST +# define M128_CAST(x) ((__m128i *)(void *)(x)) +#endif +#ifndef CONST_M128_CAST +# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) +#endif + +// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html +#ifndef DOUBLE_CAST +# define DOUBLE_CAST(x) ((double *)(void *)(x)) +#endif +#ifndef CONST_DOUBLE_CAST +# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x)) +#endif + +#if defined(CRYPTOPP_AVX512_ROTATE) +template +inline __m128i RotateLeft64(const __m128i& val) +{ + return _mm_rol_epi64(val, R); +} + +template +inline __m128i RotateRight64(const __m128i& val) +{ + return _mm_ror_epi64(val, R); +} +#else +template +inline __m128i RotateLeft64(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); +} + +template +inline __m128i RotateRight64(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateLeft64<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateRight64<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); + return _mm_shuffle_epi8(val, mask); +} + +#endif // CRYPTOPP_AVX512_ROTATE + +inline void GCC_NO_UBSAN SPECK128_Enc_Block(__m128i &block0, __m128i &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + + for (int i=0; i < static_cast(rounds); ++i) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + + x1 = RotateRight64<8>(x1); + x1 = _mm_add_epi64(x1, y1); + x1 = _mm_xor_si128(x1, rk); + y1 = RotateLeft64<3>(y1); + y1 = _mm_xor_si128(y1, x1); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); +} + +inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
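+    // _mm_unpackhi_epi64(a, b) yields [a_hi b_hi] and _mm_unpacklo_epi64(a, b)
+    // yields [a_lo b_lo], so x1 gathers the high 64-bit halves of blocks A
+    // and B while y1 gathers the low halves, matching the diagram above.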
+ __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + __m128i x2 = _mm_unpackhi_epi64(block2, block3); + __m128i y2 = _mm_unpacklo_epi64(block2, block3); + __m128i x3 = _mm_unpackhi_epi64(block4, block5); + __m128i y3 = _mm_unpacklo_epi64(block4, block5); + + for (int i=0; i < static_cast(rounds); ++i) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + + x1 = RotateRight64<8>(x1); + x2 = RotateRight64<8>(x2); + x3 = RotateRight64<8>(x3); + x1 = _mm_add_epi64(x1, y1); + x2 = _mm_add_epi64(x2, y2); + x3 = _mm_add_epi64(x3, y3); + x1 = _mm_xor_si128(x1, rk); + x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); + y1 = RotateLeft64<3>(y1); + y2 = RotateLeft64<3>(y2); + y3 = RotateLeft64<3>(y3); + y1 = _mm_xor_si128(y1, x1); + y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); + block2 = _mm_unpacklo_epi64(y2, x2); + block3 = _mm_unpackhi_epi64(y2, x2); + block4 = _mm_unpacklo_epi64(y3, x3); + block5 = _mm_unpackhi_epi64(y3, x3); +} + +inline void GCC_NO_UBSAN SPECK128_Dec_Block(__m128i &block0, __m128i &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + + y1 = _mm_xor_si128(y1, x1); + y1 = RotateRight64<3>(y1); + x1 = _mm_xor_si128(x1, rk); + x1 = _mm_sub_epi64(x1, y1); + x1 = RotateLeft64<8>(x1); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); +} + +inline void GCC_NO_UBSAN SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + __m128i x2 = _mm_unpackhi_epi64(block2, block3); + __m128i y2 = _mm_unpacklo_epi64(block2, block3); + __m128i x3 = _mm_unpackhi_epi64(block4, block5); + __m128i y3 = _mm_unpacklo_epi64(block4, block5); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + + y1 = _mm_xor_si128(y1, x1); + y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); + y1 = RotateRight64<3>(y1); + y2 = RotateRight64<3>(y2); + y3 = RotateRight64<3>(y3); + x1 = _mm_xor_si128(x1, rk); + x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); + x1 = _mm_sub_epi64(x1, y1); + x2 = _mm_sub_epi64(x2, y2); + x3 = _mm_sub_epi64(x3, y3); + x1 = RotateLeft64<8>(x1); + x2 = RotateLeft64<8>(x2); + x3 = RotateLeft64<8>(x3); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
+    block0 = _mm_unpacklo_epi64(y1, x1);
+    block1 = _mm_unpackhi_epi64(y1, x1);
+    block2 = _mm_unpacklo_epi64(y2, x2);
+    block3 = _mm_unpackhi_epi64(y2, x2);
+    block4 = _mm_unpacklo_epi64(y3, x3);
+    block5 = _mm_unpackhi_epi64(y3, x3);
+}
+
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+
+template <unsigned int R>
+inline __m128i RotateLeft32(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+}
+
+template <unsigned int R>
+inline __m128i RotateRight32(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateLeft32<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateRight32<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
+    const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. The incoming data was read from
+    // a big-endian byte array. Depending on the number of blocks it needs to
+    // be permuted to the following. Thanks to Peter Cordes for help with the
+    // SSE permutes below.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    const __m128 t0 = _mm_castsi128_ps(block0);
+    const __m128 t1 = _mm_castsi128_ps(block1);
+    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
+    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
+
+    for (int i=0; i < static_cast<int>(rounds); ++i)
+    {
+        const __m128i rk = _mm_set1_epi32(subkeys[i]);
+
+        x1 = RotateRight32<8>(x1);
+        x1 = _mm_add_epi32(x1, y1);
+        x1 = _mm_xor_si128(x1, rk);
+        y1 = RotateLeft32<3>(y1);
+        y1 = _mm_xor_si128(y1, x1);
+    }
+
+    // This is roughly the SSE equivalent to the ARM vzip32
+    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    block0 = _mm_unpacklo_epi32(y1, x1);
+    block1 = _mm_unpackhi_epi32(y1, x1);
+}
+
+inline void GCC_NO_UBSAN SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
+    const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. The incoming data was read from
+    // a big-endian byte array. Depending on the number of blocks it needs to
+    // be permuted to the following. Thanks to Peter Cordes for help with the
+    // SSE permutes below.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
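+    // As in the encrypt path, _MM_SHUFFLE(2,0,2,0) selects elements 0 and 2
+    // of each source (y1 = [A1 A3 B1 B3]) and _MM_SHUFFLE(3,1,3,1) selects
+    // elements 1 and 3 (x1 = [A2 A4 B2 B4]); the float casts cost nothing
+    // and exist only to satisfy the _mm_shuffle_ps() types.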
+ const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const __m128i rk = _mm_set1_epi32(subkeys[i]); + + y1 = _mm_xor_si128(y1, x1); + y1 = RotateRight32<3>(y1); + x1 = _mm_xor_si128(x1, rk); + x1 = _mm_sub_epi32(x1, y1); + x1 = RotateLeft32<8>(x1); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); +} + +inline void GCC_NO_UBSAN SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t2 = _mm_castsi128_ps(block2); + const __m128 t3 = _mm_castsi128_ps(block3); + __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + + for (int i=0; i < static_cast(rounds); ++i) + { + const __m128i rk = _mm_set1_epi32(subkeys[i]); + + x1 = RotateRight32<8>(x1); + x2 = RotateRight32<8>(x2); + x3 = RotateRight32<8>(x3); + x1 = _mm_add_epi32(x1, y1); + x2 = _mm_add_epi32(x2, y2); + x3 = _mm_add_epi32(x3, y3); + x1 = _mm_xor_si128(x1, rk); + x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); + y1 = RotateLeft32<3>(y1); + y2 = RotateLeft32<3>(y2); + y3 = RotateLeft32<3>(y3); + y1 = _mm_xor_si128(y1, x1); + y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); + block2 = _mm_unpacklo_epi32(y2, x2); + block3 = _mm_unpackhi_epi32(y2, x2); + block4 = _mm_unpacklo_epi32(y3, x3); + block5 = _mm_unpackhi_epi32(y3, x3); +} + +inline void GCC_NO_UBSAN SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
+ const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t2 = _mm_castsi128_ps(block2); + const __m128 t3 = _mm_castsi128_ps(block3); + __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const __m128i rk = _mm_set1_epi32(subkeys[i]); + + y1 = _mm_xor_si128(y1, x1); + y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); + y1 = RotateRight32<3>(y1); + y2 = RotateRight32<3>(y2); + y3 = RotateRight32<3>(y3); + x1 = _mm_xor_si128(x1, rk); + x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); + x1 = _mm_sub_epi32(x1, y1); + x2 = _mm_sub_epi32(x2, y2); + x3 = _mm_sub_epi32(x3, y3); + x1 = RotateLeft32<8>(x1); + x2 = RotateLeft32<8>(x2); + x3 = RotateLeft32<8>(x3); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); + block2 = _mm_unpacklo_epi32(y2, x2); + block3 = _mm_unpackhi_epi32(y2, x2); + block4 = _mm_unpacklo_epi32(y3, x3); + block5 = _mm_unpackhi_epi32(y3, x3); +} + +#endif // CRYPTOPP_SSE41_AVAILABLE + +ANONYMOUS_NAMESPACE_END + +/////////////////////////////////////////////////////////////////////// + +NAMESPACE_BEGIN(CryptoPP) + +// *************************** ARM NEON **************************** // + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +// ***************************** IA-32 ***************************** // + +#if defined(CRYPTOPP_SSE41_AVAILABLE) +size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const 
word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+NAMESPACE_END
diff --git a/speck.cpp b/speck.cpp
new file mode 100644
index 00000000..9aaff67f
--- /dev/null
+++ b/speck.cpp
@@ -0,0 +1,438 @@
+// speck.cpp - written and placed in the public domain by Jeffrey Walton
+
+#include "pch.h"
+#include "config.h"
+
+#include "speck.h"
+#include "misc.h"
+#include "cpu.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both speck.cpp and speck-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::word32;
+using CryptoPP::word64;
+using CryptoPP::rotlConstant;
+using CryptoPP::rotrConstant;
+
+/// \brief Forward round transformation
+/// \tparam W word type
+/// \details TF83() is the forward round transformation using a=8 and b=3 rotations.
+/// The initial test implementation provided template parameters, but they were
+/// removed because SPECK32 using a=7 and b=2 was not on the road map. The
+/// additional template parameters also made calling SPECK_Encrypt and SPECK_Decrypt
+/// kind of messy.
+template <class W>
+inline void TF83(W& x, W& y, const W k)
+{
+    x = rotrConstant<8>(x);
+    x += y; x ^= k;
+    y = rotlConstant<3>(y);
+    y ^= x;
+}
+
+/// \brief Reverse round transformation
+/// \tparam W word type
+/// \details TR83() is the reverse round transformation using a=8 and b=3 rotations.
+/// The initial test implementation provided template parameters, but they were
+/// removed because SPECK32 using a=7 and b=2 was not on the road map. The
+/// additional template parameters also made calling SPECK_Encrypt and SPECK_Decrypt
+/// kind of messy.
+template <class W>
+inline void TR83(W& x, W& y, const W k)
+{
+    y ^= x;
+    y = rotrConstant<3>(y);
+    x ^= k; x -= y;
+    x = rotlConstant<8>(x);
+}
+
+/// \brief Forward transformation
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param c output array
+/// \param p input array
+/// \param k subkey array
+template <class W, unsigned int R>
+inline void SPECK_Encrypt(W c[2], const W p[2], const W k[R])
+{
+    c[0]=p[0]; c[1]=p[1];
+
+    // Don't unroll this loop. Things slow down.
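+    // Each iteration consumes one subkey and feeds its output into the next
+    // round, so the loop body is a single dependency chain. SPECK_Decrypt()
+    // below walks the same schedule in reverse with TR83(), which undoes
+    // TF83() step for step, so no separate decryption schedule is needed.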
+    for (int i = 0; i < static_cast<int>(R); ++i)
+        TF83(c[0], c[1], k[i]);
+}
+
+/// \brief Reverse transformation
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param p output array
+/// \param c input array
+/// \param k subkey array
+template <class W, unsigned int R>
+inline void SPECK_Decrypt(W p[2], const W c[2], const W k[R])
+{
+    p[0]=c[0]; p[1]=c[1];
+
+    // Don't unroll this loop. Things slow down.
+    for (int i = static_cast<int>(R-1); i >= 0; --i)
+        TR83(p[0], p[1], k[i]);
+}
+
+/// \brief Subkey generation function
+/// \details Used when the user key consists of 2 words
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param key empty subkey array
+/// \param k user key array
+template <class W, unsigned int R>
+inline void SPECK_ExpandKey_2W(W key[R], const W k[2])
+{
+    CRYPTOPP_ASSERT(R==32);
+    W i=0, B=k[0], A=k[1];
+
+    while (i < R-1)
+    {
+        key[i+0]=A; TF83(B, A, i+0);
+        i+=1;
+    }
+
+    key[R-1]=A;
+}
+
+/// \brief Subkey generation function
+/// \details Used when the user key consists of 3 words
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param key empty subkey array
+/// \param k user key array
+template <class W, unsigned int R>
+inline void SPECK_ExpandKey_3W(W key[R], const W k[3])
+{
+    CRYPTOPP_ASSERT(R==33 || R==26);
+    W i=0, C=k[0], B=k[1], A=k[2];
+
+    unsigned int blocks = R/2;
+    while (blocks--)
+    {
+        key[i+0]=A; TF83(B, A, i+0);
+        key[i+1]=A; TF83(C, A, i+1);
+        i+=2;
+    }
+
+    // The constexpr residue should allow the optimizer to remove unneeded statements
+    if(R%2 == 1)
+    {
+        key[R-1]=A;
+    }
+}
+
+/// \brief Subkey generation function
+/// \details Used when the user key consists of 4 words
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param key empty subkey array
+/// \param k user key array
+template <class W, unsigned int R>
+inline void SPECK_ExpandKey_4W(W key[R], const W k[4])
+{
+    CRYPTOPP_ASSERT(R==34 || R==27);
+    W i=0, D=k[0], C=k[1], B=k[2], A=k[3];
+
+    unsigned int blocks = R/3;
+    while (blocks--)
+    {
+        key[i+0]=A; TF83(B, A, i+0);
+        key[i+1]=A; TF83(C, A, i+1);
+        key[i+2]=A; TF83(D, A, i+2);
+        i+=3;
+    }
+
+    // The constexpr residue should allow the optimizer to remove unneeded statements
+    if(R%3 == 1)
+    {
+        key[R-1]=A;
+    }
+    else if(R%3 == 2)
+    {
+        key[R-2]=A; TF83(B, A, W(R-2));
+        key[R-1]=A;
+    }
+}
+
+ANONYMOUS_NAMESPACE_END
+
+///////////////////////////////////////////////////////////
+
+NAMESPACE_BEGIN(CryptoPP)
+
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+extern size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+extern size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
+{
+    CRYPTOPP_ASSERT(keyLength == 12 || keyLength == 16);
+    CRYPTOPP_UNUSED(params);
+
+    // Building the key schedule table requires {3,4} words workspace.
+    // Encrypting and decrypting requires 4 words workspace.
+    m_kwords = keyLength/sizeof(word32);
+    m_wspace.New(4U);
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian> KeyBlock;
+    KeyBlock kblk(userKey);
+
+    switch (m_kwords)
+    {
+    case 3:
+        m_rkeys.New((m_rounds = 26));
+        kblk(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_3W<word32, 26>(m_rkeys, m_wspace);
+        break;
+    case 4:
+        m_rkeys.New((m_rounds = 27));
+        kblk(m_wspace[3])(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_4W<word32, 27>(m_rkeys, m_wspace);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+}
+
+void SPECK64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 26:
+        SPECK_Encrypt<word32, 26>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 27:
+        SPECK_Encrypt<word32, 27>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word32, BigEndian> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+void SPECK64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 26:
+        SPECK_Decrypt<word32, 26>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 27:
+        SPECK_Decrypt<word32, 27>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word32, BigEndian> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+///////////////////////////////////////////////////////////
+
+void SPECK128::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
+{
+    CRYPTOPP_ASSERT(keyLength == 16 || keyLength == 24 || keyLength == 32);
+    CRYPTOPP_UNUSED(params);
+
+    // Building the key schedule table requires {2,3,4} words workspace.
+    // Encrypting and decrypting requires 4 words workspace.
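+    // The round count follows the number of key words: 2 words (128-bit key)
+    // selects 32 rounds, 3 words (192-bit) selects 33, and 4 words (256-bit)
+    // selects 34. See the switch below.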
+    m_kwords = keyLength/sizeof(word64);
+    m_wspace.New(4U);
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian> KeyBlock;
+    KeyBlock kblk(userKey);
+
+    switch (m_kwords)
+    {
+    case 2:
+        m_rkeys.New((m_rounds = 32));
+        kblk(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_2W<word64, 32>(m_rkeys, m_wspace);
+        break;
+    case 3:
+        m_rkeys.New((m_rounds = 33));
+        kblk(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_3W<word64, 33>(m_rkeys, m_wspace);
+        break;
+    case 4:
+        m_rkeys.New((m_rounds = 34));
+        kblk(m_wspace[3])(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_4W<word64, 34>(m_rkeys, m_wspace);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+}
+
+void SPECK128::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 32:
+        SPECK_Encrypt<word64, 32>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 33:
+        SPECK_Encrypt<word64, 33>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 34:
+        SPECK_Encrypt<word64, 34>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word64, BigEndian> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 32:
+        SPECK_Decrypt<word64, 32>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 33:
+        SPECK_Decrypt<word64, 33>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 34:
+        SPECK_Decrypt<word64, 34>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word64, BigEndian> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+#if defined(CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
+size_t SPECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SPECK64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SPECK64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SPECK64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SPECK64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
+
+#if defined(CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS)
+size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+    if (HasSSSE3())
+        return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SPECK128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+    if (HasSSSE3())
+        return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SPECK128_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
+
+NAMESPACE_END
diff --git a/speck.h b/speck.h
new file mode 100644
index 00000000..44847e9b
--- /dev/null
+++ b/speck.h
@@ -0,0 +1,180 @@
+// speck.h - written and placed in the public domain by Jeffrey Walton
+
+/// \file speck.h
+/// \brief Classes for the Speck block cipher
+/// \details Speck is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \sa The SIMON and SPECK Families of Lightweight Block Ciphers,
+/// The Simon and Speck GitHub and SPECK on the Crypto++ wiki.
+/// \since Crypto++ 6.0
+
+#ifndef CRYPTOPP_SPECK_H
+#define CRYPTOPP_SPECK_H
+
+#include "config.h"
+#include "seckey.h"
+#include "secblock.h"
+
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+# define CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+# define CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
+NAMESPACE_BEGIN(CryptoPP)
+
+/// \brief SPECK block cipher information
+/// \tparam L block size of the cipher, in bytes
+/// \tparam D default key length, in bytes
+/// \tparam N minimum key length, in bytes
+/// \tparam M maximum key length, in bytes
+/// \since Crypto++ 6.0
+template <unsigned int L, unsigned int D, unsigned int N, unsigned int M>
+struct SPECK_Info : public FixedBlockSize<L>, VariableKeyLength<D, N, M>
+{
+    static const std::string StaticAlgorithmName()
+    {
+        // Format is Cipher-Blocksize(Keylength)
+        return "SPECK-" + IntToString(L*8);
+    }
+};
+
+/// \brief SPECK block cipher base class
+/// \tparam W the word type
+/// \details User code should use SPECK64 or SPECK128
+/// \sa SPECK64, SPECK128, SPECK on the Crypto++ wiki
+/// \since Crypto++ 6.0
+template <class W>
+struct SPECK_Base
+{
+    virtual ~SPECK_Base() {}
+    SPECK_Base() : m_kwords(0), m_rounds(0) {}
+
+    typedef SecBlock<W, AllocatorWithCleanup<W, true> > AlignedSecBlock;
+    mutable AlignedSecBlock m_wspace;  // workspace
+    AlignedSecBlock m_rkeys;           // round keys
+    unsigned int m_kwords;             // number of key words
+    unsigned int m_rounds;             // number of rounds
+};
+
+/// \brief SPECK 64-bit block cipher
+/// \details Speck is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \details SPECK64 provides 64-bit block size. The valid key sizes are 96-bit and 128-bit.
+/// \sa SPECK64, SPECK128, The SIMON and SPECK Families of Lightweight
+/// Block Ciphers, The Simon and Speck GitHub, SPECK on the Crypto++ wiki
+/// \since Crypto++ 6.0
+class CRYPTOPP_NO_VTABLE SPECK64 : public SPECK_Info<8, 12, 12, 16>, public BlockCipherDocumentation
+{
+public:
+    /// \brief SPECK block cipher transformation functions
+    /// \details Provides implementation common to encryption and decryption
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Base : protected SPECK_Base<word32>, public BlockCipherImpl<SPECK_Info<8, 12, 12, 16> >
+    {
+    public:
+        std::string AlgorithmName() const {
+            return StaticAlgorithmName() + (m_kwords == 0 ? "" :
+                "(" + IntToString(m_kwords*sizeof(word32)*8) + ")");
+        }
+
+    protected:
+        void UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params);
+    };
+
+    /// \brief Provides implementation for encryption transformation
+    /// \details Enc provides implementation for encryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Enc : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    /// \brief Provides implementation for decryption transformation
+    /// \details Dec provides implementation for decryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Dec : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
+    typedef BlockCipherFinal<DECRYPTION, Dec> Decryption;
+};
+
+/// \brief SPECK 128-bit block cipher
+/// \details Speck is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \details SPECK128 provides 128-bit block size. The valid key sizes are 128-bit, 192-bit and 256-bit.
+/// \sa SPECK64, SPECK128, The SIMON and SPECK Families of Lightweight
+/// Block Ciphers, The Simon and Speck GitHub, SPECK on the Crypto++ wiki
+/// \since Crypto++ 6.0
+class CRYPTOPP_NO_VTABLE SPECK128 : public SPECK_Info<16, 16, 16, 32>, public BlockCipherDocumentation
+{
+public:
+    /// \brief SPECK block cipher transformation functions
+    /// \details Provides implementation common to encryption and decryption
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Base : protected SPECK_Base<word64>, public BlockCipherImpl<SPECK_Info<16, 16, 16, 32> >
+    {
+    public:
+        std::string AlgorithmName() const {
+            return StaticAlgorithmName() + (m_kwords == 0 ? "" :
+                "(" + IntToString(m_kwords*sizeof(word64)*8) + ")");
+        }
+
+    protected:
+        void UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params);
+    };
+
+    /// \brief Provides implementation for encryption transformation
+    /// \details Enc provides implementation for encryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Enc : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    /// \brief Provides implementation for decryption transformation
+    /// \details Dec provides implementation for decryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Dec : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
+    typedef BlockCipherFinal<DECRYPTION, Dec> Decryption;
+};
+
+NAMESPACE_END
+
+#endif // CRYPTOPP_SPECK_H
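
For reviewers who want to exercise the new classes: below is a minimal usage
sketch, not part of the patch. It assumes the library is built with this
commit applied, and it uses only the public BlockCipher interface; the names
"key" and "block" are illustrative.

    #include "speck.h"
    #include "secblock.h"
    #include "osrng.h"

    int main()
    {
        using namespace CryptoPP;

        // Random 128-bit key (SPECK128's default key length)
        AutoSeededRandomPool prng;
        SecByteBlock key(SPECK128::DEFAULT_KEYLENGTH);
        prng.GenerateBlock(key, key.size());

        // One 16-byte block, encrypted in place and then decrypted back
        byte block[SPECK128::BLOCKSIZE] = {0};

        SPECK128::Encryption enc;
        enc.SetKey(key, key.size());
        enc.ProcessBlock(block);

        SPECK128::Decryption dec;
        dec.SetKey(key, key.size());
        dec.ProcessBlock(block);  // block is all zeros again

        return 0;
    }

The same classes should drop into the mode templates in the usual way, for
example ECB_Mode<SPECK128>::Encryption from modes.h.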