From e416b243d37d0904c6dfdf1ecce491f458fcecbb Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 18 Feb 2018 23:23:50 -0500 Subject: [PATCH] Re-add Simon and Speck, enable SSE (GH #585) This commit re-adds Simon and Speck. The commit includes C++, SSSE3 and SSE4. NEON, Aarch32 and Aarch64 are disabled at the moment. --- Filelist.txt | 8 + GNUmakefile | 28 + GNUmakefile-cross | 18 + bench1.cpp | 12 + cryptest.nmake | 4 +- cryptlib.vcxproj | 6 + cryptlib.vcxproj.filters | 18 + regtest2.cpp | 15 + simon-simd.cpp | 1141 ++++++++++++++++++++++++++++++++++++++ simon.cpp | 463 ++++++++++++++++ simon.h | 180 ++++++ speck-simd.cpp | 1031 ++++++++++++++++++++++++++++++++++ speck.cpp | 438 +++++++++++++++ speck.h | 180 ++++++ 14 files changed, 3540 insertions(+), 2 deletions(-) create mode 100644 simon-simd.cpp create mode 100644 simon.cpp create mode 100644 simon.h create mode 100644 speck-simd.cpp create mode 100644 speck.cpp create mode 100644 speck.h diff --git a/Filelist.txt b/Filelist.txt index 24842d31..b679a6cf 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -276,6 +276,9 @@ sharkbox.cpp simple.cpp simple.h siphash.h +simon.cpp +simon-simd.cpp +simon.h skipjack.cpp skipjack.h sm3.cpp @@ -287,6 +290,9 @@ socketft.cpp socketft.h sosemanuk.cpp sosemanuk.h +speck.cpp +speck-simd.cpp +speck.h square.cpp square.h squaretb.cpp @@ -466,10 +472,12 @@ TestVectors/sha3_256_fips_202.txt TestVectors/sha3_384_fips_202.txt TestVectors/sha3_512_fips_202.txt TestVectors/shacal2.txt +TestVectors/simon.txt TestVectors/siphash.txt TestVectors/sm3.txt TestVectors/sm4.txt TestVectors/sosemanuk.txt +TestVectors/speck.txt TestVectors/tea.txt TestVectors/threefish.txt TestVectors/ttmac.txt diff --git a/GNUmakefile b/GNUmakefile index 6fadc15f..7de31b00 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -250,11 +250,15 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_SSSE3,$(CXXFLAGS)),) ifeq ($(HAVE_SSSE3),1) ARIA_FLAG = -mssse3 SSSE3_FLAG = -mssse3 + SIMON_FLAG = -mssse3 + SPECK_FLAG = -mssse3 endif ifeq ($(findstring -DCRYPTOPP_DISABLE_SSE4,$(CXXFLAGS)),) HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.1 -dM -E - 2>/dev/null | $(GREP) -i -c __SSE4_1__) ifeq ($(HAVE_SSE4),1) BLAKE2_FLAG = -msse4.1 + SIMON_FLAG = -msse4.1 + SPECK_FLAG = -msse4.1 endif HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.2 -dM -E - 2>/dev/null | $(GREP) -i -c __SSE4_2__) ifeq ($(HAVE_SSE4),1) @@ -285,11 +289,15 @@ ifeq ($(SUN_COMPILER),1) ifeq ($(COUNT),0) SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1 ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1 + SIMON_FLAG = -xarch=ssse3 -D__SSSE3__=1 + SPECK_FLAG = -xarch=ssse3 -D__SSSE3__=1 LDFLAGS += -xarch=ssse3 endif COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) BLAKE2_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + SIMON_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 + SPECK_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 LDFLAGS += -xarch=sse4_1 endif COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") @@ -366,6 +374,8 @@ ifeq ($(IS_NEON),1) GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + SIMON_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon endif endif @@ -375,6 +385,8 @@ ifeq ($(IS_ARMV8),1) ARIA_FLAG = -march=armv8-a BLAKE2_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a + SIMON_FLAG = 
-march=armv8-a + SPECK_FLAG = -march=armv8-a endif HAVE_CRC = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(GREP) -i -c __ARM_FEATURE_CRC32) ifeq ($(HAVE_CRC),1) @@ -397,6 +409,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) ALTIVEC_FLAG = -mcpu=power4 -maltivec ARIA_FLAG = -mcpu=power4 -maltivec BLAKE2_FLAG = -mcpu=power4 -maltivec + SIMON_FLAG = -mcpu=power4 -maltivec + SPECK_FLAG = -mcpu=power4 -maltivec endif # GCC and some compatibles HAVE_CRYPTO = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -mcpu=power8 -maltivec -dM -E - 2>/dev/null | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO') @@ -405,6 +419,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) AES_FLAG = -mcpu=power8 -maltivec GCM_FLAG = -mcpu=power8 -maltivec SHA_FLAG = -mcpu=power8 -maltivec + SIMON_FLAG = -mcpu=power8 -maltivec + SPECK_FLAG = -mcpu=power8 -maltivec endif # IBM XL C/C++ HAVE_ALTIVEC = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr7 -qaltivec -E adhoc.cpp.proto 2>/dev/null | $(GREP) -i -c '__ALTIVEC__') @@ -412,6 +428,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) ALTIVEC_FLAG = -qarch=pwr7 -qaltivec ARIA_FLAG = -qarch=pwr7 -qaltivec BLAKE2_FLAG = -qarch=pwr7 -qaltivec + SIMON_FLAG = -qarch=pwr7 -qaltivec + SPECK_FLAG = -qarch=pwr7 -qaltivec endif # IBM XL C/C++ HAVE_CRYPTO = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr8 -qaltivec -E adhoc.cpp.proto 2>/dev/null | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO') @@ -422,6 +440,8 @@ ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) SHA_FLAG = -qarch=pwr8 -qaltivec ARIA_FLAG = -qarch=pwr8 -qaltivec BLAKE2_FLAG = -qarch=pwr8 -qaltivec + SIMON_FLAG = -qarch=pwr8 -qaltivec + SPECK_FLAG = -qarch=pwr8 -qaltivec endif endif @@ -1057,6 +1077,14 @@ sha-simd.o : sha-simd.cpp shacal2-simd.o : shacal2-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< +# SSSE3 or NEON available +simon-simd.o : simon-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SIMON_FLAG) -c) $< + +# SSSE3 or NEON available +speck-simd.o : speck-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SPECK_FLAG) -c) $< + # Don't build Rijndael with UBsan. Too much noise due to unaligned data accesses. 
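Only simon-simd.cpp and speck-simd.cpp receive the SSSE3/SSE4.1 (or NEON/Altivec) flags above; every other object keeps the baseline flags, and the vectorized code is entered only after a runtime CPU probe. A minimal sketch of that split-TU dispatch pattern, with hypothetical EncryptBlocks* names (Crypto++ itself gates on its own HasSSSE3()/HasSSE41() probes from cpu.h):

    #include <cstddef>

    // kernel-simd.cpp -- the one TU built with -msse4.1, like simon-simd.cpp.
    std::size_t EncryptBlocks_SSE41(const unsigned char* in, unsigned char* out, std::size_t n);

    // kernel.cpp -- built with baseline flags, safe on any x86/x64 CPU.
    std::size_t EncryptBlocks_CXX(const unsigned char* in, unsigned char* out, std::size_t n);

    std::size_t EncryptBlocks(const unsigned char* in, unsigned char* out, std::size_t n)
    {
        // Runtime probe (GCC/Clang builtin here; the library uses its own
        // CPUID wrappers) keeps SSE4.1 code off hardware that lacks it.
        if (__builtin_cpu_supports("sse4.1"))
            return EncryptBlocks_SSE41(in, out, n);
        return EncryptBlocks_CXX(in, out, n);
    }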
ifneq ($(findstring -fsanitize=undefined,$(CXXFLAGS)),) rijndael.o : rijndael.cpp diff --git a/GNUmakefile-cross b/GNUmakefile-cross index ff5d4bc3..79cdf31f 100755 --- a/GNUmakefile-cross +++ b/GNUmakefile-cross @@ -234,12 +234,16 @@ ifeq ($(IS_NEON),1) GCM_FLAG += -mfpu=neon ARIA_FLAG += -mfpu=neon BLAKE2_FLAG += -mfpu=neon + SIMON_FLAG += -mfpu=neon + SPECK_FLAG += -mfpu=neon ifeq ($(IS_ANDROID),1) ifeq ($(findstring -mfloat-abi=softfp,$(CXXFLAGS)),) NEON_FLAG += -mfloat-abi=softfp GCM_FLAG += -mfloat-abi=softfp ARIA_FLAG += -mfloat-abi=softfp BLAKE2_FLAG += -mfloat-abi=softfp + SIMON_FLAG += -mfloat-abi=softfp + SPECK_FLAG += -mfloat-abi=softfp endif endif endif @@ -252,6 +256,8 @@ ifneq ($(IS_ARMv8),0) ARIA_FLAG = -march=armv8-a BLAKE2_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a + SIMON_FLAG = -march=armv8-a + SPECK_FLAG = -march=armv8-a endif HAVE_CRC := $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(EGREP) -i -c __ARM_FEATURE_CRC32) ifeq ($(HAVE_CRC),1) @@ -271,9 +277,13 @@ ifneq ($(IS_i686)$(IS_x86_64),00) ifeq ($(HAVE_SSSE3),1) ARIA_FLAG = -mssse3 SSSE3_FLAG = -mssse3 + SIMON_FLAG = -mssse3 + SPECK_FLAG = -mssse3 endif HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.1 -dM -E - 2>/dev/null | $(EGREP) -i -c __SSE4_1__) ifeq ($(HAVE_SSE4),1) + SIMON_FLAG = -msse4.1 + SPECK_FLAG = -msse4.1 endif HAVE_SSE4 = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -msse4.2 -dM -E - 2>/dev/null | $(EGREP) -i -c __SSE4_2__) ifeq ($(HAVE_SSE4),1) @@ -500,6 +510,14 @@ sha-simd.o : sha-simd.cpp shacal2-simd.o : shacal2-simd.cpp $(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $< +# SSSE3 or NEON available +simon-simd.o : simon-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SIMON_FLAG) -c) $< + +# SSSE3 or NEON available +speck-simd.o : speck-simd.cpp + $(CXX) $(strip $(CXXFLAGS) $(SPECK_FLAG) -c) $< + %.o : %.cpp $(CXX) $(strip $(CXXFLAGS) -c) $< diff --git a/bench1.cpp b/bench1.cpp index 06580086..153428ff 100644 --- a/bench1.cpp +++ b/bench1.cpp @@ -614,6 +614,18 @@ void Benchmark2(double t, double hertz) BenchMarkByName("Kalyna-256/CTR", 32, "Kalyna-256(256)/CTR (256-bit key)"); BenchMarkByName("Kalyna-256/CTR", 64, "Kalyna-256(512)/CTR (512-bit key)"); BenchMarkByName("Kalyna-512/CTR", 64, "Kalyna-512(512)/CTR (512-bit key)"); + + BenchMarkByName("SIMON-64/CTR", 12, "SIMON-64(96)/CTR (96-bit key)"); + BenchMarkByName("SIMON-64/CTR", 16, "SIMON-64(128)/CTR (128-bit key)"); + BenchMarkByName("SIMON-128/CTR", 16, "SIMON-128(128)/CTR (128-bit key)"); + BenchMarkByName("SIMON-128/CTR", 24, "SIMON-128(192)/CTR (192-bit key)"); + BenchMarkByName("SIMON-128/CTR", 32, "SIMON-128(256)/CTR (256-bit key)"); + + BenchMarkByName("SPECK-64/CTR", 12, "SPECK-64(96)/CTR (96-bit key)"); + BenchMarkByName("SPECK-64/CTR", 16, "SPECK-64(128)/CTR (128-bit key)"); + BenchMarkByName("SPECK-128/CTR", 16, "SPECK-128(128)/CTR (128-bit key)"); + BenchMarkByName("SPECK-128/CTR", 24, "SPECK-128(192)/CTR (192-bit key)"); + BenchMarkByName("SPECK-128/CTR", 32, "SPECK-128(256)/CTR (256-bit key)"); } std::cout << "\n"; diff --git a/cryptest.nmake b/cryptest.nmake index 7f1077d7..f329d3fd 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -47,9 +47,9 @@ # If you use 'make sources' from Linux makefile, then add 'winpipes.cpp' to the list below. 
-LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp +LIB_SRCS = cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hex.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp kalynatab.cpp keccak.cpp luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp mqv.cpp nbtheory.cpp neon-simd.cpp network.cpp oaep.cpp osrng.cpp padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp pssr.cpp pubkey.cpp queue.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp rc6.cpp rdrand.cpp rdtables.cpp rijndael-simd.cpp rijndael.cpp ripemd.cpp rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp seal.cpp seed.cpp serpent.cpp sha-simd.cpp sha.cpp sha3.cpp shacal2-simd.cpp shacal2.cpp shark.cpp sharkbox.cpp simon.cpp simon-simd.cpp skipjack.cpp sm3.cpp sm4.cpp socketft.cpp sosemanuk.cpp speck.cpp speck-simd.cpp square.cpp squaretb.cpp sse-simd.cpp strciphr.cpp tea.cpp tftables.cpp threefish.cpp tiger.cpp tigertab.cpp trdlocal.cpp ttmac.cpp tweetnacl.cpp twofish.cpp vmac.cpp wait.cpp wake.cpp whrlpool.cpp winpipes.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp zinflate.cpp zlib.cpp -LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj 
dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj +LIB_OBJS = cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hex.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj kalynatab.obj keccak.obj luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj mqv.obj nbtheory.obj neon-simd.obj network.obj oaep.obj osrng.obj padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj pssr.obj pubkey.obj queue.obj rabin.obj randpool.obj rc2.obj rc5.obj rc6.obj rdrand.obj rdtables.obj rijndael-simd.obj rijndael.obj ripemd.obj rng.obj rsa.obj rw.obj safer.obj salsa.obj seal.obj seed.obj serpent.obj sha-simd.obj sha.obj sha3.obj shacal2-simd.obj shacal2.obj shark.obj sharkbox.obj simon.obj simon-simd.obj skipjack.obj sm3.obj sm4.obj socketft.obj sosemanuk.obj speck.obj speck-simd.obj square.obj squaretb.obj sse-simd.obj strciphr.obj tea.obj tftables.obj threefish.obj tiger.obj tigertab.obj trdlocal.obj ttmac.obj tweetnacl.obj twofish.obj vmac.obj wait.obj wake.obj whrlpool.obj winpipes.obj xtr.obj xtrcrypt.obj zdeflate.obj zinflate.obj zlib.obj TEST_SRCS = bench1.cpp bench2.cpp test.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp validat4.cpp datatest.cpp regtest1.cpp regtest2.cpp regtest3.cpp fipsalgt.cpp dlltest.cpp fipstest.cpp diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index 9bc4a9a3..0d4ab1cd 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -288,12 +288,16 @@ + + + + @@ -474,6 +478,7 @@ + @@ -481,6 +486,7 @@ + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index c43131e7..04dec87d 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -359,6 +359,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -371,6 +377,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -816,6 +828,9 @@ Header Files + + Header Files + 
Header Files
@@ -834,6 +849,9 @@
Header Files
+
+ Header Files
+
Header Files
diff --git a/regtest2.cpp b/regtest2.cpp
index 4b3732e9..b5bf18b0 100644
--- a/regtest2.cpp
+++ b/regtest2.cpp
@@ -32,6 +32,8 @@
#include "mars.h"
#include "kalyna.h"
#include "threefish.h"
+#include "simon.h"
+#include "speck.h"
#include "sm4.h"
#include "des.h"
#include "idea.h"
@@ -161,6 +163,19 @@ void RegisterFactories2()
RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
+ RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
+
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
+ RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
+ RegisterSymmetricCipherDefaultFactories >(); // Benchmarks
RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
RegisterSymmetricCipherDefaultFactories >(); // Test Vectors
diff --git a/simon-simd.cpp b/simon-simd.cpp
new file mode 100644
index 00000000..ad591321
--- /dev/null
+++ b/simon-simd.cpp
@@ -0,0 +1,1141 @@
+// simon-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+// source file is needed because additional CXXFLAGS are required to enable
+// the appropriate instruction sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "simon.h"
+#include "misc.h"
+#include "adv-simd.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both simon.cpp and simon-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <pmmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+#if (CRYPTOPP_SSE41_AVAILABLE)
+# include <smmintrin.h>
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__)
+# define CRYPTOPP_AVX512_ROTATE 1
+# include <immintrin.h>
+#endif
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include <arm_neon.h>
+#endif
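One preamble subtlety: when this TU is itself compiled with AVX-512VL, CRYPTOPP_AVX512_ROTATE is defined and the 64-bit vector rotates below become single _mm_rol_epi64/_mm_ror_epi64 instructions; otherwise they are emulated with two shifts and an OR. The two forms side by side (an illustrative re-statement of the RotateLeft64 templates later in this file):

    #include <emmintrin.h>       // SSE2 shifts and OR
    #if defined(__AVX512F__) && defined(__AVX512VL__)
    # include <immintrin.h>      // _mm_rol_epi64
    #endif

    template <unsigned int R>
    inline __m128i RotL64(const __m128i v)
    {
    #if defined(__AVX512F__) && defined(__AVX512VL__)
        return _mm_rol_epi64(v, R);                      // one instruction
    #else
        return _mm_or_si128(_mm_slli_epi64(v, R),        // two shifts plus
                            _mm_srli_epi64(v, 64 - R));  // an OR, portable SSE2
    #endif
    }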
+// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
+// compilers don't follow ACLE conventions for the include.
+#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
+# include <stdint.h>
+# include <arm_acle.h>
+#endif
+
+// https://www.spinics.net/lists/gcchelp/msg47735.html and
+// https://www.spinics.net/lists/gcchelp/msg47749.html
+#if (CRYPTOPP_GCC_VERSION >= 40900)
+# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
+#else
+# define GCC_NO_UBSAN
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::byte;
+using CryptoPP::word32;
+using CryptoPP::word64;
+using CryptoPP::rotlFixed;
+using CryptoPP::rotrFixed;
+using CryptoPP::vec_swap; // SunCC
+
+// *************************** ARM NEON ************************** //
+
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+
+template <unsigned int R>
+inline uint32x4_t RotateLeft32(const uint32x4_t& val)
+{
+    const uint32x4_t a(vshlq_n_u32(val, R));
+    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
+    return vorrq_u32(a, b);
+}
+
+template <unsigned int R>
+inline uint32x4_t RotateRight32(const uint32x4_t& val)
+{
+    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
+    const uint32x4_t b(vshrq_n_u32(val, R));
+    return vorrq_u32(a, b);
+}
+
+#if defined(__aarch32__) || defined(__aarch64__)
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val)
+{
+#if defined(CRYPTOPP_BIG_ENDIAN)
+    const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+#else
+    const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+#endif
+
+    return vreinterpretq_u32_u8(
+        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline uint32x4_t RotateRight32<8>(const uint32x4_t& val)
+{
+#if defined(CRYPTOPP_BIG_ENDIAN)
+    const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+#else
+    const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
+    const uint8x16_t mask = vld1q_u8(maskb);
+#endif
+
+    return vreinterpretq_u32_u8(
+        vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
+}
+#endif
+
+inline uint32x4_t Shuffle32(const uint32x4_t& val)
+{
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+    return vreinterpretq_u32_u8(
+        vrev32q_u8(vreinterpretq_u8_u32(val)));
+#else
+    return val;
+#endif
+}
+
+inline uint32x4_t SIMON64_f(const uint32x4_t& val)
+{
+    return veorq_u32(RotateLeft32<2>(val),
+        vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
+}
+
+inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
+    const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. The incoming data was read from
+    // a big-endian byte array. Depending on the number of blocks it needs to
+    // be permuted to the following. If only a single block is available then
+    // a Zero block is provided to promote vectorizations.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+ uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i); + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1); + + const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1); + x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2); + } + + if (rounds & 1) + { + const uint32x4_t rk = vld1q_dup_u32(subkeys+rounds-1); + + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk); + std::swap(x1, y1); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; +} + +inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + if (rounds & 1) + { + std::swap(x1, y1); + const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1); + + y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i+1); + x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1); + + const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i); + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; +} + +inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
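+    // Three (x,y) register pairs advance in lockstep below. Each round's
+    // shift/AND/XOR chain is serially dependent within a single pair, so
+    // interleaving independent pairs gives an out-of-order core parallel
+    // work to hide that latency; the broadcast subkeys rk1/rk2 are shared
+    // across all three pairs.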
+ uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + uint32x4_t x2 = vuzpq_u32(block2, block3).val[0]; + uint32x4_t y2 = vuzpq_u32(block2, block3).val[1]; + uint32x4_t x3 = vuzpq_u32(block4, block5).val[0]; + uint32x4_t y3 = vuzpq_u32(block4, block5).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + for (int i = 0; i < static_cast(rounds & ~1) - 1; i += 2) + { + const uint32x4_t rk1 = vld1q_dup_u32(subkeys+i); + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk1); + y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk1); + y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk1); + + const uint32x4_t rk2 = vld1q_dup_u32(subkeys+i+1); + x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk2); + x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk2); + x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk2); + } + + if (rounds & 1) + { + const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1); + + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk); + y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk); + y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk); + std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; + block2 = vzipq_u32(x2, y2).val[0]; + block3 = vzipq_u32(x2, y2).val[1]; + block4 = vzipq_u32(x3, y3).val[0]; + block5 = vzipq_u32(x3, y3).val[1]; +} + +inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
+ uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + uint32x4_t x2 = vuzpq_u32(block2, block3).val[0]; + uint32x4_t y2 = vuzpq_u32(block2, block3).val[1]; + uint32x4_t x3 = vuzpq_u32(block4, block5).val[0]; + uint32x4_t y3 = vuzpq_u32(block4, block5).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + if (rounds & 1) + { + std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + const uint32x4_t rk = vld1q_dup_u32(subkeys + rounds - 1); + + y1 = veorq_u32(veorq_u32(y1, rk), SIMON64_f(x1)); + y2 = veorq_u32(veorq_u32(y2, rk), SIMON64_f(x2)); + y3 = veorq_u32(veorq_u32(y3, rk), SIMON64_f(x3)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const uint32x4_t rk1 = vld1q_dup_u32(subkeys + i + 1); + x1 = veorq_u32(veorq_u32(x1, SIMON64_f(y1)), rk1); + x2 = veorq_u32(veorq_u32(x2, SIMON64_f(y2)), rk1); + x3 = veorq_u32(veorq_u32(x3, SIMON64_f(y3)), rk1); + + const uint32x4_t rk2 = vld1q_dup_u32(subkeys + i); + y1 = veorq_u32(veorq_u32(y1, SIMON64_f(x1)), rk2); + y2 = veorq_u32(veorq_u32(y2, SIMON64_f(x2)), rk2); + y3 = veorq_u32(veorq_u32(y3, SIMON64_f(x3)), rk2); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; + block2 = vzipq_u32(x2, y2).val[0]; + block3 = vzipq_u32(x2, y2).val[1]; + block4 = vzipq_u32(x3, y3).val[0]; + block5 = vzipq_u32(x3, y3).val[1]; +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) + +template +inline T UnpackHigh64(const T& a, const T& b) +{ + const uint64x1_t x(vget_high_u64((uint64x2_t)a)); + const uint64x1_t y(vget_high_u64((uint64x2_t)b)); + return (T)vcombine_u64(x, y); +} + +template +inline T UnpackLow64(const T& a, const T& b) +{ + const uint64x1_t x(vget_low_u64((uint64x2_t)a)); + const uint64x1_t y(vget_low_u64((uint64x2_t)b)); + return (T)vcombine_u64(x, y); +} + +template +inline uint64x2_t RotateLeft64(const uint64x2_t& val) +{ + const uint64x2_t a(vshlq_n_u64(val, R)); + const uint64x2_t b(vshrq_n_u64(val, 64 - R)); + return vorrq_u64(a, b); +} + +template +inline uint64x2_t RotateRight64(const uint64x2_t& val) +{ + const uint64x2_t a(vshlq_n_u64(val, 64 - R)); + const uint64x2_t b(vshrq_n_u64(val, R)); + return vorrq_u64(a, b); +} + +#if defined(__aarch32__) || defined(__aarch64__) +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u64_u8( + vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. 
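+// (Why the table lookup works: with little-endian lanes, rotating a 64-bit
+// lane left by 8 bits moves source byte i to position (i+1) mod 8, so the
+// result's byte i must fetch source byte (i-1) mod 8 -- exactly the
+// { 7,0,1,2, 3,4,5,6, ... } mask above. The RotateRight64<8> mask below is
+// the inverse permutation, { 1,2,3,4, 5,6,7,0, ... }, and one vqtbl1q_u8
+// replaces the vshlq/vshrq/vorrq triple.)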
+template <> +inline uint64x2_t RotateRight64<8>(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u64_u8( + vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); +} +#endif + +inline uint64x2_t Shuffle64(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_LITTLE_ENDIAN) + return vreinterpretq_u64_u8( + vrev64q_u8(vreinterpretq_u8_u64(val))); +#else + return val; +#endif +} + +inline uint64x2_t SIMON128_f(const uint64x2_t& val) +{ + return veorq_u64(RotateLeft64<2>(val), + vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val))); +} + +inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i); + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1); + + const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1); + x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2); + } + + if (rounds & 1) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1); + + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk); + std::swap(x1, y1); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); +} + +inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, + uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
+ uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + uint64x2_t x2 = UnpackLow64(block2, block3); + uint64x2_t y2 = UnpackHigh64(block2, block3); + uint64x2_t x3 = UnpackLow64(block4, block5); + uint64x2_t y3 = UnpackHigh64(block4, block5); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + for (int i = 0; i < static_cast(rounds & ~1) - 1; i += 2) + { + const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i); + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1); + y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1); + y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1); + + const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1); + x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2); + x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2); + x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2); + } + + if (rounds & 1) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); + + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk); + y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk); + y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk); + std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); + block2 = UnpackLow64(x2, y2); + block3 = UnpackHigh64(x2, y2); + block4 = UnpackLow64(x3, y3); + block5 = UnpackHigh64(x3, y3); +} + +inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + if (rounds & 1) + { + std::swap(x1, y1); + const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); + + y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1); + x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1); + + const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i); + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); +} + +inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, + uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
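+    // Decryption walks the schedule backwards: an odd trailing round is
+    // undone first (note the swap happens before the XORs here, where
+    // encryption swapped after), then the main loop consumes subkeys in
+    // reverse pairs, k[i+1] inverting the second half-round before k[i]
+    // inverts the first.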
+ uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + uint64x2_t x2 = UnpackLow64(block2, block3); + uint64x2_t y2 = UnpackHigh64(block2, block3); + uint64x2_t x3 = UnpackLow64(block4, block5); + uint64x2_t y3 = UnpackHigh64(block4, block5); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + if (rounds & 1) + { + std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1); + + y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1)); + y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2)); + y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1); + x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1); + x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1); + x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1); + + const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i); + y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2); + y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2); + y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); + block2 = UnpackLow64(x2, y2); + block3 = UnpackHigh64(x2, y2); + block4 = UnpackLow64(x3, y3); + block5 = UnpackHigh64(x3, y3); +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +// ***************************** IA-32 ***************************** // + +#if defined(CRYPTOPP_SSSE3_AVAILABLE) + +// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670 +#ifndef M128_CAST +# define M128_CAST(x) ((__m128i *)(void *)(x)) +#endif +#ifndef CONST_M128_CAST +# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) +#endif + +// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html +#ifndef DOUBLE_CAST +# define DOUBLE_CAST(x) ((double *)(void *)(x)) +#endif +#ifndef CONST_DOUBLE_CAST +# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x)) +#endif + +inline void Swap128(__m128i& a,__m128i& b) +{ +#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120) + // __m128i is an unsigned long long[2], and support for swapping it was not added until C++11. + // SunCC 12.1 - 12.3 fail to consume the swap; while SunCC 12.4 consumes it without -std=c++11. + vec_swap(a, b); +#else + std::swap(a, b); +#endif +} + +#if defined(CRYPTOPP_AVX512_ROTATE) +template +inline __m128i RotateLeft64(const __m128i& val) +{ + return _mm_rol_epi64(val, R); +} + +template +inline __m128i RotateRight64(const __m128i& val) +{ + return _mm_ror_epi64(val, R); +} +#else +template +inline __m128i RotateLeft64(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); +} + +template +inline __m128i RotateRight64(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateLeft64<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. 
+template <> +inline __m128i RotateRight64<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); + return _mm_shuffle_epi8(val, mask); +} +#endif // CRYPTOPP_AVX512_ROTATE + +inline __m128i SIMON128_f(const __m128i& v) +{ + return _mm_xor_si128(RotateLeft64<2>(v), + _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v))); +} + +inline void GCC_NO_UBSAN SIMON128_Enc_Block(__m128i &block0, __m128i &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const __m128i rk1 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1); + + const __m128i rk2 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1))); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2); + } + + if (rounds & 1) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+rounds-1))); + + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk); + Swap128(x1, y1); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); +} + +inline void GCC_NO_UBSAN SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + __m128i x2 = _mm_unpackhi_epi64(block2, block3); + __m128i y2 = _mm_unpacklo_epi64(block2, block3); + __m128i x3 = _mm_unpackhi_epi64(block4, block5); + __m128i y3 = _mm_unpacklo_epi64(block4, block5); + + for (int i = 0; i < static_cast(rounds & ~1) - 1; i += 2) + { + const __m128i rk1 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1); + + const __m128i rk2 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1))); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2); + x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2); + } + + if (rounds & 1) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk); + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
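+    // (On the rk1/rk2 broadcasts above: _mm_loaddup_pd is a single movddup
+    // load that duplicates the 64-bit subkey into both lanes, and the
+    // _mm_castpd_si128 is free. The more direct _mm_set1_epi64x was long
+    // unavailable in 32-bit MSVC builds, hence the detour through the
+    // double domain.)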
+ block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); + block2 = _mm_unpacklo_epi64(y2, x2); + block3 = _mm_unpackhi_epi64(y2, x2); + block4 = _mm_unpacklo_epi64(y3, x3); + block5 = _mm_unpackhi_epi64(y3, x3); +} + +inline void GCC_NO_UBSAN SIMON128_Dec_Block(__m128i &block0, __m128i &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + + if (rounds & 1) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1))); + + Swap128(x1, y1); + y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const __m128i rk1 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1))); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1); + + const __m128i rk2 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); +} + +inline void GCC_NO_UBSAN SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + __m128i x2 = _mm_unpackhi_epi64(block2, block3); + __m128i y2 = _mm_unpacklo_epi64(block2, block3); + __m128i x3 = _mm_unpackhi_epi64(block4, block5); + __m128i y3 = _mm_unpacklo_epi64(block4, block5); + + if (rounds & 1) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1))); + + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); + y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1)); + y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2)); + y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const __m128i rk1 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1))); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1); + x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1); + + const __m128i rk2 = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i))); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
+ block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); + block2 = _mm_unpacklo_epi64(y2, x2); + block3 = _mm_unpackhi_epi64(y2, x2); + block4 = _mm_unpacklo_epi64(y3, x3); + block5 = _mm_unpackhi_epi64(y3, x3); +} + +#endif // CRYPTOPP_SSSE3_AVAILABLE + +#if defined(CRYPTOPP_SSE41_AVAILABLE) + +template +inline __m128i RotateLeft32(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); +} + +template +inline __m128i RotateRight32(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateLeft32<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateRight32<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1); + return _mm_shuffle_epi8(val, mask); +} + +inline __m128i SIMON64_f(const __m128i& v) +{ + return _mm_xor_si128(RotateLeft32<2>(v), + _mm_and_si128(RotateLeft32<1>(v), RotateLeft32<8>(v))); +} + +inline void GCC_NO_UBSAN SIMON64_Enc_Block(__m128i &block0, __m128i &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const __m128i rk1 = _mm_set1_epi32(subkeys[i]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1); + + const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2); + } + + if (rounds & 1) + { + const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk); + Swap128(x1, y1); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); +} + +inline void GCC_NO_UBSAN SIMON64_Dec_Block(__m128i &block0, __m128i &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
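+    // _mm_shuffle_ps does the deinterleave: it is the only flexible
+    // two-input 32-bit shuffle below AVX, so the blocks are bit-cast to
+    // float, _MM_SHUFFLE(3,1,3,1) gathers the odd-indexed words of both
+    // inputs and _MM_SHUFFLE(2,0,2,0) the even-indexed ones. No arithmetic
+    // happens in the float domain, so the round-trip through __m128 is safe.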
+ const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + if (rounds & 1) + { + Swap128(x1, y1); + const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1); + + const __m128i rk2 = _mm_set1_epi32(subkeys[i]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); +} + +inline void GCC_NO_UBSAN SIMON64_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t2 = _mm_castsi128_ps(block2); + const __m128 t3 = _mm_castsi128_ps(block3); + __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + + for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) + { + const __m128i rk1 = _mm_set1_epi32(subkeys[i]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk1); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk1); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk1); + + const __m128i rk2 = _mm_set1_epi32(subkeys[i+1]); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk2); + x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk2); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk2); + } + + if (rounds & 1) + { + const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk); + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); + block2 = _mm_unpacklo_epi32(y2, x2); + block3 = _mm_unpackhi_epi32(y2, x2); + block4 = _mm_unpacklo_epi32(y3, x3); + block5 = _mm_unpackhi_epi32(y3, x3); +} + +inline void GCC_NO_UBSAN SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + 
const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t2 = _mm_castsi128_ps(block2); + const __m128 t3 = _mm_castsi128_ps(block3); + __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + + if (rounds & 1) + { + Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3); + const __m128i rk = _mm_set1_epi32(subkeys[rounds-1]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON64_f(x1)); + y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON64_f(x2)); + y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON64_f(x3)); + rounds--; + } + + for (int i = static_cast(rounds-2); i >= 0; i -= 2) + { + const __m128i rk1 = _mm_set1_epi32(subkeys[i+1]); + x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON64_f(y1)), rk1); + x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON64_f(y2)), rk1); + x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON64_f(y3)), rk1); + + const __m128i rk2 = _mm_set1_epi32(subkeys[i]); + y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON64_f(x1)), rk2); + y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON64_f(x2)), rk2); + y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON64_f(x3)), rk2); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); + block2 = _mm_unpacklo_epi32(y2, x2); + block3 = _mm_unpackhi_epi32(y2, x2); + block4 = _mm_unpacklo_epi32(y3, x3); + block5 = _mm_unpackhi_epi32(y3, x3); +} + +#endif // CRYPTOPP_SSE41_AVAILABLE + +ANONYMOUS_NAMESPACE_END + +/////////////////////////////////////////////////////////////////////// + +NAMESPACE_BEGIN(CryptoPP) + +// *************************** ARM NEON **************************** // + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks64_6x2_NEON(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + 
+    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
+// ***************************** IA-32 ***************************** //
+
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_6x2_SSE(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_SSE(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+NAMESPACE_END
diff --git a/simon.cpp b/simon.cpp
new file mode 100644
index 00000000..438e9197
--- /dev/null
+++ b/simon.cpp
@@ -0,0 +1,463 @@
+// simon.cpp - written and placed in the public domain by Jeffrey Walton
+
+#include "pch.h"
+#include "config.h"
+
+#include "simon.h"
+#include "misc.h"
+#include "cpu.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both simon.cpp and simon-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::word32;
+using CryptoPP::word64;
+using CryptoPP::rotlConstant;
+using CryptoPP::rotrConstant;
+
+/// \brief Round transformation helper
+/// \tparam W word type
+/// \param v value
+template <class W>
+inline W f(const W v)
+{
+    return (rotlConstant<1>(v) & rotlConstant<8>(v)) ^ rotlConstant<2>(v);
+}
+
+/// \brief Round transformation
+/// \tparam W word type
+/// \param x value
+/// \param y value
+/// \param k value
+/// \param l value
+template <class W>
+inline void R2(W& x, W& y, const W k, const W l)
+{
+    y ^= f(x); y ^= k;
+    x ^= f(y); x ^= l;
+}
+
+/// \brief Forward transformation
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param c output array
+/// \param p input array
+/// \param k subkey array
+template <class W, unsigned int R>
+inline void SIMON_Encrypt(W c[2], const W p[2], const W k[R])
+{
+    c[0]=p[0]; c[1]=p[1];
+
+    for (int i = 0; i < static_cast<int>(R-1); i += 2)
+        R2(c[0], c[1], k[i], k[i + 1]);
+
+    if (R & 1)
+    {
+        c[1] ^= f(c[0]); c[1] ^= k[R-1];
+        W t = c[0]; c[0] = c[1]; c[1] = t;
+    }
+}
+
+/// \brief Reverse transformation
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param p output array
+/// \param c input array
+/// \param k subkey array
+template <class W, unsigned int R>
+inline void SIMON_Decrypt(W p[2], const W c[2], const W k[R])
+{
+    p[0]=c[0]; p[1]=c[1];
+    unsigned int rounds = R;
+
+    if (R & 1)
+    {
+        const W t = p[1]; p[1] = p[0]; p[0] = t;
+        p[1] ^= k[R - 1]; p[1] ^= f(p[0]);
+        rounds--;
+    }
+
+    for (int i = static_cast<int>(rounds - 2); i >= 0; i -= 2)
+        R2(p[1], p[0], k[i + 1], k[i]);
+}
+
+/// \brief Subkey generation function
+/// \details Used for SIMON-64 with 96-bit key and 42 rounds. A template was
+/// not worthwhile because all instantiations would need specialization.
+/// \param key empty subkey array
+/// \param k user key array
+inline void SIMON64_ExpandKey_3W(word32 key[42], const word32 k[3])
+{
+    const word32 c = 0xfffffffc;
+    word64 z = W64LIT(0x7369f885192c0ef5);
+
+    key[0] = k[2]; key[1] = k[1]; key[2] = k[0];
+    for (size_t i = 3; i<42; ++i)
+    {
+        key[i] = c ^ (z & 1) ^ key[i - 3] ^ rotrConstant<3>(key[i - 1]) ^ rotrConstant<4>(key[i - 1]);
+        z >>= 1;
+    }
+}
+
+/// \brief Subkey generation function
+/// \details Used for SIMON-64 with 128-bit key and 44 rounds. A template was
+/// not worthwhile because all instantiations would need specialization.
+/// \param key empty subkey array
+/// \param k user key array
+inline void SIMON64_ExpandKey_4W(word32 key[44], const word32 k[4])
+{
+    const word32 c = 0xfffffffc;
+    word64 z = W64LIT(0xfc2ce51207a635db);
+
+    key[0] = k[3]; key[1] = k[2]; key[2] = k[1]; key[3] = k[0];
+    for (size_t i = 4; i<44; ++i)
+    {
+        key[i] = c ^ (z & 1) ^ key[i - 4] ^ rotrConstant<3>(key[i - 1]) ^ key[i - 3] ^ rotrConstant<4>(key[i - 1]) ^ rotrConstant<1>(key[i - 3]);
+        z >>= 1;
+    }
+}
+
+/// \brief Subkey generation function
+/// \details Used for SIMON-128 with 128-bit key and 68 rounds. A template was
+/// not worthwhile because all instantiations would need specialization.
+/// \param key empty subkey array +/// \param k user key array +inline void SIMON128_ExpandKey_2W(word64 key[68], const word64 k[2]) +{ + const word64 c = W64LIT(0xfffffffffffffffc); + word64 z = W64LIT(0x7369f885192c0ef5); + + key[0] = k[1]; key[1] = k[0]; + for (size_t i=2; i<66; ++i) + { + key[i] = c ^ (z & 1) ^ key[i - 2] ^ rotrConstant<3>(key[i - 1]) ^ rotrConstant<4>(key[i - 1]); + z>>=1; + } + + key[66] = c ^ 1 ^ key[64] ^ rotrConstant<3>(key[65]) ^ rotrConstant<4>(key[65]); + key[67] = c^key[65] ^ rotrConstant<3>(key[66]) ^ rotrConstant<4>(key[66]); +} + +/// \brief Subkey generation function +/// \details Used for SIMON-128 with 192-bit key and 69 rounds. A template was +/// not worthwhile because all instantiations would need specialization. +/// \param key empty subkey array +/// \param k user key array +inline void SIMON128_ExpandKey_3W(word64 key[69], const word64 k[3]) +{ + const word64 c = W64LIT(0xfffffffffffffffc); + word64 z = W64LIT(0xfc2ce51207a635db); + + key[0]=k[2]; key[1]=k[1]; key[2]=k[0]; + for (size_t i=3; i<67; ++i) + { + key[i] = c ^ (z & 1) ^ key[i - 3] ^ rotrConstant<3>(key[i - 1]) ^ rotrConstant<4>(key[i - 1]); + z>>=1; + } + + key[67] = c^key[64] ^ rotrConstant<3>(key[66]) ^ rotrConstant<4>(key[66]); + key[68] = c ^ 1 ^ key[65] ^ rotrConstant<3>(key[67]) ^ rotrConstant<4>(key[67]); +} + +/// \brief Subkey generation function +/// \details Used for SIMON-128 with 256-bit key and 72 rounds. A template was +/// not worthwhile because all instantiations would need specialization. +/// \param key empty subkey array +/// \param k user key array +inline void SIMON128_ExpandKey_4W(word64 key[72], const word64 k[4]) +{ + const word64 c = W64LIT(0xfffffffffffffffc); + word64 z = W64LIT(0xfdc94c3a046d678b); + + key[0]=k[3]; key[1]=k[2]; key[2]=k[1]; key[3]=k[0]; + for (size_t i=4; i<68; ++i) + { + key[i] = c ^ (z & 1) ^ key[i - 4] ^ rotrConstant<3>(key[i - 1]) ^ key[i - 3] ^ rotrConstant<4>(key[i - 1]) ^ rotrConstant<1>(key[i - 3]); + z>>=1; + } + + key[68] = c^key[64] ^ rotrConstant<3>(key[67]) ^ key[65] ^ rotrConstant<4>(key[67]) ^ rotrConstant<1>(key[65]); + key[69] = c ^ 1 ^ key[65] ^ rotrConstant<3>(key[68]) ^ key[66] ^ rotrConstant<4>(key[68]) ^ rotrConstant<1>(key[66]); + key[70] = c^key[66] ^ rotrConstant<3>(key[69]) ^ key[67] ^ rotrConstant<4>(key[69]) ^ rotrConstant<1>(key[67]); + key[71] = c^key[67] ^ rotrConstant<3>(key[70]) ^ key[68] ^ rotrConstant<4>(key[70]) ^ rotrConstant<1>(key[68]); +} + +ANONYMOUS_NAMESPACE_END + +/////////////////////////////////////////////////////////// + +NAMESPACE_BEGIN(CryptoPP) + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +extern size_t SIMON64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); + +extern size_t SIMON64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +extern size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); + +extern size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + +#if defined(CRYPTOPP_SSE41_AVAILABLE) +extern size_t SIMON64_Enc_AdvancedProcessBlocks_SSE41(const word32* 
subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+extern size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
+{
+    CRYPTOPP_ASSERT(keyLength == 12 || keyLength == 16);
+    CRYPTOPP_UNUSED(params);
+
+    // Building the key schedule table requires {3,4} words workspace.
+    // Encrypting and decrypting requires 4 words workspace.
+    m_kwords = keyLength/sizeof(word32);
+    m_wspace.New(4U);
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian, false> KeyBlock;
+    KeyBlock kblk(userKey);
+
+    switch (m_kwords)
+    {
+    case 3:
+        m_rkeys.New((m_rounds = 42));
+        kblk(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SIMON64_ExpandKey_3W(m_rkeys, m_wspace);
+        break;
+    case 4:
+        m_rkeys.New((m_rounds = 44));
+        kblk(m_wspace[3])(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SIMON64_ExpandKey_4W(m_rkeys, m_wspace);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+}
+
+void SIMON64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian, false> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 42:
+        SIMON_Encrypt<word32, 42>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 44:
+        SIMON_Encrypt<word32, 44>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word32, BigEndian, false> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+void SIMON64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian, false> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 42:
+        SIMON_Decrypt<word32, 42>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 44:
+        SIMON_Decrypt<word32, 44>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word32, BigEndian, false> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+///////////////////////////////////////////////////////////
+
+void SIMON128::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
+{
+    CRYPTOPP_ASSERT(keyLength == 16 || keyLength == 24 || keyLength == 32);
+    CRYPTOPP_UNUSED(params);
+
+    // Building the key schedule table requires {2,3,4} words workspace.
+    // Encrypting and decrypting requires 4 words workspace.
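+    // For example, SIMON-128(192) arrives with keyLength=24 so m_kwords=3;
+    // the key words land reversed in m_wspace[0..2] per the paper's ordering,
+    // and SIMON128_ExpandKey_3W() below then fills m_rkeys with the 69 round
+    // keys.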
+    m_kwords = keyLength/sizeof(word64);
+    m_wspace.New(4U);
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian, false> KeyBlock;
+    KeyBlock kblk(userKey);
+
+    switch (m_kwords)
+    {
+    case 2:
+        m_rkeys.New((m_rounds = 68));
+        kblk(m_wspace[1])(m_wspace[0]);
+        SIMON128_ExpandKey_2W(m_rkeys, m_wspace);
+        break;
+    case 3:
+        m_rkeys.New((m_rounds = 69));
+        kblk(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SIMON128_ExpandKey_3W(m_rkeys, m_wspace);
+        break;
+    case 4:
+        m_rkeys.New((m_rounds = 72));
+        kblk(m_wspace[3])(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SIMON128_ExpandKey_4W(m_rkeys, m_wspace);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+}
+
+void SIMON128::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian, false> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 68:
+        SIMON_Encrypt<word64, 68>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 69:
+        SIMON_Encrypt<word64, 69>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 72:
+        SIMON_Encrypt<word64, 72>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word64, BigEndian, false> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+void SIMON128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian, false> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 68:
+        SIMON_Decrypt<word64, 68>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 69:
+        SIMON_Decrypt<word64, 69>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 72:
+        SIMON_Decrypt<word64, 72>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word64, BigEndian, false> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+#if defined(CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS)
+size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SIMON64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SIMON64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
+
+#if defined(CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS)
+size_t SIMON128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+    if (HasSSSE3())
+        return SIMON128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SIMON128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+    if (HasSSSE3())
+        return SIMON128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SIMON128_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
+
+NAMESPACE_END
diff --git a/simon.h b/simon.h
new file mode 100644
index 00000000..19f52e3c
--- /dev/null
+++ b/simon.h
@@ -0,0 +1,180 @@
+// simon.h - written and placed in the public domain by Jeffrey Walton
+
+/// \file simon.h
+/// \brief Classes for the Simon block cipher
+/// \details Simon is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \sa The SIMON and SPECK Families of
+/// Lightweight Block Ciphers,
+/// The Simon and Speck GitHub and
+/// SIMON on the Crypto++ wiki.
+/// \since Crypto++ 6.0
+
+#ifndef CRYPTOPP_SIMON_H
+#define CRYPTOPP_SIMON_H
+
+#include "config.h"
+#include "seckey.h"
+#include "secblock.h"
+
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+# define CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+# define CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
+NAMESPACE_BEGIN(CryptoPP)
+
+/// \brief SIMON block cipher information
+/// \tparam L block size of the cipher, in bytes
+/// \tparam D default key length, in bytes
+/// \tparam N minimum key length, in bytes
+/// \tparam M maximum key length, in bytes
+/// \since Crypto++ 6.0
+template <unsigned int L, unsigned int D, unsigned int N, unsigned int M>
+struct SIMON_Info : public FixedBlockSize<L>, VariableKeyLength<D, N, M>
+{
+    static const std::string StaticAlgorithmName()
+    {
+        // Format is Cipher-Blocksize(Keylength)
+        return "SIMON-" + IntToString(L*8);
+    }
+};
+
+/// \brief SIMON block cipher base class
+/// \tparam W the word type
+/// \details User code should use SIMON64 or SIMON128
+/// \sa SIMON64, SIMON128, SIMON on the Crypto++ wiki
+/// \since Crypto++ 6.0
+template <class W>
+struct SIMON_Base
+{
+    virtual ~SIMON_Base() {}
+    SIMON_Base() : m_kwords(0), m_rounds(0) {}
+
+    typedef SecBlock<W, AllocatorWithCleanup<W, true> > AlignedSecBlock;
+    mutable AlignedSecBlock m_wspace;  // workspace
+    AlignedSecBlock m_rkeys;           // round keys
+    unsigned int m_kwords;             // number of key words
+    unsigned int m_rounds;             // number of rounds
+};
+
+/// \brief SIMON 64-bit block cipher
+/// \details Simon is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \details SIMON64 provides 64-bit block size.
+/// The valid key sizes are 96-bit and 128-bit.
+/// \sa SIMON64, SIMON128, The SIMON and SPECK
+/// Families of Lightweight Block Ciphers,
+/// The Simon and Speck GitHub, SIMON on the
+/// Crypto++ wiki
+/// \since Crypto++ 6.0
+class CRYPTOPP_NO_VTABLE SIMON64 : public SIMON_Info<8, 12, 12, 16>, public BlockCipherDocumentation
+{
+public:
+    /// \brief SIMON block cipher transformation functions
+    /// \details Provides implementation common to encryption and decryption
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Base : protected SIMON_Base<word32>, public BlockCipherImpl<SIMON_Info<8, 12, 12, 16> >
+    {
+    public:
+        std::string AlgorithmName() const {
+            return StaticAlgorithmName() + (m_kwords == 0 ? "" :
+                "(" + IntToString(m_kwords*sizeof(word32)*8) + ")");
+        }
+
+    protected:
+        void UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params);
+    };
+
+    /// \brief Provides implementation for encryption transformation
+    /// \details Enc provides implementation for encryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Enc : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    /// \brief Provides implementation for decryption transformation
+    /// \details Dec provides implementation for decryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Dec : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SIMON64_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
+    typedef BlockCipherFinal<DECRYPTION, Dec> Decryption;
+};
+
+/// \brief SIMON 128-bit block cipher
+/// \details Simon is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \details SIMON128 provides 128-bit block size. The valid key sizes are 128-bit, 192-bit and 256-bit.
+/// \sa SIMON64, SIMON128, The SIMON and SPECK
+/// Families of Lightweight Block Ciphers,
+/// The Simon and Speck GitHub, SIMON on the
+/// Crypto++ wiki
+/// \since Crypto++ 6.0
+class CRYPTOPP_NO_VTABLE SIMON128 : public SIMON_Info<16, 16, 16, 32>, public BlockCipherDocumentation
+{
+public:
+    /// \brief SIMON block cipher transformation functions
+    /// \details Provides implementation common to encryption and decryption
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Base : protected SIMON_Base<word64>, public BlockCipherImpl<SIMON_Info<16, 16, 16, 32> >
+    {
+    public:
+        std::string AlgorithmName() const {
+            return StaticAlgorithmName() + (m_kwords == 0 ? "" :
+                "(" + IntToString(m_kwords*sizeof(word64)*8) + ")");
+        }
+
+    protected:
+        void UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params);
+    };
+
+    /// \brief Provides implementation for encryption transformation
+    /// \details Enc provides implementation for encryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Enc : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    /// \brief Provides implementation for decryption transformation
+    /// \details Dec provides implementation for decryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Dec : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SIMON128_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
+    typedef BlockCipherFinal<DECRYPTION, Dec> Decryption;
+};
+
+NAMESPACE_END
+
+#endif // CRYPTOPP_SIMON_H
diff --git a/speck-simd.cpp b/speck-simd.cpp
new file mode 100644
index 00000000..e60ad2f8
--- /dev/null
+++ b/speck-simd.cpp
@@ -0,0 +1,1031 @@
+// speck-simd.cpp - written and placed in the public domain by Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate
+// source file is needed because additional CXXFLAGS are required to enable
+// the appropriate instruction sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+
+#include "speck.h"
+#include "misc.h"
+#include "adv-simd.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both speck.cpp and speck-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+#if (CRYPTOPP_SSSE3_AVAILABLE)
+# include <pmmintrin.h>
+# include <tmmintrin.h>
+#endif
+
+#if (CRYPTOPP_SSE41_AVAILABLE)
+# include <smmintrin.h>
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512VL__)
+# define CRYPTOPP_AVX512_ROTATE 1
+# include <immintrin.h>
+#endif
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include <arm_neon.h>
+#endif
+
+// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
+// compilers don't follow ACLE conventions for the include.
+#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
+# include <stdint.h>
+# include <arm_acle.h>
+#endif
+
+// https://www.spinics.net/lists/gcchelp/msg47735.html and
+// https://www.spinics.net/lists/gcchelp/msg47749.html
+#if (CRYPTOPP_GCC_VERSION >= 40900)
+# define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
+#else
+# define GCC_NO_UBSAN
+#endif
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::byte;
+using CryptoPP::word32;
+using CryptoPP::word64;
+
+// *************************** ARM NEON ************************** //
+
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+
+template <unsigned int R>
+inline uint32x4_t RotateLeft32(const uint32x4_t& val)
+{
+    const uint32x4_t a(vshlq_n_u32(val, R));
+    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
+    return vorrq_u32(a, b);
+}
+
+template <unsigned int R>
+inline uint32x4_t RotateRight32(const uint32x4_t& val)
+{
+    const uint32x4_t a(vshlq_n_u32(val, 32 - R));
+    const uint32x4_t b(vshrq_n_u32(val, R));
+    return vorrq_u32(a, b);
+}
+
+#if defined(__aarch32__) || defined(__aarch64__)
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <> +inline uint32x4_t RotateLeft32<8>(const uint32x4_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u32_u8( + vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline uint32x4_t RotateRight32<8>(const uint32x4_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u32_u8( + vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); +} +#endif // Aarch32 or Aarch64 + +inline uint32x4_t Shuffle32(const uint32x4_t& val) +{ +#if defined(CRYPTOPP_LITTLE_ENDIAN) + return vreinterpretq_u32_u8( + vrev32q_u8(vreinterpretq_u8_u32(val))); +#else + return val; +#endif +} + +inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + for (int i=0; i < static_cast(rounds); ++i) + { + const uint32x4_t rk = vdupq_n_u32(subkeys[i]); + + x1 = RotateRight32<8>(x1); + x1 = vaddq_u32(x1, y1); + x1 = veorq_u32(x1, rk); + y1 = RotateLeft32<3>(y1); + y1 = veorq_u32(y1, x1); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; +} + +inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const uint32x4_t rk = vdupq_n_u32(subkeys[i]); + + y1 = veorq_u32(y1, x1); + y1 = RotateRight32<3>(y1); + x1 = veorq_u32(x1, rk); + x1 = vsubq_u32(x1, y1); + x1 = RotateLeft32<8>(x1); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; +} + +inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. 
If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + uint32x4_t x2 = vuzpq_u32(block2, block3).val[0]; + uint32x4_t y2 = vuzpq_u32(block2, block3).val[1]; + uint32x4_t x3 = vuzpq_u32(block4, block5).val[0]; + uint32x4_t y3 = vuzpq_u32(block4, block5).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + for (int i=0; i < static_cast(rounds); ++i) + { + const uint32x4_t rk = vdupq_n_u32(subkeys[i]); + + x1 = RotateRight32<8>(x1); + x2 = RotateRight32<8>(x2); + x3 = RotateRight32<8>(x3); + x1 = vaddq_u32(x1, y1); + x2 = vaddq_u32(x2, y2); + x3 = vaddq_u32(x3, y3); + x1 = veorq_u32(x1, rk); + x2 = veorq_u32(x2, rk); + x3 = veorq_u32(x3, rk); + y1 = RotateLeft32<3>(y1); + y2 = RotateLeft32<3>(y2); + y3 = RotateLeft32<3>(y3); + y1 = veorq_u32(y1, x1); + y2 = veorq_u32(y2, x2); + y3 = veorq_u32(y3, x3); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; + block2 = vzipq_u32(x2, y2).val[0]; + block3 = vzipq_u32(x2, y2).val[1]; + block4 = vzipq_u32(x3, y3).val[0]; + block5 = vzipq_u32(x3, y3).val[1]; +} + +inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1, + uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4, uint32x4_t &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. If only a single block is available then + // a Zero block is provided to promote vectorizations. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
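+    // vuzpq_u32 de-interleaves even and odd lanes; e.g.
+    // vuzpq_u32([A1 A2 A3 A4], [B1 B2 B3 B4]).val[0] = [A1 A3 B1 B3] and
+    // .val[1] = [A2 A4 B2 B4], which is exactly the x/y word split the
+    // round function expects.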
+ uint32x4_t x1 = vuzpq_u32(block0, block1).val[0]; + uint32x4_t y1 = vuzpq_u32(block0, block1).val[1]; + uint32x4_t x2 = vuzpq_u32(block2, block3).val[0]; + uint32x4_t y2 = vuzpq_u32(block2, block3).val[1]; + uint32x4_t x3 = vuzpq_u32(block4, block5).val[0]; + uint32x4_t y3 = vuzpq_u32(block4, block5).val[1]; + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const uint32x4_t rk = vdupq_n_u32(subkeys[i]); + + y1 = veorq_u32(y1, x1); + y2 = veorq_u32(y2, x2); + y3 = veorq_u32(y3, x3); + y1 = RotateRight32<3>(y1); + y2 = RotateRight32<3>(y2); + y3 = RotateRight32<3>(y3); + x1 = veorq_u32(x1, rk); + x2 = veorq_u32(x2, rk); + x3 = veorq_u32(x3, rk); + x1 = vsubq_u32(x1, y1); + x2 = vsubq_u32(x2, y2); + x3 = vsubq_u32(x3, y3); + x1 = RotateLeft32<8>(x1); + x2 = RotateLeft32<8>(x2); + x3 = RotateLeft32<8>(x3); + } + + x1 = Shuffle32(x1); y1 = Shuffle32(y1); + x2 = Shuffle32(x2); y2 = Shuffle32(y2); + x3 = Shuffle32(x3); y3 = Shuffle32(y3); + + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = vzipq_u32(x1, y1).val[0]; + block1 = vzipq_u32(x1, y1).val[1]; + block2 = vzipq_u32(x2, y2).val[0]; + block3 = vzipq_u32(x2, y2).val[1]; + block4 = vzipq_u32(x3, y3).val[0]; + block5 = vzipq_u32(x3, y3).val[1]; +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) + +template +inline T UnpackHigh64(const T& a, const T& b) +{ + const uint64x1_t x(vget_high_u64((uint64x2_t)a)); + const uint64x1_t y(vget_high_u64((uint64x2_t)b)); + return (T)vcombine_u64(x, y); +} + +template +inline T UnpackLow64(const T& a, const T& b) +{ + const uint64x1_t x(vget_low_u64((uint64x2_t)a)); + const uint64x1_t y(vget_low_u64((uint64x2_t)b)); + return (T)vcombine_u64(x, y); +} + +template +inline uint64x2_t RotateLeft64(const uint64x2_t& val) +{ + const uint64x2_t a(vshlq_n_u64(val, R)); + const uint64x2_t b(vshrq_n_u64(val, 64 - R)); + return vorrq_u64(a, b); +} + +template +inline uint64x2_t RotateRight64(const uint64x2_t& val) +{ + const uint64x2_t a(vshlq_n_u64(val, 64 - R)); + const uint64x2_t b(vshrq_n_u64(val, R)); + return vorrq_u64(a, b); +} + +#if defined(__aarch32__) || defined(__aarch64__) +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u64_u8( + vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. 
+template <> +inline uint64x2_t RotateRight64<8>(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_BIG_ENDIAN) + const uint8_t maskb[16] = { 8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1 }; + const uint8x16_t mask = vld1q_u8(maskb); +#else + const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 }; + const uint8x16_t mask = vld1q_u8(maskb); +#endif + + return vreinterpretq_u64_u8( + vqtbl1q_u8(vreinterpretq_u8_u64(val), mask)); +} +#endif + +inline uint64x2_t Shuffle64(const uint64x2_t& val) +{ +#if defined(CRYPTOPP_LITTLE_ENDIAN) + return vreinterpretq_u64_u8( + vrev64q_u8(vreinterpretq_u8_u64(val))); +#else + return val; +#endif +} + +inline void SPECK128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + for (int i=0; i < static_cast(rounds); ++i) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+i); + + x1 = RotateRight64<8>(x1); + x1 = vaddq_u64(x1, y1); + x1 = veorq_u64(x1, rk); + y1 = RotateLeft64<3>(y1); + y1 = veorq_u64(y1, x1); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); +} + +inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, + uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + uint64x2_t x2 = UnpackLow64(block2, block3); + uint64x2_t y2 = UnpackHigh64(block2, block3); + uint64x2_t x3 = UnpackLow64(block4, block5); + uint64x2_t y3 = UnpackHigh64(block4, block5); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + for (int i=0; i < static_cast(rounds); ++i) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+i); + + x1 = RotateRight64<8>(x1); + x2 = RotateRight64<8>(x2); + x3 = RotateRight64<8>(x3); + x1 = vaddq_u64(x1, y1); + x2 = vaddq_u64(x2, y2); + x3 = vaddq_u64(x3, y3); + x1 = veorq_u64(x1, rk); + x2 = veorq_u64(x2, rk); + x3 = veorq_u64(x3, rk); + y1 = RotateLeft64<3>(y1); + y2 = RotateLeft64<3>(y2); + y3 = RotateLeft64<3>(y3); + y1 = veorq_u64(y1, x1); + y2 = veorq_u64(y2, x2); + y3 = veorq_u64(y3, x3); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); + block2 = UnpackLow64(x2, y2); + block3 = UnpackHigh64(x2, y2); + block4 = UnpackLow64(x3, y3); + block5 = UnpackHigh64(x3, y3); +} + +inline void SPECK128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. 
Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+i); + + y1 = veorq_u64(y1, x1); + y1 = RotateRight64<3>(y1); + x1 = veorq_u64(x1, rk); + x1 = vsubq_u64(x1, y1); + x1 = RotateLeft64<8>(x1); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); +} + +inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1, + uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + uint64x2_t x1 = UnpackLow64(block0, block1); + uint64x2_t y1 = UnpackHigh64(block0, block1); + uint64x2_t x2 = UnpackLow64(block2, block3); + uint64x2_t y2 = UnpackHigh64(block2, block3); + uint64x2_t x3 = UnpackLow64(block4, block5); + uint64x2_t y3 = UnpackHigh64(block4, block5); + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const uint64x2_t rk = vld1q_dup_u64(subkeys+i); + + y1 = veorq_u64(y1, x1); + y2 = veorq_u64(y2, x2); + y3 = veorq_u64(y3, x3); + y1 = RotateRight64<3>(y1); + y2 = RotateRight64<3>(y2); + y3 = RotateRight64<3>(y3); + x1 = veorq_u64(x1, rk); + x2 = veorq_u64(x2, rk); + x3 = veorq_u64(x3, rk); + x1 = vsubq_u64(x1, y1); + x2 = vsubq_u64(x2, y2); + x3 = vsubq_u64(x3, y3); + x1 = RotateLeft64<8>(x1); + x2 = RotateLeft64<8>(x2); + x3 = RotateLeft64<8>(x3); + } + + x1 = Shuffle64(x1); y1 = Shuffle64(y1); + x2 = Shuffle64(x2); y2 = Shuffle64(y2); + x3 = Shuffle64(x3); y3 = Shuffle64(y3); + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
+ block0 = UnpackLow64(x1, y1); + block1 = UnpackHigh64(x1, y1); + block2 = UnpackLow64(x2, y2); + block3 = UnpackHigh64(x2, y2); + block4 = UnpackLow64(x3, y3); + block5 = UnpackHigh64(x3, y3); +} + +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +// ***************************** IA-32 ***************************** // + +#if defined(CRYPTOPP_SSSE3_AVAILABLE) + +// Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670 +#ifndef M128_CAST +# define M128_CAST(x) ((__m128i *)(void *)(x)) +#endif +#ifndef CONST_M128_CAST +# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x)) +#endif + +// GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html +#ifndef DOUBLE_CAST +# define DOUBLE_CAST(x) ((double *)(void *)(x)) +#endif +#ifndef CONST_DOUBLE_CAST +# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x)) +#endif + +#if defined(CRYPTOPP_AVX512_ROTATE) +template +inline __m128i RotateLeft64(const __m128i& val) +{ + return _mm_rol_epi64(val, R); +} + +template +inline __m128i RotateRight64(const __m128i& val) +{ + return _mm_ror_epi64(val, R); +} +#else +template +inline __m128i RotateLeft64(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R)); +} + +template +inline __m128i RotateRight64(const __m128i& val) +{ + return _mm_or_si128( + _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R)); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateLeft64<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7); + return _mm_shuffle_epi8(val, mask); +} + +// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks. +template <> +inline __m128i RotateRight64<8>(const __m128i& val) +{ + const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1); + return _mm_shuffle_epi8(val, mask); +} + +#endif // CRYPTOPP_AVX512_ROTATE + +inline void GCC_NO_UBSAN SPECK128_Enc_Block(__m128i &block0, __m128i &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + + for (int i=0; i < static_cast(rounds); ++i) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + + x1 = RotateRight64<8>(x1); + x1 = _mm_add_epi64(x1, y1); + x1 = _mm_xor_si128(x1, rk); + y1 = RotateLeft64<3>(y1); + y1 = _mm_xor_si128(y1, x1); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); +} + +inline void GCC_NO_UBSAN SPECK128_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... 
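+    // _mm_unpackhi_epi64(a, b) yields [a_hi b_hi] and _mm_unpacklo_epi64(a, b)
+    // yields [a_lo b_lo], so x1 gathers the high 64-bit halves of blocks A
+    // and B while y1 gathers the low halves, matching the diagram above.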
+ __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + __m128i x2 = _mm_unpackhi_epi64(block2, block3); + __m128i y2 = _mm_unpacklo_epi64(block2, block3); + __m128i x3 = _mm_unpackhi_epi64(block4, block5); + __m128i y3 = _mm_unpacklo_epi64(block4, block5); + + for (int i=0; i < static_cast(rounds); ++i) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + + x1 = RotateRight64<8>(x1); + x2 = RotateRight64<8>(x2); + x3 = RotateRight64<8>(x3); + x1 = _mm_add_epi64(x1, y1); + x2 = _mm_add_epi64(x2, y2); + x3 = _mm_add_epi64(x3, y3); + x1 = _mm_xor_si128(x1, rk); + x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); + y1 = RotateLeft64<3>(y1); + y2 = RotateLeft64<3>(y2); + y3 = RotateLeft64<3>(y3); + y1 = _mm_xor_si128(y1, x1); + y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); + block2 = _mm_unpacklo_epi64(y2, x2); + block3 = _mm_unpackhi_epi64(y2, x2); + block4 = _mm_unpacklo_epi64(y3, x3); + block5 = _mm_unpackhi_epi64(y3, x3); +} + +inline void GCC_NO_UBSAN SPECK128_Dec_Block(__m128i &block0, __m128i &block1, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + + y1 = _mm_xor_si128(y1, x1); + y1 = RotateRight64<3>(y1); + x1 = _mm_xor_si128(x1, rk); + x1 = _mm_sub_epi64(x1, y1); + x1 = RotateLeft64<8>(x1); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... + block0 = _mm_unpacklo_epi64(y1, x1); + block1 = _mm_unpackhi_epi64(y1, x1); +} + +inline void GCC_NO_UBSAN SPECK128_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word64 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. + // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ... + __m128i x1 = _mm_unpackhi_epi64(block0, block1); + __m128i y1 = _mm_unpacklo_epi64(block0, block1); + __m128i x2 = _mm_unpackhi_epi64(block2, block3); + __m128i y2 = _mm_unpacklo_epi64(block2, block3); + __m128i x3 = _mm_unpackhi_epi64(block4, block5); + __m128i y3 = _mm_unpacklo_epi64(block4, block5); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const __m128i rk = _mm_castpd_si128( + _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i))); + + y1 = _mm_xor_si128(y1, x1); + y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); + y1 = RotateRight64<3>(y1); + y2 = RotateRight64<3>(y2); + y3 = RotateRight64<3>(y3); + x1 = _mm_xor_si128(x1, rk); + x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); + x1 = _mm_sub_epi64(x1, y1); + x2 = _mm_sub_epi64(x2, y2); + x3 = _mm_sub_epi64(x3, y3); + x1 = RotateLeft64<8>(x1); + x2 = RotateLeft64<8>(x2); + x3 = RotateLeft64<8>(x3); + } + + // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ... 
+    block0 = _mm_unpacklo_epi64(y1, x1);
+    block1 = _mm_unpackhi_epi64(y1, x1);
+    block2 = _mm_unpacklo_epi64(y2, x2);
+    block3 = _mm_unpackhi_epi64(y2, x2);
+    block4 = _mm_unpacklo_epi64(y3, x3);
+    block5 = _mm_unpackhi_epi64(y3, x3);
+}
+
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+
+template <unsigned int R>
+inline __m128i RotateLeft32(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+}
+
+template <unsigned int R>
+inline __m128i RotateRight32(const __m128i& val)
+{
+    return _mm_or_si128(
+        _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateLeft32<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+// Faster than two Shifts and an Or. Thanks to Louis Wingers and Bryan Weeks.
+template <>
+inline __m128i RotateRight32<8>(const __m128i& val)
+{
+    const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
+    return _mm_shuffle_epi8(val, mask);
+}
+
+inline void GCC_NO_UBSAN SPECK64_Enc_Block(__m128i &block0, __m128i &block1,
+    const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. The incoming data was read from
+    // a big-endian byte array. Depending on the number of blocks it needs to
+    // be permuted to the following. Thanks to Peter Cordes for help with the
+    // SSE permutes below.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
+    const __m128 t0 = _mm_castsi128_ps(block0);
+    const __m128 t1 = _mm_castsi128_ps(block1);
+    __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1)));
+    __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0)));
+
+    for (int i=0; i < static_cast<int>(rounds); ++i)
+    {
+        const __m128i rk = _mm_set1_epi32(subkeys[i]);
+
+        x1 = RotateRight32<8>(x1);
+        x1 = _mm_add_epi32(x1, y1);
+        x1 = _mm_xor_si128(x1, rk);
+        y1 = RotateLeft32<3>(y1);
+        y1 = _mm_xor_si128(y1, x1);
+    }
+
+    // This is roughly the SSE equivalent to the ARM vzip32
+    // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
+    block0 = _mm_unpacklo_epi32(y1, x1);
+    block1 = _mm_unpackhi_epi32(y1, x1);
+}
+
+inline void GCC_NO_UBSAN SPECK64_Dec_Block(__m128i &block0, __m128i &block1,
+    const word32 *subkeys, unsigned int rounds)
+{
+    // Rearrange the data for vectorization. The incoming data was read from
+    // a big-endian byte array. Depending on the number of blocks it needs to
+    // be permuted to the following. Thanks to Peter Cordes for help with the
+    // SSE permutes below.
+    // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
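+    // As in the encrypt path, _MM_SHUFFLE(2,0,2,0) selects elements 0 and 2
+    // of each source (y1 = [A1 A3 B1 B3]) and _MM_SHUFFLE(3,1,3,1) selects
+    // elements 1 and 3 (x1 = [A2 A4 B2 B4]); the float casts cost nothing
+    // and exist only to satisfy the _mm_shuffle_ps() types.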
+ const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const __m128i rk = _mm_set1_epi32(subkeys[i]); + + y1 = _mm_xor_si128(y1, x1); + y1 = RotateRight32<3>(y1); + x1 = _mm_xor_si128(x1, rk); + x1 = _mm_sub_epi32(x1, y1); + x1 = RotateLeft32<8>(x1); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); +} + +inline void GCC_NO_UBSAN SPECK64_Enc_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... + const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t2 = _mm_castsi128_ps(block2); + const __m128 t3 = _mm_castsi128_ps(block3); + __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + + for (int i=0; i < static_cast(rounds); ++i) + { + const __m128i rk = _mm_set1_epi32(subkeys[i]); + + x1 = RotateRight32<8>(x1); + x2 = RotateRight32<8>(x2); + x3 = RotateRight32<8>(x3); + x1 = _mm_add_epi32(x1, y1); + x2 = _mm_add_epi32(x2, y2); + x3 = _mm_add_epi32(x3, y3); + x1 = _mm_xor_si128(x1, rk); + x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); + y1 = RotateLeft32<3>(y1); + y2 = RotateLeft32<3>(y2); + y3 = RotateLeft32<3>(y3); + y1 = _mm_xor_si128(y1, x1); + y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); + block2 = _mm_unpacklo_epi32(y2, x2); + block3 = _mm_unpackhi_epi32(y2, x2); + block4 = _mm_unpacklo_epi32(y3, x3); + block5 = _mm_unpackhi_epi32(y3, x3); +} + +inline void GCC_NO_UBSAN SPECK64_Dec_6_Blocks(__m128i &block0, __m128i &block1, + __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5, + const word32 *subkeys, unsigned int rounds) +{ + // Rearrange the data for vectorization. The incoming data was read from + // a big-endian byte array. Depending on the number of blocks it needs to + // be permuted to the following. Thanks to Peter Cordes for help with the + // SSE permutes below. + // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ... 
+ const __m128 t0 = _mm_castsi128_ps(block0); + const __m128 t1 = _mm_castsi128_ps(block1); + __m128i x1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,1,3,1))); + __m128i y1 = _mm_castps_si128(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t2 = _mm_castsi128_ps(block2); + const __m128 t3 = _mm_castsi128_ps(block3); + __m128i x2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(3,1,3,1))); + __m128i y2 = _mm_castps_si128(_mm_shuffle_ps(t2, t3, _MM_SHUFFLE(2,0,2,0))); + + const __m128 t4 = _mm_castsi128_ps(block4); + const __m128 t5 = _mm_castsi128_ps(block5); + __m128i x3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(3,1,3,1))); + __m128i y3 = _mm_castps_si128(_mm_shuffle_ps(t4, t5, _MM_SHUFFLE(2,0,2,0))); + + for (int i = static_cast(rounds-1); i >= 0; --i) + { + const __m128i rk = _mm_set1_epi32(subkeys[i]); + + y1 = _mm_xor_si128(y1, x1); + y2 = _mm_xor_si128(y2, x2); + y3 = _mm_xor_si128(y3, x3); + y1 = RotateRight32<3>(y1); + y2 = RotateRight32<3>(y2); + y3 = RotateRight32<3>(y3); + x1 = _mm_xor_si128(x1, rk); + x2 = _mm_xor_si128(x2, rk); + x3 = _mm_xor_si128(x3, rk); + x1 = _mm_sub_epi32(x1, y1); + x2 = _mm_sub_epi32(x2, y2); + x3 = _mm_sub_epi32(x3, y3); + x1 = RotateLeft32<8>(x1); + x2 = RotateLeft32<8>(x2); + x3 = RotateLeft32<8>(x3); + } + + // The is roughly the SSE equivalent to ARM vzp32 + // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4] + block0 = _mm_unpacklo_epi32(y1, x1); + block1 = _mm_unpackhi_epi32(y1, x1); + block2 = _mm_unpacklo_epi32(y2, x2); + block3 = _mm_unpackhi_epi32(y2, x2); + block4 = _mm_unpacklo_epi32(y3, x3); + block5 = _mm_unpackhi_epi32(y3, x3); +} + +#endif // CRYPTOPP_SSE41_AVAILABLE + +ANONYMOUS_NAMESPACE_END + +/////////////////////////////////////////////////////////////////////// + +NAMESPACE_BEGIN(CryptoPP) + +// *************************** ARM NEON **************************** // + +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks64_6x2_NEON(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} + +size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) +{ + return AdvancedProcessBlocks128_6x2_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks, + subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); +} +#endif // CRYPTOPP_ARM_NEON_AVAILABLE + +// ***************************** IA-32 ***************************** // + +#if defined(CRYPTOPP_SSE41_AVAILABLE) +size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const 
word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Enc_Block, SPECK64_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks64_6x2_SSE(SPECK64_Dec_Block, SPECK64_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return AdvancedProcessBlocks128_6x2_SSE(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SSSE3_AVAILABLE
+
+NAMESPACE_END
diff --git a/speck.cpp b/speck.cpp
new file mode 100644
index 00000000..9aaff67f
--- /dev/null
+++ b/speck.cpp
@@ -0,0 +1,438 @@
+// speck.cpp - written and placed in the public domain by Jeffrey Walton
+
+#include "pch.h"
+#include "config.h"
+
+#include "speck.h"
+#include "misc.h"
+#include "cpu.h"
+
+// Uncomment for benchmarking C++ against SSE or NEON.
+// Do so in both speck.cpp and speck-simd.cpp.
+// #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_SSE41_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+using CryptoPP::word32;
+using CryptoPP::word64;
+using CryptoPP::rotlConstant;
+using CryptoPP::rotrConstant;
+
+/// \brief Forward round transformation
+/// \tparam W word type
+/// \details TF83() is the forward round transformation using a=8 and b=3 rotations.
+/// The initial test implementation provided template parameters, but they were
+/// removed because SPECK32 using a=7 and b=2 was not on the road map. The
+/// additional template parameters also made calling SPECK_Encrypt and SPECK_Decrypt
+/// kind of messy.
+template <class W>
+inline void TF83(W& x, W& y, const W k)
+{
+    x = rotrConstant<8>(x);
+    x += y; x ^= k;
+    y = rotlConstant<3>(y);
+    y ^= x;
+}
+
+/// \brief Reverse round transformation
+/// \tparam W word type
+/// \details TR83() is the reverse round transformation using a=8 and b=3 rotations.
+/// The initial test implementation provided template parameters, but they were
+/// removed because SPECK32 using a=7 and b=2 was not on the road map. The
+/// additional template parameters also made calling SPECK_Encrypt and SPECK_Decrypt
+/// kind of messy.
+template <class W>
+inline void TR83(W& x, W& y, const W k)
+{
+    y ^= x;
+    y = rotrConstant<3>(y);
+    x ^= k; x -= y;
+    x = rotlConstant<8>(x);
+}
+
+/// \brief Forward transformation
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param c output array
+/// \param p input array
+/// \param k subkey array
+template <class W, unsigned int R>
+inline void SPECK_Encrypt(W c[2], const W p[2], const W k[R])
+{
+    c[0]=p[0]; c[1]=p[1];
+
+    // Don't unroll this loop. Things slow down.
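+    // Each iteration consumes one subkey and feeds its output into the next
+    // round, so the loop body is a single dependency chain. SPECK_Decrypt()
+    // below walks the same schedule in reverse with TR83(), which undoes
+    // TF83() step for step, so no separate decryption schedule is needed.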
+    for (int i = 0; i < static_cast<int>(R); ++i)
+        TF83(c[0], c[1], k[i]);
+}
+
+/// \brief Reverse transformation
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param p output array
+/// \param c input array
+/// \param k subkey array
+template <class W, unsigned int R>
+inline void SPECK_Decrypt(W p[2], const W c[2], const W k[R])
+{
+    p[0]=c[0]; p[1]=c[1];
+
+    // Don't unroll this loop. Things slow down.
+    for (int i = static_cast<int>(R-1); i >= 0; --i)
+        TR83(p[0], p[1], k[i]);
+}
+
+/// \brief Subkey generation function
+/// \details Used when the user key consists of 2 words
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param key empty subkey array
+/// \param k user key array
+template <class W, unsigned int R>
+inline void SPECK_ExpandKey_2W(W key[R], const W k[2])
+{
+    CRYPTOPP_ASSERT(R==32);
+    W i=0, B=k[0], A=k[1];
+
+    while (i < R-1)
+    {
+        key[i+0]=A; TF83(B, A, i+0);
+        i+=1;
+    }
+
+    key[R-1]=A;
+}
+
+/// \brief Subkey generation function
+/// \details Used when the user key consists of 3 words
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param key empty subkey array
+/// \param k user key array
+template <class W, unsigned int R>
+inline void SPECK_ExpandKey_3W(W key[R], const W k[3])
+{
+    CRYPTOPP_ASSERT(R==33 || R==26);
+    W i=0, C=k[0], B=k[1], A=k[2];
+
+    unsigned int blocks = R/2;
+    while (blocks--)
+    {
+        key[i+0]=A; TF83(B, A, i+0);
+        key[i+1]=A; TF83(C, A, i+1);
+        i+=2;
+    }
+
+    // The constexpr residue should allow the optimizer to remove unneeded statements
+    if(R%2 == 1)
+    {
+        key[R-1]=A;
+    }
+}
+
+/// \brief Subkey generation function
+/// \details Used when the user key consists of 4 words
+/// \tparam W word type
+/// \tparam R number of rounds
+/// \param key empty subkey array
+/// \param k user key array
+template <class W, unsigned int R>
+inline void SPECK_ExpandKey_4W(W key[R], const W k[4])
+{
+    CRYPTOPP_ASSERT(R==34 || R==27);
+    W i=0, D=k[0], C=k[1], B=k[2], A=k[3];
+
+    unsigned int blocks = R/3;
+    while (blocks--)
+    {
+        key[i+0]=A; TF83(B, A, i+0);
+        key[i+1]=A; TF83(C, A, i+1);
+        key[i+2]=A; TF83(D, A, i+2);
+        i+=3;
+    }
+
+    // The constexpr residue should allow the optimizer to remove unneeded statements
+    if(R%3 == 1)
+    {
+        key[R-1]=A;
+    }
+    else if(R%3 == 2)
+    {
+        key[R-2]=A; TF83(B, A, W(R-2));
+        key[R-1]=A;
+    }
+}
+
+ANONYMOUS_NAMESPACE_END
+
+///////////////////////////////////////////////////////////
+
+NAMESPACE_BEGIN(CryptoPP)
+
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+extern size_t SPECK64_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK64_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+extern size_t SPECK64_Enc_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+
+extern size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
+#endif
+
+void SPECK64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
+{
+    CRYPTOPP_ASSERT(keyLength == 12 || keyLength == 16);
+    CRYPTOPP_UNUSED(params);
+
+    // Building the key schedule table requires {3,4} words workspace.
+    // Encrypting and decrypting requires 4 words workspace.
+    m_kwords = keyLength/sizeof(word32);
+    m_wspace.New(4U);
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian> KeyBlock;
+    KeyBlock kblk(userKey);
+
+    switch (m_kwords)
+    {
+    case 3:
+        m_rkeys.New((m_rounds = 26));
+        kblk(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_3W<word32, 26>(m_rkeys, m_wspace);
+        break;
+    case 4:
+        m_rkeys.New((m_rounds = 27));
+        kblk(m_wspace[3])(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_4W<word32, 27>(m_rkeys, m_wspace);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+}
+
+void SPECK64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 26:
+        SPECK_Encrypt<word32, 26>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 27:
+        SPECK_Encrypt<word32, 27>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word32, BigEndian> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+void SPECK64::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word32, BigEndian> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 26:
+        SPECK_Decrypt<word32, 26>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 27:
+        SPECK_Decrypt<word32, 27>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word32, BigEndian> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+///////////////////////////////////////////////////////////
+
+void SPECK128::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params)
+{
+    CRYPTOPP_ASSERT(keyLength == 16 || keyLength == 24 || keyLength == 32);
+    CRYPTOPP_UNUSED(params);
+
+    // Building the key schedule table requires {2,3,4} words workspace.
+    // Encrypting and decrypting requires 4 words workspace.
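+    // The round count follows the number of key words: 2 words (128-bit key)
+    // selects 32 rounds, 3 words (192-bit) selects 33, and 4 words (256-bit)
+    // selects 34. See the switch below.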
+    m_kwords = keyLength/sizeof(word64);
+    m_wspace.New(4U);
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian> KeyBlock;
+    KeyBlock kblk(userKey);
+
+    switch (m_kwords)
+    {
+    case 2:
+        m_rkeys.New((m_rounds = 32));
+        kblk(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_2W<word64, 32>(m_rkeys, m_wspace);
+        break;
+    case 3:
+        m_rkeys.New((m_rounds = 33));
+        kblk(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_3W<word64, 33>(m_rkeys, m_wspace);
+        break;
+    case 4:
+        m_rkeys.New((m_rounds = 34));
+        kblk(m_wspace[3])(m_wspace[2])(m_wspace[1])(m_wspace[0]);
+        SPECK_ExpandKey_4W<word64, 34>(m_rkeys, m_wspace);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+}
+
+void SPECK128::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 32:
+        SPECK_Encrypt<word64, 32>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 33:
+        SPECK_Encrypt<word64, 33>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 34:
+        SPECK_Encrypt<word64, 34>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word64, BigEndian> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
+{
+    // Do the endian gyrations from the paper and align pointers
+    typedef GetBlock<word64, BigEndian> InBlock;
+    InBlock iblk(inBlock); iblk(m_wspace[1])(m_wspace[0]);
+
+    switch (m_rounds)
+    {
+    case 32:
+        SPECK_Decrypt<word64, 32>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 33:
+        SPECK_Decrypt<word64, 33>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    case 34:
+        SPECK_Decrypt<word64, 34>(m_wspace+2, m_wspace+0, m_rkeys);
+        break;
+    default:
+        CRYPTOPP_ASSERT(0);
+    }
+
+    // Do the endian gyrations from the paper and align pointers
+    typedef PutBlock<word64, BigEndian> OutBlock;
+    OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[3])(m_wspace[2]);
+}
+
+#if defined(CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS)
+size_t SPECK64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SPECK64_Enc_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SPECK64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSE41_AVAILABLE)
+    if (HasSSE41())
+        return SPECK64_Dec_AdvancedProcessBlocks_SSE41(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SPECK64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
+
+#if defined(CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS)
+size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+    if (HasSSSE3())
+        return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SPECK128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
+    byte *outBlocks, size_t length, word32 flags) const
+{
+#if defined(CRYPTOPP_SSSE3_AVAILABLE)
+    if (HasSSSE3())
+        return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+    if (HasNEON())
+        return SPECK128_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
+            inBlocks, xorBlocks, outBlocks, length, flags);
+#endif
+    return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
+
+NAMESPACE_END
diff --git a/speck.h b/speck.h
new file mode 100644
index 00000000..44847e9b
--- /dev/null
+++ b/speck.h
@@ -0,0 +1,180 @@
+// speck.h - written and placed in the public domain by Jeffrey Walton
+
+/// \file speck.h
+/// \brief Classes for the Speck block cipher
+/// \details Speck is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \sa The SIMON and SPECK Families of Lightweight Block Ciphers,
+/// The Simon and Speck GitHub and SPECK on the Crypto++ wiki.
+/// \since Crypto++ 6.0
+
+#ifndef CRYPTOPP_SPECK_H
+#define CRYPTOPP_SPECK_H
+
+#include "config.h"
+#include "seckey.h"
+#include "secblock.h"
+
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+# define CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
+#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86
+# define CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS 1
+#endif
+
+NAMESPACE_BEGIN(CryptoPP)
+
+/// \brief SPECK block cipher information
+/// \tparam L block size of the cipher, in bytes
+/// \tparam D default key length, in bytes
+/// \tparam N minimum key length, in bytes
+/// \tparam M maximum key length, in bytes
+/// \since Crypto++ 6.0
+template <unsigned int L, unsigned int D, unsigned int N, unsigned int M>
+struct SPECK_Info : public FixedBlockSize<L>, VariableKeyLength<D, N, M>
+{
+    static const std::string StaticAlgorithmName()
+    {
+        // Format is Cipher-Blocksize(Keylength)
+        return "SPECK-" + IntToString(L*8);
+    }
+};
+
+/// \brief SPECK block cipher base class
+/// \tparam W the word type
+/// \details User code should use SPECK64 or SPECK128
+/// \sa SPECK64, SPECK128, SPECK on the Crypto++ wiki
+/// \since Crypto++ 6.0
+template <class W>
+struct SPECK_Base
+{
+    virtual ~SPECK_Base() {}
+    SPECK_Base() : m_kwords(0), m_rounds(0) {}
+
+    typedef SecBlock<W, AllocatorWithCleanup<W, true> > AlignedSecBlock;
+    mutable AlignedSecBlock m_wspace;  // workspace
+    AlignedSecBlock m_rkeys;           // round keys
+    unsigned int m_kwords;             // number of key words
+    unsigned int m_rounds;             // number of rounds
+};
+
+/// \brief SPECK 64-bit block cipher
+/// \details Speck is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \details SPECK64 provides 64-bit block size. The valid key sizes are 96-bit and 128-bit.
+/// \sa SPECK64, SPECK128, The SIMON and SPECK Families of Lightweight
+/// Block Ciphers, The Simon and Speck GitHub, SPECK on the Crypto++ wiki
+/// \since Crypto++ 6.0
+class CRYPTOPP_NO_VTABLE SPECK64 : public SPECK_Info<8, 12, 12, 16>, public BlockCipherDocumentation
+{
+public:
+    /// \brief SPECK block cipher transformation functions
+    /// \details Provides implementation common to encryption and decryption
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Base : protected SPECK_Base<word32>, public BlockCipherImpl<SPECK_Info<8, 12, 12, 16> >
+    {
+    public:
+        std::string AlgorithmName() const {
+            return StaticAlgorithmName() + (m_kwords == 0 ? "" :
+                "(" + IntToString(m_kwords*sizeof(word32)*8) + ")");
+        }
+
+    protected:
+        void UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params);
+    };
+
+    /// \brief Provides implementation for encryption transformation
+    /// \details Enc provides implementation for encryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Enc : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    /// \brief Provides implementation for decryption transformation
+    /// \details Dec provides implementation for decryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Dec : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SPECK64_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
+    typedef BlockCipherFinal<DECRYPTION, Dec> Decryption;
+};
+
+/// \brief SPECK 128-bit block cipher
+/// \details Speck is a block cipher designed by Ray Beaulieu, Douglas Shors, Jason Smith,
+/// Stefan Treatman-Clark, Bryan Weeks and Louis Wingers.
+/// \details SPECK128 provides 128-bit block size. The valid key sizes are 128-bit, 192-bit and 256-bit.
+/// \sa SPECK64, SPECK128, The SIMON and SPECK Families of Lightweight
+/// Block Ciphers, The Simon and Speck GitHub, SPECK on the Crypto++ wiki
+/// \since Crypto++ 6.0
+class CRYPTOPP_NO_VTABLE SPECK128 : public SPECK_Info<16, 16, 16, 32>, public BlockCipherDocumentation
+{
+public:
+    /// \brief SPECK block cipher transformation functions
+    /// \details Provides implementation common to encryption and decryption
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Base : protected SPECK_Base<word64>, public BlockCipherImpl<SPECK_Info<16, 16, 16, 32> >
+    {
+    public:
+        std::string AlgorithmName() const {
+            return StaticAlgorithmName() + (m_kwords == 0 ? "" :
+                "(" + IntToString(m_kwords*sizeof(word64)*8) + ")");
+        }
+
+    protected:
+        void UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs &params);
+    };
+
+    /// \brief Provides implementation for encryption transformation
+    /// \details Enc provides implementation for encryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Enc : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    /// \brief Provides implementation for decryption transformation
+    /// \details Dec provides implementation for decryption transformation. All key
+    /// sizes are supported.
+    /// \since Crypto++ 6.0
+    class CRYPTOPP_NO_VTABLE Dec : public Base
+    {
+    protected:
+        void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
+#if CRYPTOPP_SPECK128_ADVANCED_PROCESS_BLOCKS
+        size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
+#endif
+    };
+
+    typedef BlockCipherFinal<ENCRYPTION, Enc> Encryption;
+    typedef BlockCipherFinal<DECRYPTION, Dec> Decryption;
+};
+
+NAMESPACE_END
+
+#endif // CRYPTOPP_SPECK_H
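
For reviewers who want to exercise the new classes: below is a minimal usage
sketch, not part of the patch. It assumes the library is built with this
commit applied, and it uses only the public BlockCipher interface; the names
"key" and "block" are illustrative.

    #include "speck.h"
    #include "secblock.h"
    #include "osrng.h"

    int main()
    {
        using namespace CryptoPP;

        // Random 128-bit key (SPECK128's default key length)
        AutoSeededRandomPool prng;
        SecByteBlock key(SPECK128::DEFAULT_KEYLENGTH);
        prng.GenerateBlock(key, key.size());

        // One 16-byte block, encrypted in place and then decrypted back
        byte block[SPECK128::BLOCKSIZE] = {0};

        SPECK128::Encryption enc;
        enc.SetKey(key, key.size());
        enc.ProcessBlock(block);

        SPECK128::Decryption dec;
        dec.SetKey(key, key.size());
        dec.ProcessBlock(block);  // block is all zeros again

        return 0;
    }

The same classes should drop into the mode templates in the usual way, for
example ECB_Mode<SPECK128>::Encryption from modes.h.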