From 304809a65dc34a73160edcf9f568e35d6e1af9b5 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Thu, 23 Nov 2017 02:47:44 -0500 Subject: [PATCH] Add NEON and ASIMD intrinsics for SPECK-128 (GH #538) Performance increased by about 115% on a 980 MHz BananaPi dev-board. Throughput went from about 46.2 cpb to about 21.5 cpb. --- GNUmakefile | 2 + speck-simd.cpp | 392 ++++++++++++++++++++++++++++++++++++++++++++++++- speck.cpp | 36 ++++- speck.h | 8 +- 4 files changed, 430 insertions(+), 8 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index c9ddc59f..638bb6c1 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -360,6 +360,7 @@ ifeq ($(IS_NEON),1) GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon endif endif @@ -369,6 +370,7 @@ ifeq ($(IS_ARMV8),1) ARIA_FLAG = -march=armv8-a BLAKE2_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a + SPECK_FLAG = -march=armv8-a endif HAVE_CRC = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(GREP) -i -c __ARM_FEATURE_CRC32) ifeq ($(HAVE_CRC),1) diff --git a/speck-simd.cpp b/speck-simd.cpp index ea32f6d2..521da70e 100644 --- a/speck-simd.cpp +++ b/speck-simd.cpp @@ -11,6 +11,21 @@ #include "speck.h" #include "misc.h" +// Uncomment for benchmarking C++ against SSE or NEON. +// Do so in both speck.cpp and speck-simd.cpp. +// #undef CRYPTOPP_SSSE3_AVAILABLE +// #undef CRYPTOPP_ARM_NEON_AVAILABLE + +// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is 3 cpb +// faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367. 
+#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
+# undef CRYPTOPP_ARM_NEON_AVAILABLE
+#endif
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include <arm_neon.h>
+#endif
+
 #if (CRYPTOPP_SSSE3_AVAILABLE)
 # include <tmmintrin.h>
 #endif
@@ -37,6 +52,359 @@ using CryptoPP::rotlFixed;
 using CryptoPP::rotrFixed;
 using CryptoPP::BlockTransformation;
 
+// *************************** ARM NEON ************************** //
+
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+const word32 s_one[] = {0, 0, 0, 1<<24};  // uint32x4_t
+#else
+const word32 s_one[] = {0, 0, 0, 1};  // uint32x4_t
+#endif
+
+template <class W, class T>
+inline W UnpackHigh64(const T& a, const T& b)
+{
+    const uint64x1_t x = vget_high_u64((uint64x2_t)a);
+    const uint64x1_t y = vget_high_u64((uint64x2_t)b);
+    return (W)vcombine_u64(x, y);
+}
+
+template <class W, class T>
+inline W UnpackLow64(const T& a, const T& b)
+{
+    const uint64x1_t x = vget_low_u64((uint64x2_t)a);
+    const uint64x1_t y = vget_low_u64((uint64x2_t)b);
+    return (W)vcombine_u64(x, y);
+}
+
+template <unsigned int R>
+inline uint64x2_t RotateLeft64(const uint64x2_t& val)
+{
+    CRYPTOPP_ASSERT(R < 64);
+    const uint64x2_t a(vshlq_n_u64(val, R));
+    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
+    return vorrq_u64(a, b);
+}
+
+template <unsigned int R>
+inline uint64x2_t RotateRight64(const uint64x2_t& val)
+{
+    CRYPTOPP_ASSERT(R < 64);
+    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
+    const uint64x2_t b(vshrq_n_u64(val, R));
+    return vorrq_u64(a, b);
+}
+
+inline uint64x2_t Shuffle64(const uint64x2_t& val)
+{
+    return vreinterpretq_u64_u8(
+        vrev64q_u8(vreinterpretq_u8_u64(val)));
+}
+
+inline void SPECK128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_NEON loads each SPECK-128 block into a
+    // uint64x2_t. We can't SSE over them, so we rearrange the data to allow packed operations.
+    // Its also easier to permute them in SPECK128_Enc_Block rather than the calling code.
+    // SPECK128_AdvancedProcessBlocks_NEON is rather messy. The zero block below is a
+    // "don't care". It is present so we can vectorize SPECK128_Enc_Block.
+    uint8x16_t block1 = {0};
+    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
+    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+
+    for (size_t i=0; static_cast<int>(i)<static_cast<int>(rounds); ++i)
+    {
+        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
+
+        x1 = RotateRight64<8>(x1);
+        x1 = vaddq_u64(x1, y1);
+        x1 = veorq_u64(x1, rk);
+        y1 = RotateLeft64<3>(y1);
+        y1 = veorq_u64(y1, x1);
+    }
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+
+    block0 = UnpackLow64<uint8x16_t>(x1, y1);
+    // block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+}
+
+inline void SPECK128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
+    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
+    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_NEON loads each SPECK-128 block into a
+    // uint64x2_t. We can't SSE over them, so we rearrange the data to allow packed operations.
+    // Its also easier to permute them in SPECK128_Enc_6_Blocks rather than the calling code.
+    // SPECK128_AdvancedProcessBlocks_NEON is rather messy.
+    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
+    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
+    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
+    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
+    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2);
+    y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3);
+    y3 = Shuffle64(y3);
+
+    for (size_t i=0; static_cast<int>(i)<static_cast<int>(rounds); ++i)
+    {
+        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
+
+        x1 = RotateRight64<8>(x1);
+        x2 = RotateRight64<8>(x2);
+        x3 = RotateRight64<8>(x3);
+        x1 = vaddq_u64(x1, y1);
+        x2 = vaddq_u64(x2, y2);
+        x3 = vaddq_u64(x3, y3);
+        x1 = veorq_u64(x1, rk);
+        x2 = veorq_u64(x2, rk);
+        x3 = veorq_u64(x3, rk);
+        y1 = RotateLeft64<3>(y1);
+        y2 = RotateLeft64<3>(y2);
+        y3 = RotateLeft64<3>(y3);
+        y1 = veorq_u64(y1, x1);
+        y2 = veorq_u64(y2, x2);
+        y3 = veorq_u64(y3, x3);
+    }
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2);
+    y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3);
+    y3 = Shuffle64(y3);
+
+    block0 = UnpackLow64<uint8x16_t>(x1, y1);
+    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+    block2 = UnpackLow64<uint8x16_t>(x2, y2);
+    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
+    block4 = UnpackLow64<uint8x16_t>(x3, y3);
+    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
+}
+
+inline void SPECK128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_NEON loads each SPECK-128 block into a
+    // uint64x2_t. We can't SSE over them, so we rearrange the data to allow packed operations.
+    // Its also easier to permute them in SPECK128_Dec_Block rather than the calling code.
+    // SPECK128_AdvancedProcessBlocks_NEON is rather messy. The zero block below is a
+    // "don't care". It is present so we can vectorize SPECK128_Dec_Block.
+    uint8x16_t block1 = {0};
+    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
+    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+
+    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    {
+        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
+
+        y1 = veorq_u64(y1, x1);
+        y1 = RotateRight64<3>(y1);
+        x1 = veorq_u64(x1, rk);
+        x1 = vsubq_u64(x1, y1);
+        x1 = RotateLeft64<8>(x1);
+    }
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+
+    block0 = UnpackLow64<uint8x16_t>(x1, y1);
+    // block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+}
+
+inline void SPECK128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
+    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
+    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_NEON loads each SPECK-128 block into a
+    // uint64x2_t. We can't SSE over them, so we rearrange the data to allow packed operations.
+    // Its also easier to permute them in SPECK128_Dec_6_Blocks rather than the calling code.
+    // SPECK128_AdvancedProcessBlocks_NEON is rather messy.
+    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
+    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
+    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
+    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
+    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2);
+    y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3);
+    y3 = Shuffle64(y3);
+
+    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    {
+        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
+
+        y1 = veorq_u64(y1, x1);
+        y2 = veorq_u64(y2, x2);
+        y3 = veorq_u64(y3, x3);
+        y1 = RotateRight64<3>(y1);
+        y2 = RotateRight64<3>(y2);
+        y3 = RotateRight64<3>(y3);
+        x1 = veorq_u64(x1, rk);
+        x2 = veorq_u64(x2, rk);
+        x3 = veorq_u64(x3, rk);
+        x1 = vsubq_u64(x1, y1);
+        x2 = vsubq_u64(x2, y2);
+        x3 = vsubq_u64(x3, y3);
+        x1 = RotateLeft64<8>(x1);
+        x2 = RotateLeft64<8>(x2);
+        x3 = RotateLeft64<8>(x3);
+    }
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2);
+    y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3);
+    y3 = Shuffle64(y3);
+
+    block0 = UnpackLow64<uint8x16_t>(x1, y1);
+    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+    block2 = UnpackLow64<uint8x16_t>(x2, y2);
+    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
+    block4 = UnpackLow64<uint8x16_t>(x3, y3);
+    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
+}
+
+template <typename F1, typename F6>
+size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
+    const word64 *subKeys, size_t rounds, const byte *inBlocks,
+    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    CRYPTOPP_ASSERT(subKeys);
+    CRYPTOPP_ASSERT(inBlocks);
+    CRYPTOPP_ASSERT(outBlocks);
+    CRYPTOPP_ASSERT(length >= 16);
+
+    const size_t blockSize = 16;
+    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+    size_t xorIncrement = xorBlocks ? blockSize : 0;
+    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+    if (flags & BlockTransformation::BT_ReverseDirection)
+    {
+        inBlocks += length - blockSize;
+        xorBlocks += length - blockSize;
+        outBlocks += length - blockSize;
+        inIncrement = 0-inIncrement;
+        xorIncrement = 0-xorIncrement;
+        outIncrement = 0-outIncrement;
+    }
+
+    if (flags & BlockTransformation::BT_AllowParallel)
+    {
+        while (length >= 6*blockSize)
+        {
+            uint8x16_t block0, block1, block2, block3, block4, block5, temp;
+            block0 = vld1q_u8(inBlocks);
+
+            if (flags & BlockTransformation::BT_InBlockIsCounter)
+            {
+                uint32x4_t be = vld1q_u32(s_one);
+                block1 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block0), be);
+                block2 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block1), be);
+                block3 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block2), be);
+                block4 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block3), be);
+                block5 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block4), be);
+                temp = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block5), be);
+                vst1q_u8(const_cast<byte*>(inBlocks), temp);
+            }
+            else
+            {
+                const int inc = static_cast<int>(inIncrement);
+                block1 = vld1q_u8(inBlocks+1*inc);
+                block2 = vld1q_u8(inBlocks+2*inc);
+                block3 = vld1q_u8(inBlocks+3*inc);
+                block4 = vld1q_u8(inBlocks+4*inc);
+                block5 = vld1q_u8(inBlocks+5*inc);
+                inBlocks += 6*inc;
+            }
+
+            if (flags & BlockTransformation::BT_XorInput)
+            {
+                const int inc = static_cast<int>(xorIncrement);
+                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
+                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
+                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
+                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
+                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
+                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
+                xorBlocks += 6*inc;
+            }
+
+            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
+
+            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            {
+                const int inc = static_cast<int>(xorIncrement);
+                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
+                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
+                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
+                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
+                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
+                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
+                xorBlocks += 6*inc;
+            }
+
+            const int inc = static_cast<int>(outIncrement);
+            vst1q_u8(outBlocks+0*inc, block0);
+            vst1q_u8(outBlocks+1*inc, block1);
+            vst1q_u8(outBlocks+2*inc, block2);
+            vst1q_u8(outBlocks+3*inc, block3);
+            vst1q_u8(outBlocks+4*inc, block4);
+            vst1q_u8(outBlocks+5*inc, block5);
+
+            outBlocks += 6*inc;
+            length -= 6*blockSize;
+        }
+    }
+
+    while (length >= blockSize)
+    {
+        uint8x16_t block = vld1q_u8(inBlocks);
+
+        if (flags & BlockTransformation::BT_XorInput)
+            block = veorq_u8(block, vld1q_u8(xorBlocks));
+
+        if (flags & BlockTransformation::BT_InBlockIsCounter)
+            const_cast<byte*>(inBlocks)[15]++;
+
+        func1(block, subKeys, rounds);
+
+        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            block = veorq_u8(block, vld1q_u8(xorBlocks));
+
+        vst1q_u8(outBlocks, block);
+
+        inBlocks += inIncrement;
+        outBlocks += outIncrement;
+        xorBlocks += xorIncrement;
+        length -= blockSize;
+    }
+
+    return length;
+}
+
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
+// ***************************** IA-32 ***************************** //
+
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
 
 CRYPTOPP_ALIGN_DATA(16)
@@ -340,10 +708,30 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
 
 ANONYMOUS_NAMESPACE_END
 
-///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////
 
 NAMESPACE_BEGIN(CryptoPP)
 
+// *************************** ARM NEON **************************** //
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return SPECK128_AdvancedProcessBlocks_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return SPECK128_AdvancedProcessBlocks_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
+// ***************************** IA-32 ***************************** //
+
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
 size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
@@ -358,6 +746,6 @@ size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t ro
     return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks,
         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
-#endif
+#endif // CRYPTOPP_SSSE3_AVAILABLE
 
 NAMESPACE_END
diff --git a/speck.cpp b/speck.cpp
index 956de7ed..f30a5b4c 100644
--- a/speck.cpp
+++ b/speck.cpp
@@ -7,8 +7,16 @@
 #include "misc.h"
 #include "cpu.h"
 
-// Uncomment to benchmark C/C++, and to isolate SSE code.
+// Uncomment for benchmarking C++ against SSE2 or NEON.
+// Do so in both speck.cpp and speck-simd.cpp.
 // #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
+// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
+#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT) +# undef CRYPTOPP_ARM_NEON_AVAILABLE +#endif ANONYMOUS_NAMESPACE_BEGIN @@ -167,6 +175,14 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); + +extern size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + #if defined(CRYPTOPP_SSSE3_AVAILABLE) extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); @@ -336,23 +352,35 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[2])(m_wspace[3]); } -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 -size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const +#if defined(CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS) +size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, + byte *outBlocks, size_t length, word32 flags) const { #if defined(CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) + if (HasNEON()) + return SPECK128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds, + inBlocks, xorBlocks, outBlocks, length, flags); #endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } -size_t SPECK128::Dec::AdvancedProcessBlocks(const byte 
*inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const +size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, + byte *outBlocks, size_t length, word32 flags) const { #if defined(CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) + if (HasNEON()) + return SPECK128_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds, + inBlocks, xorBlocks, outBlocks, length, flags); #endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } diff --git a/speck.h b/speck.h index 037dab92..2f0c5e08 100644 --- a/speck.h +++ b/speck.h @@ -16,6 +16,10 @@ #include "seckey.h" #include "secblock.h" +#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 +# define CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS 1 +#endif + NAMESPACE_BEGIN(CryptoPP) //! \class SPECK_Info @@ -142,7 +146,7 @@ public: { protected: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 +#if CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif }; @@ -155,7 +159,7 @@ public: { protected: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 +#if CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif };