Add NEON and ASIMD intrinsics for SPECK-128 (GH #538)
Performance increased by about 115% on a 980 MHz BananaPi dev-board. Throughput went from about 46.2 cpb to about 21.5 cpb. (Branch: pull/548/head)
parent
b08596da44
commit
304809a65d
|
|
@ -360,6 +360,7 @@ ifeq ($(IS_NEON),1)
|
||||||
GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||||
ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||||
BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||||
|
SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
@ -369,6 +370,7 @@ ifeq ($(IS_ARMV8),1)
|
||||||
ARIA_FLAG = -march=armv8-a
|
ARIA_FLAG = -march=armv8-a
|
||||||
BLAKE2_FLAG = -march=armv8-a
|
BLAKE2_FLAG = -march=armv8-a
|
||||||
NEON_FLAG = -march=armv8-a
|
NEON_FLAG = -march=armv8-a
|
||||||
|
SPECK_FLAG = -march=armv8-a
|
||||||
endif
|
endif
|
||||||
HAVE_CRC = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(GREP) -i -c __ARM_FEATURE_CRC32)
|
HAVE_CRC = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(GREP) -i -c __ARM_FEATURE_CRC32)
|
||||||
ifeq ($(HAVE_CRC),1)
|
ifeq ($(HAVE_CRC),1)
|
||||||
|
|
|
||||||
392
speck-simd.cpp
392
speck-simd.cpp
|
|
@ -11,6 +11,21 @@
|
||||||
#include "speck.h"
|
#include "speck.h"
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
|
|
||||||
|
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||||
|
// Do so in both speck.cpp and speck-simd.cpp.
|
||||||
|
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
||||||
|
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is 3 cpb
|
||||||
|
// faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
|
||||||
|
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
|
||||||
|
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
# include <arm_neon.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
# include <tmmintrin.h>
|
# include <tmmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -37,6 +52,359 @@ using CryptoPP::rotlFixed;
|
||||||
using CryptoPP::rotrFixed;
|
using CryptoPP::rotrFixed;
|
||||||
using CryptoPP::BlockTransformation;
|
using CryptoPP::BlockTransformation;
|
||||||
|
|
||||||
|
// *************************** ARM NEON ************************** //
|
||||||
|
|
||||||
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
|
||||||
|
#if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||||
|
const word32 s_one[] = {0, 0, 0, 1<<24}; // uint32x4_t
|
||||||
|
#else
|
||||||
|
const word32 s_one[] = {0, 0, 0, 1}; // uint32x4_t
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Pack the high 64-bit lanes of 'a' and 'b' into one 128-bit vector:
// result = { high(a), high(b) }. W is the desired return vector type and
// T the argument vector type (both 128-bit NEON types).
template <class W, class T>
inline W UnpackHigh64(const T& a, const T& b)
{
    // vget_high_u64 returns a uint64x1_t (one-lane vector), not a scalar
    // uint64_t. Using the vector type here is required by stricter NEON
    // toolchains (e.g. AArch64 GCC/Clang), where the two are distinct types.
    const uint64x1_t x = vget_high_u64((uint64x2_t)a);
    const uint64x1_t y = vget_high_u64((uint64x2_t)b);
    return (W)vcombine_u64(x, y);
}
|
||||||
|
|
||||||
|
// Pack the low 64-bit lanes of 'a' and 'b' into one 128-bit vector:
// result = { low(a), low(b) }. W is the desired return vector type and
// T the argument vector type (both 128-bit NEON types).
template <class W, class T>
inline W UnpackLow64(const T& a, const T& b)
{
    // vget_low_u64 returns a uint64x1_t (one-lane vector), not a scalar
    // uint64_t. Using the vector type here is required by stricter NEON
    // toolchains (e.g. AArch64 GCC/Clang), where the two are distinct types.
    const uint64x1_t x = vget_low_u64((uint64x2_t)a);
    const uint64x1_t y = vget_low_u64((uint64x2_t)b);
    return (W)vcombine_u64(x, y);
}
|
||||||
|
|
||||||
|
// Rotate each 64-bit lane of 'val' left by R bits (0 < R < 64).
template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    CRYPTOPP_ASSERT(R < 64);
    // NEON has no rotate instruction; combine a left shift with the
    // complementary right shift.
    const uint64x2_t hi = vshlq_n_u64(val, R);
    const uint64x2_t lo = vshrq_n_u64(val, 64 - R);
    return vorrq_u64(hi, lo);
}
|
||||||
|
|
||||||
|
// Rotate each 64-bit lane of 'val' right by R bits (0 < R < 64).
template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    CRYPTOPP_ASSERT(R < 64);
    // NEON has no rotate instruction; combine a right shift with the
    // complementary left shift.
    const uint64x2_t lo = vshrq_n_u64(val, R);
    const uint64x2_t hi = vshlq_n_u64(val, 64 - R);
    return vorrq_u64(hi, lo);
}
|
||||||
|
|
||||||
|
// Byte-reverse each 64-bit lane of 'val' (endian swap within each lane).
// SPECK-128 words are stored big-endian in memory; this converts between
// memory byte order and host lane order.
inline uint64x2_t Shuffle64(const uint64x2_t& val)
{
    const uint8x16_t bytes = vreinterpretq_u8_u64(val);
    return vreinterpretq_u64_u8(vrev64q_u8(bytes));
}
|
||||||
|
|
||||||
|
// Encrypt a single SPECK-128 block in place. The caller loads the block as a
// uint8x16_t; we de-interleave it into x (upper word) and y (lower word) so
// the round runs as packed 64-bit lanes. The all-zero companion block is a
// "don't care" that lets this path share the two-blocks-per-vector round
// function used by the 6-block path.
inline void SPECK128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
{
    uint8x16_t zero = {0};
    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, zero);
    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, zero);

    // Big-endian bytes -> host lane order.
    x1 = Shuffle64(x1);
    y1 = Shuffle64(y1);

    for (int i = 0; i < static_cast<int>(rounds); ++i)
    {
        // Broadcast round key i into both lanes.
        const uint64x2_t rk = vld1q_dup_u64(subkeys + i);

        // SPECK round: x = (ROTR8(x) + y) ^ k; y = ROTL3(y) ^ x.
        x1 = veorq_u64(vaddq_u64(RotateRight64<8>(x1), y1), rk);
        y1 = veorq_u64(RotateLeft64<3>(y1), x1);
    }

    // Host lane order -> big-endian bytes, then repack the block. The high
    // half of the pair (the zero companion) is discarded.
    x1 = Shuffle64(x1);
    y1 = Shuffle64(y1);

    block0 = UnpackLow64<uint8x16_t>(x1, y1);
}
|
||||||
|
|
||||||
|
// Encrypt six SPECK-128 blocks in place. The blocks are de-interleaved into
// three (x, y) word pairs — two blocks per 128-bit vector — so each SPECK
// round executes as packed 64-bit operations across all six blocks.
inline void SPECK128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);

    // Big-endian bytes -> host lane order.
    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
    x3 = Shuffle64(x3); y3 = Shuffle64(y3);

    for (int i = 0; i < static_cast<int>(rounds); ++i)
    {
        // Broadcast round key i into both lanes.
        const uint64x2_t rk = vld1q_dup_u64(subkeys + i);

        // SPECK round, applied to all three vector pairs:
        //   x = (ROTR8(x) + y) ^ k
        x1 = veorq_u64(vaddq_u64(RotateRight64<8>(x1), y1), rk);
        x2 = veorq_u64(vaddq_u64(RotateRight64<8>(x2), y2), rk);
        x3 = veorq_u64(vaddq_u64(RotateRight64<8>(x3), y3), rk);
        //   y = ROTL3(y) ^ x
        y1 = veorq_u64(RotateLeft64<3>(y1), x1);
        y2 = veorq_u64(RotateLeft64<3>(y2), x2);
        y3 = veorq_u64(RotateLeft64<3>(y3), x3);
    }

    // Host lane order -> big-endian bytes, then re-interleave into blocks.
    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
    x3 = Shuffle64(x3); y3 = Shuffle64(y3);

    block0 = UnpackLow64<uint8x16_t>(x1, y1);
    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
    block2 = UnpackLow64<uint8x16_t>(x2, y2);
    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
    block4 = UnpackLow64<uint8x16_t>(x3, y3);
    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
}
|
||||||
|
|
||||||
|
// Decrypt a single SPECK-128 block in place. The all-zero companion block is
// a "don't care" that lets this path reuse the packed two-blocks-per-vector
// inverse round function.
inline void SPECK128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
{
    uint8x16_t zero = {0};
    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, zero);
    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, zero);

    // Big-endian bytes -> host lane order.
    x1 = Shuffle64(x1);
    y1 = Shuffle64(y1);

    // Apply the inverse rounds with the key schedule in reverse order.
    for (int i = static_cast<int>(rounds) - 1; i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + i);

        // Inverse round: y = ROTR3(y ^ x); x = ROTL8((x ^ k) - y).
        y1 = RotateRight64<3>(veorq_u64(y1, x1));
        x1 = RotateLeft64<8>(vsubq_u64(veorq_u64(x1, rk), y1));
    }

    // Host lane order -> big-endian bytes, then repack. The high half of the
    // pair (the zero companion) is discarded.
    x1 = Shuffle64(x1);
    y1 = Shuffle64(y1);

    block0 = UnpackLow64<uint8x16_t>(x1, y1);
}
|
||||||
|
|
||||||
|
// Decrypt six SPECK-128 blocks in place. The blocks are de-interleaved into
// three (x, y) word pairs — two blocks per 128-bit vector — so each inverse
// round executes as packed 64-bit operations across all six blocks.
inline void SPECK128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
    // BUG FIX: the y words of the third pair come from block4 and block5.
    // The original read UnpackHigh64(block5, block5), which duplicated
    // block5's word and corrupted decryption of block4.
    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);

    // Big-endian bytes -> host lane order.
    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
    x3 = Shuffle64(x3); y3 = Shuffle64(y3);

    // Apply the inverse rounds with the key schedule in reverse order.
    for (size_t i = rounds - 1; static_cast<int>(i) >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + i);

        // Inverse round, applied to all three vector pairs:
        //   y = ROTR3(y ^ x)
        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        //   x = ROTL8((x ^ k) - y)
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        x1 = vsubq_u64(x1, y1);
        x2 = vsubq_u64(x2, y2);
        x3 = vsubq_u64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // Host lane order -> big-endian bytes, then re-interleave into blocks.
    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
    x3 = Shuffle64(x3); y3 = Shuffle64(y3);

    block0 = UnpackLow64<uint8x16_t>(x1, y1);
    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
    block2 = UnpackLow64<uint8x16_t>(x2, y2);
    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
    block4 = UnpackLow64<uint8x16_t>(x3, y3);
    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
}
|
||||||
|
|
||||||
|
// Generic SPECK-128 multi-block driver for NEON. func1 processes one block,
// func6 processes six blocks in parallel. Handles CTR counter increment,
// optional XOR of input/output, and forward or reverse buffer traversal as
// requested by 'flags'. Returns the number of bytes left unprocessed
// (always less than one block).
template <typename F1, typename F6>
size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
        const word64 *subKeys, size_t rounds, const byte *inBlocks,
        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // A zero increment pins a pointer in place (counter mode, or when the
    // caller asked not to advance in/out).
    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = xorBlocks ? blockSize : 0;
    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

    if (flags & BlockTransformation::BT_ReverseDirection)
    {
        // Start at the last block and walk backwards; the unsigned negation
        // is the intended modular "negative stride".
        inBlocks += length - blockSize;
        xorBlocks += length - blockSize;
        outBlocks += length - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BlockTransformation::BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint8x16_t block0, block1, block2, block3, block4, block5, temp;
            block0 = vld1q_u8(inBlocks);

            if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                // Derive the next five counters from block0, then store the
                // seventh counter back for the next iteration. s_one is a
                // big-endian +1 in the low 32-bit word.
                const uint32x4_t be = vld1q_u32(s_one);
                block1 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block0), be);
                block2 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block1), be);
                block3 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block2), be);
                block4 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block3), be);
                block5 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block4), be);
                temp   = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block5), be);
                vst1q_u8(const_cast<byte*>(inBlocks), temp);
            }
            else
            {
                const int inc = static_cast<int>(inIncrement);
                block1 = vld1q_u8(inBlocks+1*inc);
                block2 = vld1q_u8(inBlocks+2*inc);
                block3 = vld1q_u8(inBlocks+3*inc);
                block4 = vld1q_u8(inBlocks+4*inc);
                block5 = vld1q_u8(inBlocks+5*inc);
                inBlocks += 6*inc;
            }

            if (flags & BlockTransformation::BT_XorInput)
            {
                // XOR the input blocks before the cipher (e.g. CBC encrypt).
                const int inc = static_cast<int>(xorIncrement);
                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
                xorBlocks += 6*inc;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            {
                // XOR after the cipher instead (e.g. CBC decrypt, CTR).
                const int inc = static_cast<int>(xorIncrement);
                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
                xorBlocks += 6*inc;
            }

            const int inc = static_cast<int>(outIncrement);
            vst1q_u8(outBlocks+0*inc, block0);
            vst1q_u8(outBlocks+1*inc, block1);
            vst1q_u8(outBlocks+2*inc, block2);
            vst1q_u8(outBlocks+3*inc, block3);
            vst1q_u8(outBlocks+4*inc, block4);
            vst1q_u8(outBlocks+5*inc, block5);

            outBlocks += 6*inc;
            length -= 6*blockSize;
        }
    }

    // Serial tail: one block at a time.
    while (length >= blockSize)
    {
        uint8x16_t block = vld1q_u8(inBlocks);

        if (flags & BlockTransformation::BT_XorInput)
            block = veorq_u8(block, vld1q_u8(xorBlocks));

        if (flags & BlockTransformation::BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = veorq_u8(block, vld1q_u8(xorBlocks));

        vst1q_u8(outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
|
||||||
|
|
||||||
|
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// ***************************** IA-32 ***************************** //
|
||||||
|
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
|
|
||||||
CRYPTOPP_ALIGN_DATA(16)
|
CRYPTOPP_ALIGN_DATA(16)
|
||||||
|
|
@ -340,10 +708,30 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
|
||||||
|
|
||||||
ANONYMOUS_NAMESPACE_END
|
ANONYMOUS_NAMESPACE_END
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
|
// *************************** ARM NEON **************************** //
|
||||||
|
|
||||||
|
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||||
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
|
{
|
||||||
|
return SPECK128_AdvancedProcessBlocks_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
|
||||||
|
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||||
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
|
{
|
||||||
|
return SPECK128_AdvancedProcessBlocks_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
|
||||||
|
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
}
|
||||||
|
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// ***************************** IA-32 ***************************** //
|
||||||
|
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
||||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
|
|
@ -358,6 +746,6 @@ size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t ro
|
||||||
return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks,
|
return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks,
|
||||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
}
|
}
|
||||||
#endif
|
#endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||||
|
|
||||||
NAMESPACE_END
|
NAMESPACE_END
|
||||||
|
|
|
||||||
36
speck.cpp
36
speck.cpp
|
|
@ -7,8 +7,16 @@
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "cpu.h"
|
#include "cpu.h"
|
||||||
|
|
||||||
// Uncomment to benchmark C/C++, and to isolate SSE code.
|
// Uncomment for benchmarking C++ against SSE2 or NEON.
|
||||||
|
// Do so in both speck.cpp and speck-simd.cpp.
|
||||||
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
||||||
|
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
|
||||||
|
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
|
||||||
|
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
|
||||||
|
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
#endif
|
||||||
|
|
||||||
ANONYMOUS_NAMESPACE_BEGIN
|
ANONYMOUS_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
|
@ -167,6 +175,14 @@ ANONYMOUS_NAMESPACE_END
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||||
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||||
|
|
||||||
|
extern size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||||
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
||||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||||
|
|
@ -336,23 +352,35 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||||
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[2])(m_wspace[3]);
|
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[2])(m_wspace[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
|
#if defined(CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS)
|
||||||
size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
|
size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||||
|
byte *outBlocks, size_t length, word32 flags) const
|
||||||
{
|
{
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
if (HasSSSE3())
|
if (HasSSSE3())
|
||||||
return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
|
return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
|
||||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
#endif
|
||||||
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
if (HasNEON())
|
||||||
|
return SPECK128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||||
|
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
#endif
|
#endif
|
||||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
|
size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||||
|
byte *outBlocks, size_t length, word32 flags) const
|
||||||
{
|
{
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
if (HasSSSE3())
|
if (HasSSSE3())
|
||||||
return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
|
return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
|
||||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
#endif
|
||||||
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
if (HasNEON())
|
||||||
|
return SPECK128_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||||
|
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
#endif
|
#endif
|
||||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
8
speck.h
8
speck.h
|
|
@ -16,6 +16,10 @@
|
||||||
#include "seckey.h"
|
#include "seckey.h"
|
||||||
#include "secblock.h"
|
#include "secblock.h"
|
||||||
|
|
||||||
|
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
|
||||||
|
# define CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS 1
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
//! \class SPECK_Info
|
//! \class SPECK_Info
|
||||||
|
|
@ -142,7 +146,7 @@ public:
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
|
#if CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS
|
||||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
@ -155,7 +159,7 @@ public:
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
|
#if CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS
|
||||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue