From 304809a65dc34a73160edcf9f568e35d6e1af9b5 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Thu, 23 Nov 2017 02:47:44 -0500 Subject: [PATCH] Add NEON and ASIMD intrinsics for SPECK-128 (GH #538) Performance increased by about 115% on a 980 MHz BananaPi dev-board. Throughput went from about 46.2 cpb to about 21.5 cpb. --- GNUmakefile | 2 + speck-simd.cpp | 392 ++++++++++++++++++++++++++++++++++++++++++++++++- speck.cpp | 36 ++++- speck.h | 8 +- 4 files changed, 430 insertions(+), 8 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index c9ddc59f..638bb6c1 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -360,6 +360,7 @@ ifeq ($(IS_NEON),1) GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon + SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon endif endif @@ -369,6 +370,7 @@ ifeq ($(IS_ARMV8),1) ARIA_FLAG = -march=armv8-a BLAKE2_FLAG = -march=armv8-a NEON_FLAG = -march=armv8-a + SPECK_FLAG = -march=armv8-a endif HAVE_CRC = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(GREP) -i -c __ARM_FEATURE_CRC32) ifeq ($(HAVE_CRC),1) diff --git a/speck-simd.cpp b/speck-simd.cpp index ea32f6d2..521da70e 100644 --- a/speck-simd.cpp +++ b/speck-simd.cpp @@ -11,6 +11,21 @@ #include "speck.h" #include "misc.h" +// Uncomment for benchmarking C++ against SSE or NEON. +// Do so in both speck.cpp and speck-simd.cpp. +// #undef CRYPTOPP_SSSE3_AVAILABLE +// #undef CRYPTOPP_ARM_NEON_AVAILABLE + +// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is 3 cpb +// faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367. 
+#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
+# undef CRYPTOPP_ARM_NEON_AVAILABLE
+#endif
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include <arm_neon.h>
+#endif
+
 #if (CRYPTOPP_SSSE3_AVAILABLE)
 # include <tmmintrin.h>
 #endif
@@ -37,6 +52,359 @@ using CryptoPP::rotlFixed;
 using CryptoPP::rotrFixed;
 using CryptoPP::BlockTransformation;
 
+// *************************** ARM NEON ************************** //
+
+#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
+
+#if defined(CRYPTOPP_LITTLE_ENDIAN)
+const word32 s_one[] = {0, 0, 0, 1<<24};  // uint32x4_t
+#else
+const word32 s_one[] = {0, 0, 0, 1};  // uint32x4_t
+#endif
+
+template <class W, class T>
+inline W UnpackHigh64(const T& a, const T& b)
+{
+    const uint64x1_t x = vget_high_u64((uint64x2_t)a);
+    const uint64x1_t y = vget_high_u64((uint64x2_t)b);
+    return (W)vcombine_u64(x, y);
+}
+
+template <class W, class T>
+inline W UnpackLow64(const T& a, const T& b)
+{
+    const uint64x1_t x = vget_low_u64((uint64x2_t)a);
+    const uint64x1_t y = vget_low_u64((uint64x2_t)b);
+    return (W)vcombine_u64(x, y);
+}
+
+template <unsigned int R>
+inline uint64x2_t RotateLeft64(const uint64x2_t& val)
+{
+    CRYPTOPP_ASSERT(R < 64);
+    const uint64x2_t a(vshlq_n_u64(val, R));
+    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
+    return vorrq_u64(a, b);
+}
+
+template <unsigned int R>
+inline uint64x2_t RotateRight64(const uint64x2_t& val)
+{
+    CRYPTOPP_ASSERT(R < 64);
+    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
+    const uint64x2_t b(vshrq_n_u64(val, R));
+    return vorrq_u64(a, b);
+}
+
+inline uint64x2_t Shuffle64(const uint64x2_t& val)
+{
+    return vreinterpretq_u64_u8(
+        vrev64q_u8(vreinterpretq_u8_u64(val)));
+}
+
+inline void SPECK128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_NEON loads each SPECK-128 block into a
+    // uint64x2_t. We can't SSE over them, so we rearrange the data to allow packed operations.
+    // Its also easier to permute them in SPECK128_Enc_Block rather than the calling code.
+    // SPECK128_AdvancedProcessBlocks_NEON is rather messy. The zero block below is a
+    // "don't care". It is present so we can vectorize SPECK128_Enc_Block.
+    uint8x16_t block1 = {0};
+    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
+    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+
+    for (size_t i=0; static_cast<int>(i)<static_cast<int>(rounds); ++i)
+    {
+        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
+
+        x1 = RotateRight64<8>(x1);
+        x1 = vaddq_u64(x1, y1);
+        x1 = veorq_u64(x1, rk);
+        y1 = RotateLeft64<3>(y1);
+        y1 = veorq_u64(y1, x1);
+    }
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+
+    block0 = UnpackLow64<uint8x16_t>(x1, y1);
+    // block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+}
+
+inline void SPECK128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
+    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
+    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_NEON loads each SPECK-128 block into a
+    // uint64x2_t. We can't SSE over them, so we rearrange the data to allow packed operations.
+    // Its also easier to permute them in SPECK128_Enc_6_Blocks rather than the calling code.
+    // SPECK128_AdvancedProcessBlocks_NEON is rather messy.
+    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
+    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
+    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
+    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
+    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2);
+    y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3);
+    y3 = Shuffle64(y3);
+
+    for (size_t i=0; static_cast<int>(i)<static_cast<int>(rounds); ++i)
+    {
+        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
+
+        x1 = RotateRight64<8>(x1);
+        x2 = RotateRight64<8>(x2);
+        x3 = RotateRight64<8>(x3);
+        x1 = vaddq_u64(x1, y1);
+        x2 = vaddq_u64(x2, y2);
+        x3 = vaddq_u64(x3, y3);
+        x1 = veorq_u64(x1, rk);
+        x2 = veorq_u64(x2, rk);
+        x3 = veorq_u64(x3, rk);
+        y1 = RotateLeft64<3>(y1);
+        y2 = RotateLeft64<3>(y2);
+        y3 = RotateLeft64<3>(y3);
+        y1 = veorq_u64(y1, x1);
+        y2 = veorq_u64(y2, x2);
+        y3 = veorq_u64(y3, x3);
+    }
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2);
+    y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3);
+    y3 = Shuffle64(y3);
+
+    block0 = UnpackLow64<uint8x16_t>(x1, y1);
+    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+    block2 = UnpackLow64<uint8x16_t>(x2, y2);
+    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
+    block4 = UnpackLow64<uint8x16_t>(x3, y3);
+    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
+}
+
+inline void SPECK128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_NEON loads each SPECK-128 block into a
+    // uint64x2_t. We can't SSE over them, so we rearrange the data to allow packed operations.
+    // Its also easier to permute them in SPECK128_Dec_Block rather than the calling code.
+    // SPECK128_AdvancedProcessBlocks_NEON is rather messy. The zero block below is a
+    // "don't care". It is present so we can vectorize SPECK128_Dec_Block.
+    uint8x16_t block1 = {0};
+    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
+    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+
+    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    {
+        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
+
+        y1 = veorq_u64(y1, x1);
+        y1 = RotateRight64<3>(y1);
+        x1 = veorq_u64(x1, rk);
+        x1 = vsubq_u64(x1, y1);
+        x1 = RotateLeft64<8>(x1);
+    }
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+
+    block0 = UnpackLow64<uint8x16_t>(x1, y1);
+    // block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+}
+
+inline void SPECK128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
+    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
+    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
+{
+    // Hack ahead... SPECK128_AdvancedProcessBlocks_NEON loads each SPECK-128 block into a
+    // uint64x2_t. We can't SSE over them, so we rearrange the data to allow packed operations.
+    // Its also easier to permute them in SPECK128_Dec_6_Blocks rather than the calling code.
+    // SPECK128_AdvancedProcessBlocks_NEON is rather messy.
+    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
+    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
+    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
+    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
+    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2);
+    y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3);
+    y3 = Shuffle64(y3);
+
+    for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
+    {
+        const uint64x2_t rk = vld1q_dup_u64(subkeys+i);
+
+        y1 = veorq_u64(y1, x1);
+        y2 = veorq_u64(y2, x2);
+        y3 = veorq_u64(y3, x3);
+        y1 = RotateRight64<3>(y1);
+        y2 = RotateRight64<3>(y2);
+        y3 = RotateRight64<3>(y3);
+        x1 = veorq_u64(x1, rk);
+        x2 = veorq_u64(x2, rk);
+        x3 = veorq_u64(x3, rk);
+        x1 = vsubq_u64(x1, y1);
+        x2 = vsubq_u64(x2, y2);
+        x3 = vsubq_u64(x3, y3);
+        x1 = RotateLeft64<8>(x1);
+        x2 = RotateLeft64<8>(x2);
+        x3 = RotateLeft64<8>(x3);
+    }
+
+    x1 = Shuffle64(x1);
+    y1 = Shuffle64(y1);
+    x2 = Shuffle64(x2);
+    y2 = Shuffle64(y2);
+    x3 = Shuffle64(x3);
+    y3 = Shuffle64(y3);
+
+    block0 = UnpackLow64<uint8x16_t>(x1, y1);
+    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+    block2 = UnpackLow64<uint8x16_t>(x2, y2);
+    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
+    block4 = UnpackLow64<uint8x16_t>(x3, y3);
+    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
+}
+
+template <typename F1, typename F6>
+size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
+    const word64 *subKeys, size_t rounds, const byte *inBlocks,
+    const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    CRYPTOPP_ASSERT(subKeys);
+    CRYPTOPP_ASSERT(inBlocks);
+    CRYPTOPP_ASSERT(outBlocks);
+    CRYPTOPP_ASSERT(length >= 16);
+
+    const size_t blockSize = 16;
+    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
+    size_t xorIncrement = xorBlocks ? blockSize : 0;
+    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;
+
+    if (flags & BlockTransformation::BT_ReverseDirection)
+    {
+        inBlocks += length - blockSize;
+        xorBlocks += length - blockSize;
+        outBlocks += length - blockSize;
+        inIncrement = 0-inIncrement;
+        xorIncrement = 0-xorIncrement;
+        outIncrement = 0-outIncrement;
+    }
+
+    if (flags & BlockTransformation::BT_AllowParallel)
+    {
+        while (length >= 6*blockSize)
+        {
+            uint8x16_t block0, block1, block2, block3, block4, block5, temp;
+            block0 = vld1q_u8(inBlocks);
+
+            if (flags & BlockTransformation::BT_InBlockIsCounter)
+            {
+                uint32x4_t be = vld1q_u32(s_one);
+                block1 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block0), be);
+                block2 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block1), be);
+                block3 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block2), be);
+                block4 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block3), be);
+                block5 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block4), be);
+                temp = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block5), be);
+                vst1q_u8(const_cast<byte*>(inBlocks), temp);
+            }
+            else
+            {
+                const int inc = static_cast<int>(inIncrement);
+                block1 = vld1q_u8(inBlocks+1*inc);
+                block2 = vld1q_u8(inBlocks+2*inc);
+                block3 = vld1q_u8(inBlocks+3*inc);
+                block4 = vld1q_u8(inBlocks+4*inc);
+                block5 = vld1q_u8(inBlocks+5*inc);
+                inBlocks += 6*inc;
+            }
+
+            if (flags & BlockTransformation::BT_XorInput)
+            {
+                const int inc = static_cast<int>(xorIncrement);
+                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
+                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
+                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
+                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
+                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
+                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
+                xorBlocks += 6*inc;
+            }
+
+            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
+
+            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            {
+                const int inc = static_cast<int>(xorIncrement);
+                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
+                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
+                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
+                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
+                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
+                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
+                xorBlocks += 6*inc;
+            }
+
+            const int inc = static_cast<int>(outIncrement);
+            vst1q_u8(outBlocks+0*inc, block0);
+            vst1q_u8(outBlocks+1*inc, block1);
+            vst1q_u8(outBlocks+2*inc, block2);
+            vst1q_u8(outBlocks+3*inc, block3);
+            vst1q_u8(outBlocks+4*inc, block4);
+            vst1q_u8(outBlocks+5*inc, block5);
+
+            outBlocks += 6*inc;
+            length -= 6*blockSize;
+        }
+    }
+
+    while (length >= blockSize)
+    {
+        uint8x16_t block = vld1q_u8(inBlocks);
+
+        if (flags & BlockTransformation::BT_XorInput)
+            block = veorq_u8(block, vld1q_u8(xorBlocks));
+
+        if (flags & BlockTransformation::BT_InBlockIsCounter)
+            const_cast<byte*>(inBlocks)[15]++;
+
+        func1(block, subKeys, rounds);
+
+        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
+            block = veorq_u8(block, vld1q_u8(xorBlocks));
+
+        vst1q_u8(outBlocks, block);
+
+        inBlocks += inIncrement;
+        outBlocks += outIncrement;
+        xorBlocks += xorIncrement;
+        length -= blockSize;
+    }
+
+    return length;
+}
+
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
+// ***************************** IA-32 ***************************** //
+
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
 
 CRYPTOPP_ALIGN_DATA(16)
@@ -340,10 +708,30 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
 
 ANONYMOUS_NAMESPACE_END
 
-///////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////
 
 NAMESPACE_BEGIN(CryptoPP)
 
+// *************************** ARM NEON **************************** //
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return SPECK128_AdvancedProcessBlocks_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+
+size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
+    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
+{
+    return SPECK128_AdvancedProcessBlocks_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
+        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
+}
+#endif // CRYPTOPP_ARM_NEON_AVAILABLE
+
+// ***************************** IA-32 ***************************** //
+
 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
 size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
@@ -358,6 +746,6 @@ size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t ro
     return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks,
         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
 }
-#endif
+#endif // CRYPTOPP_SSSE3_AVAILABLE
 
 NAMESPACE_END
diff --git a/speck.cpp b/speck.cpp
index 956de7ed..f30a5b4c 100644
--- a/speck.cpp
+++ b/speck.cpp
@@ -7,8 +7,16 @@
 #include "misc.h"
 #include "cpu.h"
 
-// Uncomment to benchmark C/C++, and to isolate SSE code.
+// Uncomment for benchmarking C++ against SSE2 or NEON.
+// Do so in both speck.cpp and speck-simd.cpp.
 // #undef CRYPTOPP_SSSE3_AVAILABLE
+// #undef CRYPTOPP_ARM_NEON_AVAILABLE
+
+// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
+// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
+#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT) +# undef CRYPTOPP_ARM_NEON_AVAILABLE +#endif ANONYMOUS_NAMESPACE_BEGIN @@ -167,6 +175,14 @@ ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) +extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); + +extern size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds, + const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); +#endif + #if defined(CRYPTOPP_SSSE3_AVAILABLE) extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); @@ -336,23 +352,35 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[2])(m_wspace[3]); } -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 -size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const +#if defined(CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS) +size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, + byte *outBlocks, size_t length, word32 flags) const { #if defined(CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) + if (HasNEON()) + return SPECK128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds, + inBlocks, xorBlocks, outBlocks, length, flags); #endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } -size_t SPECK128::Dec::AdvancedProcessBlocks(const byte 
*inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const +size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, + byte *outBlocks, size_t length, word32 flags) const { #if defined(CRYPTOPP_SSSE3_AVAILABLE) if (HasSSSE3()) return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); +#endif +#if defined(CRYPTOPP_ARM_NEON_AVAILABLE) + if (HasNEON()) + return SPECK128_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds, + inBlocks, xorBlocks, outBlocks, length, flags); #endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); } diff --git a/speck.h b/speck.h index 037dab92..2f0c5e08 100644 --- a/speck.h +++ b/speck.h @@ -16,6 +16,10 @@ #include "seckey.h" #include "secblock.h" +#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64 +# define CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS 1 +#endif + NAMESPACE_BEGIN(CryptoPP) //! \class SPECK_Info @@ -142,7 +146,7 @@ public: { protected: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 +#if CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif }; @@ -155,7 +159,7 @@ public: { protected: void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const; -#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64 +#if CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const; #endif };