Add NEON and ASIMD intrinsics for SPECK-128 (GH #538)
Performance increased by about 115% on a 980 MHz BananaPi dev-board. Throughput went from about 46.2 cpb to about 21.5 cpb. (Branch: pull/548/head)
parent
b08596da44
commit
304809a65d
|
|
@ -360,6 +360,7 @@ ifeq ($(IS_NEON),1)
|
||||||
GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||||
ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||||
BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||||
|
SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
@ -369,6 +370,7 @@ ifeq ($(IS_ARMV8),1)
|
||||||
ARIA_FLAG = -march=armv8-a
|
ARIA_FLAG = -march=armv8-a
|
||||||
BLAKE2_FLAG = -march=armv8-a
|
BLAKE2_FLAG = -march=armv8-a
|
||||||
NEON_FLAG = -march=armv8-a
|
NEON_FLAG = -march=armv8-a
|
||||||
|
SPECK_FLAG = -march=armv8-a
|
||||||
endif
|
endif
|
||||||
HAVE_CRC = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(GREP) -i -c __ARM_FEATURE_CRC32)
|
HAVE_CRC = $(shell echo | $(CXX) -x c++ $(CXXFLAGS) -march=armv8-a+crc -dM -E - 2>/dev/null | $(GREP) -i -c __ARM_FEATURE_CRC32)
|
||||||
ifeq ($(HAVE_CRC),1)
|
ifeq ($(HAVE_CRC),1)
|
||||||
|
|
|
||||||
392
speck-simd.cpp
392
speck-simd.cpp
|
|
@ -11,6 +11,21 @@
|
||||||
#include "speck.h"
|
#include "speck.h"
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
|
|
||||||
|
// Uncomment for benchmarking C++ against SSE or NEON.
|
||||||
|
// Do so in both speck.cpp and speck-simd.cpp.
|
||||||
|
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
||||||
|
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is 3 cpb
|
||||||
|
// faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
|
||||||
|
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
|
||||||
|
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
# include <arm_neon.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
# include <tmmintrin.h>
|
# include <tmmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -37,6 +52,359 @@ using CryptoPP::rotlFixed;
|
||||||
using CryptoPP::rotrFixed;
|
using CryptoPP::rotrFixed;
|
||||||
using CryptoPP::BlockTransformation;
|
using CryptoPP::BlockTransformation;
|
||||||
|
|
||||||
|
// *************************** ARM NEON ************************** //
|
||||||
|
|
||||||
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
|
||||||
|
#if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||||
|
const word32 s_one[] = {0, 0, 0, 1<<24}; // uint32x4_t
|
||||||
|
#else
|
||||||
|
const word32 s_one[] = {0, 0, 0, 1}; // uint32x4_t
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Pack the high 64-bit lanes of 'a' and 'b' into one 128-bit vector:
// result = { high(a), high(b) }. W is the desired return vector type and
// T the argument vector type (both 128-bit NEON types).
template <class W, class T>
inline W UnpackHigh64(const T& a, const T& b)
{
    // vget_high_u64 returns a uint64x1_t (one-lane vector), not a scalar
    // uint64_t. Using the vector type here is required by stricter NEON
    // toolchains (e.g. AArch64 GCC/Clang), where the two are distinct types.
    const uint64x1_t x = vget_high_u64((uint64x2_t)a);
    const uint64x1_t y = vget_high_u64((uint64x2_t)b);
    return (W)vcombine_u64(x, y);
}
|
||||||
|
|
||||||
|
// Pack the low 64-bit lanes of 'a' and 'b' into one 128-bit vector:
// result = { low(a), low(b) }. W is the desired return vector type and
// T the argument vector type (both 128-bit NEON types).
template <class W, class T>
inline W UnpackLow64(const T& a, const T& b)
{
    // vget_low_u64 returns a uint64x1_t (one-lane vector), not a scalar
    // uint64_t. Using the vector type here is required by stricter NEON
    // toolchains (e.g. AArch64 GCC/Clang), where the two are distinct types.
    const uint64x1_t x = vget_low_u64((uint64x2_t)a);
    const uint64x1_t y = vget_low_u64((uint64x2_t)b);
    return (W)vcombine_u64(x, y);
}
|
||||||
|
|
||||||
|
// Rotate each 64-bit lane of 'val' left by R bits (0 < R < 64).
template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    CRYPTOPP_ASSERT(R < 64);
    // NEON has no rotate instruction; combine a left shift with the
    // complementary right shift.
    const uint64x2_t hi = vshlq_n_u64(val, R);
    const uint64x2_t lo = vshrq_n_u64(val, 64 - R);
    return vorrq_u64(hi, lo);
}
|
||||||
|
|
||||||
|
// Rotate each 64-bit lane of 'val' right by R bits (0 < R < 64).
template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    CRYPTOPP_ASSERT(R < 64);
    // NEON has no rotate instruction; combine a right shift with the
    // complementary left shift.
    const uint64x2_t lo = vshrq_n_u64(val, R);
    const uint64x2_t hi = vshlq_n_u64(val, 64 - R);
    return vorrq_u64(hi, lo);
}
|
||||||
|
|
||||||
|
// Byte-reverse each 64-bit lane of 'val' (endian swap within each lane).
// SPECK-128 words are stored big-endian in memory; this converts between
// memory byte order and host lane order.
inline uint64x2_t Shuffle64(const uint64x2_t& val)
{
    const uint8x16_t bytes = vreinterpretq_u8_u64(val);
    return vreinterpretq_u64_u8(vrev64q_u8(bytes));
}
|
||||||
|
|
||||||
|
// Encrypt a single SPECK-128 block in place. The caller loads the block as a
// uint8x16_t; we de-interleave it into x (upper word) and y (lower word) so
// the round runs as packed 64-bit lanes. The all-zero companion block is a
// "don't care" that lets this path share the two-blocks-per-vector round
// function used by the 6-block path.
inline void SPECK128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
{
    uint8x16_t zero = {0};
    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, zero);
    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, zero);

    // Big-endian bytes -> host lane order.
    x1 = Shuffle64(x1);
    y1 = Shuffle64(y1);

    for (int i = 0; i < static_cast<int>(rounds); ++i)
    {
        // Broadcast round key i into both lanes.
        const uint64x2_t rk = vld1q_dup_u64(subkeys + i);

        // SPECK round: x = (ROTR8(x) + y) ^ k; y = ROTL3(y) ^ x.
        x1 = veorq_u64(vaddq_u64(RotateRight64<8>(x1), y1), rk);
        y1 = veorq_u64(RotateLeft64<3>(y1), x1);
    }

    // Host lane order -> big-endian bytes, then repack the block. The high
    // half of the pair (the zero companion) is discarded.
    x1 = Shuffle64(x1);
    y1 = Shuffle64(y1);

    block0 = UnpackLow64<uint8x16_t>(x1, y1);
}
|
||||||
|
|
||||||
|
// Encrypt six SPECK-128 blocks in place. The blocks are de-interleaved into
// three (x, y) word pairs — two blocks per 128-bit vector — so each SPECK
// round executes as packed 64-bit operations across all six blocks.
inline void SPECK128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);

    // Big-endian bytes -> host lane order.
    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
    x3 = Shuffle64(x3); y3 = Shuffle64(y3);

    for (int i = 0; i < static_cast<int>(rounds); ++i)
    {
        // Broadcast round key i into both lanes.
        const uint64x2_t rk = vld1q_dup_u64(subkeys + i);

        // SPECK round, applied to all three vector pairs:
        //   x = (ROTR8(x) + y) ^ k
        x1 = veorq_u64(vaddq_u64(RotateRight64<8>(x1), y1), rk);
        x2 = veorq_u64(vaddq_u64(RotateRight64<8>(x2), y2), rk);
        x3 = veorq_u64(vaddq_u64(RotateRight64<8>(x3), y3), rk);
        //   y = ROTL3(y) ^ x
        y1 = veorq_u64(RotateLeft64<3>(y1), x1);
        y2 = veorq_u64(RotateLeft64<3>(y2), x2);
        y3 = veorq_u64(RotateLeft64<3>(y3), x3);
    }

    // Host lane order -> big-endian bytes, then re-interleave into blocks.
    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
    x3 = Shuffle64(x3); y3 = Shuffle64(y3);

    block0 = UnpackLow64<uint8x16_t>(x1, y1);
    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
    block2 = UnpackLow64<uint8x16_t>(x2, y2);
    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
    block4 = UnpackLow64<uint8x16_t>(x3, y3);
    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
}
|
||||||
|
|
||||||
|
// Decrypt a single SPECK-128 block in place. The all-zero companion block is
// a "don't care" that lets this path reuse the packed two-blocks-per-vector
// inverse round function.
inline void SPECK128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
{
    uint8x16_t zero = {0};
    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, zero);
    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, zero);

    // Big-endian bytes -> host lane order.
    x1 = Shuffle64(x1);
    y1 = Shuffle64(y1);

    // Apply the inverse rounds with the key schedule in reverse order.
    for (int i = static_cast<int>(rounds) - 1; i >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + i);

        // Inverse round: y = ROTR3(y ^ x); x = ROTL8((x ^ k) - y).
        y1 = RotateRight64<3>(veorq_u64(y1, x1));
        x1 = RotateLeft64<8>(vsubq_u64(veorq_u64(x1, rk), y1));
    }

    // Host lane order -> big-endian bytes, then repack. The high half of the
    // pair (the zero companion) is discarded.
    x1 = Shuffle64(x1);
    y1 = Shuffle64(y1);

    block0 = UnpackLow64<uint8x16_t>(x1, y1);
}
|
||||||
|
|
||||||
|
// Decrypt six SPECK-128 blocks in place. The blocks are de-interleaved into
// three (x, y) word pairs — two blocks per 128-bit vector — so each inverse
// round executes as packed 64-bit operations across all six blocks.
inline void SPECK128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
{
    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
    // BUG FIX: the y words of the third pair come from block4 and block5.
    // The original read UnpackHigh64(block5, block5), which duplicated
    // block5's word and corrupted decryption of block4.
    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);

    // Big-endian bytes -> host lane order.
    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
    x3 = Shuffle64(x3); y3 = Shuffle64(y3);

    // Apply the inverse rounds with the key schedule in reverse order.
    for (size_t i = rounds - 1; static_cast<int>(i) >= 0; --i)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + i);

        // Inverse round, applied to all three vector pairs:
        //   y = ROTR3(y ^ x)
        y1 = veorq_u64(y1, x1);
        y2 = veorq_u64(y2, x2);
        y3 = veorq_u64(y3, x3);
        y1 = RotateRight64<3>(y1);
        y2 = RotateRight64<3>(y2);
        y3 = RotateRight64<3>(y3);
        //   x = ROTL8((x ^ k) - y)
        x1 = veorq_u64(x1, rk);
        x2 = veorq_u64(x2, rk);
        x3 = veorq_u64(x3, rk);
        x1 = vsubq_u64(x1, y1);
        x2 = vsubq_u64(x2, y2);
        x3 = vsubq_u64(x3, y3);
        x1 = RotateLeft64<8>(x1);
        x2 = RotateLeft64<8>(x2);
        x3 = RotateLeft64<8>(x3);
    }

    // Host lane order -> big-endian bytes, then re-interleave into blocks.
    x1 = Shuffle64(x1); y1 = Shuffle64(y1);
    x2 = Shuffle64(x2); y2 = Shuffle64(y2);
    x3 = Shuffle64(x3); y3 = Shuffle64(y3);

    block0 = UnpackLow64<uint8x16_t>(x1, y1);
    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
    block2 = UnpackLow64<uint8x16_t>(x2, y2);
    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
    block4 = UnpackLow64<uint8x16_t>(x3, y3);
    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
}
|
||||||
|
|
||||||
|
// Generic SPECK-128 multi-block driver for NEON. func1 processes one block,
// func6 processes six blocks in parallel. Handles CTR counter increment,
// optional XOR of input/output, and forward or reverse buffer traversal as
// requested by 'flags'. Returns the number of bytes left unprocessed
// (always less than one block).
template <typename F1, typename F6>
size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
        const word64 *subKeys, size_t rounds, const byte *inBlocks,
        const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlocks);
    CRYPTOPP_ASSERT(outBlocks);
    CRYPTOPP_ASSERT(length >= 16);

    const size_t blockSize = 16;
    // A zero increment pins a pointer in place (counter mode, or when the
    // caller asked not to advance in/out).
    size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
    size_t xorIncrement = xorBlocks ? blockSize : 0;
    size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

    if (flags & BlockTransformation::BT_ReverseDirection)
    {
        // Start at the last block and walk backwards; the unsigned negation
        // is the intended modular "negative stride".
        inBlocks += length - blockSize;
        xorBlocks += length - blockSize;
        outBlocks += length - blockSize;
        inIncrement = 0-inIncrement;
        xorIncrement = 0-xorIncrement;
        outIncrement = 0-outIncrement;
    }

    if (flags & BlockTransformation::BT_AllowParallel)
    {
        while (length >= 6*blockSize)
        {
            uint8x16_t block0, block1, block2, block3, block4, block5, temp;
            block0 = vld1q_u8(inBlocks);

            if (flags & BlockTransformation::BT_InBlockIsCounter)
            {
                // Derive the next five counters from block0, then store the
                // seventh counter back for the next iteration. s_one is a
                // big-endian +1 in the low 32-bit word.
                const uint32x4_t be = vld1q_u32(s_one);
                block1 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block0), be);
                block2 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block1), be);
                block3 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block2), be);
                block4 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block3), be);
                block5 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block4), be);
                temp   = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block5), be);
                vst1q_u8(const_cast<byte*>(inBlocks), temp);
            }
            else
            {
                const int inc = static_cast<int>(inIncrement);
                block1 = vld1q_u8(inBlocks+1*inc);
                block2 = vld1q_u8(inBlocks+2*inc);
                block3 = vld1q_u8(inBlocks+3*inc);
                block4 = vld1q_u8(inBlocks+4*inc);
                block5 = vld1q_u8(inBlocks+5*inc);
                inBlocks += 6*inc;
            }

            if (flags & BlockTransformation::BT_XorInput)
            {
                // XOR the input blocks before the cipher (e.g. CBC encrypt).
                const int inc = static_cast<int>(xorIncrement);
                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
                xorBlocks += 6*inc;
            }

            func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);

            if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            {
                // XOR after the cipher instead (e.g. CBC decrypt, CTR).
                const int inc = static_cast<int>(xorIncrement);
                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
                xorBlocks += 6*inc;
            }

            const int inc = static_cast<int>(outIncrement);
            vst1q_u8(outBlocks+0*inc, block0);
            vst1q_u8(outBlocks+1*inc, block1);
            vst1q_u8(outBlocks+2*inc, block2);
            vst1q_u8(outBlocks+3*inc, block3);
            vst1q_u8(outBlocks+4*inc, block4);
            vst1q_u8(outBlocks+5*inc, block5);

            outBlocks += 6*inc;
            length -= 6*blockSize;
        }
    }

    // Serial tail: one block at a time.
    while (length >= blockSize)
    {
        uint8x16_t block = vld1q_u8(inBlocks);

        if (flags & BlockTransformation::BT_XorInput)
            block = veorq_u8(block, vld1q_u8(xorBlocks));

        if (flags & BlockTransformation::BT_InBlockIsCounter)
            const_cast<byte *>(inBlocks)[15]++;

        func1(block, subKeys, rounds);

        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
            block = veorq_u8(block, vld1q_u8(xorBlocks));

        vst1q_u8(outBlocks, block);

        inBlocks += inIncrement;
        outBlocks += outIncrement;
        xorBlocks += xorIncrement;
        length -= blockSize;
    }

    return length;
}
|
||||||
|
|
||||||
|
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// ***************************** IA-32 ***************************** //
|
||||||
|
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
|
|
||||||
CRYPTOPP_ALIGN_DATA(16)
|
CRYPTOPP_ALIGN_DATA(16)
|
||||||
|
|
@ -340,10 +708,30 @@ inline size_t SPECK128_AdvancedProcessBlocks_SSSE3(F1 func1, F4 func4,
|
||||||
|
|
||||||
ANONYMOUS_NAMESPACE_END
|
ANONYMOUS_NAMESPACE_END
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
|
// *************************** ARM NEON **************************** //
|
||||||
|
|
||||||
|
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||||
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
|
{
|
||||||
|
return SPECK128_AdvancedProcessBlocks_NEON(SPECK128_Enc_Block, SPECK128_Enc_6_Blocks,
|
||||||
|
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||||
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
|
{
|
||||||
|
return SPECK128_AdvancedProcessBlocks_NEON(SPECK128_Dec_Block, SPECK128_Dec_6_Blocks,
|
||||||
|
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
}
|
||||||
|
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// ***************************** IA-32 ***************************** //
|
||||||
|
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
||||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||||
|
|
@ -358,6 +746,6 @@ size_t SPECK128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t ro
|
||||||
return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks,
|
return SPECK128_AdvancedProcessBlocks_SSSE3(SPECK128_Dec_Block, SPECK128_Dec_4_Blocks,
|
||||||
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
}
|
}
|
||||||
#endif
|
#endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||||
|
|
||||||
NAMESPACE_END
|
NAMESPACE_END
|
||||||
|
|
|
||||||
36
speck.cpp
36
speck.cpp
|
|
@ -7,8 +7,16 @@
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "cpu.h"
|
#include "cpu.h"
|
||||||
|
|
||||||
// Uncomment to benchmark C/C++, and to isolate SSE code.
|
// Uncomment for benchmarking C++ against SSE2 or NEON.
|
||||||
|
// Do so in both speck.cpp and speck-simd.cpp.
|
||||||
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
||||||
|
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
// Disable NEON/ASIMD for Cortex-A53 and A57. The shifts are too slow and C/C++ is about
|
||||||
|
// 3 cpb faster than NEON/ASIMD. Also see http://github.com/weidai11/cryptopp/issues/367.
|
||||||
|
#if (defined(__aarch32__) || defined(__aarch64__)) && defined(CRYPTOPP_SLOW_ARMV8_SHIFT)
|
||||||
|
# undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
#endif
|
||||||
|
|
||||||
ANONYMOUS_NAMESPACE_BEGIN
|
ANONYMOUS_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
|
@ -167,6 +175,14 @@ ANONYMOUS_NAMESPACE_END
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
extern size_t SPECK128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||||
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||||
|
|
||||||
|
extern size_t SPECK128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
|
||||||
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
extern size_t SPECK128_Enc_AdvancedProcessBlocks_SSSE3(const word64* subKeys, size_t rounds,
|
||||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||||
|
|
@ -336,23 +352,35 @@ void SPECK128::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock
|
||||||
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[2])(m_wspace[3]);
|
OutBlock oblk(xorBlock, outBlock); oblk(m_wspace[2])(m_wspace[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
|
#if defined(CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS)
|
||||||
size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
|
size_t SPECK128::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||||
|
byte *outBlocks, size_t length, word32 flags) const
|
||||||
{
|
{
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
if (HasSSSE3())
|
if (HasSSSE3())
|
||||||
return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
|
return SPECK128_Enc_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
|
||||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
#endif
|
||||||
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
if (HasNEON())
|
||||||
|
return SPECK128_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||||
|
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
#endif
|
#endif
|
||||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const
|
size_t SPECK128::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||||
|
byte *outBlocks, size_t length, word32 flags) const
|
||||||
{
|
{
|
||||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
if (HasSSSE3())
|
if (HasSSSE3())
|
||||||
return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
|
return SPECK128_Dec_AdvancedProcessBlocks_SSSE3(m_rkeys, (size_t)m_rounds,
|
||||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
|
#endif
|
||||||
|
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
|
if (HasNEON())
|
||||||
|
return SPECK128_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds,
|
||||||
|
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
#endif
|
#endif
|
||||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
8
speck.h
8
speck.h
|
|
@ -16,6 +16,10 @@
|
||||||
#include "seckey.h"
|
#include "seckey.h"
|
||||||
#include "secblock.h"
|
#include "secblock.h"
|
||||||
|
|
||||||
|
#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64
|
||||||
|
# define CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS 1
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(CryptoPP)
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
//! \class SPECK_Info
|
//! \class SPECK_Info
|
||||||
|
|
@ -142,7 +146,7 @@ public:
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
|
#if CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS
|
||||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
@ -155,7 +159,7 @@ public:
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||||
#if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64
|
#if CRYPTOPP_SPECK_ADVANCED_PROCESS_BLOCKS
|
||||||
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
size_t AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue