Add LEA-128 NEON and ARMv8 implementation (GH #669)
LEA-128(128) from 35.6 cpb to 14.11 cpb on a LeMaker HiKey dev-board. LEA-128 from 12.60 cpb to 11.89 cpb on AMD Opteron 1100.pull/676/head
parent
80ae9f4f0a
commit
9980d30734
|
|
@ -164,6 +164,7 @@ keccak.cpp
|
|||
keccak.h
|
||||
lubyrack.h
|
||||
lea.cpp
|
||||
lea-simd.cpp
|
||||
lea.h
|
||||
luc.cpp
|
||||
luc.h
|
||||
|
|
|
|||
|
|
@ -378,6 +378,7 @@ ifeq ($(IS_NEON),1)
|
|||
GCM_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||
ARIA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||
BLAKE2_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||
LEA_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||
SIMON_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||
SPECK_FLAG = -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon
|
||||
endif
|
||||
|
|
@ -388,6 +389,7 @@ ifeq ($(IS_ARMV8),1)
|
|||
ifeq ($(HAVE_NEON),1)
|
||||
ARIA_FLAG = -march=armv8-a
|
||||
BLAKE2_FLAG = -march=armv8-a
|
||||
LEA_FLAG = -march=armv8-a
|
||||
NEON_FLAG = -march=armv8-a
|
||||
SIMON_FLAG = -march=armv8-a
|
||||
SPECK_FLAG = -march=armv8-a
|
||||
|
|
|
|||
|
|
@ -224,6 +224,7 @@ ifeq ($(IS_ARMv7),1)
|
|||
GCM_FLAG = -march=armv7-a
|
||||
ARIA_FLAG = -march=armv7-a
|
||||
BLAKE2_FLAG = -march=armv7-a
|
||||
LEA_FLAG = -march=armv7-a
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
@ -234,6 +235,7 @@ ifeq ($(IS_NEON),1)
|
|||
GCM_FLAG += -mfpu=neon
|
||||
ARIA_FLAG += -mfpu=neon
|
||||
BLAKE2_FLAG += -mfpu=neon
|
||||
LEA_FLAG += -mfpu=neon
|
||||
SIMON_FLAG += -mfpu=neon
|
||||
SPECK_FLAG += -mfpu=neon
|
||||
ifeq ($(IS_ANDROID),1)
|
||||
|
|
@ -242,6 +244,7 @@ ifeq ($(IS_NEON),1)
|
|||
GCM_FLAG += -mfloat-abi=softfp
|
||||
ARIA_FLAG += -mfloat-abi=softfp
|
||||
BLAKE2_FLAG += -mfloat-abi=softfp
|
||||
LEA_FLAG += -mfloat-abi=softfp
|
||||
SIMON_FLAG += -mfloat-abi=softfp
|
||||
SPECK_FLAG += -mfloat-abi=softfp
|
||||
endif
|
||||
|
|
@ -255,6 +258,7 @@ ifneq ($(IS_ARMv8),0)
|
|||
ifeq ($(IS_NEON),1)
|
||||
ARIA_FLAG = -march=armv8-a
|
||||
BLAKE2_FLAG = -march=armv8-a
|
||||
LEA_FLAG = -march=armv8-a
|
||||
NEON_FLAG = -march=armv8-a
|
||||
SIMON_FLAG = -march=armv8-a
|
||||
SPECK_FLAG = -march=armv8-a
|
||||
|
|
@ -505,7 +509,7 @@ crc-simd.o : crc-simd.cpp
|
|||
gcm-simd.o : gcm-simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
|
||||
|
||||
# SSSE3 available
|
||||
# SSSE3 or ARMv8a available
|
||||
lea-simd.o : lea-simd.cpp
|
||||
$(CXX) $(strip $(CXXFLAGS) $(LEA_FLAG) -c) $<
|
||||
|
||||
|
|
|
|||
142
adv-simd.h
142
adv-simd.h
|
|
@ -18,6 +18,7 @@
|
|||
// * AdvancedProcessBlocks64_6x2_SSE
|
||||
// * AdvancedProcessBlocks128_6x2_SSE
|
||||
// * AdvancedProcessBlocks64_6x2_NEON
|
||||
// * AdvancedProcessBlocks128_4x1_NEON
|
||||
// * AdvancedProcessBlocks128_6x2_NEON
|
||||
// * AdvancedProcessBlocks64_6x2_ALTIVEC
|
||||
// * AdvancedProcessBlocks128_6x2_ALTIVEC
|
||||
|
|
@ -489,6 +490,147 @@ inline size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
|
|||
return length;
|
||||
}
|
||||
|
||||
/// \brief AdvancedProcessBlocks for 1 and 4 blocks
|
||||
/// \tparam F1 function to process 1 128-bit block
|
||||
/// \tparam F4 function to process 4 128-bit blocks
|
||||
/// \tparam W word type of the subkey table
|
||||
/// \tparam V vector type of the NEON data type
|
||||
/// \details AdvancedProcessBlocks128_6x2_NEON processes 4 and 1 NEON SIMD words
|
||||
/// at a time.
|
||||
/// \details The subkey type is usually word32 or word64. V is the vector type and it is
|
||||
/// usually uint32x4_t or uint64x2_t. F1, F4, W and V must use the same word and
|
||||
/// vector type.
|
||||
template <typename F1, typename F4, typename W, typename V>
|
||||
inline size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4,
|
||||
const V& unused, const W *subKeys, size_t rounds, const byte *inBlocks,
|
||||
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
CRYPTOPP_ASSERT(subKeys);
|
||||
CRYPTOPP_ASSERT(inBlocks);
|
||||
CRYPTOPP_ASSERT(outBlocks);
|
||||
CRYPTOPP_ASSERT(length >= 16);
|
||||
CRYPTOPP_UNUSED(unused);
|
||||
|
||||
#if defined(CRYPTOPP_LITTLE_ENDIAN)
|
||||
const word32 s_one32x4[] = {0, 0, 0, 1<<24};
|
||||
#else
|
||||
const word32 s_one32x4[] = {0, 0, 0, 1};
|
||||
#endif
|
||||
|
||||
const ptrdiff_t blockSize = 16;
|
||||
// const ptrdiff_t neonBlockSize = 16;
|
||||
|
||||
ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
|
||||
ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
|
||||
ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
|
||||
|
||||
// Clang and Coverity are generating findings using xorBlocks as a flag.
|
||||
const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
|
||||
const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
|
||||
|
||||
if (flags & BT_ReverseDirection)
|
||||
{
|
||||
inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
|
||||
xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
|
||||
outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
|
||||
inIncrement = 0-inIncrement;
|
||||
xorIncrement = 0-xorIncrement;
|
||||
outIncrement = 0-outIncrement;
|
||||
}
|
||||
|
||||
if (flags & BT_AllowParallel)
|
||||
{
|
||||
while (length >= 4*blockSize)
|
||||
{
|
||||
uint64x2_t block0, block1, block2, block3, block4, block5;
|
||||
if (flags & BT_InBlockIsCounter)
|
||||
{
|
||||
const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
|
||||
block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
|
||||
|
||||
block1 = vaddq_u64(block0, be);
|
||||
block2 = vaddq_u64(block1, be);
|
||||
block3 = vaddq_u64(block2, be);
|
||||
vst1q_u8(const_cast<byte*>(inBlocks),
|
||||
vreinterpretq_u8_u64(vaddq_u64(block3, be)));
|
||||
}
|
||||
else
|
||||
{
|
||||
block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
|
||||
inBlocks += inIncrement;
|
||||
block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
|
||||
inBlocks += inIncrement;
|
||||
block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
|
||||
inBlocks += inIncrement;
|
||||
block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
|
||||
inBlocks += inIncrement;
|
||||
}
|
||||
|
||||
if (xorInput)
|
||||
{
|
||||
block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
}
|
||||
|
||||
func4((V&)block0, (V&)block1, (V&)block2, (V&)block3, subKeys, static_cast<unsigned int>(rounds));
|
||||
|
||||
if (xorOutput)
|
||||
{
|
||||
block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
xorBlocks += xorIncrement;
|
||||
}
|
||||
|
||||
vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
|
||||
outBlocks += outIncrement;
|
||||
vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
|
||||
outBlocks += outIncrement;
|
||||
vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
|
||||
outBlocks += outIncrement;
|
||||
vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
|
||||
outBlocks += outIncrement;
|
||||
|
||||
length -= 4*blockSize;
|
||||
}
|
||||
}
|
||||
|
||||
while (length >= blockSize)
|
||||
{
|
||||
uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
|
||||
|
||||
if (xorInput)
|
||||
block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
|
||||
if (flags & BT_InBlockIsCounter)
|
||||
const_cast<byte *>(inBlocks)[15]++;
|
||||
|
||||
func1( (V&)block, subKeys, static_cast<unsigned int>(rounds));
|
||||
|
||||
if (xorOutput)
|
||||
block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
|
||||
|
||||
vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
|
||||
|
||||
inBlocks += inIncrement;
|
||||
outBlocks += outIncrement;
|
||||
xorBlocks += xorIncrement;
|
||||
length -= blockSize;
|
||||
}
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
/// \brief AdvancedProcessBlocks for 2 and 6 blocks
|
||||
/// \tparam F2 function to process 2 128-bit blocks
|
||||
/// \tparam F6 function to process 6 128-bit blocks
|
||||
|
|
|
|||
505
lea-simd.cpp
505
lea-simd.cpp
|
|
@ -27,6 +27,10 @@
|
|||
# include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
# include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many
|
||||
// compilers don't follow ACLE conventions for the include.
|
||||
#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE)
|
||||
|
|
@ -38,6 +42,489 @@ ANONYMOUS_NAMESPACE_BEGIN
|
|||
|
||||
using CryptoPP::word32;
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
|
||||
inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b)
|
||||
{
|
||||
return veorq_u32(a, b);
|
||||
}
|
||||
|
||||
inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b)
|
||||
{
|
||||
return vaddq_u32(a, b);
|
||||
}
|
||||
|
||||
inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b)
|
||||
{
|
||||
return vsubq_u32(a, b);
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline uint32x4_t RotateLeft(const uint32x4_t& val)
|
||||
{
|
||||
const uint32x4_t a(vshlq_n_u32(val, R));
|
||||
const uint32x4_t b(vshrq_n_u32(val, 32 - R));
|
||||
return vorrq_u32(a, b);
|
||||
}
|
||||
|
||||
template <unsigned int R>
|
||||
inline uint32x4_t RotateRight(const uint32x4_t& val)
|
||||
{
|
||||
const uint32x4_t a(vshlq_n_u32(val, 32 - R));
|
||||
const uint32x4_t b(vshrq_n_u32(val, R));
|
||||
return vorrq_u32(a, b);
|
||||
}
|
||||
|
||||
#if defined(__aarch32__) || defined(__aarch64__)
|
||||
template <>
|
||||
inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
|
||||
{
|
||||
#if defined(CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
|
||||
const uint8x16_t mask = vld1q_u8(maskb);
|
||||
#else
|
||||
const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
|
||||
const uint8x16_t mask = vld1q_u8(maskb);
|
||||
#endif
|
||||
|
||||
return vreinterpretq_u32_u8(
|
||||
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
|
||||
{
|
||||
#if defined(CRYPTOPP_BIG_ENDIAN)
|
||||
const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
|
||||
const uint8x16_t mask = vld1q_u8(maskb);
|
||||
#else
|
||||
const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 };
|
||||
const uint8x16_t mask = vld1q_u8(maskb);
|
||||
#endif
|
||||
|
||||
return vreinterpretq_u32_u8(
|
||||
vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
|
||||
}
|
||||
#endif
|
||||
|
||||
uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b)
|
||||
{
|
||||
uint32x2_t a1 = vget_low_u32(a);
|
||||
uint32x2_t b1 = vget_low_u32(b);
|
||||
uint32x2x2_t result = vzip_u32(a1, b1);
|
||||
return vcombine_u32(result.val[0], result.val[1]);
|
||||
}
|
||||
|
||||
uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b)
|
||||
{
|
||||
uint32x2_t a1 = vget_high_u32(a);
|
||||
uint32x2_t b1 = vget_high_u32(b);
|
||||
uint32x2x2_t result = vzip_u32(a1, b1);
|
||||
return vcombine_u32(result.val[0], result.val[1]);
|
||||
}
|
||||
|
||||
uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b)
|
||||
{
|
||||
uint64x1_t a1 = vget_low_u64((uint64x2_t)a);
|
||||
uint64x1_t b1 = vget_low_u64((uint64x2_t)b);
|
||||
return (uint32x4_t)vcombine_u64(a1, b1);
|
||||
}
|
||||
|
||||
uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
|
||||
{
|
||||
uint64x1_t a1 = vget_high_u64((uint64x2_t)a);
|
||||
uint64x1_t b1 = vget_high_u64((uint64x2_t)b);
|
||||
return (uint32x4_t)vcombine_u64(a1, b1);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline uint32x4_t LoadKey(const word32 rkey[])
|
||||
{
|
||||
return vdupq_n_u32(rkey[IDX]);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_ASSERT(0);;
|
||||
return vmovq_n_u32(0);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
|
||||
{
|
||||
// LEA is little-endian oriented, so there is no need for a separate shuffle.
|
||||
const uint32x4_t r1 = UnpackLow32(a, b);
|
||||
const uint32x4_t r2 = UnpackLow32(c, d);
|
||||
return UnpackLow64(r1, r2);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
|
||||
{
|
||||
// LEA is little-endian oriented, so there is no need for a separate shuffle.
|
||||
const uint32x4_t r1 = UnpackLow32(a, b);
|
||||
const uint32x4_t r2 = UnpackLow32(c, d);
|
||||
return UnpackHigh64(r1, r2);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
|
||||
{
|
||||
// LEA is little-endian oriented, so there is no need for a separate shuffle.
|
||||
const uint32x4_t r1 = UnpackHigh32(a, b);
|
||||
const uint32x4_t r2 = UnpackHigh32(c, d);
|
||||
return UnpackLow64(r1, r2);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
|
||||
{
|
||||
// LEA is little-endian oriented, so there is no need for a separate shuffle.
|
||||
const uint32x4_t r1 = UnpackHigh32(a, b);
|
||||
const uint32x4_t r2 = UnpackHigh32(c, d);
|
||||
return UnpackHigh64(r1, r2);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline uint32x4_t UnpackNEON(const uint32x4_t& v)
|
||||
{
|
||||
// Should not be instantiated
|
||||
CRYPTOPP_ASSERT(0);;
|
||||
return vmovq_n_u32(0);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v)
|
||||
{
|
||||
// Splat to all lanes
|
||||
return vdupq_n_u32(vgetq_lane_u32(v, 0));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v)
|
||||
{
|
||||
// Splat to all lanes
|
||||
return vdupq_n_u32(vgetq_lane_u32(v, 1));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v)
|
||||
{
|
||||
// Splat to all lanes
|
||||
return vdupq_n_u32(vgetq_lane_u32(v, 2));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v)
|
||||
{
|
||||
// Splat to all lanes
|
||||
return vdupq_n_u32(vgetq_lane_u32(v, 3));
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
|
||||
{
|
||||
return UnpackNEON<IDX>(a, b, c, d);
|
||||
}
|
||||
|
||||
template <unsigned int IDX>
|
||||
inline uint32x4_t RepackNEON(const uint32x4_t& v)
|
||||
{
|
||||
return UnpackNEON<IDX>(v);
|
||||
}
|
||||
|
||||
void LEA_Encryption(uint32x4_t temp[4], const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
|
||||
temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
|
||||
temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
|
||||
temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
|
||||
temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
|
||||
temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
|
||||
temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
|
||||
temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
|
||||
temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
|
||||
temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
|
||||
temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
|
||||
temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
|
||||
|
||||
temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
|
||||
temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
|
||||
temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
|
||||
temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
|
||||
temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
|
||||
temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
|
||||
temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
|
||||
temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
|
||||
temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
|
||||
temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
|
||||
temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
|
||||
temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
|
||||
|
||||
temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
|
||||
temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
|
||||
temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
|
||||
temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
|
||||
temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
|
||||
temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
|
||||
temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
|
||||
temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
|
||||
temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
|
||||
temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
|
||||
temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
|
||||
temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
|
||||
|
||||
temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
|
||||
temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
|
||||
temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
|
||||
temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
|
||||
temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
|
||||
temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
|
||||
temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
|
||||
temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
|
||||
temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
|
||||
temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
|
||||
temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
|
||||
temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
|
||||
|
||||
temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
|
||||
temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
|
||||
temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
|
||||
temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
|
||||
temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
|
||||
temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
|
||||
temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
|
||||
temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
|
||||
temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
|
||||
temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
|
||||
temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
|
||||
temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
|
||||
|
||||
temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
|
||||
temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
|
||||
temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
|
||||
temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
|
||||
temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
|
||||
temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
|
||||
temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
|
||||
temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
|
||||
temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
|
||||
temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
|
||||
temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
|
||||
temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
|
||||
|
||||
if(rounds > 24)
|
||||
{
|
||||
temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
|
||||
temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
|
||||
temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
|
||||
temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
|
||||
temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
|
||||
temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
|
||||
temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
|
||||
temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
|
||||
temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
|
||||
temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
|
||||
temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
|
||||
temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
|
||||
}
|
||||
|
||||
if(rounds > 28)
|
||||
{
|
||||
temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
|
||||
temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
|
||||
temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
|
||||
temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
|
||||
temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
|
||||
temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
|
||||
temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
|
||||
temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
|
||||
temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
|
||||
temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
|
||||
temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
|
||||
temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
|
||||
}
|
||||
}
|
||||
|
||||
void LEA_Decryption(uint32x4_t temp[4], const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
if(rounds > 28)
|
||||
{
|
||||
temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
|
||||
}
|
||||
|
||||
if(rounds > 24)
|
||||
{
|
||||
temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
|
||||
}
|
||||
|
||||
temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
|
||||
|
||||
temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
|
||||
|
||||
temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
|
||||
|
||||
temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
|
||||
|
||||
temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
|
||||
|
||||
temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
|
||||
temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
|
||||
temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
|
||||
temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
|
||||
temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
|
||||
}
|
||||
|
||||
inline void LEA_Enc_Block(uint32x4_t &block0,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
uint32x4_t temp[4];
|
||||
temp[0] = UnpackNEON<0>(block0);
|
||||
temp[1] = UnpackNEON<1>(block0);
|
||||
temp[2] = UnpackNEON<2>(block0);
|
||||
temp[3] = UnpackNEON<3>(block0);
|
||||
|
||||
LEA_Encryption(temp, subkeys, rounds);
|
||||
|
||||
block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
|
||||
}
|
||||
|
||||
inline void LEA_Dec_Block(uint32x4_t &block0,
|
||||
const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
uint32x4_t temp[4];
|
||||
temp[0] = UnpackNEON<0>(block0);
|
||||
temp[1] = UnpackNEON<1>(block0);
|
||||
temp[2] = UnpackNEON<2>(block0);
|
||||
temp[3] = UnpackNEON<3>(block0);
|
||||
|
||||
LEA_Decryption(temp, subkeys, rounds);
|
||||
|
||||
block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
|
||||
}
|
||||
|
||||
inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
uint32x4_t temp[4];
|
||||
temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
|
||||
temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
|
||||
temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
|
||||
temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
|
||||
|
||||
LEA_Encryption(temp, subkeys, rounds);
|
||||
|
||||
block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
|
||||
block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
|
||||
block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
|
||||
block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
|
||||
}
|
||||
|
||||
inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
|
||||
uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
|
||||
{
|
||||
uint32x4_t temp[4];
|
||||
temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
|
||||
temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
|
||||
temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
|
||||
temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
|
||||
|
||||
LEA_Decryption(temp, subkeys, rounds);
|
||||
|
||||
block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
|
||||
block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
|
||||
block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
|
||||
block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
|
||||
}
|
||||
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
|
||||
inline __m128i Xor(const __m128i& a, const __m128i& b)
|
||||
|
|
@ -504,4 +991,22 @@ size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
|||
}
|
||||
#endif // CRYPTOPP_SSSE3_AVAILABLE
|
||||
|
||||
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
uint32x4_t unused; // Avoid template argument deduction/substitution failures
|
||||
return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks,
|
||||
unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
|
||||
{
|
||||
uint32x4_t unused; // Avoid template argument deduction/substitution failures
|
||||
return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks,
|
||||
unused, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
|
||||
|
||||
NAMESPACE_END
|
||||
|
|
|
|||
26
lea.cpp
26
lea.cpp
|
|
@ -556,6 +556,7 @@ inline void SetKey256(word32 rkey[192], const word32 key[8])
|
|||
NAMESPACE_BEGIN(CryptoPP)
|
||||
|
||||
#if CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
|
||||
# if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||
extern void LEA_SplatKeys_SSSE3(SecBlock<word32>& rkeys);
|
||||
|
||||
extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
|
|
@ -563,6 +564,15 @@ extern size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t
|
|||
|
||||
extern size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
# endif
|
||||
|
||||
# if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
extern size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
|
||||
extern size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
|
||||
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, const NameValuePairs ¶ms)
|
||||
|
|
@ -596,7 +606,7 @@ void LEA::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, con
|
|||
CRYPTOPP_ASSERT(0);;
|
||||
}
|
||||
|
||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
#if (CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS) && (CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3())
|
||||
{
|
||||
// If we pre-splat the round keys at setup then we avoid a shuffle
|
||||
|
|
@ -850,20 +860,34 @@ void LEA::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byt
|
|||
size_t LEA::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return LEA_Enc_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return LEA_Enc_AdvancedProcessBlocks_NEON(m_rkey, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
|
||||
size_t LEA::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks,
|
||||
byte *outBlocks, size_t length, word32 flags) const
|
||||
{
|
||||
#if defined(CRYPTOPP_SSSE3_AVAILABLE)
|
||||
if (HasSSSE3()) {
|
||||
return LEA_Dec_AdvancedProcessBlocks_SSSE3(m_rkey, m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
return LEA_Dec_AdvancedProcessBlocks_NEON(m_rkey, (size_t)m_rounds,
|
||||
inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
#endif
|
||||
return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags);
|
||||
}
|
||||
#endif // CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS
|
||||
|
|
|
|||
2
lea.h
2
lea.h
|
|
@ -15,7 +15,7 @@
|
|||
#include "secblock.h"
|
||||
#include "algparam.h"
|
||||
|
||||
#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86)
|
||||
#if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64)
|
||||
# define CRYPTOPP_LEA_ADVANCED_PROCESS_BLOCKS 1
|
||||
#endif
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue