Update comments

parent 8a5911e6eb
commit e90cc9a028
@@ -703,7 +703,6 @@ inline size_t AdvancedProcessBlocks64_SSE2x6(F2 func2, F6 func6,
     if (length)
     {
         // Adjust to real block size
-        const size_t blockSize = 8;
         if (flags & BT_ReverseDirection)
         {
             inIncrement += inIncrement ? blockSize : 0;
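
Note: the removed `blockSize` local is the 8-byte block size of the 64-bit ciphers, and the surviving increment adjustment is what lets the routine start at the last block and step backwards when `BT_ReverseDirection` is set. A minimal, self-contained sketch of that reverse-direction pointer arithmetic follows; the function name, the plain-C loop and the memcpy stand-in are illustrative assumptions, not the library's actual code.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative only: walk a buffer block-by-block, forwards or in reverse,
// mirroring the increment adjustment visible in the hunk above.
void ProcessBlocks(const uint8_t* in, uint8_t* out, size_t length, bool reverse)
{
    const size_t blockSize = 8;          // 64-bit block ciphers use 8-byte blocks
    ptrdiff_t inIncrement = blockSize;   // normal forward stride
    ptrdiff_t outIncrement = blockSize;

    if (reverse)
    {
        // Start at the last whole block and step backwards.
        in  += length - blockSize;
        out += length - blockSize;
        inIncrement  = -inIncrement;
        outIncrement = -outIncrement;
    }

    while (length >= blockSize)
    {
        std::memcpy(out, in, blockSize); // stand-in for the real block transform
        in  += inIncrement;
        out += outIncrement;
        length -= blockSize;
    }
}
```
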
config.h (2 changes)
@@ -562,7 +562,7 @@ NAMESPACE_END
 // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
 // Android still uses ARMv5 and ARMv6 so we have to be conservative when enabling NEON.
 #if !defined(CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_NEON) || defined(__ARM_FEATURE_NEON) || defined(__ARM_FEATURE_ASIMD) || \
+# if defined(__ARM_NEON) || defined(__ARM_NEON_FP) || defined(__ARM_FEATURE_NEON) || defined(__ARM_FEATURE_ASIMD) || \
      (__ARM_ARCH >= 7) || (CRYPTOPP_MSC_VERSION >= 1700)
 # define CRYPTOPP_ARM_NEON_AVAILABLE 1
 # endif
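
The practical effect of the config.h change is that a toolchain which only predefines `__ARM_NEON_FP` now also gets `CRYPTOPP_ARM_NEON_AVAILABLE`. A minimal sketch of how a source file typically keys off that macro; the helper below is hypothetical and only the macro name comes from the diff.

```cpp
#include <cstdint>

#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
# include <arm_neon.h>
#endif

// Hypothetical helper: add four 32-bit lanes with NEON when the config.h
// detection fired, otherwise fall back to portable C++.
inline void AddBlocks(const uint32_t a[4], const uint32_t b[4], uint32_t r[4])
{
#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
    vst1q_u32(r, vaddq_u32(vld1q_u32(a), vld1q_u32(b)));
#else
    for (unsigned int i = 0; i < 4; ++i)
        r[i] = a[i] + b[i];
#endif
}
```
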
@@ -51,7 +51,6 @@ using CryptoPP::rotrFixed;
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, R));
     const uint32x4_t b(vshrq_n_u32(val, 32 - R));
     return vorrq_u32(a, b);
@@ -60,7 +59,6 @@ inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 template <unsigned int R>
 inline uint32x4_t RotateRight32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, 32 - R));
     const uint32x4_t b(vshrq_n_u32(val, R));
     return vorrq_u32(a, b);
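
The dropped `CRYPTOPP_ASSERT` only documented the template constraint; the rotate itself is the usual shift-left/shift-right/OR idiom, which is well formed only for 0 < R < 32. A small self-contained sketch with an assumed test value, compilable on a NEON-capable toolchain:

```cpp
#include <arm_neon.h>
#include <cstdio>

// Shift/OR rotate of each 32-bit lane, as in the hunk above.
// Only valid for 0 < R < 32 so both shift immediates stay in range.
template <unsigned int R>
inline uint32x4_t RotateLeft32(const uint32x4_t& val)
{
    const uint32x4_t a(vshlq_n_u32(val, R));
    const uint32x4_t b(vshrq_n_u32(val, 32 - R));
    return vorrq_u32(a, b);
}

int main()
{
    const uint32x4_t x = vdupq_n_u32(0x12345678);
    const uint32x4_t r = RotateLeft32<8>(x);
    // Every lane should now hold 0x34567812.
    std::printf("0x%08x\n", (unsigned int)vgetq_lane_u32(r, 0));
    return 0;
}
```
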
@@ -124,9 +122,8 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];

     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
@@ -150,9 +147,8 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);

     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }

 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
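
Both variants compute the same permutation; the new form simply re-issues `vuzpq_u32`/`vzipq_u32` and picks one half, rather than keeping a `uint32x4x2_t` temporary. A short sketch of the round trip the comments describe, using assumed input data in place of real cipher blocks:

```cpp
#include <arm_neon.h>
#include <cstdio>

int main()
{
    // Two assumed "blocks" [A1 A2 A3 A4] and [B1 B2 B3 B4].
    const uint32_t a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
    uint32x4_t block0 = vld1q_u32(a), block1 = vld1q_u32(b);

    // De-interleave: x1 = [A1 A3 B1 B3], y1 = [A2 A4 B2 B4].
    const uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
    const uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];

    // ... the cipher's round function would transform x1/y1 here ...

    // Re-interleave back to [A1 A2 A3 A4][B1 B2 B3 B4].
    block0 = vzipq_u32(x1, y1).val[0];
    block1 = vzipq_u32(x1, y1).val[1];

    uint32_t out[8];
    vst1q_u32(out, block0);
    vst1q_u32(out + 4, block1);
    for (int i = 0; i < 8; ++i)
        std::printf("%u ", out[i]);   // expect: 1 2 3 4 5 6 7 8
    std::printf("\n");
    return 0;
}
```
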
@@ -163,9 +159,8 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];

     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
@@ -190,9 +185,8 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);

     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }

 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -204,17 +198,12 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
-
-    const uint32x4x2_t t1 = vuzpq_u32(block2, block3);
-    uint32x4_t x2 = t1.val[0];
-    uint32x4_t y2 = t1.val[1];
-
-    const uint32x4x2_t t2 = vuzpq_u32(block4, block5);
-    uint32x4_t x3 = t2.val[0];
-    uint32x4_t y3 = t2.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];

     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
@@ -248,17 +237,12 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);

     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t3 = vzipq_u32(x1, y1);
-    block0 = t3.val[0];
-    block1 = t3.val[1];
-
-    const uint32x4x2_t t4 = vzipq_u32(x2, y2);
-    block2 = t4.val[0];
-    block3 = t4.val[1];
-
-    const uint32x4x2_t t5 = vzipq_u32(x3, y3);
-    block4 = t5.val[0];
-    block5 = t5.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
+    block2 = vzipq_u32(x2, y2).val[0];
+    block3 = vzipq_u32(x2, y2).val[1];
+    block4 = vzipq_u32(x3, y3).val[0];
+    block5 = vzipq_u32(x3, y3).val[1];
 }

 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -270,17 +254,12 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
-
-    const uint32x4x2_t t1 = vuzpq_u32(block2, block3);
-    uint32x4_t x2 = t1.val[0];
-    uint32x4_t y2 = t1.val[1];
-
-    const uint32x4x2_t t2 = vuzpq_u32(block4, block5);
-    uint32x4_t x3 = t2.val[0];
-    uint32x4_t y3 = t2.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];

     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
@@ -315,17 +294,12 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);

     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t3 = vzipq_u32(x1, y1);
-    block0 = t3.val[0];
-    block1 = t3.val[1];
-
-    const uint32x4x2_t t4 = vzipq_u32(x2, y2);
-    block2 = t4.val[0];
-    block3 = t4.val[1];
-
-    const uint32x4x2_t t5 = vzipq_u32(x3, y3);
-    block4 = t5.val[0];
-    block5 = t5.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
+    block2 = vzipq_u32(x2, y2).val[0];
+    block3 = vzipq_u32(x2, y2).val[1];
+    block4 = vzipq_u32(x3, y3).val[0];
+    block5 = vzipq_u32(x3, y3).val[1];
 }

 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
@@ -351,7 +325,6 @@ inline T UnpackLow64(const T& a, const T& b)
 template <unsigned int R>
 inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, R));
     const uint64x2_t b(vshrq_n_u64(val, 64 - R));
     return vorrq_u64(a, b);
@@ -360,7 +333,6 @@ inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 template <unsigned int R>
 inline uint64x2_t RotateRight64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, 64 - R));
     const uint64x2_t b(vshrq_n_u64(val, R));
     return vorrq_u64(a, b);
@@ -18,8 +18,12 @@
 // #undef CRYPTOPP_SSE41_AVAILABLE
 // #undef CRYPTOPP_ARM_NEON_AVAILABLE

-// GCC generates bad code when using the table-based rotates
-#if defined(__aarch32__) || defined(__aarch64__)
+// GCC generates bad code when using the table-based 32-bit rotates. Or,
+// GAS assembles it incorrectly (this may be the case since both GCC and
+// Clang produce the same failure). SIMON uses the same code but with a
+// different round function, and SIMON is OK. Jake Lee warned about this
+// at http://stackoverflow.com/q/47617331/608639.
+#if (defined(__aarch32__) || defined(__aarch64__)) && defined(__GNUC__)
 # define WORKAROUND_GCC_AARCH64_BUG 1
 #endif
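
For reference, a "table-based rotate" performs a rotation whose amount is a multiple of 8 bits with a byte permutation instead of two shifts and an OR. The sketch below shows the shape of such a rotate for 32-bit lanes on AArch64; it is an assumed illustration of the technique the comment refers to, not the code the workaround disables.

```cpp
#include <arm_neon.h>

#if defined(__aarch64__)
// Rotate each 32-bit lane left by 8 bits via a byte permutation table;
// equivalent to the shift/OR form RotateLeft32<8>. On little-endian,
// output byte i of a lane comes from input byte (i + 3) % 4 of that lane.
inline uint32x4_t RotateLeft32_8_Table(const uint32x4_t& val)
{
    static const uint8_t idx[16] = {
        3, 0, 1, 2,   7, 4, 5, 6,   11, 8, 9, 10,   15, 12, 13, 14
    };
    const uint8x16_t r = vqtbl1q_u8(vreinterpretq_u8_u32(val), vld1q_u8(idx));
    return vreinterpretq_u32_u8(r);
}
#endif
```
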
@@ -54,7 +58,6 @@ using CryptoPP::word64;
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, R));
     const uint32x4_t b(vshrq_n_u32(val, 32 - R));
     return vorrq_u32(a, b);
@@ -63,7 +66,6 @@ inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 template <unsigned int R>
 inline uint32x4_t RotateRight32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, 32 - R));
     const uint32x4_t b(vshrq_n_u32(val, R));
     return vorrq_u32(a, b);
@@ -120,9 +122,8 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];

     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
@@ -140,9 +141,8 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);

     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }

 inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
@@ -152,9 +152,8 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];

     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
@@ -172,9 +171,8 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);

     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }

 inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -188,10 +186,8 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
     uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
     uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-
     uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
     uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
@@ -227,10 +223,8 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = vzipq_u32(x1, y1).val[0];
     block1 = vzipq_u32(x1, y1).val[1];
-
     block2 = vzipq_u32(x2, y2).val[0];
     block3 = vzipq_u32(x2, y2).val[1];
-
     block4 = vzipq_u32(x3, y3).val[0];
     block5 = vzipq_u32(x3, y3).val[1];
 }
@@ -246,10 +240,8 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
     uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
     uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-
     uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
     uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
@@ -285,10 +277,8 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = vzipq_u32(x1, y1).val[0];
     block1 = vzipq_u32(x1, y1).val[1];
-
     block2 = vzipq_u32(x2, y2).val[0];
     block3 = vzipq_u32(x2, y2).val[1];
-
     block4 = vzipq_u32(x3, y3).val[0];
     block5 = vzipq_u32(x3, y3).val[1];
 }
@@ -316,7 +306,6 @@ inline T UnpackLow64(const T& a, const T& b)
 template <unsigned int R>
 inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, R));
     const uint64x2_t b(vshrq_n_u64(val, 64 - R));
     return vorrq_u64(a, b);
@@ -325,7 +314,6 @@ inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 template <unsigned int R>
 inline uint64x2_t RotateRight64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, 64 - R));
     const uint64x2_t b(vshrq_n_u64(val, R));
     return vorrq_u64(a, b);