Update comments

pull/548/head
Jeffrey Walton 2017-12-10 05:41:19 -05:00
parent 8a5911e6eb
commit e90cc9a028
4 changed files with 47 additions and 88 deletions

View File

@@ -703,7 +703,6 @@ inline size_t AdvancedProcessBlocks64_SSE2x6(F2 func2, F6 func6,
     if (length)
     {
         // Adjust to real block size
-        const size_t blockSize = 8;
         if (flags & BT_ReverseDirection)
         {
             inIncrement += inIncrement ? blockSize : 0;
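
The hunk above only deletes a re-declaration: the remaining code still uses blockSize, so the constant is presumably declared earlier in the function. For readers without the surrounding source, here is a minimal sketch of the reverse-direction adjustment being kept. The flag value and function name are assumptions, not the library's definitions; SIMON64/SPECK64 blocks are 64 bits, hence the 8-byte block size.

    #include <cstddef>
    #include <cstdio>

    // Sketch only. BT_ReverseDirection's value is an assumption here; in
    // the library it is a real flag. blockSize is passed in rather than
    // being the function-local constant the hunk removes.
    enum { BT_ReverseDirection = 1 };

    void AdjustForReverse(std::size_t& inIncrement, std::size_t& outIncrement,
                          std::size_t blockSize, int flags)
    {
        if (flags & BT_ReverseDirection)
        {
            // Only non-zero strides are bumped; a zero stride means the
            // pointer does not advance between blocks.
            inIncrement  += inIncrement  ? blockSize : 0;
            outIncrement += outIncrement ? blockSize : 0;
        }
    }

    int main()
    {
        std::size_t in = 8, out = 0;
        AdjustForReverse(in, out, 8, BT_ReverseDirection);
        std::printf("in=%zu out=%zu\n", in, out);  // in=16 out=0
    }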

View File

@@ -562,7 +562,7 @@ NAMESPACE_END
 // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
 // Android still uses ARMv5 and ARMv6 so we have to be conservative when enabling NEON.
 #if !defined(CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_NEON) || defined(__ARM_FEATURE_NEON) || defined(__ARM_FEATURE_ASIMD) || \
+# if defined(__ARM_NEON) || defined(__ARM_NEON_FP) || defined(__ARM_FEATURE_NEON) || defined(__ARM_FEATURE_ASIMD) || \
     (__ARM_ARCH >= 7) || (CRYPTOPP_MSC_VERSION >= 1700)
 # define CRYPTOPP_ARM_NEON_AVAILABLE 1
 # endif
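
The change adds __ARM_NEON_FP to the detection. Note that ACLE defines __ARM_NEON_FP to advertise NEON floating-point support, so treating it as evidence of NEON integer SIMD is a heuristic. A minimal consumer sketch, not from the library, of how a translation unit typically uses the resulting macro:

    #include <cstddef>
    #include <cstdint>

    // Sketch only: test the macro before touching NEON intrinsics, and
    // provide a portable scalar path otherwise. Names are illustrative.
    #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
    # include <arm_neon.h>
    inline uint32x4_t XorBlock(uint32x4_t a, uint32x4_t b)
    {
        return veorq_u32(a, b);               // NEON: four 32-bit lanes at once
    }
    #else
    inline void XorBlock(std::uint32_t* a, const std::uint32_t* b, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i)   // scalar fallback
            a[i] ^= b[i];
    }
    #endif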

View File

@@ -51,7 +51,6 @@ using CryptoPP::rotrFixed;
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, R));
     const uint32x4_t b(vshrq_n_u32(val, 32 - R));
     return vorrq_u32(a, b);
@@ -60,7 +59,6 @@ inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 template <unsigned int R>
 inline uint32x4_t RotateRight32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, 32 - R));
     const uint32x4_t b(vshrq_n_u32(val, R));
     return vorrq_u32(a, b);
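
Both hunks above drop a runtime CRYPTOPP_ASSERT on the template parameter R. Because R is a compile-time constant and the vshlq_n_u32/vshrq_n_u32 immediates must be in range, an out-of-range R is presumably already rejected at compile time, which makes the runtime check redundant. A scalar model (ours, not the library's) of what each 32-bit lane computes, assuming 0 < R < 32 as the SIMON round function's rotations (1, 2 and 8) satisfy:

    #include <cstdint>
    #include <cstdio>

    // Per-lane effect of RotateLeft32/RotateRight32, modeled on plain words.
    template <unsigned int R>
    inline std::uint32_t RotL32(std::uint32_t v) { return (v << R) | (v >> (32 - R)); }

    template <unsigned int R>
    inline std::uint32_t RotR32(std::uint32_t v) { return (v >> R) | (v << (32 - R)); }

    int main()
    {
        std::printf("%08x\n", RotL32<8>(0x12345678u));            // 34567812
        std::printf("%08x\n", RotR32<8>(RotL32<8>(0x12345678u))); // 12345678
    }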
@@ -124,9 +122,8 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
@@ -150,9 +147,8 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }
 
 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
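
The pattern in these hunks is symmetric: vuzpq_u32 de-interleaves two loaded blocks into even-lane and odd-lane vectors on entry, and vzipq_u32 re-interleaves them on exit, exactly undoing it, which is what the [A1 A3 B1 B3][A2 A4 B2 B4] comments describe. A scalar model of that round trip, with plain arrays standing in for uint32x4_t (illustration only, not the library's code):

    #include <array>
    #include <cstdio>

    // Scalar model of vuzpq_u32/vzipq_u32: uzp splits two 4-lane vectors
    // into even and odd lanes, zip re-interleaves them; the pair is an
    // exact round trip.
    using V = std::array<int, 4>;

    void Uzp(const V& a, const V& b, V& even, V& odd)
    {
        even = { a[0], a[2], b[0], b[2] };   // vuzpq_u32(a, b).val[0]
        odd  = { a[1], a[3], b[1], b[3] };   // vuzpq_u32(a, b).val[1]
    }

    void Zip(const V& even, const V& odd, V& a, V& b)
    {
        a = { even[0], odd[0], even[1], odd[1] };  // vzipq_u32(e, o).val[0]
        b = { even[2], odd[2], even[3], odd[3] };  // vzipq_u32(e, o).val[1]
    }

    int main()
    {
        V a = {1, 2, 3, 4}, b = {5, 6, 7, 8}, e, o, a2, b2;
        Uzp(a, b, e, o);    // e = {1,3,5,7}, o = {2,4,6,8}
        Zip(e, o, a2, b2);  // a2 == a and b2 == b again
        std::printf("%d %d %d %d\n", a2[0], a2[1], b2[2], b2[3]);  // 1 2 7 8
    }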
@@ -163,9 +159,8 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
@@ -190,9 +185,8 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }
 
 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -204,17 +198,12 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
-
-    const uint32x4x2_t t1 = vuzpq_u32(block2, block3);
-    uint32x4_t x2 = t1.val[0];
-    uint32x4_t y2 = t1.val[1];
-
-    const uint32x4x2_t t2 = vuzpq_u32(block4, block5);
-    uint32x4_t x3 = t2.val[0];
-    uint32x4_t y3 = t2.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
@@ -248,17 +237,12 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t3 = vzipq_u32(x1, y1);
-    block0 = t3.val[0];
-    block1 = t3.val[1];
-
-    const uint32x4x2_t t4 = vzipq_u32(x2, y2);
-    block2 = t4.val[0];
-    block3 = t4.val[1];
-
-    const uint32x4x2_t t5 = vzipq_u32(x3, y3);
-    block4 = t5.val[0];
-    block5 = t5.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
+    block2 = vzipq_u32(x2, y2).val[0];
+    block3 = vzipq_u32(x2, y2).val[1];
+    block4 = vzipq_u32(x3, y3).val[0];
+    block5 = vzipq_u32(x3, y3).val[1];
 }
 
 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -270,17 +254,12 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
-
-    const uint32x4x2_t t1 = vuzpq_u32(block2, block3);
-    uint32x4_t x2 = t1.val[0];
-    uint32x4_t y2 = t1.val[1];
-
-    const uint32x4x2_t t2 = vuzpq_u32(block4, block5);
-    uint32x4_t x3 = t2.val[0];
-    uint32x4_t y3 = t2.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
@@ -315,17 +294,12 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t3 = vzipq_u32(x1, y1);
-    block0 = t3.val[0];
-    block1 = t3.val[1];
-
-    const uint32x4x2_t t4 = vzipq_u32(x2, y2);
-    block2 = t4.val[0];
-    block3 = t4.val[1];
-
-    const uint32x4x2_t t5 = vzipq_u32(x3, y3);
-    block4 = t5.val[0];
-    block5 = t5.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
+    block2 = vzipq_u32(x2, y2).val[0];
+    block3 = vzipq_u32(x2, y2).val[1];
+    block4 = vzipq_u32(x3, y3).val[0];
+    block5 = vzipq_u32(x3, y3).val[1];
 }
 
 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
@@ -351,7 +325,6 @@ inline T UnpackLow64(const T& a, const T& b)
 template <unsigned int R>
 inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, R));
     const uint64x2_t b(vshrq_n_u64(val, 64 - R));
     return vorrq_u64(a, b);
@@ -360,7 +333,6 @@ inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 template <unsigned int R>
 inline uint64x2_t RotateRight64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, 64 - R));
     const uint64x2_t b(vshrq_n_u64(val, R));
     return vorrq_u64(a, b);

View File

@@ -18,8 +18,12 @@
 // #undef CRYPTOPP_SSE41_AVAILABLE
 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
 
-// GCC generates bad code when using the table-based rotates
-#if defined(__aarch32__) || defined(__aarch64__)
+// GCC generates bad code when using the table-based 32-bit rotates. Or,
+// GAS assembles it incorrectly (this may be the case since both GCC and
+// Clang produce the same failure). SIMON uses the same code but with a
+// different round function, and SIMON is OK. Jake Lee warned about this
+// at http://stackoverflow.com/q/47617331/608639.
+#if (defined(__aarch32__) || defined(__aarch64__)) && defined(__GNUC__)
 # define WORKAROUND_GCC_AARCH64_BUG 1
 #endif
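
For context on the expanded comment above: a "table-based rotate" exploits the fact that rotating a 32-bit lane by a multiple of 8 is a pure byte permutation, so it can be done with a single table lookup instead of two shifts and an OR. A hedged sketch of the construct follows; the index table and function name are ours for illustration, not necessarily the exact code that miscompiled.

    #include <arm_neon.h>  // vqtbl1q_u8 is AArch64-only

    // Rotate each 32-bit lane left by 8 with one byte shuffle. With
    // little-endian lanes [b0 b1 b2 b3], (v << 8) | (v >> 24) yields
    // [b3 b0 b1 b2], which the index table below reproduces.
    inline uint32x4_t RotateLeft32By8_Table(uint32x4_t val)
    {
        const uint8_t idx[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
        const uint8x16_t tbl = vld1q_u8(idx);
        return vreinterpretq_u32_u8(
            vqtbl1q_u8(vreinterpretq_u8_u32(val), tbl));
    }

On AArch32 the same idea needs vtbl1/vtbl2 on 8-byte halves, since the quad-register table lookup is an A64 instruction.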
@@ -54,7 +58,6 @@ using CryptoPP::word64;
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, R));
     const uint32x4_t b(vshrq_n_u32(val, 32 - R));
     return vorrq_u32(a, b);
@@ -63,7 +66,6 @@ inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 template <unsigned int R>
 inline uint32x4_t RotateRight32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, 32 - R));
     const uint32x4_t b(vshrq_n_u32(val, R));
     return vorrq_u32(a, b);
@@ -120,9 +122,8 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
@@ -140,9 +141,8 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }
 
 inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
@@ -152,9 +152,8 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
@@ -172,9 +171,8 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }
 
 inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -188,10 +186,8 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
     uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
     uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-
     uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
     uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
@@ -227,10 +223,8 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = vzipq_u32(x1, y1).val[0];
     block1 = vzipq_u32(x1, y1).val[1];
-
     block2 = vzipq_u32(x2, y2).val[0];
     block3 = vzipq_u32(x2, y2).val[1];
-
     block4 = vzipq_u32(x3, y3).val[0];
     block5 = vzipq_u32(x3, y3).val[1];
 }
@@ -246,10 +240,8 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
     uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
     uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-
     uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
     uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
@@ -285,10 +277,8 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = vzipq_u32(x1, y1).val[0];
     block1 = vzipq_u32(x1, y1).val[1];
-
     block2 = vzipq_u32(x2, y2).val[0];
     block3 = vzipq_u32(x2, y2).val[1];
-
     block4 = vzipq_u32(x3, y3).val[0];
     block5 = vzipq_u32(x3, y3).val[1];
 }
@@ -316,7 +306,6 @@ inline T UnpackLow64(const T& a, const T& b)
 template <unsigned int R>
 inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, R));
     const uint64x2_t b(vshrq_n_u64(val, 64 - R));
     return vorrq_u64(a, b);
@@ -325,7 +314,6 @@ inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 template <unsigned int R>
 inline uint64x2_t RotateRight64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, 64 - R));
     const uint64x2_t b(vshrq_n_u64(val, R));
     return vorrq_u64(a, b);