From e90cc9a0289c550f2fbf721c0ef12d5087a2d3d5 Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Sun, 10 Dec 2017 05:41:19 -0500
Subject: [PATCH] Update comments

---
 adv-simd.h     |  1 -
 config.h       |  2 +-
 simon-simd.cpp | 92 ++++++++++++++++++--------------------------------
 speck-simd.cpp | 40 ++++++++--------------
 4 files changed, 47 insertions(+), 88 deletions(-)

diff --git a/adv-simd.h b/adv-simd.h
index 4809f8bc..b1b9c79c 100644
--- a/adv-simd.h
+++ b/adv-simd.h
@@ -703,7 +703,6 @@ inline size_t AdvancedProcessBlocks64_SSE2x6(F2 func2, F6 func6,
     if (length)
     {
         // Adjust to real block size
-        const size_t blockSize = 8;
         if (flags & BT_ReverseDirection)
         {
             inIncrement += inIncrement ? blockSize : 0;
diff --git a/config.h b/config.h
index 2a435c4b..b1e50ee9 100644
--- a/config.h
+++ b/config.h
@@ -562,7 +562,7 @@ NAMESPACE_END
 // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
 // Android still uses ARMv5 and ARMv6 so we have to be conservative when enabling NEON.
 #if !defined(CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_NEON) || defined(__ARM_FEATURE_NEON) || defined(__ARM_FEATURE_ASIMD) || \
+# if defined(__ARM_NEON) || defined(__ARM_NEON_FP) || defined(__ARM_FEATURE_NEON) || defined(__ARM_FEATURE_ASIMD) || \
     (__ARM_ARCH >= 7) || (CRYPTOPP_MSC_VERSION >= 1700)
 #  define CRYPTOPP_ARM_NEON_AVAILABLE 1
 # endif
diff --git a/simon-simd.cpp b/simon-simd.cpp
index a69528cc..547c1431 100644
--- a/simon-simd.cpp
+++ b/simon-simd.cpp
@@ -51,7 +51,6 @@ using CryptoPP::rotrFixed;
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, R));
     const uint32x4_t b(vshrq_n_u32(val, 32 - R));
     return vorrq_u32(a, b);
@@ -60,7 +59,6 @@ inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 template <unsigned int R>
 inline uint32x4_t RotateRight32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, 32 - R));
     const uint32x4_t b(vshrq_n_u32(val, R));
     return vorrq_u32(a, b);
@@ -124,9 +122,8 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
@@ -150,9 +147,8 @@ inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }
 
 inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
@@ -163,9 +159,8 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
@@ -190,9 +185,8 @@ inline void SIMON64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }
 
 inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -204,17 +198,12 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
-
-    const uint32x4x2_t t1 = vuzpq_u32(block2, block3);
-    uint32x4_t x2 = t1.val[0];
-    uint32x4_t y2 = t1.val[1];
-
-    const uint32x4x2_t t2 = vuzpq_u32(block4, block5);
-    uint32x4_t x3 = t2.val[0];
-    uint32x4_t y3 = t2.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
@@ -248,17 +237,12 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t3 = vzipq_u32(x1, y1);
-    block0 = t3.val[0];
-    block1 = t3.val[1];
-
-    const uint32x4x2_t t4 = vzipq_u32(x2, y2);
-    block2 = t4.val[0];
-    block3 = t4.val[1];
-
-    const uint32x4x2_t t5 = vzipq_u32(x3, y3);
-    block4 = t5.val[0];
-    block5 = t5.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
+    block2 = vzipq_u32(x2, y2).val[0];
+    block3 = vzipq_u32(x2, y2).val[1];
+    block4 = vzipq_u32(x3, y3).val[0];
+    block5 = vzipq_u32(x3, y3).val[1];
 }
 
 inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -270,17 +254,12 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // be permuted to the following. If only a single block is available then
     // a Zero block is provided to promote vectorizations.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
-
-    const uint32x4x2_t t1 = vuzpq_u32(block2, block3);
-    uint32x4_t x2 = t1.val[0];
-    uint32x4_t y2 = t1.val[1];
-
-    const uint32x4x2_t t2 = vuzpq_u32(block4, block5);
-    uint32x4_t x3 = t2.val[0];
-    uint32x4_t y3 = t2.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
+    uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
+    uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
+    uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
+    uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
     x2 = Shuffle32(x2); y2 = Shuffle32(y2);
@@ -315,17 +294,12 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     x3 = Shuffle32(x3); y3 = Shuffle32(y3);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t3 = vzipq_u32(x1, y1);
-    block0 = t3.val[0];
-    block1 = t3.val[1];
-
-    const uint32x4x2_t t4 = vzipq_u32(x2, y2);
-    block2 = t4.val[0];
-    block3 = t4.val[1];
-
-    const uint32x4x2_t t5 = vzipq_u32(x3, y3);
-    block4 = t5.val[0];
-    block5 = t5.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
+    block2 = vzipq_u32(x2, y2).val[0];
+    block3 = vzipq_u32(x2, y2).val[1];
+    block4 = vzipq_u32(x3, y3).val[0];
+    block5 = vzipq_u32(x3, y3).val[1];
 }
 
 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
@@ -351,7 +325,6 @@ inline T UnpackLow64(const T& a, const T& b)
 template <unsigned int R>
 inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, R));
     const uint64x2_t b(vshrq_n_u64(val, 64 - R));
     return vorrq_u64(a, b);
@@ -360,7 +333,6 @@ inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 template <unsigned int R>
 inline uint64x2_t RotateRight64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, 64 - R));
     const uint64x2_t b(vshrq_n_u64(val, R));
     return vorrq_u64(a, b);
diff --git a/speck-simd.cpp b/speck-simd.cpp
index 125e975b..b901a96e 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -18,8 +18,12 @@
 // #undef CRYPTOPP_SSE41_AVAILABLE
 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
 
-// GCC generates bad code when using the table-based rotates
-#if defined(__aarch32__) || defined(__aarch64__)
+// GCC generates bad code when using the table-based 32-bit rotates. Or,
+// GAS assembles it incorrectly (this may be the case since both GCC and
+// Clang produce the same failure). SIMON uses the same code but with a
+// different round function, and SIMON is OK. Jake Lee warned about this
+// at http://stackoverflow.com/q/47617331/608639.
+#if (defined(__aarch32__) || defined(__aarch64__)) && defined(__GNUC__)
 # define WORKAROUND_GCC_AARCH64_BUG 1
 #endif
 
@@ -54,7 +58,6 @@ using CryptoPP::word64;
 template <unsigned int R>
 inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, R));
     const uint32x4_t b(vshrq_n_u32(val, 32 - R));
     return vorrq_u32(a, b);
@@ -63,7 +66,6 @@ inline uint32x4_t RotateLeft32(const uint32x4_t& val)
 template <unsigned int R>
 inline uint32x4_t RotateRight32(const uint32x4_t& val)
 {
-    CRYPTOPP_ASSERT(R < 32);
     const uint32x4_t a(vshlq_n_u32(val, 32 - R));
     const uint32x4_t b(vshrq_n_u32(val, R));
     return vorrq_u32(a, b);
@@ -120,9 +122,8 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
@@ -140,9 +141,8 @@ inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }
 
 inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
@@ -152,9 +152,8 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     // a big-endian byte array. Depending on the number of blocks it needs to
     // be permuted to the following.
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
-    const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
-    uint32x4_t x1 = t0.val[0];
-    uint32x4_t y1 = t0.val[1];
+    uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
+    uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
 
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
@@ -172,9 +171,8 @@ inline void SPECK64_Dec_Block(uint32x4_t &block0, uint32x4_t &block1,
     x1 = Shuffle32(x1); y1 = Shuffle32(y1);
 
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
-    const uint32x4x2_t t1 = vzipq_u32(x1, y1);
-    block0 = t1.val[0];
-    block1 = t1.val[1];
+    block0 = vzipq_u32(x1, y1).val[0];
+    block1 = vzipq_u32(x1, y1).val[1];
 }
 
 inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
@@ -188,10 +186,8 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
     uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
     uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-
     uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
     uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
 
@@ -227,10 +223,8 @@ inline void SPECK64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = vzipq_u32(x1, y1).val[0];
     block1 = vzipq_u32(x1, y1).val[1];
-
     block2 = vzipq_u32(x2, y2).val[0];
     block3 = vzipq_u32(x2, y2).val[1];
-
     block4 = vzipq_u32(x3, y3).val[0];
     block5 = vzipq_u32(x3, y3).val[1];
 }
@@ -246,10 +240,8 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
     uint32x4_t x1 = vuzpq_u32(block0, block1).val[0];
     uint32x4_t y1 = vuzpq_u32(block0, block1).val[1];
-
     uint32x4_t x2 = vuzpq_u32(block2, block3).val[0];
     uint32x4_t y2 = vuzpq_u32(block2, block3).val[1];
-
     uint32x4_t x3 = vuzpq_u32(block4, block5).val[0];
     uint32x4_t y3 = vuzpq_u32(block4, block5).val[1];
 
@@ -285,10 +277,8 @@ inline void SPECK64_Dec_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
     // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
     block0 = vzipq_u32(x1, y1).val[0];
     block1 = vzipq_u32(x1, y1).val[1];
-
     block2 = vzipq_u32(x2, y2).val[0];
     block3 = vzipq_u32(x2, y2).val[1];
-
     block4 = vzipq_u32(x3, y3).val[0];
     block5 = vzipq_u32(x3, y3).val[1];
 }
@@ -316,7 +306,6 @@ inline T UnpackLow64(const T& a, const T& b)
 template <unsigned int R>
 inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, R));
     const uint64x2_t b(vshrq_n_u64(val, 64 - R));
     return vorrq_u64(a, b);
@@ -325,7 +314,6 @@ inline uint64x2_t RotateLeft64(const uint64x2_t& val)
 template <unsigned int R>
 inline uint64x2_t RotateRight64(const uint64x2_t& val)
 {
-    CRYPTOPP_ASSERT(R < 64);
     const uint64x2_t a(vshlq_n_u64(val, 64 - R));
     const uint64x2_t b(vshrq_n_u64(val, R));
     return vorrq_u64(a, b);
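
Some background on the NEON idioms this patch touches. The sketches below are illustrative examples with made-up helper names; they are not code from the library.

The RotateLeft32/RotateRight32 helpers above build a rotate from two immediate shifts and an OR. vshlq_n_u32 and vshrq_n_u32 take compile-time immediates (0..31 and 1..32 respectively), which is why the rotate amount is a template parameter and why it must stay in range even now that the CRYPTOPP_ASSERT lines are gone. A minimal sketch, assuming an ARM target with NEON, that moves the range check to compile time:

    #include <arm_neon.h>

    // Hypothetical helper in the style of RotateLeft32 above. R must be a
    // compile-time constant in [1, 31] so both immediate shifts are legal.
    template <unsigned int R>
    inline uint32x4_t RotL32(const uint32x4_t& val)
    {
        static_assert(R > 0 && R < 32, "rotate amount must be 1..31");
        const uint32x4_t a(vshlq_n_u32(val, R));
        const uint32x4_t b(vshrq_n_u32(val, 32 - R));
        return vorrq_u32(a, b);
    }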
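
The config.h hunk adds __ARM_NEON_FP to the list of compiler macros that turn on CRYPTOPP_ARM_NEON_AVAILABLE. Code that consumes such a feature macro normally keeps a portable path next to the NEON one; a hedged sketch with an illustrative function, not a library API:

    #include <cstdint>
    #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
    # include <arm_neon.h>
    #endif

    // XOR one 16-byte block: NEON when available, plain C++ otherwise.
    inline void XorBlock(uint32_t r[4], const uint32_t a[4], const uint32_t b[4])
    {
    #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
        vst1q_u32(r, veorq_u32(vld1q_u32(a), vld1q_u32(b)));
    #else
        for (unsigned int i = 0; i < 4; ++i)
            r[i] = a[i] ^ b[i];
    #endif
    }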
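
The [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4] comments describe a de-interleave: vuzpq_u32 gathers the even-indexed lanes of two blocks into one vector and the odd-indexed lanes into another, so the round function can work on the x and y halves of two blocks side by side, and vzipq_u32 restores the original order afterwards. A self-contained demo with made-up values (build for an ARM target with NEON):

    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint32_t A[4] = {0xA1, 0xA2, 0xA3, 0xA4};
        const uint32_t B[4] = {0xB1, 0xB2, 0xB3, 0xB4};
        const uint32x4_t block0 = vld1q_u32(A), block1 = vld1q_u32(B);

        // [A1 A2 A3 A4][B1 B2 B3 B4] => [A1 A3 B1 B3][A2 A4 B2 B4]
        const uint32x4x2_t t0 = vuzpq_u32(block0, block1);
        const uint32x4_t x1 = t0.val[0];   // A1 A3 B1 B3
        const uint32x4_t y1 = t0.val[1];   // A2 A4 B2 B4

        // ... the SIMON/SPECK rounds would operate on x1 and y1 here ...

        // [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
        const uint32x4x2_t t1 = vzipq_u32(x1, y1);
        uint32_t out0[4], out1[4];
        vst1q_u32(out0, t1.val[0]);
        vst1q_u32(out1, t1.val[1]);

        std::printf("%x %x %x %x | %x %x %x %x\n",
                    out0[0], out0[1], out0[2], out0[3],
                    out1[0], out1[1], out1[2], out1[3]);
        return 0;
    }

With the values above the program prints a1 a2 a3 a4 | b1 b2 b3 b4, confirming the zip undoes the unzip. The 6-block variants apply the same trick to three block pairs at once to keep more NEON registers busy per round.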
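
The comments also say each block is treated as a big-endian byte array, and the code brackets the rounds with Shuffle32 calls. On a little-endian NEON target that kind of helper is typically a per-lane byte reversal; a sketch of one plausible implementation, not necessarily the library's exact one:

    #include <arm_neon.h>

    // Reverse the bytes within each 32-bit lane, turning big-endian words
    // loaded from a byte array into native values (the same call maps back).
    inline uint32x4_t ByteReverse32(const uint32x4_t& val)
    {
        return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(val)));
    }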
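
For context on the new comment in speck-simd.cpp: SPECK's round function rotates one word right by 8 and the other left by 3, while SIMON's round mixes rotates by 1, 2 and 8 with AND/XOR, which appears to be the "different round function" the comment refers to. A scalar sketch of one SPECK64 (32-bit word) encryption round, written from the published algorithm rather than copied from this file:

    #include <cstdint>

    inline void Speck64EncRound(uint32_t& x, uint32_t& y, uint32_t k)
    {
        x = (x >> 8) | (x << 24);   // rotate right by 8
        x += y;                     // modular add
        x ^= k;                     // mix in the round key
        y = (y << 3) | (y >> 29);   // rotate left by 3
        y ^= x;
    }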
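
The right-rotate by 8 is presumably where the "table-based rotate" comes in: rotating each 32-bit lane by a whole byte is a byte permutation, so on AArch64 it can be done with a single TBL lookup instead of two shifts and an OR, and that is the form the new WORKAROUND_GCC_AARCH64_BUG guard steers away from under GCC. A sketch of both forms for little-endian lanes; treat the TBL version as illustrative and verify it against your toolchain:

    #include <arm_neon.h>
    #include <cstdint>

    // Shift/OR form, equivalent to RotateRight32<8> above.
    inline uint32x4_t RotR32by8_shift(const uint32x4_t& val)
    {
        return vorrq_u32(vshrq_n_u32(val, 8), vshlq_n_u32(val, 24));
    }

    #if defined(__aarch64__)
    // Table form: per lane, new byte 0 = old byte 1, ..., new byte 3 = old
    // byte 0, which is a right rotate by 8 on little-endian lanes.
    inline uint32x4_t RotR32by8_table(const uint32x4_t& val)
    {
        const uint8_t idx[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12 };
        return vreinterpretq_u32_u8(
            vqtbl1q_u8(vreinterpretq_u8_u32(val), vld1q_u8(idx)));
    }
    #endif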