From 9550ccc9a32b664e76654e21f5023ea9134eac13 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Mon, 12 Nov 2018 21:51:11 -0500 Subject: [PATCH] Port SIMON64 to Altivec With Altivec, SIMON64 runs about 4x faster than the C++ implementation on POWER4 and friends. If POWER7 is available, it goes back to full speed due to efficient unaligned loads. --- GNUmakefile | 4 +-- simon.cpp | 38 +++++++++++++------- simon64_simd.cpp | 94 ++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 107 insertions(+), 29 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index c6a30a4f..c582d84e 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -668,8 +668,8 @@ ifeq ($(DETECT_FEATURES),1) # Drop SIMON64 and SPECK64 to Power4 if Power7 not available ifeq ($(SIMON64_FLAG)$(SPECK64_FLAG),) - SPECK64_FLAG = $(ALTIVEC_FLAG) SIMON64_FLAG = $(ALTIVEC_FLAG) + SPECK64_FLAG = $(ALTIVEC_FLAG) endif # IBM XL C/C++ @@ -751,8 +751,8 @@ ifeq ($(DETECT_FEATURES),1) # Drop SIMON64 and SPECK64 to Power4 if Power7 not available ifeq ($(SIMON64_FLAG)$(SPECK64_FLAG),) - SPECK64_FLAG = $(ALTIVEC_FLAG) SIMON64_FLAG = $(ALTIVEC_FLAG) + SPECK64_FLAG = $(ALTIVEC_FLAG) endif ifeq ($(ALTIVEC_FLAG),) diff --git a/simon.cpp b/simon.cpp index 4c48efd2..0e777306 100644 --- a/simon.cpp +++ b/simon.cpp @@ -7,6 +7,14 @@ #include "misc.h" #include "cpu.h" +#ifndef CRYPTOPP_INLINE +# if defined(CRYPTOPP_DEBUG) +# define CRYPTOPP_INLINE static +# else +# define CRYPTOPP_INLINE inline +# endif +#endif + // Uncomment for benchmarking C++ against SSE or NEON. // Do so in both simon.cpp and simon-simd.cpp. 
// #undef CRYPTOPP_SSSE3_AVAILABLE @@ -228,11 +236,11 @@ extern size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(const word64* subKeys, si const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); #endif -#if (CRYPTOPP_POWER7_AVAILABLE) -extern size_t SIMON64_Enc_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds, +#if (CRYPTOPP_ALTIVEC_AVAILABLE) +extern size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); -extern size_t SIMON64_Dec_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds, +extern size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); #endif @@ -255,9 +263,9 @@ std::string SIMON64::Base::AlgorithmProvider() const if (HasNEON()) return "NEON"; # endif -# if (CRYPTOPP_POWER7_AVAILABLE) - if (HasPower7()) - return "Power7"; +# if (CRYPTOPP_ALTIVEC_AVAILABLE) + if (HasAltivec()) + return "Altivec"; # endif #endif return "C++"; @@ -292,6 +300,12 @@ void SIMON64::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLength, default: CRYPTOPP_ASSERT(0);; } + + // Altivec loads the current subkey as a 16-byte vector + // The extra elements ensure memory backs the last subkey. 
+#if CRYPTOPP_ALTIVEC_AVAILABLE + m_rkeys.Grow(m_rkeys.size()+4); +#endif } void SIMON64::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const @@ -463,9 +477,9 @@ size_t SIMON64::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor return SIMON64_Enc_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif -#if (CRYPTOPP_POWER7_AVAILABLE) - if (HasPower7()) - return SIMON64_Enc_AdvancedProcessBlocks_POWER7(m_rkeys, (size_t)m_rounds, +#if (CRYPTOPP_ALTIVEC_AVAILABLE) + if (HasAltivec()) + return SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); @@ -484,9 +498,9 @@ size_t SIMON64::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xor return SIMON64_Dec_AdvancedProcessBlocks_NEON(m_rkeys, (size_t)m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif -#if (CRYPTOPP_POWER7_AVAILABLE) - if (HasPower7()) - return SIMON64_Dec_AdvancedProcessBlocks_POWER7(m_rkeys, (size_t)m_rounds, +#if (CRYPTOPP_ALTIVEC_AVAILABLE) + if (HasAltivec()) + return SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(m_rkeys, (size_t)m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); diff --git a/simon64_simd.cpp b/simon64_simd.cpp index eecc5409..ea3f8174 100644 --- a/simon64_simd.cpp +++ b/simon64_simd.cpp @@ -1,7 +1,7 @@ // simon-simd.cpp - written and placed in the public domain by Jeffrey Walton // // This source file uses intrinsics and built-ins to gain access to -// SSSE3, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate +// SSSE3, ARM NEON and ARMv8a, and Altivec instructions. 
A separate // source file is needed because additional CXXFLAGS are required to enable // the appropriate instructions sets in some build configurations. @@ -46,10 +46,18 @@ # include #endif -#if defined(CRYPTOPP_POWER7_AVAILABLE) +#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) # include "ppc_simd.h" #endif +#ifndef CRYPTOPP_INLINE +# if defined(CRYPTOPP_DEBUG) +# define CRYPTOPP_INLINE static +# else +# define CRYPTOPP_INLINE inline +# endif +#endif + // Squash MS LNK4221 and libtool warnings extern const char SIMON64_SIMD_FNAME[] = __FILE__; @@ -523,15 +531,16 @@ inline void SIMON64_Dec_6_Blocks(__m128i &block0, __m128i &block1, #endif // CRYPTOPP_SSE41_AVAILABLE -// ***************************** Power7 ***************************** // +// ***************************** Altivec ***************************** // -#if defined(CRYPTOPP_POWER7_AVAILABLE) +#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) using CryptoPP::uint8x16_p; using CryptoPP::uint32x4_p; using CryptoPP::VectorAnd; using CryptoPP::VectorXor; +using CryptoPP::VectorLoad; using CryptoPP::VectorLoadBE; // Rotate left by bit count @@ -573,16 +582,29 @@ inline void SIMON64_Enc_Block(uint32x4_p &block0, uint32x4_p &block1, for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) { +#if CRYPTOPP_POWER7_AVAILABLE const uint32x4_p rk1 = vec_splats(subkeys[i]); - y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1); - const uint32x4_p rk2 = vec_splats(subkeys[i+1]); +#else + const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; + uint32x4_p rk1 = VectorLoad(0, subkeys+i); + uint32x4_p rk2 = VectorLoad(0, subkeys+i+1); + rk1 = vec_perm(rk1, rk1, m); + rk2 = vec_perm(rk2, rk2, m); +#endif + y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1); x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2); } if (rounds & 1) { +#if CRYPTOPP_POWER7_AVAILABLE const uint32x4_p rk = vec_splats(subkeys[rounds-1]); +#else + const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; + uint32x4_p rk = VectorLoad(0, subkeys+rounds-1); + rk = 
vec_perm(rk, rk, m); +#endif y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk); std::swap(x1, y1); } @@ -618,17 +640,30 @@ inline void SIMON64_Dec_Block(uint32x4_p &block0, uint32x4_p &block1, if (rounds & 1) { std::swap(x1, y1); +#if CRYPTOPP_POWER7_AVAILABLE const uint32x4_p rk = vec_splats(subkeys[rounds-1]); +#else + const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; + uint32x4_p rk = VectorLoad(0, subkeys+rounds-1); + rk = vec_perm(rk, rk, m); +#endif y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1)); rounds--; } for (int i = static_cast(rounds-2); i >= 0; i -= 2) { +#if CRYPTOPP_POWER7_AVAILABLE const uint32x4_p rk1 = vec_splats(subkeys[i+1]); - x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1); - const uint32x4_p rk2 = vec_splats(subkeys[i]); +#else + const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; + uint32x4_p rk1 = VectorLoad(0, subkeys+i+1); + uint32x4_p rk2 = VectorLoad(0, subkeys+i); + rk1 = vec_perm(rk1, rk1, m); + rk2 = vec_perm(rk2, rk2, m); +#endif + x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1); y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2); } @@ -667,12 +702,20 @@ inline void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, for (int i = 0; i < static_cast(rounds & ~1)-1; i += 2) { +#if CRYPTOPP_POWER7_AVAILABLE const uint32x4_p rk1 = vec_splats(subkeys[i]); + const uint32x4_p rk2 = vec_splats(subkeys[i+1]); +#else + const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; + uint32x4_p rk1 = VectorLoad(0, subkeys+i); + uint32x4_p rk2 = VectorLoad(0, subkeys+i+1); + rk1 = vec_perm(rk1, rk1, m); + rk2 = vec_perm(rk2, rk2, m); +#endif y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk1); y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk1); y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk1); - const uint32x4_p rk2 = vec_splats(subkeys[i+1]); x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk2); x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk2); x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk2); @@ -680,7 +723,13 @@ inline 
void SIMON64_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, if (rounds & 1) { +#if CRYPTOPP_POWER7_AVAILABLE const uint32x4_p rk = vec_splats(subkeys[rounds-1]); +#else + const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; + uint32x4_p rk = VectorLoad(0, subkeys+rounds-1); + rk = vec_perm(rk, rk, m); +#endif y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk); y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk); y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk); @@ -727,7 +776,14 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, if (rounds & 1) { std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3); + +#if CRYPTOPP_POWER7_AVAILABLE const uint32x4_p rk = vec_splats(subkeys[rounds-1]); +#else + const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; + uint32x4_p rk = VectorLoad(0, subkeys+rounds-1); + rk = vec_perm(rk, rk, m); +#endif y1 = VectorXor(VectorXor(y1, rk), SIMON64_f(x1)); y2 = VectorXor(VectorXor(y2, rk), SIMON64_f(x2)); y3 = VectorXor(VectorXor(y3, rk), SIMON64_f(x3)); @@ -736,12 +792,20 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, for (int i = static_cast(rounds-2); i >= 0; i -= 2) { +#if CRYPTOPP_POWER7_AVAILABLE const uint32x4_p rk1 = vec_splats(subkeys[i+1]); + const uint32x4_p rk2 = vec_splats(subkeys[i]); +#else + const uint8x16_p m = {0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3}; + uint32x4_p rk1 = VectorLoad(0, subkeys+i+1); + uint32x4_p rk2 = VectorLoad(0, subkeys+i); + rk1 = vec_perm(rk1, rk1, m); + rk2 = vec_perm(rk2, rk2, m); +#endif x1 = VectorXor(VectorXor(x1, SIMON64_f(y1)), rk1); x2 = VectorXor(VectorXor(x2, SIMON64_f(y2)), rk1); x3 = VectorXor(VectorXor(x3, SIMON64_f(y3)), rk1); - const uint32x4_p rk2 = vec_splats(subkeys[i]); y1 = VectorXor(VectorXor(y1, SIMON64_f(x1)), rk2); y2 = VectorXor(VectorXor(y2, SIMON64_f(x2)), rk2); y3 = VectorXor(VectorXor(y3, SIMON64_f(x3)), rk2); @@ -764,7 +828,7 @@ inline void SIMON64_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1, block5 = 
(uint32x4_p)vec_perm(x3, y3, m4); } -#endif // CRYPTOPP_POWER7_AVAILABLE +#endif // CRYPTOPP_ALTIVEC_AVAILABLE ANONYMOUS_NAMESPACE_END @@ -808,17 +872,17 @@ size_t SIMON64_Dec_AdvancedProcessBlocks_SSE41(const word32* subKeys, size_t rou } #endif -// ***************************** Power7 ***************************** // +// ***************************** Altivec ***************************** // -#if defined(CRYPTOPP_POWER7_AVAILABLE) -size_t SIMON64_Enc_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds, +#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) +size_t SIMON64_Enc_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Enc_Block, SIMON64_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags); } -size_t SIMON64_Dec_AdvancedProcessBlocks_POWER7(const word32* subKeys, size_t rounds, +size_t SIMON64_Dec_AdvancedProcessBlocks_ALTIVEC(const word32* subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) { return AdvancedProcessBlocks64_6x2_ALTIVEC(SIMON64_Dec_Block, SIMON64_Dec_6_Blocks,