From 9ff731824b1c06ac3e27f58533961c300d300c21 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Mon, 6 Aug 2018 05:40:38 -0400 Subject: [PATCH] Prepare for POWER8 carryless multiplies using vpmsum --- config.h | 5 ++- cpu.cpp | 18 +++++++- cpu.h | 56 +++++++++++++++++++------ gcm-simd.cpp | 115 +++++++++++++++++++++++++++++++++++++++++++++++---- validat3.cpp | 6 ++- 5 files changed, 174 insertions(+), 26 deletions(-) diff --git a/config.h b/config.h index 9f659c76..2a06663d 100644 --- a/config.h +++ b/config.h @@ -785,9 +785,10 @@ NAMESPACE_END #if !defined(CRYPTOPP_POWER8_AES_AVAILABLE) && !defined(CRYPTOPP_DISABLE_POWER8_AES) && defined(CRYPTOPP_POWER8_AVAILABLE) # if defined(__CRYPTO__) || defined(_ARCH_PWR8) || (CRYPTOPP_XLC_VERSION >= 130000) || (CRYPTOPP_GCC_VERSION >= 40800) -# define CRYPTOPP_POWER8_AES_AVAILABLE 1 -# define CRYPTOPP_POWER8_SHA_AVAILABLE 1 //# define CRYPTOPP_POWER8_CRC_AVAILABLE 1 +# define CRYPTOPP_POWER8_AES_AVAILABLE 1 +// # define CRYPTOPP_POWER8_PMULL_AVAILABLE 1 +# define CRYPTOPP_POWER8_SHA_AVAILABLE 1 # endif #endif diff --git a/cpu.cpp b/cpu.cpp index 6c82fdc8..8d60c4b8 100644 --- a/cpu.cpp +++ b/cpu.cpp @@ -804,6 +804,7 @@ bool CRYPTOPP_SECTION_INIT g_hasAltivec = false; bool CRYPTOPP_SECTION_INIT g_hasPower7 = false; bool CRYPTOPP_SECTION_INIT g_hasPower8 = false; bool CRYPTOPP_SECTION_INIT g_hasAES = false; +bool CRYPTOPP_SECTION_INIT g_hasPMULL = false; bool CRYPTOPP_SECTION_INIT g_hasSHA256 = false; bool CRYPTOPP_SECTION_INIT g_hasSHA512 = false; word32 CRYPTOPP_SECTION_INIT g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE; @@ -812,6 +813,7 @@ extern bool CPU_ProbeAltivec(); extern bool CPU_ProbePower7(); extern bool CPU_ProbePower8(); extern bool CPU_ProbeAES(); +extern bool CPU_ProbePMULL(); extern bool CPU_ProbeSHA256(); extern bool CPU_ProbeSHA512(); @@ -884,6 +886,20 @@ inline bool CPU_QueryAES() return false; } +inline bool CPU_QueryPMULL() +{ + // Power8 and ISA 2.07 provide in-core crypto. 
Glibc + // 2.24 or higher is required for PPC_FEATURE2_VEC_CRYPTO. +#if defined(__linux__) + if ((getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) != 0) + return true; +#elif defined(_AIX) + if (__power_8_andup() != 0) + return true; +#endif + return false; +} + inline bool CPU_QuerySHA256() { // Power8 and ISA 2.07 provide in-core crypto. Glibc @@ -918,7 +934,7 @@ void DetectPowerpcFeatures() g_hasAltivec = CPU_QueryAltivec() || CPU_ProbeAltivec(); g_hasPower7 = CPU_QueryPower7() || CPU_ProbePower7(); g_hasPower8 = CPU_QueryPower8() || CPU_ProbePower8(); - //g_hasPMULL = CPU_QueryPMULL() || CPU_ProbePMULL(); + g_hasPMULL = CPU_QueryPMULL() || CPU_ProbePMULL(); g_hasAES = CPU_QueryAES() || CPU_ProbeAES(); g_hasSHA256 = CPU_QuerySHA256() || CPU_ProbeSHA256(); g_hasSHA512 = CPU_QuerySHA512() || CPU_ProbeSHA512(); diff --git a/cpu.h b/cpu.h index 40678043..8f2f3850 100644 --- a/cpu.h +++ b/cpu.h @@ -342,7 +342,17 @@ inline int GetCacheLineSize() // Hide from Doxygen #ifndef CRYPTOPP_DOXYGEN_PROCESSING extern bool g_ArmDetectionDone; -extern bool g_hasARMv7, g_hasNEON, g_hasPMULL, g_hasCRC32, g_hasAES, g_hasSHA1, g_hasSHA2, g_hasSHA512, g_hasSHA3, g_hasSM3, g_hasSM4; +extern bool g_hasARMv7; +extern bool g_hasNEON; +extern bool g_hasPMULL; +extern bool g_hasCRC32; +extern bool g_hasAES; +extern bool g_hasSHA1; +extern bool g_hasSHA2; +extern bool g_hasSHA512; +extern bool g_hasSHA3; +extern bool g_hasSM3; +extern bool g_hasSM4; void CRYPTOPP_API DetectArmFeatures(); #endif // CRYPTOPP_DOXYGEN_PROCESSING @@ -578,7 +588,13 @@ inline bool HasSM4() // Hide from Doxygen #ifndef CRYPTOPP_DOXYGEN_PROCESSING extern bool g_PowerpcDetectionDone; -extern bool g_hasAltivec, g_hasPower7, g_hasPower8, g_hasAES, g_hasSHA256, g_hasSHA512; +extern bool g_hasAltivec; +extern bool g_hasPower7; +extern bool g_hasPower8; +extern bool g_hasAES; +extern bool g_hasPMULL; +extern bool g_hasSHA256; +extern bool g_hasSHA512; extern word32 g_cacheLineSize; void CRYPTOPP_API 
DetectPowerpcFeatures(); #endif // CRYPTOPP_DOXYGEN_PROCESSING @@ -590,11 +606,11 @@ void CRYPTOPP_API DetectPowerpcFeatures(); /// \returns true if the hardware is capable of Altivec at runtime, false otherwise. /// \details Altivec instructions are available under most modern PowerPCs. /// \details Runtime support requires compile time support. When compiling with GCC, you may -/// need to compile with -mcpu=power7; while IBM XL C/C++ compilers require -/// -qarch=pwr7 -qaltivec. Also see PowerPC's _ALTIVEC_ preprocessor macro. -/// \details Atilvec was first available on Power4 platforms. However Crypto++ releies on unaligned -/// loads and stores which is a Power7 feature. If the platform lacks Power7 extensions, then the -/// GNUmakefile sets -DCRYPTOPP_DISABLE_ALTIVEC. +/// need to compile with -mcpu=power4; while IBM XL C/C++ compilers require +/// -qarch=pwr6 -qaltivec. Also see PowerPC's _ALTIVEC_ preprocessor macro. +/// \details Altivec was first available on Power4 platforms. However Crypto++ relies heavily +/// on unaligned loads and stores which is a Power7 feature. If the platform lacks Power7 +/// extensions, then the GNUmakefile sets -DCRYPTOPP_DISABLE_POWER7. /// \note This function is only available on PowerPC and PowerPC-64 platforms inline bool HasAltivec() { @@ -609,9 +625,9 @@ inline bool HasAltivec() /// \details Runtime support requires compile time support. When compiling with GCC, you may /// need to compile with -mcpu=power8; while IBM XL C/C++ compilers require /// -qarch=pwr8 -qaltivec. Also see PowerPC's _ALTIVEC_ preprocessor macro. -/// \details Atilvec was first available on Power4 platforms. However Crypto++ releies on unaligned -/// loads and stores which is a Power7 feature. If the platform lacks Power7 extensions, then the -/// GNUmakefile sets -DCRYPTOPP_DISABLE_ALTIVEC. +/// \details Altivec was first available on Power4 platforms. However Crypto++ relies heavily +/// on unaligned loads and stores which is a Power7 feature. 
If the platform lacks Power7 +/// extensions, then the GNUmakefile sets -DCRYPTOPP_DISABLE_POWER7. /// \note This function is only available on PowerPC and PowerPC-64 platforms inline bool HasPower7() { @@ -626,9 +642,9 @@ inline bool HasPower7() /// \details Runtime support requires compile time support. When compiling with GCC, you may /// need to compile with -mcpu=power8; while IBM XL C/C++ compilers require /// -qarch=pwr8 -qaltivec. Also see PowerPC's _ALTIVEC_ preprocessor macro. -/// \details Atilvec was first available on Power4 platforms. However Crypto++ releies on unaligned -/// loads and stores which is a Power7 feature. If the platform lacks Power7 extensions, then the -/// GNUmakefile sets -DCRYPTOPP_DISABLE_ALTIVEC. +/// \details Altivec was first available on Power4 platforms. However Crypto++ relies heavily +/// on unaligned loads and stores which is a Power7 feature. If the platform lacks Power7 +/// extensions, then the GNUmakefile sets -DCRYPTOPP_DISABLE_POWER7. /// \note This function is only available on PowerPC and PowerPC-64 platforms inline bool HasPower8() { @@ -651,6 +667,20 @@ inline bool HasAES() return g_hasAES; } +/// \brief Determine if a PowerPC processor has Polynomial Multiply available +/// \returns true if the hardware is capable of PMULL at runtime, false otherwise. +/// \details PMULL is part of the in-crypto extensions on Power8 and Power9. +/// \details Runtime support requires compile time support. When compiling with GCC, you may +/// need to compile with -mcpu=power8; while IBM XL C/C++ compilers require +/// -qarch=pwr8 -qaltivec. Also see PowerPC's __CRYPTO__ preprocessor macro. +/// \note This function is only available on PowerPC and PowerPC-64 platforms +inline bool HasPMULL() +{ + if (!g_PowerpcDetectionDone) + DetectPowerpcFeatures(); + return g_hasPMULL; +} + /// \brief Determine if a PowerPC processor has SHA256 available /// \returns true if the hardware is capable of SHA256 at runtime, false otherwise. 
/// \details SHA is part of the in-crypto extensions on Power8 and Power9. diff --git a/gcm-simd.cpp b/gcm-simd.cpp index d6085b3f..c7c958b4 100644 --- a/gcm-simd.cpp +++ b/gcm-simd.cpp @@ -39,6 +39,10 @@ # include #endif +#if defined(CRYPTOPP_POWER8_PMULL_AVAILABLE) +# include "ppc-simd.h" +#endif + #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY # include # include @@ -61,6 +65,8 @@ extern const char GCM_SIMD_FNAME[] = __FILE__; ANONYMOUS_NAMESPACE_BEGIN +// ************************* Miscellaneous ************************* // + // GCC 4.8 is missing PMULL gear #if (CRYPTOPP_ARM_PMULL_AVAILABLE) # if (CRYPTOPP_GCC_VERSION >= 40800) && (CRYPTOPP_GCC_VERSION < 49000) @@ -182,10 +188,45 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) #endif // Microsoft and compatibles #endif // CRYPTOPP_ARM_PMULL_AVAILABLE +#if CRYPTOPP_POWER8_PMULL_AVAILABLE +using CryptoPP::uint8x16_p; +using CryptoPP::uint64x2_p; +using CryptoPP::VectorXor; +using CryptoPP::VectorShiftLeft; +using CryptoPP::VectorShiftRight; + +inline uint64x2_p VMULL_P64(uint64x2_p a, uint64x2_p b) +{ + // Multiplies low dwords +#if defined(__xlc__) || defined(__xlC__) + return __vpmsumd (a, b); +#else + return __builtin_crypto_vpmsumd (a, b); +#endif +} + +inline uint64x2_p VMULL_HIGH_P64(uint64x2_p a, uint64x2_p b) +{ +#if defined(__xlc__) || defined(__xlC__) + const uint64x2_p z = VectorXor(a, a); + const uint64x2_p s = VectorShiftRight<8>(a, z); + const uint64x2_p t = VectorShiftRight<8>(b, z); + return __vpmsumd (s, t); +#else + const uint64x2_p z = VectorXor(a, a); + const uint64x2_p s = VectorShiftRight<8>(a, z); + const uint64x2_p t = VectorShiftRight<8>(b, z); + return __builtin_crypto_vpmsumd (s, t); +#endif +} +#endif // CRYPTOPP_POWER8_PMULL_AVAILABLE + ANONYMOUS_NAMESPACE_END NAMESPACE_BEGIN(CryptoPP) +// ************************* Feature Probes ************************* // + #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY extern "C" { typedef void (*SigHandler)(int); @@ -209,8 +250,10 @@ bool 
CPU_ProbePMULL() __try { const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; - const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, - b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, + 0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0, + 0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; const poly128_t r1 = vmull_p64(a1, b1); const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2)); @@ -219,8 +262,10 @@ bool CPU_ProbePMULL() const uint64x2_t t1 = (uint64x2_t)(r1); // {bignum,bignum} const uint64x2_t t2 = (uint64x2_t)(r2); // {bignum,bignum} - result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && - vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); + result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && + vgetq_lane_u64(t1,1) == 0x5300530053005300 && + vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && + vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); } __except (EXCEPTION_EXECUTE_HANDLER) { @@ -246,8 +291,10 @@ bool CPU_ProbePMULL() else { const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; - const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, - b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, + 0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0, + 0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; const poly128_t r1 = VMULL_P64(a1, b1); const poly128_t r2 = VMULL_HIGH_P64((poly64x2_t)(a2), (poly64x2_t)(b2)); @@ -256,8 +303,10 @@ bool CPU_ProbePMULL() const uint64x2_t t1 = (uint64x2_t)(r1); // {bignum,bignum} const uint64x2_t t2 = (uint64x2_t)(r2); // {bignum,bignum} - result = 
!!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && - vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); + result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && + vgetq_lane_u64(t1,1) == 0x5300530053005300 && + vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && + vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); } sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); @@ -270,6 +319,54 @@ bool CPU_ProbePMULL() } #endif // ARM32 or ARM64 +#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64) +bool CPU_ProbePMULL() +{ +#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) + return false; +#elif (CRYPTOPP_POWER8_PMULL_AVAILABLE) + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + const uint64x2_p a1={0x9090909090909090ull}, b1={0xb0b0b0b0b0b0b0b0ull}; + const uint8x16_p a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, + 0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0, + 0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + + const uint64x2_p r1 = VMULL_P64(a1, b1); + const uint64x2_p r2 = VMULL_HIGH_P64((uint64x2_p)(a2), (uint64x2_p)(b2)); + + word64 w1[2], w2[2]; + VectorStore(r1, (byte*)w1); VectorStore(r2, (byte*)w2); + result = !!(w1[0] == 0x5300530053005300ull && w1[1] == 0x5300530053005300ull && + w2[0] == 0x6c006c006c006c00ull && w2[1] == 0x6c006c006c006c00ull); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +#else + return false; +#endif // CRYPTOPP_POWER8_PMULL_AVAILABLE +} +#endif // PPC32 or PPC64 + +// 
*************************** ARM NEON *************************** // + #if CRYPTOPP_ARM_NEON_AVAILABLE void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c) { @@ -413,6 +510,8 @@ void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer) } #endif // CRYPTOPP_ARM_PMULL_AVAILABLE +// ***************************** SSE ***************************** // + #if CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE // SunCC 5.10-5.11 compiler crash. Move GCM_Xor16_SSE2 out-of-line, and place in // a source file with a SSE architecture switch. Also see GH #226 and GH #284. diff --git a/validat3.cpp b/validat3.cpp index 218b08c1..4a2be166 100644 --- a/validat3.cpp +++ b/validat3.cpp @@ -374,14 +374,16 @@ bool TestSettings() const bool hasAltivec = HasAltivec(); const bool hasPower7 = HasPower7(); const bool hasPower8 = HasPower8(); + const bool hasPMULL = HasPMULL(); const bool hasAES = HasAES(); const bool hasSHA256 = HasSHA256(); const bool hasSHA512 = HasSHA512(); std::cout << "passed: "; std::cout << "hasAltivec == " << hasAltivec << ", hasPower7 == " << hasPower7; - std::cout << ", hasPower8 == " << hasPower8 << ", hasAES == " << hasAES; - std::cout << ", hasSHA256 == " << hasSHA256 << ", hasSHA512 == " << hasSHA512 << "\n"; + std::cout << ", hasPower8 == " << hasPower8 << ", hasPMULL == " << hasPMULL; + std::cout << ", hasAES == " << hasAES << ", hasSHA256 == " << hasSHA256; + std::cout << ", hasSHA512 == " << hasSHA512 << "\n"; #endif