Prepare for POWER8 carryless multiplies using vpmsum

pull/703/head
Jeffrey Walton 2018-08-06 05:40:38 -04:00
parent 6cd7f83346
commit 9ff731824b
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
5 changed files with 174 additions and 26 deletions

View File

@ -785,9 +785,10 @@ NAMESPACE_END
#if !defined(CRYPTOPP_POWER8_AES_AVAILABLE) && !defined(CRYPTOPP_DISABLE_POWER8_AES) && defined(CRYPTOPP_POWER8_AVAILABLE)
# if defined(__CRYPTO__) || defined(_ARCH_PWR8) || (CRYPTOPP_XLC_VERSION >= 130000) || (CRYPTOPP_GCC_VERSION >= 40800)
# define CRYPTOPP_POWER8_AES_AVAILABLE 1
# define CRYPTOPP_POWER8_SHA_AVAILABLE 1
//# define CRYPTOPP_POWER8_CRC_AVAILABLE 1
# define CRYPTOPP_POWER8_AES_AVAILABLE 1
// # define CRYPTOPP_POWER8_PMULL_AVAILABLE 1
# define CRYPTOPP_POWER8_SHA_AVAILABLE 1
# endif
#endif

18
cpu.cpp
View File

@ -804,6 +804,7 @@ bool CRYPTOPP_SECTION_INIT g_hasAltivec = false;
bool CRYPTOPP_SECTION_INIT g_hasPower7 = false;
bool CRYPTOPP_SECTION_INIT g_hasPower8 = false;
bool CRYPTOPP_SECTION_INIT g_hasAES = false;
bool CRYPTOPP_SECTION_INIT g_hasPMULL = false;
bool CRYPTOPP_SECTION_INIT g_hasSHA256 = false;
bool CRYPTOPP_SECTION_INIT g_hasSHA512 = false;
word32 CRYPTOPP_SECTION_INIT g_cacheLineSize = CRYPTOPP_L1_CACHE_LINE_SIZE;
@ -812,6 +813,7 @@ extern bool CPU_ProbeAltivec();
extern bool CPU_ProbePower7();
extern bool CPU_ProbePower8();
extern bool CPU_ProbeAES();
extern bool CPU_ProbePMULL();
extern bool CPU_ProbeSHA256();
extern bool CPU_ProbeSHA512();
@ -884,6 +886,20 @@ inline bool CPU_QueryAES()
return false;
}
// Queries the platform for the PowerPC vector crypto capability: the
// AT_HWCAP2 auxiliary vector entry on Linux, __power_8_andup() on AIX.
// Any other platform reports false.
inline bool CPU_QueryPMULL()
{
    // Power8 and ISA 2.07 provide in-core crypto. Glibc
    // 2.24 or higher is required for PPC_FEATURE2_VEC_CRYPTO.
#if defined(__linux__)
    const bool available = (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) != 0;
#elif defined(_AIX)
    const bool available = (__power_8_andup() != 0);
#else
    const bool available = false;
#endif
    return available;
}
inline bool CPU_QuerySHA256()
{
// Power8 and ISA 2.07 provide in-core crypto. Glibc
@ -918,7 +934,7 @@ void DetectPowerpcFeatures()
g_hasAltivec = CPU_QueryAltivec() || CPU_ProbeAltivec();
g_hasPower7 = CPU_QueryPower7() || CPU_ProbePower7();
g_hasPower8 = CPU_QueryPower8() || CPU_ProbePower8();
//g_hasPMULL = CPU_QueryPMULL() || CPU_ProbePMULL();
g_hasPMULL = CPU_QueryPMULL() || CPU_ProbePMULL();
g_hasAES = CPU_QueryAES() || CPU_ProbeAES();
g_hasSHA256 = CPU_QuerySHA256() || CPU_ProbeSHA256();
g_hasSHA512 = CPU_QuerySHA512() || CPU_ProbeSHA512();

56
cpu.h
View File

@ -342,7 +342,17 @@ inline int GetCacheLineSize()
// Hide from Doxygen
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
extern bool g_ArmDetectionDone;
extern bool g_hasARMv7, g_hasNEON, g_hasPMULL, g_hasCRC32, g_hasAES, g_hasSHA1, g_hasSHA2, g_hasSHA512, g_hasSHA3, g_hasSM3, g_hasSM4;
extern bool g_hasARMv7;
extern bool g_hasNEON;
extern bool g_hasPMULL;
extern bool g_hasCRC32;
extern bool g_hasAES;
extern bool g_hasSHA1;
extern bool g_hasSHA2;
extern bool g_hasSHA512;
extern bool g_hasSHA3;
extern bool g_hasSM3;
extern bool g_hasSM4;
void CRYPTOPP_API DetectArmFeatures();
#endif // CRYPTOPP_DOXYGEN_PROCESSING
@ -578,7 +588,13 @@ inline bool HasSM4()
// Hide from Doxygen
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
extern bool g_PowerpcDetectionDone;
extern bool g_hasAltivec, g_hasPower7, g_hasPower8, g_hasAES, g_hasSHA256, g_hasSHA512;
extern bool g_hasAltivec;
extern bool g_hasPower7;
extern bool g_hasPower8;
extern bool g_hasAES;
extern bool g_hasPMULL;
extern bool g_hasSHA256;
extern bool g_hasSHA512;
extern word32 g_cacheLineSize;
void CRYPTOPP_API DetectPowerpcFeatures();
#endif // CRYPTOPP_DOXYGEN_PROCESSING
@ -590,11 +606,11 @@ void CRYPTOPP_API DetectPowerpcFeatures();
/// \returns true if the hardware is capable of Altivec at runtime, false otherwise.
/// \details Altivec instructions are available under most modern PowerPCs.
/// \details Runtime support requires compile time support. When compiling with GCC, you may
/// need to compile with <tt>-mcpu=power7</tt>; while IBM XL C/C++ compilers require
/// <tt>-qarch=pwr7 -qaltivec</tt>. Also see PowerPC's <tt>_ALTIVEC_</tt> preprocessor macro.
/// \details Atilvec was first available on Power4 platforms. However Crypto++ releies on unaligned
/// loads and stores which is a Power7 feature. If the platform lacks Power7 extensions, then the
/// GNUmakefile sets <tt>-DCRYPTOPP_DISABLE_ALTIVEC</tt>.
/// need to compile with <tt>-mcpu=power4</tt>; while IBM XL C/C++ compilers require
/// <tt>-qarch=pwr6 -qaltivec</tt>. Also see PowerPC's <tt>_ALTIVEC_</tt> preprocessor macro.
/// \details Altivec was first available on Power4 platforms. However Crypto++ relies heavily
/// on unaligned loads and stores which is a Power7 feature. If the platform lacks Power7
/// extensions, then the GNUmakefile sets <tt>-DCRYPTOPP_DISABLE_POWER7</tt>.
/// \note This function is only available on PowerPC and PowerPC-64 platforms
inline bool HasAltivec()
{
@ -609,9 +625,9 @@ inline bool HasAltivec()
/// \details Runtime support requires compile time support. When compiling with GCC, you may
/// need to compile with <tt>-mcpu=power7</tt>; while IBM XL C/C++ compilers require
/// <tt>-qarch=pwr7 -qaltivec</tt>. Also see PowerPC's <tt>_ALTIVEC_</tt> preprocessor macro.
/// \details Atilvec was first available on Power4 platforms. However Crypto++ releies on unaligned
/// loads and stores which is a Power7 feature. If the platform lacks Power7 extensions, then the
/// GNUmakefile sets <tt>-DCRYPTOPP_DISABLE_ALTIVEC</tt>.
/// \details Altivec was first available on Power4 platforms. However Crypto++ relies heavily
/// on unaligned loads and stores which is a Power7 feature. If the platform lacks Power7
/// extensions, then the GNUmakefile sets <tt>-DCRYPTOPP_DISABLE_POWER7</tt>.
/// \note This function is only available on PowerPC and PowerPC-64 platforms
inline bool HasPower7()
{
@ -626,9 +642,9 @@ inline bool HasPower7()
/// \details Runtime support requires compile time support. When compiling with GCC, you may
/// need to compile with <tt>-mcpu=power8</tt>; while IBM XL C/C++ compilers require
/// <tt>-qarch=pwr8 -qaltivec</tt>. Also see PowerPC's <tt>_ALTIVEC_</tt> preprocessor macro.
/// \details Atilvec was first available on Power4 platforms. However Crypto++ releies on unaligned
/// loads and stores which is a Power7 feature. If the platform lacks Power7 extensions, then the
/// GNUmakefile sets <tt>-DCRYPTOPP_DISABLE_ALTIVEC</tt>.
/// \details Altivec was first available on Power4 platforms. However Crypto++ relies heavily
/// on unaligned loads and stores which is a Power7 feature. If the platform lacks Power7
/// extensions, then the GNUmakefile sets <tt>-DCRYPTOPP_DISABLE_POWER7</tt>.
/// \note This function is only available on PowerPC and PowerPC-64 platforms
inline bool HasPower8()
{
@ -651,6 +667,20 @@ inline bool HasAES()
return g_hasAES;
}
/// \brief Determine if a PowerPC processor has Polynomial Multiply available
/// \returns true if the hardware is capable of PMULL at runtime, false otherwise.
/// \details PMULL is part of the in-core crypto extensions on Power8 and Power9.
/// \details Runtime support requires compile time support. When compiling with GCC, you may
///   need to compile with <tt>-mcpu=power8</tt>; while IBM XL C/C++ compilers require
///   <tt>-qarch=pwr8 -qaltivec</tt>. Also see PowerPC's <tt>__CRYPTO__</tt> preprocessor macro.
/// \note This function is only available on PowerPC and PowerPC-64 platforms
inline bool HasPMULL()
{
    // Trigger feature detection if it has not been performed yet.
    if (g_PowerpcDetectionDone == false)
    {
        DetectPowerpcFeatures();
    }
    return g_hasPMULL;
}
/// \brief Determine if a PowerPC processor has SHA256 available
/// \returns true if the hardware is capable of SHA256 at runtime, false otherwise.
/// \details SHA is part of the in-crypto extensions on Power8 and Power9.

View File

@ -39,6 +39,10 @@
# include <arm_acle.h>
#endif
#if defined(CRYPTOPP_POWER8_PMULL_AVAILABLE)
# include "ppc-simd.h"
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
@ -61,6 +65,8 @@ extern const char GCM_SIMD_FNAME[] = __FILE__;
ANONYMOUS_NAMESPACE_BEGIN
// ************************* Miscellaneous ************************* //
// GCC 4.8 is missing PMULL gear
#if (CRYPTOPP_ARM_PMULL_AVAILABLE)
# if (CRYPTOPP_GCC_VERSION >= 40800) && (CRYPTOPP_GCC_VERSION < 49000)
@ -182,10 +188,45 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
#endif // Microsoft and compatibles
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
#if CRYPTOPP_POWER8_PMULL_AVAILABLE
using CryptoPP::uint8x16_p;
using CryptoPP::uint64x2_p;
using CryptoPP::VectorXor;
using CryptoPP::VectorShiftLeft;
using CryptoPP::VectorShiftRight;
// Polynomial multiply of two 64x2 vectors using the compiler's vpmsumd
// builtin (__vpmsumd on IBM XL C/C++, __builtin_crypto_vpmsumd otherwise).
// Intended to multiply the low dwords.
// NOTE(review): confirm how the builtin treats the other dword pair;
// callers may need to keep it zero for a pure low-dword product.
inline uint64x2_p VMULL_P64(uint64x2_p a, uint64x2_p b)
{
#if defined(__xlc__) || defined(__xlC__)
    const uint64x2_p product = __vpmsumd (a, b);
#else
    const uint64x2_p product = __builtin_crypto_vpmsumd (a, b);
#endif
    return product;
}
// Polynomial multiply after shifting each operand by 8 bytes against a
// zero vector, then handing the shifted values to the vpmsumd builtin.
// The operand preparation is the same for every compiler; only the name
// of the builtin differs.
inline uint64x2_p VMULL_HIGH_P64(uint64x2_p a, uint64x2_p b)
{
    // a ^ a produces an all-zero vector without needing a literal.
    const uint64x2_p zero = VectorXor(a, a);
    const uint64x2_p shiftedA = VectorShiftRight<8>(a, zero);
    const uint64x2_p shiftedB = VectorShiftRight<8>(b, zero);

#if defined(__xlc__) || defined(__xlC__)
    return __vpmsumd (shiftedA, shiftedB);
#else
    return __builtin_crypto_vpmsumd (shiftedA, shiftedB);
#endif
}
#endif // CRYPTOPP_POWER8_PMULL_AVAILABLE
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
// ************************* Feature Probes ************************* //
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
extern "C" {
typedef void (*SigHandler)(int);
@ -209,8 +250,10 @@ bool CPU_ProbePMULL()
__try
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = vmull_p64(a1, b1);
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
@ -219,8 +262,10 @@ bool CPU_ProbePMULL()
const uint64x2_t t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 &&
vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 &&
vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
@ -246,8 +291,10 @@ bool CPU_ProbePMULL()
else
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = VMULL_P64(a1, b1);
const poly128_t r2 = VMULL_HIGH_P64((poly64x2_t)(a2), (poly64x2_t)(b2));
@ -256,8 +303,10 @@ bool CPU_ProbePMULL()
const uint64x2_t t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 &&
vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 &&
vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
@ -270,6 +319,54 @@ bool CPU_ProbePMULL()
}
#endif // ARM32 or ARM64
#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64)
// Probes for POWER8 polynomial multiply by running VMULL_P64 and
// VMULL_HIGH_P64 on fixed inputs while a temporary SIGILL handler is
// installed, then comparing the stored results with known products.
//
// Returns false when probes are disabled (CRYPTOPP_NO_CPU_FEATURE_PROBES),
// when compile-time support is absent, when the signal handler or signal
// mask cannot be obtained, when setjmp(s_jmpSIGILL) returns non-zero
// (presumably via SigIllHandler's longjmp after an illegal instruction),
// or when the computed products do not match. Returns true otherwise.
bool CPU_ProbePMULL()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_POWER8_PMULL_AVAILABLE)
    // longjmp and clobber warnings. Volatile is required.
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile bool result = true;

    volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    volatile sigset_t oldMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
    {
        // The probe's SIGILL handler is already installed at this point;
        // put the previous handler back before bailing out.
        signal(SIGILL, oldHandler);
        return false;
    }

    if (setjmp(s_jmpSIGILL))
        result = false;
    else
    {
        // Only the first element of a1/b1 is given; the rest is zero-initialized.
        const uint64x2_p a1={0x9090909090909090ull}, b1={0xb0b0b0b0b0b0b0b0ull};
        const uint8x16_p a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,
                             0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
                         b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,
                             0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};

        const uint64x2_p r1 = VMULL_P64(a1, b1);
        const uint64x2_p r2 = VMULL_HIGH_P64((uint64x2_p)(a2), (uint64x2_p)(b2));

        // Spill the vectors to memory so the halves can be compared as word64.
        word64 w1[2], w2[2];
        VectorStore(r1, (byte*)w1); VectorStore(r2, (byte*)w2);
        result = !!(w1[0] == 0x5300530053005300ull && w1[1] == 0x5300530053005300ull &&
                    w2[0] == 0x6c006c006c006c00ull && w2[1] == 0x6c006c006c006c00ull);
    }

    // Restore the signal mask and the previous SIGILL handler.
    sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
    signal(SIGILL, oldHandler);
    return result;
#else
    return false;
#endif  // CRYPTOPP_POWER8_PMULL_AVAILABLE
}
#endif // PPC32 or PPC64
// *************************** ARM NEON *************************** //
#if CRYPTOPP_ARM_NEON_AVAILABLE
void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c)
{
@ -413,6 +510,8 @@ void GCM_ReverseHashBufferIfNeeded_PMULL(byte *hashBuffer)
}
#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
// ***************************** SSE ***************************** //
#if CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
// SunCC 5.10-5.11 compiler crash. Move GCM_Xor16_SSE2 out-of-line, and place in
// a source file with a SSE architecture switch. Also see GH #226 and GH #284.

View File

@ -374,14 +374,16 @@ bool TestSettings()
const bool hasAltivec = HasAltivec();
const bool hasPower7 = HasPower7();
const bool hasPower8 = HasPower8();
const bool hasPMULL = HasPMULL();
const bool hasAES = HasAES();
const bool hasSHA256 = HasSHA256();
const bool hasSHA512 = HasSHA512();
std::cout << "passed: ";
std::cout << "hasAltivec == " << hasAltivec << ", hasPower7 == " << hasPower7;
std::cout << ", hasPower8 == " << hasPower8 << ", hasAES == " << hasAES;
std::cout << ", hasSHA256 == " << hasSHA256 << ", hasSHA512 == " << hasSHA512 << "\n";
std::cout << ", hasPower8 == " << hasPower8 << ", hasPMULL == " << hasPMULL;
std::cout << ", hasAES == " << hasAES << ", hasSHA256 == " << hasSHA256;
std::cout << ", hasSHA512 == " << hasSHA512 << "\n";
#endif