From 1dd0e321a6e092c0472b36d75ea06bfa3750b281 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 5 Aug 2018 05:39:42 -0400 Subject: [PATCH] Rework Makefile and ppc-simd.h for XLC and LLVM front-end changes --- GNUmakefile | 138 +++++++++++++-------- ppc-simd.cpp | 244 +----------------------------------- ppc-simd.h | 308 ++++++++++++++++++++++++---------------------- rijndael-simd.cpp | 158 ++++++++++++++++++++++-- sha-simd.cpp | 90 ++++++++++++++ 5 files changed, 492 insertions(+), 446 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index bfacbf57..5a738847 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -101,6 +101,16 @@ ifeq ($(wildcard adhoc.cpp),) $(shell cp adhoc.cpp.proto adhoc.cpp) endif +# Fixup AIX +ifeq ($(IS_AIX),1) + BITNESS=$(shell getconf KERNEL_BITMODE) + ifeq ($(BITNESS),64) + IS_PPC64=1 + else + IS_PPC32=1 + endif +endif + ########################################################### ##### General Variables ##### ########################################################### @@ -400,63 +410,93 @@ ifeq ($(IS_ARMV8),1) endif endif -# PowerPC and PowerPC-64. Altivec is available with Power4 -ifneq ($(IS_PPC32)$(IS_PPC64)$(IS_AIX),000) +# PowerPC and PowerPC-64. Altivec is available with Power4. +# The tests below are crafted for IBM XLC and the LLVM front-end. +# XLC/LLVM only supplies POWER8. So we set the flags for XLC/LLVM +# and lower it if POWER7 or ALTIVEC is available. I've got a +# feeling LLVM is going to cause a lot of troubles. 
+ifneq ($(IS_PPC32)$(IS_PPC64),00) + HAVE_POWER8 = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -mcpu=power8 -maltivec -dM -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO') + ifneq ($(HAVE_POWER8),0) + POWER8_FLAG = -mcpu=power8 -maltivec + AES_FLAG = $(POWER8_FLAG) + GCM_FLAG = $(POWER8_FLAG) + SHA_FLAG = $(POWER8_FLAG) + SM4_FLAG = $(POWER8_FLAG) + endif + + # GCC and some compatibles + HAVE_POWER7 = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -mcpu=power7 -maltivec -dM -E adhoc.cpp 2>&1 | $(GREP) -i -c '_ARCH_PWR7') + ifneq ($(HAVE_POWER7),0) + POWER7_FLAG = -mcpu=power7 -maltivec + ARIA_FLAG = $(POWER7_FLAG) + BLAKE2_FLAG = $(POWER7_FLAG) + CHAM_FLAG = $(POWER7_FLAG) + LEA_FLAG = $(POWER7_FLAG) + SIMON_FLAG = $(POWER7_FLAG) + SPECK_FLAG = $(POWER7_FLAG) + SIMECK_FLAG = $(POWER7_FLAG) + endif + # GCC and some compatibles HAVE_ALTIVEC = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -mcpu=power4 -maltivec -dM -E adhoc.cpp 2>&1 | $(GREP) -i -c '__ALTIVEC__') ifneq ($(HAVE_ALTIVEC),0) ALTIVEC_FLAG = -mcpu=power4 -maltivec - ARIA_FLAG = -mcpu=power4 -maltivec - BLAKE2_FLAG = -mcpu=power4 -maltivec - CHAM_FLAG = -mcpu=power4 -maltivec - LEA_FLAG = -mcpu=power4 -maltivec - SIMON_FLAG = -mcpu=power4 -maltivec - SPECK_FLAG = -mcpu=power4 -maltivec - SIMECK_FLAG = -mcpu=power4 -maltivec - SM4_FLAG = -mcpu=power7 -maltivec - endif - # GCC and some compatibles - HAVE_CRYPTO = $(shell $(CXX) $(CXXFLAGS) -DADHOC_MAIN -mcpu=power8 -maltivec -dM -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO') - ifneq ($(HAVE_CRYPTO),0) - ALTIVEC_FLAG = -mcpu=power8 -maltivec - AES_FLAG = -mcpu=power8 -maltivec - GCM_FLAG = -mcpu=power8 -maltivec - SHA_FLAG = -mcpu=power8 -maltivec - CHAM_FLAG = -mcpu=power8 -maltivec - LEA_FLAG = -mcpu=power8 -maltivec - SIMON_FLAG = -mcpu=power8 -maltivec - SPECK_FLAG = -mcpu=power8 -maltivec - SIMECK_FLAG = -mcpu=power8 -maltivec - SM4_FLAG = -mcpu=power8 -maltivec endif + # IBM XL C/C++ - HAVE_ALTIVEC = $(shell $(CXX) $(CXXFLAGS) 
-qshowmacros -qarch=pwr7 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c '__ALTIVEC__') + HAVE_POWER8 = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr8 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO') + ifneq ($(HAVE_POWER8),0) + POWER8_FLAG = -qarch=pwr8 -qaltivec + AES_FLAG = $(POWER8_FLAG) + GCM_FLAG = $(POWER8_FLAG) + SHA_FLAG = $(POWER8_FLAG) + SM4_FLAG = $(POWER8_FLAG) + endif + + # IBM XL C/C++ + HAVE_POWER7 = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr7 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR7') + ifneq ($(HAVE_POWER7),0) + POWER7_FLAG = -qarch=pwr7 -qaltivec + ARIA_FLAG = $(POWER7_FLAG) + BLAKE2_FLAG = $(POWER7_FLAG) + CHAM_FLAG = $(POWER7_FLAG) + LEA_FLAG = $(POWER7_FLAG) + SIMECK_FLAG = $(POWER7_FLAG) + SIMON_FLAG = $(POWER7_FLAG) + SPECK_FLAG = $(POWER7_FLAG) + endif + + # IBM XL C/C++ + HAVE_ALTIVEC = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr6 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c '__ALTIVEC__') ifneq ($(HAVE_ALTIVEC),0) - ALTIVEC_FLAG = -qarch=pwr7 -qaltivec - ARIA_FLAG = -qarch=pwr7 -qaltivec - BLAKE2_FLAG = -qarch=pwr7 -qaltivec - CHAM_FLAG = -qarch=pwr7 -qaltivec - LEA_FLAG = -qarch=pwr7 -qaltivec - SIMECK_FLAG = -qarch=pwr7 -qaltivec - SIMON_FLAG = -qarch=pwr7 -qaltivec - SPECK_FLAG = -qarch=pwr7 -qaltivec - SM4_FLAG = -qarch=pwr7 -qaltivec + ALTIVEC_FLAG = -qarch=pwr6 -qaltivec endif - # IBM XL C/C++ - HAVE_CRYPTO = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -qarch=pwr8 -qaltivec -E adhoc.cpp 2>&1 | $(GREP) -i -c -E '_ARCH_PWR8|_ARCH_PWR9|__CRYPTO') - ifneq ($(HAVE_CRYPTO),0) - ALTIVEC_FLAG = -qarch=pwr8 -qaltivec - AES_FLAG = -qarch=pwr8 -qaltivec - GCM_FLAG = -qarch=pwr8 -qaltivec - SHA_FLAG = -qarch=pwr8 -qaltivec - ARIA_FLAG = -qarch=pwr8 -qaltivec - BLAKE2_FLAG = -qarch=pwr8 -qaltivec - CHAM_FLAG = -qarch=pwr8 -qaltivec - LEA_FLAG = -qarch=pwr8 -qaltivec - SIMECK_FLAG = -qarch=pwr8 -qaltivec - SIMON_FLAG = -qarch=pwr8 -qaltivec - SPECK_FLAG = -qarch=pwr8 -qaltivec - 
SM4_FLAG = -qarch=pwr8 -qaltivec + + # LLVM front-ends only provide Power8. It really jambs us up + # for ppc-simd.cpp which needs ALTIVEC/POWER4. We have similar + # problems {lea|cham|simon|speck|...}-simd.cpp and POWER7. + HAVE_LLVM = $(shell $(CXX) $(CXXFLAGS) -qshowmacros -E adhoc.cpp 2>&1 | $(GREP) -i -c '__llvm__') + ifneq ($(HAVE_LLVM),0) + POWER7_FLAG = $(POWER8_FLAG) + ARIA_FLAG = $(POWER8_FLAG) + BLAKE2_FLAG = $(POWER8_FLAG) + CHAM_FLAG = $(POWER8_FLAG) + LEA_FLAG = $(POWER8_FLAG) + SIMECK_FLAG = $(POWER8_FLAG) + SIMON_FLAG = $(POWER8_FLAG) + SPECK_FLAG = $(POWER8_FLAG) + ALTIVEC_FLAG = $(POWER8_FLAG) + endif + + ifeq ($(ALTIVEC_FLAG),) + CXXFLAGS += -DCRYPTOPP_DISABLE_ALTIVEC + endif + ifeq ($(POWER7_FLAG),) + CXXFLAGS += -DCRYPTOPP_DISABLE_POWER7 + endif + ifeq ($(POWER8_FLAG),) + CXXFLAGS += -DCRYPTOPP_DISABLE_POWER8 endif endif diff --git a/ppc-simd.cpp b/ppc-simd.cpp index 56542e38..dd4f5c95 100644 --- a/ppc-simd.cpp +++ b/ppc-simd.cpp @@ -6,14 +6,6 @@ // is needed because additional CXXFLAGS are required to enable the // appropriate instructions sets in some build configurations. -// TODO: Bob Wilkinson reported we are misdetecting CRYPTOPP_POWER8_AVAILABLE. -// The problem is, the updated compiler supports them but the down-level -// assembler and linker do not. We will probably need to fix it through -// the makefile, similar to the way x86 AES and SHA are handled. For the time -// being CRYPTOPP_DISABLE_POWER8 will have to be applied manually. Another -// twist is, we don't have access to a test machine and it must be fixed -// for two compilers (IBM XL C/C++ and GCC). Ugh... - #include "pch.h" #include "config.h" #include "stdcpp.h" @@ -53,7 +45,7 @@ bool CPU_ProbeAltivec() { #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) return false; -#elif (CRYPTOPP_ALTIVEC_AVAILABLE) || (CRYPTOPP_POWER7_AVAILABLE) || (CRYPTOPP_POWER8_AVAILABLE) +#elif (CRYPTOPP_ALTIVEC_AVAILABLE) # if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) // longjmp and clobber warnings. 
Volatile is required. @@ -96,239 +88,5 @@ bool CPU_ProbeAltivec() #endif // CRYPTOPP_ALTIVEC_AVAILABLE } -bool CPU_ProbePower7() -{ -#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) - return false; -#elif (CRYPTOPP_POWER7_AVAILABLE) || (CRYPTOPP_POWER8_AVAILABLE) -# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) - - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile int result = false; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - // POWER7 added unaligned loads and store operations - byte b1[19] = {255, 255, 255, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, b2[17]; - - // Specifically call the VSX loads and stores - #if defined(__xlc__) || defined(__xlC__) - vec_xst(vec_xl(0, b1+3), 0, b2+1); - #else - vec_vsx_st(vec_vsx_ld(0, b1+3), 0, b2+1); - #endif - - result = (0 == std::memcmp(b1+3, b2+1, 16)); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else - return false; -#endif // CRYPTOPP_POWER7_AVAILABLE -} - -bool CPU_ProbePower8() -{ -#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) - return false; -#elif (CRYPTOPP_POWER8_AVAILABLE) -# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) - - // longjmp and clobber warnings. Volatile is required. 
- // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile int result = true; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - // POWER8 added 64-bit SIMD operations - const word64 x = W64LIT(0xffffffffffffffff); - word64 w1[2] = {x, x}, w2[2] = {4, 6}, w3[2]; - - // Specifically call the VSX loads and stores - #if defined(__xlc__) || defined(__xlC__) - const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1); - const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2); - const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add - vec_xst((uint8x16_p)v3, 0, (byte*)w3); - #else - const uint64x2_p v1 = (uint64x2_p)vec_vsx_ld(0, (byte*)w1); - const uint64x2_p v2 = (uint64x2_p)vec_vsx_ld(0, (byte*)w2); - const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add - vec_vsx_st((uint8x16_p)v3, 0, (byte*)w3); - #endif - - // Relies on integer wrap - result = (w3[0] == 3 && w3[1] == 5); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else - return false; -#endif // CRYPTOPP_POWER8_AVAILABLE -} - -bool CPU_ProbeAES() -{ -#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) - return false; -#elif (CRYPTOPP_POWER8_AVAILABLE) -# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) - - // longjmp and clobber warnings. Volatile is required. 
- // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile int result = true; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - byte key[16] = {0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1, - 0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05}; - byte state[16] = {0x19, 0x3d, 0xe3, 0xb3, 0xa0, 0xf4, 0xe2, 0x2b, - 0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08}; - byte r[16] = {255}, z[16] = {}; - - uint8x16_p k = (uint8x16_p)VectorLoad(0, key); - uint8x16_p s = (uint8x16_p)VectorLoad(0, state); - s = VectorEncrypt(s, k); - s = VectorEncryptLast(s, k); - s = VectorDecrypt(s, k); - s = VectorDecryptLast(s, k); - VectorStore(s, r); - - result = (0 != std::memcmp(r, z, 16)); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else - return false; -#endif // CRYPTOPP_ALTIVEC_AVAILABLE -} - -bool CPU_ProbeSHA256() -{ -#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) - return false; -#elif (CRYPTOPP_POWER8_AVAILABLE) -# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) - - // longjmp and clobber warnings. Volatile is required. 
- // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile int result = false; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - byte r[16], z[16] = {0}; - uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); - - x = VectorSHA256<0,0>(x); - x = VectorSHA256<0,1>(x); - x = VectorSHA256<1,0>(x); - x = VectorSHA256<1,1>(x); - VectorStore(x, r); - - result = (0 == std::memcmp(r, z, 16)); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else - return false; -#endif // CRYPTOPP_ALTIVEC_AVAILABLE -} - -bool CPU_ProbeSHA512() -{ -#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) - return false; -#elif (CRYPTOPP_POWER8_AVAILABLE) -# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) - - // longjmp and clobber warnings. Volatile is required. 
- // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile int result = false; - - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; - - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; - - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - byte r[16], z[16] = {0}; - uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); - - x = VectorSHA512<0,0>(x); - x = VectorSHA512<0,1>(x); - x = VectorSHA512<1,0>(x); - x = VectorSHA512<1,1>(x); - VectorStore(x, r); - - result = (0 == std::memcmp(r, z, 16)); - } - - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; -# endif -#else - return false; -#endif // CRYPTOPP_POWER8_AVAILABLE -} # endif // CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64 NAMESPACE_END diff --git a/ppc-simd.h b/ppc-simd.h index 08426839..7e7ef52e 100644 --- a/ppc-simd.h +++ b/ppc-simd.h @@ -19,6 +19,26 @@ #include "config.h" #include "misc.h" +// We are boxed into undefining macros like CRYPTOPP_POWER8_AVAILABLE. +// We set CRYPTOPP_POWER8_AVAILABLE based on compiler versions because +// we needed them for the SIMD and non-SIMD files. When the SIMD file is +// compiled it may only get -mcpu=power4 or -mcpu=power7, so the POWER7 +// or POWER8 stuff is not actually available when this header is included. 
+#if !defined(__ALTIVEC__) +# undef CRYPTOPP_ALTIVEC_AVAILABLE +#endif + +#if !defined(_ARCH_PWR7) +# undef CRYPTOPP_POWER7_AVAILABLE +#endif + +#if !(defined(_ARCH_PWR8) || defined(_ARCH_PWR9) || defined(_CRYPTO)) +# undef CRYPTOPP_POWER8_AVAILABLE +# undef CRYPTOPP_POWER8_AES_AVAILABLE +# undef CRYPTOPP_POWER8_SHA_AVAILABLE +# undef CRYPTOPP_POWER8_PMULL_AVAILABLE +#endif + #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) # include # undef vector @@ -28,84 +48,112 @@ NAMESPACE_BEGIN(CryptoPP) +// Datatypes #if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) - typedef __vector unsigned char uint8x16_p; typedef __vector unsigned short uint16x8_p; typedef __vector unsigned int uint32x4_p; - -#if defined(CRYPTOPP_POWER8_AVAILABLE) +#if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) typedef __vector unsigned long long uint64x2_p; #endif +#endif // ALTIVEC/POWER4 datatypes -#endif // CRYPTOPP_ALTIVEC_AVAILABLE +// POWER4 and above +#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) -#if defined(CRYPTOPP_ALTIVEC_AVAILABLE) && !defined(CRYPTOPP_POWER7_AVAILABLE) - -inline uint32x4_p VectorLoad(const byte src[16]) +/// \brief Reverse a vector +/// \tparam T vector type +/// \param src the vector +/// \details Reverse() endian swaps the bytes in a vector +/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() +/// \since Crypto++ 6.0 +template +inline T Reverse(const T& src) { - uint8x16_p data; - if (IsAlignedOn(src, 16)) - { - data = vec_ld(0, src); - } - else - { - // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf - const uint8x16_p perm = vec_lvsl(0, src); - const uint8x16_p low = vec_ld(0, src); - const uint8x16_p high = vec_ld(15, src); - data = vec_perm(low, high, perm); - } - -#if defined(CRYPTOPP_BIG_ENDIAN) - return (uint32x4_p)data; -#else const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - return 
(uint32x4_p)vec_perm(data, data, mask); -#endif + return vec_perm(src, src, mask); } -inline void VectorStore(const uint32x4_p data, byte dest[16]) +/// \brief Permutes two vectors +/// \tparam T1 vector type +/// \tparam T2 vector type +/// \param vec1 the first vector +/// \param vec2 the second vector +/// \param mask vector mask +/// \details VectorPermute returns a new vector from vec1 and vec2 +/// based on mask. mask is an uint8x16_p type vector. The return +/// vector is the same type as vec1. +/// \since Crypto++ 6.0 +template +inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask) +{ + return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask); +} + +/// \brief XOR two vectors +/// \tparam T1 vector type +/// \tparam T2 vector type +/// \param vec1 the first vector +/// \param vec2 the second vector +/// \details VectorXor returns a new vector from vec1 and vec2. The return +/// vector is the same type as vec1. +/// \since Crypto++ 6.0 +template +inline T1 VectorXor(const T1& vec1, const T2& vec2) +{ + return (T1)vec_xor(vec1, (T1)vec2); +} + +/// \brief Add two vector +/// \tparam T1 vector type +/// \tparam T2 vector type +/// \param vec1 the first vector +/// \param vec2 the second vector +/// \details VectorAdd returns a new vector from vec1 and vec2. +/// vec2 is cast to the same type as vec1. The return vector +/// is the same type as vec1. +/// \since Crypto++ 6.0 +template +inline T1 VectorAdd(const T1& vec1, const T2& vec2) +{ + return (T1)vec_add(vec1, (T1)vec2); +} + +/// \brief Shift two vectors left +/// \tparam C shift byte count +/// \tparam T1 vector type +/// \tparam T2 vector type +/// \param vec1 the first vector +/// \param vec2 the second vector +/// \details VectorShiftLeft() concatenates vec1 and vec2 and returns a +/// new vector after shifting the concatenation by the specified number +/// of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return +/// vector is the same type as vec1. 
+/// \details On big endian machines VectorShiftLeft() is vec_sld(a, b, +/// c). On little endian machines VectorShiftLeft() is translated to +/// vec_sld(b, a, 16-c). You should always call the function as +/// if on a big endian machine as shown below. +/// <pre>
+///    uint8x16_p r0 = {0};
+///    uint8x16_p r1 = VectorLoad(ptr);
+///    uint8x16_p r5 = VectorShiftLeft<12>(r0, r1);
+/// </pre>
+/// \sa Is vec_sld +/// endian sensitive? on Stack Overflow +/// \since Crypto++ 6.0 +template +inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2) { #if defined(CRYPTOPP_LITTLE_ENDIAN) - const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask); + return (T1)vec_sld((uint8x16_p)vec2, (uint8x16_p)vec1, 16-C); #else - const uint8x16_p t1 = (uint8x16_p)data; + return (T1)vec_sld((uint8x16_p)vec1, (uint8x16_p)vec2, C); #endif - - if (IsAlignedOn(dest, 16)) - { - vec_st(t1, 0, dest); - } - else - { - // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf - const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest)); - vec_ste((uint8x16_p) t2, 0, (unsigned char*) dest); - vec_ste((uint16x8_p) t2, 1, (unsigned short*)dest); - vec_ste((uint32x4_p) t2, 3, (unsigned int*) dest); - vec_ste((uint32x4_p) t2, 4, (unsigned int*) dest); - vec_ste((uint32x4_p) t2, 8, (unsigned int*) dest); - vec_ste((uint32x4_p) t2, 12, (unsigned int*) dest); - vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest); - vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest); - } } -inline uint32x4_p VectorXor(const uint32x4_p vec1, const uint32x4_p vec2) -{ - return vec_xor(vec1, vec2); -} - -inline uint32x4_p VectorAdd(const uint32x4_p vec1, const uint32x4_p vec2) -{ - return vec_add(vec1, vec2); -} - -#endif +#endif // POWER4 and above +// POWER7/POWER4 load and store #if defined(CRYPTOPP_POWER7_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) /// \brief Reverse a 16-byte array @@ -124,19 +172,6 @@ inline void ReverseByteArrayLE(byte src[16]) #endif } -/// \brief Reverse a vector -/// \tparam T vector type -/// \param src the vector -/// \details Reverse() endian swaps the bytes in a vector -/// \sa Reverse(), VectorLoadBE(), VectorLoad(), VectorLoadKey() -/// \since Crypto++ 6.0 -template -inline T Reverse(const T& src) -{ - const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; - return vec_perm(src, 
src, mask); -} - /// \brief Loads a vector from a byte array /// \param src the byte array /// \details Loads a vector in big endian format from a byte array. @@ -346,86 +381,65 @@ inline void VectorStore(const T& src, int off, byte dest[16]) #endif } -/// \brief Permutes two vectors -/// \tparam T1 vector type -/// \tparam T2 vector type -/// \param vec1 the first vector -/// \param vec2 the second vector -/// \param mask vector mask -/// \details VectorPermute returns a new vector from vec1 and vec2 -/// based on mask. mask is an uint8x16_p type vector. The return -/// vector is the same type as vec1. -/// \since Crypto++ 6.0 -template -inline T1 VectorPermute(const T1& vec1, const T1& vec2, const T2& mask) -{ - return (T1)vec_perm(vec1, vec2, (uint8x16_p)mask); -} +#else // not CRYPTOPP_POWER7_AVAILABLE -/// \brief XOR two vectors -/// \tparam T1 vector type -/// \tparam T2 vector type -/// \param vec1 the first vector -/// \param vec2 the second vector -/// \details VectorXor returns a new vector from vec1 and vec2. The return -/// vector is the same type as vec1. -/// \since Crypto++ 6.0 -template -inline T1 VectorXor(const T1& vec1, const T2& vec2) +// POWER7 is not available. Slow Altivec loads and stores. +inline uint32x4_p VectorLoad(const byte src[16]) { - return (T1)vec_xor(vec1, (T1)vec2); -} + uint8x16_p data; + if (IsAlignedOn(src, 16)) + { + data = vec_ld(0, src); + } + else + { + // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf + const uint8x16_p perm = vec_lvsl(0, src); + const uint8x16_p low = vec_ld(0, src); + const uint8x16_p high = vec_ld(15, src); + data = vec_perm(low, high, perm); + } -/// \brief Add two vector -/// \tparam T1 vector type -/// \tparam T2 vector type -/// \param vec1 the first vector -/// \param vec2 the second vector -/// \details VectorAdd returns a new vector from vec1 and vec2. -/// vec2 is cast to the same type as vec1. The return vector -/// is the same type as vec1. 
-/// \since Crypto++ 6.0 -template -inline T1 VectorAdd(const T1& vec1, const T2& vec2) -{ - return (T1)vec_add(vec1, (T1)vec2); -} - -/// \brief Shift two vectors left -/// \tparam C shift byte count -/// \tparam T1 vector type -/// \tparam T2 vector type -/// \param vec1 the first vector -/// \param vec2 the second vector -/// \details VectorShiftLeft() concatenates vec1 and vec2 and returns a -/// new vector after shifting the concatenation by the specified number -/// of bytes. Both vec1 and vec2 are cast to uint8x16_p. The return -/// vector is the same type as vec1. -/// \details On big endian machines VectorShiftLeft() is vec_sld(a, b, -/// c). On little endian machines VectorShiftLeft() is translated to -/// vec_sld(b, a, 16-c). You should always call the function as -/// if on a big endian machine as shown below. -///
-///    uint8x16_p r0 = {0};
-///    uint8x16_p r1 = VectorLoad(ptr);
-///    uint8x16_p r5 = VectorShiftLeft<12>(r0, r1);
-/// 
-/// \sa Is vec_sld -/// endian sensitive? on Stack Overflow -/// \since Crypto++ 6.0 -template -inline T1 VectorShiftLeft(const T1& vec1, const T2& vec2) -{ -#if defined(CRYPTOPP_LITTLE_ENDIAN) - return (T1)vec_sld((uint8x16_p)vec2, (uint8x16_p)vec1, 16-C); +#if defined(CRYPTOPP_BIG_ENDIAN) + return (uint32x4_p)data; #else - return (T1)vec_sld((uint8x16_p)vec1, (uint8x16_p)vec2, C); + const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; + return (uint32x4_p)vec_perm(data, data, mask); #endif } -#endif // CRYPTOPP_POWER7_AVAILABLE +inline void VectorStore(const uint32x4_p data, byte dest[16]) +{ +#if defined(CRYPTOPP_LITTLE_ENDIAN) + const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; + const uint8x16_p t1 = (uint8x16_p)vec_perm(data, data, mask); +#else + const uint8x16_p t1 = (uint8x16_p)data; +#endif -#if defined(CRYPTOPP_POWER8_AES_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) + if (IsAlignedOn(dest, 16)) + { + vec_st(t1, 0, dest); + } + else + { + // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf + const uint8x16_p t2 = vec_perm(t1, t1, vec_lvsr(0, dest)); + vec_ste((uint8x16_p) t2, 0, (unsigned char*) dest); + vec_ste((uint16x8_p) t2, 1, (unsigned short*)dest); + vec_ste((uint32x4_p) t2, 3, (unsigned int*) dest); + vec_ste((uint32x4_p) t2, 4, (unsigned int*) dest); + vec_ste((uint32x4_p) t2, 8, (unsigned int*) dest); + vec_ste((uint32x4_p) t2, 12, (unsigned int*) dest); + vec_ste((uint16x8_p) t2, 14, (unsigned short*)dest); + vec_ste((uint8x16_p) t2, 15, (unsigned char*) dest); + } +} + +#endif // POWER4/POWER7 load and store + +// POWER8 crypto +#if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) /// \brief One round of AES encryption /// \tparam T1 vector type @@ -507,9 +521,9 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key) #endif } -#endif // CRYPTOPP_POWER8_AES_AVAILABLE +#endif // POWER8 crypto -#if defined(CRYPTOPP_POWER8_SHA_AVAILABLE) || 
defined(CRYPTOPP_DOXYGEN_PROCESSING) +#if defined(CRYPTOPP_POWER8_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING) /// \brief SHA256 Sigma functions /// \tparam func function @@ -551,7 +565,7 @@ inline T VectorSHA512(const T& vec) #endif } -#endif // CRYPTOPP_POWER8_SHA_AVAILABLE +#endif // POWER8 crypto NAMESPACE_END diff --git a/rijndael-simd.cpp b/rijndael-simd.cpp index 6abfe3e5..8b98c1ce 100644 --- a/rijndael-simd.cpp +++ b/rijndael-simd.cpp @@ -25,13 +25,6 @@ #include "misc.h" #include "adv-simd.h" -// We set CRYPTOPP_POWER8_CRYPTO_AVAILABLE based on compiler version. -// If the crypto is not available, then we have to disable it here. -#if !(defined(__CRYPTO) || defined(_ARCH_PWR8) || defined(_ARCH_PWR9)) -# undef CRYPTOPP_POWER8_CRYPTO_AVAILABLE -# undef CRYPTOPP_POWER8_AES_AVAILABLE -#endif - #if (CRYPTOPP_AESNI_AVAILABLE) # include # include @@ -68,6 +61,8 @@ extern const char RIJNDAEL_SIMD_FNAME[] = __FILE__; NAMESPACE_BEGIN(CryptoPP) +// ************************* Feature Probes ************************* // + #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY extern "C" { typedef void (*SigHandler)(int); @@ -142,6 +137,155 @@ bool CPU_ProbeAES() } #endif // ARM32 or ARM64 +#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64) + bool CPU_ProbePower7() +{ +#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) + return false; +#elif (CRYPTOPP_POWER7_AVAILABLE) || (CRYPTOPP_POWER8_AVAILABLE) +# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) + + // longjmp and clobber warnings. Volatile is required. 
+ // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile int result = false; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + // POWER7 added unaligned loads and store operations + byte b1[19] = {255, 255, 255, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, b2[17]; + + // Specifically call the VSX loads and stores + #if defined(__xlc__) || defined(__xlC__) + vec_xst(vec_xl(0, b1+3), 0, b2+1); + #else + vec_vsx_st(vec_vsx_ld(0, b1+3), 0, b2+1); + #endif + + result = (0 == std::memcmp(b1+3, b2+1, 16)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_POWER7_AVAILABLE +} + +bool CPU_ProbePower8() +{ +#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) + return false; +#elif (CRYPTOPP_POWER8_AVAILABLE) +# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) + + // longjmp and clobber warnings. Volatile is required. 
+ // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile int result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + // POWER8 added 64-bit SIMD operations + const word64 x = W64LIT(0xffffffffffffffff); + word64 w1[2] = {x, x}, w2[2] = {4, 6}, w3[2]; + + // Specifically call the VSX loads and stores + #if defined(__xlc__) || defined(__xlC__) + const uint64x2_p v1 = (uint64x2_p)vec_xl(0, (byte*)w1); + const uint64x2_p v2 = (uint64x2_p)vec_xl(0, (byte*)w2); + const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add + vec_xst((uint8x16_p)v3, 0, (byte*)w3); + #else + const uint64x2_p v1 = (uint64x2_p)vec_vsx_ld(0, (byte*)w1); + const uint64x2_p v2 = (uint64x2_p)vec_vsx_ld(0, (byte*)w2); + const uint64x2_p v3 = vec_add(v1, v2); // 64-bit add + vec_vsx_st((uint8x16_p)v3, 0, (byte*)w3); + #endif + + // Relies on integer wrap + result = (w3[0] == 3 && w3[1] == 5); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_POWER8_AVAILABLE +} + +bool CPU_ProbeAES() +{ +#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) + return false; +#elif (CRYPTOPP_POWER8_AES_AVAILABLE) +# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) + + // longjmp and clobber warnings. Volatile is required. 
+ // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile int result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + byte key[16] = {0xA0, 0xFA, 0xFE, 0x17, 0x88, 0x54, 0x2c, 0xb1, + 0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05}; + byte state[16] = {0x19, 0x3d, 0xe3, 0xb3, 0xa0, 0xf4, 0xe2, 0x2b, + 0x9a, 0xc6, 0x8d, 0x2a, 0xe9, 0xf8, 0x48, 0x08}; + byte r[16] = {255}, z[16] = {}; + + uint8x16_p k = (uint8x16_p)VectorLoad(0, key); + uint8x16_p s = (uint8x16_p)VectorLoad(0, state); + s = VectorEncrypt(s, k); + s = VectorEncryptLast(s, k); + s = VectorDecrypt(s, k); + s = VectorDecryptLast(s, k); + VectorStore(s, r); + + result = (0 != std::memcmp(r, z, 16)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_POWER8_AES_AVAILABLE +} +#endif // PPC32 or PPC64 + // ***************************** ARMv8 ***************************** // #if (CRYPTOPP_ARM_AES_AVAILABLE) diff --git a/sha-simd.cpp b/sha-simd.cpp index 09442279..c5b4b122 100644 --- a/sha-simd.cpp +++ b/sha-simd.cpp @@ -185,6 +185,96 @@ bool CPU_ProbeSHA2() } #endif // ARM32 or ARM64 +#if (CRYPTOPP_BOOL_PPC32 || CRYPTOPP_BOOL_PPC64) +bool CPU_ProbeSHA256() +{ +#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) + return false; +#elif (CRYPTOPP_POWER8_AVAILABLE) +# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) + + // longjmp and clobber warnings. Volatile is required. 
+ // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile int result = false; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + byte r[16], z[16] = {0}; + uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + + x = VectorSHA256<0,0>(x); + x = VectorSHA256<0,1>(x); + x = VectorSHA256<1,0>(x); + x = VectorSHA256<1,1>(x); + VectorStore(x, r); + + result = (0 == std::memcmp(r, z, 16)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_ALTIVEC_AVAILABLE +} + +bool CPU_ProbeSHA512() +{ +#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) + return false; +#elif (CRYPTOPP_POWER8_AVAILABLE) +# if defined(CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY) + + // longjmp and clobber warnings. Volatile is required. 
+ // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile int result = false; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + byte r[16], z[16] = {0}; + uint8x16_p x = ((uint8x16_p){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}); + + x = VectorSHA512<0,0>(x); + x = VectorSHA512<0,1>(x); + x = VectorSHA512<1,0>(x); + x = VectorSHA512<1,1>(x); + VectorStore(x, r); + + result = (0 == std::memcmp(r, z, 16)); + } + + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; +# endif +#else + return false; +#endif // CRYPTOPP_POWER8_AVAILABLE +} +#endif // PPC32 or PPC64 + // ***************** Intel x86 SHA ******************** // provided by sha.cpp, 16-byte aigned