diff --git a/GNUmakefile b/GNUmakefile
index b9e64ec1..eed0c647 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -2,8 +2,12 @@
 #####     System Attributes and Programs     #####
 ###########################################################
 
+# If needed
 TMPDIR ?= /tmp
 
+# Used for ARMv7 and NEON.
+FP_ABI ?= hard
+# Commands and arguments
 AR ?= ar
 ARFLAGS ?= -cr	# ar needs the dash on OpenBSD
 RANLIB ?= ranlib
@@ -297,7 +301,7 @@
 endif
 endif
 
 ifeq ($(IS_NEON),1)
-  NEON_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -march=armv7-a -mfloat-abi=softfp -mfpu=neon -dM -E - | grep -i -c -q __ARM_NEON && echo "-march=armv7-a -mfloat-abi=softfp -mfpu=neon")
+  NEON_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon -dM -E - | grep -i -c -q __ARM_NEON && echo "-march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon")
   GCM_FLAG = $(NEON_FLAG)
   ARIA_FLAG = $(NEON_FLAG)
   BLAKE2_FLAG = $(NEON_FLAG)
@@ -868,6 +872,10 @@ blake2-simd.o : blake2-simd.cpp
 crc-simd.o : crc-simd.cpp
	$(CXX) $(strip $(CXXFLAGS) $(CRC_FLAG) -c) $<
 
+# PCLMUL or ARMv7a/ARMv8a available
+gcm-simd.o : gcm-simd.cpp
+	$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
+
 # SSE4.2/SHA-NI or ARMv8a available
 sha-simd.o : sha-simd.cpp
	$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<
diff --git a/config.h b/config.h
index 1fdf92b2..9b02fece 100644
--- a/config.h
+++ b/config.h
@@ -517,7 +517,8 @@ NAMESPACE_END
 
 // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
 #if !defined(CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_NEON__) || defined(__ARM_NEON) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
+# if defined(__ARM_NEON__) || defined(__ARM_NEON) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) || \
+	(CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
 # define CRYPTOPP_ARM_NEON_AVAILABLE 1
 # endif
 #endif
@@ -527,17 +528,21 @@ NAMESPACE_END
 // Microsoft plans to support ARM-64, but its not clear how to detect it.
 // TODO: Add MSC_VER and ARM-64 platform define when available
 #if !defined(CRYPTOPP_ARMV8A_CRC32_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_FEATURE_CRC32) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
+# if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
+	(defined(__ARM_32BIT_STATE) || defined(__ARM_64BIT_STATE)) || \
+	(defined(__AARCH32EL__) || defined(__AARCH64EL__))
 # define CRYPTOPP_ARMV8A_CRC32_AVAILABLE 1
 # endif
 #endif
 
-// Requires ARMv8, ACLE 2.0 and Aarch64. GCC requires 4.8 and above.
-// LLVM Clang requires 3.5. Apple Clang does not support it at the moment.
-// Microsoft plans to support ARM-64, but its not clear how to detect it.
+// Requires ARMv8, but we are not sure of the define because the ACLE does not discuss it.
+// GCC seems to require 4.8 and above. LLVM Clang requires 3.5. Apple Clang does not support
+// it at the moment. Microsoft plans to support ARM-64, but it's not clear how to detect it.
 // TODO: Add MSC_VER and ARM-64 platform define when available
 #if !defined(CRYPTOPP_ARMV8A_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__)
-# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
+# if defined(__ARM_FEATURE_CRYPTO) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
+	(defined(__ARM_32BIT_STATE) || defined(__ARM_64BIT_STATE)) || \
+	(defined(__AARCH32EL__) || defined(__AARCH64EL__))
 # define CRYPTOPP_ARMV8A_PMULL_AVAILABLE 1
 # endif
 #endif
@@ -547,7 +552,9 @@ NAMESPACE_END
 // Microsoft plans to support ARM-64, but its not clear how to detect it.
 // TODO: Add MSC_VER and ARM-64 platform define when available
 #if !defined(CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
+# if defined(__ARM_FEATURE_CRYPTO) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
+	(defined(__ARM_32BIT_STATE) || defined(__ARM_64BIT_STATE)) || \
+	(defined(__AARCH32EL__) || defined(__AARCH64EL__))
 # define CRYPTOPP_ARMV8A_AES_AVAILABLE 1
 # define CRYPTOPP_ARMV8A_SHA_AVAILABLE 1
 # define CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE 1
diff --git a/cpu.cpp b/cpu.cpp
index 11723c39..91706867 100644
--- a/cpu.cpp
+++ b/cpu.cpp
@@ -344,12 +344,6 @@ extern bool CPU_TryPMULL_ARMV8();
 #ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
 extern "C"
 {
-	static jmp_buf s_jmpNoPMULL;
-	static void SigIllHandlerPMULL(int)
-	{
-		longjmp(s_jmpNoPMULL, 1);
-	}
-
	static jmp_buf s_jmpNoAES;
	static void SigIllHandlerAES(int)
	{
@@ -360,8 +354,8 @@ extern "C"
 static bool TryNEON()
 {
-#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE)
-	return CPU_TryCRC32_ARMV8();
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+	return CPU_TryNEON_ARM();
 #else
	return false;
 #endif
 }
@@ -379,68 +373,10 @@ static bool TryCRC32()
 static bool TryPMULL()
 {
 #if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE)
-# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
-	volatile bool result = true;
-	__try
-	{
-		const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
-		const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
-			b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
-
-		const poly128_t r1 = vmull_p64(a1, b1);
-		const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
-
-		// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
-		const uint64x2_t& t1 = (uint64x2_t)(r1);  // {bignum,bignum}
-		const uint64x2_t& t2 = (uint64x2_t)(r2);  // {bignum,bignum}
-
-		result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
-			vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
-	}
-	__except (EXCEPTION_EXECUTE_HANDLER)
-	{
-		return false;
-	}
-	return result;
-# else
-	// longjmp and clobber warnings. Volatile is required.
-	// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
-	volatile bool result = true;
-
-	volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerPMULL);
-	if (oldHandler == SIG_ERR)
-		return false;
-
-	volatile sigset_t oldMask;
-	if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
-		return false;
-
-	if (setjmp(s_jmpNoPMULL))
-		result = false;
-	else
-	{
-		const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
-		const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
-			b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
-
-		const poly128_t r1 = vmull_p64(a1, b1);
-		const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
-
-		// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
-		const uint64x2_t& t1 = (uint64x2_t)(r1);  // {bignum,bignum}
-		const uint64x2_t& t2 = (uint64x2_t)(r2);  // {bignum,bignum}
-
-		result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
-			vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
-	}
-
-	sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
-	signal(SIGILL, oldHandler);
-	return result;
-# endif
+	return CPU_TryPMULL_ARMV8();
 #else
	return false;
-#endif  // CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE
+#endif
 }
 
 static bool TryAES()
diff --git a/crc-simd.cpp b/crc-simd.cpp
index 5a77b9e2..a8428b48 100644
--- a/crc-simd.cpp
+++ b/crc-simd.cpp
@@ -14,10 +14,12 @@
 # include "nmmintrin.h"
 #endif
 
-#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE) && defined(__GNUC__)
+#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE)
 # include "arm_neon.h"
+#if defined(__GNUC__)
 # include "arm_acle.h"
 #endif
+#endif
 
 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
 # include <signal.h>
diff --git a/gcm-simd.cpp b/gcm-simd.cpp
new file mode 100644
index 00000000..e8081001
--- /dev/null
+++ b/gcm-simd.cpp
@@ -0,0 +1,118 @@
+// gcm-simd.cpp - written and placed in the public domain by
+//                Jeffrey Walton, Uri Blumenthal and Marcel Raad.
+//
+// This source file uses intrinsics to gain access to the x86 PCLMUL
+// and ARMv8a PMULL instructions. A separate source file is needed
+// because additional CXXFLAGS are required to enable the
+// appropriate instruction sets in some build configurations.
+ +#include "pch.h" +#include "config.h" +#include "misc.h" + +#if (CRYPTOPP_AESNI_AVAILABLE) +# include "wmmintrin.h" +#endif + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) +# include "arm_neon.h" +#if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE) +# include "arm_acle.h" +#endif +#endif + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +# include +# include +#endif + +NAMESPACE_BEGIN(CryptoPP) + +#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY +extern "C" { + typedef void (*SigHandler)(int); + + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } +}; +#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY + +#if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE) +bool CPU_TryPMULL_ARMV8() +{ +# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) + volatile bool result = true; + __try + { + const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; + const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + + const poly128_t r1 = vmull_p64(a1, b1); + const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2)); + + // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. + const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} + const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} + + result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && + vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; +# else + // longjmp and clobber warnings. Volatile is required. + // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; + + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerPMULL); + if (oldHandler == SIG_ERR) + return false; + + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; + + if (setjmp(s_jmpNoPMULL)) + result = false; + else + { + const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; + const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + + const poly128_t r1 = vmull_p64(a1, b1); + const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2)); + + // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. 
+		const uint64x2_t& t1 = (uint64x2_t)(r1);  // {bignum,bignum}
+		const uint64x2_t& t2 = (uint64x2_t)(r2);  // {bignum,bignum}
+
+		result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
+			vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
+	}
+
+	sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
+	signal(SIGILL, oldHandler);
+	return result;
+# endif
+}
+#endif  // CRYPTOPP_ARMV8A_PMULL_AVAILABLE
+
+#if CRYPTOPP_ARM_NEON_AVAILABLE
+void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c)
+{
+	CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
+	CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
+	CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
+	*(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
+}
+#endif
+
+NAMESPACE_END
\ No newline at end of file
diff --git a/gcm.cpp b/gcm.cpp
index 2c915125..2c306dd0 100644
--- a/gcm.cpp
+++ b/gcm.cpp
@@ -49,6 +49,10 @@ NAMESPACE_BEGIN(CryptoPP)
 #endif
 #endif
 
+#if CRYPTOPP_ARM_NEON_AVAILABLE
+extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
+#endif
+
 #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_ARMV8A_PMULL_AVAILABLE
 #if defined(__GNUC__)
 // Schneiders, Hovsmith and O'Rourke used this trick.
@@ -193,6 +197,15 @@ __m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
 }
 #endif
 
+inline static void Xor16(byte *a, const byte *b, const byte *c)
+{
+	CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
+	CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
+	CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
+	((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
+	((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
+}
+
 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
 inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
 {
@@ -211,25 +224,6 @@ inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
 }
 #endif
 
-#if CRYPTOPP_ARM_NEON_AVAILABLE
-inline static void NEON_Xor16(byte *a, const byte *b, const byte *c)
-{
-	CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
-	CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
-	CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
-	*(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
-}
-#endif
-
-inline static void Xor16(byte *a, const byte *b, const byte *c)
-{
-	CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
-	CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
-	CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
-	((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
-	((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
-}
-
 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
 CRYPTOPP_ALIGN_DATA(16)
 static const word64 s_clmulConstants64[] = {
@@ -441,7 +435,7 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
		if (HasNEON())
			for (j=2; j<=0x80; j*=2)
				for (k=1; k<j; k++)
-					NEON_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
+					GCM_Xor16_NEON(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
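
Note: the CPU_TryPMULL_ARMV8 probe moved into gcm-simd.cpp uses the same trick as the other CPU_Try* routines: install a SIGILL handler, execute one instruction from the target extension against a known answer, and longjmp back to report failure if the processor traps. Below is a minimal standalone sketch of that pattern; probe_pmull and the stripped-down error handling are illustrative only, and the patch's real probe additionally saves and restores the signal mask with sigprocmask.

    // Standalone sketch of the SIGILL/longjmp probe used by CPU_TryPMULL_ARMV8.
    // Compile for AArch64 with the crypto extension, e.g. -march=armv8-a+crypto.
    #include <setjmp.h>
    #include <signal.h>
    #include <arm_neon.h>

    static jmp_buf s_jmp;
    static void on_sigill(int) { longjmp(s_jmp, 1); }

    bool probe_pmull()
    {
        volatile bool result = true;   // volatile so the value survives longjmp
        void (*oldHandler)(int) = signal(SIGILL, on_sigill);
        if (oldHandler == SIG_ERR)
            return false;

        if (setjmp(s_jmp))
            result = false;            // SIGILL fired: no PMULL on this CPU
        else
        {
            // Known-answer test taken from the patch: the carry-less product of
            // 0x9090909090909090 and 0xb0b0b0b0b0b0b0b0 has 0x5300530053005300
            // in both 64-bit lanes of the 128-bit result.
            const poly64_t a = 0x9090909090909090, b = 0xb0b0b0b0b0b0b0b0;
            const uint64x2_t t = (uint64x2_t)vmull_p64(a, b);
            result = (vgetq_lane_u64(t,0) == 0x5300530053005300 &&
                      vgetq_lane_u64(t,1) == 0x5300530053005300);
        }

        signal(SIGILL, oldHandler);
        return result;
    }

The known-answer check matters: some hypervisors and emulators decode an instruction without faulting yet compute it incorrectly, so a probe that merely survives execution is not enough.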