Fixed ARMv7a and NEON detection. Initial cut-in of GCM

2017-07-30 03:16:58 -04:00 · 2017-07-30 03:16:58 -04:00 · b4f6882237
parent 4b51eadc73
commit b4f6882237
7 changed files with 166 additions and 101 deletions
--- a/10
+++ b/10
@ -2,8 +2,12 @@
 #####        System Attributes and Programs           #####
 ###########################################################

+# If needed
 TMPDIR ?= /tmp
+# Used for ARMv7 and NEON.
+FP_ABI ?= hard

+# Command ard arguments
 AR ?= ar
 ARFLAGS ?= -cr # ar needs the dash on OpenBSD
 RANLIB ?= ranlib
@ -297,7 +301,7 @@ endif
 endif

 ifeq ($(IS_NEON),1)
-	NEON_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -march=armv7-a -mfloat-abi=softfp -mfpu=neon -dM -E - | grep -i -c -q __ARM_NEON && echo "-march=armv7-a -mfloat-abi=softfp -mfpu=neon")
+	NEON_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon -dM -E - | grep -i -c -q __ARM_NEON && echo "-march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon")
 	GCM_FLAG = $(NEON_FLAG)
 	ARIA_FLAG = $(NEON_FLAG)
 	BLAKE2_FLAG = $(NEON_FLAG)
@ -868,6 +872,10 @@ blake2-simd.o : blake2-simd.cpp
 crc-simd.o : crc-simd.cpp
 	$(CXX) $(strip $(CXXFLAGS) $(CRC_FLAG) -c) $<

+# PCLMUL or ARMv7a/ARMv8a available
+gcm-simd.o : gcm-simd.cpp
+	$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
+
 # SSE4.2/SHA-NI or ARMv8a available
 sha-simd.o : sha-simd.cpp
 	$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<
--- a/config.h
+++ b/config.h
@ -517,7 +517,8 @@ NAMESPACE_END

 // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
 #if !defined(CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_NEON__) || defined(__ARM_NEON) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
+# if defined(__ARM_NEON__) || defined(__ARM_NEON) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) || \
+	(CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
 #  define CRYPTOPP_ARM_NEON_AVAILABLE 1
 # endif
 #endif
@ -527,17 +528,21 @@ NAMESPACE_END
 // Microsoft plans to support ARM-64, but its not clear how to detect it.
 // TODO: Add MSC_VER and ARM-64 platform define when available
 #if !defined(CRYPTOPP_ARMV8A_CRC32_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_FEATURE_CRC32) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
+# if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
+	(defined(__ARM_32BIT_STATE_) || defined(__ARM_64BIT_STATE_)) || \
+	(defined(__AARCH32EL__) || defined(__AARCH64EL__))
 #  define CRYPTOPP_ARMV8A_CRC32_AVAILABLE 1
 # endif
 #endif

-// Requires ARMv8, ACLE 2.0 and Aarch64. GCC requires 4.8 and above.
-// LLVM Clang requires 3.5. Apple Clang does not support it at the moment.
-// Microsoft plans to support ARM-64, but its not clear how to detect it.
+// Requires ARMv8, but we are not sure of the define because the ACLE does not discuss it.
+// GCC seems to requires 4.8 and above. LLVM Clang requires 3.5. Apple Clang does not support
+// it at the moment. Microsoft plans to support ARM-64, but its not clear how to detect it.
 // TODO: Add MSC_VER and ARM-64 platform define when available
 #if !defined(CRYPTOPP_ARMV8A_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__)
-# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
+# if defined(__ARM_FEATURE_CRYPTO) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
+	(defined(__ARM_32BIT_STATE_) || defined(__ARM_64BIT_STATE_)) || \
+	(defined(__AARCH32EL__) || defined(__AARCH64EL__))
 #  define CRYPTOPP_ARMV8A_PMULL_AVAILABLE 1
 # endif
 #endif
@ -547,7 +552,9 @@ NAMESPACE_END
 // Microsoft plans to support ARM-64, but its not clear how to detect it.
 // TODO: Add MSC_VER and ARM-64 platform define when available
 #if !defined(CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
-# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
+# if defined(__ARM_FEATURE_CRYPTO) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
+	(defined(__ARM_32BIT_STATE_) || defined(__ARM_64BIT_STATE_)) || \
+	(defined(__AARCH32EL__) || defined(__AARCH64EL__))
 #  define CRYPTOPP_ARMV8A_AES_AVAILABLE 1
 #  define CRYPTOPP_ARMV8A_SHA_AVAILABLE 1
 #  define CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE 1
--- a/cpu.cpp
+++ b/cpu.cpp
@ -344,12 +344,6 @@ extern bool CPU_TryPMULL_ARMV8();
 #ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
 extern "C"
 {
-	static jmp_buf s_jmpNoPMULL;
-	static void SigIllHandlerPMULL(int)
-	{
-		longjmp(s_jmpNoPMULL, 1);
-	}
-
 	static jmp_buf s_jmpNoAES;
 	static void SigIllHandlerAES(int)
 	{
@ -360,8 +354,8 @@ extern "C"

 static bool TryNEON()
 {
-#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE)
-	return CPU_TryCRC32_ARMV8();
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+	return CPU_TryNEON_ARM();
 #else
 	return false;
 #endif
@ -379,68 +373,10 @@ static bool TryCRC32()
 static bool TryPMULL()
 {
 #if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE)
-# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
-	volatile bool result = true;
-	__try
-	{
-		const poly64_t   a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
-		const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
-		                 b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
-
-		const poly128_t r1 = vmull_p64(a1, b1);
-		const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
-
-		// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
-		const uint64x2_t& t1 = (uint64x2_t)(r1);  // {bignum,bignum}
-		const uint64x2_t& t2 = (uint64x2_t)(r2);  // {bignum,bignum}
-
-		result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
-		            vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
-	}
-	__except (EXCEPTION_EXECUTE_HANDLER)
-	{
-		return false;
-	}
-	return result;
-# else
-	// longjmp and clobber warnings. Volatile is required.
-	// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
-	volatile bool result = true;
-
-	volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerPMULL);
-	if (oldHandler == SIG_ERR)
-		return false;
-
-	volatile sigset_t oldMask;
-	if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
-		return false;
-
-	if (setjmp(s_jmpNoPMULL))
-		result = false;
-	else
-	{
-		const poly64_t   a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
-		const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
-		                 b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
-
-		const poly128_t r1 = vmull_p64(a1, b1);
-		const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
-
-		// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
-		const uint64x2_t& t1 = (uint64x2_t)(r1);  // {bignum,bignum}
-		const uint64x2_t& t2 = (uint64x2_t)(r2);  // {bignum,bignum}
-
-		result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
-		            vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
-	}
-
-	sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
-	signal(SIGILL, oldHandler);
-	return result;
-# endif
+	return CPU_TryPMULL_ARMV8();
 #else
 	return false;
-#endif  // CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE
+#endif
 }

 static bool TryAES()
--- a/crc-simd.cpp
+++ b/crc-simd.cpp
@ -14,10 +14,12 @@
 # include "nmmintrin.h"
 #endif

-#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE) && defined(__GNUC__)
+#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE)
 # include "arm_neon.h"
+#if defined(__GNUC__)
 # include "arm_acle.h"
 #endif
+#endif

 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
 # include <signal.h>
--- a/gcm-simd.cpp
+++ b/gcm-simd.cpp
@ -0,0 +1,118 @@
+// crc-simd.cpp - written and placed in the public domain by
+//                Jeffrey Walton, Uri Blumenthal and Marcel Raad.
+//
+//    This source file uses intrinsics to gain access to SSE4.2 and
+//    ARMv8a CRC-32 and CRC-32C instructions. A separate source file
+//    is needed because additional CXXFLAGS are required to enable
+//    the appropriate instructions sets in some build configurations.
+
+#include "pch.h"
+#include "config.h"
+#include "misc.h"
+
+#if (CRYPTOPP_AESNI_AVAILABLE)
+# include "wmmintrin.h"
+#endif
+
+#if (CRYPTOPP_ARM_NEON_AVAILABLE)
+# include "arm_neon.h"
+#if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE)
+# include "arm_acle.h"
+#endif
+#endif
+
+#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
+# include <signal.h>
+# include <setjmp.h>
+#endif
+
+NAMESPACE_BEGIN(CryptoPP)
+
+#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
+extern "C" {
+    typedef void (*SigHandler)(int);
+
+	static jmp_buf s_jmpSIGILL;
+	static void SigIllHandler(int)
+	{
+		longjmp(s_jmpSIGILL, 1);
+	}
+};
+#endif  // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
+
+#if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE)
+bool CPU_TryPMULL_ARMV8()
+{
+# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
+	volatile bool result = true;
+	__try
+	{
+		const poly64_t   a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
+		const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
+		                 b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
+
+		const poly128_t r1 = vmull_p64(a1, b1);
+		const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
+
+		// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
+		const uint64x2_t& t1 = (uint64x2_t)(r1);  // {bignum,bignum}
+		const uint64x2_t& t2 = (uint64x2_t)(r2);  // {bignum,bignum}
+
+		result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
+		            vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
+	}
+	__except (EXCEPTION_EXECUTE_HANDLER)
+	{
+		return false;
+	}
+	return result;
+# else
+	// longjmp and clobber warnings. Volatile is required.
+	// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
+	volatile bool result = true;
+
+	volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerPMULL);
+	if (oldHandler == SIG_ERR)
+		return false;
+
+	volatile sigset_t oldMask;
+	if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
+		return false;
+
+	if (setjmp(s_jmpNoPMULL))
+		result = false;
+	else
+	{
+		const poly64_t   a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
+		const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
+		                 b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
+
+		const poly128_t r1 = vmull_p64(a1, b1);
+		const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
+
+		// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
+		const uint64x2_t& t1 = (uint64x2_t)(r1);  // {bignum,bignum}
+		const uint64x2_t& t2 = (uint64x2_t)(r2);  // {bignum,bignum}
+
+		result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
+		            vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
+	}
+
+	sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
+	signal(SIGILL, oldHandler);
+	return result;
+# endif
+}
+#endif  // CRYPTOPP_ARMV8A_PMULL_AVAILABLE
+
+#if CRYPTOPP_ARM_NEON_AVAILABLE
+void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c)
+{
+    CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
+    CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
+    CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
+    *(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
+}
+#endif
+
+NAMESPACE_END
--- a/gcm.cpp
+++ b/gcm.cpp
@ -49,6 +49,10 @@ NAMESPACE_BEGIN(CryptoPP)
 #endif
 #endif

+#if CRYPTOPP_ARM_NEON_AVAILABLE
+extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
+#endif
+
 #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_ARMV8A_PMULL_AVAILABLE
 #if defined(__GNUC__)
 // Schneiders, Hovsmith and O'Rourke used this trick.
@ -193,6 +197,15 @@ __m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
 }
 #endif

+inline static void Xor16(byte *a, const byte *b, const byte *c)
+{
+    CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
+    CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
+    CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
+    ((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
+    ((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
+}
+
 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
 inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
 {
@ -211,25 +224,6 @@ inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
 }
 #endif

-#if CRYPTOPP_ARM_NEON_AVAILABLE
-inline static void NEON_Xor16(byte *a, const byte *b, const byte *c)
-{
-    CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
-    CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
-    CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
-    *(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
-}
-#endif
-
-inline static void Xor16(byte *a, const byte *b, const byte *c)
-{
-    CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
-    CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
-    CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
-    ((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
-    ((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
-}
-
 #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
 CRYPTOPP_ALIGN_DATA(16)
 static const word64 s_clmulConstants64[] = {
@ -441,7 +435,7 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
            if (HasNEON())
                for (j=2; j<=0x80; j*=2)
                    for (k=1; k<j; k++)
-                        NEON_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
+                        GCM_Xor16_NEON(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
            else
 #endif
                for (j=2; j<=0x80; j*=2)
@ -497,8 +491,8 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
                for (j=2; j<=8; j*=2)
                    for (k=1; k<j; k++)
                    {
-                        NEON_Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
-                        NEON_Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
+                        GCM_Xor16_NEON(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
+                        GCM_Xor16_NEON(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
                    }
            else
 #endif
--- a/validat1.cpp
+++ b/validat1.cpp
@ -264,14 +264,14 @@ bool TestSettings()
 	// Don't assert the alignment of testvals. That's what this test is for.
 	byte testvals[10] = {1,2,2,3,3,3,3,2,2,1};
 	if (*(word32 *)(void *)(testvals+3) == 0x03030303 && *(word64 *)(void *)(testvals+1) == W64LIT(0x0202030303030202))
-		std::cout << "passed:  Your machine allows unaligned data access.\n";
+		std::cout << "passed:  Unaligned data access (CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS).\n";
 	else
 	{
 		std::cout << "FAILED:  Unaligned data access gave incorrect results.\n";
 		pass = false;
 	}
 #else
-	std::cout << "passed:  CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is not defined. Will restrict to aligned data access.\n";
+	std::cout << "passed:  Aligned data access (no CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS).\n";
 #endif

 	if (sizeof(byte) == 1)