Fixed ARMv7a and NEON detection. Initial cut-in of GCM

pull/461/head
Jeffrey Walton 2017-07-30 03:16:58 -04:00
parent 4b51eadc73
commit b4f6882237
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
7 changed files with 166 additions and 101 deletions

View File

@ -2,8 +2,12 @@
##### System Attributes and Programs #####
###########################################################
# If needed
TMPDIR ?= /tmp
# Used for ARMv7 and NEON.
FP_ABI ?= hard
# Command ard arguments
AR ?= ar
ARFLAGS ?= -cr # ar needs the dash on OpenBSD
RANLIB ?= ranlib
@ -297,7 +301,7 @@ endif
endif
ifeq ($(IS_NEON),1)
NEON_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -march=armv7-a -mfloat-abi=softfp -mfpu=neon -dM -E - | grep -i -c -q __ARM_NEON && echo "-march=armv7-a -mfloat-abi=softfp -mfpu=neon")
NEON_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon -dM -E - | grep -i -c -q __ARM_NEON && echo "-march=armv7-a -mfloat-abi=$(FP_ABI) -mfpu=neon")
GCM_FLAG = $(NEON_FLAG)
ARIA_FLAG = $(NEON_FLAG)
BLAKE2_FLAG = $(NEON_FLAG)
@ -868,6 +872,10 @@ blake2-simd.o : blake2-simd.cpp
crc-simd.o : crc-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(CRC_FLAG) -c) $<
# PCLMUL or ARMv7a/ARMv8a available
gcm-simd.o : gcm-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(GCM_FLAG) -c) $<
# SSE4.2/SHA-NI or ARMv8a available
sha-simd.o : sha-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(SHA_FLAG) -c) $<

View File

@ -517,7 +517,8 @@ NAMESPACE_END
// Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
#if !defined(CRYPTOPP_ARM_NEON_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_NEON__) || defined(__ARM_NEON) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
# if defined(__ARM_NEON__) || defined(__ARM_NEON) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) || \
(CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
# define CRYPTOPP_ARM_NEON_AVAILABLE 1
# endif
#endif
@ -527,17 +528,21 @@ NAMESPACE_END
// Microsoft plans to support ARM-64, but its not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_ARMV8A_CRC32_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_FEATURE_CRC32) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
# if defined(__ARM_FEATURE_CRC32) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
(defined(__ARM_32BIT_STATE_) || defined(__ARM_64BIT_STATE_)) || \
(defined(__AARCH32EL__) || defined(__AARCH64EL__))
# define CRYPTOPP_ARMV8A_CRC32_AVAILABLE 1
# endif
#endif
// Requires ARMv8, ACLE 2.0 and Aarch64. GCC requires 4.8 and above.
// LLVM Clang requires 3.5. Apple Clang does not support it at the moment.
// Microsoft plans to support ARM-64, but its not clear how to detect it.
// Requires ARMv8, but we are not sure of the define because the ACLE does not discuss it.
// GCC seems to requires 4.8 and above. LLVM Clang requires 3.5. Apple Clang does not support
// it at the moment. Microsoft plans to support ARM-64, but its not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_ARMV8A_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__)
# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
# if defined(__ARM_FEATURE_CRYPTO) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
(defined(__ARM_32BIT_STATE_) || defined(__ARM_64BIT_STATE_)) || \
(defined(__AARCH32EL__) || defined(__AARCH64EL__))
# define CRYPTOPP_ARMV8A_PMULL_AVAILABLE 1
# endif
#endif
@ -547,7 +552,9 @@ NAMESPACE_END
// Microsoft plans to support ARM-64, but its not clear how to detect it.
// TODO: Add MSC_VER and ARM-64 platform define when available
#if !defined(CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
# if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500)
# if defined(__ARM_FEATURE_CRYPTO) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 8)) || \
(defined(__ARM_32BIT_STATE_) || defined(__ARM_64BIT_STATE_)) || \
(defined(__AARCH32EL__) || defined(__AARCH64EL__))
# define CRYPTOPP_ARMV8A_AES_AVAILABLE 1
# define CRYPTOPP_ARMV8A_SHA_AVAILABLE 1
# define CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE 1

72
cpu.cpp
View File

@ -344,12 +344,6 @@ extern bool CPU_TryPMULL_ARMV8();
#ifndef CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
extern "C"
{
static jmp_buf s_jmpNoPMULL;
static void SigIllHandlerPMULL(int)
{
longjmp(s_jmpNoPMULL, 1);
}
static jmp_buf s_jmpNoAES;
static void SigIllHandlerAES(int)
{
@ -360,8 +354,8 @@ extern "C"
static bool TryNEON()
{
#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE)
return CPU_TryCRC32_ARMV8();
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
return CPU_TryNEON_ARM();
#else
return false;
#endif
@ -379,68 +373,10 @@ static bool TryCRC32()
static bool TryPMULL()
{
#if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = vmull_p64(a1, b1);
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerPMULL);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpNoPMULL))
result = false;
else
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = vmull_p64(a1, b1);
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
return CPU_TryPMULL_ARMV8();
#else
return false;
#endif // CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE
#endif
}
static bool TryAES()

View File

@ -14,10 +14,12 @@
# include "nmmintrin.h"
#endif
#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE) && defined(__GNUC__)
#if (CRYPTOPP_ARMV8A_CRC32_AVAILABLE)
# include "arm_neon.h"
#if defined(__GNUC__)
# include "arm_acle.h"
#endif
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>

118
gcm-simd.cpp Normal file
View File

@ -0,0 +1,118 @@
// crc-simd.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to SSE4.2 and
// ARMv8a CRC-32 and CRC-32C instructions. A separate source file
// is needed because additional CXXFLAGS are required to enable
// the appropriate instructions sets in some build configurations.
#include "pch.h"
#include "config.h"
#include "misc.h"
#if (CRYPTOPP_AESNI_AVAILABLE)
# include "wmmintrin.h"
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
# include "arm_neon.h"
#if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE)
# include "arm_acle.h"
#endif
#endif
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
# include <signal.h>
# include <setjmp.h>
#endif
NAMESPACE_BEGIN(CryptoPP)
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
extern "C" {
typedef void (*SigHandler)(int);
static jmp_buf s_jmpSIGILL;
static void SigIllHandler(int)
{
longjmp(s_jmpSIGILL, 1);
}
};
#endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY
#if (CRYPTOPP_ARMV8A_PMULL_AVAILABLE)
bool CPU_TryPMULL_ARMV8()
{
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
volatile bool result = true;
__try
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = vmull_p64(a1, b1);
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
__except (EXCEPTION_EXECUTE_HANDLER)
{
return false;
}
return result;
# else
// longjmp and clobber warnings. Volatile is required.
// http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
volatile bool result = true;
volatile SigHandler oldHandler = signal(SIGILL, SigIllHandlerPMULL);
if (oldHandler == SIG_ERR)
return false;
volatile sigset_t oldMask;
if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
return false;
if (setjmp(s_jmpNoPMULL))
result = false;
else
{
const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0};
const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0},
b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0};
const poly128_t r1 = vmull_p64(a1, b1);
const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2));
// Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233.
const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum}
const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum}
result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 &&
vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00);
}
sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
signal(SIGILL, oldHandler);
return result;
# endif
}
#endif // CRYPTOPP_ARMV8A_PMULL_AVAILABLE
#if CRYPTOPP_ARM_NEON_AVAILABLE
void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c)
{
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
*(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
}
#endif
NAMESPACE_END

38
gcm.cpp
View File

@ -49,6 +49,10 @@ NAMESPACE_BEGIN(CryptoPP)
#endif
#endif
#if CRYPTOPP_ARM_NEON_AVAILABLE
extern void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c);
#endif
#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_ARMV8A_PMULL_AVAILABLE
#if defined(__GNUC__)
// Schneiders, Hovsmith and O'Rourke used this trick.
@ -193,6 +197,15 @@ __m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
}
#endif
inline static void Xor16(byte *a, const byte *b, const byte *c)
{
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
}
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
{
@ -211,25 +224,6 @@ inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
}
#endif
#if CRYPTOPP_ARM_NEON_AVAILABLE
inline static void NEON_Xor16(byte *a, const byte *b, const byte *c)
{
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
*(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
}
#endif
inline static void Xor16(byte *a, const byte *b, const byte *c)
{
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
CRYPTOPP_ASSERT(IsAlignedOn(b,GetAlignmentOf<word64>()));
CRYPTOPP_ASSERT(IsAlignedOn(c,GetAlignmentOf<word64>()));
((word64 *)(void *)a)[0] = ((word64 *)(void *)b)[0] ^ ((word64 *)(void *)c)[0];
((word64 *)(void *)a)[1] = ((word64 *)(void *)b)[1] ^ ((word64 *)(void *)c)[1];
}
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
CRYPTOPP_ALIGN_DATA(16)
static const word64 s_clmulConstants64[] = {
@ -441,7 +435,7 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
if (HasNEON())
for (j=2; j<=0x80; j*=2)
for (k=1; k<j; k++)
NEON_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
GCM_Xor16_NEON(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
else
#endif
for (j=2; j<=0x80; j*=2)
@ -497,8 +491,8 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
for (j=2; j<=8; j*=2)
for (k=1; k<j; k++)
{
NEON_Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
NEON_Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
GCM_Xor16_NEON(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
GCM_Xor16_NEON(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
}
else
#endif

View File

@ -264,14 +264,14 @@ bool TestSettings()
// Don't assert the alignment of testvals. That's what this test is for.
byte testvals[10] = {1,2,2,3,3,3,3,2,2,1};
if (*(word32 *)(void *)(testvals+3) == 0x03030303 && *(word64 *)(void *)(testvals+1) == W64LIT(0x0202030303030202))
std::cout << "passed: Your machine allows unaligned data access.\n";
std::cout << "passed: Unaligned data access (CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS).\n";
else
{
std::cout << "FAILED: Unaligned data access gave incorrect results.\n";
pass = false;
}
#else
std::cout << "passed: CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is not defined. Will restrict to aligned data access.\n";
std::cout << "passed: Aligned data access (no CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS).\n";
#endif
if (sizeof(byte) == 1)