From b1d7754ff7b73fb4e6e3cf6adc7a7da986e233cb Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 24 Aug 2016 08:10:37 -0400 Subject: [PATCH 1/4] Add disassembly tests for RDRAND and RDSEED --- cryptest.sh | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/cryptest.sh b/cryptest.sh index ed1bbec4..687a3a95 100755 --- a/cryptest.sh +++ b/cryptest.sh @@ -115,6 +115,9 @@ if [[ "$IS_SOLARIS" -ne "0" ]]; then else AWK=nawk; fi + + DISASS=dis + DISASSARGS=() fi # Fixup @@ -138,7 +141,9 @@ done # We need to use the C++ compiler to determine feature availablility. Otherwise # mis-detections occur on a number of platforms. if [[ ((-z "$CXX") || ("$CXX" == "gcc")) ]]; then - if [[ "$IS_DARWIN" -ne "0" ]]; then + if [[ ("$CXX" == "gcc") ]]; then + CXX=g++ + elif [[ "$IS_DARWIN" -ne "0" ]]; then CXX=c++ elif [[ "$IS_SOLARIS" -ne "0" ]]; then if [[ (-e "/opt/developerstudio12.5/bin/CC") ]]; then @@ -182,7 +187,7 @@ if [[ ("$SUN_COMPILER" -eq "0") ]]; then fi # Now that the compiler is fixed, determine the compiler version for fixups -CLANG_37_OR_ABOVE=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'clang version (3\.[7-9]|[5-9])') +CLANG_37_OR_ABOVE=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'clang version (3\.[7-9]|[4-9]\.[0-9])') GCC_60_OR_ABOVE=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'gcc version (6\.[0-9]|[7-9])') GCC_51_OR_ABOVE=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'gcc version (5\.[1-9]|[6-9])') GCC_48_COMPILER=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'gcc version 4\.8') @@ -1026,6 +1031,10 @@ if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_X86_AES" -ne "0") ]]; then OBJFILE=rijndael.o CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + # This works for SunCC, but we need something like: + # /opt/solarisstudio12.4/bin/CC -DNDEBUG -g2 -O2 -xarch=aes -m64 -D__SSE2__ -D__SSE3__ \ + # -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -c rijndael.cpp + COUNT=0 FAILED=0 DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) @@ -1075,6 +1084,45 @@ if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_X86_AES" -ne "0") ]]; then fi fi +############################################ +# Test RDRAND and RDSEED code generation +if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_X86_RDRAND" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: RDRAND and RDSEED code generation" | tee -a "$TEST_RESULTS" + echo + + "$MAKE" clean > /dev/null 2>&1 + rm -f adhoc.cpp > /dev/null 2>&1 + + OBJFILE=rdrand.o + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + # This works for SunCC, but we need something like: + # /opt/solarisstudio12.4/bin/CC -DNDEBUG -g2 -O2 -xarch=avx_i -m64 -D__SSE2__ -D__SSE3__ \ + # -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -D__RDRND__ -D__RDSEED__ -c rdrand.cpp + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c rdrand) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate rdrand instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c rdseed) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate rdseed instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]];then + echo "Verified rdrand and rdseed machine instruction generation" | tee -a "$TEST_RESULTS" + fi +fi + ############################################ # ARM 64x64→128-bit multiply code generation if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_ARM_CRYPTO" -ne "0") ]]; then From 954c3bb82b7f09d76c73ced696803d814c9e3fdb Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 24 Aug 2016 12:28:54 -0400 Subject: [PATCH 2/4] Enable AES-NI intrinsics for Clang. Add ARM NEON and PMULL implementation --- gcm.cpp | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 199 insertions(+), 17 deletions(-) diff --git a/gcm.cpp b/gcm.cpp index b5400b98..c5eb0c0c 100644 --- a/gcm.cpp +++ b/gcm.cpp @@ -12,7 +12,7 @@ #ifndef CRYPTOPP_IMPORTS #ifndef CRYPTOPP_GENERATE_X64_MASM -// Clang 3.3 integrated assembler crash on Linux. MacPorts GCC compile error. SunCC crash under Sun Studio 12.5 and below. +// Clang 3.3 integrated assembler crash on Linux. MacPorts GCC compile error. SunCC crash under SunCC 5.14 and below. #if (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION < 30400)) || defined(CRYPTOPP_CLANG_INTEGRATED_ASSEMBLER) || (defined(__SUNPRO_CC) && __SUNPRO_CC <= 0x5140) # undef CRYPTOPP_X86_ASM_AVAILABLE # undef CRYPTOPP_X32_ASM_AVAILABLE @@ -20,11 +20,9 @@ # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE # undef CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE # undef CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE -# undef CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE # define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0 # define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0 # define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0 -# define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0 #endif #include "gcm.h" @@ -94,6 +92,16 @@ inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c) } #endif +#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE +inline static void NEON_Xor16(byte *a, const byte *b, const byte *c) +{ + assert(IsAlignedOn(a,GetAlignmentOf())); + assert(IsAlignedOn(b,GetAlignmentOf())); + assert(IsAlignedOn(c,GetAlignmentOf())); + *(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c); +} +#endif + inline static void Xor16(byte *a, const byte *b, const byte *c) { assert(IsAlignedOn(a,GetAlignmentOf())); @@ -109,6 +117,7 @@ static const word64 s_clmulConstants64[] = { W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; + static const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64; static const unsigned int s_clmulTableSizeInBlocks = 8; @@ -146,14 +155,56 @@ inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r) { - __m128i c0 = _mm_clmulepi64_si128(x,h,0); - __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10)); - __m128i c2 = _mm_clmulepi64_si128(x,h,0x11); + const __m128i c0 = _mm_clmulepi64_si128(x,h,0); + const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10)); + const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11); return CLMUL_Reduce(c0, c1, c2, r); } #endif +#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + +CRYPTOPP_ALIGN_DATA(16) +static const word64 s_clmulConstants64[] = { + W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients + W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8 + W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8 +}; + +static const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64; +static const unsigned int s_clmulTableSizeInBlocks = 8; + +inline uint64x2_t PMULL_Reduce(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r) +{ + // See comments fo CLMUL_Reduce + + c1 = veorq_u64(c1, (uint64x2_t)vextq_u8(vdupq_n_u8(0), (uint8x16_t)c0, 8)); + c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(c0, 0), vgetq_lane_u64(r, 1))); + c0 = (uint64x2_t)vextq_u8((uint8x16_t)c0, vdupq_n_u8(0), 8); + c0 = veorq_u64(c0, c1); + c0 = vshlq_n_u64(c0, 1); + c0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(c0, 0), vgetq_lane_u64(r, 0)); + c2 = veorq_u64(c2, c0); + c2 = veorq_u64(c2, (uint64x2_t)vextq_u8((uint8x16_t)c1, vdupq_n_u8(0), 8)); + c1 = vcombine_u64(vget_low_u64(c1), vget_low_u64(c2)); + c1 = vshrq_n_u64(c1, 63); + c2 = vshlq_n_u64(c2, 1); + + return veorq_u64(c2, c1); +} + +inline uint64x2_t PMULL_GF_Mul(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r) +{ + const uint64x2_t c0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(x, 0), vgetq_lane_u64(h, 0)); + const uint64x2_t c1 = veorq_u64((uint64x2_t)vmull_p64(vgetq_lane_u64(x, 1), vgetq_lane_u64(h,0)), + (uint64x2_t)vmull_p64(vgetq_lane_u64(x, 0), vgetq_lane_u64(h, 1))); + const uint64x2_t c2 = (uint64x2_t)vmull_high_p64((poly64x2_t)x, (poly64x2_t)h); + + return PMULL_Reduce(c0, c1, c2, r); +} +#endif + void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms) { BlockCipher &blockCipher = AccessBlockCipher(); @@ -172,6 +223,14 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE; } else +#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + if (HasPMULL()) + { + // Avoid "parameter not used" error and suppress Coverity finding + (void)params.GetIntValue(Name::TableSize(), tableSize); + tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE; + } + else #endif { if (params.GetIntValue(Name::TableSize(), tableSize)) @@ -208,6 +267,32 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const h = CLMUL_GF_Mul(h1, h0, r); } + return; + } +#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + if (HasPMULL()) + { + const uint64x2_t r = s_clmulConstants[0]; + const uint64x2_t t = vld1q_u64((uint64_t *)hashKey); + const uint64x2_t h0 = (uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t), vget_low_u64(t))); + + uint64x2_t h = h0; + for (i=0; i= 16) { size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0; - __m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), bswapMask2);; + __m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2);; __m128i c0 = _mm_setzero_si128(); __m128i c1 = _mm_setzero_si128(); __m128i c2 = _mm_setzero_si128(); @@ -398,41 +509,41 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) { __m128i h0 = _mm_load_si128(table+i); __m128i h1 = _mm_load_si128(table+i+1); - __m128i h01 = _mm_xor_si128(h0, h1); + __m128i h2 = _mm_xor_si128(h0, h1); if (++i == s) { - d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), bswapMask); + d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1); d = _mm_xor_si128(d, x); c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0)); c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1)); d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2))); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0)); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0)); break; } - d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), bswapMask2); + d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2); c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1)); c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1)); d2 = _mm_xor_si128(d2, d); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h01, 1)); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1)); if (++i == s) { - d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), bswapMask); + d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1); d = _mm_xor_si128(d, x); c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10)); c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11)); d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2))); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10)); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0x10)); break; } - d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), bswapMask); + d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1); c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10)); c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10)); d = _mm_xor_si128(d, d2); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10)); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0x10)); } data += s*16; len -= s*16; @@ -444,6 +555,75 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) _mm_store_si128((__m128i *)(void *)HashBuffer(), x); return len; } +#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE + if (HasPMULL()) + { + const uint64x2_t *table = (const uint64x2_t *)MulTable(); + uint64x2_t x = vld1q_u64((const uint64_t*)HashBuffer()); + const uint64x2_t r = s_clmulConstants[0]; + + while (len >= 16) + { + size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0; + uint64x2_t d, d2 = (uint64x2_t)vrev64q_u8((uint8x16_t)vld1q_u64((const uint64_t *)(data+(s-1)*16))); + uint64x2_t c0 = vdupq_n_u64(0); + uint64x2_t c1 = vdupq_n_u64(0); + uint64x2_t c2 = vdupq_n_u64(0); + + while (true) + { + const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i)); + const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1)); + const uint64x2_t h2 = veorq_u64(h0, h1); + + if (++i == s) + { + const uint64x2_t t1 = vld1q_u64((const uint64_t *)data); + d = veorq_u64((uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t1), vget_low_u64(t1))), x); + c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 0))); + c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 1), vgetq_lane_u64(h1, 0))); + d = veorq_u64(d, (uint64x2_t)vcombine_u32(vget_high_u32((uint32x4_t)d), vget_low_u32((uint32x4_t)d))); + c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 0))); + + break; + } + + d = (uint64x2_t)vrev64q_u8((uint8x16_t)vld1q_u64((const uint64_t *)(data+(s-i)*16-8))); + c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 1), vgetq_lane_u64(h0, 0))); + c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 1), vgetq_lane_u64(h1, 0))); + d2 = veorq_u64(d2, d); + c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 1), vgetq_lane_u64(h2, 0))); + + if (++i == s) + { + + const uint64x2_t t2 = vld1q_u64((const uint64_t *)data); + d = veorq_u64((uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t2), vget_low_u64(t2))), x); + c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 1))); + c2 = veorq_u64(c2, (uint64x2_t)vmull_high_p64((poly64x2_t)d, (poly64x2_t)h1)); + d = veorq_u64(d, (uint64x2_t)vcombine_u32(vget_high_u32((uint32x4_t)d), vget_low_u32((uint32x4_t)d))); + c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 1))); + + break; + } + + const uint64x2_t t3 = vld1q_u64((uint64_t *)(data+(s-i)*16-8)); + d2 = (uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t3), vget_low_u64(t3))); + c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 1))); + c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 0), vgetq_lane_u64(h1, 1))); + d = veorq_u64(d, d2); + c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 1))); + } + data += s*16; + len -= s*16; + + c1 = veorq_u64(veorq_u64(c1, c0), c2); + x = PMULL_Reduce(c0, c1, c2, r); + } + + vst1q_u64((uint64_t *)HashBuffer(), x); + return len; +} #endif typedef BlockGetAndPut Block; @@ -453,6 +633,8 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len) switch (2*(m_buffer.size()>=64*1024) #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) + HasSSE2() +//#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE +// + HasNEON() #endif ) { From 57a0059eb70d7b630d0a5c536e3e0d383467ca73 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 24 Aug 2016 12:36:03 -0400 Subject: [PATCH 3/4] Improve CPU feature detection for X86 and ARM. Add tests for X86 Carryless Multiply; and X86 and ARM CRC32 --- cryptest.sh | 424 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 313 insertions(+), 111 deletions(-) diff --git a/cryptest.sh b/cryptest.sh index 687a3a95..f62e9368 100755 --- a/cryptest.sh +++ b/cryptest.sh @@ -526,7 +526,7 @@ HAVE_X86_AES=0 HAVE_X86_RDRAND=0 HAVE_X86_RDSEED=0 HAVE_X86_PCLMUL=0 -if [[ (("$IS_X86" -ne "0") || ("$IS_X64" -ne "0")) && ("$SUN_COMPILER" -eq "0") ]]; then +if [[ ("$IS_X86" -ne "0" || "$IS_X64" -ne "0") && ("$SUN_COMPILER" -eq "0") ]]; then rm -f "$TMP/adhoc.exe" > /dev/null 2>&1 "$CXX" -DCRYPTOPP_ADHOC_MAIN -maes adhoc.cpp -o "$TMP/adhoc.exe" > /dev/null 2>&1 if [[ "$?" -eq "0" ]]; then @@ -574,7 +574,11 @@ fi # ARMv7 and ARMv8, including NEON, CRC32 and Crypto extensions if [[ ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0") ]]; then - ARM_FEATURES=$(cat /proc/cpuinfo 2>&1 | "$AWK" '{IGNORECASE=1}{if ($1 == "Features") print}' | cut -f 2 -d ':') + if [[ ("$IS_DARWIN" -ne "0") ]]; then + ARM_FEATURES=$(sysctl machdep.cpu.features 2>&1 | cut -f 2 -d ':') + else + ARM_FEATURES=$(cat /proc/cpuinfo 2>&1 | "$AWK" '{IGNORECASE=1}{if ($1 == "Features") print}' | cut -f 2 -d ':') + fi if [[ (-z "$HAVE_ARMV7A" && "$IS_ARM32" -ne "0") ]]; then HAVE_ARMV7A=$(echo "$ARM_FEATURES" | "$GREP" -i -c 'neon') @@ -651,7 +655,7 @@ fi # Used to disassemble object modules so we can verify some aspects of code generation if [[ (-z "$HAVE_DISASS") ]]; then echo "int main(int argc, char* argv[]) {return 0;}" > "$TMP/test.cc" - "$CXX" -x c "$TMP/test.cc" -o "$TMP/test.exe" > /dev/null 2>&1 + "$CXX" "$TMP/test.cc" -o "$TMP/test.exe" > /dev/null 2>&1 if [[ "$?" -eq "0" ]]; then "$DISASS" "${DISASSARGS[@]}" "$TMP/test.exe" > /dev/null 2>&1 if [[ "$?" -eq "0" ]]; then @@ -1019,142 +1023,340 @@ echo "Start time: $TEST_BEGIN" | tee -a "$TEST_RESULTS" ############################################ # Test AES-NI code generation -if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_X86_AES" -ne "0") ]]; then - echo - echo "************************************" | tee -a "$TEST_RESULTS" - echo "Testing: AES-NI code generation" | tee -a "$TEST_RESULTS" - echo - - "$MAKE" clean > /dev/null 2>&1 - rm -f adhoc.cpp > /dev/null 2>&1 - - OBJFILE=rijndael.o - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" +if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; then # This works for SunCC, but we need something like: # /opt/solarisstudio12.4/bin/CC -DNDEBUG -g2 -O2 -xarch=aes -m64 -D__SSE2__ -D__SSE3__ \ - # -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -c rijndael.cpp + # -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -c rijndael.cpp - COUNT=0 - FAILED=0 - DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) - - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesenc) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate aesenc instruction" | tee -a "$TEST_RESULTS" - fi - - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesenclast) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate aesenclast instruction" | tee -a "$TEST_RESULTS" - fi - - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesdec) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate aesdec instruction" | tee -a "$TEST_RESULTS" - fi - - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesdeclast) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate aesdeclast instruction" | tee -a "$TEST_RESULTS" - fi - - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesimc) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate aesimc instruction" | tee -a "$TEST_RESULTS" - fi - - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aeskeygenassist) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate aeskeygenassist instruction" | tee -a "$TEST_RESULTS" - fi - - if [[ ("$FAILED" -eq "0") ]];then - echo "Verified aesenc, aesenclast, aesdec, aesdeclast, aesimc, aeskeygenassist machine instruction generation" | tee -a "$TEST_RESULTS" + if [[ ("$IS_DARWIN" -ne "0") ]]; then + X86_AESNI=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c aes) + elif [[ ("$IS_SOLARIS" -ne "0") ]]; then + X86_AESNI=$(isainfo -v 2>/dev/null | "$GREP" -i -c aes) else - if [[ ("$CLANG_COMPILER" -ne "0" && "$CLANG_37_OR_ABOVE" -eq "0") ]]; then - echo "This could be due to Clang and lack of expected support for SSSE3 in some versions of the compiler. If so, try Clang 3.7 or above" + X86_AESNI=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c aes) + fi + + if [[ ("$X86_AESNI" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: X86 AES-NI code generation" | tee -a "$TEST_RESULTS" + echo + + "$MAKE" clean > /dev/null 2>&1 + rm -f adhoc.cpp > /dev/null 2>&1 + + OBJFILE=rijndael.o + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesenc) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesenc instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesenclast) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesenclast instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesdec) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesdec instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesdeclast) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesdeclast instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesimc) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aesimc instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aeskeygenassist) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate aeskeygenassist instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]];then + echo "Verified aesenc, aesenclast, aesdec, aesdeclast, aesimc, aeskeygenassist machine instructions" | tee -a "$TEST_RESULTS" + else + if [[ ("$CLANG_COMPILER" -ne "0" && "$CLANG_37_OR_ABOVE" -eq "0") ]]; then + echo "This could be due to Clang and lack of expected support for SSSE3 in some versions of the compiler. If so, try Clang 3.7 or above" + fi + fi + fi +fi + +############################################ +# X86 carryless multiply code generation +if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; then + + # This works for SunCC, but we need something like: + # /opt/solarisstudio12.4/bin/CC -DNDEBUG -g2 -O2 -xarch=aes -m64 -D__SSE2__ -D__SSE3__ \ + # -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -D__RDRND__ -D__RDSEED__ -c gcm.cpp + + if [[ ("$IS_DARWIN" -ne "0") ]]; then + X86_PCLMUL=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c pclmulq) + elif [[ ("$IS_SOLARIS" -ne "0") ]]; then + X86_PCLMUL=$(isainfo -v 2>/dev/null | "$GREP" -i -c pclmulq) + else + X86_PCLMUL=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c pclmulq) + fi + + if [[ ("$X86_PCLMUL" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: X86 carryless multiply code generation" | tee -a "$TEST_RESULTS" + echo + + "$MAKE" clean > /dev/null 2>&1 + rm -f adhoc.cpp > /dev/null 2>&1 + + OBJFILE=gcm.o + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo "$DISASS_TEXT" | "$EGREP" -i -c '(pclmullqh|vpclmulqdq)') + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate pclmullqh instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$EGREP" -i -c '(pclmullql|vpclmulqdq)') + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate pclmullql instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]];then + echo "Verified pclmullqh and pclmullql machine instructions" | tee -a "$TEST_RESULTS" + else + if [[ ("$CLANG_COMPILER" -ne "0") ]]; then + echo "This is probably due to Clang and its integrated assembler. The integrated assembler cannot consume a lot of ASM used by the library" + fi fi fi fi ############################################ # Test RDRAND and RDSEED code generation -if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_X86_RDRAND" -ne "0") ]]; then - echo - echo "************************************" | tee -a "$TEST_RESULTS" - echo "Testing: RDRAND and RDSEED code generation" | tee -a "$TEST_RESULTS" - echo - - "$MAKE" clean > /dev/null 2>&1 - rm -f adhoc.cpp > /dev/null 2>&1 - - OBJFILE=rdrand.o - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" +if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; then # This works for SunCC, but we need something like: # /opt/solarisstudio12.4/bin/CC -DNDEBUG -g2 -O2 -xarch=avx_i -m64 -D__SSE2__ -D__SSE3__ \ - # -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -D__RDRND__ -D__RDSEED__ -c rdrand.cpp + # -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -D__RDRND__ -D__RDSEED__ -c rdrand.cpp - COUNT=0 - FAILED=0 - DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) - - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c rdrand) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate rdrand instruction" | tee -a "$TEST_RESULTS" + if [[ ("$IS_DARWIN" -ne "0") ]]; then + X86_RDRAND=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c rdrand) + X86_RDSEED=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c rdseed) + elif [[ ("$IS_SOLARIS" -ne "0") ]]; then + X86_RDRAND=$(isainfo -v 2>/dev/null | "$GREP" -i -c rdrand) + X86_RDSEED=$(isainfo -v 2>/dev/null | "$GREP" -i -c rdseed) + else + X86_RDRAND=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c rdrand) + X86_RDSEED=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c rdseed) fi - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c rdseed) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate rdseed instruction" | tee -a "$TEST_RESULTS" - fi + if [[ ("$X86_RDRAND" -ne "0" || "$X86_RDSEED" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: X86 RDRAND and RDSEED code generation" | tee -a "$TEST_RESULTS" + echo - if [[ ("$FAILED" -eq "0") ]];then - echo "Verified rdrand and rdseed machine instruction generation" | tee -a "$TEST_RESULTS" + "$MAKE" clean > /dev/null 2>&1 + rm -f adhoc.cpp > /dev/null 2>&1 + + OBJFILE=rdrand.o + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + if [[ "$X86_RDRAND" -ne "0" ]]; then + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c rdrand) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate rdrand instruction" | tee -a "$TEST_RESULTS" + fi + fi + + if [[ "$X86_RDSEED" -ne "0" ]]; then + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c rdseed) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate rdseed instruction" | tee -a "$TEST_RESULTS" + fi + fi + + if [[ ("$FAILED" -eq "0") ]];then + echo "Verified rdrand and rdseed machine instructions" | tee -a "$TEST_RESULTS" + fi fi fi ############################################ -# ARM 64x64→128-bit multiply code generation -if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_ARM_CRYPTO" -ne "0") ]]; then - echo - echo "************************************" | tee -a "$TEST_RESULTS" - echo "Testing: ARM 64x64→128-bit multiply code generation" | tee -a "$TEST_RESULTS" - echo +# X86 CRC32 code generation +if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; then - "$MAKE" clean > /dev/null 2>&1 - rm -f adhoc.cpp > /dev/null 2>&1 + # This works for SunCC, but we need something like: + # /opt/solarisstudio12.3/bin/CC -DNDEBUG -g2 -O2 -xarch=sse4_2 -m64 -D__SSE2__ -D__SSE3__ \ + # -D__SSE4_1__ -D__SSE4_2__ -c crc.cpp - OBJFILE=gcm.o - CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" - - COUNT=0 - FAILED=0 - DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) - - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -v pmull2 | "$GREP" -i -c pmull) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate pmull instruction" | tee -a "$TEST_RESULTS" + if [[ ("$IS_DARWIN" -ne "0") ]]; then + X86_CRC32=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c sse4.2) + elif [[ ("$IS_SOLARIS" -ne "0") ]]; then + X86_CRC32=$(isainfo -v 2>/dev/null | "$GREP" -i -c sse4_2) + else + X86_CRC32=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c sse4_2) fi - COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c pmull2) - if [[ ("$COUNT" -eq "0") ]]; then - FAILED=1 - echo "ERROR: failed to generate pmull2 instruction" | tee -a "$TEST_RESULTS" + if [[ ("$X86_CRC32" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS" + echo + + "$MAKE" clean > /dev/null 2>&1 + rm -f adhoc.cpp > /dev/null 2>&1 + + OBJFILE=crc.o + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32l) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32b) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]];then + echo "Verified crc32l and crc32b machine instructions" | tee -a "$TEST_RESULTS" + fi + fi +fi + +############################################ +# ARM carryless multiply code generation +if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]]; then + + if [[ ("$IS_DARWIN" -ne "0") ]]; then + ARM_PMULL=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c pmull) + else + ARM_PMULL=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c pmull) fi - if [[ ("$FAILED" -eq "0") ]];then - echo "Verified pmull and pmull2 machine instruction generation" | tee -a "$TEST_RESULTS" + if [[ ("$ARM_PMULL" -ne "0" || "$HAVE_ARM_CRYPTO" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: ARM carryless multiply code generation" | tee -a "$TEST_RESULTS" + echo + + "$MAKE" clean > /dev/null 2>&1 + rm -f adhoc.cpp > /dev/null 2>&1 + + OBJFILE=gcm.o + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -v pmull2 | "$GREP" -i -c pmull) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate pmull instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c pmull2) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate pmull2 instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]];then + echo "Verified pmull and pmull2 machine instructions" | tee -a "$TEST_RESULTS" + fi + fi +fi + +############################################ +# ARM CRC32 code generation +if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]]; then + + if [[ ("$IS_DARWIN" -ne "0") ]]; then + ARM_CRC32=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c crc) + else + ARM_CRC32=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c crc32) + fi + + if [[ ("$ARM_CRC32" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: ARM CRC32 code generation" | tee -a "$TEST_RESULTS" + echo + + "$MAKE" clean > /dev/null 2>&1 + rm -f adhoc.cpp > /dev/null 2>&1 + + OBJFILE=crc.o + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32cb) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32cb instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32cw) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32cw instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32b) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS" + fi + + COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32w) + if [[ ("$COUNT" -eq "0") ]]; then + FAILED=1 + echo "ERROR: failed to generate crc32w instruction" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]];then + echo "Verified crc32cb, crc32cw, crc32b and crc32w machine instructions" | tee -a "$TEST_RESULTS" + fi fi fi From ef6be6718b2d549cb4e87e71a00460fd3f4852d0 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 24 Aug 2016 21:15:55 -0400 Subject: [PATCH 4/4] Add disassembly tests for ARM NEON --- cryptest.sh | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/cryptest.sh b/cryptest.sh index f62e9368..b6bef73d 100755 --- a/cryptest.sh +++ b/cryptest.sh @@ -1260,6 +1260,53 @@ if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; t fi fi +############################################ +# ARM NEON code generation +if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]]; then + + if [[ ("$IS_DARWIN" -ne "0") ]]; then + ARM_NEON=$(sysctl machdep.cpu.features 2>/dev/null | "$EGREP" -i -c '(neon|asimd)') + else + ARM_NEON=$(cat /proc/cpuinfo 2>/dev/null | "$EGREP" -i -c '(neon|asimd)') + fi + + if [[ ("$ARM_NEON" -ne "0" || "$HAVE_ARM_NEON" -ne "0") ]]; then + echo + echo "************************************" | tee -a "$TEST_RESULTS" + echo "Testing: ARM NEON code generation" | tee -a "$TEST_RESULTS" + echo + + "$MAKE" clean > /dev/null 2>&1 + rm -f adhoc.cpp > /dev/null 2>&1 + + OBJFILE=blake2.o + CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS" + + COUNT=0 + FAILED=0 + DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null) + + # BLAKE2_NEON_Compress32: 40 each vld1q_u8 and vld1q_u64 + # BLAKE2_NEON_Compress64: 22 each vld1q_u8 and vld1q_u64 + COUNT=$(echo "$DISASS_TEXT" | "$EGREP" -i -c 'ldr.*q') + if [[ ("$COUNT" -lt "62") ]]; then + FAILED=1 + echo "ERROR: failed to generate expected vector load instructions" | tee -a "$TEST_RESULTS" + fi + + # BLAKE2_NEON_Compress{32|64}: 6 each vst1q_u32 and vst1q_u64 + COUNT=$(echo "$DISASS_TEXT" | "$EGREP" -i -c 'str.*q') + if [[ ("$COUNT" -lt "6") ]]; then + FAILED=1 + echo "ERROR: failed to generate expected vector store instructions" | tee -a "$TEST_RESULTS" + fi + + if [[ ("$FAILED" -eq "0") ]];then + echo "Verified vector load and store machine instructions" | tee -a "$TEST_RESULTS" + fi + fi +fi + ############################################ # ARM carryless multiply code generation if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]]; then