diff --git a/config.h b/config.h
index 6fd21bfc..55c0a289 100644
--- a/config.h
+++ b/config.h
@@ -495,10 +495,10 @@ NAMESPACE_END
 // LLVM Clang requires 3.5. Apple Clang does not support it at the moment.
 // Microsoft plans to support ARM-64, but its not clear how to detect it.
 // TODO: Add MSC_VER and ARM-64 platform define when available
-#if !defined(CRYPTOPP_BOOL_ARM_PMULL_INTRINSICS_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
+#if !defined(CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
 # if defined(__ARM_FEATURE_CRYPTO) && !defined(__apple_build_version__)
 #  if defined(__arm64__) || defined(__aarch64__)
-#   define CRYPTOPP_BOOL_ARM_PMULL_INTRINSICS_AVAILABLE 1
+#   define CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE 1
 #  endif
 # endif
 #endif
diff --git a/cpu.cpp b/cpu.cpp
index b7c8f20f..f1a9f6ab 100644
--- a/cpu.cpp
+++ b/cpu.cpp
@@ -440,7 +440,7 @@ static bool TryNEON()
 
 static bool TryPMULL()
 {
-#if (CRYPTOPP_BOOL_ARM_PMULL_INTRINSICS_AVAILABLE)
+#if (CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE)
 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
     volatile bool result = true;
     __try
diff --git a/gcm.cpp b/gcm.cpp
index ea77e3aa..882f10c9 100644
--- a/gcm.cpp
+++ b/gcm.cpp
@@ -28,6 +28,7 @@ NAMESPACE_BEGIN(CryptoPP)
 
+#if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64)
 // Different assemblers accept different mnemonics: 'movd eax, xmm0' vs 'movd rax, xmm0' vs 'mov eax, xmm0' vs 'mov rax, xmm0'
 #if (CRYPTOPP_LLVM_CLANG_VERSION >= 30600) || (CRYPTOPP_APPLE_CLANG_VERSION >= 70000) || defined(CRYPTOPP_CLANG_INTEGRATED_ASSEMBLER)
 // 'movd eax, xmm0' only. REG_WORD() macro not used.
@@ -42,6 +43,86 @@ NAMESPACE_BEGIN(CryptoPP)
 // 'mov eax, xmm0' or 'mov rax, xmm0'. REG_WORD() macro supplies REG32 or REG64.
 # define USE_MOV_REG32_OR_REG64 1
 #endif
+#endif
+
+#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
+#if defined(__GNUC__)
+// Schneiders, Hovsmith and O'Rourke used this trick.
+// It results in much better code generation in production code
+// by avoiding D-register spills when using vgetq_lane_u64. The
+// problem does not surface under minimal test cases.
+inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
+{
+    uint64x2_t r;
+    __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
+        :"=w" (r) : "w" (a), "w" (b) );
+    return r;
+}
+
+inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
+{
+    uint64x2_t r;
+    __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
+        :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
+    return r;
+}
+
+inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
+{
+    uint64x2_t r;
+    __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
+        :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
+    return r;
+}
+
+inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
+{
+    uint64x2_t r;
+    __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
+        :"=w" (r) : "w" (a), "w" (b) );
+    return r;
+}
+
+inline uint64x2_t VEXT_8(uint64x2_t a, uint64x2_t b, unsigned int c)
+{
+    uint64x2_t r;
+    __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
+        :"=w" (r) : "w" (a), "w" (b), "I" (c) );
+    return r;
+}
+#endif // GCC and compatibles
+
+#if defined(_MSC_VER)
+inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
+{
+    return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
+        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
+}
+
+inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
+{
+    return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
+        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
+}
+
+inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
+{
+    return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
+        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
+}
+
+inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
+{
+    return (uint64x2_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
+        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
+}
+
+inline uint64x2_t VEXT_8(uint64x2_t a, uint64x2_t b, unsigned int c)
+{
+    return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
+}
+#endif // Microsoft and compatibles
+#endif // CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
 
 word16 GCM_Base::s_reductionTable[256];
 volatile bool GCM_Base::s_reductionTableInitialized = false;
@@ -180,7 +261,7 @@ inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r
 }
 #endif
 
-#if CRYPTOPP_BOOL_ARM_PMULL_INTRINSICS_AVAILABLE
+#if CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
 
 CRYPTOPP_ALIGN_DATA(16)
 static const word64 s_clmulConstants64[] = {
@@ -195,13 +276,13 @@ static const unsigned int s_clmulTableSizeInBlocks = 8;
 inline uint64x2_t PMULL_Reduce(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r)
 {
     // See comments fo CLMUL_Reduce
-    c1 = veorq_u64(c1, vreinterpretq_u64_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u64(c0), 8)));
-    c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(c0, 0), vgetq_lane_u64(r, 1)));
-    c0 = vreinterpretq_u64_u8(vextq_u8(vreinterpretq_u8_u64(c0), vdupq_n_u8(0), 8));
+    c1 = veorq_u64(c1, VEXT_8(vdupq_n_u64(0), c0, 8));
+    c1 = veorq_u64(c1, PMULL_01(c0, r));
+    c0 = VEXT_8(c0, vdupq_n_u64(0), 8);
     c0 = vshlq_n_u64(veorq_u64(c0, c1), 1);
-    c0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(c0, 0), vgetq_lane_u64(r, 0));
+    c0 = PMULL_00(c0, r);
     c2 = veorq_u64(c2, c0);
-    c2 = veorq_u64(c2, (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(c1), vdupq_n_u8(0), 8));
+    c2 = veorq_u64(c2, VEXT_8(c1, vdupq_n_u64(0), 8));
     c1 = vshrq_n_u64(vcombine_u64(vget_low_u64(c1), vget_low_u64(c2)), 63);
     c2 = vshlq_n_u64(c2, 1);
 
@@ -210,11 +291,9 @@ inline uint64x2_t PMULL_Reduce(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, cons
 
 inline uint64x2_t PMULL_GF_Mul(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r)
 {
-    const uint64x2_t c0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(x, 0), vgetq_lane_u64(h, 0));
-    const uint64x2_t c1 = veorq_u64(
-        (uint64x2_t)vmull_p64(vgetq_lane_u64(x, 1), vgetq_lane_u64(h,0)),
-        (uint64x2_t)vmull_p64(vgetq_lane_u64(x, 0), vgetq_lane_u64(h, 1)));
-    const uint64x2_t c2 = (uint64x2_t)vmull_p64(vgetq_lane_u64(x, 1), vgetq_lane_u64(h, 1));
+    const uint64x2_t c0 = PMULL_00(x, h);
+    const uint64x2_t c1 = veorq_u64(PMULL_10(x, h), PMULL_01(x, h));
+    const uint64x2_t c2 = PMULL_11(x, h);
 
     return PMULL_Reduce(c0, c1, c2, r);
 }
@@ -238,7 +317,7 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
         tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
     }
     else
-#elif CRYPTOPP_BOOL_ARM_PMULL_INTRINSICS_AVAILABLE
+#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
     if (HasPMULL())
     {
         // Avoid "parameter not used" error and suppress Coverity finding
@@ -284,12 +363,13 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
         return;
     }
-#elif CRYPTOPP_BOOL_ARM_PMULL_INTRINSICS_AVAILABLE
+#elif CRYPTOPP_BOOL_ARM_PMULL_AVAILABLE
     if (HasPMULL())
     {
        const uint64x2_t r = s_clmulConstants[0];
        const uint64x2_t t = vld1q_u64((const uint64_t *)hashKey);
-       const uint64x2_t h0 = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(vcombine_u64(vget_high_u64(t), vget_low_u64(t)))));
+       const uint64x2_t h0 = vreinterpretq_u64_u8(vrev64q_u8(
+           vreinterpretq_u8_u64(vcombine_u64(vget_high_u64(t), vget_low_u64(t)))));
 
        uint64x2_t h = h0;
        for (i=0; i
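
Note on the new PMULL_00/PMULL_11 wrappers: the sketch below is not part of the patch. It is a minimal standalone cross-check of the GCC-style inline-asm wrappers against the vmull_p64 intrinsic that the _MSC_VER variants use, assuming an AArch64 toolchain with the crypto extension enabled (for example -march=armv8-a+crypto). The test vectors and main() are illustrative only.

// Sketch: verify the asm PMULL wrappers against vmull_p64 (AArch64 + crypto).
#include <arm_neon.h>
#include <cstdio>
#include <cstring>

// Copies of the GCC-path wrappers added to gcm.cpp by this patch.
static inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
    uint64x2_t r;
    __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
}

static inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
    uint64x2_t r;
    __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
        :"=w" (r) : "w" (a), "w" (b) );
    return r;
}

int main()
{
    // Arbitrary test vectors (illustrative only).
    const uint64_t av[2] = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
    const uint64_t bv[2] = {0x0f1e2d3c4b5a6978ULL, 0x8796a5b4c3d2e1f0ULL};
    const uint64x2_t a = vld1q_u64(av), b = vld1q_u64(bv);

    // Reference values straight from the intrinsic on the selected lanes,
    // mirroring the _MSC_VER implementations in the patch.
    const poly128_t lo = vmull_p64((poly64_t)vgetq_lane_u64(a, 0),
                                   (poly64_t)vgetq_lane_u64(b, 0));
    const poly128_t hi = vmull_p64((poly64_t)vgetq_lane_u64(a, 1),
                                   (poly64_t)vgetq_lane_u64(b, 1));

    const uint64x2_t r0 = PMULL_00(a, b);
    const uint64x2_t r1 = PMULL_11(a, b);

    // Both sides are 128-bit payloads, so a byte-wise compare suffices
    // on a little-endian AArch64 target.
    const bool ok = std::memcmp(&lo, &r0, 16) == 0 && std::memcmp(&hi, &r1, 16) == 0;
    std::printf("asm PMULL wrappers %s vmull_p64\n", ok ? "match" : "DO NOT match");
    return ok ? 0 : 1;
}

Per the comment added in the patch, the asm form is preferred on GCC because it avoids D-register spills around vgetq_lane_u64 in production builds.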
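For readers less familiar with the lane numbering, the following is a portable reference model (no NEON required) of the schoolbook decomposition that PMULL_GF_Mul assembles from the four wrappers before PMULL_Reduce folds the 256-bit product back modulo the GCM polynomial. clmul64() and the sample inputs are hypothetical helpers for illustration, not part of the patch.

// Sketch: plain-C model of the c0/c1/c2 split used by PMULL_GF_Mul, where
//   x*h = c2*2^128 ^ c1*2^64 ^ c0  (carry-less), with
//   c0 = x_lo*h_lo, c1 = (x_hi*h_lo) ^ (x_lo*h_hi), c2 = x_hi*h_hi.
#include <cstdint>
#include <cstdio>

struct U128 { uint64_t lo, hi; };

// Bit-by-bit carry-less multiply: 64x64 -> 128 bits (reference only, slow).
static U128 clmul64(uint64_t a, uint64_t b)
{
    U128 r = {0, 0};
    for (int i = 0; i < 64; ++i)
        if ((b >> i) & 1)
        {
            r.lo ^= a << i;
            if (i) r.hi ^= a >> (64 - i);
        }
    return r;
}

int main()
{
    // Arbitrary 128-bit operands split into 64-bit halves (illustrative only).
    const uint64_t x_lo = 0x0123456789abcdefULL, x_hi = 0xfedcba9876543210ULL;
    const uint64_t h_lo = 0x0f1e2d3c4b5a6978ULL, h_hi = 0x8796a5b4c3d2e1f0ULL;

    const U128 c0  = clmul64(x_lo, h_lo);                  // PMULL_00(x, h)
    const U128 p10 = clmul64(x_hi, h_lo);                  // PMULL_10(x, h)
    const U128 p01 = clmul64(x_lo, h_hi);                  // PMULL_01(x, h)
    const U128 c1  = { p10.lo ^ p01.lo, p10.hi ^ p01.hi }; // veorq_u64(PMULL_10, PMULL_01)
    const U128 c2  = clmul64(x_hi, h_hi);                  // PMULL_11(x, h)

    // PMULL_Reduce then folds c2:c1:c0 back to a 128-bit value.
    std::printf("c0 = %016llx%016llx\n", (unsigned long long)c0.hi, (unsigned long long)c0.lo);
    std::printf("c1 = %016llx%016llx\n", (unsigned long long)c1.hi, (unsigned long long)c1.lo);
    std::printf("c2 = %016llx%016llx\n", (unsigned long long)c2.hi, (unsigned long long)c2.lo);
    return 0;
}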