diff --git a/GNUmakefile b/GNUmakefile index 9e32ce53..2e2d456e 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -204,7 +204,7 @@ ARIA_FLAG = $(SSSE3_FLAG) ifeq ($(findstring -DCRYPTOPP_DISABLE_SSE4,$(CXXFLAGS)),) SSE42_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -msse4.2 -dM -E - | grep -i -c -q __SSE4_2__ && echo "-msse4.2") ifeq ($(findstring -DCRYPTOPP_DISABLE_AESNI,$(CXXFLAGS)),) -GCM_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -mpclmul -dM -E - | grep -i -c -q __PCLMUL__ && echo "-mpclmul") +GCM_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -mssse3 -mpclmul -dM -E - | grep -i -c -q __PCLMUL__ && echo "-mssse3 -mpclmul") AES_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -maes -dM -E - | grep -i -c -q __AES__ && echo "-maes") ifeq ($(findstring -DCRYPTOPP_DISABLE_SHA,$(CXXFLAGS)),) SHA_FLAG = $(shell echo | $(CXX) $(CXXFLAGS) -msse4.2 -msha -dM -E - | grep -i -c -q __SHA__ && echo "-msse4.2 -msha") diff --git a/aria-simd.cpp b/aria-simd.cpp index 86a62914..a1f1f1c3 100644 --- a/aria-simd.cpp +++ b/aria-simd.cpp @@ -10,7 +10,7 @@ #include "config.h" #include "misc.h" -#if (CRYPTOPP_ARM_NEON_AVAILABLE) && defined(__GNUC__) +#if (CRYPTOPP_ARM_NEON_AVAILABLE) # include "arm_neon.h" #endif diff --git a/aria.cpp b/aria.cpp index 50ac5d8e..1461023f 100644 --- a/aria.cpp +++ b/aria.cpp @@ -267,7 +267,6 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b #ifdef IS_LITTLE_ENDIAN # if CRYPTOPP_ENABLE_ARIA_SSSE3_INTRINSICS - const __m128i MASK = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3); if (HasSSSE3()) { ARIA_ProcessAndXorBlock_Xor_SSSE3(xorBlock, outBlock, rk, t); diff --git a/ariatab.cpp b/ariatab.cpp index 1e0372ea..567ba72b 100644 --- a/ariatab.cpp +++ b/ariatab.cpp @@ -1,8 +1,7 @@ -// kalynatab.cpp - written and placed in the public domain by Jeffrey Walton +// ariatab.cpp - written and placed in the public domain by Jeffrey Walton #include "pch.h" #include "config.h" -#include "kalyna.h" NAMESPACE_BEGIN(CryptoPP) NAMESPACE_BEGIN(ARIATab) diff --git a/bench1.cpp b/bench1.cpp index 5eb16c04..37c16806 100644 --- a/bench1.cpp +++ b/bench1.cpp @@ -506,11 +506,11 @@ void Benchmark2(double t, double hertz) std::cout << "\n"; { -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasCLMUL()) BenchMarkByName2("AES/GCM", 0, "GMAC(AES)"); else -#elif CRYPTOPP_ARMV_PMULL_AVAILABLE +#elif CRYPTOPP_ARM_PMULL_AVAILABLE if (HasPMULL()) BenchMarkByName2("AES/GCM", 0, "GMAC(AES)"); else @@ -594,11 +594,11 @@ void Benchmark2(double t, double hertz) std::cout << "\n"; { -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasCLMUL()) BenchMarkByName2("AES/GCM", 0, "AES/GCM"); else -#elif CRYPTOPP_ARMV_PMULL_AVAILABLE +#elif CRYPTOPP_ARM_PMULL_AVAILABLE if (HasPMULL()) BenchMarkByName2("AES/GCM", 0, "AES/GCM"); else diff --git a/config.h b/config.h index 862c7589..59130864 100644 --- a/config.h +++ b/config.h @@ -507,13 +507,25 @@ NAMESPACE_END #define CRYPTOPP_SSE42_AVAILABLE 1 #endif -// Don't disgorge AES-NI from CLMUL. 
There will be two to four subtle breaks -#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_AESNI) && (_MSC_FULL_VER >= 150030729 || __INTEL_COMPILER >= 1110 || (defined(__AES__) && defined(__PCLMUL__))) - #define CRYPTOPPL_AESNI_AES_AVAILABLE 1 +#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_CLMUL) && \ + (defined(__PCLMUL__) || (_MSC_FULL_VER >= 150030729) || \ + (CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1110) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30200) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40300)) + #define CRYPTOPP_CLMUL_AVAILABLE 1 #endif +#if !defined(CRYPTOPP_DISABLE_SSE4) && defined(CRYPTOPP_SSSE3_AVAILABLE) && \ + (defined(__AES__) || (_MSC_FULL_VER >= 150030729) || \ + (CRYPTOPP_GCC_VERSION >= 40300) || (__INTEL_COMPILER >= 1110) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30200) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40300)) + #define CRYPTOPP_AESNI_AVAILABLE 1 +#endif + +// TODO: +#undef CRYPTOPP_AESNI_AVAILABLE + #if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHA) && ((_MSC_VER >= 1900) || defined(__SHA__)) - #define CRYPTOPP_SHANI_SHA_AVAILABLE 1 + #define CRYPTOPP_SHANI_AVAILABLE 1 #endif #endif // X86, X32, X64 @@ -534,10 +546,10 @@ NAMESPACE_END // LLVM Clang requires 3.5. Apple Clang is unknown at the moment. // Microsoft plans to support ARM-64, but its not clear how to detect it. // TODO: Add MSC_VER and ARM-64 platform define when available -#if !defined(CRYPTOPP_ARMV_CRC32_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) +#if !defined(CRYPTOPP_ARM_CRC32_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) # if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VER >= 2000) || \ (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500) -# define CRYPTOPP_ARMV_CRC32_AVAILABLE 1 +# define CRYPTOPP_ARM_CRC32_AVAILABLE 1 # endif #endif @@ -545,10 +557,10 @@ NAMESPACE_END // LLVM Clang requires 3.5. Apple Clang is unknown at the moment. // Microsoft plans to support ARM-64, but its not clear how to detect it. // TODO: Add MSC_VER and ARM-64 platform define when available -#if !defined(CRYPTOPP_ARMV_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__) +#if !defined(CRYPTOPP_ARM_PMULL_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) && !defined(__apple_build_version__) # if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VER >= 2000) || \ (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500) -# define CRYPTOPP_ARMV_PMULL_AVAILABLE 1 +# define CRYPTOPP_ARM_PMULL_AVAILABLE 1 # endif #endif @@ -559,15 +571,15 @@ NAMESPACE_END #if !defined(CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM) # if defined(__ARM_FEATURE_CRYPTO) || (CRYPTOPP_MSC_VER >= 2000) || \ (CRYPTOPP_GCC_VERSION >= 40800) || (CRYPTOPP_LLVM_CLANG_VERSION >= 30500) -# define CRYPTOPP_ARMV_AES_AVAILABLE 1 -# define CRYPTOPP_ARMV_PMULL_AVAILABLE 1 +# define CRYPTOPP_ARM_AES_AVAILABLE 1 +# define CRYPTOPP_ARM_PMULL_AVAILABLE 1 # define CRYPTOPP_ARMV8A_SHA_AVAILABLE 1 # define CRYPTOPP_ARMV8A_CRYPTO_AVAILABLE 1 # endif #endif // TODO... 
-#undef CRYPTOPP_ARMV_AES_AVAILABLE +#undef CRYPTOPP_ARM_AES_AVAILABLE #endif // ARM32, ARM64 diff --git a/cpu.cpp b/cpu.cpp index 570c4917..e07a9428 100644 --- a/cpu.cpp +++ b/cpu.cpp @@ -354,7 +354,7 @@ extern "C" static bool TryAES() { -#if (CRYPTOPP_ARMV_AES_AVAILABLE) +#if (CRYPTOPP_ARM_AES_AVAILABLE) # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) volatile bool result = true; __try diff --git a/cpu.h b/cpu.h index 3c9d942f..79f0db12 100644 --- a/cpu.h +++ b/cpu.h @@ -66,10 +66,10 @@ # include // _mm_blend_epi16 # include // _mm_crc32_u{8|16|32} #endif // smmintrin.h -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE # include // aesenc, aesdec, etc #endif // wmmintrin.h -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE # include // RDRAND, RDSEED, AVX, SHA #endif // immintrin.h #endif // X86/X64/X32 Headers diff --git a/crc-simd.cpp b/crc-simd.cpp index 3c41c94f..3ee20532 100644 --- a/crc-simd.cpp +++ b/crc-simd.cpp @@ -14,7 +14,7 @@ # include "nmmintrin.h" #endif -#if (CRYPTOPP_ARMV_CRC32_AVAILABLE) +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) # include "arm_neon.h" #if defined(__GNUC__) # include "arm_acle.h" @@ -40,7 +40,7 @@ extern "C" { }; #endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY -#if (CRYPTOPP_ARMV_CRC32_AVAILABLE) +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) bool CPU_TryCRC32_ARMV8() { # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) @@ -96,9 +96,9 @@ bool CPU_TryCRC32_ARMV8() return result; # endif } -#endif // CRYPTOPP_ARMV_CRC32_AVAILABLE +#endif // CRYPTOPP_ARM_CRC32_AVAILABLE -#if (CRYPTOPP_ARMV_CRC32_AVAILABLE) +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) void CRC32_Update_ARMV8(const byte *s, size_t n, word32& c) { for(; !IsAligned(s) && n > 0; s++, n--) diff --git a/crc.cpp b/crc.cpp index 0859f279..6046bf83 100644 --- a/crc.cpp +++ b/crc.cpp @@ -8,7 +8,7 @@ NAMESPACE_BEGIN(CryptoPP) // crc-simd.cpp -#if (CRYPTOPP_ARMV_CRC32_AVAILABLE) +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) extern void CRC32_Update_ARMV8(const byte *s, size_t n, word32& c); extern void CRC32C_Update_ARMV8(const byte *s, size_t n, word32& c); #endif @@ -136,7 +136,7 @@ CRC32::CRC32() void CRC32::Update(const byte *s, size_t n) { -#if (CRYPTOPP_ARMV_CRC32_AVAILABLE) +#if (CRYPTOPP_ARM_CRC32_AVAILABLE) if (HasCRC32()) { CRC32_Update_ARMV8(s, n, m_crc); @@ -302,7 +302,7 @@ void CRC32C::Update(const byte *s, size_t n) CRC32C_Update_SSE42(s, n, m_crc); return; } -#elif (CRYPTOPP_ARMV_CRC32_AVAILABLE) +#elif (CRYPTOPP_ARM_CRC32_AVAILABLE) if (HasCRC32()) { CRC32C_Update_ARMV8(s, n, m_crc); diff --git a/gcm-simd.cpp b/gcm-simd.cpp index 38de5e27..6197de6f 100644 --- a/gcm-simd.cpp +++ b/gcm-simd.cpp @@ -10,13 +10,14 @@ #include "config.h" #include "misc.h" -#if (CRYPTOPP_AESNI_AVAILABLE) +#if (CRYPTOPP_CLMUL_AVAILABLE) +# include "tmmintrin.h" # include "wmmintrin.h" #endif #if (CRYPTOPP_ARM_NEON_AVAILABLE) # include "arm_neon.h" -#if (CRYPTOPP_ARMV_PMULL_AVAILABLE) +#if (CRYPTOPP_ARM_PMULL_AVAILABLE) # include "arm_acle.h" #endif #endif @@ -29,7 +30,7 @@ ANONYMOUS_NAMESPACE_BEGIN // GCC 4.8 and 4.9 are missing PMULL gear -#if (CRYPTOPP_ARMV_PMULL_AVAILABLE) +#if (CRYPTOPP_ARM_PMULL_AVAILABLE) # if (CRYPTOPP_GCC_VERSION >= 40800) && (CRYPTOPP_GCC_VERSION < 50000) inline poly128_t VMULL_P64(poly64_t a, poly64_t b) { @@ -43,7 +44,7 @@ inline poly128_t VMULL_HIGH_P64(poly64x2_t a, poly64x2_t b) # endif #endif -#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_ARMV_PMULL_AVAILABLE +#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARM64) && CRYPTOPP_ARM_PMULL_AVAILABLE #if defined(__GNUC__) // Schneiders, 
Hovsmith and O'Rourke used this trick. // It results in much better code generation in production code @@ -137,7 +138,7 @@ inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b) return (uint64x2_t)vextq_u8(vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C); } #endif // Microsoft and compatibles -#endif // CRYPTOPP_ARMV_PMULL_AVAILABLE +#endif // CRYPTOPP_ARM_PMULL_AVAILABLE ANONYMOUS_NAMESPACE_END @@ -147,78 +148,78 @@ NAMESPACE_BEGIN(CryptoPP) extern "C" { typedef void (*SigHandler)(int); - static jmp_buf s_jmpSIGILL; - static void SigIllHandler(int) - { - longjmp(s_jmpSIGILL, 1); - } + static jmp_buf s_jmpSIGILL; + static void SigIllHandler(int) + { + longjmp(s_jmpSIGILL, 1); + } }; #endif // Not CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY -#if (CRYPTOPP_ARMV_PMULL_AVAILABLE) +#if (CRYPTOPP_ARM_PMULL_AVAILABLE) bool CPU_TryPMULL_ARMV8() { # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) - volatile bool result = true; - __try - { - const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; - const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, - b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + volatile bool result = true; + __try + { + const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; + const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; - const poly128_t r1 = vmull_p64(a1, b1); - const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2)); + const poly128_t r1 = vmull_p64(a1, b1); + const poly128_t r2 = vmull_high_p64((poly64x2_t)(a2), (poly64x2_t)(b2)); - // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. - const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} - const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} + // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. + const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} + const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} - result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && - vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); - } - __except (EXCEPTION_EXECUTE_HANDLER) - { - return false; - } - return result; + result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && + vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + return false; + } + return result; # else - // longjmp and clobber warnings. Volatile is required. - // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 - volatile bool result = true; + // longjmp and clobber warnings. Volatile is required. 
+ // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 + volatile bool result = true; - volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); - if (oldHandler == SIG_ERR) - return false; + volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); + if (oldHandler == SIG_ERR) + return false; - volatile sigset_t oldMask; - if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) - return false; + volatile sigset_t oldMask; + if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) + return false; - if (setjmp(s_jmpSIGILL)) - result = false; - else - { - const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; - const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, - b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; + if (setjmp(s_jmpSIGILL)) + result = false; + else + { + const poly64_t a1={0x9090909090909090}, b1={0xb0b0b0b0b0b0b0b0}; + const poly8x16_t a2={0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0,0xa0}, + b2={0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xc0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0,0xe0}; - const poly128_t r1 = VMULL_P64(a1, b1); - const poly128_t r2 = VMULL_HIGH_P64((poly64x2_t)(a2), (poly64x2_t)(b2)); + const poly128_t r1 = VMULL_P64(a1, b1); + const poly128_t r2 = VMULL_HIGH_P64((poly64x2_t)(a2), (poly64x2_t)(b2)); - // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. - const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} - const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} + // Linaro is missing vreinterpretq_u64_p128. Also see http://github.com/weidai11/cryptopp/issues/233. + const uint64x2_t& t1 = (uint64x2_t)(r1); // {bignum,bignum} + const uint64x2_t& t2 = (uint64x2_t)(r2); // {bignum,bignum} - result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && - vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); - } + result = !!(vgetq_lane_u64(t1,0) == 0x5300530053005300 && vgetq_lane_u64(t1,1) == 0x5300530053005300 && + vgetq_lane_u64(t2,0) == 0x6c006c006c006c00 && vgetq_lane_u64(t2,1) == 0x6c006c006c006c00); + } - sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); - signal(SIGILL, oldHandler); - return result; + sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); + signal(SIGILL, oldHandler); + return result; # endif } -#endif // CRYPTOPP_ARMV_PMULL_AVAILABLE +#endif // CRYPTOPP_ARM_PMULL_AVAILABLE #if CRYPTOPP_ARM_NEON_AVAILABLE void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c) @@ -230,7 +231,7 @@ void GCM_Xor16_NEON(byte *a, const byte *b, const byte *c) } #endif -#if CRYPTOPP_ARMV_PMULL_AVAILABLE +#if CRYPTOPP_ARM_PMULL_AVAILABLE ANONYMOUS_NAMESPACE_BEGIN @@ -246,7 +247,7 @@ const unsigned int s_clmulTableSizeInBlocks = 8; ANONYMOUS_NAMESPACE_END -uint64x2_t GCM_Reduce_ARMV8A(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r) +uint64x2_t GCM_Reduce_PMULL(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r) { c1 = veorq_u64(c1, VEXT_U8<8>(vdupq_n_u64(0), c0)); c1 = veorq_u64(c1, PMULL_01(c0, r)); @@ -261,85 +262,231 @@ uint64x2_t GCM_Reduce_ARMV8A(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const return veorq_u64(c2, c1); } -uint64x2_t GCM_Multiply_ARMV8A(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r) +uint64x2_t GCM_Multiply_PMULL(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r) { const uint64x2_t 
c0 = PMULL_00(x, h); const uint64x2_t c1 = veorq_u64(PMULL_10(x, h), PMULL_01(x, h)); const uint64x2_t c2 = PMULL_11(x, h); - return GCM_Reduce_ARMV8A(c0, c1, c2, r); + return GCM_Reduce_PMULL(c0, c1, c2, r); } -size_t GCM_AuthenticateBlocks_ARMV8(const byte *data, size_t len, const byte *mtable, byte *hbuffer) +size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer) { - const uint64x2_t* table = reinterpret_cast(mtable); - uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(hbuffer)); - const uint64x2_t r = s_clmulConstants[0]; + const uint64x2_t* table = reinterpret_cast(mtable); + uint64x2_t x = vreinterpretq_u64_u8(vld1q_u8(hbuffer)); + const uint64x2_t r = s_clmulConstants[0]; - const size_t BLOCKSIZE = 16; - while (len >= BLOCKSIZE) - { - size_t s = UnsignedMin(len/BLOCKSIZE, s_clmulTableSizeInBlocks), i=0; - uint64x2_t d1, d2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-1)*BLOCKSIZE))); - uint64x2_t c0 = vdupq_n_u64(0); - uint64x2_t c1 = vdupq_n_u64(0); - uint64x2_t c2 = vdupq_n_u64(0); + const size_t BLOCKSIZE = 16; + while (len >= BLOCKSIZE) + { + size_t s = UnsignedMin(len/BLOCKSIZE, s_clmulTableSizeInBlocks), i=0; + uint64x2_t d1, d2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-1)*BLOCKSIZE))); + uint64x2_t c0 = vdupq_n_u64(0); + uint64x2_t c1 = vdupq_n_u64(0); + uint64x2_t c2 = vdupq_n_u64(0); - while (true) - { - const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i)); - const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1)); - const uint64x2_t h2 = veorq_u64(h0, h1); + while (true) + { + const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i)); + const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1)); + const uint64x2_t h2 = veorq_u64(h0, h1); - if (++i == s) - { - const uint64x2_t t1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data))); - d1 = veorq_u64(vextq_u64(t1, t1, 1), x); - c0 = veorq_u64(c0, PMULL_00(d1, h0)); - c2 = veorq_u64(c2, PMULL_10(d1, h1)); - d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)), - vget_low_u32(vreinterpretq_u32_u64(d1)))); - c1 = veorq_u64(c1, PMULL_00(d1, h2)); + if (++i == s) + { + const uint64x2_t t1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data))); + d1 = veorq_u64(vextq_u64(t1, t1, 1), x); + c0 = veorq_u64(c0, PMULL_00(d1, h0)); + c2 = veorq_u64(c2, PMULL_10(d1, h1)); + d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)), + vget_low_u32(vreinterpretq_u32_u64(d1)))); + c1 = veorq_u64(c1, PMULL_00(d1, h2)); - break; - } + break; + } - d1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8))); - c0 = veorq_u64(c0, PMULL_10(d2, h0)); - c2 = veorq_u64(c2, PMULL_10(d1, h1)); - d2 = veorq_u64(d2, d1); - c1 = veorq_u64(c1, PMULL_10(d2, h2)); + d1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8))); + c0 = veorq_u64(c0, PMULL_10(d2, h0)); + c2 = veorq_u64(c2, PMULL_10(d1, h1)); + d2 = veorq_u64(d2, d1); + c1 = veorq_u64(c1, PMULL_10(d2, h2)); - if (++i == s) - { - const uint64x2_t t2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data))); - d1 = veorq_u64(vextq_u64(t2, t2, 1), x); - c0 = veorq_u64(c0, PMULL_01(d1, h0)); - c2 = veorq_u64(c2, PMULL_11(d1, h1)); - d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)), - vget_low_u32(vreinterpretq_u32_u64(d1)))); - c1 = veorq_u64(c1, PMULL_01(d1, h2)); + if (++i == s) + { + const uint64x2_t t2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data))); + d1 = veorq_u64(vextq_u64(t2, t2, 1), x); + c0 = veorq_u64(c0, PMULL_01(d1, 
h0)); + c2 = veorq_u64(c2, PMULL_11(d1, h1)); + d1 = veorq_u64(d1, (uint64x2_t)vcombine_u32(vget_high_u32(vreinterpretq_u32_u64(d1)), + vget_low_u32(vreinterpretq_u32_u64(d1)))); + c1 = veorq_u64(c1, PMULL_01(d1, h2)); - break; - } + break; + } - const uint64x2_t t3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8))); - d2 = vextq_u64(t3, t3, 1); - c0 = veorq_u64(c0, PMULL_01(d1, h0)); - c2 = veorq_u64(c2, PMULL_01(d2, h1)); - d1 = veorq_u64(d1, d2); - c1 = veorq_u64(c1, PMULL_01(d1, h2)); - } - data += s*16; - len -= s*16; + const uint64x2_t t3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(data+(s-i)*16-8))); + d2 = vextq_u64(t3, t3, 1); + c0 = veorq_u64(c0, PMULL_01(d1, h0)); + c2 = veorq_u64(c2, PMULL_01(d2, h1)); + d1 = veorq_u64(d1, d2); + c1 = veorq_u64(c1, PMULL_01(d1, h2)); + } + data += s*16; + len -= s*16; - c1 = veorq_u64(veorq_u64(c1, c0), c2); - x = GCM_Reduce_ARMV8A(c0, c1, c2, r); - } + c1 = veorq_u64(veorq_u64(c1, c0), c2); + x = GCM_Reduce_PMULL(c0, c1, c2, r); + } - vst1q_u64(reinterpret_cast(hbuffer), x); - return len; + vst1q_u64(reinterpret_cast(hbuffer), x); + return len; } +#endif // CRYPTOPP_ARM_PMULL_AVAILABLE + +#if CRYPTOPP_CLMUL_AVAILABLE + +ANONYMOUS_NAMESPACE_BEGIN + +CRYPTOPP_ALIGN_DATA(16) +const word64 s_clmulConstants64[] = { + W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), + W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), + W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; + +const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64; +const unsigned int s_cltableSizeInBlocks = 8; + +ANONYMOUS_NAMESPACE_END + +__m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r) +{ + /* + The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most + significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the + rightmost bit positions, and the lowest byte addresses. 
+ + c1 ^= c0t * 0xc200000000000000 + c2t ^= c0t + t = shift (c1t ^ c0b) left 1 bit + c2 ^= t * 0xe100000000000000 + c2t ^= c1b + shift c2 left 1 bit and xor in lowest bit of c1t + */ +#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301 + c2 = _mm_xor_si128(c2, _mm_move_epi64(c0)); +#else + c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8)); +#endif + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10)); + c0 = _mm_srli_si128(c0, 8); + c0 = _mm_xor_si128(c0, c1); + c0 = _mm_slli_epi64(c0, 1); + c0 = _mm_clmulepi64_si128(c0, r, 0); + c2 = _mm_xor_si128(c2, c0); + c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8)); + c1 = _mm_unpacklo_epi64(c1, c2); + c1 = _mm_srli_epi64(c1, 63); + c2 = _mm_slli_epi64(c2, 1); + return _mm_xor_si128(c2, c1); +} + +__m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r) +{ + const __m128i c0 = _mm_clmulepi64_si128(x,h,0); + const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10)); + const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11); + + return GCM_Reduce_CLMUL(c0, c1, c2, r); +} + +void GCM_SetKeyWithoutResync_CLMUL(byte *mulTable, byte *hashKey, unsigned int tableSize) +{ + const __m128i r = s_clmulConstants[0]; + __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)(void *)hashKey), s_clmulConstants[1]); + __m128i h = h0; + + for (unsigned int i=0; i= 16) + { + size_t s = UnsignedMin(len/16, s_cltableSizeInBlocks), i=0; + __m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2); + __m128i c0 = _mm_setzero_si128(); + __m128i c1 = _mm_setzero_si128(); + __m128i c2 = _mm_setzero_si128(); + + while (true) + { + __m128i h0 = _mm_load_si128(table+i); + __m128i h1 = _mm_load_si128(table+i+1); + __m128i h2 = _mm_xor_si128(h0, h1); + + if (++i == s) + { + d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1); + d1 = _mm_xor_si128(d1, x); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); + d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2))); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0)); + break; + } + + d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); + d2 = _mm_xor_si128(d2, d1); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1)); + + if (++i == s) + { + d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1); + d1 = _mm_xor_si128(d1, x); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11)); + d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2))); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10)); + break; + } + + d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1); + c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); + c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10)); + d1 = _mm_xor_si128(d1, d2); + c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10)); + } + data += s*16; + len -= s*16; + + c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2); + x = GCM_Reduce_CLMUL(c0, c1, c2, r); + } + + _mm_store_si128((__m128i *)(void *)hbuffer, x); + return len; +} + #endif NAMESPACE_END \ No newline at end of file diff --git a/gcm.cpp 
b/gcm.cpp index 323f5dd7..2fe90b8e 100644 --- a/gcm.cpp +++ b/gcm.cpp @@ -24,7 +24,7 @@ // SunCC 5.13 and below crash with AES-NI/CLMUL and C++{03|11}. Disable one or the other. // Also see http://github.com/weidai11/cryptopp/issues/226 #if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x513) -# undef CRYPTOPPL_AESNI_AES_AVAILABLE +# undef CRYPTOPP_CLMUL_AVAILABLE #endif #include "gcm.h" @@ -128,72 +128,39 @@ inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c) } #endif -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_CLMUL_AVAILABLE + +extern __m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r); +extern __m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r); +extern void GCM_SetKeyWithoutResync_CLMUL(byte *mulTable, byte *hashKey, unsigned int tableSize); +extern void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer); +extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer); + CRYPTOPP_ALIGN_DATA(16) -static const word64 s_clmulConstants64[] = { +const word64 s_clmulConstants64[] = { W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)}; -static const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64; -static const unsigned int s_clmulTableSizeInBlocks = 8; +const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64; +const unsigned int s_cltableSizeInBlocks = 8; -inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r) -{ - /* - The polynomial to be reduced is c0 * x^128 + c1 * x^64 + c2. c0t below refers to the most - significant half of c0 as a polynomial, which, due to GCM's bit reflection, are in the - rightmost bit positions, and the lowest byte addresses. 
- - c1 ^= c0t * 0xc200000000000000 - c2t ^= c0t - t = shift (c1t ^ c0b) left 1 bit - c2 ^= t * 0xe100000000000000 - c2t ^= c1b - shift c2 left 1 bit and xor in lowest bit of c1t - */ -#if 0 // MSVC 2010 workaround: see http://connect.microsoft.com/VisualStudio/feedback/details/575301 - c2 = _mm_xor_si128(c2, _mm_move_epi64(c0)); -#else - c1 = _mm_xor_si128(c1, _mm_slli_si128(c0, 8)); -#endif - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(c0, r, 0x10)); - c0 = _mm_srli_si128(c0, 8); - c0 = _mm_xor_si128(c0, c1); - c0 = _mm_slli_epi64(c0, 1); - c0 = _mm_clmulepi64_si128(c0, r, 0); - c2 = _mm_xor_si128(c2, c0); - c2 = _mm_xor_si128(c2, _mm_srli_si128(c1, 8)); - c1 = _mm_unpacklo_epi64(c1, c2); - c1 = _mm_srli_epi64(c1, 63); - c2 = _mm_slli_epi64(c2, 1); - return _mm_xor_si128(c2, c1); -} - -inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r) -{ - const __m128i c0 = _mm_clmulepi64_si128(x,h,0); - const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10)); - const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11); - - return CLMUL_Reduce(c0, c1, c2, r); -} #endif -#if CRYPTOPP_ARMV_PMULL_AVAILABLE +#if CRYPTOPP_ARM_PMULL_AVAILABLE -extern size_t GCM_AuthenticateBlocks_ARMV8(const byte *data, size_t len, const byte *mtable, byte *hbuffer); -extern uint64x2_t GCM_Multiply_ARMV8A(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r); +extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer); +extern uint64x2_t GCM_Multiply_PMULL(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r); CRYPTOPP_ALIGN_DATA(16) -static const word64 s_clmulConstants64[] = { +const word64 s_clmulConstants64[] = { W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8 W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8 }; -static const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64; -static const unsigned int s_clmulTableSizeInBlocks = 8; +const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64; +const unsigned int s_cltableSizeInBlocks = 8; #endif void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms) @@ -206,20 +173,20 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const int tableSize, i, j, k; -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_CLMUL_AVAILABLE if (HasCLMUL()) { // Avoid "parameter not used" error and suppress Coverity finding (void)params.GetIntValue(Name::TableSize(), tableSize); - tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE; + tableSize = s_cltableSizeInBlocks * REQUIRED_BLOCKSIZE; } else -#elif CRYPTOPP_ARMV_PMULL_AVAILABLE +#elif CRYPTOPP_ARM_PMULL_AVAILABLE if (HasPMULL()) { // Avoid "parameter not used" error and suppress Coverity finding (void)params.GetIntValue(Name::TableSize(), tableSize); - tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE; + tableSize = s_cltableSizeInBlocks * REQUIRED_BLOCKSIZE; } else #endif @@ -236,31 +203,18 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const } m_buffer.resize(3*REQUIRED_BLOCKSIZE + tableSize); - byte *table = MulTable(); + byte *mulTable = MulTable(); byte *hashKey = HashKey(); memset(hashKey, 0, REQUIRED_BLOCKSIZE); blockCipher.ProcessBlock(hashKey); -#if 
CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_CLMUL_AVAILABLE if (HasCLMUL()) { - const __m128i r = s_clmulConstants[0]; - __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)(void *)hashKey), s_clmulConstants[1]); - __m128i h = h0; - - for (i=0; i>1) | (V0<<63); @@ -306,23 +260,23 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const for (i=0; i<16; i++) { - memset(table+i*256*16, 0, 16); + memset(mulTable+i*256*16, 0, 16); #if CRYPTOPP_SSE2_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE if (HasSSE2()) for (j=2; j<=0x80; j*=2) for (k=1; k>1) | (V0<<63); @@ -357,15 +311,15 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const for (i=0; i<4; i++) { - memset(table+i*256, 0, 16); - memset(table+1024+i*256, 0, 16); + memset(mulTable+i*256, 0, 16); + memset(mulTable+1024+i*256, 0, 16); #if CRYPTOPP_SSE2_AVAILABLE || CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE if (HasSSE2()) for (j=2; j<=8; j*=2) for (k=1; k= 16) - { - size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0; - __m128i d1, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2); - __m128i c0 = _mm_setzero_si128(); - __m128i c1 = _mm_setzero_si128(); - __m128i c2 = _mm_setzero_si128(); - - while (true) - { - __m128i h0 = _mm_load_si128(table+i); - __m128i h1 = _mm_load_si128(table+i+1); - __m128i h2 = _mm_xor_si128(h0, h1); - - if (++i == s) - { - d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1); - d1 = _mm_xor_si128(d1, x); - c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0)); - c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); - d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2))); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0)); - break; - } - - d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2); - c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1)); - c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 1)); - d2 = _mm_xor_si128(d2, d1); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1)); - - if (++i == s) - { - d1 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1); - d1 = _mm_xor_si128(d1, x); - c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); - c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d1, h1, 0x11)); - d1 = _mm_xor_si128(d1, _mm_shuffle_epi32(d1, _MM_SHUFFLE(1, 0, 3, 2))); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10)); - break; - } - - d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1); - c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d1, h0, 0x10)); - c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10)); - d1 = _mm_xor_si128(d1, d2); - c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d1, h2, 0x10)); - } - data += s*16; - len -= s*16; - - c1 = _mm_xor_si128(_mm_xor_si128(c1, c0), c2); - x = CLMUL_Reduce(c0, c1, c2, r); - } - - _mm_store_si128((__m128i *)(void *)HashBuffer(), x); - return len; + return GCM_AuthenticateBlocks_CLMUL(data, len, MulTable(), HashBuffer()); } -#elif CRYPTOPP_ARMV_PMULL_AVAILABLE +#elif CRYPTOPP_ARM_PMULL_AVAILABLE if (HasPMULL()) { - return GCM_AuthenticateBlocks_ARMV8(data, len, MulTable(), HashBuffer()); - } + return GCM_AuthenticateBlocks_PMULL(data, len, MulTable(), HashBuffer()); + } #endif typedef BlockGetAndPut Block; diff --git a/rijndael.cpp b/rijndael.cpp index 98de5fc0..b3045b30 100644 --- a/rijndael.cpp +++ b/rijndael.cpp @@ -223,7 +223,7 @@ void 
Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c word32 *rk = m_key; -#if (CRYPTOPPL_AESNI_AES_AVAILABLE && CRYPTOPP_SSE42_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)) +#if (CRYPTOPP_AESNI_AVAILABLE && CRYPTOPP_SSE42_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)) // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64 if (HasAESNI() && HasSSE4()) { @@ -379,7 +379,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp; } -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16); #endif @@ -387,7 +387,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, c void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_AESNI_AVAILABLE #if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_RIJNDAEL_ASM) if (HasSSE2()) #else @@ -468,7 +468,7 @@ void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const { -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) { Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); @@ -1082,7 +1082,7 @@ static inline bool AliasedWithTable(const byte *begin, const byte *end) return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0); } -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds) { @@ -1285,7 +1285,7 @@ Rijndael::Enc::Enc() : m_aliasBlock(s_sizeToAllocate) { } size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif @@ -1347,7 +1347,7 @@ size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xo #if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X86 size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const { -#if CRYPTOPPL_AESNI_AES_AVAILABLE +#if CRYPTOPP_AESNI_AVAILABLE if (HasAESNI()) return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (MAYBE_CONST __m128i *)(const void *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); #endif diff --git a/seal.cpp b/seal.cpp index 1ba3e293..85e9b784 100644 --- a/seal.cpp +++ b/seal.cpp @@ -38,7 +38,7 @@ word32 SEAL_Gamma::Apply(word32 i) word32 shaIndex = i/5; if (shaIndex != lastIndex) { -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE D[0] = ConditionalByteReverse(HasSHA() ? 
BIG_ENDIAN_ORDER : LITTLE_ENDIAN_ORDER, shaIndex); #else D[0] = shaIndex; diff --git a/sha-simd.cpp b/sha-simd.cpp index 7dafbc96..d335b705 100644 --- a/sha-simd.cpp +++ b/sha-simd.cpp @@ -14,7 +14,7 @@ # include "nmmintrin.h" #endif -#if (CRYPTOPP_SHANI_SHA_AVAILABLE) +#if (CRYPTOPP_SHANI_AVAILABLE) # include "immintrin.h" #endif @@ -160,7 +160,7 @@ bool CPU_TrySHA2_ARMV8() // start of Walton/Gulley's code // /////////////////////////////////// -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley. void SHA1_Transform_SHANI(word32 *state, const word32 *data) { @@ -540,7 +540,7 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data _mm_storeu_si128((__m128i*) &state[0], STATE0); _mm_storeu_si128((__m128i*) &state[4], STATE1); } -#endif // CRYPTOPP_SHANI_SHA_AVAILABLE +#endif // CRYPTOPP_SHANI_AVAILABLE ///////////////////////////////// // end of Walton/Gulley's code // diff --git a/sha.cpp b/sha.cpp index 109871b5..6f483bff 100644 --- a/sha.cpp +++ b/sha.cpp @@ -97,7 +97,7 @@ static void SHA1_Transform_CXX(word32 *state, const word32 *data) // end of Steve Reid's code // ////////////////////////////// -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE extern void SHA1_Transform_SHANI(word32 *state, const word32 *data); extern void CRYPTOPP_FASTCALL SHA256_HashBlocks_SHANI(word32 *state, const word32 *data, size_t length); #elif CRYPTOPP_ARMV8A_SHA_AVAILABLE @@ -107,7 +107,7 @@ extern void CRYPTOPP_FASTCALL SHA256_HashBlocks_ARMV8A(word32 *state, const word static pfnSHATransform InitializeSHA1Transform() { -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) return &SHA1_Transform_SHANI; else @@ -135,7 +135,7 @@ void SHA1::Transform(word32 *state, const word32 *data) s_pfn(state, data); } -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE size_t SHA1::HashMultipleBlocks(const word32 *input, size_t length) { const bool noReverse = HasSHA() || NativeByteOrderIs(this->GetByteOrder()); @@ -533,7 +533,7 @@ void CRYPTOPP_FASTCALL SHA256_HashBlocks_SSE2(word32 *state, const word32 *data, static pfnSHAHashBlocks InitializeSHA256HashBlocks() { -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) return &SHA256_HashBlocks_SHANI; else @@ -700,12 +700,12 @@ static void SHA256_Transform_SSE2(word32 *state, const word32 *data) } #endif // CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE static void SHA256_Transform_SHANI(word32 *state, const word32 *data) { return SHA256_HashBlocks_SHANI(state, data, SHA256::BLOCKSIZE); } -#endif // CRYPTOPP_SHANI_SHA_AVAILABLE +#endif // CRYPTOPP_SHANI_AVAILABLE #if CRYPTOPP_ARMV8A_SHA_AVAILABLE static void SHA256_Transform_ARMV8A(word32 *state, const word32 *data) @@ -720,7 +720,7 @@ static void SHA256_Transform_ARMV8A(word32 *state, const word32 *data) static pfnSHATransform InitializeSHA256Transform() { -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE if (HasSHA()) return &SHA256_Transform_SHANI; else diff --git a/sha.h b/sha.h index 7702ba5f..f3c3f423 100644 --- a/sha.h +++ b/sha.h @@ -25,7 +25,7 @@ NAMESPACE_BEGIN(CryptoPP) class CRYPTOPP_DLL SHA1 : public IteratedHashWithStaticTransform { public: -#if CRYPTOPP_SHANI_SHA_AVAILABLE +#if CRYPTOPP_SHANI_AVAILABLE size_t HashMultipleBlocks(const word32 *input, size_t length); #endif static void CRYPTOPP_API InitState(HashWordType *state);
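A condensed sketch (my own, not part of the patch) of the dispatch idiom this change standardizes on after the macro renames: a compile-time gate (CRYPTOPP_CLMUL_AVAILABLE / CRYPTOPP_ARM_PMULL_AVAILABLE, "can these intrinsics be compiled?") wrapped around a runtime gate (HasCLMUL() / HasPMULL(), "does this CPU actually execute them?"). The extern functions and Has*() probes are the ones appearing in the patch; the wrapper name GHASH_Dispatch and its trivial fallback are placeholders of mine.

#include "config.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

#if CRYPTOPP_CLMUL_AVAILABLE
extern size_t GCM_AuthenticateBlocks_CLMUL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
#endif

// Returns the number of unprocessed bytes, mirroring GCM_Base::AuthenticateBlocks.
size_t GHASH_Dispatch(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
{
#if CRYPTOPP_CLMUL_AVAILABLE
    if (HasCLMUL())        // runtime CPUID check for PCLMULQDQ
        return GCM_AuthenticateBlocks_CLMUL(data, len, mtable, hbuffer);
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
    if (HasPMULL())        // runtime probe (SIGILL / __try) for vmull_p64
        return GCM_AuthenticateBlocks_PMULL(data, len, mtable, hbuffer);
#endif
    return len;            // placeholder: the real code falls through to the table-driven path
}

NAMESPACE_END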
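The GCM_Multiply_CLMUL/GCM_Reduce_CLMUL code moved into gcm-simd.cpp is built on the PCLMULQDQ carry-less multiply, and its byte reflection uses _mm_shuffle_epi8 from SSSE3, which is why the file now includes tmmintrin.h and the GNUmakefile probes -mssse3 together with -mpclmul for GCM_FLAG. Below is a small standalone demonstration (my own, not library code) of the underlying primitive: a GF(2)[x] multiply of two 64-bit polynomials into a 128-bit product. It assumes an x86-64 compiler with PCLMUL enabled, e.g. g++ -O2 -mpclmul clmul_demo.cpp.

// clmul_demo.cpp - carry-less multiply demo (illustrative only)
#include <wmmintrin.h>   // _mm_clmulepi64_si128 (PCLMULQDQ)
#include <emmintrin.h>   // SSE2 helpers
#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t a = 0x8000000000000001ULL;   // x^63 + 1
    const uint64_t b = 0x0000000000000003ULL;   // x   + 1
    const __m128i va = _mm_set_epi64x(0, (long long)a);
    const __m128i vb = _mm_set_epi64x(0, (long long)b);

    // Immediate 0x00 selects the low 64-bit lane of each operand,
    // like the _mm_clmulepi64_si128(x,h,0) calls in GCM_Multiply_CLMUL.
    const __m128i prod = _mm_clmulepi64_si128(va, vb, 0x00);

    const uint64_t lo = (uint64_t)_mm_cvtsi128_si64(prod);
    const uint64_t hi = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(prod, 8));

    // (x^63 + 1)(x + 1) = x^64 + x^63 + x + 1, i.e. hi = 1, lo = 0x8000000000000003.
    std::printf("hi=%016llx lo=%016llx\n",
                (unsigned long long)hi, (unsigned long long)lo);
    return 0;
}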