From d9011f07d2d5bde9c0710ac481e2b61fd9f51a77 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Thu, 8 Nov 2018 16:20:31 -0500 Subject: [PATCH] Add ChaCha AVX2 implementation (GH #735) --- Filelist.txt | 1 + GNUmakefile | 39 ++++- chacha-avx.cpp | 362 +++++++++++++++++++++++++++++++++++++++ chacha.cpp | 56 +++++- chacha.h | 2 +- cham-simd.cpp | 11 +- config.h | 21 +++ cryptest.nmake | 8 +- cryptlib.vcxproj | 6 +- cryptlib.vcxproj.filters | 7 +- 10 files changed, 485 insertions(+), 28 deletions(-) create mode 100644 chacha-avx.cpp diff --git a/Filelist.txt b/Filelist.txt index 60d37620..ed09654a 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -52,6 +52,7 @@ cbcmac.h ccm.cpp ccm.h chacha.cpp +chacha-avx.cpp chacha-simd.cpp chacha.h cham.cpp diff --git a/GNUmakefile b/GNUmakefile index b07e68bd..d1668514 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -227,9 +227,10 @@ ifeq ($(HAVE_GAS)$(GAS219_OR_LATER),10) CXXFLAGS += -DCRYPTOPP_DISABLE_AESNI else ifeq ($(HAVE_GAS)$(GAS224_OR_LATER),10) +CXXFLAGS += -DCRYPTOPP_DISABLE_AVX CXXFLAGS += -DCRYPTOPP_DISABLE_SHANI -endif # -DCRYPTOPP_DISABLE_SHANI +endif # -DCRYPTOPP_DISABLE_AVX and SHANI endif # -DCRYPTOPP_DISABLE_AESNI endif # -DCRYPTOPP_DISABLE_SSE4 endif # -DCRYPTOPP_DISABLE_SSSE3 @@ -276,26 +277,33 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_AESNI,$(CXXFLAGS)),) AES_FLAG = -msse4.1 -maes SM4_FLAG = -mssse3 -maes endif +ifeq ($(findstring -DCRYPTOPP_DISABLE_AVX2,$(CXXFLAGS)),) + HAVE_AVX2 = $(shell $(CXX) $(CXXFLAGS) -mavx2 -dM -E pch.cpp 2>&1 | $(GREP) -i -c __AVX2__) + ifeq ($(HAVE_AVX2),1) + CHACHA_AVX2_FLAG = -mavx2 + endif ifeq ($(findstring -DCRYPTOPP_DISABLE_SHANI,$(CXXFLAGS)),) HAVE_SHA = $(shell $(CXX) $(CXXFLAGS) -msse4.2 -msha -dM -E pch.cpp 2>&1 | $(GREP) -i -c __SHA__) ifeq ($(HAVE_SHA),1) SHA_FLAG = -msse4.2 -msha endif endif # -DCRYPTOPP_DISABLE_SHANI +endif # -DCRYPTOPP_DISABLE_AVX2 endif # -DCRYPTOPP_DISABLE_AESNI endif # -DCRYPTOPP_DISABLE_SSE4 endif # -DCRYPTOPP_DISABLE_SSSE3 # Begin SunCC ifeq ($(SUN_COMPILER),1) - COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") + COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) AES_FLAG = -xarch=sse2 -D__SSE2__=1 + CHACHA_FLAG = -xarch=sse2 -D__SSE2__=1 GCM_FLAG = -xarch=sse2 -D__SSE2__=1 SHA_FLAG = -xarch=sse2 -D__SSE2__=1 LDFLAGS += -xarch=sse2 endif - COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=ssse3 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") + COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=ssse3 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1 ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1 @@ -308,7 +316,7 @@ ifeq ($(SUN_COMPILER),1) SPECK128_FLAG = -xarch=ssse3 -D__SSSE3__=1 LDFLAGS += -xarch=ssse3 endif - COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") + COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) BLAKE2B_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 BLAKE2S_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 @@ -316,19 +324,28 @@ ifeq ($(SUN_COMPILER),1) SPECK64_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 LDFLAGS += -xarch=sse4_1 endif - COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") + COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) CRC_FLAG = 
-xarch=sse4_2 -D__SSE4_2__=1
    LDFLAGS += -xarch=sse4_2
  endif
-  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=aes -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
+  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=aes -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
   ifeq ($(COUNT),0)
     GCM_FLAG = -xarch=aes -D__PCLMUL__=1
     AES_FLAG = -xarch=aes -D__AES__=1
     SM4_FLAG = -xarch=aes -D__AES__=1
     LDFLAGS += -xarch=aes
   endif
-  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sha -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
+  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=avx -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
+  ifeq ($(COUNT),0)
+    LDFLAGS += -xarch=avx
+  endif
+  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=avx2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
+  ifeq ($(COUNT),0)
+    CHACHA_AVX2_FLAG = -xarch=avx2 -D__AVX2__=1
+    LDFLAGS += -xarch=avx2
+  endif
+  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sha -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
   ifeq ($(COUNT),0)
     SHA_FLAG = -xarch=sha -D__SHA__=1
     LDFLAGS += -xarch=sha
@@ -646,8 +663,8 @@ ifneq ($(SUN_COMPILER),0)	# override flags for CC Sun C++ compiler
 CXXFLAGS += -template=no%extdef
 SUN_CC10_BUGGY := $(shell $(CXX) -V 2>&1 | $(GREP) -c -E "CC: Sun .* 5\.10 .* (2009|2010/0[1-4])")
 ifneq ($(SUN_CC10_BUGGY),0)
-# -DCRYPTOPP_INCLUDE_VECTOR_CC is needed for Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21 and was fixed in May 2010
-# remove it if you get "already had a body defined" errors in vector.cc
+# -DCRYPTOPP_INCLUDE_VECTOR_CC is needed for Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21
+# and was fixed in May 2010. Remove it if you get "already had a body defined" errors in vector.cc
 CXXFLAGS += -DCRYPTOPP_INCLUDE_VECTOR_CC
 endif
 AR = $(CXX)
@@ -1197,6 +1214,10 @@ blake2b-simd.o : blake2b-simd.cpp
 chacha-simd.o : chacha-simd.cpp
 	$(CXX) $(strip $(CXXFLAGS) $(CHACHA_FLAG) -c) $<
 
+# AVX2 available
+chacha-avx.o : chacha-avx.cpp
+	$(CXX) $(strip $(CXXFLAGS) $(CHACHA_AVX2_FLAG) -c) $<
+
 # SSSE3 available
 cham-simd.o : cham-simd.cpp
 	$(CXX) $(strip $(CXXFLAGS) $(CHAM_FLAG) -c) $<
diff --git a/chacha-avx.cpp b/chacha-avx.cpp
new file mode 100644
index 00000000..ed5a2458
--- /dev/null
+++ b/chacha-avx.cpp
@@ -0,0 +1,362 @@
+// chacha-avx.cpp - written and placed in the public domain by
+// Jack Lloyd and Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// AVX2 instructions. A separate source file is needed because
+// additional CXXFLAGS are required to enable the appropriate
+// instruction sets in some build configurations.
+//
+// AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
+// to Jack Lloyd and the Botan team for allowing us to use it.
+//
+// Here are some relative numbers for ChaCha8:
+// * Intel Skylake, 3.0 GHz: AVX2 at 4385 MB/s; 0.59 cpb.
+// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.
+
+#include "pch.h"
+#include "config.h"
+
+#include "chacha.h"
+#include "misc.h"
+
+#if defined(CRYPTOPP_AVX2_AVAILABLE)
+# include <xmmintrin.h>
+# include <emmintrin.h>
+# include <immintrin.h>
+#endif
+
+// Squash MS LNK4221 and libtool warnings
+extern const char CHACHA_AVX_FNAME[] = __FILE__;
+
+// Sun Studio 12.4 OK, 12.5 and 12.6 error.
+#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
+# define MAYBE_CONST
+#else
+# define MAYBE_CONST const
+#endif
+
+#if (CRYPTOPP_AVX2_AVAILABLE)
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+template <unsigned int R>
+inline __m256i RotateLeft(const __m256i val)
+{
+    return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
+}
+
+template <>
+inline __m256i RotateLeft<8>(const __m256i val)
+{
+    const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
+        14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+    return _mm256_shuffle_epi8(val, mask);
+}
+
+template <>
+inline __m256i RotateLeft<16>(const __m256i val)
+{
+    const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
+        13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
+    return _mm256_shuffle_epi8(val, mask);
+}
+
+ANONYMOUS_NAMESPACE_END
+
+NAMESPACE_BEGIN(CryptoPP)
+
+void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
+{
+    MAYBE_CONST __m128i* state_mm = (MAYBE_CONST __m128i*)(state);
+    MAYBE_CONST __m256i* input_mm = (MAYBE_CONST __m256i*)(input);
+    __m256i* output_mm = reinterpret_cast<__m256i*>(output);
+
+    const __m256i state0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 0));
+    const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 1));
+    const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 2));
+    const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 3));
+
+    const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
+    const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
+    const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
+    const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);
+
+    __m256i X0_0 = state0;
+    __m256i X0_1 = state1;
+    __m256i X0_2 = state2;
+    __m256i X0_3 = _mm256_add_epi64(state3, CTR0);
+
+    __m256i X1_0 = state0;
+    __m256i X1_1 = state1;
+    __m256i X1_2 = state2;
+    __m256i X1_3 = _mm256_add_epi64(state3, CTR1);
+
+    __m256i X2_0 = state0;
+    __m256i X2_1 = state1;
+    __m256i X2_2 = state2;
+    __m256i X2_3 = _mm256_add_epi64(state3, CTR2);
+
+    __m256i X3_0 = state0;
+    __m256i X3_1 = state1;
+    __m256i X3_2 = state2;
+    __m256i X3_3 = _mm256_add_epi64(state3, CTR3);
+
+    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
+    {
+        X0_0 = _mm256_add_epi32(X0_0, X0_1);
+        X1_0 = _mm256_add_epi32(X1_0, X1_1);
+        X2_0 = _mm256_add_epi32(X2_0, X2_1);
+        X3_0 = _mm256_add_epi32(X3_0, X3_1);
+
+        X0_3 = _mm256_xor_si256(X0_3, X0_0);
+        X1_3 = _mm256_xor_si256(X1_3, X1_0);
+        X2_3 = _mm256_xor_si256(X2_3, X2_0);
+        X3_3 = _mm256_xor_si256(X3_3, X3_0);
+
+        X0_3 = RotateLeft<16>(X0_3);
+        X1_3 = RotateLeft<16>(X1_3);
+        X2_3 = RotateLeft<16>(X2_3);
+        X3_3 = RotateLeft<16>(X3_3);
+
+        X0_2 = _mm256_add_epi32(X0_2, X0_3);
+        X1_2 = _mm256_add_epi32(X1_2, X1_3);
+        X2_2 = _mm256_add_epi32(X2_2, X2_3);
+        X3_2 = _mm256_add_epi32(X3_2, X3_3);
+
+        X0_1 = _mm256_xor_si256(X0_1, X0_2);
+        X1_1 = _mm256_xor_si256(X1_1, X1_2);
+        X2_1 = _mm256_xor_si256(X2_1, X2_2);
+        X3_1 = _mm256_xor_si256(X3_1, X3_2);
+
+        X0_1 = RotateLeft<12>(X0_1);
+        X1_1 = RotateLeft<12>(X1_1);
+        X2_1 = RotateLeft<12>(X2_1);
+        X3_1 = RotateLeft<12>(X3_1);
+
+        X0_0 = _mm256_add_epi32(X0_0, X0_1);
+        X1_0 = _mm256_add_epi32(X1_0, X1_1);
+        X2_0 = _mm256_add_epi32(X2_0, X2_1);
+        X3_0 = _mm256_add_epi32(X3_0, X3_1);
+
+        X0_3 = _mm256_xor_si256(X0_3, X0_0);
+        X1_3 = _mm256_xor_si256(X1_3, X1_0);
+        X2_3 = _mm256_xor_si256(X2_3, X2_0);
+        X3_3 = _mm256_xor_si256(X3_3, X3_0);
+
+        X0_3 =
RotateLeft<8>(X0_3); + X1_3 = RotateLeft<8>(X1_3); + X2_3 = RotateLeft<8>(X2_3); + X3_3 = RotateLeft<8>(X3_3); + + X0_2 = _mm256_add_epi32(X0_2, X0_3); + X1_2 = _mm256_add_epi32(X1_2, X1_3); + X2_2 = _mm256_add_epi32(X2_2, X2_3); + X3_2 = _mm256_add_epi32(X3_2, X3_3); + + X0_1 = _mm256_xor_si256(X0_1, X0_2); + X1_1 = _mm256_xor_si256(X1_1, X1_2); + X2_1 = _mm256_xor_si256(X2_1, X2_2); + X3_1 = _mm256_xor_si256(X3_1, X3_2); + + X0_1 = RotateLeft<7>(X0_1); + X1_1 = RotateLeft<7>(X1_1); + X2_1 = RotateLeft<7>(X2_1); + X3_1 = RotateLeft<7>(X3_1); + + X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1)); + X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); + X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1)); + X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); + X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1)); + X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2)); + X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1)); + X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); + X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X0_0 = _mm256_add_epi32(X0_0, X0_1); + X1_0 = _mm256_add_epi32(X1_0, X1_1); + X2_0 = _mm256_add_epi32(X2_0, X2_1); + X3_0 = _mm256_add_epi32(X3_0, X3_1); + + X0_3 = _mm256_xor_si256(X0_3, X0_0); + X1_3 = _mm256_xor_si256(X1_3, X1_0); + X2_3 = _mm256_xor_si256(X2_3, X2_0); + X3_3 = _mm256_xor_si256(X3_3, X3_0); + + X0_3 = RotateLeft<16>(X0_3); + X1_3 = RotateLeft<16>(X1_3); + X2_3 = RotateLeft<16>(X2_3); + X3_3 = RotateLeft<16>(X3_3); + + X0_2 = _mm256_add_epi32(X0_2, X0_3); + X1_2 = _mm256_add_epi32(X1_2, X1_3); + X2_2 = _mm256_add_epi32(X2_2, X2_3); + X3_2 = _mm256_add_epi32(X3_2, X3_3); + + X0_1 = _mm256_xor_si256(X0_1, X0_2); + X1_1 = _mm256_xor_si256(X1_1, X1_2); + X2_1 = _mm256_xor_si256(X2_1, X2_2); + X3_1 = _mm256_xor_si256(X3_1, X3_2); + + X0_1 = RotateLeft<12>(X0_1); + X1_1 = RotateLeft<12>(X1_1); + X2_1 = RotateLeft<12>(X2_1); + X3_1 = RotateLeft<12>(X3_1); + + X0_0 = _mm256_add_epi32(X0_0, X0_1); + X1_0 = _mm256_add_epi32(X1_0, X1_1); + X2_0 = _mm256_add_epi32(X2_0, X2_1); + X3_0 = _mm256_add_epi32(X3_0, X3_1); + + X0_3 = _mm256_xor_si256(X0_3, X0_0); + X1_3 = _mm256_xor_si256(X1_3, X1_0); + X2_3 = _mm256_xor_si256(X2_3, X2_0); + X3_3 = _mm256_xor_si256(X3_3, X3_0); + + X0_3 = RotateLeft<8>(X0_3); + X1_3 = RotateLeft<8>(X1_3); + X2_3 = RotateLeft<8>(X2_3); + X3_3 = RotateLeft<8>(X3_3); + + X0_2 = _mm256_add_epi32(X0_2, X0_3); + X1_2 = _mm256_add_epi32(X1_2, X1_3); + X2_2 = _mm256_add_epi32(X2_2, X2_3); + X3_2 = _mm256_add_epi32(X3_2, X3_3); + + X0_1 = _mm256_xor_si256(X0_1, X0_2); + X1_1 = _mm256_xor_si256(X1_1, X1_2); + X2_1 = _mm256_xor_si256(X2_1, X2_2); + X3_1 = _mm256_xor_si256(X3_1, X3_2); + + X0_1 = RotateLeft<7>(X0_1); + X1_1 = RotateLeft<7>(X1_1); + X2_1 = RotateLeft<7>(X2_1); + X3_1 = RotateLeft<7>(X3_1); + + X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3)); + X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); + X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3)); + X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); + X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3)); + X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 
3, 2)); + X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3)); + X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); + X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1)); + } + + X0_0 = _mm256_add_epi32(X0_0, state0); + X0_1 = _mm256_add_epi32(X0_1, state1); + X0_2 = _mm256_add_epi32(X0_2, state2); + X0_3 = _mm256_add_epi32(X0_3, state3); + X0_3 = _mm256_add_epi64(X0_3, CTR0); + + X1_0 = _mm256_add_epi32(X1_0, state0); + X1_1 = _mm256_add_epi32(X1_1, state1); + X1_2 = _mm256_add_epi32(X1_2, state2); + X1_3 = _mm256_add_epi32(X1_3, state3); + X1_3 = _mm256_add_epi64(X1_3, CTR1); + + X2_0 = _mm256_add_epi32(X2_0, state0); + X2_1 = _mm256_add_epi32(X2_1, state1); + X2_2 = _mm256_add_epi32(X2_2, state2); + X2_3 = _mm256_add_epi32(X2_3, state3); + X2_3 = _mm256_add_epi64(X2_3, CTR2); + + X3_0 = _mm256_add_epi32(X3_0, state0); + X3_1 = _mm256_add_epi32(X3_1, state1); + X3_2 = _mm256_add_epi32(X3_2, state2); + X3_3 = _mm256_add_epi32(X3_3, state3); + X3_3 = _mm256_add_epi64(X3_3, CTR3); + + if (input_mm) + { + _mm256_storeu_si256(output_mm + 0, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 0), + _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 1, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 1), + _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 2, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 2), + _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 3, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 3), + _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)))); + } + else + { + _mm256_storeu_si256(output_mm + 0, _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 1, _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 2, _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 3, _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4))); + } + + if (input_mm) + { + _mm256_storeu_si256(output_mm + 4, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 4), + _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 5, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 5), + _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 6, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 6), + _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 7, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 7), + _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)))); + } + else + { + _mm256_storeu_si256(output_mm + 4, _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 5, _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 6, _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 7, _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4))); + } + + if (input_mm) + { + _mm256_storeu_si256(output_mm + 8, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 8), + _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 9, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 9), + _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 10, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 10), + _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 11, 
_mm256_xor_si256(_mm256_loadu_si256(input_mm + 11), + _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)))); + } + else + { + _mm256_storeu_si256(output_mm + 8, _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 9, _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 10, _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 11, _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4))); + } + + if (input_mm) + { + _mm256_storeu_si256(output_mm + 12, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 12), + _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 13, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 13), + _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 14, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 14), + _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 15, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 15), + _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)))); + } + else + { + _mm256_storeu_si256(output_mm + 12, _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 13, _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 14, _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 15, _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4))); + } +} + +NAMESPACE_END + +#endif // CRYPTOPP_AVX2_AVAILABLE diff --git a/chacha.cpp b/chacha.cpp index 6a5f50f2..fe438507 100644 --- a/chacha.cpp +++ b/chacha.cpp @@ -20,6 +20,10 @@ extern void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif +#if (CRYPTOPP_AVX2_AVAILABLE) +extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds); +#endif + #if (CRYPTOPP_POWER8_AVAILABLE) extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif @@ -62,17 +66,25 @@ std::string ChaCha_Policy::AlgorithmName() const std::string ChaCha_Policy::AlgorithmProvider() const { +#if (CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + return "AVX2"; + else +#endif #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) return "SSE2"; + else #endif #if (CRYPTOPP_ARM_NEON_AVAILABLE) if (HasNEON()) return "NEON"; + else #endif #if (CRYPTOPP_POWER8_AVAILABLE) if (HasPower8()) return "Power8"; + else #endif return "C++"; } @@ -117,11 +129,17 @@ void ChaCha_Policy::SeekToIteration(lword iterationCount) unsigned int ChaCha_Policy::GetAlignment() const { +#if (CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + return 16; + else +#endif #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) return 16; else -#elif (CRYPTOPP_POWER8_AVAILABLE) +#endif +#if (CRYPTOPP_POWER8_AVAILABLE) if (HasPower8()) return 16; else @@ -131,6 +149,11 @@ unsigned int ChaCha_Policy::GetAlignment() const unsigned int ChaCha_Policy::GetOptimalBlockSize() const { +#if (CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + return 8 * BYTES_PER_ITERATION; + else +#endif #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) return 4*BYTES_PER_ITERATION; @@ -149,10 +172,9 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const return BYTES_PER_ITERATION; } -bool 
ChaCha_Policy::MultiBlockSafe() const +bool ChaCha_Policy::MultiBlockSafe(unsigned int blocks) const { - const word32 c = m_state[12]; - return 0xffffffff - c > 4; + return 0xffffffff - m_state[12] > blocks; } // OperateKeystream always produces a key stream. The key stream is written @@ -163,10 +185,30 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, { do { +#if (CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + { + while (iterationCount >= 8 && MultiBlockSafe(8)) + { + const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; + ChaCha_OperateKeystream_AVX2(m_state, xorInput ? input : NULLPTR, output, m_rounds); + + // MultiBlockSafe avoids overflow on the counter words + m_state[12] += 8; + //if (m_state[12] < 8) + // m_state[13]++; + + input += (!!xorInput) * 8 * BYTES_PER_ITERATION; + output += 8 * BYTES_PER_ITERATION; + iterationCount -= 8; + } + } +#endif + #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) { - while (iterationCount >= 4 && MultiBlockSafe()) + while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds); @@ -186,7 +228,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, #if (CRYPTOPP_ARM_NEON_AVAILABLE) if (HasNEON()) { - while (iterationCount >= 4 && MultiBlockSafe()) + while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds); @@ -206,7 +248,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, #if (CRYPTOPP_POWER8_AVAILABLE) if (HasPower8()) { - while (iterationCount >= 4 && MultiBlockSafe()) + while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds); diff --git a/chacha.h b/chacha.h index b0a762b3..5d219a27 100644 --- a/chacha.h +++ b/chacha.h @@ -56,7 +56,7 @@ protected: // during addition in an intermediate result. Conditions to trigger // issue include a user seeks to around 2^32 blocks (256 GB of data). 
// https://github.com/weidai11/cryptopp/issues/732 - bool MultiBlockSafe() const; + inline bool MultiBlockSafe(unsigned int blocks) const; FixedSizeAlignedSecBlock m_state; int m_rounds; diff --git a/cham-simd.cpp b/cham-simd.cpp index 8f3d0b5b..a6e1018f 100644 --- a/cham-simd.cpp +++ b/cham-simd.cpp @@ -17,6 +17,12 @@ // #undef CRYPTOPP_SSSE3_AVAILABLE // #undef CRYPTOPP_ARM_NEON_AVAILABLE +#if defined(CRYPTOPP_SSE2_AVAILABLE) +# define CRYPTOPP_AVX512_ROTATE 1 +# include +# include +#endif + #if (CRYPTOPP_SSSE3_AVAILABLE) # include # include @@ -26,11 +32,6 @@ # include #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) -# define CRYPTOPP_AVX512_ROTATE 1 -# include -#endif - // Squash MS LNK4221 and libtool warnings extern const char CHAM_SIMD_FNAME[] = __FILE__; diff --git a/config.h b/config.h index 65131315..2d47cf5c 100644 --- a/config.h +++ b/config.h @@ -484,6 +484,11 @@ NAMESPACE_END # define CRYPTOPP_DISABLE_ASM 1 #endif +// SunCC prior to 5.10 cannot handle some SSE intrinsics +#if defined(__SUNPRO_CC) && (__SUNPRO_CC < 0x5100) +# define CRYPTOPP_DISABLE_ASM 1 +#endif + // Sun Studio 12 provides GCC inline assembly, http://blogs.oracle.com/x86be/entry/gcc_style_asm_inlining_support // We can enable SSE2 for Sun Studio in the makefile with -D__SSE2__, but users may not compile with it. #if !defined(CRYPTOPP_DISABLE_ASM) && !defined(__SSE2__) && defined(__x86_64__) && (__SUNPRO_CC >= 0x5100) @@ -563,6 +568,22 @@ NAMESPACE_END #define CRYPTOPP_AESNI_AVAILABLE 1 #endif +// Requires Binutils 2.24 +#if !defined(CRYPTOPP_DISABLE_AVX) && defined(CRYPTOPP_SSE42_AVAILABLE) && \ + (defined(__AVX2__) || (CRYPTOPP_MSC_VERSION >= 1800) || (__SUNPRO_CC >= 0x5130) || \ + (CRYPTOPP_GCC_VERSION >= 40700) || (__INTEL_COMPILER >= 1400) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30100) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40600)) +#define CRYPTOPP_AVX_AVAILABLE 1 +#endif + +// Requires Binutils 2.24 +#if !defined(CRYPTOPP_DISABLE_AVX2) && defined(CRYPTOPP_AVX_AVAILABLE) && \ + (defined(__AVX2__) || (CRYPTOPP_MSC_VERSION >= 1800) || (__SUNPRO_CC >= 0x5130) || \ + (CRYPTOPP_GCC_VERSION >= 40700) || (__INTEL_COMPILER >= 1400) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30100) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40600)) +#define CRYPTOPP_AVX2_AVAILABLE 1 +#endif + // Guessing at SHA for SunCC. Its not in Sun Studio 12.6. 
Also see // http://stackoverflow.com/questions/45872180/which-xarch-for-sha-extensions-on-solaris #if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHANI) && defined(CRYPTOPP_SSE42_AVAILABLE) && \ diff --git a/cryptest.nmake b/cryptest.nmake index 0dc946ff..1e28576a 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -55,8 +55,8 @@ LIB_SRCS = \ algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp \ authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2s-simd.cpp \ blake2b-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \ - cbcmac.cpp ccm.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp channels.cpp \ - cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \ + cbcmac.cpp ccm.cpp chacha-avx.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp \ + channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \ dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp \ emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp \ fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp \ @@ -83,8 +83,8 @@ LIB_OBJS = \ algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj \ authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2s-simd.obj \ blake2b-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \ - cbcmac.obj ccm.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj channels.obj \ - cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \ + cbcmac.obj ccm.obj chacha-avx.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj \ + channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \ dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj \ emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj \ fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj \ diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index b6917bf5..f28fd431 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -1,4 +1,4 @@ - + @@ -193,6 +193,10 @@ + + + true + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index de1bcea1..71957da0 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -92,6 +92,9 @@ Source Files + + Source Files + Source Files @@ -986,5 +989,7 @@ Miscellaneous + + \ No newline at end of file
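
For reference, the arithmetic that ChaCha_OperateKeystream_AVX2 vectorizes is the standard ChaCha quarter round with rotations by 16, 12, 8 and 7. The kernel keeps eight 64-byte blocks in flight per call (four groups of __m256i registers, each group holding two blocks in its 128-bit lanes), uses _mm256_shuffle_epi32 to rotate rows 1, 2 and 3 between the column and diagonal half-rounds, and chacha.cpp only enters the batched loop while MultiBlockSafe(8) guarantees the low 32-bit block counter cannot wrap inside the batch. The scalar sketch below is illustrative only and is not part of the patch; the helper names rotl32, QuarterRound, DoubleRound and CounterSafe are invented for the example.

    // Illustrative scalar reference for the ChaCha core; assumes only the
    // standard algorithm, not code taken from chacha-avx.cpp.
    #include <cstdint>

    static inline uint32_t rotl32(uint32_t v, unsigned int r)
    {
        return (v << r) | (v >> (32 - r));
    }

    // One quarter round: the 16/12/8/7 rotation pattern that the
    // RotateLeft<16>, <12>, <8> and <7> calls apply to whole __m256i registers.
    static inline void QuarterRound(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d)
    {
        a += b; d ^= a; d = rotl32(d, 16);
        c += d; b ^= c; b = rotl32(b, 12);
        a += b; d ^= a; d = rotl32(d, 8);
        c += d; b ^= c; b = rotl32(b, 7);
    }

    // One double round over the 16-word state: four column rounds, then four
    // diagonal rounds. The AVX2 loop performs rounds/2 of these per batch,
    // which is why it decrements its loop counter by 2 each iteration.
    static inline void DoubleRound(uint32_t s[16])
    {
        QuarterRound(s[0], s[4], s[ 8], s[12]);
        QuarterRound(s[1], s[5], s[ 9], s[13]);
        QuarterRound(s[2], s[6], s[10], s[14]);
        QuarterRound(s[3], s[7], s[11], s[15]);
        QuarterRound(s[0], s[5], s[10], s[15]);
        QuarterRound(s[1], s[6], s[11], s[12]);
        QuarterRound(s[2], s[7], s[ 8], s[13]);
        QuarterRound(s[3], s[4], s[ 9], s[14]);
    }

    // Mirror of the MultiBlockSafe(blocks) guard in chacha.cpp: batch only when
    // the 32-bit counter word cannot overflow while producing 'blocks' blocks.
    static inline bool CounterSafe(uint32_t counter, uint32_t blocks)
    {
        return 0xffffffffu - counter > blocks;
    }

Running rounds/2 DoubleRound calls over a block's initial state and then adding that initial state back (the feed-forward the kernel performs with _mm256_add_epi32 after its loop) yields one 64-byte keystream block; each 128-bit lane of the X*_* registers computes exactly this, eight blocks at a time.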