Add ChaCha AVX2 implementation (GH #735)

pull/737/head
Jeffrey Walton 2018-11-08 16:20:31 -05:00
parent 9b31bc189c
commit d9011f07d2
GPG Key ID: B36AB348921B1838
10 changed files with 485 additions and 28 deletions

Filelist.txt

@ -52,6 +52,7 @@ cbcmac.h
ccm.cpp
ccm.h
chacha.cpp
chacha-avx.cpp
chacha-simd.cpp
chacha.h
cham.cpp

GNUmakefile

@ -227,9 +227,10 @@ ifeq ($(HAVE_GAS)$(GAS219_OR_LATER),10)
CXXFLAGS += -DCRYPTOPP_DISABLE_AESNI
else
ifeq ($(HAVE_GAS)$(GAS224_OR_LATER),10)
CXXFLAGS += -DCRYPTOPP_DISABLE_AVX
CXXFLAGS += -DCRYPTOPP_DISABLE_SHANI
endif # -DCRYPTOPP_DISABLE_SHANI
endif # -DCRYPTOPP_DISABLE_AVX and SHANI
endif # -DCRYPTOPP_DISABLE_AESNI
endif # -DCRYPTOPP_DISABLE_SSE4
endif # -DCRYPTOPP_DISABLE_SSSE3
@ -276,26 +277,33 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_AESNI,$(CXXFLAGS)),)
AES_FLAG = -msse4.1 -maes
SM4_FLAG = -mssse3 -maes
endif
ifeq ($(findstring -DCRYPTOPP_DISABLE_AVX2,$(CXXFLAGS)),)
HAVE_AVX2 = $(shell $(CXX) $(CXXFLAGS) -mavx2 -dM -E pch.cpp 2>&1 | $(GREP) -i -c __AVX2__)
ifeq ($(HAVE_AVX2),1)
CHACHA_AVX2_FLAG = -mavx2
endif
ifeq ($(findstring -DCRYPTOPP_DISABLE_SHANI,$(CXXFLAGS)),)
HAVE_SHA = $(shell $(CXX) $(CXXFLAGS) -msse4.2 -msha -dM -E pch.cpp 2>&1 | $(GREP) -i -c __SHA__)
ifeq ($(HAVE_SHA),1)
SHA_FLAG = -msse4.2 -msha
endif
endif # -DCRYPTOPP_DISABLE_SHANI
endif # -DCRYPTOPP_DISABLE_AVX2
endif # -DCRYPTOPP_DISABLE_AESNI
endif # -DCRYPTOPP_DISABLE_SSE4
endif # -DCRYPTOPP_DISABLE_SSSE3
# Begin SunCC
ifeq ($(SUN_COMPILER),1)
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
AES_FLAG = -xarch=sse2 -D__SSE2__=1
CHACHA_FLAG = -xarch=sse2 -D__SSE2__=1
GCM_FLAG = -xarch=sse2 -D__SSE2__=1
SHA_FLAG = -xarch=sse2 -D__SSE2__=1
LDFLAGS += -xarch=sse2
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=ssse3 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=ssse3 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1
ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1
@ -308,7 +316,7 @@ ifeq ($(SUN_COMPILER),1)
SPECK128_FLAG = -xarch=ssse3 -D__SSSE3__=1
LDFLAGS += -xarch=ssse3
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
BLAKE2B_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
BLAKE2S_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
@ -316,19 +324,28 @@ ifeq ($(SUN_COMPILER),1)
SPECK64_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
LDFLAGS += -xarch=sse4_1
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
CRC_FLAG = -xarch=sse4_2 -D__SSE4_2__=1
LDFLAGS += -xarch=sse4_2
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=aes -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=aes -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
GCM_FLAG = -xarch=aes -D__PCLMUL__=1
AES_FLAG = -xarch=aes -D__AES__=1
SM4_FLAG = -xarch=aes -D__AES__=1
LDFLAGS += -xarch=aes
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sha -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=avx -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
LDFLAGS += -xarch=avx
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=avx2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
CHACHA_AVX2_FLAG = -xarch=avx2 -D__AVX2__=1
LDFLAGS += -xarch=avx2
endif
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sha -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
ifeq ($(COUNT),0)
SHA_FLAG = -xarch=sha -D__SHA__=1
LDFLAGS += -xarch=sha
@ -646,8 +663,8 @@ ifneq ($(SUN_COMPILER),0) # override flags for CC Sun C++ compiler
CXXFLAGS += -template=no%extdef
SUN_CC10_BUGGY := $(shell $(CXX) -V 2>&1 | $(GREP) -c -E "CC: Sun .* 5\.10 .* (2009|2010/0[1-4])")
ifneq ($(SUN_CC10_BUGGY),0)
# -DCRYPTOPP_INCLUDE_VECTOR_CC is needed for Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21 and was fixed in May 2010
# remove it if you get "already had a body defined" errors in vector.cc
# -DCRYPTOPP_INCLUDE_VECTOR_CC is needed for Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21
# and was fixed in May 2010. Remove it if you get "already had a body defined" errors in vector.cc
CXXFLAGS += -DCRYPTOPP_INCLUDE_VECTOR_CC
endif
AR = $(CXX)
@ -1197,6 +1214,10 @@ blake2b-simd.o : blake2b-simd.cpp
chacha-simd.o : chacha-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(CHACHA_FLAG) -c) $<
# AVX2 available
chacha-avx.o : chacha-avx.cpp
$(CXX) $(strip $(CXXFLAGS) $(CHACHA_AVX2_FLAG) -c) $<
# SSSE3 available
cham-simd.o : cham-simd.cpp
$(CXX) $(strip $(CXXFLAGS) $(CHAM_FLAG) -c) $<

chacha-avx.cpp (new file, 362 lines)

@ -0,0 +1,362 @@
// chacha-avx.cpp - written and placed in the public domain by
// Jack Lloyd and Jeffrey Walton
//
// This source file uses intrinsics and built-ins to gain access to
// AVX2 instructions. A separate source file is needed because additional
// CXXFLAGS are required to enable the appropriate instruction sets in
// some build configurations.
//
// AVX implementation based on Botan's chacha_avx.cpp. Many thanks
// to Jack Lloyd and the Botan team for allowing us to use it.
//
// Here are some relative numbers for ChaCha8:
// * Intel Skylake, 3.0 GHz: AVX2 at 4385 MB/s; 0.59 cpb.
// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.
#include "pch.h"
#include "config.h"
#include "chacha.h"
#include "misc.h"
#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
# include <immintrin.h>
#endif
// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_AVX_FNAME[] = __FILE__;
// Sun Studio 12.4 OK, 12.5 and 12.6 error.
#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif
#if (CRYPTOPP_AVX2_AVAILABLE)
ANONYMOUS_NAMESPACE_BEGIN
template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
}
template <>
inline __m256i RotateLeft<8>(const __m256i val)
{
const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
return _mm256_shuffle_epi8(val, mask);
}
template <>
inline __m256i RotateLeft<16>(const __m256i val)
{
const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
return _mm256_shuffle_epi8(val, mask);
}
ANONYMOUS_NAMESPACE_END
NAMESPACE_BEGIN(CryptoPP)
void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
MAYBE_CONST __m128i* state_mm = (MAYBE_CONST __m128i*)(state);
MAYBE_CONST __m256i* input_mm = (MAYBE_CONST __m256i*)(input);
__m256i* output_mm = reinterpret_cast<__m256i*>(output);
const __m256i state0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 0));
const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 1));
const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 2));
const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 3));
const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);
__m256i X0_0 = state0;
__m256i X0_1 = state1;
__m256i X0_2 = state2;
__m256i X0_3 = _mm256_add_epi64(state3, CTR0);
__m256i X1_0 = state0;
__m256i X1_1 = state1;
__m256i X1_2 = state2;
__m256i X1_3 = _mm256_add_epi64(state3, CTR1);
__m256i X2_0 = state0;
__m256i X2_1 = state1;
__m256i X2_2 = state2;
__m256i X2_3 = _mm256_add_epi64(state3, CTR2);
__m256i X3_0 = state0;
__m256i X3_1 = state1;
__m256i X3_2 = state2;
__m256i X3_3 = _mm256_add_epi64(state3, CTR3);
for (int i = static_cast<int>(rounds); i > 0; i -= 2)
{
X0_0 = _mm256_add_epi32(X0_0, X0_1);
X1_0 = _mm256_add_epi32(X1_0, X1_1);
X2_0 = _mm256_add_epi32(X2_0, X2_1);
X3_0 = _mm256_add_epi32(X3_0, X3_1);
X0_3 = _mm256_xor_si256(X0_3, X0_0);
X1_3 = _mm256_xor_si256(X1_3, X1_0);
X2_3 = _mm256_xor_si256(X2_3, X2_0);
X3_3 = _mm256_xor_si256(X3_3, X3_0);
X0_3 = RotateLeft<16>(X0_3);
X1_3 = RotateLeft<16>(X1_3);
X2_3 = RotateLeft<16>(X2_3);
X3_3 = RotateLeft<16>(X3_3);
X0_2 = _mm256_add_epi32(X0_2, X0_3);
X1_2 = _mm256_add_epi32(X1_2, X1_3);
X2_2 = _mm256_add_epi32(X2_2, X2_3);
X3_2 = _mm256_add_epi32(X3_2, X3_3);
X0_1 = _mm256_xor_si256(X0_1, X0_2);
X1_1 = _mm256_xor_si256(X1_1, X1_2);
X2_1 = _mm256_xor_si256(X2_1, X2_2);
X3_1 = _mm256_xor_si256(X3_1, X3_2);
X0_1 = RotateLeft<12>(X0_1);
X1_1 = RotateLeft<12>(X1_1);
X2_1 = RotateLeft<12>(X2_1);
X3_1 = RotateLeft<12>(X3_1);
X0_0 = _mm256_add_epi32(X0_0, X0_1);
X1_0 = _mm256_add_epi32(X1_0, X1_1);
X2_0 = _mm256_add_epi32(X2_0, X2_1);
X3_0 = _mm256_add_epi32(X3_0, X3_1);
X0_3 = _mm256_xor_si256(X0_3, X0_0);
X1_3 = _mm256_xor_si256(X1_3, X1_0);
X2_3 = _mm256_xor_si256(X2_3, X2_0);
X3_3 = _mm256_xor_si256(X3_3, X3_0);
X0_3 = RotateLeft<8>(X0_3);
X1_3 = RotateLeft<8>(X1_3);
X2_3 = RotateLeft<8>(X2_3);
X3_3 = RotateLeft<8>(X3_3);
X0_2 = _mm256_add_epi32(X0_2, X0_3);
X1_2 = _mm256_add_epi32(X1_2, X1_3);
X2_2 = _mm256_add_epi32(X2_2, X2_3);
X3_2 = _mm256_add_epi32(X3_2, X3_3);
X0_1 = _mm256_xor_si256(X0_1, X0_2);
X1_1 = _mm256_xor_si256(X1_1, X1_2);
X2_1 = _mm256_xor_si256(X2_1, X2_2);
X3_1 = _mm256_xor_si256(X3_1, X3_2);
X0_1 = RotateLeft<7>(X0_1);
X1_1 = RotateLeft<7>(X1_1);
X2_1 = RotateLeft<7>(X2_1);
X3_1 = RotateLeft<7>(X3_1);
X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));
X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));
X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));
X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));
X0_0 = _mm256_add_epi32(X0_0, X0_1);
X1_0 = _mm256_add_epi32(X1_0, X1_1);
X2_0 = _mm256_add_epi32(X2_0, X2_1);
X3_0 = _mm256_add_epi32(X3_0, X3_1);
X0_3 = _mm256_xor_si256(X0_3, X0_0);
X1_3 = _mm256_xor_si256(X1_3, X1_0);
X2_3 = _mm256_xor_si256(X2_3, X2_0);
X3_3 = _mm256_xor_si256(X3_3, X3_0);
X0_3 = RotateLeft<16>(X0_3);
X1_3 = RotateLeft<16>(X1_3);
X2_3 = RotateLeft<16>(X2_3);
X3_3 = RotateLeft<16>(X3_3);
X0_2 = _mm256_add_epi32(X0_2, X0_3);
X1_2 = _mm256_add_epi32(X1_2, X1_3);
X2_2 = _mm256_add_epi32(X2_2, X2_3);
X3_2 = _mm256_add_epi32(X3_2, X3_3);
X0_1 = _mm256_xor_si256(X0_1, X0_2);
X1_1 = _mm256_xor_si256(X1_1, X1_2);
X2_1 = _mm256_xor_si256(X2_1, X2_2);
X3_1 = _mm256_xor_si256(X3_1, X3_2);
X0_1 = RotateLeft<12>(X0_1);
X1_1 = RotateLeft<12>(X1_1);
X2_1 = RotateLeft<12>(X2_1);
X3_1 = RotateLeft<12>(X3_1);
X0_0 = _mm256_add_epi32(X0_0, X0_1);
X1_0 = _mm256_add_epi32(X1_0, X1_1);
X2_0 = _mm256_add_epi32(X2_0, X2_1);
X3_0 = _mm256_add_epi32(X3_0, X3_1);
X0_3 = _mm256_xor_si256(X0_3, X0_0);
X1_3 = _mm256_xor_si256(X1_3, X1_0);
X2_3 = _mm256_xor_si256(X2_3, X2_0);
X3_3 = _mm256_xor_si256(X3_3, X3_0);
X0_3 = RotateLeft<8>(X0_3);
X1_3 = RotateLeft<8>(X1_3);
X2_3 = RotateLeft<8>(X2_3);
X3_3 = RotateLeft<8>(X3_3);
X0_2 = _mm256_add_epi32(X0_2, X0_3);
X1_2 = _mm256_add_epi32(X1_2, X1_3);
X2_2 = _mm256_add_epi32(X2_2, X2_3);
X3_2 = _mm256_add_epi32(X3_2, X3_3);
X0_1 = _mm256_xor_si256(X0_1, X0_2);
X1_1 = _mm256_xor_si256(X1_1, X1_2);
X2_1 = _mm256_xor_si256(X2_1, X2_2);
X3_1 = _mm256_xor_si256(X3_1, X3_2);
X0_1 = RotateLeft<7>(X0_1);
X1_1 = RotateLeft<7>(X1_1);
X2_1 = RotateLeft<7>(X2_1);
X3_1 = RotateLeft<7>(X3_1);
X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));
X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));
X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));
X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
}
X0_0 = _mm256_add_epi32(X0_0, state0);
X0_1 = _mm256_add_epi32(X0_1, state1);
X0_2 = _mm256_add_epi32(X0_2, state2);
X0_3 = _mm256_add_epi32(X0_3, state3);
X0_3 = _mm256_add_epi64(X0_3, CTR0);
X1_0 = _mm256_add_epi32(X1_0, state0);
X1_1 = _mm256_add_epi32(X1_1, state1);
X1_2 = _mm256_add_epi32(X1_2, state2);
X1_3 = _mm256_add_epi32(X1_3, state3);
X1_3 = _mm256_add_epi64(X1_3, CTR1);
X2_0 = _mm256_add_epi32(X2_0, state0);
X2_1 = _mm256_add_epi32(X2_1, state1);
X2_2 = _mm256_add_epi32(X2_2, state2);
X2_3 = _mm256_add_epi32(X2_3, state3);
X2_3 = _mm256_add_epi64(X2_3, CTR2);
X3_0 = _mm256_add_epi32(X3_0, state0);
X3_1 = _mm256_add_epi32(X3_1, state1);
X3_2 = _mm256_add_epi32(X3_2, state2);
X3_3 = _mm256_add_epi32(X3_3, state3);
X3_3 = _mm256_add_epi64(X3_3, CTR3);
if (input_mm)
{
_mm256_storeu_si256(output_mm + 0, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 0),
_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))));
_mm256_storeu_si256(output_mm + 1, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 1),
_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))));
_mm256_storeu_si256(output_mm + 2, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 2),
_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4))));
_mm256_storeu_si256(output_mm + 3, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 3),
_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4))));
}
else
{
_mm256_storeu_si256(output_mm + 0, _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
_mm256_storeu_si256(output_mm + 1, _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
_mm256_storeu_si256(output_mm + 2, _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
_mm256_storeu_si256(output_mm + 3, _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
}
if (input_mm)
{
_mm256_storeu_si256(output_mm + 4, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 4),
_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4))));
_mm256_storeu_si256(output_mm + 5, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 5),
_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4))));
_mm256_storeu_si256(output_mm + 6, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 6),
_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4))));
_mm256_storeu_si256(output_mm + 7, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 7),
_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4))));
}
else
{
_mm256_storeu_si256(output_mm + 4, _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
_mm256_storeu_si256(output_mm + 5, _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
_mm256_storeu_si256(output_mm + 6, _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
_mm256_storeu_si256(output_mm + 7, _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
}
if (input_mm)
{
_mm256_storeu_si256(output_mm + 8, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 8),
_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4))));
_mm256_storeu_si256(output_mm + 9, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 9),
_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4))));
_mm256_storeu_si256(output_mm + 10, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 10),
_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4))));
_mm256_storeu_si256(output_mm + 11, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 11),
_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4))));
}
else
{
_mm256_storeu_si256(output_mm + 8, _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
_mm256_storeu_si256(output_mm + 9, _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
_mm256_storeu_si256(output_mm + 10, _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
_mm256_storeu_si256(output_mm + 11, _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
}
if (input_mm)
{
_mm256_storeu_si256(output_mm + 12, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 12),
_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4))));
_mm256_storeu_si256(output_mm + 13, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 13),
_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4))));
_mm256_storeu_si256(output_mm + 14, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 14),
_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4))));
_mm256_storeu_si256(output_mm + 15, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 15),
_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4))));
}
else
{
_mm256_storeu_si256(output_mm + 12, _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
_mm256_storeu_si256(output_mm + 13, _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
_mm256_storeu_si256(output_mm + 14, _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
_mm256_storeu_si256(output_mm + 15, _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
}
}
NAMESPACE_END
#endif // CRYPTOPP_AVX2_AVAILABLE
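
The RotateLeft<8> and RotateLeft<16> specializations near the top of this file replace the generic shift-and-OR rotate with a single _mm256_shuffle_epi8, because rotating a 32-bit word by a multiple of 8 bits is just a byte permutation. A minimal scalar sketch of that equivalence (illustration only, not part of the commit):

// Rotating a 32-bit word left by 8 moves every byte up one position;
// the byte-shuffle masks above encode exactly this, per 32-bit lane.
#include <cassert>
#include <cstdint>

static uint32_t RotL(uint32_t v, unsigned int r)
{
    return (v << r) | (v >> (32 - r));
}

int main()
{
    const uint32_t v = 0x01020304u;
    assert(RotL(v, 8)  == 0x02030401u);  // one-byte move per lane
    assert(RotL(v, 16) == 0x03040102u);  // two-byte move per lane
    return 0;
}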

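In the keystream loop above, each __m256i holds one row of two different ChaCha blocks, one block per 128-bit lane, so the final stores use _mm256_permute2x128_si256 to regroup lanes into contiguous 64-byte output blocks. A small standalone sketch of what the two control values select (illustration only; compile with -mavx2 and run on an AVX2-capable CPU):

#include <immintrin.h>
#include <cstdio>

int main()
{
    // a: low lane {0,1}, high lane {10,11}; b: low lane {20,21}, high lane {30,31}
    const __m256i a = _mm256_set_epi64x(11, 10, 1, 0);
    const __m256i b = _mm256_set_epi64x(31, 30, 21, 20);

    // 0 + (2 << 4) concatenates the two low lanes, 1 + (3 << 4) the two high lanes
    const __m256i lows  = _mm256_permute2x128_si256(a, b, 0 + (2 << 4));
    const __m256i highs = _mm256_permute2x128_si256(a, b, 1 + (3 << 4));

    long long v[4];
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(v), lows);
    std::printf("%lld %lld %lld %lld\n", v[0], v[1], v[2], v[3]);  // 0 1 20 21
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(v), highs);
    std::printf("%lld %lld %lld %lld\n", v[0], v[1], v[2], v[3]);  // 10 11 30 31
    return 0;
}
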
chacha.cpp

@ -20,6 +20,10 @@ extern void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input,
extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
#if (CRYPTOPP_AVX2_AVAILABLE)
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
#endif
@ -62,17 +66,25 @@ std::string ChaCha_Policy::AlgorithmName() const
std::string ChaCha_Policy::AlgorithmProvider() const
{
#if (CRYPTOPP_AVX2_AVAILABLE)
if (HasAVX2())
return "AVX2";
else
#endif
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
if (HasSSE2())
return "SSE2";
else
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
return "NEON";
else
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return "Power8";
else
#endif
return "C++";
}
@ -117,11 +129,17 @@ void ChaCha_Policy::SeekToIteration(lword iterationCount)
unsigned int ChaCha_Policy::GetAlignment() const
{
#if (CRYPTOPP_AVX2_AVAILABLE)
if (HasAVX2())
return 16;
else
#endif
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
if (HasSSE2())
return 16;
else
#elif (CRYPTOPP_POWER8_AVAILABLE)
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return 16;
else
@ -131,6 +149,11 @@ unsigned int ChaCha_Policy::GetAlignment() const
unsigned int ChaCha_Policy::GetOptimalBlockSize() const
{
#if (CRYPTOPP_AVX2_AVAILABLE)
if (HasAVX2())
return 8 * BYTES_PER_ITERATION;
else
#endif
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
if (HasSSE2())
return 4*BYTES_PER_ITERATION;
@ -149,10 +172,9 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
return BYTES_PER_ITERATION;
}
bool ChaCha_Policy::MultiBlockSafe() const
bool ChaCha_Policy::MultiBlockSafe(unsigned int blocks) const
{
const word32 c = m_state[12];
return 0xffffffff - c > 4;
return 0xffffffff - m_state[12] > blocks;
}
// OperateKeystream always produces a key stream. The key stream is written
@ -163,10 +185,30 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
{
do
{
#if (CRYPTOPP_AVX2_AVAILABLE)
if (HasAVX2())
{
while (iterationCount >= 8 && MultiBlockSafe(8))
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_AVX2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
// MultiBlockSafe avoids overflow on the counter words
m_state[12] += 8;
//if (m_state[12] < 8)
// m_state[13]++;
input += (!!xorInput) * 8 * BYTES_PER_ITERATION;
output += 8 * BYTES_PER_ITERATION;
iterationCount -= 8;
}
}
#endif
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
if (HasSSE2())
{
while (iterationCount >= 4 && MultiBlockSafe())
while (iterationCount >= 4 && MultiBlockSafe(4))
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
@ -186,7 +228,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
{
while (iterationCount >= 4 && MultiBlockSafe())
while (iterationCount >= 4 && MultiBlockSafe(4))
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
@ -206,7 +248,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
{
while (iterationCount >= 4 && MultiBlockSafe())
while (iterationCount >= 4 && MultiBlockSafe(4))
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);

chacha.h

@ -56,7 +56,7 @@ protected:
// during addition in an intermediate result. Conditions that trigger the
// issue include a user seeking to around 2^32 blocks (256 GB of data).
// https://github.com/weidai11/cryptopp/issues/732
bool MultiBlockSafe() const;
inline bool MultiBlockSafe(unsigned int blocks) const;
FixedSizeAlignedSecBlock<word32, 16> m_state;
int m_rounds;
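
The comment above describes the hazard the new MultiBlockSafe(blocks) parameter guards against: the low counter word must have enough headroom to absorb a whole SIMD batch without wrapping. A minimal standalone sketch that mirrors the expression used in chacha.cpp (names are illustrative, not library API):

#include <cassert>
#include <cstdint>

// True when 'blocks' more increments of the low counter word cannot wrap it.
static bool MultiBlockSafeSketch(uint32_t counterLow, unsigned int blocks)
{
    return 0xffffffffu - counterLow > blocks;
}

int main()
{
    assert(MultiBlockSafeSketch(0u, 8));            // headroom for an 8-block AVX2 batch
    assert(!MultiBlockSafeSketch(0xfffffffcu, 8));  // the batch would wrap the counter
    return 0;
}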

cham-simd.cpp

@ -17,6 +17,12 @@
// #undef CRYPTOPP_SSSE3_AVAILABLE
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
#if defined(CRYPTOPP_SSE2_AVAILABLE)
# define CRYPTOPP_AVX512_ROTATE 1
# include <xmmintrin.h>
# include <emmintrin.h>
#endif
#if (CRYPTOPP_SSSE3_AVAILABLE)
# include <pmmintrin.h>
# include <tmmintrin.h>
@ -26,11 +32,6 @@
# include <ammintrin.h>
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__)
# define CRYPTOPP_AVX512_ROTATE 1
# include <immintrin.h>
#endif
// Squash MS LNK4221 and libtool warnings
extern const char CHAM_SIMD_FNAME[] = __FILE__;

config.h

@ -484,6 +484,11 @@ NAMESPACE_END
# define CRYPTOPP_DISABLE_ASM 1
#endif
// SunCC prior to 5.10 cannot handle some SSE intrinsics
#if defined(__SUNPRO_CC) && (__SUNPRO_CC < 0x5100)
# define CRYPTOPP_DISABLE_ASM 1
#endif
// Sun Studio 12 provides GCC-style inline assembly, http://blogs.oracle.com/x86be/entry/gcc_style_asm_inlining_support
// We can enable SSE2 for Sun Studio in the makefile with -D__SSE2__, but users may not compile with it.
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(__SSE2__) && defined(__x86_64__) && (__SUNPRO_CC >= 0x5100)
@ -563,6 +568,22 @@ NAMESPACE_END
#define CRYPTOPP_AESNI_AVAILABLE 1
#endif
// Requires Binutils 2.24
#if !defined(CRYPTOPP_DISABLE_AVX) && defined(CRYPTOPP_SSE42_AVAILABLE) && \
(defined(__AVX2__) || (CRYPTOPP_MSC_VERSION >= 1800) || (__SUNPRO_CC >= 0x5130) || \
(CRYPTOPP_GCC_VERSION >= 40700) || (__INTEL_COMPILER >= 1400) || \
(CRYPTOPP_LLVM_CLANG_VERSION >= 30100) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40600))
#define CRYPTOPP_AVX_AVAILABLE 1
#endif
// Requires Binutils 2.24
#if !defined(CRYPTOPP_DISABLE_AVX2) && defined(CRYPTOPP_AVX_AVAILABLE) && \
(defined(__AVX2__) || (CRYPTOPP_MSC_VERSION >= 1800) || (__SUNPRO_CC >= 0x5130) || \
(CRYPTOPP_GCC_VERSION >= 40700) || (__INTEL_COMPILER >= 1400) || \
(CRYPTOPP_LLVM_CLANG_VERSION >= 30100) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40600))
#define CRYPTOPP_AVX2_AVAILABLE 1
#endif
// Guessing at SHA for SunCC. It's not in Sun Studio 12.6. Also see
// http://stackoverflow.com/questions/45872180/which-xarch-for-sha-extensions-on-solaris
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHANI) && defined(CRYPTOPP_SSE42_AVAILABLE) && \

cryptest.nmake

@ -55,8 +55,8 @@ LIB_SRCS = \
algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp \
authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2s-simd.cpp \
blake2b-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \
cbcmac.cpp ccm.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp channels.cpp \
cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \
cbcmac.cpp ccm.cpp chacha-avx.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp \
channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \
dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp \
emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp \
fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp \
@ -83,8 +83,8 @@ LIB_OBJS = \
algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj \
authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2s-simd.obj \
blake2b-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \
cbcmac.obj ccm.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj channels.obj \
cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \
cbcmac.obj ccm.obj chacha-avx.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj \
channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \
dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj \
emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj \
fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj \

cryptlib.vcxproj

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Microsoft documentation for VCXPROJ file format is located at -->
<!-- the following URL. The documentation leaves a lot to be desired. -->
@ -193,6 +193,10 @@
<ClCompile Include="ccm.cpp" />
<ClCompile Include="chacha.cpp" />
<ClCompile Include="chacha-simd.cpp" />
<ClCompile Include="chacha-avx.cpp">
<!-- Requires Visual Studio 2013 and above -->
<ExcludedFromBuild Condition=" '$(PlatformToolset)' == 'v100' Or '$(PlatformToolset)' == 'v110' ">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="cham.cpp" />
<ClCompile Include="cham-simd.cpp" />
<ClCompile Include="channels.cpp" />

cryptlib.vcxproj.filters

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Header Files">
@ -92,6 +92,9 @@
<ClCompile Include="chacha-simd.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="chacha-avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="cham.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -986,5 +989,7 @@
<CustomBuild Include="adhoc.cpp.proto">
<Filter>Miscellaneous</Filter>
</CustomBuild>
<CustomBuild Include="x64dll.asm" />
<CustomBuild Include="x64masm.asm" />
</ItemGroup>
</Project>