Add ChaCha AVX2 implementation (GH #735)
parent
9b31bc189c
commit
d9011f07d2
|
|
@ -52,6 +52,7 @@ cbcmac.h
|
||||||
ccm.cpp
|
ccm.cpp
|
||||||
ccm.h
|
ccm.h
|
||||||
chacha.cpp
|
chacha.cpp
|
||||||
|
chacha-avx.cpp
|
||||||
chacha-simd.cpp
|
chacha-simd.cpp
|
||||||
chacha.h
|
chacha.h
|
||||||
cham.cpp
|
cham.cpp
|
||||||
|
|
|
||||||
39
GNUmakefile
39
GNUmakefile
|
|
@ -227,9 +227,10 @@ ifeq ($(HAVE_GAS)$(GAS219_OR_LATER),10)
|
||||||
CXXFLAGS += -DCRYPTOPP_DISABLE_AESNI
|
CXXFLAGS += -DCRYPTOPP_DISABLE_AESNI
|
||||||
else
|
else
|
||||||
ifeq ($(HAVE_GAS)$(GAS224_OR_LATER),10)
|
ifeq ($(HAVE_GAS)$(GAS224_OR_LATER),10)
|
||||||
|
CXXFLAGS += -DCRYPTOPP_DISABLE_AVX
|
||||||
CXXFLAGS += -DCRYPTOPP_DISABLE_SHANI
|
CXXFLAGS += -DCRYPTOPP_DISABLE_SHANI
|
||||||
|
|
||||||
endif # -DCRYPTOPP_DISABLE_SHANI
|
endif # -DCRYPTOPP_DISABLE_AVX and SHANI
|
||||||
endif # -DCRYPTOPP_DISABLE_AESNI
|
endif # -DCRYPTOPP_DISABLE_AESNI
|
||||||
endif # -DCRYPTOPP_DISABLE_SSE4
|
endif # -DCRYPTOPP_DISABLE_SSE4
|
||||||
endif # -DCRYPTOPP_DISABLE_SSSE3
|
endif # -DCRYPTOPP_DISABLE_SSSE3
|
||||||
|
|
@ -276,26 +277,33 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_AESNI,$(CXXFLAGS)),)
|
||||||
AES_FLAG = -msse4.1 -maes
|
AES_FLAG = -msse4.1 -maes
|
||||||
SM4_FLAG = -mssse3 -maes
|
SM4_FLAG = -mssse3 -maes
|
||||||
endif
|
endif
|
||||||
|
# Skip the AVX2 probe when AVX has been disabled. findstring is a substring
# match, so testing for -DCRYPTOPP_DISABLE_AVX honors both the
# -DCRYPTOPP_DISABLE_AVX define added above when GAS224_OR_LATER is 0 and an
# explicit -DCRYPTOPP_DISABLE_AVX2 supplied by the user.
ifeq ($(findstring -DCRYPTOPP_DISABLE_AVX,$(CXXFLAGS)),)
|
||||||
|
HAVE_AVX2 = $(shell $(CXX) $(CXXFLAGS) -mavx2 -dM -E pch.cpp 2>&1 | $(GREP) -i -c __AVX2__)
|
||||||
|
ifeq ($(HAVE_AVX2),1)
|
||||||
|
CHACHA_AVX2_FLAG = -mavx2
|
||||||
|
endif
|
||||||
ifeq ($(findstring -DCRYPTOPP_DISABLE_SHANI,$(CXXFLAGS)),)
|
ifeq ($(findstring -DCRYPTOPP_DISABLE_SHANI,$(CXXFLAGS)),)
|
||||||
HAVE_SHA = $(shell $(CXX) $(CXXFLAGS) -msse4.2 -msha -dM -E pch.cpp 2>&1 | $(GREP) -i -c __SHA__)
|
HAVE_SHA = $(shell $(CXX) $(CXXFLAGS) -msse4.2 -msha -dM -E pch.cpp 2>&1 | $(GREP) -i -c __SHA__)
|
||||||
ifeq ($(HAVE_SHA),1)
|
ifeq ($(HAVE_SHA),1)
|
||||||
SHA_FLAG = -msse4.2 -msha
|
SHA_FLAG = -msse4.2 -msha
|
||||||
endif
|
endif
|
||||||
endif # -DCRYPTOPP_DISABLE_SHANI
|
endif # -DCRYPTOPP_DISABLE_SHANI
|
||||||
|
endif # -DCRYPTOPP_DISABLE_AVX2
|
||||||
endif # -DCRYPTOPP_DISABLE_AESNI
|
endif # -DCRYPTOPP_DISABLE_AESNI
|
||||||
endif # -DCRYPTOPP_DISABLE_SSE4
|
endif # -DCRYPTOPP_DISABLE_SSE4
|
||||||
endif # -DCRYPTOPP_DISABLE_SSSE3
|
endif # -DCRYPTOPP_DISABLE_SSSE3
|
||||||
|
|
||||||
# Begin SunCC
|
# Begin SunCC
|
||||||
ifeq ($(SUN_COMPILER),1)
|
ifeq ($(SUN_COMPILER),1)
|
||||||
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
|
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
|
||||||
ifeq ($(COUNT),0)
|
ifeq ($(COUNT),0)
|
||||||
AES_FLAG = -xarch=sse2 -D__SSE2__=1
|
AES_FLAG = -xarch=sse2 -D__SSE2__=1
|
||||||
|
CHACHA_FLAG = -xarch=sse2 -D__SSE2__=1
|
||||||
GCM_FLAG = -xarch=sse2 -D__SSE2__=1
|
GCM_FLAG = -xarch=sse2 -D__SSE2__=1
|
||||||
SHA_FLAG = -xarch=sse2 -D__SSE2__=1
|
SHA_FLAG = -xarch=sse2 -D__SSE2__=1
|
||||||
LDFLAGS += -xarch=sse2
|
LDFLAGS += -xarch=sse2
|
||||||
endif
|
endif
|
||||||
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=ssse3 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
|
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=ssse3 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
|
||||||
ifeq ($(COUNT),0)
|
ifeq ($(COUNT),0)
|
||||||
SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1
|
SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1
|
||||||
ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1
|
ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1
|
||||||
|
|
@ -308,7 +316,7 @@ ifeq ($(SUN_COMPILER),1)
|
||||||
SPECK128_FLAG = -xarch=ssse3 -D__SSSE3__=1
|
SPECK128_FLAG = -xarch=ssse3 -D__SSSE3__=1
|
||||||
LDFLAGS += -xarch=ssse3
|
LDFLAGS += -xarch=ssse3
|
||||||
endif
|
endif
|
||||||
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
|
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
|
||||||
ifeq ($(COUNT),0)
|
ifeq ($(COUNT),0)
|
||||||
BLAKE2B_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
|
BLAKE2B_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
|
||||||
BLAKE2S_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
|
BLAKE2S_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
|
||||||
|
|
@ -316,19 +324,28 @@ ifeq ($(SUN_COMPILER),1)
|
||||||
SPECK64_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
|
SPECK64_FLAG = -xarch=sse4_1 -D__SSE4_1__=1
|
||||||
LDFLAGS += -xarch=sse4_1
|
LDFLAGS += -xarch=sse4_1
|
||||||
endif
|
endif
|
||||||
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
|
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
|
||||||
ifeq ($(COUNT),0)
|
ifeq ($(COUNT),0)
|
||||||
CRC_FLAG = -xarch=sse4_2 -D__SSE4_2__=1
|
CRC_FLAG = -xarch=sse4_2 -D__SSE4_2__=1
|
||||||
LDFLAGS += -xarch=sse4_2
|
LDFLAGS += -xarch=sse4_2
|
||||||
endif
|
endif
|
||||||
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=aes -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
|
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=aes -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
|
||||||
ifeq ($(COUNT),0)
|
ifeq ($(COUNT),0)
|
||||||
GCM_FLAG = -xarch=aes -D__PCLMUL__=1
|
GCM_FLAG = -xarch=aes -D__PCLMUL__=1
|
||||||
AES_FLAG = -xarch=aes -D__AES__=1
|
AES_FLAG = -xarch=aes -D__AES__=1
|
||||||
SM4_FLAG = -xarch=aes -D__AES__=1
|
SM4_FLAG = -xarch=aes -D__AES__=1
|
||||||
LDFLAGS += -xarch=aes
|
LDFLAGS += -xarch=aes
|
||||||
endif
|
endif
|
||||||
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sha -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
|
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=avx -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
|
||||||
|
ifeq ($(COUNT),0)
|
||||||
|
LDFLAGS += -xarch=avx
|
||||||
|
endif
|
||||||
|
# SunCC AVX2 probe. Preprocess pch.cpp with -xarch=avx2 and count the output
# lines that contain "illegal" (case-insensitive). A count of 0 is taken to
# mean the option was accepted; only then are the AVX2 compile flags for
# chacha-avx.cpp set and -xarch=avx2 appended to LDFLAGS.
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=avx2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
|
||||||
|
ifeq ($(COUNT),0)
|
||||||
|
CHACHA_AVX2_FLAG = -xarch=avx2 -D__AVX2__=1
|
||||||
|
LDFLAGS += -xarch=avx2
|
||||||
|
endif
|
||||||
|
COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sha -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
|
||||||
ifeq ($(COUNT),0)
|
ifeq ($(COUNT),0)
|
||||||
SHA_FLAG = -xarch=sha -D__SHA__=1
|
SHA_FLAG = -xarch=sha -D__SHA__=1
|
||||||
LDFLAGS += -xarch=sha
|
LDFLAGS += -xarch=sha
|
||||||
|
|
@ -646,8 +663,8 @@ ifneq ($(SUN_COMPILER),0) # override flags for CC Sun C++ compiler
|
||||||
CXXFLAGS += -template=no%extdef
|
CXXFLAGS += -template=no%extdef
|
||||||
SUN_CC10_BUGGY := $(shell $(CXX) -V 2>&1 | $(GREP) -c -E "CC: Sun .* 5\.10 .* (2009|2010/0[1-4])")
|
SUN_CC10_BUGGY := $(shell $(CXX) -V 2>&1 | $(GREP) -c -E "CC: Sun .* 5\.10 .* (2009|2010/0[1-4])")
|
||||||
ifneq ($(SUN_CC10_BUGGY),0)
|
ifneq ($(SUN_CC10_BUGGY),0)
|
||||||
# -DCRYPTOPP_INCLUDE_VECTOR_CC is needed for Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21 and was fixed in May 2010
|
# -DCRYPTOPP_INCLUDE_VECTOR_CC is needed for Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21
|
||||||
# remove it if you get "already had a body defined" errors in vector.cc
|
# and was fixed in May 2010. Remove it if you get "already had a body defined" errors in vector.cc
|
||||||
CXXFLAGS += -DCRYPTOPP_INCLUDE_VECTOR_CC
|
CXXFLAGS += -DCRYPTOPP_INCLUDE_VECTOR_CC
|
||||||
endif
|
endif
|
||||||
AR = $(CXX)
|
AR = $(CXX)
|
||||||
|
|
@ -1197,6 +1214,10 @@ blake2b-simd.o : blake2b-simd.cpp
|
||||||
chacha-simd.o : chacha-simd.cpp
|
chacha-simd.o : chacha-simd.cpp
|
||||||
$(CXX) $(strip $(CXXFLAGS) $(CHACHA_FLAG) -c) $<
|
$(CXX) $(strip $(CXXFLAGS) $(CHACHA_FLAG) -c) $<
|
||||||
|
|
||||||
|
# AVX2 available. chacha-avx.cpp gets its own rule so that the flags in
# CHACHA_AVX2_FLAG are added for this one object only; the variable is
# empty unless one of the AVX2 probes set it.
|
||||||
|
chacha-avx.o : chacha-avx.cpp
|
||||||
|
$(CXX) $(strip $(CXXFLAGS) $(CHACHA_AVX2_FLAG) -c) $<
|
||||||
|
|
||||||
# SSSE3 available
|
# SSSE3 available
|
||||||
cham-simd.o : cham-simd.cpp
|
cham-simd.o : cham-simd.cpp
|
||||||
$(CXX) $(strip $(CXXFLAGS) $(CHAM_FLAG) -c) $<
|
$(CXX) $(strip $(CXXFLAGS) $(CHAM_FLAG) -c) $<
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,362 @@
|
||||||
|
// chacha-avx.cpp - written and placed in the public domain by
|
||||||
|
// Jack Lloyd and Jeffrey Walton
|
||||||
|
//
|
||||||
|
// This source file uses intrinsics and built-ins to gain access to
|
||||||
|
// AVX2 instructions. A separate
|
||||||
|
// source file is needed because additional CXXFLAGS are required to enable
|
||||||
|
// the appropriate instructions sets in some build configurations.
|
||||||
|
//
|
||||||
|
// AVX implementation based on Botan's chacha_avx.cpp. Many thanks
|
||||||
|
// to Jack Lloyd and the Botan team for allowing us to use it.
|
||||||
|
//
|
||||||
|
// Here are some relative numbers for ChaCha8:
|
||||||
|
// * Intel Skylake, 3.0 GHz: AVX2 at 4385 MB/s; 0.59 cpb.
|
||||||
|
// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.
|
||||||
|
|
||||||
|
#include "pch.h"
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
|
#include "chacha.h"
|
||||||
|
#include "misc.h"
|
||||||
|
|
||||||
|
#if defined(CRYPTOPP_AVX2_AVAILABLE)
|
||||||
|
# include <xmmintrin.h>
|
||||||
|
# include <emmintrin.h>
|
||||||
|
# include <immintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Squash MS LNK4221 and libtool warnings
|
||||||
|
extern const char CHACHA_AVX_FNAME[] = __FILE__;
|
||||||
|
|
||||||
|
// Sun Studio 12.4 OK, 12.5 and 12.6 error.
|
||||||
|
#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
|
||||||
|
# define MAYBE_CONST
|
||||||
|
#else
|
||||||
|
# define MAYBE_CONST const
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (CRYPTOPP_AVX2_AVAILABLE)
|
||||||
|
|
||||||
|
ANONYMOUS_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
template <unsigned int R>
|
||||||
|
inline __m256i RotateLeft(const __m256i val)
|
||||||
|
{
|
||||||
|
return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline __m256i RotateLeft<8>(const __m256i val)
|
||||||
|
{
|
||||||
|
const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
|
||||||
|
14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
|
||||||
|
return _mm256_shuffle_epi8(val, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
inline __m256i RotateLeft<16>(const __m256i val)
|
||||||
|
{
|
||||||
|
const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
|
||||||
|
13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
|
||||||
|
return _mm256_shuffle_epi8(val, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
ANONYMOUS_NAMESPACE_END
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(CryptoPP)
|
||||||
|
|
||||||
|
void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
|
||||||
|
{
|
||||||
|
MAYBE_CONST __m128i* state_mm = (MAYBE_CONST __m128i*)(state);
|
||||||
|
MAYBE_CONST __m256i* input_mm = (MAYBE_CONST __m256i*)(input);
|
||||||
|
__m256i* output_mm = reinterpret_cast<__m256i*>(output);
|
||||||
|
|
||||||
|
const __m256i state0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 0));
|
||||||
|
const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 1));
|
||||||
|
const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 2));
|
||||||
|
const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 3));
|
||||||
|
|
||||||
|
const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
|
||||||
|
const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
|
||||||
|
const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
|
||||||
|
const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);
|
||||||
|
|
||||||
|
__m256i X0_0 = state0;
|
||||||
|
__m256i X0_1 = state1;
|
||||||
|
__m256i X0_2 = state2;
|
||||||
|
__m256i X0_3 = _mm256_add_epi64(state3, CTR0);
|
||||||
|
|
||||||
|
__m256i X1_0 = state0;
|
||||||
|
__m256i X1_1 = state1;
|
||||||
|
__m256i X1_2 = state2;
|
||||||
|
__m256i X1_3 = _mm256_add_epi64(state3, CTR1);
|
||||||
|
|
||||||
|
__m256i X2_0 = state0;
|
||||||
|
__m256i X2_1 = state1;
|
||||||
|
__m256i X2_2 = state2;
|
||||||
|
__m256i X2_3 = _mm256_add_epi64(state3, CTR2);
|
||||||
|
|
||||||
|
__m256i X3_0 = state0;
|
||||||
|
__m256i X3_1 = state1;
|
||||||
|
__m256i X3_2 = state2;
|
||||||
|
__m256i X3_3 = _mm256_add_epi64(state3, CTR3);
|
||||||
|
|
||||||
|
for (int i = static_cast<int>(rounds); i > 0; i -= 2)
|
||||||
|
{
|
||||||
|
X0_0 = _mm256_add_epi32(X0_0, X0_1);
|
||||||
|
X1_0 = _mm256_add_epi32(X1_0, X1_1);
|
||||||
|
X2_0 = _mm256_add_epi32(X2_0, X2_1);
|
||||||
|
X3_0 = _mm256_add_epi32(X3_0, X3_1);
|
||||||
|
|
||||||
|
X0_3 = _mm256_xor_si256(X0_3, X0_0);
|
||||||
|
X1_3 = _mm256_xor_si256(X1_3, X1_0);
|
||||||
|
X2_3 = _mm256_xor_si256(X2_3, X2_0);
|
||||||
|
X3_3 = _mm256_xor_si256(X3_3, X3_0);
|
||||||
|
|
||||||
|
X0_3 = RotateLeft<16>(X0_3);
|
||||||
|
X1_3 = RotateLeft<16>(X1_3);
|
||||||
|
X2_3 = RotateLeft<16>(X2_3);
|
||||||
|
X3_3 = RotateLeft<16>(X3_3);
|
||||||
|
|
||||||
|
X0_2 = _mm256_add_epi32(X0_2, X0_3);
|
||||||
|
X1_2 = _mm256_add_epi32(X1_2, X1_3);
|
||||||
|
X2_2 = _mm256_add_epi32(X2_2, X2_3);
|
||||||
|
X3_2 = _mm256_add_epi32(X3_2, X3_3);
|
||||||
|
|
||||||
|
X0_1 = _mm256_xor_si256(X0_1, X0_2);
|
||||||
|
X1_1 = _mm256_xor_si256(X1_1, X1_2);
|
||||||
|
X2_1 = _mm256_xor_si256(X2_1, X2_2);
|
||||||
|
X3_1 = _mm256_xor_si256(X3_1, X3_2);
|
||||||
|
|
||||||
|
X0_1 = RotateLeft<12>(X0_1);
|
||||||
|
X1_1 = RotateLeft<12>(X1_1);
|
||||||
|
X2_1 = RotateLeft<12>(X2_1);
|
||||||
|
X3_1 = RotateLeft<12>(X3_1);
|
||||||
|
|
||||||
|
X0_0 = _mm256_add_epi32(X0_0, X0_1);
|
||||||
|
X1_0 = _mm256_add_epi32(X1_0, X1_1);
|
||||||
|
X2_0 = _mm256_add_epi32(X2_0, X2_1);
|
||||||
|
X3_0 = _mm256_add_epi32(X3_0, X3_1);
|
||||||
|
|
||||||
|
X0_3 = _mm256_xor_si256(X0_3, X0_0);
|
||||||
|
X1_3 = _mm256_xor_si256(X1_3, X1_0);
|
||||||
|
X2_3 = _mm256_xor_si256(X2_3, X2_0);
|
||||||
|
X3_3 = _mm256_xor_si256(X3_3, X3_0);
|
||||||
|
|
||||||
|
X0_3 = RotateLeft<8>(X0_3);
|
||||||
|
X1_3 = RotateLeft<8>(X1_3);
|
||||||
|
X2_3 = RotateLeft<8>(X2_3);
|
||||||
|
X3_3 = RotateLeft<8>(X3_3);
|
||||||
|
|
||||||
|
X0_2 = _mm256_add_epi32(X0_2, X0_3);
|
||||||
|
X1_2 = _mm256_add_epi32(X1_2, X1_3);
|
||||||
|
X2_2 = _mm256_add_epi32(X2_2, X2_3);
|
||||||
|
X3_2 = _mm256_add_epi32(X3_2, X3_3);
|
||||||
|
|
||||||
|
X0_1 = _mm256_xor_si256(X0_1, X0_2);
|
||||||
|
X1_1 = _mm256_xor_si256(X1_1, X1_2);
|
||||||
|
X2_1 = _mm256_xor_si256(X2_1, X2_2);
|
||||||
|
X3_1 = _mm256_xor_si256(X3_1, X3_2);
|
||||||
|
|
||||||
|
X0_1 = RotateLeft<7>(X0_1);
|
||||||
|
X1_1 = RotateLeft<7>(X1_1);
|
||||||
|
X2_1 = RotateLeft<7>(X2_1);
|
||||||
|
X3_1 = RotateLeft<7>(X3_1);
|
||||||
|
|
||||||
|
X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
|
||||||
|
X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));
|
||||||
|
|
||||||
|
X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
|
||||||
|
X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));
|
||||||
|
|
||||||
|
X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
|
||||||
|
X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));
|
||||||
|
|
||||||
|
X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
|
||||||
|
X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));
|
||||||
|
|
||||||
|
X0_0 = _mm256_add_epi32(X0_0, X0_1);
|
||||||
|
X1_0 = _mm256_add_epi32(X1_0, X1_1);
|
||||||
|
X2_0 = _mm256_add_epi32(X2_0, X2_1);
|
||||||
|
X3_0 = _mm256_add_epi32(X3_0, X3_1);
|
||||||
|
|
||||||
|
X0_3 = _mm256_xor_si256(X0_3, X0_0);
|
||||||
|
X1_3 = _mm256_xor_si256(X1_3, X1_0);
|
||||||
|
X2_3 = _mm256_xor_si256(X2_3, X2_0);
|
||||||
|
X3_3 = _mm256_xor_si256(X3_3, X3_0);
|
||||||
|
|
||||||
|
X0_3 = RotateLeft<16>(X0_3);
|
||||||
|
X1_3 = RotateLeft<16>(X1_3);
|
||||||
|
X2_3 = RotateLeft<16>(X2_3);
|
||||||
|
X3_3 = RotateLeft<16>(X3_3);
|
||||||
|
|
||||||
|
X0_2 = _mm256_add_epi32(X0_2, X0_3);
|
||||||
|
X1_2 = _mm256_add_epi32(X1_2, X1_3);
|
||||||
|
X2_2 = _mm256_add_epi32(X2_2, X2_3);
|
||||||
|
X3_2 = _mm256_add_epi32(X3_2, X3_3);
|
||||||
|
|
||||||
|
X0_1 = _mm256_xor_si256(X0_1, X0_2);
|
||||||
|
X1_1 = _mm256_xor_si256(X1_1, X1_2);
|
||||||
|
X2_1 = _mm256_xor_si256(X2_1, X2_2);
|
||||||
|
X3_1 = _mm256_xor_si256(X3_1, X3_2);
|
||||||
|
|
||||||
|
X0_1 = RotateLeft<12>(X0_1);
|
||||||
|
X1_1 = RotateLeft<12>(X1_1);
|
||||||
|
X2_1 = RotateLeft<12>(X2_1);
|
||||||
|
X3_1 = RotateLeft<12>(X3_1);
|
||||||
|
|
||||||
|
X0_0 = _mm256_add_epi32(X0_0, X0_1);
|
||||||
|
X1_0 = _mm256_add_epi32(X1_0, X1_1);
|
||||||
|
X2_0 = _mm256_add_epi32(X2_0, X2_1);
|
||||||
|
X3_0 = _mm256_add_epi32(X3_0, X3_1);
|
||||||
|
|
||||||
|
X0_3 = _mm256_xor_si256(X0_3, X0_0);
|
||||||
|
X1_3 = _mm256_xor_si256(X1_3, X1_0);
|
||||||
|
X2_3 = _mm256_xor_si256(X2_3, X2_0);
|
||||||
|
X3_3 = _mm256_xor_si256(X3_3, X3_0);
|
||||||
|
|
||||||
|
X0_3 = RotateLeft<8>(X0_3);
|
||||||
|
X1_3 = RotateLeft<8>(X1_3);
|
||||||
|
X2_3 = RotateLeft<8>(X2_3);
|
||||||
|
X3_3 = RotateLeft<8>(X3_3);
|
||||||
|
|
||||||
|
X0_2 = _mm256_add_epi32(X0_2, X0_3);
|
||||||
|
X1_2 = _mm256_add_epi32(X1_2, X1_3);
|
||||||
|
X2_2 = _mm256_add_epi32(X2_2, X2_3);
|
||||||
|
X3_2 = _mm256_add_epi32(X3_2, X3_3);
|
||||||
|
|
||||||
|
X0_1 = _mm256_xor_si256(X0_1, X0_2);
|
||||||
|
X1_1 = _mm256_xor_si256(X1_1, X1_2);
|
||||||
|
X2_1 = _mm256_xor_si256(X2_1, X2_2);
|
||||||
|
X3_1 = _mm256_xor_si256(X3_1, X3_2);
|
||||||
|
|
||||||
|
X0_1 = RotateLeft<7>(X0_1);
|
||||||
|
X1_1 = RotateLeft<7>(X1_1);
|
||||||
|
X2_1 = RotateLeft<7>(X2_1);
|
||||||
|
X3_1 = RotateLeft<7>(X3_1);
|
||||||
|
|
||||||
|
X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
|
||||||
|
X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));
|
||||||
|
|
||||||
|
X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
|
||||||
|
X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));
|
||||||
|
|
||||||
|
X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
|
||||||
|
X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));
|
||||||
|
|
||||||
|
X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
|
||||||
|
X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
|
X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
X0_0 = _mm256_add_epi32(X0_0, state0);
|
||||||
|
X0_1 = _mm256_add_epi32(X0_1, state1);
|
||||||
|
X0_2 = _mm256_add_epi32(X0_2, state2);
|
||||||
|
X0_3 = _mm256_add_epi32(X0_3, state3);
|
||||||
|
X0_3 = _mm256_add_epi64(X0_3, CTR0);
|
||||||
|
|
||||||
|
X1_0 = _mm256_add_epi32(X1_0, state0);
|
||||||
|
X1_1 = _mm256_add_epi32(X1_1, state1);
|
||||||
|
X1_2 = _mm256_add_epi32(X1_2, state2);
|
||||||
|
X1_3 = _mm256_add_epi32(X1_3, state3);
|
||||||
|
X1_3 = _mm256_add_epi64(X1_3, CTR1);
|
||||||
|
|
||||||
|
X2_0 = _mm256_add_epi32(X2_0, state0);
|
||||||
|
X2_1 = _mm256_add_epi32(X2_1, state1);
|
||||||
|
X2_2 = _mm256_add_epi32(X2_2, state2);
|
||||||
|
X2_3 = _mm256_add_epi32(X2_3, state3);
|
||||||
|
X2_3 = _mm256_add_epi64(X2_3, CTR2);
|
||||||
|
|
||||||
|
X3_0 = _mm256_add_epi32(X3_0, state0);
|
||||||
|
X3_1 = _mm256_add_epi32(X3_1, state1);
|
||||||
|
X3_2 = _mm256_add_epi32(X3_2, state2);
|
||||||
|
X3_3 = _mm256_add_epi32(X3_3, state3);
|
||||||
|
X3_3 = _mm256_add_epi64(X3_3, CTR3);
|
||||||
|
|
||||||
|
if (input_mm)
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256(output_mm + 0, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 0),
|
||||||
|
_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 1, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 1),
|
||||||
|
_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 2, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 2),
|
||||||
|
_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 3, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 3),
|
||||||
|
_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4))));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256(output_mm + 0, _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 1, _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 2, _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 3, _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (input_mm)
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256(output_mm + 4, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 4),
|
||||||
|
_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 5, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 5),
|
||||||
|
_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 6, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 6),
|
||||||
|
_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 7, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 7),
|
||||||
|
_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4))));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256(output_mm + 4, _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 5, _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 6, _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 7, _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (input_mm)
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256(output_mm + 8, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 8),
|
||||||
|
_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 9, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 9),
|
||||||
|
_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 10, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 10),
|
||||||
|
_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 11, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 11),
|
||||||
|
_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4))));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256(output_mm + 8, _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 9, _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 10, _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 11, _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (input_mm)
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256(output_mm + 12, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 12),
|
||||||
|
_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 13, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 13),
|
||||||
|
_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 14, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 14),
|
||||||
|
_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4))));
|
||||||
|
_mm256_storeu_si256(output_mm + 15, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 15),
|
||||||
|
_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4))));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_mm256_storeu_si256(output_mm + 12, _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 13, _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 14, _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
|
||||||
|
_mm256_storeu_si256(output_mm + 15, _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END
|
||||||
|
|
||||||
|
#endif // CRYPTOPP_AVX2_AVAILABLE
|
||||||
56
chacha.cpp
56
chacha.cpp
|
|
@ -20,6 +20,10 @@ extern void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input,
|
||||||
extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (CRYPTOPP_AVX2_AVAILABLE)
|
||||||
|
extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||||
extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds);
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -62,17 +66,25 @@ std::string ChaCha_Policy::AlgorithmName() const
|
||||||
|
|
||||||
std::string ChaCha_Policy::AlgorithmProvider() const
|
std::string ChaCha_Policy::AlgorithmProvider() const
|
||||||
{
|
{
|
||||||
|
#if (CRYPTOPP_AVX2_AVAILABLE)
|
||||||
|
if (HasAVX2())
|
||||||
|
return "AVX2";
|
||||||
|
else
|
||||||
|
#endif
|
||||||
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
||||||
if (HasSSE2())
|
if (HasSSE2())
|
||||||
return "SSE2";
|
return "SSE2";
|
||||||
|
else
|
||||||
#endif
|
#endif
|
||||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
if (HasNEON())
|
if (HasNEON())
|
||||||
return "NEON";
|
return "NEON";
|
||||||
|
else
|
||||||
#endif
|
#endif
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||||
if (HasPower8())
|
if (HasPower8())
|
||||||
return "Power8";
|
return "Power8";
|
||||||
|
else
|
||||||
#endif
|
#endif
|
||||||
return "C++";
|
return "C++";
|
||||||
}
|
}
|
||||||
|
|
@ -117,11 +129,17 @@ void ChaCha_Policy::SeekToIteration(lword iterationCount)
|
||||||
|
|
||||||
unsigned int ChaCha_Policy::GetAlignment() const
|
unsigned int ChaCha_Policy::GetAlignment() const
|
||||||
{
|
{
|
||||||
|
#if (CRYPTOPP_AVX2_AVAILABLE)
|
||||||
|
if (HasAVX2())
|
||||||
|
return 16;
|
||||||
|
else
|
||||||
|
#endif
|
||||||
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
||||||
if (HasSSE2())
|
if (HasSSE2())
|
||||||
return 16;
|
return 16;
|
||||||
else
|
else
|
||||||
#elif (CRYPTOPP_POWER8_AVAILABLE)
|
#endif
|
||||||
|
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||||
if (HasPower8())
|
if (HasPower8())
|
||||||
return 16;
|
return 16;
|
||||||
else
|
else
|
||||||
|
|
@ -131,6 +149,11 @@ unsigned int ChaCha_Policy::GetAlignment() const
|
||||||
|
|
||||||
unsigned int ChaCha_Policy::GetOptimalBlockSize() const
|
unsigned int ChaCha_Policy::GetOptimalBlockSize() const
|
||||||
{
|
{
|
||||||
|
#if (CRYPTOPP_AVX2_AVAILABLE)
|
||||||
|
if (HasAVX2())
|
||||||
|
return 8 * BYTES_PER_ITERATION;
|
||||||
|
else
|
||||||
|
#endif
|
||||||
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
||||||
if (HasSSE2())
|
if (HasSSE2())
|
||||||
return 4*BYTES_PER_ITERATION;
|
return 4*BYTES_PER_ITERATION;
|
||||||
|
|
@ -149,10 +172,9 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
|
||||||
return BYTES_PER_ITERATION;
|
return BYTES_PER_ITERATION;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ChaCha_Policy::MultiBlockSafe() const
|
bool ChaCha_Policy::MultiBlockSafe(unsigned int blocks) const
|
||||||
{
|
{
|
||||||
const word32 c = m_state[12];
|
return 0xffffffff - m_state[12] > blocks;
|
||||||
return 0xffffffff - c > 4;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// OperateKeystream always produces a key stream. The key stream is written
|
// OperateKeystream always produces a key stream. The key stream is written
|
||||||
|
|
@ -163,10 +185,30 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
|
||||||
{
|
{
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
|
#if (CRYPTOPP_AVX2_AVAILABLE)
|
||||||
|
if (HasAVX2())
|
||||||
|
{
|
||||||
|
while (iterationCount >= 8 && MultiBlockSafe(8))
|
||||||
|
{
|
||||||
|
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||||
|
ChaCha_OperateKeystream_AVX2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||||
|
|
||||||
|
// MultiBlockSafe avoids overflow on the counter words
|
||||||
|
m_state[12] += 8;
|
||||||
|
//if (m_state[12] < 8)
|
||||||
|
// m_state[13]++;
|
||||||
|
|
||||||
|
input += (!!xorInput) * 8 * BYTES_PER_ITERATION;
|
||||||
|
output += 8 * BYTES_PER_ITERATION;
|
||||||
|
iterationCount -= 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
||||||
if (HasSSE2())
|
if (HasSSE2())
|
||||||
{
|
{
|
||||||
while (iterationCount >= 4 && MultiBlockSafe())
|
while (iterationCount >= 4 && MultiBlockSafe(4))
|
||||||
{
|
{
|
||||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||||
ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||||
|
|
@ -186,7 +228,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
|
||||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||||
if (HasNEON())
|
if (HasNEON())
|
||||||
{
|
{
|
||||||
while (iterationCount >= 4 && MultiBlockSafe())
|
while (iterationCount >= 4 && MultiBlockSafe(4))
|
||||||
{
|
{
|
||||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||||
ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||||
|
|
@ -206,7 +248,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
|
||||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||||
if (HasPower8())
|
if (HasPower8())
|
||||||
{
|
{
|
||||||
while (iterationCount >= 4 && MultiBlockSafe())
|
while (iterationCount >= 4 && MultiBlockSafe(4))
|
||||||
{
|
{
|
||||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||||
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||||
|
|
|
||||||
2
chacha.h
2
chacha.h
|
|
@ -56,7 +56,7 @@ protected:
|
||||||
// during addition in an intermediate result. Conditions to trigger
|
// during addition in an intermediate result. Conditions to trigger
|
||||||
// issue include a user seeks to around 2^32 blocks (256 GB of data).
|
// issue include a user seeks to around 2^32 blocks (256 GB of data).
|
||||||
// https://github.com/weidai11/cryptopp/issues/732
|
// https://github.com/weidai11/cryptopp/issues/732
|
||||||
bool MultiBlockSafe() const;
|
inline bool MultiBlockSafe(unsigned int blocks) const;
|
||||||
|
|
||||||
FixedSizeAlignedSecBlock<word32, 16> m_state;
|
FixedSizeAlignedSecBlock<word32, 16> m_state;
|
||||||
int m_rounds;
|
int m_rounds;
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,12 @@
|
||||||
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
// #undef CRYPTOPP_SSSE3_AVAILABLE
|
||||||
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
// #undef CRYPTOPP_ARM_NEON_AVAILABLE
|
||||||
|
|
||||||
|
#if defined(CRYPTOPP_SSE2_AVAILABLE)
|
||||||
|
# define CRYPTOPP_AVX512_ROTATE 1
|
||||||
|
# include <xmmintrin.h>
|
||||||
|
# include <emmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
#if (CRYPTOPP_SSSE3_AVAILABLE)
|
||||||
# include <pmmintrin.h>
|
# include <pmmintrin.h>
|
||||||
# include <tmmintrin.h>
|
# include <tmmintrin.h>
|
||||||
|
|
@ -26,11 +32,6 @@
|
||||||
# include <ammintrin.h>
|
# include <ammintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__AVX512F__) && defined(__AVX512VL__)
|
|
||||||
# define CRYPTOPP_AVX512_ROTATE 1
|
|
||||||
# include <immintrin.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Squash MS LNK4221 and libtool warnings
|
// Squash MS LNK4221 and libtool warnings
|
||||||
extern const char CHAM_SIMD_FNAME[] = __FILE__;
|
extern const char CHAM_SIMD_FNAME[] = __FILE__;
|
||||||
|
|
||||||
|
|
|
||||||
21
config.h
21
config.h
|
|
@ -484,6 +484,11 @@ NAMESPACE_END
|
||||||
# define CRYPTOPP_DISABLE_ASM 1
|
# define CRYPTOPP_DISABLE_ASM 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// SunCC prior to 5.10 cannot handle some SSE intrinsics
|
||||||
|
#if defined(__SUNPRO_CC) && (__SUNPRO_CC < 0x5100)
|
||||||
|
# define CRYPTOPP_DISABLE_ASM 1
|
||||||
|
#endif
|
||||||
|
|
||||||
// Sun Studio 12 provides GCC inline assembly, http://blogs.oracle.com/x86be/entry/gcc_style_asm_inlining_support
|
// Sun Studio 12 provides GCC inline assembly, http://blogs.oracle.com/x86be/entry/gcc_style_asm_inlining_support
|
||||||
// We can enable SSE2 for Sun Studio in the makefile with -D__SSE2__, but users may not compile with it.
|
// We can enable SSE2 for Sun Studio in the makefile with -D__SSE2__, but users may not compile with it.
|
||||||
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(__SSE2__) && defined(__x86_64__) && (__SUNPRO_CC >= 0x5100)
|
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(__SSE2__) && defined(__x86_64__) && (__SUNPRO_CC >= 0x5100)
|
||||||
|
|
@ -563,6 +568,22 @@ NAMESPACE_END
|
||||||
#define CRYPTOPP_AESNI_AVAILABLE 1
|
#define CRYPTOPP_AESNI_AVAILABLE 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Requires Binutils 2.24
|
||||||
|
#if !defined(CRYPTOPP_DISABLE_AVX) && defined(CRYPTOPP_SSE42_AVAILABLE) && \
|
||||||
|
(defined(__AVX2__) || (CRYPTOPP_MSC_VERSION >= 1800) || (__SUNPRO_CC >= 0x5130) || \
|
||||||
|
(CRYPTOPP_GCC_VERSION >= 40700) || (__INTEL_COMPILER >= 1400) || \
|
||||||
|
(CRYPTOPP_LLVM_CLANG_VERSION >= 30100) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40600))
|
||||||
|
#define CRYPTOPP_AVX_AVAILABLE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Requires Binutils 2.24
|
||||||
|
#if !defined(CRYPTOPP_DISABLE_AVX2) && defined(CRYPTOPP_AVX_AVAILABLE) && \
|
||||||
|
(defined(__AVX2__) || (CRYPTOPP_MSC_VERSION >= 1800) || (__SUNPRO_CC >= 0x5130) || \
|
||||||
|
(CRYPTOPP_GCC_VERSION >= 40700) || (__INTEL_COMPILER >= 1400) || \
|
||||||
|
(CRYPTOPP_LLVM_CLANG_VERSION >= 30100) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40600))
|
||||||
|
#define CRYPTOPP_AVX2_AVAILABLE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
// Guessing at SHA for SunCC. Its not in Sun Studio 12.6. Also see
|
// Guessing at SHA for SunCC. Its not in Sun Studio 12.6. Also see
|
||||||
// http://stackoverflow.com/questions/45872180/which-xarch-for-sha-extensions-on-solaris
|
// http://stackoverflow.com/questions/45872180/which-xarch-for-sha-extensions-on-solaris
|
||||||
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHANI) && defined(CRYPTOPP_SSE42_AVAILABLE) && \
|
#if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHANI) && defined(CRYPTOPP_SSE42_AVAILABLE) && \
|
||||||
|
|
|
||||||
|
|
@ -55,8 +55,8 @@ LIB_SRCS = \
|
||||||
algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp \
|
algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp \
|
||||||
authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2s-simd.cpp \
|
authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2s-simd.cpp \
|
||||||
blake2b-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \
|
blake2b-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \
|
||||||
cbcmac.cpp ccm.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp channels.cpp \
|
cbcmac.cpp ccm.cpp chacha-avx.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp \
|
||||||
cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \
|
channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \
|
||||||
dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp \
|
dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp \
|
||||||
emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp \
|
emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp \
|
||||||
fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp \
|
fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp \
|
||||||
|
|
@ -83,8 +83,8 @@ LIB_OBJS = \
|
||||||
algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj \
|
algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj \
|
||||||
authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2s-simd.obj \
|
authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2s-simd.obj \
|
||||||
blake2b-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \
|
blake2b-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \
|
||||||
cbcmac.obj ccm.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj channels.obj \
|
cbcmac.obj ccm.obj chacha-avx.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj \
|
||||||
cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \
|
channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \
|
||||||
dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj \
|
dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj \
|
||||||
emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj \
|
emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj \
|
||||||
fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj \
|
fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj \
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<!-- Microsoft documentation for VCXPROJ file format is located at -->
|
<!-- Microsoft documentation for VCXPROJ file format is located at -->
|
||||||
<!-- the following URL. The documentation leaves a lot to be desired. -->
|
<!-- the following URL. The documentation leaves a lot to be desired. -->
|
||||||
|
|
@ -193,6 +193,10 @@
|
||||||
<ClCompile Include="ccm.cpp" />
|
<ClCompile Include="ccm.cpp" />
|
||||||
<ClCompile Include="chacha.cpp" />
|
<ClCompile Include="chacha.cpp" />
|
||||||
<ClCompile Include="chacha-simd.cpp" />
|
<ClCompile Include="chacha-simd.cpp" />
|
||||||
|
<ClCompile Include="chacha-avx.cpp">
|
||||||
|
<!-- Requires Visual Studio 2013 and above -->
|
||||||
|
<ExcludedFromBuild Condition=" '$(PlatformToolset)' == 'v100' Or '$(PlatformToolset)' == 'v110' ">true</ExcludedFromBuild>
|
||||||
|
</ClCompile>
|
||||||
<ClCompile Include="cham.cpp" />
|
<ClCompile Include="cham.cpp" />
|
||||||
<ClCompile Include="cham-simd.cpp" />
|
<ClCompile Include="cham-simd.cpp" />
|
||||||
<ClCompile Include="channels.cpp" />
|
<ClCompile Include="channels.cpp" />
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
<?xml version="1.0" encoding="utf-8"?>
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Filter Include="Header Files">
|
<Filter Include="Header Files">
|
||||||
|
|
@ -92,6 +92,9 @@
|
||||||
<ClCompile Include="chacha-simd.cpp">
|
<ClCompile Include="chacha-simd.cpp">
|
||||||
<Filter>Source Files</Filter>
|
<Filter>Source Files</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
<ClCompile Include="chacha-avx.cpp">
|
||||||
|
<Filter>Source Files</Filter>
|
||||||
|
</ClCompile>
|
||||||
<ClCompile Include="cham.cpp">
|
<ClCompile Include="cham.cpp">
|
||||||
<Filter>Source Files</Filter>
|
<Filter>Source Files</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
|
@ -986,5 +989,7 @@
|
||||||
<CustomBuild Include="adhoc.cpp.proto">
|
<CustomBuild Include="adhoc.cpp.proto">
|
||||||
<Filter>Miscellaneous</Filter>
|
<Filter>Miscellaneous</Filter>
|
||||||
</CustomBuild>
|
</CustomBuild>
|
||||||
|
<CustomBuild Include="x64dll.asm" />
|
||||||
|
<CustomBuild Include="x64masm.asm" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
</Project>
|
</Project>
|
||||||
Loading…
Reference in New Issue