From d9011f07d2d5bde9c0710ac481e2b61fd9f51a77 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Thu, 8 Nov 2018 16:20:31 -0500 Subject: [PATCH] Add ChaCha AVX2 implementation (GH #735) --- Filelist.txt | 1 + GNUmakefile | 39 ++++- chacha-avx.cpp | 362 +++++++++++++++++++++++++++++++++++++++ chacha.cpp | 56 +++++- chacha.h | 2 +- cham-simd.cpp | 11 +- config.h | 21 +++ cryptest.nmake | 8 +- cryptlib.vcxproj | 6 +- cryptlib.vcxproj.filters | 7 +- 10 files changed, 485 insertions(+), 28 deletions(-) create mode 100644 chacha-avx.cpp diff --git a/Filelist.txt b/Filelist.txt index 60d37620..ed09654a 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -52,6 +52,7 @@ cbcmac.h ccm.cpp ccm.h chacha.cpp +chacha-avx.cpp chacha-simd.cpp chacha.h cham.cpp diff --git a/GNUmakefile b/GNUmakefile index b07e68bd..d1668514 100755 --- a/GNUmakefile +++ b/GNUmakefile @@ -227,9 +227,10 @@ ifeq ($(HAVE_GAS)$(GAS219_OR_LATER),10) CXXFLAGS += -DCRYPTOPP_DISABLE_AESNI else ifeq ($(HAVE_GAS)$(GAS224_OR_LATER),10) +CXXFLAGS += -DCRYPTOPP_DISABLE_AVX CXXFLAGS += -DCRYPTOPP_DISABLE_SHANI -endif # -DCRYPTOPP_DISABLE_SHANI +endif # -DCRYPTOPP_DISABLE_AVX and SHANI endif # -DCRYPTOPP_DISABLE_AESNI endif # -DCRYPTOPP_DISABLE_SSE4 endif # -DCRYPTOPP_DISABLE_SSSE3 @@ -276,26 +277,33 @@ ifeq ($(findstring -DCRYPTOPP_DISABLE_AESNI,$(CXXFLAGS)),) AES_FLAG = -msse4.1 -maes SM4_FLAG = -mssse3 -maes endif +ifeq ($(findstring -DCRYPTOPP_DISABLE_AVX2,$(CXXFLAGS)),) + HAVE_AVX2 = $(shell $(CXX) $(CXXFLAGS) -mavx2 -dM -E pch.cpp 2>&1 | $(GREP) -i -c __AVX2__) + ifeq ($(HAVE_AVX2),1) + CHACHA_AVX2_FLAG = -mavx2 + endif ifeq ($(findstring -DCRYPTOPP_DISABLE_SHANI,$(CXXFLAGS)),) HAVE_SHA = $(shell $(CXX) $(CXXFLAGS) -msse4.2 -msha -dM -E pch.cpp 2>&1 | $(GREP) -i -c __SHA__) ifeq ($(HAVE_SHA),1) SHA_FLAG = -msse4.2 -msha endif endif # -DCRYPTOPP_DISABLE_SHANI +endif # -DCRYPTOPP_DISABLE_AVX2 endif # -DCRYPTOPP_DISABLE_AESNI endif # -DCRYPTOPP_DISABLE_SSE4 endif # -DCRYPTOPP_DISABLE_SSSE3 # Begin SunCC ifeq ($(SUN_COMPILER),1) - COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") + COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) AES_FLAG = -xarch=sse2 -D__SSE2__=1 + CHACHA_FLAG = -xarch=sse2 -D__SSE2__=1 GCM_FLAG = -xarch=sse2 -D__SSE2__=1 SHA_FLAG = -xarch=sse2 -D__SSE2__=1 LDFLAGS += -xarch=sse2 endif - COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=ssse3 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") + COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=ssse3 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) SSSE3_FLAG = -xarch=ssse3 -D__SSSE3__=1 ARIA_FLAG = -xarch=ssse3 -D__SSSE3__=1 @@ -308,7 +316,7 @@ ifeq ($(SUN_COMPILER),1) SPECK128_FLAG = -xarch=ssse3 -D__SSSE3__=1 LDFLAGS += -xarch=ssse3 endif - COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") + COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_1 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) BLAKE2B_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 BLAKE2S_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 @@ -316,19 +324,28 @@ ifeq ($(SUN_COMPILER),1) SPECK64_FLAG = -xarch=sse4_1 -D__SSE4_1__=1 LDFLAGS += -xarch=sse4_1 endif - COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal") + COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sse4_2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal") ifeq ($(COUNT),0) CRC_FLAG = 
-xarch=sse4_2 -D__SSE4_2__=1
    LDFLAGS += -xarch=sse4_2
  endif
-  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=aes -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
+  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=aes -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
   ifeq ($(COUNT),0)
     GCM_FLAG = -xarch=aes -D__PCLMUL__=1
     AES_FLAG = -xarch=aes -D__AES__=1
     SM4_FLAG = -xarch=aes -D__AES__=1
     LDFLAGS += -xarch=aes
   endif
-  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sha -xdumpmacros /dev/null 2>&1 | $(GREP) -i -c "illegal")
+  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=avx -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
+  ifeq ($(COUNT),0)
+    LDFLAGS += -xarch=avx
+  endif
+  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=avx2 -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
+  ifeq ($(COUNT),0)
+    CHACHA_AVX2_FLAG = -xarch=avx2 -D__AVX2__=1
+    LDFLAGS += -xarch=avx2
+  endif
+  COUNT := $(shell $(CXX) $(CXXFLAGS) -E -xarch=sha -xdumpmacros pch.cpp 2>&1 | $(GREP) -i -c "illegal")
   ifeq ($(COUNT),0)
     SHA_FLAG = -xarch=sha -D__SHA__=1
     LDFLAGS += -xarch=sha
@@ -646,8 +663,8 @@ ifneq ($(SUN_COMPILER),0)	# override flags for CC Sun C++ compiler
 CXXFLAGS += -template=no%extdef
 SUN_CC10_BUGGY := $(shell $(CXX) -V 2>&1 | $(GREP) -c -E "CC: Sun .* 5\.10 .* (2009|2010/0[1-4])")
 ifneq ($(SUN_CC10_BUGGY),0)
-# -DCRYPTOPP_INCLUDE_VECTOR_CC is needed for Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21 and was fixed in May 2010
-# remove it if you get "already had a body defined" errors in vector.cc
+# -DCRYPTOPP_INCLUDE_VECTOR_CC is needed for Sun Studio 12u1 Sun C++ 5.10 SunOS_i386 128229-02 2009/09/21
+# and was fixed in May 2010. Remove it if you get "already had a body defined" errors in vector.cc
 CXXFLAGS += -DCRYPTOPP_INCLUDE_VECTOR_CC
 endif
 AR = $(CXX)
@@ -1197,6 +1214,10 @@ blake2b-simd.o : blake2b-simd.cpp
 chacha-simd.o : chacha-simd.cpp
 	$(CXX) $(strip $(CXXFLAGS) $(CHACHA_FLAG) -c) $<
 
+# AVX2 available
+chacha-avx.o : chacha-avx.cpp
+	$(CXX) $(strip $(CXXFLAGS) $(CHACHA_AVX2_FLAG) -c) $<
+
 # SSSE3 available
 cham-simd.o : cham-simd.cpp
 	$(CXX) $(strip $(CXXFLAGS) $(CHAM_FLAG) -c) $<
diff --git a/chacha-avx.cpp b/chacha-avx.cpp
new file mode 100644
index 00000000..ed5a2458
--- /dev/null
+++ b/chacha-avx.cpp
@@ -0,0 +1,362 @@
+// chacha-avx.cpp - written and placed in the public domain by
+// Jack Lloyd and Jeffrey Walton
+//
+// This source file uses intrinsics and built-ins to gain access to
+// AVX2 instructions. A separate source file is needed because
+// additional CXXFLAGS are required to enable the appropriate
+// instruction sets in some build configurations.
+//
+// AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
+// to Jack Lloyd and the Botan team for allowing us to use it.
+//
+// Here are some relative numbers for ChaCha8:
+// * Intel Skylake, 3.0 GHz: AVX2 at 4385 MB/s; 0.59 cpb.
+// * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.
+
+#include "pch.h"
+#include "config.h"
+
+#include "chacha.h"
+#include "misc.h"
+
+#if defined(CRYPTOPP_AVX2_AVAILABLE)
+# include <xmmintrin.h>
+# include <emmintrin.h>
+# include <immintrin.h>
+#endif
+
+// Squash MS LNK4221 and libtool warnings
+extern const char CHACHA_AVX_FNAME[] = __FILE__;
+
+// Sun Studio 12.4 OK, 12.5 and 12.6 error.
+#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
+# define MAYBE_CONST
+#else
+# define MAYBE_CONST const
+#endif
+
+#if (CRYPTOPP_AVX2_AVAILABLE)
+
+ANONYMOUS_NAMESPACE_BEGIN
+
+template <unsigned int R>
+inline __m256i RotateLeft(const __m256i val)
+{
+    return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
+}
+
+template <>
+inline __m256i RotateLeft<8>(const __m256i val)
+{
+    const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
+        14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+    return _mm256_shuffle_epi8(val, mask);
+}
+
+template <>
+inline __m256i RotateLeft<16>(const __m256i val)
+{
+    const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
+        13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
+    return _mm256_shuffle_epi8(val, mask);
+}
+
+ANONYMOUS_NAMESPACE_END
+
+NAMESPACE_BEGIN(CryptoPP)
+
+void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
+{
+    MAYBE_CONST __m128i* state_mm = (MAYBE_CONST __m128i*)(state);
+    MAYBE_CONST __m256i* input_mm = (MAYBE_CONST __m256i*)(input);
+    __m256i* output_mm = reinterpret_cast<__m256i*>(output);
+
+    const __m256i state0 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 0));
+    const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 1));
+    const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 2));
+    const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_mm + 3));
+
+    const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 4);
+    const __m256i CTR1 = _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 5);
+    const __m256i CTR2 = _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 6);
+    const __m256i CTR3 = _mm256_set_epi32(0, 0, 0, 3, 0, 0, 0, 7);
+
+    __m256i X0_0 = state0;
+    __m256i X0_1 = state1;
+    __m256i X0_2 = state2;
+    __m256i X0_3 = _mm256_add_epi64(state3, CTR0);
+
+    __m256i X1_0 = state0;
+    __m256i X1_1 = state1;
+    __m256i X1_2 = state2;
+    __m256i X1_3 = _mm256_add_epi64(state3, CTR1);
+
+    __m256i X2_0 = state0;
+    __m256i X2_1 = state1;
+    __m256i X2_2 = state2;
+    __m256i X2_3 = _mm256_add_epi64(state3, CTR2);
+
+    __m256i X3_0 = state0;
+    __m256i X3_1 = state1;
+    __m256i X3_2 = state2;
+    __m256i X3_3 = _mm256_add_epi64(state3, CTR3);
+
+    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
+    {
+        X0_0 = _mm256_add_epi32(X0_0, X0_1);
+        X1_0 = _mm256_add_epi32(X1_0, X1_1);
+        X2_0 = _mm256_add_epi32(X2_0, X2_1);
+        X3_0 = _mm256_add_epi32(X3_0, X3_1);
+
+        X0_3 = _mm256_xor_si256(X0_3, X0_0);
+        X1_3 = _mm256_xor_si256(X1_3, X1_0);
+        X2_3 = _mm256_xor_si256(X2_3, X2_0);
+        X3_3 = _mm256_xor_si256(X3_3, X3_0);
+
+        X0_3 = RotateLeft<16>(X0_3);
+        X1_3 = RotateLeft<16>(X1_3);
+        X2_3 = RotateLeft<16>(X2_3);
+        X3_3 = RotateLeft<16>(X3_3);
+
+        X0_2 = _mm256_add_epi32(X0_2, X0_3);
+        X1_2 = _mm256_add_epi32(X1_2, X1_3);
+        X2_2 = _mm256_add_epi32(X2_2, X2_3);
+        X3_2 = _mm256_add_epi32(X3_2, X3_3);
+
+        X0_1 = _mm256_xor_si256(X0_1, X0_2);
+        X1_1 = _mm256_xor_si256(X1_1, X1_2);
+        X2_1 = _mm256_xor_si256(X2_1, X2_2);
+        X3_1 = _mm256_xor_si256(X3_1, X3_2);
+
+        X0_1 = RotateLeft<12>(X0_1);
+        X1_1 = RotateLeft<12>(X1_1);
+        X2_1 = RotateLeft<12>(X2_1);
+        X3_1 = RotateLeft<12>(X3_1);
+
+        X0_0 = _mm256_add_epi32(X0_0, X0_1);
+        X1_0 = _mm256_add_epi32(X1_0, X1_1);
+        X2_0 = _mm256_add_epi32(X2_0, X2_1);
+        X3_0 = _mm256_add_epi32(X3_0, X3_1);
+
+        X0_3 = _mm256_xor_si256(X0_3, X0_0);
+        X1_3 = _mm256_xor_si256(X1_3, X1_0);
+        X2_3 = _mm256_xor_si256(X2_3, X2_0);
+        X3_3 = _mm256_xor_si256(X3_3, X3_0);
+
+        X0_3 =
RotateLeft<8>(X0_3); + X1_3 = RotateLeft<8>(X1_3); + X2_3 = RotateLeft<8>(X2_3); + X3_3 = RotateLeft<8>(X3_3); + + X0_2 = _mm256_add_epi32(X0_2, X0_3); + X1_2 = _mm256_add_epi32(X1_2, X1_3); + X2_2 = _mm256_add_epi32(X2_2, X2_3); + X3_2 = _mm256_add_epi32(X3_2, X3_3); + + X0_1 = _mm256_xor_si256(X0_1, X0_2); + X1_1 = _mm256_xor_si256(X1_1, X1_2); + X2_1 = _mm256_xor_si256(X2_1, X2_2); + X3_1 = _mm256_xor_si256(X3_1, X3_2); + + X0_1 = RotateLeft<7>(X0_1); + X1_1 = RotateLeft<7>(X1_1); + X2_1 = RotateLeft<7>(X2_1); + X3_1 = RotateLeft<7>(X3_1); + + X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1)); + X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); + X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1)); + X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); + X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1)); + X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2)); + X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1)); + X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); + X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X0_0 = _mm256_add_epi32(X0_0, X0_1); + X1_0 = _mm256_add_epi32(X1_0, X1_1); + X2_0 = _mm256_add_epi32(X2_0, X2_1); + X3_0 = _mm256_add_epi32(X3_0, X3_1); + + X0_3 = _mm256_xor_si256(X0_3, X0_0); + X1_3 = _mm256_xor_si256(X1_3, X1_0); + X2_3 = _mm256_xor_si256(X2_3, X2_0); + X3_3 = _mm256_xor_si256(X3_3, X3_0); + + X0_3 = RotateLeft<16>(X0_3); + X1_3 = RotateLeft<16>(X1_3); + X2_3 = RotateLeft<16>(X2_3); + X3_3 = RotateLeft<16>(X3_3); + + X0_2 = _mm256_add_epi32(X0_2, X0_3); + X1_2 = _mm256_add_epi32(X1_2, X1_3); + X2_2 = _mm256_add_epi32(X2_2, X2_3); + X3_2 = _mm256_add_epi32(X3_2, X3_3); + + X0_1 = _mm256_xor_si256(X0_1, X0_2); + X1_1 = _mm256_xor_si256(X1_1, X1_2); + X2_1 = _mm256_xor_si256(X2_1, X2_2); + X3_1 = _mm256_xor_si256(X3_1, X3_2); + + X0_1 = RotateLeft<12>(X0_1); + X1_1 = RotateLeft<12>(X1_1); + X2_1 = RotateLeft<12>(X2_1); + X3_1 = RotateLeft<12>(X3_1); + + X0_0 = _mm256_add_epi32(X0_0, X0_1); + X1_0 = _mm256_add_epi32(X1_0, X1_1); + X2_0 = _mm256_add_epi32(X2_0, X2_1); + X3_0 = _mm256_add_epi32(X3_0, X3_1); + + X0_3 = _mm256_xor_si256(X0_3, X0_0); + X1_3 = _mm256_xor_si256(X1_3, X1_0); + X2_3 = _mm256_xor_si256(X2_3, X2_0); + X3_3 = _mm256_xor_si256(X3_3, X3_0); + + X0_3 = RotateLeft<8>(X0_3); + X1_3 = RotateLeft<8>(X1_3); + X2_3 = RotateLeft<8>(X2_3); + X3_3 = RotateLeft<8>(X3_3); + + X0_2 = _mm256_add_epi32(X0_2, X0_3); + X1_2 = _mm256_add_epi32(X1_2, X1_3); + X2_2 = _mm256_add_epi32(X2_2, X2_3); + X3_2 = _mm256_add_epi32(X3_2, X3_3); + + X0_1 = _mm256_xor_si256(X0_1, X0_2); + X1_1 = _mm256_xor_si256(X1_1, X1_2); + X2_1 = _mm256_xor_si256(X2_1, X2_2); + X3_1 = _mm256_xor_si256(X3_1, X3_2); + + X0_1 = RotateLeft<7>(X0_1); + X1_1 = RotateLeft<7>(X1_1); + X2_1 = RotateLeft<7>(X2_1); + X3_1 = RotateLeft<7>(X3_1); + + X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3)); + X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); + X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3)); + X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); + X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3)); + X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 
3, 2)); + X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3)); + X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); + X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1)); + } + + X0_0 = _mm256_add_epi32(X0_0, state0); + X0_1 = _mm256_add_epi32(X0_1, state1); + X0_2 = _mm256_add_epi32(X0_2, state2); + X0_3 = _mm256_add_epi32(X0_3, state3); + X0_3 = _mm256_add_epi64(X0_3, CTR0); + + X1_0 = _mm256_add_epi32(X1_0, state0); + X1_1 = _mm256_add_epi32(X1_1, state1); + X1_2 = _mm256_add_epi32(X1_2, state2); + X1_3 = _mm256_add_epi32(X1_3, state3); + X1_3 = _mm256_add_epi64(X1_3, CTR1); + + X2_0 = _mm256_add_epi32(X2_0, state0); + X2_1 = _mm256_add_epi32(X2_1, state1); + X2_2 = _mm256_add_epi32(X2_2, state2); + X2_3 = _mm256_add_epi32(X2_3, state3); + X2_3 = _mm256_add_epi64(X2_3, CTR2); + + X3_0 = _mm256_add_epi32(X3_0, state0); + X3_1 = _mm256_add_epi32(X3_1, state1); + X3_2 = _mm256_add_epi32(X3_2, state2); + X3_3 = _mm256_add_epi32(X3_3, state3); + X3_3 = _mm256_add_epi64(X3_3, CTR3); + + if (input_mm) + { + _mm256_storeu_si256(output_mm + 0, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 0), + _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 1, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 1), + _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 2, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 2), + _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 3, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 3), + _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)))); + } + else + { + _mm256_storeu_si256(output_mm + 0, _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 1, _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 2, _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 3, _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4))); + } + + if (input_mm) + { + _mm256_storeu_si256(output_mm + 4, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 4), + _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 5, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 5), + _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 6, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 6), + _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)))); + _mm256_storeu_si256(output_mm + 7, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 7), + _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)))); + } + else + { + _mm256_storeu_si256(output_mm + 4, _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 5, _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 6, _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4))); + _mm256_storeu_si256(output_mm + 7, _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4))); + } + + if (input_mm) + { + _mm256_storeu_si256(output_mm + 8, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 8), + _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 9, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 9), + _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 10, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 10), + _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 11, 
_mm256_xor_si256(_mm256_loadu_si256(input_mm + 11), + _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)))); + } + else + { + _mm256_storeu_si256(output_mm + 8, _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 9, _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 10, _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 11, _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4))); + } + + if (input_mm) + { + _mm256_storeu_si256(output_mm + 12, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 12), + _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 13, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 13), + _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 14, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 14), + _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)))); + _mm256_storeu_si256(output_mm + 15, _mm256_xor_si256(_mm256_loadu_si256(input_mm + 15), + _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)))); + } + else + { + _mm256_storeu_si256(output_mm + 12, _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 13, _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 14, _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4))); + _mm256_storeu_si256(output_mm + 15, _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4))); + } +} + +NAMESPACE_END + +#endif // CRYPTOPP_AVX2_AVAILABLE diff --git a/chacha.cpp b/chacha.cpp index 6a5f50f2..fe438507 100644 --- a/chacha.cpp +++ b/chacha.cpp @@ -20,6 +20,10 @@ extern void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif +#if (CRYPTOPP_AVX2_AVAILABLE) +extern void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds); +#endif + #if (CRYPTOPP_POWER8_AVAILABLE) extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* input, byte *output, unsigned int rounds); #endif @@ -62,17 +66,25 @@ std::string ChaCha_Policy::AlgorithmName() const std::string ChaCha_Policy::AlgorithmProvider() const { +#if (CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + return "AVX2"; + else +#endif #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) return "SSE2"; + else #endif #if (CRYPTOPP_ARM_NEON_AVAILABLE) if (HasNEON()) return "NEON"; + else #endif #if (CRYPTOPP_POWER8_AVAILABLE) if (HasPower8()) return "Power8"; + else #endif return "C++"; } @@ -117,11 +129,17 @@ void ChaCha_Policy::SeekToIteration(lword iterationCount) unsigned int ChaCha_Policy::GetAlignment() const { +#if (CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + return 16; + else +#endif #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) return 16; else -#elif (CRYPTOPP_POWER8_AVAILABLE) +#endif +#if (CRYPTOPP_POWER8_AVAILABLE) if (HasPower8()) return 16; else @@ -131,6 +149,11 @@ unsigned int ChaCha_Policy::GetAlignment() const unsigned int ChaCha_Policy::GetOptimalBlockSize() const { +#if (CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + return 8 * BYTES_PER_ITERATION; + else +#endif #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) return 4*BYTES_PER_ITERATION; @@ -149,10 +172,9 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const return BYTES_PER_ITERATION; } -bool 
ChaCha_Policy::MultiBlockSafe() const +bool ChaCha_Policy::MultiBlockSafe(unsigned int blocks) const { - const word32 c = m_state[12]; - return 0xffffffff - c > 4; + return 0xffffffff - m_state[12] > blocks; } // OperateKeystream always produces a key stream. The key stream is written @@ -163,10 +185,30 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, { do { +#if (CRYPTOPP_AVX2_AVAILABLE) + if (HasAVX2()) + { + while (iterationCount >= 8 && MultiBlockSafe(8)) + { + const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; + ChaCha_OperateKeystream_AVX2(m_state, xorInput ? input : NULLPTR, output, m_rounds); + + // MultiBlockSafe avoids overflow on the counter words + m_state[12] += 8; + //if (m_state[12] < 8) + // m_state[13]++; + + input += (!!xorInput) * 8 * BYTES_PER_ITERATION; + output += 8 * BYTES_PER_ITERATION; + iterationCount -= 8; + } + } +#endif + #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) { - while (iterationCount >= 4 && MultiBlockSafe()) + while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds); @@ -186,7 +228,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, #if (CRYPTOPP_ARM_NEON_AVAILABLE) if (HasNEON()) { - while (iterationCount >= 4 && MultiBlockSafe()) + while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds); @@ -206,7 +248,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, #if (CRYPTOPP_POWER8_AVAILABLE) if (HasPower8()) { - while (iterationCount >= 4 && MultiBlockSafe()) + while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds); diff --git a/chacha.h b/chacha.h index b0a762b3..5d219a27 100644 --- a/chacha.h +++ b/chacha.h @@ -56,7 +56,7 @@ protected: // during addition in an intermediate result. Conditions to trigger // issue include a user seeks to around 2^32 blocks (256 GB of data). 
// https://github.com/weidai11/cryptopp/issues/732 - bool MultiBlockSafe() const; + inline bool MultiBlockSafe(unsigned int blocks) const; FixedSizeAlignedSecBlock m_state; int m_rounds; diff --git a/cham-simd.cpp b/cham-simd.cpp index 8f3d0b5b..a6e1018f 100644 --- a/cham-simd.cpp +++ b/cham-simd.cpp @@ -17,6 +17,12 @@ // #undef CRYPTOPP_SSSE3_AVAILABLE // #undef CRYPTOPP_ARM_NEON_AVAILABLE +#if defined(CRYPTOPP_SSE2_AVAILABLE) +# define CRYPTOPP_AVX512_ROTATE 1 +# include +# include +#endif + #if (CRYPTOPP_SSSE3_AVAILABLE) # include # include @@ -26,11 +32,6 @@ # include #endif -#if defined(__AVX512F__) && defined(__AVX512VL__) -# define CRYPTOPP_AVX512_ROTATE 1 -# include -#endif - // Squash MS LNK4221 and libtool warnings extern const char CHAM_SIMD_FNAME[] = __FILE__; diff --git a/config.h b/config.h index 65131315..2d47cf5c 100644 --- a/config.h +++ b/config.h @@ -484,6 +484,11 @@ NAMESPACE_END # define CRYPTOPP_DISABLE_ASM 1 #endif +// SunCC prior to 5.10 cannot handle some SSE intrinsics +#if defined(__SUNPRO_CC) && (__SUNPRO_CC < 0x5100) +# define CRYPTOPP_DISABLE_ASM 1 +#endif + // Sun Studio 12 provides GCC inline assembly, http://blogs.oracle.com/x86be/entry/gcc_style_asm_inlining_support // We can enable SSE2 for Sun Studio in the makefile with -D__SSE2__, but users may not compile with it. #if !defined(CRYPTOPP_DISABLE_ASM) && !defined(__SSE2__) && defined(__x86_64__) && (__SUNPRO_CC >= 0x5100) @@ -563,6 +568,22 @@ NAMESPACE_END #define CRYPTOPP_AESNI_AVAILABLE 1 #endif +// Requires Binutils 2.24 +#if !defined(CRYPTOPP_DISABLE_AVX) && defined(CRYPTOPP_SSE42_AVAILABLE) && \ + (defined(__AVX2__) || (CRYPTOPP_MSC_VERSION >= 1800) || (__SUNPRO_CC >= 0x5130) || \ + (CRYPTOPP_GCC_VERSION >= 40700) || (__INTEL_COMPILER >= 1400) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30100) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40600)) +#define CRYPTOPP_AVX_AVAILABLE 1 +#endif + +// Requires Binutils 2.24 +#if !defined(CRYPTOPP_DISABLE_AVX2) && defined(CRYPTOPP_AVX_AVAILABLE) && \ + (defined(__AVX2__) || (CRYPTOPP_MSC_VERSION >= 1800) || (__SUNPRO_CC >= 0x5130) || \ + (CRYPTOPP_GCC_VERSION >= 40700) || (__INTEL_COMPILER >= 1400) || \ + (CRYPTOPP_LLVM_CLANG_VERSION >= 30100) || (CRYPTOPP_APPLE_CLANG_VERSION >= 40600)) +#define CRYPTOPP_AVX2_AVAILABLE 1 +#endif + // Guessing at SHA for SunCC. Its not in Sun Studio 12.6. 
Also see // http://stackoverflow.com/questions/45872180/which-xarch-for-sha-extensions-on-solaris #if !defined(CRYPTOPP_DISABLE_ASM) && !defined(CRYPTOPP_DISABLE_SHANI) && defined(CRYPTOPP_SSE42_AVAILABLE) && \ diff --git a/cryptest.nmake b/cryptest.nmake index 0dc946ff..1e28576a 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -55,8 +55,8 @@ LIB_SRCS = \ algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp \ authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2s-simd.cpp \ blake2b-simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \ - cbcmac.cpp ccm.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp channels.cpp \ - cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \ + cbcmac.cpp ccm.cpp chacha-avx.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp \ + channels.cpp cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \ dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp \ emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp \ fipstest.cpp gcm-simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp \ @@ -83,8 +83,8 @@ LIB_OBJS = \ algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj \ authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2s-simd.obj \ blake2b-simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \ - cbcmac.obj ccm.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj channels.obj \ - cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \ + cbcmac.obj ccm.obj chacha-avx.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj \ + channels.obj cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \ dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj \ emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj \ fipstest.obj gcm-simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj \ diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index b6917bf5..f28fd431 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -1,4 +1,4 @@ - + @@ -193,6 +193,10 @@ + + + true + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index de1bcea1..71957da0 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -92,6 +92,9 @@ Source Files + + Source Files + Source Files @@ -986,5 +989,7 @@ Miscellaneous + + \ No newline at end of file
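
For reference, the arithmetic that ChaCha_OperateKeystream_AVX2 vectorizes is the standard ChaCha quarter round with rotations by 16, 12, 8 and 7. The kernel keeps eight 64-byte blocks in flight per call (four groups of __m256i registers, each group holding two blocks in its 128-bit lanes), uses _mm256_shuffle_epi32 to rotate rows 1, 2 and 3 between the column and diagonal half-rounds, and chacha.cpp only enters the batched loop while MultiBlockSafe(8) guarantees the low 32-bit block counter cannot wrap inside the batch. The scalar sketch below is illustrative only and is not part of the patch; the helper names rotl32, QuarterRound, DoubleRound and CounterSafe are invented for the example.

    // Illustrative scalar reference for the ChaCha core; assumes only the
    // standard algorithm, not code taken from chacha-avx.cpp.
    #include <cstdint>

    static inline uint32_t rotl32(uint32_t v, unsigned int r)
    {
        return (v << r) | (v >> (32 - r));
    }

    // One quarter round: the 16/12/8/7 rotation pattern that the
    // RotateLeft<16>, <12>, <8> and <7> calls apply to whole __m256i registers.
    static inline void QuarterRound(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d)
    {
        a += b; d ^= a; d = rotl32(d, 16);
        c += d; b ^= c; b = rotl32(b, 12);
        a += b; d ^= a; d = rotl32(d, 8);
        c += d; b ^= c; b = rotl32(b, 7);
    }

    // One double round over the 16-word state: four column rounds, then four
    // diagonal rounds. The AVX2 loop performs rounds/2 of these per batch,
    // which is why it decrements its loop counter by 2 each iteration.
    static inline void DoubleRound(uint32_t s[16])
    {
        QuarterRound(s[0], s[4], s[ 8], s[12]);
        QuarterRound(s[1], s[5], s[ 9], s[13]);
        QuarterRound(s[2], s[6], s[10], s[14]);
        QuarterRound(s[3], s[7], s[11], s[15]);
        QuarterRound(s[0], s[5], s[10], s[15]);
        QuarterRound(s[1], s[6], s[11], s[12]);
        QuarterRound(s[2], s[7], s[ 8], s[13]);
        QuarterRound(s[3], s[4], s[ 9], s[14]);
    }

    // Mirror of the MultiBlockSafe(blocks) guard in chacha.cpp: batch only when
    // the 32-bit counter word cannot overflow while producing 'blocks' blocks.
    static inline bool CounterSafe(uint32_t counter, uint32_t blocks)
    {
        return 0xffffffffu - counter > blocks;
    }

Running rounds/2 DoubleRound calls over a block's initial state and then adding that initial state back (the feed-forward the kernel performs with _mm256_add_epi32 after its loop) yields one 64-byte keystream block; each 128-bit lane of the X*_* registers computes exactly this, eight blocks at a time.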