From d656f4404ac98d0c33cf90663486766aa96aa233 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Sun, 17 Apr 2016 10:48:26 -0400 Subject: [PATCH] Added Windows support for BLAKE2 --- blake2.cpp | 1007 +++++++++++++++++++++++++++++++++++++++++++++-- blake2.h | 2 +- cryptlib.dsp | 8 + cryptlib.vcproj | 78 ++++ vs2010.zip | Bin 19917 -> 19943 bytes 5 files changed, 1062 insertions(+), 33 deletions(-) diff --git a/blake2.cpp b/blake2.cpp index a95a5043..69b5d068 100644 --- a/blake2.cpp +++ b/blake2.cpp @@ -10,8 +10,13 @@ NAMESPACE_BEGIN(CryptoPP) +// Visual Studio needs both VS2008 (_MSC_VER >= 1500) and _M_X64 for _mm_set_epi64x() +#if defined(_MSC_VER) && ((_MSC_VER < 1500) || !defined(_M_X64)) +# undef CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE +# undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE +#endif + // TODO -#undef CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE #undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE // C/C++ implementation @@ -30,11 +35,8 @@ static inline void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State template <bool T_64bit> struct CRYPTOPP_NO_VTABLE BLAKE2_IV {}; @@ -51,7 +53,6 @@ const word32 BLAKE2_IV<false>::iv[8] = { 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL }; -//! \brief BLAKE2b initialization vector specialization template<> struct CRYPTOPP_NO_VTABLE BLAKE2_IV<true> { @@ -66,15 +67,11 @@ const word64 BLAKE2_IV<true>::iv[8] = { 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; -//! \class BLAKE2_Sigma -//! \brief BLAKE2 sigma table -//! \tparam T_64bit flag indicating 64-bit -//! \details IV and Sigma are a better fit as part of BLAKE2_Base, but that -//! places the constants out of reach for the SSE2 and SSE4 implementations. +// IV and Sigma are a better fit as part of BLAKE2_Base, but that +// places the constants out of reach for the SSE2 and SSE4 implementations. template <bool T_64bit> struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma {};
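The guard above is the heart of the Windows fix: _mm_set_epi64x() is only available with VS2008 or later targeting x64, so older or 32-bit MSVC builds drop back to the C++ implementation. An alternative would be to synthesize the intrinsic from 32-bit pieces; a minimal sketch of that idea (hypothetical helper, not part of this patch):

#include <emmintrin.h>

typedef unsigned long long word64;   // stands in for Crypto++'s word64

// Hypothetical stand-in for _mm_set_epi64x() on 32-bit MSVC builds:
// assemble the 128-bit value from four 32-bit halves. 'hi' lands in the
// upper 64 bits and 'lo' in the lower, matching _mm_set_epi64x(hi, lo).
static inline __m128i MM_SET_EPI64X(word64 hi, word64 lo)
{
    return _mm_set_epi32((int)(hi >> 32), (int)(hi & 0xFFFFFFFF),
                         (int)(lo >> 32), (int)(lo & 0xFFFFFFFF));
}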
-//! \brief BLAKE2s sigma table specialization template<> struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<false> { @@ -168,8 +165,8 @@ BLAKE2_ParameterBlock::BLAKE2_ParameterBlock(size_t digestLen, size_t key ThrowIfInvalidPersonalization(personalizationLen); memset(this, 0x00, sizeof(*this)); - this->digestLength = digestLen; - this->keyLength = keyLen; + this->digestLength = (byte)digestLen; + this->keyLength = (byte)keyLen; fanout = depth = 1; if (salt && saltLen) @@ -288,19 +285,18 @@ void BLAKE2_Base<W, T_64bit>::Update(const byte *input, size_t length) { if (m_state.length + length > BLOCKSIZE) { - /* Complete current block */ - size_t left = m_state.length; - size_t fill = BLOCKSIZE - left; - memcpy(&m_state.buffer[left], input, fill); + // Complete current block + const size_t fill = BLOCKSIZE - m_state.length; + memcpy_s(&m_state.buffer[m_state.length], fill, input, fill); IncrementCounter(); Compress(m_state.buffer); - m_state.length = 0; + length -= fill; input += fill; - /* Avoid buffer copies when possible */ + // Compress in-place to avoid copies while (length > BLOCKSIZE) { IncrementCounter(); Compress(input); @@ -309,7 +305,7 @@ void BLAKE2_Base<W, T_64bit>::Update(const byte *input, size_t length) } } - memcpy(&m_state.buffer[m_state.length], input, length); + memcpy_s(&m_state.buffer[m_state.length], BLOCKSIZE - m_state.length, input, length); m_state.length += static_cast(length); } @@ -329,7 +325,13 @@ void BLAKE2_Base<W, T_64bit>::TruncatedFinal(byte *hash, size_t size) memset(m_state.buffer + m_state.length, 0x00, BLOCKSIZE - m_state.length); Compress(m_state.buffer); - if (size < DIGESTSIZE) + if (size >= DIGESTSIZE) + { + // Write directly to the caller buffer + for(unsigned int i = 0; i < 8; ++i) + WriteWord(m_state.h[i], hash, i); + } + else { SecByteBlock buffer(DIGESTSIZE); for(unsigned int i = 0; i < 8; ++i) @@ -337,12 +339,6 @@ memcpy_s(hash, DIGESTSIZE, buffer, size); } - else - { - // Write directly to the caller buffer - for(unsigned int i = 0; i < 8; ++i) - WriteWord(m_state.h[i], hash, i); - } Restart(); }
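The Update() hunk above implements the usual BLAKE2 buffering discipline: top off the partially filled block buffer, compress full blocks straight out of the caller's buffer, and keep the final block buffered so TruncatedFinal() can flag it. A free-standing sketch of the same control flow (hypothetical names; the buffered-length reset is made explicit here):

#include <cstring>
#include <cstddef>

enum { BLOCKSIZE = 128 };                        // BLAKE2b block bytes
struct State { unsigned char buffer[BLOCKSIZE]; size_t length; };
void IncrementCounter(State&);                   // stand-ins for the
void Compress(State&, const unsigned char*);     // real member functions

void UpdateSketch(State& s, const unsigned char* input, size_t length)
{
    if (s.length + length > BLOCKSIZE)
    {
        // Complete the current block, then compress it
        const size_t fill = BLOCKSIZE - s.length;
        std::memcpy(s.buffer + s.length, input, fill);
        IncrementCounter(s); Compress(s, s.buffer);
        s.length = 0;
        length -= fill; input += fill;

        // Compress in-place to avoid copies; '>' (not '>=') keeps a
        // full final block buffered for the finalization flag
        while (length > BLOCKSIZE)
        {
            IncrementCounter(s); Compress(s, input);
            length -= BLOCKSIZE; input += BLOCKSIZE;
        }
    }
    std::memcpy(s.buffer + s.length, input, length);
    s.length += length;
}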
@@ -517,14 +513,961 @@ void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State<word32, false>& state #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE static inline void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false>& state) { - // TODO... fall back to C++ BLAKE2_CXX_Compress32(input, state); } static inline void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>& state) { - // TODO... fall back to C++ - BLAKE2_CXX_Compress64(input, state); + __m128i row1l, row1h, row2l, row2h; + __m128i row3l, row3h, row4l, row4h; + __m128i b0, b1, t0, t1; + + const word64 m0 = ((const word64*)input)[ 0]; + const word64 m1 = ((const word64*)input)[ 1]; + const word64 m2 = ((const word64*)input)[ 2]; + const word64 m3 = ((const word64*)input)[ 3]; + const word64 m4 = ((const word64*)input)[ 4]; + const word64 m5 = ((const word64*)input)[ 5]; + const word64 m6 = ((const word64*)input)[ 6]; + const word64 m7 = ((const word64*)input)[ 7]; + const word64 m8 = ((const word64*)input)[ 8]; + const word64 m9 = ((const word64*)input)[ 9]; + const word64 m10 = ((const word64*)input)[10]; + const word64 m11 = ((const word64*)input)[11]; + const word64 m12 = ((const word64*)input)[12]; + const word64 m13 = ((const word64*)input)[13]; + const word64 m14 = ((const word64*)input)[14]; + const word64 m15 = ((const word64*)input)[15]; + + row1l = _mm_loadu_si128( (const __m128i *)(&state.h[0]) ); + row1h = _mm_loadu_si128( (const __m128i *)(&state.h[2]) ); + row2l = _mm_loadu_si128( (const __m128i *)(&state.h[4]) ); + row2h = _mm_loadu_si128( (const __m128i *)(&state.h[6]) ); + row3l = _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV<true>::iv[0]) ); + row3h = _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV<true>::iv[2]) ); + row4l = _mm_xor_si128( _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV<true>::iv[4]) ), _mm_loadu_si128( (const __m128i *)(&state.t[0]) ) ); + row4h = _mm_xor_si128( _mm_loadu_si128( (const __m128i *)(&BLAKE2_IV<true>::iv[6]) ), _mm_loadu_si128( (const __m128i *)(&state.f[0]) ) ); + + b0 = _mm_set_epi64x(m2, m0); + b1 = _mm_set_epi64x(m6, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + + b0 = _mm_set_epi64x(m3, m1); + b1 = _mm_set_epi64x(m7, m5); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m10, m8); + b1 = _mm_set_epi64x(m14, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); +
row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m11, m9); + b1 = _mm_set_epi64x(m15, m13); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m4, m14); + b1 = _mm_set_epi64x(m13, m9); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m8, m10); + b1 = _mm_set_epi64x(m6, m15); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m0, m1); + b1 = _mm_set_epi64x(m5, m11); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, 
b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m2, m12); + b1 = _mm_set_epi64x(m3, m7); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m12, m11); + b1 = _mm_set_epi64x(m15, m5); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m0, m8); + b1 = _mm_set_epi64x(m13, m2); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m3, m10); + b1 = _mm_set_epi64x(m9, m7); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = 
_mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m6, m14); + b1 = _mm_set_epi64x(m4, m1); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m3, m7); + b1 = _mm_set_epi64x(m11, m13); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m1, m9); + b1 = _mm_set_epi64x(m14, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m5, m2); + b1 = _mm_set_epi64x(m15, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), 
row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m10, m6); + b1 = _mm_set_epi64x(m8, m0); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m5, m9); + b1 = _mm_set_epi64x(m10, m2); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m7, m0); + b1 = _mm_set_epi64x(m15, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m11, m14); + b1 = _mm_set_epi64x(m3, m6); + row1l = 
_mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m12, m1); + b1 = _mm_set_epi64x(m13, m8); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m6, m2); + b1 = _mm_set_epi64x(m8, m0); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m10, m12); + b1 = _mm_set_epi64x(m3, m11); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m7, m4); + b1 = _mm_set_epi64x(m1, 
m15); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m5, m13); + b1 = _mm_set_epi64x(m9, m14); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m1, m12); + b1 = _mm_set_epi64x(m4, m14); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m15, m5); + b1 = _mm_set_epi64x(m10, m13); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m6, m0); + b1 = 
_mm_set_epi64x(m8, m9); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m3, m7); + b1 = _mm_set_epi64x(m11, m2); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m7, m13); + b1 = _mm_set_epi64x(m3, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m14, m11); + b1 = _mm_set_epi64x(m9, m1); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m15, 
m5); + b1 = _mm_set_epi64x(m2, m8); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m4, m0); + b1 = _mm_set_epi64x(m10, m6); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m14, m6); + b1 = _mm_set_epi64x(m0, m11); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m9, m15); + b1 = _mm_set_epi64x(m8, m3); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = 
_mm_set_epi64x(m13, m12); + b1 = _mm_set_epi64x(m10, m1); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m7, m2); + b1 = _mm_set_epi64x(m5, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m8, m10); + b1 = _mm_set_epi64x(m1, m7); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m4, m2); + b1 = _mm_set_epi64x(m5, m6); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, 
t1)); + b0 = _mm_set_epi64x(m9, m15); + b1 = _mm_set_epi64x(m13, m3); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m14, m11); + b1 = _mm_set_epi64x(m0, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m2, m0); + b1 = _mm_set_epi64x(m6, m4); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m3, m1); + b1 = _mm_set_epi64x(m7, m5); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = _mm_unpackhi_epi64(row2h, 
_mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m10, m8); + b1 = _mm_set_epi64x(m14, m12); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m11, m9); + b1 = _mm_set_epi64x(m15, m13); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + + b0 = _mm_set_epi64x(m4, m14); + b1 = _mm_set_epi64x(m13, m9); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m8, m10); + b1 = _mm_set_epi64x(m6, m15); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l; + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); + row2h = 
_mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)); + b0 = _mm_set_epi64x(m0, m1); + b1 = _mm_set_epi64x(m5, m11); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40)); + + + b0 = _mm_set_epi64x(m2, m12); + b1 = _mm_set_epi64x(m3, m7); + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); + row4l = _mm_xor_si128(row4l, row1l); + row4h = _mm_xor_si128(row4h, row1h); + row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48)); + row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48)); + row3l = _mm_add_epi64(row3l, row4l); + row3h = _mm_add_epi64(row3h, row4h); + row2l = _mm_xor_si128(row2l, row3l); + row2h = _mm_xor_si128(row2h, row3h); + row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1)); + row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1)); + + + t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l; + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)); + + row1l = _mm_xor_si128( row3l, row1l ); + row1h = _mm_xor_si128( row3h, row1h ); + _mm_storeu_si128((__m128i *)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128( (const __m128i *)(&state.h[0]) ), row1l)); + _mm_storeu_si128((__m128i *)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128( (const __m128i *)(&state.h[2]) ), row1h)); + row2l = _mm_xor_si128( row4l, row2l ); + row2h = _mm_xor_si128( row4h, row2h ); + _mm_storeu_si128((__m128i *)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128( (const __m128i *)(&state.h[4]) ), row2l)); + _mm_storeu_si128((__m128i *)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128( (const __m128i *)(&state.h[6]) ), row2h)); } #endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
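SSE2 has no 64-bit rotate instruction, so every ROTR64 in the compressor above is synthesized from a shift pair: the BLAKE2b rotations by 32, 24, 16 and 63 all appear as _mm_srli_epi64/_mm_slli_epi64 combinations XORed together. The idiom, factored into a helper for illustration (a sketch; the patch keeps it fully unrolled inline):

#include <emmintrin.h>

// Rotate both 64-bit lanes of x right by n (0 < n < 64): the bits
// shifted out on the right re-enter on the left via the left shift.
static inline __m128i ROTR64_SSE2(__m128i x, int n)
{
    return _mm_xor_si128(_mm_srli_epi64(x, n), _mm_slli_epi64(x, 64 - n));
}

With SSSE3 or SSE4 available, the byte-aligned rotations (32, 24, 16) can instead be done with a single byte shuffle, which is one reason the separate SSE4 path (still a TODO above) is worth having.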
diff --git a/blake2.h b/blake2.h index bd081fc5..a6583d32 100644 --- a/blake2.h +++ b/blake2.h @@ -172,7 +172,7 @@ struct CRYPTOPP_NO_VTABLE BLAKE2_State //! \details BLAKE2b uses BLAKE2_Base<word64, true>, while BLAKE2s //! uses BLAKE2_Base<word32, false>. template <class W, bool T_64bit> -class CRYPTOPP_NO_VTABLE BLAKE2_Base : public SimpleKeyingInterfaceImpl<HashTransformation, BLAKE2_Info<T_64bit> > +class BLAKE2_Base : public SimpleKeyingInterfaceImpl<HashTransformation, BLAKE2_Info<T_64bit> > { public: CRYPTOPP_CONSTANT(DIGESTSIZE = BLAKE2_Info<T_64bit>::DIGESTSIZE);
diff --git a/cryptlib.dsp b/cryptlib.dsp index d08fa890..2d32a916 100644 --- a/cryptlib.dsp +++ b/cryptlib.dsp @@ -234,6 +234,10 @@ SOURCE=.\blowfish.cpp # End Source File # Begin Source File +SOURCE=.\blake2.cpp +# End Source File +# Begin Source File + SOURCE=.\blumshub.cpp # End Source File # Begin Source File @@ -729,6 +733,10 @@ SOURCE=.\basecode.h # End Source File # Begin Source File +SOURCE=.\blake2.h +# End Source File +# Begin Source File + SOURCE=.\blowfish.h # End Source File # Begin Source File
diff --git a/cryptlib.vcproj b/cryptlib.vcproj index 77a095a0..bab64bca 100644 --- a/cryptlib.vcproj +++ b/cryptlib.vcproj @@ -1308,6 +1308,80 @@ /> +[74 added lines: a <File RelativePath="blake2.cpp"> element with its per-configuration settings; the XML did not survive markup stripping] @@ -9150,6 +9224,10 @@ RelativePath="basecode.h" > +[4 added lines: a <File RelativePath="blake2.h"> element; XML likewise stripped]
diff --git a/vs2010.zip b/vs2010.zip index b4cc9e3d3854c0af0b822f66ed97c17a8adf292e..097769c28ce4933c315a6cc758f1acbf3da3f71a 100644 GIT binary patch delta 5058
[base85-encoded binary delta data omitted]
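For reference, a quick smoke test of the classes these project files now build; a usage sketch assuming the library's standard HashTransformation interface (BLAKE2b and its DIGESTSIZE come from blake2.h; the hex/filter plumbing is ordinary Crypto++):

#include "blake2.h"
#include "filters.h"
#include "hex.h"
#include <iostream>
#include <string>

int main()
{
    const unsigned char msg[] = {'a', 'b', 'c'};
    unsigned char digest[CryptoPP::BLAKE2b::DIGESTSIZE];

    CryptoPP::BLAKE2b hash;          // unkeyed, full 64-byte digest
    hash.Update(msg, sizeof(msg));
    hash.Final(digest);

    std::string encoded;
    CryptoPP::HexEncoder encoder(new CryptoPP::StringSink(encoded));
    encoder.Put(digest, sizeof(digest));
    encoder.MessageEnd();
    std::cout << "BLAKE2b(abc): " << encoded << std::endl;
    return 0;
}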