From 916c4484a2705fb296147355028fdb31cf0ea808 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Tue, 23 Oct 2018 07:57:59 -0400 Subject: [PATCH] Add ChaCha SSE2 implementation Thanks to Jack Lloyd and Botan for allowing us to use the implementation. The numbers for SSE2 are very good. When compared with Salsa20 ASM the results are: * Salsa20 2.55 cpb; ChaCha/20 2.90 cpb * Salsa20/12 1.61 cpb; ChaCha/12 1.90 cpb * Salsa20/8 1.34 cpb; ChaCha/8 1.5 cpb --- Filelist.txt | 1 + TestVectors/chacha.txt | 68 ++++++++- chacha-simd.cpp | 289 +++++++++++++++++++++++++++++++++++++++ chacha.cpp | 59 ++++++-- chacha.h | 7 +- cryptest.nmake | 4 +- cryptest.vcxproj.user | 9 +- cryptlib.vcxproj | 1 + cryptlib.vcxproj.filters | 3 + salsa.cpp | 3 +- salsa.h | 1 + 11 files changed, 426 insertions(+), 19 deletions(-) create mode 100644 chacha-simd.cpp diff --git a/Filelist.txt b/Filelist.txt index 581fcee0..22b7b83f 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -51,6 +51,7 @@ cbcmac.h ccm.cpp ccm.h chacha.cpp +chacha-simd.cpp chacha.h cham.cpp cham-simd.cpp diff --git a/TestVectors/chacha.txt b/TestVectors/chacha.txt index d6e90204..f17a2be9 100644 --- a/TestVectors/chacha.txt +++ b/TestVectors/chacha.txt @@ -96,6 +96,28 @@ Ciphertext: \ 25DDA3E1C1E35D96BDB9CAD13546971B1E5FDB2E83216FEF93E5457DE48A5ED8 \ 1F7E4B95484834A58B6AF8CCE9545BBBDC58929A1DEB2F6AEBF0DC2079F644E3 Test: Encrypt +# +Comment: All one's key and IV (16-byte key). +Key: r16 ff +IV: r8 ff +Rounds: 8 +Seek: 32 +Plaintext: r64 00 +Ciphertext: \ + 3E0FB640D76FF9C3B9CD99996E6E38FAD13F0E31C82244D33ABBC1B11E8BF12D \ + 9A81D78E9E56604DDFAE136921F51C9D81AE15119DB8E756DD28024493EE571D +Test: Encrypt +# +Comment: All one's key and IV (32-byte key). +Key: r32 ff +IV: r8 ff +Rounds: 8 +Seek: 32 +Plaintext: r64 00 +Ciphertext: \ + 445E0547D31C1623C537DF4BA85C70A9884A35BCBF3DFAB077E98B0F68135F54 \ + 81D4933F8B322AC0CD762C27235CE2B31534E0244A9A2F1FD5E94498D47FF108 +Test: Encrypt ################################################# @@ -197,6 +219,28 @@ Ciphertext: \ 5E16F52D6857A1A602A7FC6DDD578CA868F1E51AADD3209034A4740036DE08A7 \ A906067C997F01E4E334CBA913407C7A462A968B272834D2D66DF24922F4302C Test: Encrypt +# +Comment: All one's key and IV (16-byte key). +Key: r16 ff +IV: r8 ff +Rounds: 12 +Seek: 32 +Plaintext: r64 00 +Ciphertext: \ + 83D5597D7A616258EC3C5D5B30E1C5C85C5DFE2F92423B8E36870F3185B6ADD9 \ + F34DAB6C2BC551898FBDCDFC783F09171CC8B59A8B2852983C3A9B91D29B5761 +Test: Encrypt +# +Comment: All one's key and IV (32-byte key). +Key: r32 ff +IV: r8 ff +Rounds: 12 +Seek: 32 +Plaintext: r64 00 +Ciphertext: \ + CAA69C5AB221A23A57EB5F345C96F4D1322D0A2FF7A9CD43401CD536639A615A \ + 5C9429B55CA3C1B55354559669A154ACA46CD761C41AB8ACE385363B95675F06 +Test: Encrypt ################################################# @@ -297,4 +341,26 @@ Ciphertext: \ D22CFEAEC86612E53DEFC29848C055053C6B1D462A3CF09B228E47211AFBA0AF \ 4E4C2B336E6EE2F471823808523F073C1BC8785D258AC2BD580209A82A875273 \ 93DF828B6A6728ABD7AAD0485BFF5CE92C8DB78B1E63929FC76A905E8C7AF310 -Test: Encrypt \ No newline at end of file +Test: Encrypt +# +Comment: All one's key and IV (16-byte key). +Key: r16 ff +IV: r8 ff +Rounds: 20 +Seek: 32 +Plaintext: r64 00 +Ciphertext: \ + 399E4760B262F9D53F26D8DD19E56F5C506AE0C3619FA67FB0C408106D0203EE \ + 40EA3CFA61FA32A2FDA8D1238A2135D9D4178775240F99007064A6A7F0C731B6 +Test: Encrypt +# +Comment: All one's key and IV (32-byte key). +Key: r32 ff +IV: r8 ff +Rounds: 20 +Seek: 32 +Plaintext: r64 00 +Ciphertext: \ + 3FEABC57FDE54F790C52C8AE43240B79D49042B777BFD6CB80E931270B7F50EB \ + 5BAC2ACD86A836C5DC98C116C1217EC31D3A63A9451319F097F3B4D6DAB07787 +Test: Encrypt diff --git a/chacha-simd.cpp b/chacha-simd.cpp new file mode 100644 index 00000000..9e07c3ec --- /dev/null +++ b/chacha-simd.cpp @@ -0,0 +1,289 @@ +// chacha-simd.cpp - written and placed in the public domain by +// Jack Lloyd and Jeffrey Walton +// +// This source file uses intrinsics and built-ins to gain access to +// SSE2, ARM NEON and ARMv8a, and Power7 Altivec instructions. A separate +// source file is needed because additional CXXFLAGS are required to enable +// the appropriate instructions sets in some build configurations. +// +// SSE2 implementation based on Botan's chacha_sse2.cpp. Many thanks +// to Jack Lloyd and the Botan team for allowing us to use it. +// +// ARMv8 Power7 is upcoming. + +#include "pch.h" +#include "config.h" + +#include "chacha.h" +#include "misc.h" + +#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) +# include +# include +#endif + +#if (CRYPTOPP_ARM_NEON_AVAILABLE) && 0 +# include +#endif + +// Can't use CRYPTOPP_ARM_XXX_AVAILABLE because too many +// compilers don't follow ACLE conventions for the include. +#if defined(CRYPTOPP_ARM_ACLE_AVAILABLE) +# include +# include +#endif + +// Squash MS LNK4221 and libtool warnings +extern const char CHACHA_SIMD_FNAME[] = __FILE__; + +ANONYMOUS_NAMESPACE_BEGIN + +#if defined(CRYPTOPP_SSE2_INTRIN_AVAILABLE) + +template +inline __m128i RotateLeft(const __m128i val) +{ + return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R)); +} + +#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE + +ANONYMOUS_NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) + +#if defined(CRYPTOPP_SSE2_INTRIN_AVAILABLE) + +void ChaCha_OperateKeystream_SSE2(KeystreamOperation operation, byte *output, + const word32 *input, size_t iterationCount, unsigned int rounds) +{ + const __m128i* input_mm = reinterpret_cast(input); + __m128i* output_mm = reinterpret_cast<__m128i*>(output); + + __m128i input0 = _mm_loadu_si128(input_mm); + __m128i input1 = _mm_loadu_si128(input_mm + 1); + __m128i input2 = _mm_loadu_si128(input_mm + 2); + __m128i input3 = _mm_loadu_si128(input_mm + 3); + + __m128i r0_0 = input0; + __m128i r0_1 = input1; + __m128i r0_2 = input2; + __m128i r0_3 = input3; + + __m128i r1_0 = input0; + __m128i r1_1 = input1; + __m128i r1_2 = input2; + __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1)); + + __m128i r2_0 = input0; + __m128i r2_1 = input1; + __m128i r2_2 = input2; + __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2)); + + __m128i r3_0 = input0; + __m128i r3_1 = input1; + __m128i r3_2 = input2; + __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3)); + + for (int i = static_cast(rounds); i > 0; i -= 2) + { + r0_0 = _mm_add_epi32(r0_0, r0_1); + r1_0 = _mm_add_epi32(r1_0, r1_1); + r2_0 = _mm_add_epi32(r2_0, r2_1); + r3_0 = _mm_add_epi32(r3_0, r3_1); + + r0_3 = _mm_xor_si128(r0_3, r0_0); + r1_3 = _mm_xor_si128(r1_3, r1_0); + r2_3 = _mm_xor_si128(r2_3, r2_0); + r3_3 = _mm_xor_si128(r3_3, r3_0); + + r0_3 = RotateLeft<16>(r0_3); + r1_3 = RotateLeft<16>(r1_3); + r2_3 = RotateLeft<16>(r2_3); + r3_3 = RotateLeft<16>(r3_3); + + r0_2 = _mm_add_epi32(r0_2, r0_3); + r1_2 = _mm_add_epi32(r1_2, r1_3); + r2_2 = _mm_add_epi32(r2_2, r2_3); + r3_2 = _mm_add_epi32(r3_2, r3_3); + + r0_1 = _mm_xor_si128(r0_1, r0_2); + r1_1 = _mm_xor_si128(r1_1, r1_2); + r2_1 = _mm_xor_si128(r2_1, r2_2); + r3_1 = _mm_xor_si128(r3_1, r3_2); + + r0_1 = RotateLeft<12>(r0_1); + r1_1 = RotateLeft<12>(r1_1); + r2_1 = RotateLeft<12>(r2_1); + r3_1 = RotateLeft<12>(r3_1); + + r0_0 = _mm_add_epi32(r0_0, r0_1); + r1_0 = _mm_add_epi32(r1_0, r1_1); + r2_0 = _mm_add_epi32(r2_0, r2_1); + r3_0 = _mm_add_epi32(r3_0, r3_1); + + r0_3 = _mm_xor_si128(r0_3, r0_0); + r1_3 = _mm_xor_si128(r1_3, r1_0); + r2_3 = _mm_xor_si128(r2_3, r2_0); + r3_3 = _mm_xor_si128(r3_3, r3_0); + + r0_3 = RotateLeft<8>(r0_3); + r1_3 = RotateLeft<8>(r1_3); + r2_3 = RotateLeft<8>(r2_3); + r3_3 = RotateLeft<8>(r3_3); + + r0_2 = _mm_add_epi32(r0_2, r0_3); + r1_2 = _mm_add_epi32(r1_2, r1_3); + r2_2 = _mm_add_epi32(r2_2, r2_3); + r3_2 = _mm_add_epi32(r3_2, r3_3); + + r0_1 = _mm_xor_si128(r0_1, r0_2); + r1_1 = _mm_xor_si128(r1_1, r1_2); + r2_1 = _mm_xor_si128(r2_1, r2_2); + r3_1 = _mm_xor_si128(r3_1, r3_2); + + r0_1 = RotateLeft<7>(r0_1); + r1_1 = RotateLeft<7>(r1_1); + r2_1 = RotateLeft<7>(r2_1); + r3_1 = RotateLeft<7>(r3_1); + + r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); + r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); + r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); + r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); + r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); + r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); + r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); + r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); + r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r0_0 = _mm_add_epi32(r0_0, r0_1); + r1_0 = _mm_add_epi32(r1_0, r1_1); + r2_0 = _mm_add_epi32(r2_0, r2_1); + r3_0 = _mm_add_epi32(r3_0, r3_1); + + r0_3 = _mm_xor_si128(r0_3, r0_0); + r1_3 = _mm_xor_si128(r1_3, r1_0); + r2_3 = _mm_xor_si128(r2_3, r2_0); + r3_3 = _mm_xor_si128(r3_3, r3_0); + + r0_3 = RotateLeft<16>(r0_3); + r1_3 = RotateLeft<16>(r1_3); + r2_3 = RotateLeft<16>(r2_3); + r3_3 = RotateLeft<16>(r3_3); + + r0_2 = _mm_add_epi32(r0_2, r0_3); + r1_2 = _mm_add_epi32(r1_2, r1_3); + r2_2 = _mm_add_epi32(r2_2, r2_3); + r3_2 = _mm_add_epi32(r3_2, r3_3); + + r0_1 = _mm_xor_si128(r0_1, r0_2); + r1_1 = _mm_xor_si128(r1_1, r1_2); + r2_1 = _mm_xor_si128(r2_1, r2_2); + r3_1 = _mm_xor_si128(r3_1, r3_2); + + r0_1 = RotateLeft<12>(r0_1); + r1_1 = RotateLeft<12>(r1_1); + r2_1 = RotateLeft<12>(r2_1); + r3_1 = RotateLeft<12>(r3_1); + + r0_0 = _mm_add_epi32(r0_0, r0_1); + r1_0 = _mm_add_epi32(r1_0, r1_1); + r2_0 = _mm_add_epi32(r2_0, r2_1); + r3_0 = _mm_add_epi32(r3_0, r3_1); + + r0_3 = _mm_xor_si128(r0_3, r0_0); + r1_3 = _mm_xor_si128(r1_3, r1_0); + r2_3 = _mm_xor_si128(r2_3, r2_0); + r3_3 = _mm_xor_si128(r3_3, r3_0); + + r0_3 = RotateLeft<8>(r0_3); + r1_3 = RotateLeft<8>(r1_3); + r2_3 = RotateLeft<8>(r2_3); + r3_3 = RotateLeft<8>(r3_3); + + r0_2 = _mm_add_epi32(r0_2, r0_3); + r1_2 = _mm_add_epi32(r1_2, r1_3); + r2_2 = _mm_add_epi32(r2_2, r2_3); + r3_2 = _mm_add_epi32(r3_2, r3_3); + + r0_1 = _mm_xor_si128(r0_1, r0_2); + r1_1 = _mm_xor_si128(r1_1, r1_2); + r2_1 = _mm_xor_si128(r2_1, r2_2); + r3_1 = _mm_xor_si128(r3_1, r3_2); + + r0_1 = RotateLeft<7>(r0_1); + r1_1 = RotateLeft<7>(r1_1); + r2_1 = RotateLeft<7>(r2_1); + r3_1 = RotateLeft<7>(r3_1); + + r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); + r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); + r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); + r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); + r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); + r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); + r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); + r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); + r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); + } + + r0_0 = _mm_add_epi32(r0_0, input0); + r0_1 = _mm_add_epi32(r0_1, input1); + r0_2 = _mm_add_epi32(r0_2, input2); + r0_3 = _mm_add_epi32(r0_3, input3); + + r1_0 = _mm_add_epi32(r1_0, input0); + r1_1 = _mm_add_epi32(r1_1, input1); + r1_2 = _mm_add_epi32(r1_2, input2); + r1_3 = _mm_add_epi32(r1_3, input3); + r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); + + r2_0 = _mm_add_epi32(r2_0, input0); + r2_1 = _mm_add_epi32(r2_1, input1); + r2_2 = _mm_add_epi32(r2_2, input2); + r2_3 = _mm_add_epi32(r2_3, input3); + r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); + + r3_0 = _mm_add_epi32(r3_0, input0); + r3_1 = _mm_add_epi32(r3_1, input1); + r3_2 = _mm_add_epi32(r3_2, input2); + r3_3 = _mm_add_epi32(r3_3, input3); + r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); + + _mm_storeu_si128(output_mm + 0, r0_0); + _mm_storeu_si128(output_mm + 1, r0_1); + _mm_storeu_si128(output_mm + 2, r0_2); + _mm_storeu_si128(output_mm + 3, r0_3); + + _mm_storeu_si128(output_mm + 4, r1_0); + _mm_storeu_si128(output_mm + 5, r1_1); + _mm_storeu_si128(output_mm + 6, r1_2); + _mm_storeu_si128(output_mm + 7, r1_3); + + _mm_storeu_si128(output_mm + 8, r2_0); + _mm_storeu_si128(output_mm + 9, r2_1); + _mm_storeu_si128(output_mm + 10, r2_2); + _mm_storeu_si128(output_mm + 11, r2_3); + + _mm_storeu_si128(output_mm + 12, r3_0); + _mm_storeu_si128(output_mm + 13, r3_1); + _mm_storeu_si128(output_mm + 14, r3_2); + _mm_storeu_si128(output_mm + 15, r3_3); +} + +#endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE + +NAMESPACE_END diff --git a/chacha.cpp b/chacha.cpp index 504057f5..5ea16192 100644 --- a/chacha.cpp +++ b/chacha.cpp @@ -11,6 +11,12 @@ NAMESPACE_BEGIN(CryptoPP) +#if defined(CRYPTOPP_SSE2_INTRIN_AVAILABLE) +extern +void ChaCha_OperateKeystream_SSE2(KeystreamOperation operation, byte *output, + const word32 *state, size_t iterationCount, unsigned int rounds); +#endif + #define CHACHA_QUARTER_ROUND(a,b,c,d) \ a += b; d ^= a; d = rotlConstant<16,word32>(d); \ c += d; b ^= c; b = rotlConstant<12,word32>(b); \ @@ -24,6 +30,15 @@ void ChaCha_TestInstantiations() } #endif +std::string ChaCha_Policy::AlgorithmProvider() const +{ +#if CRYPTOPP_SSE2_INTRIN_AVAILABLE + if (HasSSE2()) + return "SSE2"; +#endif + return "C++"; +} + void ChaCha_Policy::CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length) { CRYPTOPP_UNUSED(params); @@ -58,19 +73,19 @@ void ChaCha_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, s void ChaCha_Policy::SeekToIteration(lword iterationCount) { - CRYPTOPP_UNUSED(iterationCount); - throw NotImplemented(std::string(ChaCha_Info::StaticAlgorithmName()) + ": SeekToIteration is not yet implemented"); - // TODO: these were Salsa20, and Wei re-arranged the state array for SSE2 operations. // If we can generate some out-of-band test vectors, then test and implement. Also // see the test vectors in salsa.txt and the use of Seek test argument. // m_state[8] = (word32)iterationCount; // m_state[5] = (word32)SafeRightShift<32>(iterationCount); + + m_state[13] = (word32)iterationCount; + m_state[12] = (word32)SafeRightShift<32>(iterationCount); } unsigned int ChaCha_Policy::GetAlignment() const { -#if CRYPTOPP_SSE2_ASM_AVAILABLE && 0 +#if CRYPTOPP_SSE2_INTRIN_AVAILABLE if (HasSSE2()) return 16; else @@ -80,7 +95,7 @@ unsigned int ChaCha_Policy::GetAlignment() const unsigned int ChaCha_Policy::GetOptimalBlockSize() const { -#if CRYPTOPP_SSE2_ASM_AVAILABLE && 0 +#if CRYPTOPP_SSE2_INTRIN_AVAILABLE if (HasSSE2()) return 4*BYTES_PER_ITERATION; else @@ -88,12 +103,34 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const return BYTES_PER_ITERATION; } -void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) +void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, + byte *output, const byte *input, size_t iterationCount) { - word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; +#if CRYPTOPP_SSE2_INTRIN_AVAILABLE + if (HasSSE2()) + { + while (iterationCount >= 4) + { + ChaCha_OperateKeystream_SSE2(operation, output, m_state, iterationCount, m_rounds); + + if ((operation & INPUT_NULL) != INPUT_NULL) + xorbuf(output, input, 4*BYTES_PER_ITERATION); + + m_state[12] += 4; + if (m_state[12] < 4) + m_state[13]++; + + input += 4*BYTES_PER_ITERATION; + output += 4*BYTES_PER_ITERATION; + iterationCount -= 4; + } + } +#endif while (iterationCount--) { + word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3]; x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7]; x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11]; @@ -112,7 +149,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, byte *output, CHACHA_QUARTER_ROUND(x3, x4, x9, x14); } - #undef CHACHA_OUTPUT +#ifndef CRYPTOPP_DOXYGEN_PROCESSING #define CHACHA_OUTPUT(x){\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\ @@ -131,12 +168,12 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, byte *output, CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);} -#ifndef CRYPTOPP_DOXYGEN_PROCESSING CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION); + #undef CHACHA_OUTPUT #endif - ++m_state[12]; - m_state[13] += static_cast(m_state[12] == 0); + if (++m_state[12] == 0) + m_state[13]++; } } diff --git a/chacha.h b/chacha.h index 1803059f..54cbc6a4 100644 --- a/chacha.h +++ b/chacha.h @@ -35,10 +35,15 @@ protected: void CipherSetKey(const NameValuePairs ¶ms, const byte *key, size_t length); void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount); void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length); - bool CipherIsRandomAccess() const {return false;} // TODO + bool CipherIsRandomAccess() const {return true;} void SeekToIteration(lword iterationCount); + +#if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) unsigned int GetAlignment() const; unsigned int GetOptimalBlockSize() const; +#endif + + std::string AlgorithmProvider() const; FixedSizeAlignedSecBlock m_state; int m_rounds; diff --git a/cryptest.nmake b/cryptest.nmake index 56ffeb92..5464f567 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -55,7 +55,7 @@ LIB_SRCS = \ algparam.cpp arc4.cpp aria-simd.cpp aria.cpp ariatab.cpp asn.cpp \ authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2-simd.cpp \ blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \ - cbcmac.cpp ccm.cpp chacha.cpp cham-simd.cpp cham.cpp channels.cpp \ + cbcmac.cpp ccm.cpp chacha-simd.cpp chacha.cpp cham-simd.cpp cham.cpp channels.cpp \ cmac.cpp crc-simd.cpp crc.cpp default.cpp des.cpp dessp.cpp dh.cpp \ dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp \ emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp \ @@ -83,7 +83,7 @@ LIB_OBJS = \ algparam.obj arc4.obj aria-simd.obj aria.obj ariatab.obj asn.obj \ authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2-simd.obj \ blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \ - cbcmac.obj ccm.obj chacha.obj cham-simd.obj cham.obj channels.obj \ + cbcmac.obj ccm.obj chacha-simd.obj chacha.obj cham-simd.obj cham.obj channels.obj \ cmac.obj crc-simd.obj crc.obj default.obj des.obj dessp.obj dh.obj \ dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj \ emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj \ diff --git a/cryptest.vcxproj.user b/cryptest.vcxproj.user index ceb23602..4eef76b6 100644 --- a/cryptest.vcxproj.user +++ b/cryptest.vcxproj.user @@ -1,6 +1,9 @@ - + - v + tv chacha - + + WindowsLocalDebugger + + \ No newline at end of file diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index 156fa30f..8794a5bb 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -191,6 +191,7 @@ + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index 2ddb498f..197bbc77 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -86,6 +86,9 @@ Source Files + + Source Files + Source Files diff --git a/salsa.cpp b/salsa.cpp index 46bcd225..6716d9fa 100644 --- a/salsa.cpp +++ b/salsa.cpp @@ -607,6 +607,7 @@ Salsa20_OperateKeystream ENDP QUARTER_ROUND(x3, x12, x9, x6) } +#ifndef CRYPTOPP_DOXYGEN_PROCESSING #define SALSA_OUTPUT(x) {\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\ @@ -625,8 +626,8 @@ Salsa20_OperateKeystream ENDP CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);} -#ifndef CRYPTOPP_DOXYGEN_PROCESSING CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION); + #undef SALSA_OUTPUT #endif if (++m_state[8] == 0) diff --git a/salsa.h b/salsa.h index f86bb61b..7765c919 100644 --- a/salsa.h +++ b/salsa.h @@ -41,6 +41,7 @@ protected: void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length); bool CipherIsRandomAccess() const {return true;} void SeekToIteration(lword iterationCount); + #if (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || CRYPTOPP_BOOL_X64) unsigned int GetAlignment() const; unsigned int GetOptimalBlockSize() const;