diff --git a/chacha.cpp b/chacha.cpp index 1de1e89e..6dac12a1 100644 --- a/chacha.cpp +++ b/chacha.cpp @@ -1,6 +1,7 @@ // chacha.cpp - written and placed in the public domain by Jeffrey Walton. -// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha -// family implementation at http://cr.yp.to/chacha.html. +// Based on Wei Dai's Salsa20, Botan's SSE2 implementation, +// and Bernstein's reference ChaCha family implementation at +// http://cr.yp.to/chacha.html. #include "pch.h" #include "config.h" @@ -29,6 +30,24 @@ extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* inpu a += b; d ^= a; d = rotlConstant<8,word32>(d); \ c += d; b ^= c; b = rotlConstant<7,word32>(b); +#define CHACHA_OUTPUT(x){\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\ + CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);} + #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING) void ChaCha_TestInstantiations() { @@ -92,8 +111,8 @@ void ChaCha_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, s void ChaCha_Policy::SeekToIteration(lword iterationCount) { - m_state[13] = (word32)iterationCount; - m_state[12] = (word32)SafeRightShift<32>(iterationCount); + m_state[12] = (word32)iterationCount; // low word + m_state[13] = (word32)SafeRightShift<32>(iterationCount); } unsigned int ChaCha_Policy::GetAlignment() const @@ -102,6 +121,10 @@ unsigned int ChaCha_Policy::GetAlignment() const if (HasSSE2()) return 16; else +#elif (CRYPTOPP_POWER8_AVAILABLE) + if (HasPower8()) + return 16; + else #endif return GetAlignmentOf(); } @@ -126,117 +149,107 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const return BYTES_PER_ITERATION; } +bool ChaCha_Policy::MultiBlockSafe() const +{ + const word32 c = m_state[12]; + return 0xffffffff - c > 4; +} + // OperateKeystream always produces a key stream. The key stream is written // to output. Optionally a message may be supplied to xor with the key stream. // The message is input, and output = output ^ input. void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { -#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) - if (HasSSE2()) + do { - while (iterationCount >= 4) +#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) + if (HasSSE2()) { - const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; - ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds); + while (iterationCount >= 4 && MultiBlockSafe()) + { + const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; + ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds); - m_state[12] += 4; - if (m_state[12] < 4) - m_state[13]++; + m_state[12] += 4; + if (m_state[12] < 4) + m_state[13]++; - input += (!!xorInput)*4*BYTES_PER_ITERATION; - output += 4*BYTES_PER_ITERATION; - iterationCount -= 4; + input += (!!xorInput)*4*BYTES_PER_ITERATION; + output += 4*BYTES_PER_ITERATION; + iterationCount -= 4; + } } - } #endif #if (CRYPTOPP_ARM_NEON_AVAILABLE) - if (HasNEON()) - { - while (iterationCount >= 4) + if (HasNEON()) { - const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; - ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds); + while (iterationCount >= 4 && MultiBlockSafe()) + { + const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; + ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds); - m_state[12] += 4; - if (m_state[12] < 4) - m_state[13]++; + m_state[12] += 4; + if (m_state[12] < 4) + m_state[13]++; - input += (!!xorInput)*4*BYTES_PER_ITERATION; - output += 4*BYTES_PER_ITERATION; - iterationCount -= 4; + input += (!!xorInput)*4*BYTES_PER_ITERATION; + output += 4*BYTES_PER_ITERATION; + iterationCount -= 4; + } } - } #endif #if (CRYPTOPP_POWER8_AVAILABLE) - if (HasPower8()) - { - while (iterationCount >= 4) + if (HasPower8()) { - const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; - ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds); + while (iterationCount >= 4 && MultiBlockSafe()) + { + const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; + ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds); - m_state[12] += 4; - if (m_state[12] < 4) + m_state[12] += 4; + if (m_state[12] < 4) + m_state[13]++; + + input += (!!xorInput)*4*BYTES_PER_ITERATION; + output += 4*BYTES_PER_ITERATION; + iterationCount -= 4; + } + } +#endif + + if (iterationCount) + { + word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + + x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3]; + x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7]; + x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11]; + x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15]; + + for (int i = static_cast(m_rounds); i > 0; i -= 2) + { + CHACHA_QUARTER_ROUND(x0, x4, x8, x12); + CHACHA_QUARTER_ROUND(x1, x5, x9, x13); + CHACHA_QUARTER_ROUND(x2, x6, x10, x14); + CHACHA_QUARTER_ROUND(x3, x7, x11, x15); + + CHACHA_QUARTER_ROUND(x0, x5, x10, x15); + CHACHA_QUARTER_ROUND(x1, x6, x11, x12); + CHACHA_QUARTER_ROUND(x2, x7, x8, x13); + CHACHA_QUARTER_ROUND(x3, x4, x9, x14); + } + + CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION); + + if (++m_state[12] == 0) m_state[13]++; - - input += (!!xorInput)*4*BYTES_PER_ITERATION; - output += 4*BYTES_PER_ITERATION; - iterationCount -= 4; - } - } -#endif - - while (iterationCount--) - { - word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3]; - x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7]; - x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11]; - x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15]; - - for (int i = static_cast(m_rounds); i > 0; i -= 2) - { - CHACHA_QUARTER_ROUND(x0, x4, x8, x12); - CHACHA_QUARTER_ROUND(x1, x5, x9, x13); - CHACHA_QUARTER_ROUND(x2, x6, x10, x14); - CHACHA_QUARTER_ROUND(x3, x7, x11, x15); - - CHACHA_QUARTER_ROUND(x0, x5, x10, x15); - CHACHA_QUARTER_ROUND(x1, x6, x11, x12); - CHACHA_QUARTER_ROUND(x2, x7, x8, x13); - CHACHA_QUARTER_ROUND(x3, x4, x9, x14); } -#ifndef CRYPTOPP_DOXYGEN_PROCESSING - #define CHACHA_OUTPUT(x){\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\ - CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);} - - CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION); - #undef CHACHA_OUTPUT -#endif - - if (++m_state[12] == 0) - m_state[13]++; - } + // We may re-enter a SIMD keystream operation from here. + } while (iterationCount--); } NAMESPACE_END diff --git a/chacha.h b/chacha.h index 50d33d09..b0a762b3 100644 --- a/chacha.h +++ b/chacha.h @@ -1,6 +1,7 @@ // chacha.h - written and placed in the public domain by Jeffrey Walton. -// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha -// family implementation at http://cr.yp.to/chacha.html. +// Based on Wei Dai's Salsa20, Botan's SSE2 implementation, +// and Bernstein's reference ChaCha family implementation at +// http://cr.yp.to/chacha.html. /// \file chacha.h /// \brief Classes for ChaCha8, ChaCha12 and ChaCha20 stream ciphers @@ -50,6 +51,13 @@ protected: std::string AlgorithmName() const; std::string AlgorithmProvider() const; + // MultiBlockSafe detects a condition that can arise in the SIMD + // implementations where we overflow one of the 32-bit state words + // during addition in an intermediate result. Conditions to trigger + // issue include a user seeks to around 2^32 blocks (256 GB of data). + // https://github.com/weidai11/cryptopp/issues/732 + bool MultiBlockSafe() const; + FixedSizeAlignedSecBlock m_state; int m_rounds; };