Work-around potential counter increment problem in ChaCha20 (GH #732)

This is only a work-around for the moment. The issue only affects SIMD code. The problem is, the algorithm we use performs a 32-bit add as an intermediate result, but we really need a 64-bit add. We are running 4 transforms in parallel, and we can't add and carry the way we need to.

The workaround is, whenever we could cross the 32-bit counter boundary we use the C version of the transform. We determine the cross-over point by 'bool safe = 0xffffffff - state.low > 4'. When not safe we skip the SIMD version of the algorithm and use the C version. Once we are safe again we use the SIMD version again.

The work-around costs us about 0.1 to 0.2 cpb. At 1.10 or 1.15 cpb that equates to about 200 MB/s on a Skylake. We'd like to get it back eventually.
pull/737/head
Jeffrey Walton 2018-11-04 14:49:26 -05:00
parent d7a3562c0b
commit 29be6ed97a
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 112 additions and 91 deletions

View File

@ -1,6 +1,7 @@
// chacha.cpp - written and placed in the public domain by Jeffrey Walton.
// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha
// family implementation at http://cr.yp.to/chacha.html.
// Based on Wei Dai's Salsa20, Botan's SSE2 implementation,
// and Bernstein's reference ChaCha family implementation at
// http://cr.yp.to/chacha.html.
#include "pch.h"
#include "config.h"
@ -29,6 +30,24 @@ extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* inpu
a += b; d ^= a; d = rotlConstant<8,word32>(d); \
c += d; b ^= c; b = rotlConstant<7,word32>(b);
#define CHACHA_OUTPUT(x){\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
void ChaCha_TestInstantiations()
{
@ -92,8 +111,8 @@ void ChaCha_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, s
void ChaCha_Policy::SeekToIteration(lword iterationCount)
{
m_state[13] = (word32)iterationCount;
m_state[12] = (word32)SafeRightShift<32>(iterationCount);
m_state[12] = (word32)iterationCount; // low word
m_state[13] = (word32)SafeRightShift<32>(iterationCount);
}
unsigned int ChaCha_Policy::GetAlignment() const
@ -102,6 +121,10 @@ unsigned int ChaCha_Policy::GetAlignment() const
if (HasSSE2())
return 16;
else
#elif (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return 16;
else
#endif
return GetAlignmentOf<word32>();
}
@ -126,16 +149,24 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
return BYTES_PER_ITERATION;
}
bool ChaCha_Policy::MultiBlockSafe() const
{
const word32 c = m_state[12];
return 0xffffffff - c > 4;
}
// OperateKeystream always produces a key stream. The key stream is written
// to output. Optionally a message may be supplied to xor with the key stream.
// The message is input, and output = output ^ input.
void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
byte *output, const byte *input, size_t iterationCount)
{
do
{
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
if (HasSSE2())
{
while (iterationCount >= 4)
while (iterationCount >= 4 && MultiBlockSafe())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
@ -154,7 +185,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
{
while (iterationCount >= 4)
while (iterationCount >= 4 && MultiBlockSafe())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
@ -173,7 +204,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
{
while (iterationCount >= 4)
while (iterationCount >= 4 && MultiBlockSafe())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
@ -189,7 +220,7 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
}
#endif
while (iterationCount--)
if (iterationCount)
{
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
@ -211,32 +242,14 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
}
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
#define CHACHA_OUTPUT(x){\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
#undef CHACHA_OUTPUT
#endif
if (++m_state[12] == 0)
m_state[13]++;
}
// We may re-enter a SIMD keystream operation from here.
} while (iterationCount--);
}
NAMESPACE_END

View File

@ -1,6 +1,7 @@
// chacha.h - written and placed in the public domain by Jeffrey Walton.
// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha
// family implementation at http://cr.yp.to/chacha.html.
// Based on Wei Dai's Salsa20, Botan's SSE2 implementation,
// and Bernstein's reference ChaCha family implementation at
// http://cr.yp.to/chacha.html.
/// \file chacha.h
/// \brief Classes for ChaCha8, ChaCha12 and ChaCha20 stream ciphers
@ -50,6 +51,13 @@ protected:
std::string AlgorithmName() const;
std::string AlgorithmProvider() const;
// MultiBlockSafe detects a condition that can arise in the SIMD
// implementations where we overflow one of the 32-bit state words
// during addition in an intermediate result. Conditions to trigger
// issue include a user seeks to around 2^32 blocks (256 GB of data).
// https://github.com/weidai11/cryptopp/issues/732
bool MultiBlockSafe() const;
FixedSizeAlignedSecBlock<word32, 16> m_state;
int m_rounds;
};