Work-around potential counter increment problem in ChaCha20 (GH #732)

This is only a work-around for the moment. The issue only affects SIMD code. The problem is, the algorithm we use performs a 32-bit add as an intermediate result, but we really need a 64-bit add. We are running 4 transforms in parallel, and we can't add and carry the way we need to.

The workaround is, whenever we could cross the 32-bit counter boundary we use the C version of the transform. We determine the cross-over point by 'bool safe = 0xffffffff - state.low > 4'. When not safe we skip the SIMD version of the algorithm and use the C version. Once we are safe again we use the SIMD version again.

The work-around costs us about 0.1 to 0.2 cpb. At 1.10 or 1.15 cpb that equates to about 200 MB/s on a Skylake. We'd like to get it back eventually.
pull/737/head
Jeffrey Walton 2018-11-04 14:49:26 -05:00
parent d7a3562c0b
commit 29be6ed97a
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 112 additions and 91 deletions

View File

@ -1,6 +1,7 @@
// chacha.cpp - written and placed in the public domain by Jeffrey Walton.
// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha
// family implementation at http://cr.yp.to/chacha.html.
// Based on Wei Dai's Salsa20, Botan's SSE2 implementation,
// and Bernstein's reference ChaCha family implementation at
// http://cr.yp.to/chacha.html.
#include "pch.h"
#include "config.h"
@ -29,6 +30,24 @@ extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* inpu
a += b; d ^= a; d = rotlConstant<8,word32>(d); \
c += d; b ^= c; b = rotlConstant<7,word32>(b);
#define CHACHA_OUTPUT(x){\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
void ChaCha_TestInstantiations()
{
@ -92,8 +111,8 @@ void ChaCha_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, s
void ChaCha_Policy::SeekToIteration(lword iterationCount)
{
m_state[13] = (word32)iterationCount;
m_state[12] = (word32)SafeRightShift<32>(iterationCount);
m_state[12] = (word32)iterationCount; // low word
m_state[13] = (word32)SafeRightShift<32>(iterationCount);
}
unsigned int ChaCha_Policy::GetAlignment() const
@ -102,6 +121,10 @@ unsigned int ChaCha_Policy::GetAlignment() const
if (HasSSE2())
return 16;
else
#elif (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
return 16;
else
#endif
return GetAlignmentOf<word32>();
}
@ -126,117 +149,107 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
return BYTES_PER_ITERATION;
}
bool ChaCha_Policy::MultiBlockSafe() const
{
const word32 c = m_state[12];
return 0xffffffff - c > 4;
}
// OperateKeystream always produces a key stream. The key stream is written
// to output. Optionally a message may be supplied to xor with the key stream.
// The message is input, and output = output ^ input.
void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
byte *output, const byte *input, size_t iterationCount)
{
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
if (HasSSE2())
do
{
while (iterationCount >= 4)
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
if (HasSSE2())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
while (iterationCount >= 4 && MultiBlockSafe())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
m_state[12] += 4;
if (m_state[12] < 4)
m_state[13]++;
m_state[12] += 4;
if (m_state[12] < 4)
m_state[13]++;
input += (!!xorInput)*4*BYTES_PER_ITERATION;
output += 4*BYTES_PER_ITERATION;
iterationCount -= 4;
input += (!!xorInput)*4*BYTES_PER_ITERATION;
output += 4*BYTES_PER_ITERATION;
iterationCount -= 4;
}
}
}
#endif
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
if (HasNEON())
{
while (iterationCount >= 4)
if (HasNEON())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
while (iterationCount >= 4 && MultiBlockSafe())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
m_state[12] += 4;
if (m_state[12] < 4)
m_state[13]++;
m_state[12] += 4;
if (m_state[12] < 4)
m_state[13]++;
input += (!!xorInput)*4*BYTES_PER_ITERATION;
output += 4*BYTES_PER_ITERATION;
iterationCount -= 4;
input += (!!xorInput)*4*BYTES_PER_ITERATION;
output += 4*BYTES_PER_ITERATION;
iterationCount -= 4;
}
}
}
#endif
#if (CRYPTOPP_POWER8_AVAILABLE)
if (HasPower8())
{
while (iterationCount >= 4)
if (HasPower8())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
while (iterationCount >= 4 && MultiBlockSafe())
{
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
m_state[12] += 4;
if (m_state[12] < 4)
m_state[12] += 4;
if (m_state[12] < 4)
m_state[13]++;
input += (!!xorInput)*4*BYTES_PER_ITERATION;
output += 4*BYTES_PER_ITERATION;
iterationCount -= 4;
}
}
#endif
if (iterationCount)
{
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
for (int i = static_cast<int>(m_rounds); i > 0; i -= 2)
{
CHACHA_QUARTER_ROUND(x0, x4, x8, x12);
CHACHA_QUARTER_ROUND(x1, x5, x9, x13);
CHACHA_QUARTER_ROUND(x2, x6, x10, x14);
CHACHA_QUARTER_ROUND(x3, x7, x11, x15);
CHACHA_QUARTER_ROUND(x0, x5, x10, x15);
CHACHA_QUARTER_ROUND(x1, x6, x11, x12);
CHACHA_QUARTER_ROUND(x2, x7, x8, x13);
CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
}
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
if (++m_state[12] == 0)
m_state[13]++;
input += (!!xorInput)*4*BYTES_PER_ITERATION;
output += 4*BYTES_PER_ITERATION;
iterationCount -= 4;
}
}
#endif
while (iterationCount--)
{
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
for (int i = static_cast<int>(m_rounds); i > 0; i -= 2)
{
CHACHA_QUARTER_ROUND(x0, x4, x8, x12);
CHACHA_QUARTER_ROUND(x1, x5, x9, x13);
CHACHA_QUARTER_ROUND(x2, x6, x10, x14);
CHACHA_QUARTER_ROUND(x3, x7, x11, x15);
CHACHA_QUARTER_ROUND(x0, x5, x10, x15);
CHACHA_QUARTER_ROUND(x1, x6, x11, x12);
CHACHA_QUARTER_ROUND(x2, x7, x8, x13);
CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
}
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
#define CHACHA_OUTPUT(x){\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
#undef CHACHA_OUTPUT
#endif
if (++m_state[12] == 0)
m_state[13]++;
}
// We may re-enter a SIMD keystream operation from here.
} while (iterationCount--);
}
NAMESPACE_END

View File

@ -1,6 +1,7 @@
// chacha.h - written and placed in the public domain by Jeffrey Walton.
// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha
// family implementation at http://cr.yp.to/chacha.html.
// Based on Wei Dai's Salsa20, Botan's SSE2 implementation,
// and Bernstein's reference ChaCha family implementation at
// http://cr.yp.to/chacha.html.
/// \file chacha.h
/// \brief Classes for ChaCha8, ChaCha12 and ChaCha20 stream ciphers
@ -50,6 +51,13 @@ protected:
std::string AlgorithmName() const;
std::string AlgorithmProvider() const;
// MultiBlockSafe detects a condition that can arise in the SIMD
// implementations where we overflow one of the 32-bit state words
// during addition in an intermediate result. Conditions to trigger
// issue include a user seeks to around 2^32 blocks (256 GB of data).
// https://github.com/weidai11/cryptopp/issues/732
bool MultiBlockSafe() const;
FixedSizeAlignedSecBlock<word32, 16> m_state;
int m_rounds;
};