Work-around potential counter increment problem in ChaCha20 (GH #732)
This is only a work-around for the moment. The issue only affects SIMD code. The problem is, the algorithm we use performs a 32-bit add as an intermediate result, but we really need a 64-bit add. We are running 4 transforms in parallel, and we can't add and carry the way we need to. The workaround is, whenever we could cross the 32-bit counter boundary we use the C version of the transform. We determine the cross-over point by 'bool safe = 0xffffffff - state.low > 4'. When not safe we skip the SIMD version of the algorithm and use the C version. Once we are safe again we use the SIMD version again. The work-around costs us about 0.1 to 0.2 cpb. At 1.10 or 1.15 cpb that equates to about 200 MB/s on a Skylake. We'd like to get it back eventually.pull/737/head
parent
d7a3562c0b
commit
29be6ed97a
191
chacha.cpp
191
chacha.cpp
|
|
@ -1,6 +1,7 @@
|
|||
// chacha.cpp - written and placed in the public domain by Jeffrey Walton.
|
||||
// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha
|
||||
// family implementation at http://cr.yp.to/chacha.html.
|
||||
// Based on Wei Dai's Salsa20, Botan's SSE2 implementation,
|
||||
// and Bernstein's reference ChaCha family implementation at
|
||||
// http://cr.yp.to/chacha.html.
|
||||
|
||||
#include "pch.h"
|
||||
#include "config.h"
|
||||
|
|
@ -29,6 +30,24 @@ extern void ChaCha_OperateKeystream_POWER8(const word32 *state, const byte* inpu
|
|||
a += b; d ^= a; d = rotlConstant<8,word32>(d); \
|
||||
c += d; b ^= c; b = rotlConstant<7,word32>(b);
|
||||
|
||||
#define CHACHA_OUTPUT(x){\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
|
||||
|
||||
#if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
|
||||
void ChaCha_TestInstantiations()
|
||||
{
|
||||
|
|
@ -92,8 +111,8 @@ void ChaCha_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, s
|
|||
|
||||
void ChaCha_Policy::SeekToIteration(lword iterationCount)
|
||||
{
|
||||
m_state[13] = (word32)iterationCount;
|
||||
m_state[12] = (word32)SafeRightShift<32>(iterationCount);
|
||||
m_state[12] = (word32)iterationCount; // low word
|
||||
m_state[13] = (word32)SafeRightShift<32>(iterationCount);
|
||||
}
|
||||
|
||||
unsigned int ChaCha_Policy::GetAlignment() const
|
||||
|
|
@ -102,6 +121,10 @@ unsigned int ChaCha_Policy::GetAlignment() const
|
|||
if (HasSSE2())
|
||||
return 16;
|
||||
else
|
||||
#elif (CRYPTOPP_POWER8_AVAILABLE)
|
||||
if (HasPower8())
|
||||
return 16;
|
||||
else
|
||||
#endif
|
||||
return GetAlignmentOf<word32>();
|
||||
}
|
||||
|
|
@ -126,117 +149,107 @@ unsigned int ChaCha_Policy::GetOptimalBlockSize() const
|
|||
return BYTES_PER_ITERATION;
|
||||
}
|
||||
|
||||
bool ChaCha_Policy::MultiBlockSafe() const
|
||||
{
|
||||
const word32 c = m_state[12];
|
||||
return 0xffffffff - c > 4;
|
||||
}
|
||||
|
||||
// OperateKeystream always produces a key stream. The key stream is written
|
||||
// to output. Optionally a message may be supplied to xor with the key stream.
|
||||
// The message is input, and output = output ^ input.
|
||||
void ChaCha_Policy::OperateKeystream(KeystreamOperation operation,
|
||||
byte *output, const byte *input, size_t iterationCount)
|
||||
{
|
||||
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
||||
if (HasSSE2())
|
||||
do
|
||||
{
|
||||
while (iterationCount >= 4)
|
||||
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
|
||||
if (HasSSE2())
|
||||
{
|
||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||
ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||
while (iterationCount >= 4 && MultiBlockSafe())
|
||||
{
|
||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||
ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||
|
||||
m_state[12] += 4;
|
||||
if (m_state[12] < 4)
|
||||
m_state[13]++;
|
||||
m_state[12] += 4;
|
||||
if (m_state[12] < 4)
|
||||
m_state[13]++;
|
||||
|
||||
input += (!!xorInput)*4*BYTES_PER_ITERATION;
|
||||
output += 4*BYTES_PER_ITERATION;
|
||||
iterationCount -= 4;
|
||||
input += (!!xorInput)*4*BYTES_PER_ITERATION;
|
||||
output += 4*BYTES_PER_ITERATION;
|
||||
iterationCount -= 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_ARM_NEON_AVAILABLE)
|
||||
if (HasNEON())
|
||||
{
|
||||
while (iterationCount >= 4)
|
||||
if (HasNEON())
|
||||
{
|
||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||
ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||
while (iterationCount >= 4 && MultiBlockSafe())
|
||||
{
|
||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||
ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||
|
||||
m_state[12] += 4;
|
||||
if (m_state[12] < 4)
|
||||
m_state[13]++;
|
||||
m_state[12] += 4;
|
||||
if (m_state[12] < 4)
|
||||
m_state[13]++;
|
||||
|
||||
input += (!!xorInput)*4*BYTES_PER_ITERATION;
|
||||
output += 4*BYTES_PER_ITERATION;
|
||||
iterationCount -= 4;
|
||||
input += (!!xorInput)*4*BYTES_PER_ITERATION;
|
||||
output += 4*BYTES_PER_ITERATION;
|
||||
iterationCount -= 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (CRYPTOPP_POWER8_AVAILABLE)
|
||||
if (HasPower8())
|
||||
{
|
||||
while (iterationCount >= 4)
|
||||
if (HasPower8())
|
||||
{
|
||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||
while (iterationCount >= 4 && MultiBlockSafe())
|
||||
{
|
||||
const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
|
||||
ChaCha_OperateKeystream_POWER8(m_state, xorInput ? input : NULLPTR, output, m_rounds);
|
||||
|
||||
m_state[12] += 4;
|
||||
if (m_state[12] < 4)
|
||||
m_state[12] += 4;
|
||||
if (m_state[12] < 4)
|
||||
m_state[13]++;
|
||||
|
||||
input += (!!xorInput)*4*BYTES_PER_ITERATION;
|
||||
output += 4*BYTES_PER_ITERATION;
|
||||
iterationCount -= 4;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (iterationCount)
|
||||
{
|
||||
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
|
||||
x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
|
||||
x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
|
||||
x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
|
||||
x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
|
||||
|
||||
for (int i = static_cast<int>(m_rounds); i > 0; i -= 2)
|
||||
{
|
||||
CHACHA_QUARTER_ROUND(x0, x4, x8, x12);
|
||||
CHACHA_QUARTER_ROUND(x1, x5, x9, x13);
|
||||
CHACHA_QUARTER_ROUND(x2, x6, x10, x14);
|
||||
CHACHA_QUARTER_ROUND(x3, x7, x11, x15);
|
||||
|
||||
CHACHA_QUARTER_ROUND(x0, x5, x10, x15);
|
||||
CHACHA_QUARTER_ROUND(x1, x6, x11, x12);
|
||||
CHACHA_QUARTER_ROUND(x2, x7, x8, x13);
|
||||
CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
|
||||
}
|
||||
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
|
||||
|
||||
if (++m_state[12] == 0)
|
||||
m_state[13]++;
|
||||
|
||||
input += (!!xorInput)*4*BYTES_PER_ITERATION;
|
||||
output += 4*BYTES_PER_ITERATION;
|
||||
iterationCount -= 4;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
while (iterationCount--)
|
||||
{
|
||||
word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
|
||||
x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
|
||||
x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
|
||||
x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
|
||||
x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
|
||||
|
||||
for (int i = static_cast<int>(m_rounds); i > 0; i -= 2)
|
||||
{
|
||||
CHACHA_QUARTER_ROUND(x0, x4, x8, x12);
|
||||
CHACHA_QUARTER_ROUND(x1, x5, x9, x13);
|
||||
CHACHA_QUARTER_ROUND(x2, x6, x10, x14);
|
||||
CHACHA_QUARTER_ROUND(x3, x7, x11, x15);
|
||||
|
||||
CHACHA_QUARTER_ROUND(x0, x5, x10, x15);
|
||||
CHACHA_QUARTER_ROUND(x1, x6, x11, x12);
|
||||
CHACHA_QUARTER_ROUND(x2, x7, x8, x13);
|
||||
CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
|
||||
}
|
||||
|
||||
#ifndef CRYPTOPP_DOXYGEN_PROCESSING
|
||||
#define CHACHA_OUTPUT(x){\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
|
||||
|
||||
CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
|
||||
#undef CHACHA_OUTPUT
|
||||
#endif
|
||||
|
||||
if (++m_state[12] == 0)
|
||||
m_state[13]++;
|
||||
}
|
||||
// We may re-enter a SIMD keystream operation from here.
|
||||
} while (iterationCount--);
|
||||
}
|
||||
|
||||
NAMESPACE_END
|
||||
|
|
|
|||
12
chacha.h
12
chacha.h
|
|
@ -1,6 +1,7 @@
|
|||
// chacha.h - written and placed in the public domain by Jeffrey Walton.
|
||||
// Based on Wei Dai's Salsa20 and Bernstein's reference ChaCha
|
||||
// family implementation at http://cr.yp.to/chacha.html.
|
||||
// Based on Wei Dai's Salsa20, Botan's SSE2 implementation,
|
||||
// and Bernstein's reference ChaCha family implementation at
|
||||
// http://cr.yp.to/chacha.html.
|
||||
|
||||
/// \file chacha.h
|
||||
/// \brief Classes for ChaCha8, ChaCha12 and ChaCha20 stream ciphers
|
||||
|
|
@ -50,6 +51,13 @@ protected:
|
|||
std::string AlgorithmName() const;
|
||||
std::string AlgorithmProvider() const;
|
||||
|
||||
// MultiBlockSafe detects a condition that can arise in the SIMD
|
||||
// implementations where we overflow one of the 32-bit state words
|
||||
// during addition in an intermediate result. Conditions to trigger
|
||||
// issue include a user seeks to around 2^32 blocks (256 GB of data).
|
||||
// https://github.com/weidai11/cryptopp/issues/732
|
||||
bool MultiBlockSafe() const;
|
||||
|
||||
FixedSizeAlignedSecBlock<word32, 16> m_state;
|
||||
int m_rounds;
|
||||
};
|
||||
|
|
|
|||
Loading…
Reference in New Issue