Add xorInput and xorOutput flags to adv-simd classes

Analysis tools generate findings when the pointer xorBlocks is used as the flag. The other missing piece is that xorBlocks is never NULL when either BT_XorInput or BT_XorOutput is in effect, and all callers follow that convention. We don't know how to teach the analyzers that invariant, so we make it explicit with the boolean flags xorInput and xorOutput.
Switching to the explicit flags costs about 0.01 cpb on a modern Intel Core processor, which is negligible in the typical case.
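
The change applies the same pattern to every AdvancedProcessBlocks* template in the file: derive the two booleans from xorBlocks and flags once, then branch on them instead of re-testing the pointer. A minimal, self-contained sketch of the idea — ProcessBlocksSketch is illustrative only, and the BT_XorInput value and nullptr stand in for the library's definitions and its NULLPTR macro; the whitening and cipher steps are elided:

    typedef unsigned int word32;
    static const word32 BT_XorInput = 4;   // stand-in for BlockTransformation::BT_XorInput

    void ProcessBlocksSketch(const unsigned char* xorBlocks, word32 flags)
    {
        // Old form: the pointer doubled as the flag.
        //   if (flags & BT_XorInput)                 { /* whiten the input  */ }
        //   if (xorBlocks && !(flags & BT_XorInput)) { /* whiten the output */ }

        // New form: state the intent once, up front.
        const bool xorInput  = (xorBlocks != nullptr) &&  (flags & BT_XorInput);
        const bool xorOutput = (xorBlocks != nullptr) && !(flags & BT_XorInput);

        if (xorInput)  { /* xor xorBlocks into the block before the cipher runs */ }
        /* ... encrypt or decrypt the block ... */
        if (xorOutput) { /* xor xorBlocks into the block after the cipher runs  */ }
    }

The xorOutput branch reproduces the old xorBlocks && !(flags & BT_XorInput) test, so behavior is unchanged; only the form the analyzers see differs.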
pull/574/head
Jeffrey Walton 2018-01-24 12:06:15 -05:00
parent e872c8dc3a
commit 85993b2529
1 changed file with 71 additions and 62 deletions

@ -64,18 +64,6 @@ CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter
CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)
// Coverity finding on xorBlocks. While not obvious, xorBlocks is
// always non-NULL when BT_XorInput is set. All callers follow the
// convention. Also see https://stackoverflow.com/q/33719379/608639.
inline word32 XorBlocksToFlags(const byte* xorBlocks, word32 flags)
{
#if defined(__COVERITY__)
return xorBlocks ? (flags) : (flags &= ~BT_XorInput);
#else
return CRYPTOPP_UNUSED(xorBlocks), flags;
#endif
}
ANONYMOUS_NAMESPACE_END
// *************************** ARM NEON ************************** //
@ -121,9 +109,12 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
const ptrdiff_t neonBlockSize = 16;
ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
ptrdiff_t xorIncrement = xorBlocks ? neonBlockSize : 0;
ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;
flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack
// Clang and Coverity are generating findings using xorBlocks as a flag.
const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
if (flags & BT_ReverseDirection)
{
@ -176,7 +167,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -194,7 +185,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -253,7 +244,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -263,7 +254,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -306,7 +297,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
const uint8x8_t v = vld1_u8(inBlocks);
block = vreinterpretq_u32_u8(vcombine_u8(v,v));
if (flags & BT_XorInput)
if (xorInput)
{
const uint8x8_t x = vld1_u8(xorBlocks);
block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
@ -317,7 +308,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
const uint8x8_t x = vld1_u8(xorBlocks);
block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
@ -350,9 +341,12 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
// const ptrdiff_t neonBlockSize = 16;
ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0;
ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack
// Clang and Coverity are generating findings using xorBlocks as a flag.
const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
if (flags & BT_ReverseDirection)
{
@ -398,7 +392,7 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -416,7 +410,7 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -454,7 +448,7 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
uint64x2_t block;
block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
if (flags & BT_XorInput)
if (xorInput)
block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
if (flags & BT_InBlockIsCounter)
@ -462,7 +456,7 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
func1(block, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
@ -490,9 +484,12 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
// const ptrdiff_t neonBlockSize = 16;
ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0;
ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack
// Clang and Coverity are generating findings using xorBlocks as a flag.
const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
if (flags & BT_ReverseDirection)
{
@ -538,7 +535,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -556,7 +553,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -608,7 +605,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -618,7 +615,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
xorBlocks += xorIncrement;
@ -640,7 +637,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
uint64x2_t block, zero = {0,0};
block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
if (flags & BT_XorInput)
if (xorInput)
block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
if (flags & BT_InBlockIsCounter)
@ -648,7 +645,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
@ -726,9 +723,12 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
const ptrdiff_t xmmBlockSize = 16;
ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
ptrdiff_t xorIncrement = xorBlocks ? xmmBlockSize : 0;
ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack
// Clang and Coverity are generating findings using xorBlocks as a flag.
const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
if (flags & BT_ReverseDirection)
{
@ -781,7 +781,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -799,7 +799,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -858,7 +858,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -868,7 +868,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -911,7 +911,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
// UBsan false positive; mem_addr can be unaligned.
_mm_load_sd(CONST_DOUBLE_CAST(inBlocks)));
if (flags & BT_XorInput)
if (xorInput)
{
block = _mm_xor_si128(block, _mm_castpd_si128(
// UBsan false positive; mem_addr can be unaligned.
@ -923,7 +923,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block = _mm_xor_si128(block, _mm_castpd_si128(
// UBsan false positive; mem_addr can be unaligned.
@ -957,9 +957,12 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
// const ptrdiff_t xmmBlockSize = 16;
ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0;
ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack
// Clang and Coverity are generating findings using xorBlocks as a flag.
const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
if (flags & BT_ReverseDirection)
{
@ -1003,7 +1006,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -1021,7 +1024,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -1071,7 +1074,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -1081,7 +1084,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -1103,7 +1106,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
__m128i block, zero = _mm_setzero_si128();
block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
if (flags & BT_XorInput)
if (xorInput)
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
if (flags & BT_InBlockIsCounter)
@ -1111,7 +1114,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
_mm_storeu_si128(M128_CAST(outBlocks), block);
@ -1139,9 +1142,12 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
// const ptrdiff_t xmmBlockSize = 16;
ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0;
ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack
// Clang and Coverity are generating findings using xorBlocks as a flag.
const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
if (flags & BT_ReverseDirection)
{
@ -1179,7 +1185,7 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -1193,7 +1199,7 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
xorBlocks += xorIncrement;
@ -1222,7 +1228,7 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
{
__m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
if (flags & BT_XorInput)
if (xorInput)
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
if (flags & BT_InBlockIsCounter)
@ -1230,7 +1236,7 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
func1(block, subKeys, static_cast<unsigned int>(rounds));
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
_mm_storeu_si128(M128_CAST(outBlocks), block);
@ -1279,9 +1285,12 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
// const ptrdiff_t vexBlockSize = 16;
ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0;
ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack
// Clang and Coverity are generating findings using xorBlocks as a flag.
const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
if (flags & BT_ReverseDirection)
{
@ -1326,7 +1335,7 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
inBlocks += inIncrement;
}
if (flags & BT_XorInput)
if (xorInput)
{
block0 = VectorXor(block0, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
@ -1344,7 +1353,7 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
{
block0 = VectorXor(block0, VectorLoad(xorBlocks));
xorBlocks += xorIncrement;
@ -1381,7 +1390,7 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
{
uint32x4_p block = VectorLoad(inBlocks);
if (flags & BT_XorInput)
if (xorInput)
block = VectorXor(block, VectorLoad(xorBlocks));
if (flags & BT_InBlockIsCounter)
@ -1389,7 +1398,7 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su
func1(block, subKeys, rounds);
if (xorBlocks && !(flags & BT_XorInput))
if (xorOutput)
block = VectorXor(block, VectorLoad(xorBlocks));
VectorStore(block, outBlocks);