From 85993b2529a22ba58cf6ceb0ef178724a1fe5262 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 24 Jan 2018 12:06:15 -0500 Subject: [PATCH] Add xorInput and xorOutput flags to adv-simd classes Analysis tools are generating findings when the pointer xorBlocks is used as the flag. The other missing piece is, xorBlocks is never NULL when either BT_XorInput or BT_XorOutput. But we don't know how to train the analyzers with the information, so we make it explicit with the boolean flags xorInput and xorOutput. Switching to the explicit flags costs us about 0.01 cpb on a modern Intel Core processor. In the typical case 0.01 is negligible. --- adv-simd.h | 133 ++++++++++++++++++++++++++++------------------- 1 file changed, 71 insertions(+), 62 deletions(-) diff --git a/adv-simd.h b/adv-simd.h index c9632b0a..7bd98e6a 100644 --- a/adv-simd.h +++ b/adv-simd.h @@ -64,18 +64,6 @@ CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection) CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers) -// Coverity finding on xorBlocks. While not obvious, xorBlocks is -// always non-NULL when BT_XorInput is set. All callers follow the -// convention. Also see https://stackoverflow.com/q/33719379/608639. -inline word32 XorBlocksToFlags(const byte* xorBlocks, word32 flags) -{ -#if defined(__COVERITY__) - return xorBlocks ? (flags) : (flags &= ~BT_XorInput); -#else - return CRYPTOPP_UNUSED(xorBlocks), flags; -#endif -} - ANONYMOUS_NAMESPACE_END // *************************** ARM NEON ************************** // @@ -121,9 +109,12 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, const ptrdiff_t neonBlockSize = 16; ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize; - ptrdiff_t xorIncrement = xorBlocks ? 
neonBlockSize : 0; + ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0; ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize; - flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack + + // Clang and Coverity are generating findings using xorBlocks as a flag. + const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); + const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); if (flags & BT_ReverseDirection) { @@ -176,7 +167,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -194,7 +185,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -253,7 +244,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -263,7 +254,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, func2(block0, block1, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -306,7 +297,7 @@ inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, const uint8x8_t v = vld1_u8(inBlocks); block = vreinterpretq_u32_u8(vcombine_u8(v,v)); - if (flags & BT_XorInput) + if (xorInput) { const uint8x8_t x = vld1_u8(xorBlocks); block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x))); @@ -317,7 +308,7 @@ inline 
size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6, func2(block, zero, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { const uint8x8_t x = vld1_u8(xorBlocks); block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x))); @@ -350,9 +341,12 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6, // const ptrdiff_t neonBlockSize = 16; ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize; - ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0; + ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0; ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize; - flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack + + // Clang and Coverity are generating findings using xorBlocks as a flag. + const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); + const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); if (flags & BT_ReverseDirection) { @@ -398,7 +392,7 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -416,7 +410,7 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6, func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -454,7 +448,7 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6, uint64x2_t block; block = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); - if (flags & BT_XorInput) + if (xorInput) block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); if (flags & BT_InBlockIsCounter) @@ -462,7 +456,7 @@ size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6, func1(block, subKeys, 
static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); vst1q_u8(outBlocks, vreinterpretq_u8_u64(block)); @@ -490,9 +484,12 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, // const ptrdiff_t neonBlockSize = 16; ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize; - ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0; + ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0; ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize; - flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack + + // Clang and Coverity are generating findings using xorBlocks as a flag. + const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); + const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); if (flags & BT_ReverseDirection) { @@ -538,7 +535,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -556,7 +553,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -608,7 +605,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -618,7 +615,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, func2(block0, block1, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = 
veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); xorBlocks += xorIncrement; @@ -640,7 +637,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, uint64x2_t block, zero = {0,0}; block = vreinterpretq_u64_u8(vld1q_u8(inBlocks)); - if (flags & BT_XorInput) + if (xorInput) block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); if (flags & BT_InBlockIsCounter) @@ -648,7 +645,7 @@ size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6, func2(block, zero, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks))); vst1q_u8(outBlocks, vreinterpretq_u8_u64(block)); @@ -726,9 +723,12 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, const ptrdiff_t xmmBlockSize = 16; ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize; - ptrdiff_t xorIncrement = xorBlocks ? xmmBlockSize : 0; + ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0; ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize; - flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack + + // Clang and Coverity are generating findings using xorBlocks as a flag. 
+ const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); + const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); if (flags & BT_ReverseDirection) { @@ -781,7 +781,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -799,7 +799,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -858,7 +858,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -868,7 +868,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, func2(block0, block1, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -911,7 +911,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, // UBsan false positive; mem_addr can be unaligned. _mm_load_sd(CONST_DOUBLE_CAST(inBlocks))); - if (flags & BT_XorInput) + if (xorInput) { block = _mm_xor_si128(block, _mm_castpd_si128( // UBsan false positive; mem_addr can be unaligned. 
@@ -923,7 +923,7 @@ inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6, func2(block, zero, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block = _mm_xor_si128(block, _mm_castpd_si128( // UBsan false positive; mem_addr can be unaligned. @@ -957,9 +957,12 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, // const ptrdiff_t xmmBlockSize = 16; ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize; - ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0; + ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0; ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize; - flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack + + // Clang and Coverity are generating findings using xorBlocks as a flag. + const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); + const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); if (flags & BT_ReverseDirection) { @@ -1003,7 +1006,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -1021,7 +1024,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -1071,7 +1074,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -1081,7 +1084,7 @@ inline size_t 
AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, func2(block0, block1, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -1103,7 +1106,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, __m128i block, zero = _mm_setzero_si128(); block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - if (flags & BT_XorInput) + if (xorInput) block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); if (flags & BT_InBlockIsCounter) @@ -1111,7 +1114,7 @@ inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6, func2(block, zero, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); _mm_storeu_si128(M128_CAST(outBlocks), block); @@ -1139,9 +1142,12 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, // const ptrdiff_t xmmBlockSize = 16; ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize; - ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0; + ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0; ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize; - flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack + + // Clang and Coverity are generating findings using xorBlocks as a flag. 
+ const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); + const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); if (flags & BT_ReverseDirection) { @@ -1179,7 +1185,7 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -1193,7 +1199,7 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, func4(block0, block1, block2, block3, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); xorBlocks += xorIncrement; @@ -1222,7 +1228,7 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, { __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks)); - if (flags & BT_XorInput) + if (xorInput) block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); if (flags & BT_InBlockIsCounter) @@ -1230,7 +1236,7 @@ inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, func1(block, subKeys, static_cast(rounds)); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks))); _mm_storeu_si128(M128_CAST(outBlocks), block); @@ -1279,9 +1285,12 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su // const ptrdiff_t vexBlockSize = 16; ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize; - ptrdiff_t xorIncrement = xorBlocks ? blockSize : 0; + ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0; ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize; - flags = XorBlocksToFlags(xorBlocks, flags); // Coverity hack + + // Clang and Coverity are generating findings using xorBlocks as a flag. 
+ const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput); + const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput); if (flags & BT_ReverseDirection) { @@ -1326,7 +1335,7 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su inBlocks += inIncrement; } - if (flags & BT_XorInput) + if (xorInput) { block0 = VectorXor(block0, VectorLoad(xorBlocks)); xorBlocks += xorIncrement; @@ -1344,7 +1353,7 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su func6(block0, block1, block2, block3, block4, block5, subKeys, rounds); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) { block0 = VectorXor(block0, VectorLoad(xorBlocks)); xorBlocks += xorIncrement; @@ -1381,7 +1390,7 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su { uint32x4_p block = VectorLoad(inBlocks); - if (flags & BT_XorInput) + if (xorInput) block = VectorXor(block, VectorLoad(xorBlocks)); if (flags & BT_InBlockIsCounter) @@ -1389,7 +1398,7 @@ size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6, const word32 *su func1(block, subKeys, rounds); - if (xorBlocks && !(flags & BT_XorInput)) + if (xorOutput) block = VectorXor(block, VectorLoad(xorBlocks)); VectorStore(block, outBlocks);