From 6a11f00768c969a10c0d0f2048da3f69a18c728c Mon Sep 17 00:00:00 2001 From: Jeffrey Walton <noloader@gmail.com> Date: Sat, 8 Jun 2019 12:59:14 -0400 Subject: [PATCH] Clear lgtm findings --- chacha_simd.cpp | 85 ++++++++++++++++++++++++------------------------- sse_simd.h | 45 +++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 45 deletions(-) diff --git a/chacha_simd.cpp b/chacha_simd.cpp index 9fd6b0f1..e225579d 100644 --- a/chacha_simd.cpp +++ b/chacha_simd.cpp @@ -38,6 +38,7 @@ #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) # include <xmmintrin.h> # include <emmintrin.h> +# include "sse_simd.h" #endif #if defined(__SSSE3__) @@ -565,14 +566,10 @@ void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte * void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds) { - const __m128i* state_mm = reinterpret_cast<const __m128i*>(state); - const __m128i* input_mm = reinterpret_cast<const __m128i*>(input); - __m128i* output_mm = reinterpret_cast<__m128i*>(output); - - const __m128i state0 = _mm_load_si128(state_mm + 0); - const __m128i state1 = _mm_load_si128(state_mm + 1); - const __m128i state2 = _mm_load_si128(state_mm + 2); - const __m128i state3 = _mm_load_si128(state_mm + 3); + const __m128i state0 = load_m128i<0>(state); + const __m128i state1 = load_m128i<1>(state); + const __m128i state2 = load_m128i<2>(state); + const __m128i state3 = load_m128i<3>(state); __m128i r0_0 = state0; __m128i r0_1 = state1; @@ -772,57 +769,57 @@ void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte * r3_3 = _mm_add_epi32(r3_3, state3); r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); - if (input_mm) + if (input) { - r0_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 0), r0_0); - r0_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 1), r0_1); - r0_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 2), r0_2); - r0_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 3), r0_3); + r0_0 = _mm_xor_si128(load_m128i<0>(input), r0_0); + r0_1 = _mm_xor_si128(load_m128i<1>(input), r0_1); + 
r0_2 = _mm_xor_si128(load_m128i<2>(input), r0_2); + r0_3 = _mm_xor_si128(load_m128i<3>(input), r0_3); } - _mm_storeu_si128(output_mm + 0, r0_0); - _mm_storeu_si128(output_mm + 1, r0_1); - _mm_storeu_si128(output_mm + 2, r0_2); - _mm_storeu_si128(output_mm + 3, r0_3); + store_m128i<0>(output, r0_0); + store_m128i<1>(output, r0_1); + store_m128i<2>(output, r0_2); + store_m128i<3>(output, r0_3); - if (input_mm) + if (input) { - r1_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 4), r1_0); - r1_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 5), r1_1); - r1_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 6), r1_2); - r1_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 7), r1_3); + r1_0 = _mm_xor_si128(load_m128i<4>(input), r1_0); + r1_1 = _mm_xor_si128(load_m128i<5>(input), r1_1); + r1_2 = _mm_xor_si128(load_m128i<6>(input), r1_2); + r1_3 = _mm_xor_si128(load_m128i<7>(input), r1_3); } - _mm_storeu_si128(output_mm + 4, r1_0); - _mm_storeu_si128(output_mm + 5, r1_1); - _mm_storeu_si128(output_mm + 6, r1_2); - _mm_storeu_si128(output_mm + 7, r1_3); + store_m128i<4>(output, r1_0); + store_m128i<5>(output, r1_1); + store_m128i<6>(output, r1_2); + store_m128i<7>(output, r1_3); - if (input_mm) + if (input) { - r2_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 8), r2_0); - r2_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 9), r2_1); - r2_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 10), r2_2); - r2_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 11), r2_3); + r2_0 = _mm_xor_si128(load_m128i< 8>(input), r2_0); + r2_1 = _mm_xor_si128(load_m128i< 9>(input), r2_1); + r2_2 = _mm_xor_si128(load_m128i<10>(input), r2_2); + r2_3 = _mm_xor_si128(load_m128i<11>(input), r2_3); } - _mm_storeu_si128(output_mm + 8, r2_0); - _mm_storeu_si128(output_mm + 9, r2_1); - _mm_storeu_si128(output_mm + 10, r2_2); - _mm_storeu_si128(output_mm + 11, r2_3); + store_m128i< 8>(output, r2_0); + store_m128i< 9>(output, r2_1); + store_m128i<10>(output, r2_2); + store_m128i<11>(output, r2_3); - if (input_mm) + if (input) { - r3_0 
= _mm_xor_si128(_mm_loadu_si128(input_mm + 12), r3_0); - r3_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 13), r3_1); - r3_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 14), r3_2); - r3_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 15), r3_3); + r3_0 = _mm_xor_si128(load_m128i<12>(input), r3_0); + r3_1 = _mm_xor_si128(load_m128i<13>(input), r3_1); + r3_2 = _mm_xor_si128(load_m128i<14>(input), r3_2); + r3_3 = _mm_xor_si128(load_m128i<15>(input), r3_3); } - _mm_storeu_si128(output_mm + 12, r3_0); - _mm_storeu_si128(output_mm + 13, r3_1); - _mm_storeu_si128(output_mm + 14, r3_2); - _mm_storeu_si128(output_mm + 15, r3_3); + store_m128i<12>(output, r3_0); + store_m128i<13>(output, r3_1); + store_m128i<14>(output, r3_2); + store_m128i<15>(output, r3_3); } #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE diff --git a/sse_simd.h b/sse_simd.h index c77cbd82..0effad47 100644 --- a/sse_simd.h +++ b/sse_simd.h @@ -20,6 +20,27 @@ NAMESPACE_BEGIN(CryptoPP) #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) +template <class T> +inline __m128i load_m128i(T* ptr) +{ + return _mm_loadu_si128( + reinterpret_cast<__m128i*>(ptr)); +} + +template <class T> +inline __m128i load_m128i(const T* ptr) +{ + return _mm_loadu_si128( + reinterpret_cast<const __m128i*>(ptr)); +} + +template <class T> +inline void store_m128i(T* ptr, __m128i val) +{ + return _mm_storeu_si128( + reinterpret_cast<__m128i*>(ptr), val); +} + // N specifies the nth 128-bit element template <unsigned int N, class T> inline __m128i load_m128i(T* ptr) @@ -46,10 +67,32 @@ inline void store_m128i(T* ptr, __m128i val) return _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr+SCALE*N), val); } + #endif #if (CRYPTOPP_AVX2_AVAILABLE) +template <class T> +inline __m256i load_m256i(T* ptr) +{ + return _mm256_loadu_si256( + reinterpret_cast<__m256i*>(ptr)); +} + +template <class T> +inline __m256i load_m256i(const T* ptr) +{ + return _mm256_loadu_si256( + reinterpret_cast<const __m256i*>(ptr)); +} + +template <class T> +inline void store_m256i(T* ptr, __m256i val) +{ + return _mm256_storeu_si256( + reinterpret_cast<__m256i*>(ptr), val); +} + // N specifies the nth 256-bit 
element template <unsigned int N, class T> inline __m256i load_m256i(T* ptr) @@ -81,4 +124,4 @@ inline void store_m256i(T* ptr, __m256i val) NAMESPACE_END -#endif // CRYPTOPP_SSE_CRYPTO_H \ No newline at end of file +#endif // CRYPTOPP_SSE_CRYPTO_H