diff --git a/chacha-simd.cpp b/chacha-simd.cpp
index ecc1c1d8..b865d74d 100644
--- a/chacha-simd.cpp
+++ b/chacha-simd.cpp
@@ -118,9 +118,9 @@
 template <unsigned int R> inline __m128i RotateLeft(const __m128i val)
 {
 #ifdef __XOP__
-    return _mm_roti_epi32(val, R);
+    return _mm_roti_epi32(val, R);
 #else
-    return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
+    return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
 #endif
 }

@@ -129,10 +129,10 @@
 template <> inline __m128i RotateLeft<8>(const __m128i val)
 {
 #ifdef __XOP__
-    return _mm_roti_epi32(val, 8);
+    return _mm_roti_epi32(val, 8);
 #else
-    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
-    return _mm_shuffle_epi8(val, mask);
+    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
+    return _mm_shuffle_epi8(val, mask);
 #endif
 }

@@ -140,10 +140,10 @@
 template <> inline __m128i RotateLeft<16>(const __m128i val)
 {
 #ifdef __XOP__
-    return _mm_roti_epi32(val, 16);
+    return _mm_roti_epi32(val, 16);
 #else
-    const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
-    return _mm_shuffle_epi8(val, mask);
+    const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
+    return _mm_shuffle_epi8(val, mask);
 #endif
 }
 #endif // SSE3
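The two specializations above exist because rotating a 32-bit lane left by 8 or 16 only permutes whole bytes, so a single SSSE3 _mm_shuffle_epi8 replaces the shift-plus-OR fallback. A standalone sketch that checks the rotate-by-8 mask against a plain shift/OR rotate (not part of the patch; assumes an SSSE3-capable build, e.g. g++ -mssse3):

#include <tmmintrin.h>   // _mm_shuffle_epi8 (SSSE3)
#include <cstdint>
#include <cstdio>

// Plain rotate for comparison.
static uint32_t rotl32(uint32_t x, unsigned r) { return (x << r) | (x >> (32u - r)); }

int main()
{
    const uint32_t in[4] = { 0x01020304u, 0xDEADBEEFu, 0x00000080u, 0xFFFFFFFEu };
    __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));

    // Same mask as the RotateLeft<8> specialization above: within each 32-bit
    // lane, destination byte i takes source byte (i & ~3) | ((i + 3) & 3).
    const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    __m128i r = _mm_shuffle_epi8(v, mask);

    uint32_t out[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), r);

    for (int i = 0; i < 4; ++i)
        std::printf("%08x <<< 8 = %08x (expected %08x)\n",
                    (unsigned)in[i], (unsigned)out[i], (unsigned)rotl32(in[i], 8));
    return 0;
}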
@@ -430,264 +430,264 @@ void ChaCha_OperateKeystream_NEON(const word32 *state, const byte* input, byte *

 void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input, byte *output, unsigned int rounds, bool xorInput)
 {
-    const __m128i* state_mm = reinterpret_cast<const __m128i*>(state);
-    const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
-    __m128i* output_mm = reinterpret_cast<__m128i*>(output);
+    const __m128i* state_mm = reinterpret_cast<const __m128i*>(state);
+    const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
+    __m128i* output_mm = reinterpret_cast<__m128i*>(output);

-    const __m128i state0 = _mm_load_si128(state_mm + 0);
-    const __m128i state1 = _mm_load_si128(state_mm + 1);
-    const __m128i state2 = _mm_load_si128(state_mm + 2);
-    const __m128i state3 = _mm_load_si128(state_mm + 3);
+    const __m128i state0 = _mm_load_si128(state_mm + 0);
+    const __m128i state1 = _mm_load_si128(state_mm + 1);
+    const __m128i state2 = _mm_load_si128(state_mm + 2);
+    const __m128i state3 = _mm_load_si128(state_mm + 3);

-    __m128i r0_0 = state0;
-    __m128i r0_1 = state1;
-    __m128i r0_2 = state2;
-    __m128i r0_3 = state3;
+    __m128i r0_0 = state0;
+    __m128i r0_1 = state1;
+    __m128i r0_2 = state2;
+    __m128i r0_3 = state3;

-    __m128i r1_0 = state0;
-    __m128i r1_1 = state1;
-    __m128i r1_2 = state2;
-    __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));
+    __m128i r1_0 = state0;
+    __m128i r1_1 = state1;
+    __m128i r1_2 = state2;
+    __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));

-    __m128i r2_0 = state0;
-    __m128i r2_1 = state1;
-    __m128i r2_2 = state2;
-    __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));
+    __m128i r2_0 = state0;
+    __m128i r2_1 = state1;
+    __m128i r2_2 = state2;
+    __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));

-    __m128i r3_0 = state0;
-    __m128i r3_1 = state1;
-    __m128i r3_2 = state2;
-    __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
+    __m128i r3_0 = state0;
+    __m128i r3_1 = state1;
+    __m128i r3_2 = state2;
+    __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));

-    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
-    {
-        r0_0 = _mm_add_epi32(r0_0, r0_1);
-        r1_0 = _mm_add_epi32(r1_0, r1_1);
-        r2_0 = _mm_add_epi32(r2_0, r2_1);
-        r3_0 = _mm_add_epi32(r3_0, r3_1);
+    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
+    {
+        r0_0 = _mm_add_epi32(r0_0, r0_1);
+        r1_0 = _mm_add_epi32(r1_0, r1_1);
+        r2_0 = _mm_add_epi32(r2_0, r2_1);
+        r3_0 = _mm_add_epi32(r3_0, r3_1);

-        r0_3 = _mm_xor_si128(r0_3, r0_0);
-        r1_3 = _mm_xor_si128(r1_3, r1_0);
-        r2_3 = _mm_xor_si128(r2_3, r2_0);
-        r3_3 = _mm_xor_si128(r3_3, r3_0);
+        r0_3 = _mm_xor_si128(r0_3, r0_0);
+        r1_3 = _mm_xor_si128(r1_3, r1_0);
+        r2_3 = _mm_xor_si128(r2_3, r2_0);
+        r3_3 = _mm_xor_si128(r3_3, r3_0);

-        r0_3 = RotateLeft<16>(r0_3);
-        r1_3 = RotateLeft<16>(r1_3);
-        r2_3 = RotateLeft<16>(r2_3);
-        r3_3 = RotateLeft<16>(r3_3);
+        r0_3 = RotateLeft<16>(r0_3);
+        r1_3 = RotateLeft<16>(r1_3);
+        r2_3 = RotateLeft<16>(r2_3);
+        r3_3 = RotateLeft<16>(r3_3);

-        r0_2 = _mm_add_epi32(r0_2, r0_3);
-        r1_2 = _mm_add_epi32(r1_2, r1_3);
-        r2_2 = _mm_add_epi32(r2_2, r2_3);
-        r3_2 = _mm_add_epi32(r3_2, r3_3);
+        r0_2 = _mm_add_epi32(r0_2, r0_3);
+        r1_2 = _mm_add_epi32(r1_2, r1_3);
+        r2_2 = _mm_add_epi32(r2_2, r2_3);
+        r3_2 = _mm_add_epi32(r3_2, r3_3);

-        r0_1 = _mm_xor_si128(r0_1, r0_2);
-        r1_1 = _mm_xor_si128(r1_1, r1_2);
-        r2_1 = _mm_xor_si128(r2_1, r2_2);
-        r3_1 = _mm_xor_si128(r3_1, r3_2);
+        r0_1 = _mm_xor_si128(r0_1, r0_2);
+        r1_1 = _mm_xor_si128(r1_1, r1_2);
+        r2_1 = _mm_xor_si128(r2_1, r2_2);
+        r3_1 = _mm_xor_si128(r3_1, r3_2);

-        r0_1 = RotateLeft<12>(r0_1);
-        r1_1 = RotateLeft<12>(r1_1);
-        r2_1 = RotateLeft<12>(r2_1);
-        r3_1 = RotateLeft<12>(r3_1);
+        r0_1 = RotateLeft<12>(r0_1);
+        r1_1 = RotateLeft<12>(r1_1);
+        r2_1 = RotateLeft<12>(r2_1);
+        r3_1 = RotateLeft<12>(r3_1);

-        r0_0 = _mm_add_epi32(r0_0, r0_1);
-        r1_0 = _mm_add_epi32(r1_0, r1_1);
-        r2_0 = _mm_add_epi32(r2_0, r2_1);
-        r3_0 = _mm_add_epi32(r3_0, r3_1);
+        r0_0 = _mm_add_epi32(r0_0, r0_1);
+        r1_0 = _mm_add_epi32(r1_0, r1_1);
+        r2_0 = _mm_add_epi32(r2_0, r2_1);
+        r3_0 = _mm_add_epi32(r3_0, r3_1);

-        r0_3 = _mm_xor_si128(r0_3, r0_0);
-        r1_3 = _mm_xor_si128(r1_3, r1_0);
-        r2_3 = _mm_xor_si128(r2_3, r2_0);
-        r3_3 = _mm_xor_si128(r3_3, r3_0);
+        r0_3 = _mm_xor_si128(r0_3, r0_0);
+        r1_3 = _mm_xor_si128(r1_3, r1_0);
+        r2_3 = _mm_xor_si128(r2_3, r2_0);
+        r3_3 = _mm_xor_si128(r3_3, r3_0);

-        r0_3 = RotateLeft<8>(r0_3);
-        r1_3 = RotateLeft<8>(r1_3);
-        r2_3 = RotateLeft<8>(r2_3);
-        r3_3 = RotateLeft<8>(r3_3);
+        r0_3 = RotateLeft<8>(r0_3);
+        r1_3 = RotateLeft<8>(r1_3);
+        r2_3 = RotateLeft<8>(r2_3);
+        r3_3 = RotateLeft<8>(r3_3);

-        r0_2 = _mm_add_epi32(r0_2, r0_3);
-        r1_2 = _mm_add_epi32(r1_2, r1_3);
-        r2_2 = _mm_add_epi32(r2_2, r2_3);
-        r3_2 = _mm_add_epi32(r3_2, r3_3);
+        r0_2 = _mm_add_epi32(r0_2, r0_3);
+        r1_2 = _mm_add_epi32(r1_2, r1_3);
+        r2_2 = _mm_add_epi32(r2_2, r2_3);
+        r3_2 = _mm_add_epi32(r3_2, r3_3);

-        r0_1 = _mm_xor_si128(r0_1, r0_2);
-        r1_1 = _mm_xor_si128(r1_1, r1_2);
-        r2_1 = _mm_xor_si128(r2_1, r2_2);
-        r3_1 = _mm_xor_si128(r3_1, r3_2);
+        r0_1 = _mm_xor_si128(r0_1, r0_2);
+        r1_1 = _mm_xor_si128(r1_1, r1_2);
+        r2_1 = _mm_xor_si128(r2_1, r2_2);
+        r3_1 = _mm_xor_si128(r3_1, r3_2);

-        r0_1 = RotateLeft<7>(r0_1);
-        r1_1 = RotateLeft<7>(r1_1);
-        r2_1 = RotateLeft<7>(r2_1);
-        r3_1 = RotateLeft<7>(r3_1);
+        r0_1 = RotateLeft<7>(r0_1);
+        r1_1 = RotateLeft<7>(r1_1);
+        r2_1 = RotateLeft<7>(r2_1);
+        r3_1 = RotateLeft<7>(r3_1);

-        r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
-        r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
-        r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
+        r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
+        r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
+        r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));

-        r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
-        r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
-        r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
+        r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
+        r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
+        r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));

-        r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
-        r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
-        r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
+        r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
+        r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
+        r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));

-        r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
-        r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
-        r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
+        r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
+        r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
+        r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));

-        r0_0 = _mm_add_epi32(r0_0, r0_1);
-        r1_0 = _mm_add_epi32(r1_0, r1_1);
-        r2_0 = _mm_add_epi32(r2_0, r2_1);
-        r3_0 = _mm_add_epi32(r3_0, r3_1);
+        r0_0 = _mm_add_epi32(r0_0, r0_1);
+        r1_0 = _mm_add_epi32(r1_0, r1_1);
+        r2_0 = _mm_add_epi32(r2_0, r2_1);
+        r3_0 = _mm_add_epi32(r3_0, r3_1);

-        r0_3 = _mm_xor_si128(r0_3, r0_0);
-        r1_3 = _mm_xor_si128(r1_3, r1_0);
-        r2_3 = _mm_xor_si128(r2_3, r2_0);
-        r3_3 = _mm_xor_si128(r3_3, r3_0);
+        r0_3 = _mm_xor_si128(r0_3, r0_0);
+        r1_3 = _mm_xor_si128(r1_3, r1_0);
+        r2_3 = _mm_xor_si128(r2_3, r2_0);
+        r3_3 = _mm_xor_si128(r3_3, r3_0);

-        r0_3 = RotateLeft<16>(r0_3);
-        r1_3 = RotateLeft<16>(r1_3);
-        r2_3 = RotateLeft<16>(r2_3);
-        r3_3 = RotateLeft<16>(r3_3);
+        r0_3 = RotateLeft<16>(r0_3);
+        r1_3 = RotateLeft<16>(r1_3);
+        r2_3 = RotateLeft<16>(r2_3);
+        r3_3 = RotateLeft<16>(r3_3);

-        r0_2 = _mm_add_epi32(r0_2, r0_3);
-        r1_2 = _mm_add_epi32(r1_2, r1_3);
-        r2_2 = _mm_add_epi32(r2_2, r2_3);
-        r3_2 = _mm_add_epi32(r3_2, r3_3);
+        r0_2 = _mm_add_epi32(r0_2, r0_3);
+        r1_2 = _mm_add_epi32(r1_2, r1_3);
+        r2_2 = _mm_add_epi32(r2_2, r2_3);
+        r3_2 = _mm_add_epi32(r3_2, r3_3);

-        r0_1 = _mm_xor_si128(r0_1, r0_2);
-        r1_1 = _mm_xor_si128(r1_1, r1_2);
-        r2_1 = _mm_xor_si128(r2_1, r2_2);
-        r3_1 = _mm_xor_si128(r3_1, r3_2);
+        r0_1 = _mm_xor_si128(r0_1, r0_2);
+        r1_1 = _mm_xor_si128(r1_1, r1_2);
+        r2_1 = _mm_xor_si128(r2_1, r2_2);
+        r3_1 = _mm_xor_si128(r3_1, r3_2);

-        r0_1 = RotateLeft<12>(r0_1);
-        r1_1 = RotateLeft<12>(r1_1);
-        r2_1 = RotateLeft<12>(r2_1);
-        r3_1 = RotateLeft<12>(r3_1);
+        r0_1 = RotateLeft<12>(r0_1);
+        r1_1 = RotateLeft<12>(r1_1);
+        r2_1 = RotateLeft<12>(r2_1);
+        r3_1 = RotateLeft<12>(r3_1);

-        r0_0 = _mm_add_epi32(r0_0, r0_1);
-        r1_0 = _mm_add_epi32(r1_0, r1_1);
-        r2_0 = _mm_add_epi32(r2_0, r2_1);
-        r3_0 = _mm_add_epi32(r3_0, r3_1);
+        r0_0 = _mm_add_epi32(r0_0, r0_1);
+        r1_0 = _mm_add_epi32(r1_0, r1_1);
+        r2_0 = _mm_add_epi32(r2_0, r2_1);
+        r3_0 = _mm_add_epi32(r3_0, r3_1);

-        r0_3 = _mm_xor_si128(r0_3, r0_0);
-        r1_3 = _mm_xor_si128(r1_3, r1_0);
-        r2_3 = _mm_xor_si128(r2_3, r2_0);
-        r3_3 = _mm_xor_si128(r3_3, r3_0);
+        r0_3 = _mm_xor_si128(r0_3, r0_0);
+        r1_3 = _mm_xor_si128(r1_3, r1_0);
+        r2_3 = _mm_xor_si128(r2_3, r2_0);
+        r3_3 = _mm_xor_si128(r3_3, r3_0);

-        r0_3 = RotateLeft<8>(r0_3);
-        r1_3 = RotateLeft<8>(r1_3);
-        r2_3 = RotateLeft<8>(r2_3);
-        r3_3 = RotateLeft<8>(r3_3);
+        r0_3 = RotateLeft<8>(r0_3);
+        r1_3 = RotateLeft<8>(r1_3);
+        r2_3 = RotateLeft<8>(r2_3);
+        r3_3 = RotateLeft<8>(r3_3);

-        r0_2 = _mm_add_epi32(r0_2, r0_3);
-        r1_2 = _mm_add_epi32(r1_2, r1_3);
-        r2_2 = _mm_add_epi32(r2_2, r2_3);
-        r3_2 = _mm_add_epi32(r3_2, r3_3);
+        r0_2 = _mm_add_epi32(r0_2, r0_3);
+        r1_2 = _mm_add_epi32(r1_2, r1_3);
+        r2_2 = _mm_add_epi32(r2_2, r2_3);
+        r3_2 = _mm_add_epi32(r3_2, r3_3);

-        r0_1 = _mm_xor_si128(r0_1, r0_2);
-        r1_1 = _mm_xor_si128(r1_1, r1_2);
-        r2_1 = _mm_xor_si128(r2_1, r2_2);
-        r3_1 = _mm_xor_si128(r3_1, r3_2);
+        r0_1 = _mm_xor_si128(r0_1, r0_2);
+        r1_1 = _mm_xor_si128(r1_1, r1_2);
+        r2_1 = _mm_xor_si128(r2_1, r2_2);
+        r3_1 = _mm_xor_si128(r3_1, r3_2);

-        r0_1 = RotateLeft<7>(r0_1);
-        r1_1 = RotateLeft<7>(r1_1);
-        r2_1 = RotateLeft<7>(r2_1);
-        r3_1 = RotateLeft<7>(r3_1);
+        r0_1 = RotateLeft<7>(r0_1);
+        r1_1 = RotateLeft<7>(r1_1);
+        r2_1 = RotateLeft<7>(r2_1);
+        r3_1 = RotateLeft<7>(r3_1);

-        r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
-        r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
-        r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
+        r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
+        r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
+        r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));

-        r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
-        r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
-        r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
+        r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
+        r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
+        r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));

-        r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
-        r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
-        r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
+        r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
+        r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
+        r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));

-        r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
-        r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
-        r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
-    }
+        r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
+        r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
+        r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
+    }

-    r0_0 = _mm_add_epi32(r0_0, state0);
-    r0_1 = _mm_add_epi32(r0_1, state1);
-    r0_2 = _mm_add_epi32(r0_2, state2);
-    r0_3 = _mm_add_epi32(r0_3, state3);
+    r0_0 = _mm_add_epi32(r0_0, state0);
+    r0_1 = _mm_add_epi32(r0_1, state1);
+    r0_2 = _mm_add_epi32(r0_2, state2);
+    r0_3 = _mm_add_epi32(r0_3, state3);

-    r1_0 = _mm_add_epi32(r1_0, state0);
-    r1_1 = _mm_add_epi32(r1_1, state1);
-    r1_2 = _mm_add_epi32(r1_2, state2);
-    r1_3 = _mm_add_epi32(r1_3, state3);
-    r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));
+    r1_0 = _mm_add_epi32(r1_0, state0);
+    r1_1 = _mm_add_epi32(r1_1, state1);
+    r1_2 = _mm_add_epi32(r1_2, state2);
+    r1_3 = _mm_add_epi32(r1_3, state3);
+    r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));

-    r2_0 = _mm_add_epi32(r2_0, state0);
-    r2_1 = _mm_add_epi32(r2_1, state1);
-    r2_2 = _mm_add_epi32(r2_2, state2);
-    r2_3 = _mm_add_epi32(r2_3, state3);
-    r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));
+    r2_0 = _mm_add_epi32(r2_0, state0);
+    r2_1 = _mm_add_epi32(r2_1, state1);
+    r2_2 = _mm_add_epi32(r2_2, state2);
+    r2_3 = _mm_add_epi32(r2_3, state3);
+    r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));

-    r3_0 = _mm_add_epi32(r3_0, state0);
-    r3_1 = _mm_add_epi32(r3_1, state1);
-    r3_2 = _mm_add_epi32(r3_2, state2);
-    r3_3 = _mm_add_epi32(r3_3, state3);
-    r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
+    r3_0 = _mm_add_epi32(r3_0, state0);
+    r3_1 = _mm_add_epi32(r3_1, state1);
+    r3_2 = _mm_add_epi32(r3_2, state2);
+    r3_3 = _mm_add_epi32(r3_3, state3);
+    r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));

-    if (xorInput)
-    {
-        r0_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 0), r0_0);
-        r0_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 1), r0_1);
-        r0_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 2), r0_2);
-        r0_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 3), r0_3);
-    }
+    if (xorInput)
+    {
+        r0_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 0), r0_0);
+        r0_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 1), r0_1);
+        r0_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 2), r0_2);
+        r0_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 3), r0_3);
+    }

-    _mm_storeu_si128(output_mm + 0, r0_0);
-    _mm_storeu_si128(output_mm + 1, r0_1);
-    _mm_storeu_si128(output_mm + 2, r0_2);
-    _mm_storeu_si128(output_mm + 3, r0_3);
+    _mm_storeu_si128(output_mm + 0, r0_0);
+    _mm_storeu_si128(output_mm + 1, r0_1);
+    _mm_storeu_si128(output_mm + 2, r0_2);
+    _mm_storeu_si128(output_mm + 3, r0_3);

-    if (xorInput)
-    {
-        r1_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 4), r1_0);
-        r1_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 5), r1_1);
-        r1_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 6), r1_2);
-        r1_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 7), r1_3);
-    }
+    if (xorInput)
+    {
+        r1_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 4), r1_0);
+        r1_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 5), r1_1);
+        r1_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 6), r1_2);
+        r1_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 7), r1_3);
+    }

-    _mm_storeu_si128(output_mm + 4, r1_0);
-    _mm_storeu_si128(output_mm + 5, r1_1);
-    _mm_storeu_si128(output_mm + 6, r1_2);
-    _mm_storeu_si128(output_mm + 7, r1_3);
+    _mm_storeu_si128(output_mm + 4, r1_0);
+    _mm_storeu_si128(output_mm + 5, r1_1);
+    _mm_storeu_si128(output_mm + 6, r1_2);
+    _mm_storeu_si128(output_mm + 7, r1_3);

-    if (xorInput)
-    {
-        r2_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 8), r2_0);
-        r2_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 9), r2_1);
-        r2_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 10), r2_2);
-        r2_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 11), r2_3);
-    }
+    if (xorInput)
+    {
+        r2_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 8), r2_0);
+        r2_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 9), r2_1);
+        r2_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 10), r2_2);
+        r2_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 11), r2_3);
+    }

-    _mm_storeu_si128(output_mm + 8, r2_0);
-    _mm_storeu_si128(output_mm + 9, r2_1);
-    _mm_storeu_si128(output_mm + 10, r2_2);
-    _mm_storeu_si128(output_mm + 11, r2_3);
+    _mm_storeu_si128(output_mm + 8, r2_0);
+    _mm_storeu_si128(output_mm + 9, r2_1);
+    _mm_storeu_si128(output_mm + 10, r2_2);
+    _mm_storeu_si128(output_mm + 11, r2_3);

-    if (xorInput)
-    {
-        r3_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 12), r3_0);
-        r3_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 13), r3_1);
-        r3_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 14), r3_2);
-        r3_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 15), r3_3);
-    }
+    if (xorInput)
+    {
+        r3_0 = _mm_xor_si128(_mm_loadu_si128(input_mm + 12), r3_0);
+        r3_1 = _mm_xor_si128(_mm_loadu_si128(input_mm + 13), r3_1);
+        r3_2 = _mm_xor_si128(_mm_loadu_si128(input_mm + 14), r3_2);
+        r3_3 = _mm_xor_si128(_mm_loadu_si128(input_mm + 15), r3_3);
+    }

-    _mm_storeu_si128(output_mm + 12, r3_0);
-    _mm_storeu_si128(output_mm + 13, r3_1);
-    _mm_storeu_si128(output_mm + 14, r3_2);
-    _mm_storeu_si128(output_mm + 15, r3_3);
+    _mm_storeu_si128(output_mm + 12, r3_0);
+    _mm_storeu_si128(output_mm + 13, r3_1);
+    _mm_storeu_si128(output_mm + 14, r3_2);
+    _mm_storeu_si128(output_mm + 15, r3_3);
 }
 #endif // CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE
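A note on the counter handling in the SSE2 block above: words 12 and 13 of the ChaCha state form the 64-bit block counter and sit in the low 64-bit lane of the fourth row, so _mm_add_epi64(row3, _mm_set_epi32(0, 0, 0, N)) advances the counter for block N with the 32-bit carry handled for free. A standalone sketch of that equivalence (not part of the patch):

#include <emmintrin.h>   // SSE2
#include <cstdint>
#include <cstdio>

int main()
{
    // Fourth row of a ChaCha state: { counter_lo, counter_hi, nonce_lo, nonce_hi }.
    uint32_t row3[4] = { 0xFFFFFFFFu, 0x00000010u, 0x11223344u, 0x55667788u };

    // SIMD path: one 64-bit add on the low lane, the same idiom as the patch.
    __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(row3));
    v = _mm_add_epi64(v, _mm_set_epi32(0, 0, 0, 1));
    uint32_t simd[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(simd), v);

    // Scalar path: the carry idiom used per block in chacha.cpp.
    uint32_t w12 = row3[0], w13 = row3[1];
    if (++w12 == 0)
        ++w13;

    std::printf("simd:   w12=%08x w13=%08x\n", (unsigned)simd[0], (unsigned)simd[1]);
    std::printf("scalar: w12=%08x w13=%08x\n", (unsigned)w12, (unsigned)w13);
    return 0;
}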
diff --git a/chacha.cpp b/chacha.cpp
index aa2e9268..9a5aedfc 100644
--- a/chacha.cpp
+++ b/chacha.cpp
@@ -28,84 +28,84 @@ extern void ChaCha_OperateKeystream_SSE2(const word32 *state, const byte* input,

 #if defined(CRYPTOPP_DEBUG) && !defined(CRYPTOPP_DOXYGEN_PROCESSING)
 void ChaCha_TestInstantiations()
 {
-    ChaCha::Encryption x;
+    ChaCha::Encryption x;
 }
 #endif

 std::string ChaCha_Policy::AlgorithmProvider() const
 {
 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
-    if (HasSSE2())
-        return "SSE2";
+    if (HasSSE2())
+        return "SSE2";
 #endif
 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
-    if (HasNEON())
-        return "NEON";
+    if (HasNEON())
+        return "NEON";
 #endif
-    return "C++";
+    return "C++";
 }

 void ChaCha_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
 {
-    CRYPTOPP_UNUSED(params);
-    CRYPTOPP_ASSERT(length == 16 || length == 32);
+    CRYPTOPP_UNUSED(params);
+    CRYPTOPP_ASSERT(length == 16 || length == 32);

-    m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
-    if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
-        throw InvalidRounds(ChaCha::StaticAlgorithmName(), m_rounds);
+    m_rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
+    if (!(m_rounds == 8 || m_rounds == 12 || m_rounds == 20))
+        throw InvalidRounds(ChaCha::StaticAlgorithmName(), m_rounds);

-    // "expand 16-byte k" or "expand 32-byte k"
-    m_state[0] = 0x61707865;
-    m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
-    m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
-    m_state[3] = 0x6b206574;
+    // "expand 16-byte k" or "expand 32-byte k"
+    m_state[0] = 0x61707865;
+    m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
+    m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
+    m_state[3] = 0x6b206574;

-    GetBlock<word32, LittleEndian> get1(key);
-    get1(m_state[4])(m_state[5])(m_state[6])(m_state[7]);
+    GetBlock<word32, LittleEndian> get1(key);
+    get1(m_state[4])(m_state[5])(m_state[6])(m_state[7]);

-    GetBlock<word32, LittleEndian> get2(key + ((length == 32) ? 16 : 0));
-    get2(m_state[8])(m_state[9])(m_state[10])(m_state[11]);
+    GetBlock<word32, LittleEndian> get2(key + ((length == 32) ? 16 : 0));
+    get2(m_state[8])(m_state[9])(m_state[10])(m_state[11]);
 }

 void ChaCha_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length)
 {
-    CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
-    CRYPTOPP_ASSERT(length==8);
+    CRYPTOPP_UNUSED(keystreamBuffer), CRYPTOPP_UNUSED(length);
+    CRYPTOPP_ASSERT(length==8);

-    GetBlock<word32, LittleEndian> get(IV);
-    m_state[12] = m_state[13] = 0;
-    get(m_state[14])(m_state[15]);
+    GetBlock<word32, LittleEndian> get(IV);
+    m_state[12] = m_state[13] = 0;
+    get(m_state[14])(m_state[15]);
 }

 void ChaCha_Policy::SeekToIteration(lword iterationCount)
 {
-    m_state[13] = (word32)iterationCount;
-    m_state[12] = (word32)SafeRightShift<32>(iterationCount);
+    m_state[13] = (word32)iterationCount;
+    m_state[12] = (word32)SafeRightShift<32>(iterationCount);
 }

 unsigned int ChaCha_Policy::GetAlignment() const
 {
 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
-    if (HasSSE2())
-        return 16;
-    else
+    if (HasSSE2())
+        return 16;
+    else
 #endif
-        return GetAlignmentOf<word32>();
+        return GetAlignmentOf<word32>();
 }

 unsigned int ChaCha_Policy::GetOptimalBlockSize() const
 {
 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
-    if (HasSSE2())
-        return 4*BYTES_PER_ITERATION;
-    else
+    if (HasSSE2())
+        return 4*BYTES_PER_ITERATION;
+    else
 #endif
 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
-    if (HasNEON())
-        return 4*BYTES_PER_ITERATION;
-    else
+    if (HasNEON())
+        return 4*BYTES_PER_ITERATION;
+    else
 #endif
-    return BYTES_PER_ITERATION;
+    return BYTES_PER_ITERATION;
 }

 // OperateKeystream always produces a key stream. The key stream is written
@@ -115,91 +115,91 @@ void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, byte *output,
     const byte *input, size_t iterationCount)
 {
 #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE)
-    if (HasSSE2())
-    {
-        while (iterationCount >= 4)
-        {
-            bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
-            ChaCha_OperateKeystream_SSE2(m_state, input, output, m_rounds, xorInput);
+    if (HasSSE2())
+    {
+        while (iterationCount >= 4)
+        {
+            bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
+            ChaCha_OperateKeystream_SSE2(m_state, input, output, m_rounds, xorInput);

-            m_state[12] += 4;
-            if (m_state[12] < 4)
-                m_state[13]++;
+            m_state[12] += 4;
+            if (m_state[12] < 4)
+                m_state[13]++;

-            input += (!!xorInput)*4*BYTES_PER_ITERATION;
-            output += 4*BYTES_PER_ITERATION;
-            iterationCount -= 4;
-        }
-    }
+            input += (!!xorInput)*4*BYTES_PER_ITERATION;
+            output += 4*BYTES_PER_ITERATION;
+            iterationCount -= 4;
+        }
+    }
 #endif

 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
-    if (HasNEON())
-    {
-        while (iterationCount >= 4)
-        {
-            bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
-            ChaCha_OperateKeystream_NEON(m_state, input, output, m_rounds, xorInput);
+    if (HasNEON())
+    {
+        while (iterationCount >= 4)
+        {
+            bool xorInput = (operation & INPUT_NULL) != INPUT_NULL;
+            ChaCha_OperateKeystream_NEON(m_state, input, output, m_rounds, xorInput);

-            m_state[12] += 4;
-            if (m_state[12] < 4)
-                m_state[13]++;
+            m_state[12] += 4;
+            if (m_state[12] < 4)
+                m_state[13]++;

-            input += (!!xorInput)*4*BYTES_PER_ITERATION;
-            output += 4*BYTES_PER_ITERATION;
-            iterationCount -= 4;
-        }
-    }
+            input += (!!xorInput)*4*BYTES_PER_ITERATION;
+            output += 4*BYTES_PER_ITERATION;
+            iterationCount -= 4;
+        }
+    }
 #endif

-    while (iterationCount--)
-    {
-        word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+    while (iterationCount--)
+    {
+        word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;

-        x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
-        x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
-        x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
-        x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];
+        x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3];
+        x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7];
+        x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11];
+        x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15];

-        for (int i = static_cast<int>(m_rounds); i > 0; i -= 2)
-        {
-            CHACHA_QUARTER_ROUND(x0, x4, x8, x12);
-            CHACHA_QUARTER_ROUND(x1, x5, x9, x13);
-            CHACHA_QUARTER_ROUND(x2, x6, x10, x14);
-            CHACHA_QUARTER_ROUND(x3, x7, x11, x15);
+        for (int i = static_cast<int>(m_rounds); i > 0; i -= 2)
+        {
+            CHACHA_QUARTER_ROUND(x0, x4, x8, x12);
+            CHACHA_QUARTER_ROUND(x1, x5, x9, x13);
+            CHACHA_QUARTER_ROUND(x2, x6, x10, x14);
+            CHACHA_QUARTER_ROUND(x3, x7, x11, x15);

-            CHACHA_QUARTER_ROUND(x0, x5, x10, x15);
-            CHACHA_QUARTER_ROUND(x1, x6, x11, x12);
-            CHACHA_QUARTER_ROUND(x2, x7, x8, x13);
-            CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
-        }
+            CHACHA_QUARTER_ROUND(x0, x5, x10, x15);
+            CHACHA_QUARTER_ROUND(x1, x6, x11, x12);
+            CHACHA_QUARTER_ROUND(x2, x7, x8, x13);
+            CHACHA_QUARTER_ROUND(x3, x4, x9, x14);
+        }
 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
-        #define CHACHA_OUTPUT(x){\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
-            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}
+        #define CHACHA_OUTPUT(x){\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\
+            CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);}

-        CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
-        #undef CHACHA_OUTPUT
+        CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION);
+        #undef CHACHA_OUTPUT
 #endif
-        if (++m_state[12] == 0)
-            m_state[13]++;
-    }
+        if (++m_state[12] == 0)
+            m_state[13]++;
+    }
 }

 NAMESPACE_END
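CHACHA_QUARTER_ROUND is defined elsewhere in the library, but the column/diagonal schedule above and the rotation amounts used in the SSE2 path (16, 12, 8, 7) pin down what it does. A reviewer may also want to double-check that SeekToIteration() stores the low word of iterationCount in m_state[13] while the carry logic in OperateKeystream() treats m_state[12] as the low counter word. For reference, a plain C++ sketch of the quarter round and the two-rounds-per-iteration loop (a reference rendering, not the library macro):

#include <cstdint>
#include <cstdio>

static inline uint32_t rotl32(uint32_t v, int c) { return (v << c) | (v >> (32 - c)); }

// One ChaCha quarter round with the 16/12/8/7 rotation schedule.
static void quarter_round(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d)
{
    a += b; d ^= a; d = rotl32(d, 16);
    c += d; b ^= c; b = rotl32(b, 12);
    a += b; d ^= a; d = rotl32(d, 8);
    c += d; b ^= c; b = rotl32(b, 7);
}

int main()
{
    uint32_t x[16] = {0};                // stand-in for the working copy of m_state
    for (int i = 20; i > 0; i -= 2)      // same 2-rounds-per-iteration structure
    {
        quarter_round(x[0], x[4], x[8],  x[12]);   // column rounds
        quarter_round(x[1], x[5], x[9],  x[13]);
        quarter_round(x[2], x[6], x[10], x[14]);
        quarter_round(x[3], x[7], x[11], x[15]);
        quarter_round(x[0], x[5], x[10], x[15]);   // diagonal rounds
        quarter_round(x[1], x[6], x[11], x[12]);
        quarter_round(x[2], x[7], x[8],  x[13]);
        quarter_round(x[3], x[4], x[9],  x[14]);
    }
    for (int i = 0; i < 16; ++i)
        std::printf("%08x%c", (unsigned)x[i], (i % 4 == 3) ? '\n' : ' ');
    return 0;
}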
diff --git a/chacha.h b/chacha.h
index d66127a8..e7ec2cb2 100644
--- a/chacha.h
+++ b/chacha.h
@@ -22,9 +22,9 @@ NAMESPACE_BEGIN(CryptoPP)
 /// \since Crypto++ 5.6.4
 struct ChaCha_Info : public VariableKeyLength<32, 16, 32, 16, SimpleKeyingInterface::UNIQUE_IV, 8>
 {
-    static const char* StaticAlgorithmName() {
-        return "ChaCha";
-    }
+    static const char* StaticAlgorithmName() {
+        return "ChaCha";
+    }
 };

 /// \brief ChaCha stream cipher implementation
@@ -32,18 +32,18 @@ struct ChaCha_Info : public VariableKeyLength<32, 16, 32, 16, SimpleKeyingInterf
 class CRYPTOPP_NO_VTABLE ChaCha_Policy : public AdditiveCipherConcretePolicy<word32, 16>
 {
 protected:
-    void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length);
-    void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount);
-    void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length);
-    bool CipherIsRandomAccess() const {return true;}
-    void SeekToIteration(lword iterationCount);
-    unsigned int GetAlignment() const;
-    unsigned int GetOptimalBlockSize() const;
+    void CipherSetKey(const NameValuePairs &params, const byte *key, size_t length);
+    void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount);
+    void CipherResynchronize(byte *keystreamBuffer, const byte *IV, size_t length);
+    bool CipherIsRandomAccess() const {return true;}
+    void SeekToIteration(lword iterationCount);
+    unsigned int GetAlignment() const;
+    unsigned int GetOptimalBlockSize() const;

-    std::string AlgorithmProvider() const;
+    std::string AlgorithmProvider() const;

-    FixedSizeAlignedSecBlock<word32, 16> m_state;
-    int m_rounds;
+    FixedSizeAlignedSecBlock<word32, 16> m_state;
+    int m_rounds;
 };

 /// \brief ChaCha stream cipher
@@ -56,8 +56,8 @@
 /// \since Crypto++ 5.6.4
 struct ChaCha : public ChaCha_Info, public SymmetricCipherDocumentation
 {
-    typedef SymmetricCipherFinal<ConcretePolicyHolder<ChaCha_Policy, AdditiveCipherTemplate<> >, ChaCha_Info > Encryption;
-    typedef Encryption Decryption;
+    typedef SymmetricCipherFinal<ConcretePolicyHolder<ChaCha_Policy, AdditiveCipherTemplate<> >, ChaCha_Info > Encryption;
+    typedef Encryption Decryption;
 };

 NAMESPACE_END
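For completeness, a usage sketch against the interface declared in chacha.h (not part of the patch; the key and IV are placeholder values). ChaCha_Policy also honors a Name::Rounds() value of 8, 12, or 20 passed through the keying parameters and throws InvalidRounds otherwise.

#include "chacha.h"
#include "filters.h"
#include <iostream>
#include <string>

int main()
{
    CryptoPP::byte key[32] = {0};   // 16- or 32-byte key per ChaCha_Info
    CryptoPP::byte iv[8] = {0};     // 8-byte IV (UNIQUE_IV)

    CryptoPP::ChaCha::Encryption enc;
    enc.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv));

    std::string plain = "Attack at dawn", cipher, recovered;
    CryptoPP::StringSource(plain, true,
        new CryptoPP::StreamTransformationFilter(enc, new CryptoPP::StringSink(cipher)));

    // Decryption is the same transform; ChaCha::Decryption is a typedef for Encryption.
    CryptoPP::ChaCha::Decryption dec;
    dec.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv));
    CryptoPP::StringSource(cipher, true,
        new CryptoPP::StreamTransformationFilter(dec, new CryptoPP::StringSink(recovered)));

    std::cout << recovered << std::endl;
    return 0;
}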