Update comments

pull/548/head
Jeffrey Walton 2017-11-22 17:35:59 -05:00
parent f2bc3cd0ca
commit f5784c1634
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
1 changed file with 20 additions and 14 deletions

View File

@ -65,7 +65,8 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
// Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a // Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a
// __m128i. We can't SSE over them, so we rearrange the data to allow packed operations. // __m128i. We can't SSE over them, so we rearrange the data to allow packed operations.
// It's also easier to permute them in SPECK128_Enc_Block rather than the calling code. // It's also easier to permute them in SPECK128_Enc_Block rather than the calling code.
// SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. // SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. The zero block below is a
// "don't care". It is present so we can vectorize SPECK128_Enc_Block.
__m128i block1 = _mm_setzero_si128(); __m128i block1 = _mm_setzero_si128();
__m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1);
@ -76,11 +77,12 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
for (size_t i=0; static_cast<int>(i)<rounds; ++i) for (size_t i=0; static_cast<int>(i)<rounds; ++i)
{ {
const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i))); const __m128i rk = _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
x1 = RotateRight64<8>(x1); x1 = RotateRight64<8>(x1);
x1 = _mm_add_epi64(x1, y1); x1 = _mm_add_epi64(x1, y1);
x1 = _mm_xor_si128(x1, k1); x1 = _mm_xor_si128(x1, rk);
y1 = RotateLeft64<3>(y1); y1 = RotateLeft64<3>(y1);
y1 = _mm_xor_si128(y1, x1); y1 = _mm_xor_si128(y1, x1);
} }
@ -89,7 +91,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
block0 = _mm_unpacklo_epi64(x1, y1); block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1); // block1 = _mm_unpackhi_epi64(x1, y1);
} }
inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1, inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
@ -112,14 +114,15 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
for (size_t i=0; static_cast<int>(i)<rounds; ++i) for (size_t i=0; static_cast<int>(i)<rounds; ++i)
{ {
const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i))); const __m128i rk = _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
x1 = RotateRight64<8>(x1); x1 = RotateRight64<8>(x1);
x2 = RotateRight64<8>(x2); x2 = RotateRight64<8>(x2);
x1 = _mm_add_epi64(x1, y1); x1 = _mm_add_epi64(x1, y1);
x2 = _mm_add_epi64(x2, y2); x2 = _mm_add_epi64(x2, y2);
x1 = _mm_xor_si128(x1, k1); x1 = _mm_xor_si128(x1, rk);
x2 = _mm_xor_si128(x2, k1); x2 = _mm_xor_si128(x2, rk);
y1 = RotateLeft64<3>(y1); y1 = RotateLeft64<3>(y1);
y2 = RotateLeft64<3>(y2); y2 = RotateLeft64<3>(y2);
y1 = _mm_xor_si128(y1, x1); y1 = _mm_xor_si128(y1, x1);
@ -142,7 +145,8 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
// Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a // Hack ahead... SPECK128_AdvancedProcessBlocks_SSSE3 loads each SPECK-128 block into a
// __m128i. We can't SSE over them, so we rearrange the data to allow packed operations. // __m128i. We can't SSE over them, so we rearrange the data to allow packed operations.
// It's also easier to permute them in SPECK128_Dec_Block rather than the calling code. // It's also easier to permute them in SPECK128_Dec_Block rather than the calling code.
// SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. // SPECK128_AdvancedProcessBlocks_SSSE3 is rather messy. The zero block below is a
// "don't care". It is present so we can vectorize SPECK128_Dec_Block.
__m128i block1 = _mm_setzero_si128(); __m128i block1 = _mm_setzero_si128();
__m128i x1 = _mm_unpacklo_epi64(block0, block1); __m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1); __m128i y1 = _mm_unpackhi_epi64(block0, block1);
@ -153,11 +157,12 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
for (size_t i=rounds-1; static_cast<int>(i)>=0; --i) for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
{ {
const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i))); const __m128i rk = _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
y1 = _mm_xor_si128(y1, x1); y1 = _mm_xor_si128(y1, x1);
y1 = RotateRight64<3>(y1); y1 = RotateRight64<3>(y1);
x1 = _mm_xor_si128(x1, k1); x1 = _mm_xor_si128(x1, rk);
x1 = _mm_sub_epi64(x1, y1); x1 = _mm_sub_epi64(x1, y1);
x1 = RotateLeft64<8>(x1); x1 = RotateLeft64<8>(x1);
} }
@ -166,7 +171,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
y1 = _mm_shuffle_epi8(y1, mask); y1 = _mm_shuffle_epi8(y1, mask);
block0 = _mm_unpacklo_epi64(x1, y1); block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1); // block1 = _mm_unpackhi_epi64(x1, y1);
} }
inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1, inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
@ -189,14 +194,15 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
for (size_t i=rounds-1; static_cast<int>(i)>=0; --i) for (size_t i=rounds-1; static_cast<int>(i)>=0; --i)
{ {
const __m128i k1 = _mm_castpd_si128(_mm_loaddup_pd((const double*)(subkeys+i))); const __m128i rk = _mm_castpd_si128(
_mm_loaddup_pd(reinterpret_cast<const double*>(subkeys+i)));
y1 = _mm_xor_si128(y1, x1); y1 = _mm_xor_si128(y1, x1);
y2 = _mm_xor_si128(y2, x2); y2 = _mm_xor_si128(y2, x2);
y1 = RotateRight64<3>(y1); y1 = RotateRight64<3>(y1);
y2 = RotateRight64<3>(y2); y2 = RotateRight64<3>(y2);
x1 = _mm_xor_si128(x1, k1); x1 = _mm_xor_si128(x1, rk);
x2 = _mm_xor_si128(x2, k1); x2 = _mm_xor_si128(x2, rk);
x1 = _mm_sub_epi64(x1, y1); x1 = _mm_sub_epi64(x1, y1);
x2 = _mm_sub_epi64(x2, y2); x2 = _mm_sub_epi64(x2, y2);
x1 = RotateLeft64<8>(x1); x1 = RotateLeft64<8>(x1);