Update comments for vector permutes in SPECK-128

pull/548/head
Jeffrey Walton 2017-12-04 12:31:32 -05:00
parent 25709d2597
commit 076937eb81
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
1 changed file with 24 additions and 23 deletions

View File

@@ -380,10 +380,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
block = veorq_u32(block, x);
}
const word32 t0 = vgetq_lane_u32(block, 0);
std::memcpy(Ptr32(outBlocks)+0, &t0, 4);
const word32 t1 = vgetq_lane_u32(block, 1);
std::memcpy(Ptr32(outBlocks)+1, &t1, 4);
word32 t[2];
t[0] = vgetq_lane_u32(block, 0);
t[1] = vgetq_lane_u32(block, 1);
std::memcpy(outBlocks, t, sizeof(t));
inBlocks += inIncrement;
outBlocks += outIncrement;
@@ -476,7 +476,7 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsign
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_t block1 = {0};
uint64x2_t x1 = UnpackLow64(block0, block1);
uint64x2_t y1 = UnpackHigh64(block0, block1);
@@ -498,7 +498,7 @@ inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsign
x1 = Shuffle64(x1);
y1 = Shuffle64(y1);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = UnpackLow64(x1, y1);
// block1 = UnpackHigh64(x1, y1);
}
@@ -511,7 +511,7 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_t x1 = UnpackLow64(block0, block1);
uint64x2_t y1 = UnpackHigh64(block0, block1);
uint64x2_t x2 = UnpackLow64(block2, block3);
@@ -554,7 +554,7 @@ inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
x3 = Shuffle64(x3);
y3 = Shuffle64(y3);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = UnpackLow64(x1, y1);
block1 = UnpackHigh64(x1, y1);
block2 = UnpackLow64(x2, y2);
@@ -569,7 +569,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsign
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_t block1 = {0};
uint64x2_t x1 = UnpackLow64(block0, block1);
uint64x2_t y1 = UnpackHigh64(block0, block1);
@@ -591,7 +591,7 @@ inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsign
x1 = Shuffle64(x1);
y1 = Shuffle64(y1);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = UnpackLow64(x1, y1);
// block1 = UnpackHigh64(x1, y1);
}
@@ -604,7 +604,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
uint64x2_t x1 = UnpackLow64(block0, block1);
uint64x2_t y1 = UnpackHigh64(block0, block1);
uint64x2_t x2 = UnpackLow64(block2, block3);
@@ -647,7 +647,7 @@ inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
x3 = Shuffle64(x3);
y3 = Shuffle64(y3);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = UnpackLow64(x1, y1);
block1 = UnpackHigh64(x1, y1);
block2 = UnpackLow64(x2, y2);
@@ -838,7 +838,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i block1 = _mm_setzero_si128();
__m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1);
@@ -862,7 +862,7 @@ inline void SPECK128_Enc_Block(__m128i &block0, const word64 *subkeys, unsigned
x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1);
// block1 = _mm_unpackhi_epi64(x1, y1);
}
@@ -874,7 +874,7 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1);
__m128i x2 = _mm_unpacklo_epi64(block2, block3);
@@ -908,7 +908,7 @@ inline void SPECK128_Enc_4_Blocks(__m128i &block0, __m128i &block1,
x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1);
block2 = _mm_unpacklo_epi64(x2, y2);
@@ -921,7 +921,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i block1 = _mm_setzero_si128();
__m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1);
@@ -945,7 +945,7 @@ inline void SPECK128_Dec_Block(__m128i &block0, const word64 *subkeys, unsigned
x1 = _mm_shuffle_epi8(x1, mask);
y1 = _mm_shuffle_epi8(y1, mask);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1);
// block1 = _mm_unpackhi_epi64(x1, y1);
}
@@ -957,7 +957,7 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
// a big-endian byte array. Depending on the number of blocks it needs to
// be permuted to the following. If only a single block is available then
// a Zero block is provided to promote vectorizations.
// [A1 A2 A3 A4][B1 B2 B3 B4] ... => [A1 A3 B1 B3][A2 A4 B2 B4] ...
// [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
__m128i x1 = _mm_unpacklo_epi64(block0, block1);
__m128i y1 = _mm_unpackhi_epi64(block0, block1);
__m128i x2 = _mm_unpacklo_epi64(block2, block3);
@@ -991,7 +991,7 @@ inline void SPECK128_Dec_4_Blocks(__m128i &block0, __m128i &block1,
x2 = _mm_shuffle_epi8(x2, mask);
y2 = _mm_shuffle_epi8(y2, mask);
// [A1 A3 B1 B3][A2 A4 B2 B4] => [A1 A2 A3 A4][B1 B2 B3 B4]
// [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
block0 = _mm_unpacklo_epi64(x1, y1);
block1 = _mm_unpackhi_epi64(x1, y1);
block2 = _mm_unpacklo_epi64(x2, y2);
@@ -1402,9 +1402,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F1 func1, F4 func4,
block = _mm_xor_si128(block, _mm_insert_epi32(x, xorPtr[1], 1));
}
word32* outPtr = reinterpret_cast<word32*>(outBlocks);
outPtr[0] = _mm_extract_epi32(block, 0);
outPtr[1] = _mm_extract_epi32(block, 1);
word32 t[2];
t[0] = _mm_extract_epi32(block, 0);
t[1] = _mm_extract_epi32(block, 1);
std::memcpy(outBlocks, t, sizeof(t));
inBlocks += inIncrement;
outBlocks += outIncrement;