From cd31fa29dcbb63a3c917ac73752c985f0074720e Mon Sep 17 00:00:00 2001
From: Jeffrey Walton
Date: Mon, 4 Dec 2017 03:38:39 -0500
Subject: [PATCH] Switch to uint64x2_t for SPECK-128

---
 speck-simd.cpp | 190 ++++++++++++++++++++++++++-----------------------
 1 file changed, 99 insertions(+), 91 deletions(-)

diff --git a/speck-simd.cpp b/speck-simd.cpp
index 9e286557..c8232d01 100644
--- a/speck-simd.cpp
+++ b/speck-simd.cpp
@@ -282,6 +282,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
         outIncrement = 0-outIncrement;
 
         // Hack... Disable parallel for decryption. It is buggy.
+        // What needs to happen is, move the pointer one more block size to get
+        // a full 128-bit word, then swap N-bit words, and then swap the
+        // Xor block if it is being used. It's a real kludge and it is
+        // being sidestepped at the moment.
         flags &= ~BlockTransformation::BT_AllowParallel;
     }
 
@@ -289,7 +293,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
     {
         while (length >= 4*blockSize)
        {
-            uint32x4_t block0 = vld1q_u32(reinterpret_cast<const uint32_t*>(inBlocks)), block1;
+            uint32x4_t block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)), block1;
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
                 const uint32x4_t be1 = vld1q_u32(s_one64);
@@ -300,7 +304,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
             else
             {
                 inBlocks += 2*inIncrement;
-                block1 = vld1q_u32(Ptr32(inBlocks));
+                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                 inBlocks += 2*inIncrement;
             }
 
@@ -308,9 +312,9 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
             {
                 // Coverity finding, appears to be false positive. Assert the condition.
                 CRYPTOPP_ASSERT(xorBlocks);
-                block0 = veorq_u32(block0, vld1q_u32(Ptr32(xorBlocks)));
+                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                 xorBlocks += 2*xorIncrement;
-                block1 = veorq_u32(block1, vld1q_u32(Ptr32(xorBlocks)));
+                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                 xorBlocks += 2*xorIncrement;
             }
 
@@ -318,9 +322,9 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
 
             if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
             {
-                block0 = veorq_u32(block0, vld1q_u32(Ptr32(xorBlocks)));
+                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                 xorBlocks += 2*xorIncrement;
-                block1 = veorq_u32(block1, vld1q_u32(Ptr32(xorBlocks)));
+                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                 xorBlocks += 2*xorIncrement;
             }
 
@@ -384,20 +388,20 @@ const word32 s_one128[] = {0, 0, 0, 1<<24}; // uint32x4_t
 const word32 s_one128[] = {0, 0, 0, 1};     // uint32x4_t
 #endif
 
-template <class T, class W>
-inline W UnpackHigh64(const T& a, const T& b)
+template <class T>
+inline T UnpackHigh64(const T& a, const T& b)
 {
     const uint64x1_t x(vget_high_u64((uint64x2_t)a));
     const uint64x1_t y(vget_high_u64((uint64x2_t)b));
-    return (W)vcombine_u64(x, y);
+    return (T)vcombine_u64(x, y);
 }
 
-template <class T, class W>
-inline W UnpackLow64(const T& a, const T& b)
+template <class T>
+inline T UnpackLow64(const T& a, const T& b)
 {
     const uint64x1_t x(vget_low_u64((uint64x2_t)a));
     const uint64x1_t y(vget_low_u64((uint64x2_t)b));
-    return (W)vcombine_u64(x, y);
+    return (T)vcombine_u64(x, y);
 }
 
 template
@@ -450,14 +454,14 @@ inline uint64x2_t Shuffle64(const uint64x2_t& val)
 {
 #endif
 }
-inline void SPECK128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
+inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute
     // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
     // The zero block below is a "don't care". It is present so we can vectorize.
-    uint8x16_t block1 = {0};
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
+    uint64x2_t block1 = {0};
+    uint64x2_t x1 = UnpackLow64(block0, block1);
+    uint64x2_t y1 = UnpackHigh64(block0, block1);
 
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
@@ -476,22 +480,22 @@ inline void SPECK128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsign
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
 
-    block0 = UnpackLow64(x1, y1);
-    // block1 = UnpackHigh64(x1, y1);
+    block0 = UnpackLow64(x1, y1);
+    // block1 = UnpackHigh64(x1, y1);
 }
 
-inline void SPECK128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
-    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
-    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
+inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
+    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
+    uint64x2_t &block5, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute
     // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
+    uint64x2_t x1 = UnpackLow64(block0, block1);
+    uint64x2_t y1 = UnpackHigh64(block0, block1);
+    uint64x2_t x2 = UnpackLow64(block2, block3);
+    uint64x2_t y2 = UnpackHigh64(block2, block3);
+    uint64x2_t x3 = UnpackLow64(block4, block5);
+    uint64x2_t y3 = UnpackHigh64(block4, block5);
 
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
@@ -528,22 +532,22 @@ inline void SPECK128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
     x3 = Shuffle64(x3);
     y3 = Shuffle64(y3);
 
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    block0 = UnpackLow64(x1, y1);
+    block1 = UnpackHigh64(x1, y1);
+    block2 = UnpackLow64(x2, y2);
+    block3 = UnpackHigh64(x2, y2);
+    block4 = UnpackLow64(x3, y3);
+    block5 = UnpackHigh64(x3, y3);
 }
 
-inline void SPECK128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
+inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute
     // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
    // The zero block below is a "don't care". It is present so we can vectorize.
-    uint8x16_t block1 = {0};
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
+    uint64x2_t block1 = {0};
+    uint64x2_t x1 = UnpackLow64(block0, block1);
+    uint64x2_t y1 = UnpackHigh64(block0, block1);
 
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
@@ -562,22 +566,22 @@ inline void SPECK128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsign
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
 
-    block0 = UnpackLow64(x1, y1);
-    // block1 = UnpackHigh64(x1, y1);
+    block0 = UnpackLow64(x1, y1);
+    // block1 = UnpackHigh64(x1, y1);
 }
 
-inline void SPECK128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
-    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
-    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
+inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
+    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
+    uint64x2_t &block5, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute
     // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
-    uint64x2_t x1 = UnpackLow64(block0, block1);
-    uint64x2_t y1 = UnpackHigh64(block0, block1);
-    uint64x2_t x2 = UnpackLow64(block2, block3);
-    uint64x2_t y2 = UnpackHigh64(block2, block3);
-    uint64x2_t x3 = UnpackLow64(block4, block5);
-    uint64x2_t y3 = UnpackHigh64(block4, block5);
+    uint64x2_t x1 = UnpackLow64(block0, block1);
+    uint64x2_t y1 = UnpackHigh64(block0, block1);
+    uint64x2_t x2 = UnpackLow64(block2, block3);
+    uint64x2_t y2 = UnpackHigh64(block2, block3);
+    uint64x2_t x3 = UnpackLow64(block4, block5);
+    uint64x2_t y3 = UnpackHigh64(block4, block5);
 
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
@@ -614,12 +618,12 @@ inline void SPECK128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
     x3 = Shuffle64(x3);
     y3 = Shuffle64(y3);
 
-    block0 = UnpackLow64(x1, y1);
-    block1 = UnpackHigh64(x1, y1);
-    block2 = UnpackLow64(x2, y2);
-    block3 = UnpackHigh64(x2, y2);
-    block4 = UnpackLow64(x3, y3);
-    block5 = UnpackHigh64(x3, y3);
+    block0 = UnpackLow64(x1, y1);
+    block1 = UnpackHigh64(x1, y1);
+    block2 = UnpackLow64(x2, y2);
+    block3 = UnpackHigh64(x2, y2);
+    block4 = UnpackLow64(x3, y3);
+    block5 = UnpackHigh64(x3, y3);
 }
 
 template
@@ -651,40 +655,40 @@ size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
     {
         while (length >= 6*blockSize)
         {
-            uint8x16_t block0, block1, block2, block3, block4, block5, temp;
-            block0 = vld1q_u8(inBlocks);
+            uint64x2_t block0, block1, block2, block3, block4, block5;
+            block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                uint32x4_t be = vld1q_u32(s_one128);
-                block1 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block0), be);
-                block2 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block1), be);
-                block3 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block2), be);
-                block4 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block3), be);
-                block5 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block4), be);
-                temp = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block5), be);
-                vst1q_u8(const_cast<byte*>(inBlocks), temp);
+                uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one128));
+                block1 = vaddq_u64(block0, be);
+                block2 = vaddq_u64(block1, be);
+                block3 = vaddq_u64(block2, be);
+                block4 = vaddq_u64(block3, be);
+                block5 = vaddq_u64(block4, be);
+                vst1q_u8(const_cast<byte*>(inBlocks),
+                    vreinterpretq_u8_u64(vaddq_u64(block5, be)));
             }
             else
            {
                 const int inc = static_cast<int>(inIncrement);
-                block1 = vld1q_u8(inBlocks+1*inc);
-                block2 = vld1q_u8(inBlocks+2*inc);
-                block3 = vld1q_u8(inBlocks+3*inc);
-                block4 = vld1q_u8(inBlocks+4*inc);
-                block5 = vld1q_u8(inBlocks+5*inc);
+                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+1*inc));
+                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+2*inc));
+                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+3*inc));
+                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+4*inc));
+                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+5*inc));
                 inBlocks += 6*inc;
             }
 
             if (flags & BlockTransformation::BT_XorInput)
             {
                 const int inc = static_cast<int>(xorIncrement);
-                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
-                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
-                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
-                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
-                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
-                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
+                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+0*inc)));
+                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+1*inc)));
+                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+2*inc)));
+                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+3*inc)));
+                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+4*inc)));
+                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+5*inc)));
                 xorBlocks += 6*inc;
             }
 
@@ -693,22 +697,22 @@ size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
             if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
             {
                 const int inc = static_cast<int>(xorIncrement);
-                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
-                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
-                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
-                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
-                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
-                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
+                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+0*inc)));
+                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+1*inc)));
+                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+2*inc)));
+                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+3*inc)));
+                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+4*inc)));
+                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+5*inc)));
                 xorBlocks += 6*inc;
             }
 
             const int inc = static_cast<int>(outIncrement);
-            vst1q_u8(outBlocks+0*inc, block0);
-            vst1q_u8(outBlocks+1*inc, block1);
-            vst1q_u8(outBlocks+2*inc, block2);
-            vst1q_u8(outBlocks+3*inc, block3);
-            vst1q_u8(outBlocks+4*inc, block4);
-            vst1q_u8(outBlocks+5*inc, block5);
+            vst1q_u8(outBlocks+0*inc, vreinterpretq_u8_u64(block0));
+            vst1q_u8(outBlocks+1*inc, vreinterpretq_u8_u64(block1));
+            vst1q_u8(outBlocks+2*inc, vreinterpretq_u8_u64(block2));
+            vst1q_u8(outBlocks+3*inc, vreinterpretq_u8_u64(block3));
+            vst1q_u8(outBlocks+4*inc, vreinterpretq_u8_u64(block4));
+            vst1q_u8(outBlocks+5*inc, vreinterpretq_u8_u64(block5));
             outBlocks += 6*inc;
 
             length -= 6*blockSize;
@@ -717,10 +721,10 @@ size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
 
     while (length >= blockSize)
     {
-        uint8x16_t block = vld1q_u8(inBlocks);
+        uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
 
         if (flags & BlockTransformation::BT_XorInput)
-            block = veorq_u8(block, vld1q_u8(xorBlocks));
+            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
         if (flags & BlockTransformation::BT_InBlockIsCounter)
             const_cast<byte*>(inBlocks)[15]++;
@@ -728,9 +732,9 @@ size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
         func1(block, subKeys, rounds);
 
        if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
-            block = veorq_u8(block, vld1q_u8(xorBlocks));
+            block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
 
-        vst1q_u8(outBlocks, block);
+        vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
 
         inBlocks += inIncrement;
         outBlocks += outIncrement;
@@ -1269,6 +1273,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F1 func1, F4 func4,
         outIncrement = 0-outIncrement;
 
         // Hack... Disable parallel for decryption. It is buggy.
+        // What needs to happen is, move the pointer one more block size to get
+        // a full 128-bit word, then swap N-bit words, and then swap the
+        // Xor block if it is being used. It's a real kludge and it is
+        // being sidestepped at the moment.
         flags &= ~BlockTransformation::BT_AllowParallel;
     }
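
A note for readers who want to try the data-layout change outside of speck-simd.cpp: the sketch below shows the pattern this patch standardizes on for SPECK-128. Blocks are loaded as bytes with vld1q_u8 and immediately reinterpreted to uint64x2_t, so the 64-bit lanes can be added and XORed directly with vaddq_u64 and veorq_u64 instead of bouncing through uint8x16_t and vaddq_u32 as before; two blocks are then deinterleaved into x/y vectors the way UnpackLow64/UnpackHigh64 do before the rounds run. This is a minimal, self-contained sketch and not Crypto++ code: the helpers LoadBlock64 and StoreBlock64 are made-up names, the lane arithmetic is a placeholder rather than the SPECK round function, and it assumes an AArch64/NEON toolchain providing <arm_neon.h>.

    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    // Load 16 bytes and view them as two 64-bit lanes (hypothetical helper).
    static inline uint64x2_t LoadBlock64(const uint8_t* p)
    {
        return vreinterpretq_u64_u8(vld1q_u8(p));
    }

    // View two 64-bit lanes as bytes and store them (hypothetical helper).
    static inline void StoreBlock64(uint8_t* p, uint64x2_t v)
    {
        vst1q_u8(p, vreinterpretq_u8_u64(v));
    }

    // Same shape as the patch's helpers: gather the low (or high) 64-bit lane
    // of two registers into one register.
    static inline uint64x2_t UnpackLow64(uint64x2_t a, uint64x2_t b)
    {
        return vcombine_u64(vget_low_u64(a), vget_low_u64(b));
    }

    static inline uint64x2_t UnpackHigh64(uint64x2_t a, uint64x2_t b)
    {
        return vcombine_u64(vget_high_u64(a), vget_high_u64(b));
    }

    int main()
    {
        uint8_t in[32], out[32];
        for (int i = 0; i < 32; ++i)
            in[i] = static_cast<uint8_t>(i);   // arbitrary test bytes

        // Two adjacent 16-byte blocks, kept in 64-bit lanes throughout.
        uint64x2_t block0 = LoadBlock64(in);
        uint64x2_t block1 = LoadBlock64(in + 16);

        // Deinterleave into x = (x0, x1) and y = (y0, y1), as the patch does
        // before running the rounds over both blocks at once.
        uint64x2_t x = UnpackLow64(block0, block1);
        uint64x2_t y = UnpackHigh64(block0, block1);

        // Placeholder lane arithmetic; 64-bit adds and XORs now work directly.
        x = vaddq_u64(x, vdupq_n_u64(1));
        y = veorq_u64(y, vdupq_n_u64(0x0123456789abcdefULL));

        // Re-interleave and store as bytes.
        StoreBlock64(out,      UnpackLow64(x, y));
        StoreBlock64(out + 16, UnpackHigh64(x, y));

        for (int i = 0; i < 32; ++i)
            std::printf("%02x%c", static_cast<unsigned>(out[i]), (i % 16 == 15) ? '\n' : ' ');
        return 0;
    }

The reason uint64x2_t is the natural carrier here is that SPECK-128's words are 64 bits wide, so the counter bump collapses to a single vaddq_u64 and the only casts left are the vreinterpretq calls at the load/store boundary, which is exactly what the hunks above do throughout SPECK128_AdvancedProcessBlocks_NEON.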