Switch to uint64x2_t for SPECK-128

pull/548/head
Jeffrey Walton 2017-12-04 03:38:39 -05:00
parent 1de143203e
commit cd31fa29dc
GPG Key ID: B36AB348921B1838
1 changed file with 99 additions and 91 deletions

@@ -282,6 +282,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
         outIncrement = 0-outIncrement;

         // Hack... Disable parallel for decryption. It is buggy.
+        // What needs to happen is, move pointer one more block size to get
+        // a full 128-bit word, then swap N-bit words, and then swap the
+        // Xor block if it is being used. Its a real kludge and it is
+        // being side stepped at the moment.
         flags &= ~BlockTransformation::BT_AllowParallel;
     }
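
Note (illustration, not part of the diff): for SPECK-64 two 64-bit blocks share one 128-bit NEON register, which is why in-place reverse-direction processing would need the pointer fix-up and word swaps the new comment describes. A minimal sketch of the 64-bit half swap that fix-up would involve, assuming an AArch64 toolchain with <arm_neon.h>; the helper name is hypothetical and does not appear in the source:

    #include <arm_neon.h>

    // Exchange the two SPECK-64 blocks packed in one register: [b0|b1] -> [b1|b0].
    static inline uint32x4_t SwapBlocks64(uint32x4_t two_blocks)
    {
        const uint64x2_t t = vreinterpretq_u64_u32(two_blocks);
        return vreinterpretq_u32_u64(vcombine_u64(vget_high_u64(t), vget_low_u64(t)));
    }
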
@@ -289,7 +293,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
     {
         while (length >= 4*blockSize)
         {
-            uint32x4_t block0 = vld1q_u32(reinterpret_cast<const word32*>(inBlocks)), block1;
+            uint32x4_t block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks)), block1;
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
                 const uint32x4_t be1 = vld1q_u32(s_one64);
@@ -300,7 +304,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
             else
             {
                 inBlocks += 2*inIncrement;
-                block1 = vld1q_u32(Ptr32(inBlocks));
+                block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
                 inBlocks += 2*inIncrement;
             }
@@ -308,9 +312,9 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
             {
                 // Coverity finding, appears to be false positive. Assert the condition.
                 CRYPTOPP_ASSERT(xorBlocks);
-                block0 = veorq_u32(block0, vld1q_u32(Ptr32(xorBlocks)));
+                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                 xorBlocks += 2*xorIncrement;
-                block1 = veorq_u32(block1, vld1q_u32(Ptr32(xorBlocks)));
+                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                 xorBlocks += 2*xorIncrement;
             }
@@ -318,9 +322,9 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F1 func1, F4 func4,
             if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
             {
-                block0 = veorq_u32(block0, vld1q_u32(Ptr32(xorBlocks)));
+                block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                 xorBlocks += 2*xorIncrement;
-                block1 = veorq_u32(block1, vld1q_u32(Ptr32(xorBlocks)));
+                block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
                 xorBlocks += 2*xorIncrement;
             }
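
Note (illustration, not part of the diff): throughout these hunks the Ptr32 casts are replaced with byte loads re-typed via vreinterpretq_u32_u8. One plausible motivation is that vld1q_u8 only assumes byte alignment, so the code no longer casts an arbitrary byte pointer to word32*. A minimal sketch of the idiom, assuming an AArch64 toolchain; the helper name is hypothetical:

    #include <arm_neon.h>
    #include <cstdint>

    // Load 16 bytes, then re-type the lanes as four 32-bit words.
    // The vreinterpretq_* cast costs no instructions.
    static inline uint32x4_t LoadBlocks32(const uint8_t* p)
    {
        return vreinterpretq_u32_u8(vld1q_u8(p));
    }
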
@@ -384,20 +388,20 @@ const word32 s_one128[] = {0, 0, 0, 1<<24}; // uint32x4_t
 const word32 s_one128[] = {0, 0, 0, 1}; // uint32x4_t
 #endif

-template <class W, class T>
-inline W UnpackHigh64(const T& a, const T& b)
+template <class T>
+inline T UnpackHigh64(const T& a, const T& b)
 {
     const uint64x1_t x(vget_high_u64((uint64x2_t)a));
     const uint64x1_t y(vget_high_u64((uint64x2_t)b));
-    return (W)vcombine_u64(x, y);
+    return (T)vcombine_u64(x, y);
 }

-template <class W, class T>
-inline W UnpackLow64(const T& a, const T& b)
+template <class T>
+inline T UnpackLow64(const T& a, const T& b)
 {
     const uint64x1_t x(vget_low_u64((uint64x2_t)a));
     const uint64x1_t y(vget_low_u64((uint64x2_t)b));
-    return (W)vcombine_u64(x, y);
+    return (T)vcombine_u64(x, y);
 }

 template <unsigned int R>
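
Note (illustration, not part of the diff): with the template reduced to a single parameter, UnpackLow64/UnpackHigh64 take and return the same vector type. Semantically they transpose 64-bit lanes across two vectors: UnpackLow64(a, b) = [a.low | b.low] and UnpackHigh64(a, b) = [a.high | b.high]. A small self-contained check of that behavior, assuming an AArch64 toolchain:

    #include <arm_neon.h>
    #include <cstdio>
    #include <cstdint>

    int main()
    {
        const uint64_t av[2] = {1, 2}, bv[2] = {3, 4};
        const uint64x2_t a = vld1q_u64(av), b = vld1q_u64(bv);
        // Same lane movement as UnpackLow64/UnpackHigh64 in the diff.
        const uint64x2_t lo = vcombine_u64(vget_low_u64(a),  vget_low_u64(b));   // {1, 3}
        const uint64x2_t hi = vcombine_u64(vget_high_u64(a), vget_high_u64(b));  // {2, 4}
        std::printf("lo: %llu %llu\n", (unsigned long long)vgetq_lane_u64(lo, 0),
                                       (unsigned long long)vgetq_lane_u64(lo, 1));
        std::printf("hi: %llu %llu\n", (unsigned long long)vgetq_lane_u64(hi, 0),
                                       (unsigned long long)vgetq_lane_u64(hi, 1));
        return 0;
    }
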
@@ -450,14 +454,14 @@ inline uint64x2_t Shuffle64(const uint64x2_t& val)
 #endif
 }

-inline void SPECK128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
+inline void SPECK128_Enc_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute
     // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
     // The zero block below is a "don't care". It is present so we can vectorize.
-    uint8x16_t block1 = {0};
-    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
-    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+    uint64x2_t block1 = {0};
+    uint64x2_t x1 = UnpackLow64(block0, block1);
+    uint64x2_t y1 = UnpackHigh64(block0, block1);

     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
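
Note (reference only, not part of the diff): the x1/y1 split above exists because each SPECK-128 round updates a 64-bit x word and a 64-bit y word per block; packing corresponding words of two blocks into one uint64x2_t lets a single round update both blocks at once. For orientation, the scalar round per the published SPECK specification (rotation amounts 8 and 3 for the 128-bit block size) is:

    #include <cstdint>

    // Rotations assume 0 < n < 64.
    static inline uint64_t ROL64(uint64_t v, unsigned n) { return (v << n) | (v >> (64 - n)); }
    static inline uint64_t ROR64(uint64_t v, unsigned n) { return (v >> n) | (v << (64 - n)); }

    // One SPECK-128 encryption round with round key k.
    static inline void SPECK128_EncRound(uint64_t& x, uint64_t& y, uint64_t k)
    {
        x = (ROR64(x, 8) + y) ^ k;
        y = ROL64(y, 3) ^ x;
    }
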
@@ -476,22 +480,22 @@ inline void SPECK128_Enc_Block(uint8x16_t &block0, const word64 *subkeys, unsign
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);

-    block0 = UnpackLow64<uint8x16_t>(x1, y1);
-    // block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+    block0 = UnpackLow64(x1, y1);
+    // block1 = UnpackHigh64(x1, y1);
 }

-inline void SPECK128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
-    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
-    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
+inline void SPECK128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
+    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
+    uint64x2_t &block5, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute
     // the data in SPECK128_Enc_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
-    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
-    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
-    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
-    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
-    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
-    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);
+    uint64x2_t x1 = UnpackLow64(block0, block1);
+    uint64x2_t y1 = UnpackHigh64(block0, block1);
+    uint64x2_t x2 = UnpackLow64(block2, block3);
+    uint64x2_t y2 = UnpackHigh64(block2, block3);
+    uint64x2_t x3 = UnpackLow64(block4, block5);
+    uint64x2_t y3 = UnpackHigh64(block4, block5);

     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
@@ -528,22 +532,22 @@ inline void SPECK128_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
     x3 = Shuffle64(x3);
     y3 = Shuffle64(y3);

-    block0 = UnpackLow64<uint8x16_t>(x1, y1);
-    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
-    block2 = UnpackLow64<uint8x16_t>(x2, y2);
-    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
-    block4 = UnpackLow64<uint8x16_t>(x3, y3);
-    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
+    block0 = UnpackLow64(x1, y1);
+    block1 = UnpackHigh64(x1, y1);
+    block2 = UnpackLow64(x2, y2);
+    block3 = UnpackHigh64(x2, y2);
+    block4 = UnpackLow64(x3, y3);
+    block5 = UnpackHigh64(x3, y3);
 }

-inline void SPECK128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsigned int rounds)
+inline void SPECK128_Dec_Block(uint64x2_t &block0, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute
     // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
     // The zero block below is a "don't care". It is present so we can vectorize.
-    uint8x16_t block1 = {0};
-    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
-    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
+    uint64x2_t block1 = {0};
+    uint64x2_t x1 = UnpackLow64(block0, block1);
+    uint64x2_t y1 = UnpackHigh64(block0, block1);

     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
@@ -562,22 +566,22 @@ inline void SPECK128_Dec_Block(uint8x16_t &block0, const word64 *subkeys, unsign
     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);

-    block0 = UnpackLow64<uint8x16_t>(x1, y1);
-    // block1 = UnpackHigh64<uint8x16_t>(x1, y1);
+    block0 = UnpackLow64(x1, y1);
+    // block1 = UnpackHigh64(x1, y1);
 }

-inline void SPECK128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
-    uint8x16_t &block2, uint8x16_t &block3, uint8x16_t &block4,
-    uint8x16_t &block5, const word64 *subkeys, unsigned int rounds)
+inline void SPECK128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
+    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
+    uint64x2_t &block5, const word64 *subkeys, unsigned int rounds)
 {
     // Hack ahead... Rearrange the data for vectorization. It is easier to permute
     // the data in SPECK128_Dec_Blocks then SPECK128_AdvancedProcessBlocks_NEON.
-    uint64x2_t x1 = UnpackLow64<uint64x2_t>(block0, block1);
-    uint64x2_t y1 = UnpackHigh64<uint64x2_t>(block0, block1);
-    uint64x2_t x2 = UnpackLow64<uint64x2_t>(block2, block3);
-    uint64x2_t y2 = UnpackHigh64<uint64x2_t>(block2, block3);
-    uint64x2_t x3 = UnpackLow64<uint64x2_t>(block4, block5);
-    uint64x2_t y3 = UnpackHigh64<uint64x2_t>(block4, block5);
+    uint64x2_t x1 = UnpackLow64(block0, block1);
+    uint64x2_t y1 = UnpackHigh64(block0, block1);
+    uint64x2_t x2 = UnpackLow64(block2, block3);
+    uint64x2_t y2 = UnpackHigh64(block2, block3);
+    uint64x2_t x3 = UnpackLow64(block4, block5);
+    uint64x2_t y3 = UnpackHigh64(block4, block5);

     x1 = Shuffle64(x1);
     y1 = Shuffle64(y1);
@@ -614,12 +618,12 @@ inline void SPECK128_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1,
     x3 = Shuffle64(x3);
     y3 = Shuffle64(y3);

-    block0 = UnpackLow64<uint8x16_t>(x1, y1);
-    block1 = UnpackHigh64<uint8x16_t>(x1, y1);
-    block2 = UnpackLow64<uint8x16_t>(x2, y2);
-    block3 = UnpackHigh64<uint8x16_t>(x2, y2);
-    block4 = UnpackLow64<uint8x16_t>(x3, y3);
-    block5 = UnpackHigh64<uint8x16_t>(x3, y3);
+    block0 = UnpackLow64(x1, y1);
+    block1 = UnpackHigh64(x1, y1);
+    block2 = UnpackLow64(x2, y2);
+    block3 = UnpackHigh64(x2, y2);
+    block4 = UnpackLow64(x3, y3);
+    block5 = UnpackHigh64(x3, y3);
 }

 template <typename F1, typename F6>
@@ -651,40 +655,40 @@ size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
     {
         while (length >= 6*blockSize)
         {
-            uint8x16_t block0, block1, block2, block3, block4, block5, temp;
-            block0 = vld1q_u8(inBlocks);
+            uint64x2_t block0, block1, block2, block3, block4, block5;
+            block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                uint32x4_t be = vld1q_u32(s_one128);
-                block1 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block0), be);
-                block2 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block1), be);
-                block3 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block2), be);
-                block4 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block3), be);
-                block5 = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block4), be);
-                temp = (uint8x16_t)vaddq_u32(vreinterpretq_u32_u8(block5), be);
-                vst1q_u8(const_cast<byte*>(inBlocks), temp);
+                uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one128));
+                block1 = vaddq_u64(block0, be);
+                block2 = vaddq_u64(block1, be);
+                block3 = vaddq_u64(block2, be);
+                block4 = vaddq_u64(block3, be);
+                block5 = vaddq_u64(block4, be);
+                vst1q_u8(const_cast<byte*>(inBlocks),
+                    vreinterpretq_u8_u64(vaddq_u64(block5, be)));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
-                block1 = vld1q_u8(inBlocks+1*inc);
-                block2 = vld1q_u8(inBlocks+2*inc);
-                block3 = vld1q_u8(inBlocks+3*inc);
-                block4 = vld1q_u8(inBlocks+4*inc);
-                block5 = vld1q_u8(inBlocks+5*inc);
+                block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+1*inc));
+                block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+2*inc));
+                block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+3*inc));
+                block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+4*inc));
+                block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks+5*inc));
                 inBlocks += 6*inc;
             }

             if (flags & BlockTransformation::BT_XorInput)
             {
                 const int inc = static_cast<int>(xorIncrement);
-                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
-                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
-                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
-                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
-                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
-                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
+                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+0*inc)));
+                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+1*inc)));
+                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+2*inc)));
+                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+3*inc)));
+                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+4*inc)));
+                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+5*inc)));
                 xorBlocks += 6*inc;
             }
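
Note (illustration, not part of the diff): the counter increment switches from vaddq_u32 to vaddq_u64. With 32-bit lanes a carry out of a lane is silently dropped; with 64-bit lanes the carry propagates across the whole 64-bit counter word. A small demonstration of the difference, assuming an AArch64 toolchain:

    #include <arm_neon.h>
    #include <cstdio>
    #include <cstdint>

    int main()
    {
        const uint64_t ctr_v[2] = {0, 0xFFFFFFFFull};  // low 32 bits of lane 1 are all ones
        const uint64_t one_v[2] = {0, 1};
        const uint64x2_t ctr = vld1q_u64(ctr_v), one = vld1q_u64(one_v);

        const uint64x2_t add64 = vaddq_u64(ctr, one);
        const uint64x2_t add32 = vreinterpretq_u64_u32(
            vaddq_u32(vreinterpretq_u32_u64(ctr), vreinterpretq_u32_u64(one)));

        // 64-bit add carries into the upper half: prints 0x100000000
        std::printf("u64 add: 0x%llx\n", (unsigned long long)vgetq_lane_u64(add64, 1));
        // 32-bit lane add wraps to zero and the carry is lost: prints 0x0
        std::printf("u32 add: 0x%llx\n", (unsigned long long)vgetq_lane_u64(add32, 1));
        return 0;
    }
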
@@ -693,22 +697,22 @@ size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
             if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
             {
                 const int inc = static_cast<int>(xorIncrement);
-                block0 = veorq_u8(block0, vld1q_u8(xorBlocks+0*inc));
-                block1 = veorq_u8(block1, vld1q_u8(xorBlocks+1*inc));
-                block2 = veorq_u8(block2, vld1q_u8(xorBlocks+2*inc));
-                block3 = veorq_u8(block3, vld1q_u8(xorBlocks+3*inc));
-                block4 = veorq_u8(block4, vld1q_u8(xorBlocks+4*inc));
-                block5 = veorq_u8(block5, vld1q_u8(xorBlocks+5*inc));
+                block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+0*inc)));
+                block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+1*inc)));
+                block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+2*inc)));
+                block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+3*inc)));
+                block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+4*inc)));
+                block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks+5*inc)));
                 xorBlocks += 6*inc;
             }

             const int inc = static_cast<int>(outIncrement);
-            vst1q_u8(outBlocks+0*inc, block0);
-            vst1q_u8(outBlocks+1*inc, block1);
-            vst1q_u8(outBlocks+2*inc, block2);
-            vst1q_u8(outBlocks+3*inc, block3);
-            vst1q_u8(outBlocks+4*inc, block4);
-            vst1q_u8(outBlocks+5*inc, block5);
+            vst1q_u8(outBlocks+0*inc, vreinterpretq_u8_u64(block0));
+            vst1q_u8(outBlocks+1*inc, vreinterpretq_u8_u64(block1));
+            vst1q_u8(outBlocks+2*inc, vreinterpretq_u8_u64(block2));
+            vst1q_u8(outBlocks+3*inc, vreinterpretq_u8_u64(block3));
+            vst1q_u8(outBlocks+4*inc, vreinterpretq_u8_u64(block4));
+            vst1q_u8(outBlocks+5*inc, vreinterpretq_u8_u64(block5));
             outBlocks += 6*inc;

             length -= 6*blockSize;
@@ -717,10 +721,10 @@ size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
         while (length >= blockSize)
         {
-            uint8x16_t block = vld1q_u8(inBlocks);
+            uint64x2_t block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));

             if (flags & BlockTransformation::BT_XorInput)
-                block = veorq_u8(block, vld1q_u8(xorBlocks));
+                block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

             if (flags & BlockTransformation::BT_InBlockIsCounter)
                 const_cast<byte *>(inBlocks)[15]++;
@@ -728,9 +732,9 @@ size_t SPECK128_AdvancedProcessBlocks_NEON(F1 func1, F6 func6,
             func1(block, subKeys, rounds);

             if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
-                block = veorq_u8(block, vld1q_u8(xorBlocks));
+                block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));

-            vst1q_u8(outBlocks, block);
+            vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));

             inBlocks += inIncrement;
             outBlocks += outIncrement;
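
Note (illustration, not part of the diff): the vreinterpretq_u64_u8 / vreinterpretq_u8_u64 wrappers added around the loads and stores are compile-time re-typings of the same 128-bit register and generate no instructions. A minimal sketch of the round trip, assuming an AArch64 toolchain; the helper name is hypothetical:

    #include <arm_neon.h>
    #include <cstdint>

    // Load a 16-byte block, view it as two 64-bit lanes, then view it as
    // bytes again for the store. Only vld1q_u8 and vst1q_u8 emit code.
    static inline void CopyBlock128(const uint8_t* in, uint8_t* out)
    {
        const uint64x2_t b = vreinterpretq_u64_u8(vld1q_u8(in));
        vst1q_u8(out, vreinterpretq_u8_u64(b));
    }
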
@@ -1269,6 +1273,10 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F1 func1, F4 func4,
         outIncrement = 0-outIncrement;

         // Hack... Disable parallel for decryption. It is buggy.
+        // What needs to happen is, move pointer one more block size to get
+        // a full 128-bit word, then swap N-bit words, and then swap the
+        // Xor block if it is being used. Its a real kludge and it is
+        // being side stepped at the moment.
         flags &= ~BlockTransformation::BT_AllowParallel;
     }