Fix Simon-64 CTR mode

This fixes CTR mode for Simon-64. We were only incrementing half the counters.

We still have Speck-64 to clean up.
pull/548/head
Jeffrey Walton 2017-12-07 19:45:32 -05:00
parent 07f2a4fc3f
commit 02037b5ce6
2 changed files with 110 additions and 120 deletions
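
For a 64-bit block cipher the NEON path packs two counter blocks into one 128-bit vector, so the lanes must advance as {c, c+1}, {c+2, c+3}, and so on. The old code added the same one-per-lane constant to a single 16-byte load, so half of the lanes never carried their own counter values. A minimal sketch of the corrected scheme follows (little-endian NEON target assumed; the helper names are illustrative, not Crypto++ APIs):

```cpp
// Sketch of the corrected 64-bit CTR handling (little-endian NEON target;
// helper names are illustrative and not part of Crypto++).
#include <arm_neon.h>
#include <cstdint>

// Counter bytes are big-endian, so adding 1 to a counter's low byte means
// adding 1<<24 to the second 32-bit word of that 64-bit lane.
static const uint32_t s_one64_1b[4] = {0, 0, 0, 1u<<24};       // bump only the second block
static const uint32_t s_one64_2b[4] = {0, 2u<<24, 0, 2u<<24};  // bump both blocks by two

// Duplicate the single 8-byte counter into both halves and form {ctr, ctr+1}.
inline uint32x4_t MakeCounterPair(const uint8_t* counter)
{
    const uint32x4_t be1 = vld1q_u32(s_one64_1b);
    return vaddq_u32(be1, vreinterpretq_u32_u64(
        vld1q_dup_u64(reinterpret_cast<const uint64_t*>(counter))));
}

// Advance a pair of counters by two blocks: {c, c+1} -> {c+2, c+3}.
inline uint32x4_t NextCounterPair(const uint32x4_t& pair)
{
    return vaddq_u32(vld1q_u32(s_one64_2b), pair);
}

// Write back the next unused counter (the low 8 bytes of lastPair + {2,2}).
// Like the code in the diff, only the low counter byte is touched; rollover
// into higher bytes is assumed to be handled by the caller.
inline void StoreNextCounter(uint8_t* counter, const uint32x4_t& lastPair)
{
    vst1_u8(counter, vget_low_u8(vreinterpretq_u8_u32(NextCounterPair(lastPair))));
}
```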

File 1 of 2: SIMON64 NEON and SSE4.1 implementation

@@ -52,9 +52,13 @@ using CryptoPP::BlockTransformation;
 
 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
-const word32 s_one64[] = {0, 1<<24, 0, 1<<24};
+const word32 s_zero[] = {0, 0, 0, 0};
+const word32 s_one64_1b[] = {0, 0, 0, 1<<24};  // Only the second 8-byte block is incremented after loading
+const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24};  // Routine step. Both 8-byte blocks are incremented
 #else
-const word32 s_one64[] = {0, 1, 0, 1};
+const word32 s_zero[] = {0, 0, 0, 0};
+const word32 s_one64_1b[] = {0, 0, 0, 1};
+const word32 s_one64_2b[] = {0, 2, 0, 2};
 #endif
 
 template <unsigned int R>
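
The little-endian values above encode big-endian counter arithmetic: the counter's least significant byte is the last byte of the 8-byte block, which lands in the top byte of the second 32-bit word when read as little-endian words. A small host-side check of that layout (little-endian machine assumed):

```cpp
// Why the increment constant is 1<<24: a big-endian 8-byte counter holding
// the value 1 reads back as the 32-bit little-endian words {0, 1<<24}.
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    const uint8_t counter[8] = {0, 0, 0, 0, 0, 0, 0, 1};  // big-endian value 1
    uint32_t w[2];
    std::memcpy(w, counter, sizeof(w));
    assert(w[0] == 0 && w[1] == (1u << 24));  // true on a little-endian host
    return 0;
}
```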
@@ -125,30 +129,6 @@ inline uint32x4_t SIMON64_f(const uint32x4_t& val)
         vandq_u32(RotateLeft32<1>(val), RotateLeft32<8>(val)));
 }
 
-template <typename T>
-inline word32* Ptr32(T* ptr)
-{
-    return reinterpret_cast<word32*>(ptr);
-}
-
-template <typename T>
-inline const word32* Ptr32(const T* ptr)
-{
-    return reinterpret_cast<const word32*>(ptr);
-}
-
-template <typename T>
-inline word64* Ptr64(T* ptr)
-{
-    return reinterpret_cast<word64*>(ptr);
-}
-
-template <typename T>
-inline const word64* Ptr64(const T* ptr)
-{
-    return reinterpret_cast<const word64*>(ptr);
-}
-
 inline void SIMON64_Enc_Block(uint32x4_t &block1, uint32x4_t &block0,
     const word32 *subkeys, unsigned int rounds)
 {
@@ -388,25 +368,40 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
     if (flags & BlockTransformation::BT_AllowParallel)
     {
+        // Load these magic values once. Analysis claims be1 and be2
+        // may be uninitialized, but they are when the block is a ctr.
+        uint32x4_t be1, be2;
+        if (flags & BlockTransformation::BT_InBlockIsCounter)
+        {
+            be1 = vld1q_u32(s_one64_1b);
+            be2 = vld1q_u32(s_one64_2b);
+        }
+
         while (length >= 6*neonBlockSize)
         {
             uint32x4_t block0, block1, block2, block3, block4, block5;
-            block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                const uint32x4_t be1 = vld1q_u32(s_one64);
-                block1 = vaddq_u32(block0, be1);
-                block2 = vaddq_u32(block1, be1);
-                block3 = vaddq_u32(block2, be1);
-                block4 = vaddq_u32(block3, be1);
-                block5 = vaddq_u32(block4, be1);
-                vst1q_u8(const_cast<byte*>(inBlocks),
-                    vreinterpretq_u8_u32(vaddq_u32(block5, be1)));
+                // For 64-bit block ciphers we need to load the initial single CTR block.
+                // After the dup load we have two counters in the NEON vector. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
+                    vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
+
+                // After initial increment of {0,1} remaining counters increment by {1,1}.
+                block1 = vaddq_u32(be2, block0);
+                block2 = vaddq_u32(be2, block1);
+                block3 = vaddq_u32(be2, block2);
+                block4 = vaddq_u32(be2, block3);
+                block5 = vaddq_u32(be2, block4);
+
+                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
+                    vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
+                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
                 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
                 block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+2*inc));
                 block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+3*inc));
@@ -456,18 +451,24 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
         while (length >= 2*neonBlockSize)
         {
             uint32x4_t block0, block1;
-            block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                const uint32x4_t be1 = vld1q_u32(s_one64);
-                block1 = vaddq_u32(block0, be1);
-                vst1q_u8(const_cast<byte*>(inBlocks),
-                    vreinterpretq_u8_u32(vaddq_u32(block1, be1)));
+                // For 64-bit block ciphers we need to load the initial single CTR block.
+                // After the dup load we have two counters in the NEON vector. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
+                    vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
+
+                // After initial increment of {0,1} remaining counters increment by {1,1}.
+                block1 = vaddq_u32(be2, block0);
+
+                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
+                    vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
+                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
                 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
                 inBlocks += 2*inc;
             }
@@ -521,16 +522,14 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
     while (length >= blockSize)
     {
-        uint32x4_t block, zero = {0,0,0,0};
-        block = vsetq_lane_u32(Ptr32(inBlocks)[0], block, 0);
-        block = vsetq_lane_u32(Ptr32(inBlocks)[1], block, 1);
+        uint32x4_t zero = vld1q_u32(s_zero);
+        uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
+            reinterpret_cast<const word64*>(inBlocks)));
 
         if (flags & BlockTransformation::BT_XorInput)
         {
-            uint32x4_t x;
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
-            block = veorq_u32(block, x);
+            block = veorq_u32(block, vreinterpretq_u32_u64(
+                vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
         }
 
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -540,16 +539,12 @@ inline size_t SIMON64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
         if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
         {
-            uint32x4_t x;
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
-            block = veorq_u32(block, x);
+            block = veorq_u32(block, vreinterpretq_u32_u64(
+                vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
        }
 
-        word32 t[2];
-        t[0] = vgetq_lane_u32(block, 0);
-        t[1] = vgetq_lane_u32(block, 1);
-        std::memcpy(outBlocks, t, sizeof(t));
+        vst1_u8(const_cast<byte*>(outBlocks),
+            vget_low_u8(vreinterpretq_u8_u32(block)));
 
         inBlocks += inIncrement;
         outBlocks += outIncrement;
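
The single-block path above replaces per-lane vsetq_lane_u32/vgetq_lane_u32 round trips with plain 8-byte loads and stores. The same idiom in isolation (illustrative helper names, little-endian NEON assumed):

```cpp
// 8-byte block load/store helpers matching the idiom used in the diff.
#include <arm_neon.h>
#include <cstdint>

// Duplicate one 8-byte block into both halves of a 128-bit vector.
inline uint32x4_t LoadBlock64(const uint8_t* block)
{
    return vreinterpretq_u32_u64(
        vld1q_dup_u64(reinterpret_cast<const uint64_t*>(block)));
}

// Store only the low 8 bytes back out.
inline void StoreBlock64(uint8_t* block, const uint32x4_t& value)
{
    vst1_u8(block, vget_low_u8(vreinterpretq_u8_u32(value)));
}
```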
@@ -1762,7 +1757,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
     if (flags & BlockTransformation::BT_AllowParallel)
     {
-        // Load these magic value once. Analysis claims be1 and be2
+        // Load these magic values once. Analysis claims be1 and be2
         // may be uninitialized, but they are when the block is a ctr.
         __m128i be1, be2;
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1782,7 +1777,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
                 block0 = _mm_add_epi32(be1, _mm_castpd_si128(
                     _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
-                // After initial increment both counters increment by 1.
+                // After initial increment of {0,1} remaining counters increment by {1,1}.
                 block1 = _mm_add_epi32(be2, block0);
                 block2 = _mm_add_epi32(be2, block1);
                 block3 = _mm_add_epi32(be2, block2);
@@ -1872,7 +1867,7 @@ inline size_t SIMON64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
                 block0 = _mm_add_epi32(be1, _mm_castpd_si128(
                     _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
-                // After initial increment both counters increment by 1.
+                // After initial increment of {0,1} remaining counters increment by {1,1}.
                 block1 = _mm_add_epi32(be2, block0);
 
                 // Store the next counter.
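
The SSE4.1 hunks only touch comments because the x86 path already used the be1/be2 scheme shown above. For reference, a minimal sketch of the equivalent x86 counter handling (illustrative names, little-endian x86 assumed; not the library's exact code):

```cpp
// x86 counterpart of the counter scheme (SSE2/SSE3 intrinsics).
#include <emmintrin.h>   // SSE2
#include <pmmintrin.h>   // SSE3: _mm_loaddup_pd
#include <cstdint>

// Duplicate the 8-byte counter into both halves and form {ctr, ctr+1}.
inline __m128i MakeCounterPairSSE(const uint8_t* counter)
{
    const __m128i be1 = _mm_set_epi32(1 << 24, 0, 0, 0);        // memory order {0, 0, 0, 1<<24}
    return _mm_add_epi32(be1, _mm_castpd_si128(
        _mm_loaddup_pd(reinterpret_cast<const double*>(counter))));
}

// Advance both counters by two blocks.
inline __m128i NextCounterPairSSE(const __m128i& pair)
{
    const __m128i be2 = _mm_set_epi32(2 << 24, 0, 2 << 24, 0);  // memory order {0, 2<<24, 0, 2<<24}
    return _mm_add_epi32(be2, pair);
}

// Store the next unused counter (low 8 bytes only).
inline void StoreNextCounterSSE(uint8_t* counter, const __m128i& lastPair)
{
    _mm_storel_epi64(reinterpret_cast<__m128i*>(counter), NextCounterPairSSE(lastPair));
}
```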

File 2 of 2: SPECK64 NEON and SSE4.1 implementation

@@ -50,9 +50,13 @@ using CryptoPP::BlockTransformation;
 
 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
 #if defined(CRYPTOPP_LITTLE_ENDIAN)
-const word32 s_one64[] = {0, 1<<24, 0, 2<<24};
+const word32 s_zero[] = {0, 0, 0, 0};
+const word32 s_one64_1b[] = {0, 0, 0, 1<<24};  // Only the second 8-byte block is incremented after loading
+const word32 s_one64_2b[] = {0, 2<<24, 0, 2<<24};  // Routine step. Both 8-byte blocks are incremented
 #else
-const word32 s_one64[] = {0, 2, 0, 1};
+const word32 s_zero[] = {0, 0, 0, 0};
+const word32 s_one64_1b[] = {0, 0, 0, 1};
+const word32 s_one64_2b[] = {0, 2, 0, 2};
 #endif
 
 template <unsigned int R>
@@ -117,30 +121,6 @@ inline uint32x4_t Shuffle32(const uint32x4_t& val)
 #endif
 }
 
-template <typename T>
-inline word32* Ptr32(T* ptr)
-{
-    return reinterpret_cast<word32*>(ptr);
-}
-
-template <typename T>
-inline const word32* Ptr32(const T* ptr)
-{
-    return reinterpret_cast<const word32*>(ptr);
-}
-
-template <typename T>
-inline word64* Ptr64(T* ptr)
-{
-    return reinterpret_cast<word64*>(ptr);
-}
-
-template <typename T>
-inline const word64* Ptr64(const T* ptr)
-{
-    return reinterpret_cast<const word64*>(ptr);
-}
-
 inline void SPECK64_Enc_Block(uint32x4_t &block0, uint32x4_t &block1,
     const word32 *subkeys, unsigned int rounds)
 {
@@ -360,25 +340,40 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
     if (flags & BlockTransformation::BT_AllowParallel)
    {
+        // Load these magic values once. Analysis claims be1 and be2
+        // may be uninitialized, but they are when the block is a ctr.
+        uint32x4_t be1, be2;
+        if (flags & BlockTransformation::BT_InBlockIsCounter)
+        {
+            be1 = vld1q_u32(s_one64_1b);
+            be2 = vld1q_u32(s_one64_2b);
+        }
+
         while (length >= 6*neonBlockSize)
         {
             uint32x4_t block0, block1, block2, block3, block4, block5;
-            block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                const uint32x4_t be1 = vld1q_u32(s_one64);
-                block1 = vaddq_u32(block0, be1);
-                block2 = vaddq_u32(block1, be1);
-                block3 = vaddq_u32(block2, be1);
-                block4 = vaddq_u32(block3, be1);
-                block5 = vaddq_u32(block4, be1);
-                vst1q_u8(const_cast<byte*>(inBlocks),
-                    vreinterpretq_u8_u32(vaddq_u32(block5, be1)));
+                // For 64-bit block ciphers we need to load the initial single CTR block.
+                // After the dup load we have two counters in the NEON vector. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
+                    vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
+
+                // After initial increment of {0,1} remaining counters increment by {1,1}.
+                block1 = vaddq_u32(be2, block0);
+                block2 = vaddq_u32(be2, block1);
+                block3 = vaddq_u32(be2, block2);
+                block4 = vaddq_u32(be2, block3);
+                block5 = vaddq_u32(be2, block4);
+
+                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
+                    vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
+                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
                 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
                 block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+2*inc));
                 block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+3*inc));
@@ -428,18 +423,24 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
         while (length >= 2*neonBlockSize)
         {
             uint32x4_t block0, block1;
-            block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
             if (flags & BlockTransformation::BT_InBlockIsCounter)
             {
-                const uint32x4_t be1 = vld1q_u32(s_one64);
-                block1 = vaddq_u32(block0, be1);
-                vst1q_u8(const_cast<byte*>(inBlocks),
-                    vreinterpretq_u8_u32(vaddq_u32(block1, be1)));
+                // For 64-bit block ciphers we need to load the initial single CTR block.
+                // After the dup load we have two counters in the NEON vector. Then we need
+                // to increment the low ctr by 0 and the high ctr by 1.
+                block0 = vaddq_u32(be1, vreinterpretq_u32_u64(
+                    vld1q_dup_u64(reinterpret_cast<const word64*>(inBlocks))));
+
+                // After initial increment of {0,1} remaining counters increment by {1,1}.
+                block1 = vaddq_u32(be2, block0);
+
+                vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
+                    vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
             }
             else
             {
                 const int inc = static_cast<int>(inIncrement);
+                block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+0*inc));
                 block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks+1*inc));
                 inBlocks += 2*inc;
             }
@@ -493,16 +494,14 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
     while (length >= blockSize)
     {
-        uint32x4_t block, zero = {0,0,0,0};
-        block = vsetq_lane_u32(Ptr32(inBlocks)[0], block, 0);
-        block = vsetq_lane_u32(Ptr32(inBlocks)[1], block, 1);
+        uint32x4_t zero = vld1q_u32(s_zero);
+        uint32x4_t block = vreinterpretq_u32_u64(vld1q_dup_u64(
+            reinterpret_cast<const word64*>(inBlocks)));
 
         if (flags & BlockTransformation::BT_XorInput)
        {
-            uint32x4_t x;
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
-            block = veorq_u32(block, x);
+            block = veorq_u32(block, vreinterpretq_u32_u64(
+                vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
         }
 
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -512,16 +511,12 @@ inline size_t SPECK64_AdvancedProcessBlocks_NEON(F2 func2, F6 func6,
         if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
         {
-            uint32x4_t x;
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[0], x, 0);
-            x = vsetq_lane_u32(Ptr32(xorBlocks)[1], x, 1);
-            block = veorq_u32(block, x);
+            block = veorq_u32(block, vreinterpretq_u32_u64(
+                vld1q_dup_u64(reinterpret_cast<const word64*>(xorBlocks))));
         }
 
-        word32 t[2];
-        t[0] = vgetq_lane_u32(block, 0);
-        t[1] = vgetq_lane_u32(block, 1);
-        std::memcpy(outBlocks, t, sizeof(t));
+        vst1_u8(const_cast<byte*>(outBlocks),
+            vget_low_u8(vreinterpretq_u8_u32(block)));
 
         inBlocks += inIncrement;
         outBlocks += outIncrement;
@@ -1658,7 +1653,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
     if (flags & BlockTransformation::BT_AllowParallel)
     {
-        // Load these magic value once. Analysis claims be1 and be2
+        // Load these magic values once. Analysis claims be1 and be2
         // may be uninitialized, but they are when the block is a ctr.
         __m128i be1, be2;
         if (flags & BlockTransformation::BT_InBlockIsCounter)
@@ -1678,7 +1673,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
                 block0 = _mm_add_epi32(be1, _mm_castpd_si128(
                     _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
-                // After initial increment both counters increment by 1.
+                // After initial increment of {0,1} remaining counters increment by {1,1}.
                 block1 = _mm_add_epi32(be2, block0);
                 block2 = _mm_add_epi32(be2, block1);
                 block3 = _mm_add_epi32(be2, block2);
@@ -1768,7 +1763,7 @@ inline size_t SPECK64_AdvancedProcessBlocks_SSE41(F2 func2, F6 func6,
                 block0 = _mm_add_epi32(be1, _mm_castpd_si128(
                     _mm_loaddup_pd(reinterpret_cast<const double*>(inBlocks))));
 
-                // After initial increment both counters increment by 1.
+                // After initial increment of {0,1} remaining counters increment by {1,1}.
                 block1 = _mm_add_epi32(be2, block0);
 
                 // Store the next counter.