Add GCM_SetKeyWithoutResync_PMULL
parent
48f46bb852
commit
6145d52b22
80
gcm-simd.cpp
80
gcm-simd.cpp
|
|
@ -279,6 +279,31 @@ uint64x2_t GCM_Multiply_PMULL(const uint64x2_t &x, const uint64x2_t &h, const ui
|
||||||
return GCM_Reduce_PMULL(c0, c1, c2, r);
|
return GCM_Reduce_PMULL(c0, c1, c2, r);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void GCM_SetKeyWithoutResync_PMULL(byte *mulTable, byte *hashKey, unsigned int tableSize)
|
||||||
|
{
|
||||||
|
const uint64x2_t r = s_clmulConstants[0];
|
||||||
|
const uint64x2_t t = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(hashKey)));
|
||||||
|
const uint64x2_t h0 = vextq_u64(t, t, 1);
|
||||||
|
|
||||||
|
uint64x2_t h = h0;
|
||||||
|
unsigned int i;
|
||||||
|
for (i=0; i<tableSize-32; i+=32)
|
||||||
|
{
|
||||||
|
const uint64x2_t h1 = GCM_Multiply_PMULL(h, h0, r);
|
||||||
|
vst1_u64((uint64_t *)(mulTable+i), vget_low_u64(h));
|
||||||
|
vst1q_u64((uint64_t *)(mulTable+i+16), h1);
|
||||||
|
vst1q_u64((uint64_t *)(mulTable+i+8), h);
|
||||||
|
vst1_u64((uint64_t *)(mulTable+i+8), vget_low_u64(h1));
|
||||||
|
h = GCM_Multiply_PMULL(h1, h0, r);
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint64x2_t h1 = GCM_Multiply_PMULL(h, h0, r);
|
||||||
|
vst1_u64((uint64_t *)(mulTable+i), vget_low_u64(h));
|
||||||
|
vst1q_u64((uint64_t *)(mulTable+i+16), h1);
|
||||||
|
vst1q_u64((uint64_t *)(mulTable+i+8), h);
|
||||||
|
vst1_u64((uint64_t *)(mulTable+i+8), vget_low_u64(h1));
|
||||||
|
}
|
||||||
|
|
||||||
size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
|
size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer)
|
||||||
{
|
{
|
||||||
const uint64x2_t* table = reinterpret_cast<const uint64x2_t*>(mtable);
|
const uint64x2_t* table = reinterpret_cast<const uint64x2_t*>(mtable);
|
||||||
|
|
@ -366,6 +391,46 @@ const unsigned int s_cltableSizeInBlocks = 8;
|
||||||
|
|
||||||
ANONYMOUS_NAMESPACE_END
|
ANONYMOUS_NAMESPACE_END
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// preserved for testing
|
||||||
|
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
|
||||||
|
{
|
||||||
|
word64 Z0=0, Z1=0, V0, V1;
|
||||||
|
|
||||||
|
typedef BlockGetAndPut<word64, BigEndian> Block;
|
||||||
|
Block::Get(a)(V0)(V1);
|
||||||
|
|
||||||
|
for (int i=0; i<16; i++)
|
||||||
|
{
|
||||||
|
for (int j=0x80; j!=0; j>>=1)
|
||||||
|
{
|
||||||
|
int x = b[i] & j;
|
||||||
|
Z0 ^= x ? V0 : 0;
|
||||||
|
Z1 ^= x ? V1 : 0;
|
||||||
|
x = (int)V1 & 1;
|
||||||
|
V1 = (V1>>1) | (V0<<63);
|
||||||
|
V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Block::Put(NULLPTR, c)(Z0)(Z1);
|
||||||
|
}
|
||||||
|
|
||||||
|
__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
|
||||||
|
{
|
||||||
|
word64 A[1] = {ByteReverse(((word64*)&a)[i&1])};
|
||||||
|
word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])};
|
||||||
|
|
||||||
|
PolynomialMod2 pa((byte *)A, 8);
|
||||||
|
PolynomialMod2 pb((byte *)B, 8);
|
||||||
|
PolynomialMod2 c = pa*pb;
|
||||||
|
|
||||||
|
__m128i output;
|
||||||
|
for (int i=0; i<16; i++)
|
||||||
|
((byte *)&output)[i] = c.GetByte(i);
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
__m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
|
__m128i GCM_Reduce_CLMUL(__m128i c0, __m128i c1, __m128i c2, const __m128i &r)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
|
@ -410,18 +475,25 @@ __m128i GCM_Multiply_CLMUL(const __m128i &x, const __m128i &h, const __m128i &r)
|
||||||
void GCM_SetKeyWithoutResync_CLMUL(byte *mulTable, byte *hashKey, unsigned int tableSize)
|
void GCM_SetKeyWithoutResync_CLMUL(byte *mulTable, byte *hashKey, unsigned int tableSize)
|
||||||
{
|
{
|
||||||
const __m128i r = s_clmulConstants[0];
|
const __m128i r = s_clmulConstants[0];
|
||||||
__m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)(void *)hashKey), s_clmulConstants[1]);
|
const __m128i h0 = _mm_shuffle_epi8(_mm_load_si128((__m128i *)(void *)hashKey), s_clmulConstants[1]);
|
||||||
__m128i h = h0;
|
|
||||||
|
|
||||||
for (unsigned int i=0; i<tableSize; i+=32)
|
__m128i h = h0;
|
||||||
|
unsigned int i;
|
||||||
|
for (i=0; i<tableSize-32; i+=32)
|
||||||
{
|
{
|
||||||
__m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
|
const __m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
|
||||||
_mm_storel_epi64((__m128i *)(void *)(mulTable+i), h);
|
_mm_storel_epi64((__m128i *)(void *)(mulTable+i), h);
|
||||||
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+16), h1);
|
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+16), h1);
|
||||||
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+8), h);
|
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+8), h);
|
||||||
_mm_storel_epi64((__m128i *)(void *)(mulTable+i+8), h1);
|
_mm_storel_epi64((__m128i *)(void *)(mulTable+i+8), h1);
|
||||||
h = GCM_Multiply_CLMUL(h1, h0, r);
|
h = GCM_Multiply_CLMUL(h1, h0, r);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const __m128i h1 = GCM_Multiply_CLMUL(h, h0, r);
|
||||||
|
_mm_storel_epi64((__m128i *)(void *)(mulTable+i), h);
|
||||||
|
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+16), h1);
|
||||||
|
_mm_storeu_si128((__m128i *)(void *)(mulTable+i+8), h);
|
||||||
|
_mm_storel_epi64((__m128i *)(void *)(mulTable+i+8), h1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer)
|
void GCM_ReverseHashBufferIfNeeded_CLMUL(byte *hashBuffer)
|
||||||
|
|
|
||||||
63
gcm.cpp
63
gcm.cpp
|
|
@ -61,46 +61,6 @@ void GCM_Base::GCTR::IncrementCounterBy256()
|
||||||
IncrementCounterByOne(m_counterArray+BlockSize()-4, 3);
|
IncrementCounterByOne(m_counterArray+BlockSize()-4, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
|
||||||
// preserved for testing
|
|
||||||
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
|
|
||||||
{
|
|
||||||
word64 Z0=0, Z1=0, V0, V1;
|
|
||||||
|
|
||||||
typedef BlockGetAndPut<word64, BigEndian> Block;
|
|
||||||
Block::Get(a)(V0)(V1);
|
|
||||||
|
|
||||||
for (int i=0; i<16; i++)
|
|
||||||
{
|
|
||||||
for (int j=0x80; j!=0; j>>=1)
|
|
||||||
{
|
|
||||||
int x = b[i] & j;
|
|
||||||
Z0 ^= x ? V0 : 0;
|
|
||||||
Z1 ^= x ? V1 : 0;
|
|
||||||
x = (int)V1 & 1;
|
|
||||||
V1 = (V1>>1) | (V0<<63);
|
|
||||||
V0 = (V0>>1) ^ (x ? W64LIT(0xe1) << 56 : 0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Block::Put(NULLPTR, c)(Z0)(Z1);
|
|
||||||
}
|
|
||||||
|
|
||||||
__m128i _mm_clmulepi64_si128(const __m128i &a, const __m128i &b, int i)
|
|
||||||
{
|
|
||||||
word64 A[1] = {ByteReverse(((word64*)&a)[i&1])};
|
|
||||||
word64 B[1] = {ByteReverse(((word64*)&b)[i>>4])};
|
|
||||||
|
|
||||||
PolynomialMod2 pa((byte *)A, 8);
|
|
||||||
PolynomialMod2 pb((byte *)B, 8);
|
|
||||||
PolynomialMod2 c = pa*pb;
|
|
||||||
|
|
||||||
__m128i output;
|
|
||||||
for (int i=0; i<16; i++)
|
|
||||||
((byte *)&output)[i] = c.GetByte(i);
|
|
||||||
return output;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
inline static void Xor16(byte *a, const byte *b, const byte *c)
|
inline static void Xor16(byte *a, const byte *b, const byte *c)
|
||||||
{
|
{
|
||||||
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
|
CRYPTOPP_ASSERT(IsAlignedOn(a,GetAlignmentOf<word64>()));
|
||||||
|
|
@ -151,6 +111,7 @@ const unsigned int s_cltableSizeInBlocks = 8;
|
||||||
|
|
||||||
extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
|
extern size_t GCM_AuthenticateBlocks_PMULL(const byte *data, size_t len, const byte *mtable, byte *hbuffer);
|
||||||
extern uint64x2_t GCM_Multiply_PMULL(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r);
|
extern uint64x2_t GCM_Multiply_PMULL(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r);
|
||||||
|
extern void GCM_SetKeyWithoutResync_PMULL(byte *mulTable, byte *hashKey, unsigned int tableSize);
|
||||||
|
|
||||||
CRYPTOPP_ALIGN_DATA(16)
|
CRYPTOPP_ALIGN_DATA(16)
|
||||||
const word64 s_clmulConstants64[] = {
|
const word64 s_clmulConstants64[] = {
|
||||||
|
|
@ -217,27 +178,7 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
||||||
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
|
#elif CRYPTOPP_ARM_PMULL_AVAILABLE
|
||||||
if (HasPMULL())
|
if (HasPMULL())
|
||||||
{
|
{
|
||||||
const uint64x2_t r = s_clmulConstants[0];
|
GCM_SetKeyWithoutResync_PMULL(mulTable, hashKey, tableSize);
|
||||||
const uint64x2_t t = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(hashKey)));
|
|
||||||
const uint64x2_t h0 = vextq_u64(t, t, 1);
|
|
||||||
|
|
||||||
uint64x2_t h = h0;
|
|
||||||
for (i=0; i<tableSize-32; i+=32)
|
|
||||||
{
|
|
||||||
const uint64x2_t h1 = GCM_Multiply_PMULL(h, h0, r);
|
|
||||||
vst1_u64((uint64_t *)(mulTable+i), vget_low_u64(h));
|
|
||||||
vst1q_u64((uint64_t *)(mulTable+i+16), h1);
|
|
||||||
vst1q_u64((uint64_t *)(mulTable+i+8), h);
|
|
||||||
vst1_u64((uint64_t *)(mulTable+i+8), vget_low_u64(h1));
|
|
||||||
h = GCM_Multiply_PMULL(h1, h0, r);
|
|
||||||
}
|
|
||||||
|
|
||||||
const uint64x2_t h1 = GCM_Multiply_PMULL(h, h0, r);
|
|
||||||
vst1_u64((uint64_t *)(mulTable+i), vget_low_u64(h));
|
|
||||||
vst1q_u64((uint64_t *)(mulTable+i+16), h1);
|
|
||||||
vst1q_u64((uint64_t *)(mulTable+i+8), h);
|
|
||||||
vst1_u64((uint64_t *)(mulTable+i+8), vget_low_u64(h1));
|
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue