Switch to vec_vsx_ld, remove unaligned loads
Partially unroll loop Rijndael_UncheckedSetKey_POWER8 loop. It saves about another 60 cyclespull/484/merge
parent
d9592a303c
commit
441e944a66
|
|
@ -899,48 +899,25 @@ static inline VectorType VectorLoad(int off, const byte src[16])
|
||||||
return (VectorType)Load8x16(off, (uint8_t*)src);
|
return (VectorType)Load8x16(off, (uint8_t*)src);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loads an aligned byte array, does not perform an endian conversion.
|
// Loads a byte array, does not perform an endian conversion.
|
||||||
// This function presumes the subkey table is correct endianess.
|
// This function presumes the subkey table is correct endianess.
|
||||||
static inline VectorType VectorLoadKey(const byte src[16])
|
static inline VectorType VectorLoadKey(const byte src[16])
|
||||||
{
|
{
|
||||||
CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
|
return (VectorType)vec_vsx_ld(0, (uint8_t*)src);
|
||||||
return (VectorType)vec_ld(0, (uint8_t*)src);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loads a byte array, does not perform an endian conversion.
|
// Loads a byte array, does not perform an endian conversion.
|
||||||
// This function presumes the subkey table is correct endianess.
|
// This function presumes the subkey table is correct endianess.
|
||||||
static inline VectorType VectorLoadKeyUnaligned(const byte src[16])
|
|
||||||
{
|
|
||||||
const uint8x16_p8 perm = vec_lvsl(0, src);
|
|
||||||
const uint8x16_p8 low = vec_ld(0, src);
|
|
||||||
const uint8x16_p8 high = vec_ld(15, src);
|
|
||||||
return (VectorType)vec_perm(low, high, perm);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loads an aligned byte array, does not perform an endian conversion.
|
|
||||||
// This function presumes the subkey table is correct endianess.
|
|
||||||
static inline VectorType VectorLoadKey(const word32 src[4])
|
static inline VectorType VectorLoadKey(const word32 src[4])
|
||||||
{
|
{
|
||||||
CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
|
return (VectorType)vec_vsx_ld(0, (uint8_t*)src);
|
||||||
return (VectorType)vec_ld(0, (uint8_t*)src);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loads an aligned byte array, does not perform an endian conversion.
|
|
||||||
// This function presumes the subkey table is correct endianess.
|
|
||||||
static inline VectorType VectorLoadKey(int off, const byte src[16])
|
|
||||||
{
|
|
||||||
CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
|
|
||||||
return (VectorType)vec_ld(off, (uint8_t*)src);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loads a byte array, does not perform an endian conversion.
|
// Loads a byte array, does not perform an endian conversion.
|
||||||
// This function presumes the subkey table is correct endianess.
|
// This function presumes the subkey table is correct endianess.
|
||||||
static inline VectorType VectorLoadKeyUnaligned(int off, const byte src[16])
|
static inline VectorType VectorLoadKey(int off, const byte src[16])
|
||||||
{
|
{
|
||||||
const uint8x16_p8 perm = vec_lvsl(off, src);
|
return (VectorType)vec_vsx_ld(off, (uint8_t*)src);
|
||||||
const uint8x16_p8 low = vec_ld(off, src);
|
|
||||||
const uint8x16_p8 high = vec_ld(off+15, src);
|
|
||||||
return (VectorType)vec_perm(low, high, perm);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stores to a mis-aligned byte array, performs an endian conversion.
|
// Stores to a mis-aligned byte array, performs an endian conversion.
|
||||||
|
|
@ -1117,8 +1094,6 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32*
|
||||||
r4 = (uint8x16_p8)VectorLoadKey(s_rcon[2]);
|
r4 = (uint8x16_p8)VectorLoadKey(s_rcon[2]);
|
||||||
r1 = Rijndael_Subkey_POWER8(r1, r4, r5);
|
r1 = Rijndael_Subkey_POWER8(r1, r4, r5);
|
||||||
skptr += 16; VectorStore(r1, skptr);
|
skptr += 16; VectorStore(r1, skptr);
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
@ -1162,7 +1137,19 @@ void Rijndael_UncheckedSetKey_POWER8(const byte* userKey, size_t keyLen, word32*
|
||||||
rk = rk_saved;
|
rk = rk_saved;
|
||||||
const uint8x16_p8 mask = ((uint8x16_p8){12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3});
|
const uint8x16_p8 mask = ((uint8x16_p8){12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3});
|
||||||
const uint8x16_p8 zero = {0};
|
const uint8x16_p8 zero = {0};
|
||||||
for (unsigned int i=0; i<=rounds; ++i, rk+=4)
|
|
||||||
|
unsigned int i=0;
|
||||||
|
for (i=0; i<rounds; i+=2, rk+=8)
|
||||||
|
{
|
||||||
|
uint8x16_p8 d1 = vec_vsx_ld( 0, (uint8_t*)rk);
|
||||||
|
uint8x16_p8 d2 = vec_vsx_ld(16, (uint8_t*)rk);
|
||||||
|
d1 = vec_perm(d1, zero, mask);
|
||||||
|
d2 = vec_perm(d2, zero, mask);
|
||||||
|
vec_vsx_st(d1, 0, (uint8_t*)rk);
|
||||||
|
vec_vsx_st(d2, 16, (uint8_t*)rk);
|
||||||
|
}
|
||||||
|
|
||||||
|
for ( ; i<rounds+1; i++, rk+=4)
|
||||||
vec_vsx_st(vec_perm(vec_vsx_ld(0, (uint8_t*)rk), zero, mask), 0, (uint8_t*)rk);
|
vec_vsx_st(vec_perm(vec_vsx_ld(0, (uint8_t*)rk), zero, mask), 0, (uint8_t*)rk);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue