Add PowerPC VectorLoadKeyUnaligned for AES-192

Make internal functions static. We get better optimizations despite using unnamed namespaces
Add PowerPC uint32x4 functions for handling 32-bit rcon and mask
pull/484/merge
Jeffrey Walton 2017-09-20 08:57:53 -04:00
parent c94d076aa1
commit c5a427d690
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
1 changed file with 91 additions and 42 deletions

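Among other changes, this commit adds VectorLoadKeyUnaligned, which reads 16 bytes from an address that may not be 16-byte aligned using the classic AltiVec two-load-and-permute idiom. A minimal standalone sketch of that idiom (illustrative only; the helper name LoadUnaligned16 is hypothetical, and it assumes <altivec.h> with an AltiVec-enabled compiler):

#include <altivec.h>
#include <stdint.h>

// Sketch: load 16 bytes from a possibly unaligned address. vec_lvsl
// derives a permute mask from the low-order bits of the address, the
// two vec_ld calls fetch the aligned 16-byte blocks straddling src,
// and vec_perm shifts the desired bytes into a single register.
static inline __vector unsigned char LoadUnaligned16(const uint8_t* src)
{
    const __vector unsigned char perm = vec_lvsl(0, src);
    const __vector unsigned char low  = vec_ld(0, src);
    const __vector unsigned char high = vec_ld(15, src);
    return vec_perm(low, high, perm);
}

The same perm/low/high sequence appears in the VectorLoadKeyUnaligned bodies in the diff below.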

@@ -11,8 +11,8 @@
// mbedTLS under a debugger helped us to determine problems
// with our subkey generation and scheduling.
//
-// AltiVec and Power8 code based on http://github.com/noloader/AES-Power8
-//
+// AltiVec and Power8 code based on http://github.com/noloader/AES-Intrinsics and
+// http://www.ibm.com/developerworks/library/se-power8-in-core-cryptography/
#include "pch.h"
#include "config.h"
@@ -161,7 +161,7 @@ const word32 s_one[] = {0, 0, 0, 1<<24}; // uint32x4_t
const word32 s_one[] = {0, 0, 0, 1}; // uint32x4_t
#endif
-inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
+static inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(subkeys);
const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -189,7 +189,7 @@ inline void ARMV8_Enc_Block(uint8x16_t &block, const word32 *subkeys, unsigned i
block = veorq_u8(block, vld1q_u8(keys+rounds*16));
}
-inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
+static inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
uint8x16_t &block3, uint8x16_t &block4, uint8x16_t &block5,
const word32 *subkeys, unsigned int rounds)
{
@@ -245,7 +245,7 @@ inline void ARMV8_Enc_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_
block5 = veorq_u8(block5, key);
}
-inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
+static inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(subkeys);
const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -273,7 +273,7 @@ inline void ARMV8_Dec_Block(uint8x16_t &block, const word32 *subkeys, unsigned i
block = veorq_u8(block, vld1q_u8(keys+rounds*16));
}
-inline void ARMV8_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
+static inline void ARMV8_Dec_6_Blocks(uint8x16_t &block0, uint8x16_t &block1, uint8x16_t &block2,
uint8x16_t &block3, uint8x16_t &block4, uint8x16_t &block5,
const word32 *subkeys, unsigned int rounds)
{
@@ -476,7 +476,7 @@ const word32 s_rconLE[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
};
-inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
+static inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
@@ -488,7 +488,7 @@ inline void AESNI_Enc_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsign
block = _mm_aesenclast_si128(block, subkeys[rounds]);
}
-inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
+static inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
@@ -511,7 +511,7 @@ inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2
block3 = _mm_aesenclast_si128(block3, rk);
}
-inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
+static inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
block = _mm_xor_si128(block, subkeys[0]);
for (unsigned int i=1; i<rounds-1; i+=2)
@@ -523,7 +523,7 @@ inline void AESNI_Dec_Block(__m128i &block, MAYBE_CONST __m128i *subkeys, unsign
block = _mm_aesdeclast_si128(block, subkeys[rounds]);
}
-inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
+static inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3,
MAYBE_CONST __m128i *subkeys, unsigned int rounds)
{
__m128i rk = subkeys[0];
@@ -547,7 +547,7 @@ inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2
}
template <typename F1, typename F4>
-inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
+static inline size_t Rijndael_AdvancedProcessBlocks_AESNI(F1 func1, F4 func4,
MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
@@ -788,6 +788,13 @@ static inline uint8x16_p8 Reverse8x16(const uint8x16_p8& src)
return vec_perm(src, zero, mask);
}
+static inline uint32x4_p8 Reverse8x16(const uint32x4_p8& src)
+{
+const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+const uint8x16_p8 zero = {0};
+return (uint32x4_p8)vec_perm((uint8x16_p8)src, zero, mask);
+}
static inline uint64x2_p8 Reverse64x2(const uint64x2_p8& src)
{
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
@@ -821,6 +828,19 @@ static inline uint8x16_p8 Load8x16(int off, const uint8_t src[16])
#endif
}
+static inline uint32x4_p8 Load32x4(const uint32_t src[4])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+return vec_xl_be(0, (uint32_t*)src);
+#else
+# if defined(IS_LITTLE_ENDIAN)
+return Reverse8x16(vec_vsx_ld(0, src));
+# else
+return vec_vsx_ld(0, src);
+# endif
+#endif
+}
static inline void Store8x16(const uint8x16_p8& src, uint8_t dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
@@ -884,7 +904,7 @@ static inline void Store64x2(const uint64x2_p8& src, uint8_t dest[16])
#endif
// Loads a mis-aligned byte array, performs an endian conversion.
-inline VectorType VectorLoad(const byte src[16])
+static inline VectorType VectorLoad(const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Load8x16(src);
@@ -894,7 +914,7 @@ inline VectorType VectorLoad(const byte src[16])
}
// Loads a mis-aligned byte array, performs an endian conversion.
-inline VectorType VectorLoad(int off, const byte src[16])
+static inline VectorType VectorLoad(int off, const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Load8x16(off, src);
@@ -903,13 +923,28 @@ inline VectorType VectorLoad(int off, const byte src[16])
#endif
}
+// Loads a mis-aligned byte array, performs an endian conversion.
+static inline VectorType VectorLoad(const word32 src[4])
+{
+#if defined(CRYPTOPP_XLC_VERSION)
+return (VectorType)Load32x4((uint32_t*)src);
+#elif defined(CRYPTOPP_GCC_VERSION)
+return (VectorType)Load32x4((uint32_t*)src);
+#endif
+}
// Loads an aligned byte array, does not perform an endian conversion.
// This function presumes the subkey table is in the correct endianness.
-inline VectorType VectorLoadKey(const byte src[16])
+static inline VectorType VectorLoadKey(const byte src[16])
{
-if (IsAlignedOn(src, 16))
+CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
return (VectorType)vec_ld(0, (uint8_t*)src);
}
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is in the correct endianness.
+static inline VectorType VectorLoadKeyUnaligned(const byte src[16])
+{
+const uint8x16_p8 perm = vec_lvsl(0, src);
+const uint8x16_p8 low = vec_ld(0, src);
+const uint8x16_p8 high = vec_ld(15, src);
@@ -918,11 +953,16 @@ inline VectorType VectorLoadKey(const byte src[16])
// Loads an aligned byte array, does not perform an endian conversion.
// This function presumes the subkey table is in the correct endianness.
-inline VectorType VectorLoadKey(const word32 src[4])
+static inline VectorType VectorLoadKey(const word32 src[4])
{
-if (IsAlignedOn(src, 16))
+CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
return (VectorType)vec_ld(0, (uint8_t*)src);
}
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is in the correct endianness.
+static inline VectorType VectorLoadKeyUnaligned(const word32 src[4])
+{
+const uint8x16_p8 perm = vec_lvsl(0, (uint8_t*)src);
+const uint8x16_p8 low = vec_ld(0, (uint8_t*)src);
+const uint8x16_p8 high = vec_ld(15, (uint8_t*)src);
@@ -931,11 +971,16 @@ inline VectorType VectorLoadKey(const word32 src[4])
// Loads an aligned byte array, does not perform an endian conversion.
// This function presumes the subkey table is in the correct endianness.
-inline VectorType VectorLoadKey(int off, const byte src[16])
+static inline VectorType VectorLoadKey(int off, const byte src[16])
{
-if (IsAlignedOn(src, 16))
+CRYPTOPP_ASSERT(IsAlignedOn(src, 16));
return (VectorType)vec_ld(off, (uint8_t*)src);
}
+// Loads a byte array, does not perform an endian conversion.
+// This function presumes the subkey table is in the correct endianness.
+static inline VectorType VectorLoadKeyUnaligned(int off, const byte src[16])
+{
+const uint8x16_p8 perm = vec_lvsl(off, src);
+const uint8x16_p8 low = vec_ld(off, src);
+const uint8x16_p8 high = vec_ld(off+15, src);
@@ -943,7 +988,7 @@ inline VectorType VectorLoadKey(int off, const byte src[16])
}
// Stores to a mis-aligned byte array, performs an endian conversion.
-inline void VectorStore(const VectorType& src, byte dest[16])
+static inline void VectorStore(const VectorType& src, byte dest[16])
{
#if defined(CRYPTOPP_XLC_VERSION)
return Store8x16(src, dest);
@@ -953,19 +998,19 @@ inline void VectorStore(const VectorType& src, byte dest[16])
}
template <class T1, class T2>
-inline T1 VectorXor(const T1& vec1, const T2& vec2)
+static inline T1 VectorXor(const T1& vec1, const T2& vec2)
{
return (T1)vec_xor(vec1, (T1)vec2);
}
template <class T1, class T2>
-inline T1 VectorAdd(const T1& vec1, const T2& vec2)
+static inline T1 VectorAdd(const T1& vec1, const T2& vec2)
{
return (T1)vec_add(vec1, (T1)vec2);
}
template <class T1, class T2>
-inline T1 VectorEncrypt(const T1& state, const T2& key)
+static inline T1 VectorEncrypt(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vcipher((VectorType)state, (VectorType)key);
@@ -977,7 +1022,7 @@ inline T1 VectorEncrypt(const T1& state, const T2& key)
}
template <class T1, class T2>
-inline T1 VectorEncryptLast(const T1& state, const T2& key)
+static inline T1 VectorEncryptLast(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vcipherlast((VectorType)state, (VectorType)key);
@@ -989,7 +1034,7 @@ inline T1 VectorEncryptLast(const T1& state, const T2& key)
}
template <class T1, class T2>
-inline T1 VectorDecrypt(const T1& state, const T2& key)
+static inline T1 VectorDecrypt(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vncipher((VectorType)state, (VectorType)key);
@@ -1001,7 +1046,7 @@ inline T1 VectorDecrypt(const T1& state, const T2& key)
}
template <class T1, class T2>
-inline T1 VectorDecryptLast(const T1& state, const T2& key)
+static inline T1 VectorDecryptLast(const T1& state, const T2& key)
{
#if defined(CRYPTOPP_XLC_VERSION)
return (T1)__vncipherlast((VectorType)state, (VectorType)key);
@@ -1015,18 +1060,22 @@ inline T1 VectorDecryptLast(const T1& state, const T2& key)
//////////////////////////////////////////////////////////////////
/* Round constants */
CRYPTOPP_ALIGN_DATA(16)
-const uint32_t s_rcon[3][4] = {
-0x01<<24,0x01<<24,0x01<<24,0x01<<24, /* 1 */
-0x1b<<24,0x1b<<24,0x1b<<24,0x1b<<24, /* 9 */
-0x36<<24,0x36<<24,0x36<<24,0x36<<24 /* 10 */
+static const uint32_t s_rcon[3][4] = {
+{0x01<<24,0x01<<24,0x01<<24,0x01<<24}, /* 1 */
+{0x1b<<24,0x1b<<24,0x1b<<24,0x1b<<24}, /* 9 */
+{0x36<<24,0x36<<24,0x36<<24,0x36<<24} /* 10 */
};
+static const uint32_t s_mask[4] = {
+0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c
+};
static inline uint8x16_p8
-Rijndael_Subkey_POWER8(uint8x16_p8 r1, uint8x16_p8 r4)
+Rijndael_Subkey_POWER8(uint8x16_p8 r1, const uint8x16_p8 r4)
{
-const uint8x16_p8 r5 = (uint8x16_p8)((uint32x4_p8){0x0d0e0f0c,0x0d0e0f0c,0x0d0e0f0c,0x0d0e0f0c});
+const uint8x16_p8 r5 = (uint8x16_p8)VectorLoad(s_mask);
const uint8x16_p8 r0 = {0};
uint8x16_p8 r3, r6;
r3 = vec_perm(r1, r1, r5); /* line 1 */
@@ -1051,12 +1100,12 @@ void Rijndael_UncheckedSetKey_POWER8(word32* rk, size_t keyLen, const word32* rc
const byte* Se, unsigned int rounds)
{
#if defined(IS_BIG_ENDIAN)
-// Testing shows this is about 150 to 300 cycles faster.
+// Testing shows this is about 125 to 275 cycles faster.
if (keyLen == 16)
{
uint8_t* skptr = (uint8_t*)rk;
uint8x16_p8 r1 = (uint8x16_p8)VectorLoadKey((uint8_t*)skptr);
-uint8x16_p8 r4 = (uint8x16_p8)VectorLoadKey(s_rcon[0]);
+uint8x16_p8 r4 = (uint8x16_p8)VectorLoad(s_rcon[0]);
for (unsigned int i=0; i<rounds-2; ++i)
{
@@ -1139,7 +1188,7 @@ void Rijndael_UncheckedSetKey_POWER8(word32* rk, size_t keyLen, const word32* rc
}
}
-inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -1157,7 +1206,7 @@ inline void POWER8_Enc_Block(VectorType &block, const word32 *subkeys, unsigned
block = VectorEncryptLast(block, VectorLoadKey(rounds*16, keys));
}
-inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
+static inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
VectorType &block2, VectorType &block3, VectorType &block4,
VectorType &block5, const word32 *subkeys, unsigned int rounds)
{
@@ -1192,7 +1241,7 @@ inline void POWER8_Enc_6_Blocks(VectorType &block0, VectorType &block1,
block5 = VectorEncryptLast(block5, k);
}
-inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
+static inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned int rounds)
{
CRYPTOPP_ASSERT(IsAlignedOn(subkeys, 16));
const byte *keys = reinterpret_cast<const byte*>(subkeys);
@@ -1210,7 +1259,7 @@ inline void POWER8_Dec_Block(VectorType &block, const word32 *subkeys, unsigned
block = VectorDecryptLast(block, VectorLoadKey(0, keys));
}
-inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
+static inline void POWER8_Dec_6_Blocks(VectorType &block0, VectorType &block1,
VectorType &block2, VectorType &block3, VectorType &block4,
VectorType &block5, const word32 *subkeys, unsigned int rounds)
{