Refactor LoadT() and StoreT(). Add separate ReverseT() for little-endian machines
The refactoring has no effect on little-endian machines. However, on the big-endian GCC119 machine using GCC 7.1, performance improved by 2.5x for ECB and CTR modes:

BEFORE:

Algorithm              MiB/Second  Cycles/Byte  Key+IV Setup (usec)  Key+IV Setup (cycles)
AES/CTR (128-bit key)        2723          1.4                0.163                    670
AES/CTR (192-bit key)        2560          1.5                0.175                    719
AES/CTR (256-bit key)        2728          1.4                0.183                    749
AES/CBC (128-bit key)        1204          3.2                0.135                    554
AES/CBC (192-bit key)        1066          3.7                0.148                    605
AES/CBC (256-bit key)         948          4.1                0.155                    635
AES/OFB (128-bit key)        1019          3.8                0.158                    648
AES/CFB (128-bit key)         949          4.1                0.192                    787
AES/ECB (128-bit key)        3564          1.1                0.082                    337

AFTER:

Algorithm              MiB/Second  Cycles/Byte  Key+IV Setup (usec)  Key+IV Setup (cycles)
AES/CTR (128-bit key)        6484          0.6                0.163                    677
AES/CTR (192-bit key)        5641          0.7                0.176                    728
AES/CTR (256-bit key)        5005          0.8                0.183                    761
AES/CBC (128-bit key)        1223          3.2                0.135                    559
AES/CBC (192-bit key)        1080          3.7                0.147                    611
AES/CBC (256-bit key)         966          4.1                0.155                    642
AES/OFB (128-bit key)        1057          3.7                0.158                    656
AES/CFB (128-bit key)        1217          3.3                0.186                    774
AES/ECB (128-bit key)        7289          0.5                0.082                    342
Branch: pull/484/merge
Parent: 1661ff127a
Commit: 2c18fe8af8
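Reviewer note: the pattern at the heart of this commit is a sixteen-byte lane reversal done with a single constant-mask vec_perm, which is what the new ReverseT() helpers implement on little-endian targets. Below is a minimal standalone sketch of that pattern (mine, not the committed source); the harness and file layout are assumptions for illustration. Build on a POWER8 Linux box with something like: g++ -mcpu=power8 -maltivec -mvsx reverse_demo.cpp

// Standalone sketch of the ReverseT() pattern: reverse the byte lanes of a
// VSX register with a constant permute mask, then verify against a scalar
// reference.
#include <altivec.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

typedef __vector unsigned char uint8x16_p8;

// Same body as the Reverse8x16() this commit adds.
static inline uint8x16_p8 Reverse8x16(const uint8x16_p8 src)
{
    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    const uint8x16_p8 zero = {0};
    return vec_perm(src, zero, mask);
}

int main()
{
    uint8_t in[16], out[16], ref[16];
    for (int i = 0; i < 16; ++i) { in[i] = (uint8_t)i; ref[15 - i] = (uint8_t)i; }

    // Unaligned VSX load/store; the permute reverses the array's byte order.
    const uint8x16_p8 v = (uint8x16_p8)vec_vsx_ld(0, in);
    vec_vsx_st(Reverse8x16(v), 0, out);

    std::printf("reversed output %s the scalar reference\n",
                std::memcmp(out, ref, 16) == 0 ? "matches" : "DIFFERS from");
    return 0;
}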
@@ -771,22 +771,7 @@ void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds)
 typedef __vector unsigned char uint8x16_p8;
 typedef __vector unsigned long long uint64x2_p8;
 
-/* Reverses a 16-byte array as needed */
-void ByteReverseArrayLE(byte dest[16], const byte src[16])
-{
-#if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
-    vec_st(vec_reve(vec_ld(0, src)), 0, dest);
-#elif defined(IS_LITTLE_ENDIAN)
-    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-    const uint8x16_p8 zero = {0};
-    vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, dest);
-#else
-    if (src != dest)
-        std::memcpy(dest, src, 16);
-#endif
-}
-
-void ByteReverseArrayLE(byte src[16])
+void ByteReverseArray(byte src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
     vec_st(vec_reve(vec_ld(0, src)), 0, src);
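One subtlety in the XLC branch above: vec_ld and vec_st are the aligned AltiVec load/store, and they silently clear the low four bits of the effective address. The helper is safe presumably because the buffers it runs over are 16-byte aligned. A hypothetical guard (not in the library) that makes that assumption explicit:

#include <cassert>
#include <cstdint>

// Hypothetical helper: vec_ld/vec_st round the address down to a 16-byte
// boundary, so callers must pass 16-byte-aligned pointers.
static inline bool IsAligned16(const void* p)
{
    return (reinterpret_cast<uintptr_t>(p) & 15u) == 0;
}

// Usage sketch: assert(IsAligned16(src)); before
//   vec_st(vec_reve(vec_ld(0, src)), 0, src);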
@@ -797,78 +782,92 @@ void ByteReverseArrayLE(byte src[16])
 #endif
 }
 
-uint8x16_p8 Load8x16(const uint8_t src[16])
+static inline uint8x16_p8 Reverse8x16(const uint8x16_p8& src)
+{
+    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p8 zero = {0};
+    return vec_perm(src, zero, mask);
+}
+
+static inline uint64x2_p8 Reverse64x2(const uint64x2_p8& src)
+{
+    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
+    const uint8x16_p8 zero = {0};
+    return (uint64x2_p8)vec_perm((uint8x16_p8)src, zero, mask);
+}
+
+static inline uint8x16_p8 Load8x16(const uint8_t src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     /* http://stackoverflow.com/q/46124383/608639 */
     return vec_xl_be(0, (uint8_t*)src);
 #else
-    return (uint8x16_p8)vec_vsx_ld(0, src);
+# if defined(IS_LITTLE_ENDIAN)
+    return Reverse8x16(vec_vsx_ld(0, src));
+# else
+    return vec_vsx_ld(0, src);
+# endif
 #endif
 }
 
-uint8x16_p8 Load8x16(int off, const uint8_t src[16])
+static inline uint8x16_p8 Load8x16(int off, const uint8_t src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     /* http://stackoverflow.com/q/46124383/608639 */
     return vec_xl_be(off, (uint8_t*)src);
 #else
-    return (uint8x16_p8)vec_vsx_ld(off, src);
+# if defined(IS_LITTLE_ENDIAN)
+    return Reverse8x16(vec_vsx_ld(off, src));
+# else
+    return vec_vsx_ld(off, src);
+# endif
 #endif
 }
 
-void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
+static inline void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     /* http://stackoverflow.com/q/46124383/608639 */
     vec_xst_be(src, 0, (uint8_t*)dest);
 #else
-    vec_vsx_st(src, 0, dest);
+# if defined(IS_LITTLE_ENDIAN)
+    vec_vsx_st(Reverse8x16(src), 0, dest);
+# else
+    vec_vsx_st(src, 0, dest);
+# endif
 #endif
 }
 
-uint64x2_p8 Load64x2(const uint8_t src[16])
+static inline uint64x2_p8 Load64x2(const uint8_t src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     /* http://stackoverflow.com/q/46124383/608639 */
     return (uint64x2_p8)vec_xl_be(0, (uint8_t*)src);
 #else
 # if defined(IS_LITTLE_ENDIAN)
-    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-    const uint8x16_p8 zero = {0};
-    return (uint64x2_p8)vec_perm(vec_vsx_ld(0, src), zero, mask);
+    return Reverse64x2((uint64x2_p8)vec_vsx_ld(0, src));
 # else
     return (uint64x2_p8)vec_vsx_ld(0, src);
 # endif
 #endif
 }
 
-uint64x2_p8 Load64x2(int off, const uint8_t src[16])
+static inline uint64x2_p8 Load64x2(int off, const uint8_t src[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     /* http://stackoverflow.com/q/46124383/608639 */
     return (uint64x2_p8)vec_xl_be(off, (uint8_t*)src);
 #else
 # if defined(IS_LITTLE_ENDIAN)
-    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-    const uint8x16_p8 zero = {0};
-    return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), zero, mask);
+    return Reverse64x2((uint64x2_p8)vec_vsx_ld(off, src));
# else
     return (uint64x2_p8)vec_vsx_ld(off, src);
 # endif
 #endif
 }
 
-void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
+static inline void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
 {
 #if defined(CRYPTOPP_XLC_VERSION)
     /* http://stackoverflow.com/q/46124383/608639 */
     vec_xst_be((uint8x16_p8)src, 0, (uint8_t*)dest);
 #else
 # if defined(IS_LITTLE_ENDIAN)
-    const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
-    const uint8x16_p8 zero = {0};
-    vec_vsx_st(vec_perm((uint8x16_p8)src, zero, mask), 0, dest);
+    vec_vsx_st((uint8x16_p8)Reverse64x2(src), 0, dest);
 # else
     vec_vsx_st((uint8x16_p8)src, 0, dest);
 # endif
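For context on how these helpers are consumed: the POWER8 block routines feed Load64x2() results into GCC's vcipher crypto builtins, which need every operand in one consistent lane order regardless of host endianness, and that is the order the loads above establish. A hedged sketch of a single-block AES encryption under that assumption follows; Enc_Block and the round-key layout are illustrative, not the library's exact code, and it assumes the Load64x2/Store64x2 definitions and typedefs from the hunk above are present in the same translation unit (build with GCC and -mcpu=power8).

// Illustrative single-block AES encryption using POWER8 vcipher builtins.
// For AES-128, rounds is 10 and subkeys holds rounds+1 contiguous 16-byte
// round keys, already in the byte order the loads expect.
static void Enc_Block(const uint8_t* subkeys, unsigned int rounds,
                      const uint8_t inBlock[16], uint8_t outBlock[16])
{
    // Initial whitening, then rounds-1 vcipher rounds, then vcipherlast.
    uint64x2_p8 block = Load64x2(0, inBlock);
    block = vec_xor(block, Load64x2(0, subkeys));

    for (unsigned int i = 1; i < rounds; ++i)
        block = __builtin_crypto_vcipher(block, Load64x2(i * 16, subkeys));

    block = __builtin_crypto_vcipherlast(block, Load64x2(rounds * 16, subkeys));
    Store64x2(block, outBlock);
}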
@@ -251,7 +251,7 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
 #endif
 
 #if (CRYPTOPP_POWER8_AES_AVAILABLE)
-extern void ByteReverseArrayLE(byte src[16]);
+extern void ByteReverseArray(byte src[16]);
 
 extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,
     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@@ -329,7 +329,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
     // reversed on little-endian systems to ensure it loads properly.
     byte * ptr = reinterpret_cast<byte*>(rk);
     for (unsigned int i=0; i<=m_rounds; i++)
-        ByteReverseArrayLE(ptr+i*16);
+        ByteReverseArray(ptr+i*16);
 #endif  // IS_LITTLE_ENDIAN
 
     return;
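The loop above runs once at key setup: on little-endian hosts it pre-reverses each 16-byte round key so the vector loads in the block functions see the schedule in the expected order. A scalar picture of that pass (an illustration of the effect, not library code):

#include <algorithm>
#include <cstdint>

// Scalar equivalent of the key-schedule pass: rounds+1 round keys of
// 16 bytes each, each reversed in place, mirroring ByteReverseArray(ptr+i*16).
static void ReverseRoundKeys(uint8_t* schedule, unsigned int rounds)
{
    for (unsigned int i = 0; i <= rounds; ++i)
        std::reverse(schedule + i * 16, schedule + (i + 1) * 16);
}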