Refactor LoadT() and StoreT(). Add a separate ReverseT() for little-endian machines.

The refactoring has no effect on little-endian machines. However, on the big-endian machine GCC119, compiled with GCC 7.1, performance improved by about 2.5x for ECB and CTR modes:

BEFORE:

<TR><TH>AES/CTR (128-bit key)<TD>2723<TD>1.4<TD>0.163<TD>670
<TR><TH>AES/CTR (192-bit key)<TD>2560<TD>1.5<TD>0.175<TD>719
<TR><TH>AES/CTR (256-bit key)<TD>2728<TD>1.4<TD>0.183<TD>749
<TR><TH>AES/CBC (128-bit key)<TD>1204<TD>3.2<TD>0.135<TD>554
<TR><TH>AES/CBC (192-bit key)<TD>1066<TD>3.7<TD>0.148<TD>605
<TR><TH>AES/CBC (256-bit key)<TD>948<TD>4.1<TD>0.155<TD>635
<TR><TH>AES/OFB (128-bit key)<TD>1019<TD>3.8<TD>0.158<TD>648
<TR><TH>AES/CFB (128-bit key)<TD>949<TD>4.1<TD>0.192<TD>787
<TR><TH>AES/ECB (128-bit key)<TD>3564<TD>1.1<TD>0.082<TD>337

AFTER:

<TR><TH>AES/CTR (128-bit key)<TD>6484<TD>0.6<TD>0.163<TD>677
<TR><TH>AES/CTR (192-bit key)<TD>5641<TD>0.7<TD>0.176<TD>728
<TR><TH>AES/CTR (256-bit key)<TD>5005<TD>0.8<TD>0.183<TD>761
<TR><TH>AES/CBC (128-bit key)<TD>1223<TD>3.2<TD>0.135<TD>559
<TR><TH>AES/CBC (192-bit key)<TD>1080<TD>3.7<TD>0.147<TD>611
<TR><TH>AES/CBC (256-bit key)<TD>966<TD>4.1<TD>0.155<TD>642
<TR><TH>AES/OFB (128-bit key)<TD>1057<TD>3.7<TD>0.158<TD>656
<TR><TH>AES/CFB (128-bit key)<TD>1217<TD>3.3<TD>0.186<TD>774
<TR><TH>AES/ECB (128-bit key)<TD>7289<TD>0.5<TD>0.082<TD>342
pull/484/merge
Jeffrey Walton 2017-09-18 18:15:25 -04:00
parent 1661ff127a
commit 2c18fe8af8
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 40 additions and 41 deletions

View File

@ -771,22 +771,7 @@ void Rijndael_UncheckedSetKeyRev_AESNI(word32 *key, unsigned int rounds)
typedef __vector unsigned char uint8x16_p8; typedef __vector unsigned char uint8x16_p8;
typedef __vector unsigned long long uint64x2_p8; typedef __vector unsigned long long uint64x2_p8;
/* Reverses a 16-byte array as needed */ void ByteReverseArray(byte src[16])
void ByteReverseArrayLE(byte dest[16], const byte src[16])
{
#if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
vec_st(vec_reve(vec_ld(0, src)), 0, dest);
#elif defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
vec_vsx_st(vec_perm(vec_vsx_ld(0, src), zero, mask), 0, dest);
#else
if (src != dest)
std::memcpy(dest, src, 16);
#endif
}
void ByteReverseArrayLE(byte src[16])
{ {
#if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN) #if defined(CRYPTOPP_XLC_VERSION) && defined(IS_LITTLE_ENDIAN)
vec_st(vec_reve(vec_ld(0, src)), 0, src); vec_st(vec_reve(vec_ld(0, src)), 0, src);
@ -797,78 +782,92 @@ void ByteReverseArrayLE(byte src[16])
#endif #endif
} }
uint8x16_p8 Load8x16(const uint8_t src[16]) static inline uint8x16_p8 Reverse8x16(const uint8x16_p8& src)
{
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
return vec_perm(src, zero, mask);
}
static inline uint64x2_p8 Reverse64x2(const uint64x2_p8& src)
{
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
const uint8x16_p8 zero = {0};
return (uint64x2_p8)vec_perm((uint8x16_p8)src, zero, mask);
}
static inline uint8x16_p8 Load8x16(const uint8_t src[16])
{ {
#if defined(CRYPTOPP_XLC_VERSION) #if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
return vec_xl_be(0, (uint8_t*)src); return vec_xl_be(0, (uint8_t*)src);
#else #else
return (uint8x16_p8)vec_vsx_ld(0, src); # if defined(IS_LITTLE_ENDIAN)
return Reverse8x16(vec_vsx_ld(0, src));
# else
return vec_vsx_ld(0, src);
# endif
#endif #endif
} }
uint8x16_p8 Load8x16(int off, const uint8_t src[16]) static inline uint8x16_p8 Load8x16(int off, const uint8_t src[16])
{ {
#if defined(CRYPTOPP_XLC_VERSION) #if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
return vec_xl_be(off, (uint8_t*)src); return vec_xl_be(off, (uint8_t*)src);
#else #else
return (uint8x16_p8)vec_vsx_ld(off, src); # if defined(IS_LITTLE_ENDIAN)
return Reverse8x16(vec_vsx_ld(off, src));
# else
return vec_vsx_ld(off, src);
# endif
#endif #endif
} }
void Store8x16(const uint8x16_p8 src, uint8_t dest[16]) static inline void Store8x16(const uint8x16_p8 src, uint8_t dest[16])
{ {
#if defined(CRYPTOPP_XLC_VERSION) #if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
vec_xst_be(src, 0, (uint8_t*)dest); vec_xst_be(src, 0, (uint8_t*)dest);
#else #else
# if defined(IS_LITTLE_ENDIAN)
vec_vsx_st(Reverse8x16(src), 0, dest);
# else
vec_vsx_st(src, 0, dest); vec_vsx_st(src, 0, dest);
# endif
#endif #endif
} }
uint64x2_p8 Load64x2(const uint8_t src[16]) static inline uint64x2_p8 Load64x2(const uint8_t src[16])
{ {
#if defined(CRYPTOPP_XLC_VERSION) #if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
return (uint64x2_p8)vec_xl_be(0, (uint8_t*)src); return (uint64x2_p8)vec_xl_be(0, (uint8_t*)src);
#else #else
# if defined(IS_LITTLE_ENDIAN) # if defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; return Reverse64x2((uint64x2_p8)vec_vsx_ld(0, src));
const uint8x16_p8 zero = {0};
return (uint64x2_p8)vec_perm(vec_vsx_ld(0, src), zero, mask);
# else # else
return (uint64x2_p8)vec_vsx_ld(0, src); return (uint64x2_p8)vec_vsx_ld(0, src);
# endif # endif
#endif #endif
} }
uint64x2_p8 Load64x2(int off, const uint8_t src[16]) static inline uint64x2_p8 Load64x2(int off, const uint8_t src[16])
{ {
#if defined(CRYPTOPP_XLC_VERSION) #if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
return (uint64x2_p8)vec_xl_be(off, (uint8_t*)src); return (uint64x2_p8)vec_xl_be(off, (uint8_t*)src);
#else #else
# if defined(IS_LITTLE_ENDIAN) # if defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; return Reverse64x2((uint64x2_p8)vec_vsx_ld(off, src));
const uint8x16_p8 zero = {0};
return (uint64x2_p8)vec_perm(vec_vsx_ld(off, src), zero, mask);
# else # else
return (uint64x2_p8)vec_vsx_ld(off, src); return (uint64x2_p8)vec_vsx_ld(off, src);
# endif # endif
#endif #endif
} }
void Store64x2(const uint64x2_p8 src, uint8_t dest[16]) static inline void Store64x2(const uint64x2_p8 src, uint8_t dest[16])
{ {
#if defined(CRYPTOPP_XLC_VERSION) #if defined(CRYPTOPP_XLC_VERSION)
/* http://stackoverflow.com/q/46124383/608639 */
vec_xst_be((uint8x16_p8)src, 0, (uint8_t*)dest); vec_xst_be((uint8x16_p8)src, 0, (uint8_t*)dest);
#else #else
# if defined(IS_LITTLE_ENDIAN) # if defined(IS_LITTLE_ENDIAN)
const uint8x16_p8 mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; vec_vsx_st((uint8x16_p8)Reverse64x2(src), 0, dest);
const uint8x16_p8 zero = {0};
vec_vsx_st(vec_perm((uint8x16_p8)src, zero, mask), 0, dest);
# else # else
vec_vsx_st((uint8x16_p8)src, 0, dest); vec_vsx_st((uint8x16_p8)src, 0, dest);
# endif # endif

View File

@ -251,7 +251,7 @@ extern size_t Rijndael_Dec_AdvancedProcessBlocks_ARMV8(const word32 *subkeys, si
#endif #endif
#if (CRYPTOPP_POWER8_AES_AVAILABLE) #if (CRYPTOPP_POWER8_AES_AVAILABLE)
extern void ByteReverseArrayLE(byte src[16]); extern void ByteReverseArray(byte src[16]);
extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds, extern size_t Rijndael_Enc_AdvancedProcessBlocks_POWER8(const word32 *subkeys, size_t rounds,
const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags); const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags);
@ -329,7 +329,7 @@ void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keyLen, c
// reversed on little-endian systems to ensure it loads properly. // reversed on little-endian systems to ensure it loads properly.
byte * ptr = reinterpret_cast<byte*>(rk); byte * ptr = reinterpret_cast<byte*>(rk);
for (unsigned int i=0; i<=m_rounds; i++) for (unsigned int i=0; i<=m_rounds; i++)
ByteReverseArrayLE(ptr+i*16); ByteReverseArray(ptr+i*16);
#endif // IS_LITTLE_ENDIAN #endif // IS_LITTLE_ENDIAN
return; return;