Fix unaligned pointer crash on Win32 due to _mm_load_si128

The SSSE3 intrinsics were performing aligned loads with _mm_load_si128 on user-supplied pointers. The pointers are byte pointers, so their alignment can drop to 1 or 2. Switching to _mm_loadu_si128 sidesteps the potential problem. The crash surfaced under Win32 testing.
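
For context, the problem pattern is a 16-byte load through a user-supplied const byte*. The sketch below is illustrative only (the function name is made up, not library code):

    #include <tmmintrin.h>   // SSSE3; also pulls in the SSE2 load intrinsics

    // 'p' is a caller-supplied byte pointer and may be aligned to 1 or 2 bytes.
    static inline __m128i LoadBlock128(const unsigned char* p)
    {
        // _mm_load_si128 requires a 16-byte aligned address and can fault when
        // 'p' is not aligned. _mm_loadu_si128 tolerates any alignment, so it is
        // the safe choice for user-supplied key and block pointers.
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
    }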

Switch to memcpy when performing the bulk assignments x[0]=y[0] ... x[3]=y[3]. I believe Yun used the pattern to promote vectorization. Some compilers appear to be braindead and issue integer moves one word at a time. Non-braindead compilers will still take the optimization when advantageous, and slower compilers will benefit from the bulk move. We also cherry-picked vectorization opportunities, like in ARIA_GSRK_NEON.
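
As a sketch of the idea (the array names are placeholders), a fixed-size memcpy hands the compiler one 16-byte move it can lower or vectorize as it sees fit:

    #include <cstring>   // std::memcpy

    typedef unsigned int word32;

    static inline void CopyFourWords(word32* x, const word32* y)
    {
        // Element-wise form; a weak optimizer may emit four separate word moves:
        //   x[0]=y[0]; x[1]=y[1]; x[2]=y[2]; x[3]=y[3];

        // Bulk form; a fixed-size memcpy is routinely folded into one 16-byte move
        // (or a pair of 8-byte moves), and a vectorizing compiler can still use SSE/NEON.
        std::memcpy(x, y, 4*sizeof(word32));
    }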

Remove the keyBits variable. We now use UncheckedSetKey's keylen throughout.
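
The substitution is mechanical because keyBits was always 8*keylen; the key-length tests in the diff below map as in this sketch (round counts and Q values as set in the switch):

    // keyBits == 128  <=>  keylen == 16   (12 rounds, Q = q = 0)
    // keyBits == 192  <=>  keylen == 24   (14 rounds, Q = q = 1)
    // keyBits == 256  <=>  keylen == 32   (16 rounds, Q = q = 2)
    // keyBits  > 128  <=>  keylen  > 16
    // keyBits  > 192  <=>  keylen  > 24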

Also fix a typo in the SSSE3 guard in config.h: __SSSE3__ was listed twice where __SSE3__ was intended.
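
The guard's intent is to require both the SSE3 and SSSE3 compiler feature macros on non-MSVC toolchains; with the duplicate, the __SSE3__ test was effectively missing. Before and after, excerpted from config.h:

    // Before: __SSSE3__ is tested twice, __SSE3__ is never tested.
    #if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__)))

    // After: both feature macros are required.
    #if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__)))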
pull/402/head
Jeffrey Walton 2017-04-13 04:28:02 -04:00
parent 59767be52e
commit 35f95fb739
3 changed files with 73 additions and 41 deletions

aria.cpp

@@ -14,6 +14,8 @@
 #include "misc.h"
 #include "cpu.h"
+#include <iostream>
 
 // Enable SSE intrinsics for Visual Studio. It reduces key schedule setup by 150
 // to 200 cycles. GCC does fine on its own, and it slows things down a small bit.
 #if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE && _MSC_VER
@@ -290,43 +292,40 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
     const byte *mk = key;
     byte *rk = m_rk.data();
-    int keyBits, Q, q, R, r;
+    int Q, q, R, r;
 
     switch (keylen)
     {
     case 16:
         R = r = m_rounds = 12;
-        keyBits = 128;
         Q = q = 0;
         break;
     case 32:
         R = r = m_rounds = 16;
-        keyBits = 256;
         Q = q = 2;
         break;
     case 24:
         R = r = m_rounds = 14;
-        keyBits = 192;
         Q = q = 1;
         break;
     default:
-        Q = q = R = r = keyBits = m_rounds = 0;
+        Q = q = R = r = m_rounds = 0;
         CRYPTOPP_ASSERT(0);
     }
 
-    // w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t is a 16 byte temp area.
+    // w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t and u are 16 byte temp areas.
     word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20;
 
 #if CRYPTOPP_ENABLE_ARIA_INTRINSICS
     if (HasSSSE3())
     {
-        // 7 SSE instructions
+        // 7 SSE instructions. 'mk' may be unaligned.
         const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
-        const __m128i w = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)mk), m);
+        const __m128i w = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(mk)), m);
         _mm_store_si128((__m128i*)w0, w);
         _mm_store_si128((__m128i*)t, _mm_xor_si128(w,
-            _mm_load_si128((const __m128i*)KRK[q])));
+            _mm_load_si128((const __m128i*)(KRK[q]))));
     }
     else
 #endif  // CRYPTOPP_ENABLE_ARIA_INTRINSICS
@@ -342,15 +341,15 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
         // 24 integer instructions
         ARIA_FO;
 
-    if (keyBits == 256)
+    if (keylen == 32)
     {
 #if CRYPTOPP_ENABLE_ARIA_INTRINSICS
         if (HasSSSE3())
         {
-            // 3 SSE instructions
+            // 3 SSE instructions. 'mk' may be unaligned.
            const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
            _mm_store_si128(reinterpret_cast<__m128i*>(w1),
-                _mm_shuffle_epi8(_mm_load_si128((const __m128i*)(mk+16)), m));
+                _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(mk+16)), m));
         }
 #endif  // CRYPTOPP_ENABLE_ARIA_INTRINSICS
         {
@@ -361,7 +360,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
             w1[3] = LoadWord<true>(mk,7);
         }
     }
-    else if (keyBits == 192)
+    else if (keylen == 24)
     {
         w1[0] = LoadWord<true>(mk,4);
         w1[1] = LoadWord<true>(mk,5);
@@ -404,7 +403,8 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
     {
         // 23 integer instructions
         w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
-        t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
+        // t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
+        memcpy(t, w1, 16);
         q = (q==2) ? 0 : (q+1);
         t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
@@ -435,7 +435,8 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
     {
         // 23 integer instructions
         t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
-        w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
+        // w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
+        memcpy(w2, t, 16);
         q = (q==2) ? 0 : (q+1);
         t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
@@ -506,12 +507,12 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
     ARIA_GSRK<67>(w3, w0, rk + 176);
     ARIA_GSRK<97>(w0, w1, rk + 192);
 
-    if (keyBits > 128)
+    if (keylen > 16)
     {
         ARIA_GSRK<97>(w1, w2, rk + 208);
         ARIA_GSRK<97>(w2, w3, rk + 224);
 
-        if (keyBits > 192)
+        if (keylen > 24)
         {
             ARIA_GSRK< 97>(w3, w0, rk + 240);
             ARIA_GSRK<109>(w0, w1, rk + 256);
@@ -522,33 +523,40 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
     // Decryption operation
     if (!IsForwardTransformation())
     {
-        word32 *a, *z, w;
-        word32 s0, s1, s2, s3;
+        word32 *a, *z, *s, w;
 
         mk = key;
         rk = m_rk.data();
         r = R; q = Q;
 
-        // 32 integer intructions
-        a=reinterpret_cast<word32*>(rk); z=a+r*4;
-        t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3];
-        a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3];
-        z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
+        // 32 integer intructions. Memcpy is faster
+        a=reinterpret_cast<word32*>(rk); s=m_w.data()+24; z=a+r*4;
+        // t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3];
+        // a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3];
+        // z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
+        memcpy(t, a, 16);
+        memcpy(a, z, 16);
+        memcpy(z, t, 16);
 
         a+=4; z-=4;
         for (; a<z; a+=4, z-=4)
         {
             ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
             ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
-            s0=t[0]; s1=t[1]; s2=t[2]; s3=t[3];
+            // s[0]=t[0]; s[1]=t[1]; s[2]=t[2]; s[3]=t[3];
+            memcpy(s, t, 16);
             ARIA_M1(z[0],t[0]); ARIA_M1(z[1],t[1]); ARIA_M1(z[2],t[2]); ARIA_M1(z[3],t[3]);
             ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
-            a[0]=t[0]; a[1]=t[1]; a[2]=t[2]; a[3]=t[3];
-            z[0]=s0; z[1]=s1; z[2]=s2; z[3]=s3;
+            // a[0]=t[0]; a[1]=t[1]; a[2]=t[2]; a[3]=t[3];
+            // z[0]=s[0]; z[1]=s[1]; z[2]=s[2]; z[3]=s[3];
+            memcpy(a, t, 16);
+            memcpy(z, s, 16);
         }
 
         ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
         ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
-        z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
+        // z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
+        memcpy(z, t, 16);
     }
 }
@@ -557,12 +565,21 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b
     const byte *rk = reinterpret_cast<const byte*>(m_rk.data());
     word32 *t = const_cast<word32*>(m_w.data()+20);
 
-    // Visual Studio is generating bad code within the SSSE3 code block. It is
-    // providing a NULL pointer or a pointer set to a constant like 0x1000.
-    // It looks like some leftover garbage in the XMM register rather than
-    // the pointer loaded into the integer register for the non-SSE code path.
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+    if (HasSSSE3())
+    {
+        // 3 SSE instructions. 'inBlock' may be unaligned.
+        const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
+        const __m128i w = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(inBlock)), m);
+        _mm_store_si128((__m128i*)t, w);
+    }
+    else
+#endif  // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+    {
+        // 13 integer instructions
     t[0] = LoadWord<true>(inBlock,0); t[1] = LoadWord<true>(inBlock,1);
     t[2] = LoadWord<true>(inBlock,2); t[3] = LoadWord<true>(inBlock,3);
+    }
 
     if (m_rounds > 12) {
         ARIA_KXL; rk+= 16; ARIA_FO;
@@ -622,9 +639,24 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b
     ARIA_WORD(outBlock,3)^=LoadWord<true>(rk,3);
 #endif
 
+#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
+    if (xorBlock != NULLPTR && HasSSSE3())
+    {
+        // 3 SSE instructions
+        _mm_storeu_si128((__m128i*)(outBlock),
+            _mm_xor_si128(
+                // 'outBlock' and 'xorBlock' may be unaligned.
+                _mm_loadu_si128((const __m128i*)(outBlock)),
+                _mm_loadu_si128((const __m128i*)(xorBlock))));
+    }
+    else
+#endif  // CRYPTOPP_ENABLE_ARIA_INTRINSICS
+    {
+        // 15 integer instructions
     if (xorBlock)
         for (unsigned int n=0; n<16; ++n)
             outBlock[n] ^= xorBlock[n];
+    }
 }
 
 NAMESPACE_END

aria.h

@@ -48,9 +48,9 @@ public:
     void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
 
 private:
-    // Reference implementation allocates a table for 17 sub-keys
+    // Reference implementation allocates a table of 17 sub-keys.
     FixedSizeAlignedSecBlock<byte, 16*17> m_rk;  // round keys
-    FixedSizeAlignedSecBlock<word32, 4*6> m_w;   // w0, w1, w2, w3 and t
+    FixedSizeAlignedSecBlock<word32, 4*7> m_w;   // w0, w1, w2, w3, t and u
     unsigned int m_rounds;
 };

config.h

@@ -402,7 +402,7 @@ NAMESPACE_END
     #define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
 #endif
 
-#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__)))
+#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__)))
     #define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1
 #else
     #define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0