Fix unaligned pointer crash on Win32 due to _mm_load_si128
The SSSE3 intrinsics were performing aligned loads using _mm_load_si128 using user supplied pointers. The pointers are only a byte pointer, so its alignment can drop to 1 or 2. Switching to _mm_loadu_si128 will sidestep potential problems. The crash surfaced under Win32 testing. Switch to memcpy's when performing bulk assignment x[0]=y[0] ... x[3]=y[3]. I believe Yun used the pattern to promote vectorization. Some compilers appear to be braindead and issue integer move's one word at a time. Non-braindead compiler will still take the optimization when advantageous, and slower compilers will benefit from the bulk move. We also cherry picked vectorization opportunities, like in ARIA_GSRK_NEON. Remove keyBits variable. We now use UncheckedSetKey's keylen throughout. Also fix a typo in CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE. __SSSE3__ was listed twice.pull/402/head
parent
59767be52e
commit
35f95fb739
108
aria.cpp
108
aria.cpp
|
|
@ -14,6 +14,8 @@
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "cpu.h"
|
#include "cpu.h"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
// Enable SSE intrinsics for Visual Studio. It reduces key schedule setup by 150
|
// Enable SSE intrinsics for Visual Studio. It reduces key schedule setup by 150
|
||||||
// to 200 cycles. GCC does fine on its own, and it slows things down a small bit.
|
// to 200 cycles. GCC does fine on its own, and it slows things down a small bit.
|
||||||
#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE && _MSC_VER
|
#if CRYPTOPP_BOOL_SSSE3_INTRINSICS_AVAILABLE && _MSC_VER
|
||||||
|
|
@ -206,7 +208,7 @@ inline word32 LoadWord(const byte x[16], const unsigned int i) {
|
||||||
// Key XOR Layer
|
// Key XOR Layer
|
||||||
#define ARIA_KXL { \
|
#define ARIA_KXL { \
|
||||||
t[0]^=LoadWord<false>(rk,0); t[1]^=LoadWord<false>(rk,1); \
|
t[0]^=LoadWord<false>(rk,0); t[1]^=LoadWord<false>(rk,1); \
|
||||||
t[2]^=LoadWord<false>(rk,2); t[3]^=LoadWord<false>(rk,3); \
|
t[2]^=LoadWord<false>(rk,2); t[3]^=LoadWord<false>(rk,3); \
|
||||||
}
|
}
|
||||||
|
|
||||||
// S-Box Layer 1 + M
|
// S-Box Layer 1 + M
|
||||||
|
|
@ -290,43 +292,40 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||||
|
|
||||||
const byte *mk = key;
|
const byte *mk = key;
|
||||||
byte *rk = m_rk.data();
|
byte *rk = m_rk.data();
|
||||||
int keyBits, Q, q, R, r;
|
int Q, q, R, r;
|
||||||
|
|
||||||
switch (keylen)
|
switch (keylen)
|
||||||
{
|
{
|
||||||
case 16:
|
case 16:
|
||||||
R = r = m_rounds = 12;
|
R = r = m_rounds = 12;
|
||||||
keyBits = 128;
|
|
||||||
Q = q = 0;
|
Q = q = 0;
|
||||||
break;
|
break;
|
||||||
case 32:
|
case 32:
|
||||||
R = r = m_rounds = 16;
|
R = r = m_rounds = 16;
|
||||||
keyBits = 256;
|
|
||||||
Q = q = 2;
|
Q = q = 2;
|
||||||
break;
|
break;
|
||||||
case 24:
|
case 24:
|
||||||
R = r = m_rounds = 14;
|
R = r = m_rounds = 14;
|
||||||
keyBits = 192;
|
|
||||||
Q = q = 1;
|
Q = q = 1;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
Q = q = R = r = keyBits = m_rounds = 0;
|
Q = q = R = r = m_rounds = 0;
|
||||||
CRYPTOPP_ASSERT(0);
|
CRYPTOPP_ASSERT(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t is a 16 byte temp area.
|
// w0 has room for 32 bytes. w1-w3 each has room for 16 bytes. t and u are 16 byte temp areas.
|
||||||
word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20;
|
word32 *w0 = m_w.data(), *w1 = m_w.data()+8, *w2 = m_w.data()+12, *w3 = m_w.data()+16, *t = m_w.data()+20;
|
||||||
|
|
||||||
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||||
if (HasSSSE3())
|
if (HasSSSE3())
|
||||||
{
|
{
|
||||||
// 7 SSE instructions
|
// 7 SSE instructions. 'mk' may be unaligned.
|
||||||
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
|
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
|
||||||
const __m128i w = _mm_shuffle_epi8(_mm_load_si128((const __m128i*)mk), m);
|
const __m128i w = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(mk)), m);
|
||||||
_mm_store_si128((__m128i*)w0, w);
|
_mm_store_si128((__m128i*)w0, w);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)t, _mm_xor_si128(w,
|
_mm_store_si128((__m128i*)t, _mm_xor_si128(w,
|
||||||
_mm_load_si128((const __m128i*)KRK[q])));
|
_mm_load_si128((const __m128i*)(KRK[q]))));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||||
|
|
@ -342,15 +341,15 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||||
// 24 integer instructions
|
// 24 integer instructions
|
||||||
ARIA_FO;
|
ARIA_FO;
|
||||||
|
|
||||||
if (keyBits == 256)
|
if (keylen == 32)
|
||||||
{
|
{
|
||||||
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||||
if (HasSSSE3())
|
if (HasSSSE3())
|
||||||
{
|
{
|
||||||
// 3 SSE instructions
|
// 3 SSE instructions. 'mk' may be unaligned.
|
||||||
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
|
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
|
||||||
_mm_store_si128(reinterpret_cast<__m128i*>(w1),
|
_mm_store_si128(reinterpret_cast<__m128i*>(w1),
|
||||||
_mm_shuffle_epi8(_mm_load_si128((const __m128i*)(mk+16)), m));
|
_mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(mk+16)), m));
|
||||||
}
|
}
|
||||||
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||||
{
|
{
|
||||||
|
|
@ -361,7 +360,7 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||||
w1[3] = LoadWord<true>(mk,7);
|
w1[3] = LoadWord<true>(mk,7);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (keyBits == 192)
|
else if (keylen == 24)
|
||||||
{
|
{
|
||||||
w1[0] = LoadWord<true>(mk,4);
|
w1[0] = LoadWord<true>(mk,4);
|
||||||
w1[1] = LoadWord<true>(mk,5);
|
w1[1] = LoadWord<true>(mk,5);
|
||||||
|
|
@ -404,7 +403,8 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||||
{
|
{
|
||||||
// 23 integer instructions
|
// 23 integer instructions
|
||||||
w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
|
w1[0]^=t[0]; w1[1]^=t[1]; w1[2]^=t[2]; w1[3]^=t[3];
|
||||||
t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
|
// t[0]=w1[0]; t[1]=w1[1]; t[2]=w1[2]; t[3]=w1[3];
|
||||||
|
memcpy(t, w1, 16);
|
||||||
|
|
||||||
q = (q==2) ? 0 : (q+1);
|
q = (q==2) ? 0 : (q+1);
|
||||||
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
|
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
|
||||||
|
|
@ -435,7 +435,8 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||||
{
|
{
|
||||||
// 23 integer instructions
|
// 23 integer instructions
|
||||||
t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
|
t[0]^=w0[0]; t[1]^=w0[1]; t[2]^=w0[2]; t[3]^=w0[3];
|
||||||
w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
|
// w2[0]=t[0]; w2[1]=t[1]; w2[2]=t[2]; w2[3]=t[3];
|
||||||
|
memcpy(w2, t, 16);
|
||||||
|
|
||||||
q = (q==2) ? 0 : (q+1);
|
q = (q==2) ? 0 : (q+1);
|
||||||
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
|
t[0]^=KRK[q][0]; t[1]^=KRK[q][1]; t[2]^=KRK[q][2]; t[3]^=KRK[q][3];
|
||||||
|
|
@ -506,12 +507,12 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||||
ARIA_GSRK<67>(w3, w0, rk + 176);
|
ARIA_GSRK<67>(w3, w0, rk + 176);
|
||||||
ARIA_GSRK<97>(w0, w1, rk + 192);
|
ARIA_GSRK<97>(w0, w1, rk + 192);
|
||||||
|
|
||||||
if (keyBits > 128)
|
if (keylen > 16)
|
||||||
{
|
{
|
||||||
ARIA_GSRK<97>(w1, w2, rk + 208);
|
ARIA_GSRK<97>(w1, w2, rk + 208);
|
||||||
ARIA_GSRK<97>(w2, w3, rk + 224);
|
ARIA_GSRK<97>(w2, w3, rk + 224);
|
||||||
|
|
||||||
if (keyBits > 192)
|
if (keylen > 24)
|
||||||
{
|
{
|
||||||
ARIA_GSRK< 97>(w3, w0, rk + 240);
|
ARIA_GSRK< 97>(w3, w0, rk + 240);
|
||||||
ARIA_GSRK<109>(w0, w1, rk + 256);
|
ARIA_GSRK<109>(w0, w1, rk + 256);
|
||||||
|
|
@ -522,33 +523,40 @@ void ARIA::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const Nam
|
||||||
// Decryption operation
|
// Decryption operation
|
||||||
if (!IsForwardTransformation())
|
if (!IsForwardTransformation())
|
||||||
{
|
{
|
||||||
word32 *a, *z, w;
|
word32 *a, *z, *s, w;
|
||||||
word32 s0, s1, s2, s3;
|
|
||||||
|
|
||||||
mk = key;
|
mk = key;
|
||||||
rk = m_rk.data();
|
rk = m_rk.data();
|
||||||
r = R; q = Q;
|
r = R; q = Q;
|
||||||
|
|
||||||
// 32 integer intructions
|
// 32 integer intructions. Memcpy is faster
|
||||||
a=reinterpret_cast<word32*>(rk); z=a+r*4;
|
a=reinterpret_cast<word32*>(rk); s=m_w.data()+24; z=a+r*4;
|
||||||
t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3];
|
// t[0]=a[0]; t[1]=a[1]; t[2]=a[2]; t[3]=a[3];
|
||||||
a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3];
|
// a[0]=z[0]; a[1]=z[1]; a[2]=z[2]; a[3]=z[3];
|
||||||
z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
|
// z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
|
||||||
|
memcpy(t, a, 16);
|
||||||
|
memcpy(a, z, 16);
|
||||||
|
memcpy(z, t, 16);
|
||||||
a+=4; z-=4;
|
a+=4; z-=4;
|
||||||
|
|
||||||
for (; a<z; a+=4, z-=4)
|
for (; a<z; a+=4, z-=4)
|
||||||
{
|
{
|
||||||
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
|
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
|
||||||
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
||||||
s0=t[0]; s1=t[1]; s2=t[2]; s3=t[3];
|
// s[0]=t[0]; s[1]=t[1]; s[2]=t[2]; s[3]=t[3];
|
||||||
|
memcpy(s, t, 16);
|
||||||
|
|
||||||
ARIA_M1(z[0],t[0]); ARIA_M1(z[1],t[1]); ARIA_M1(z[2],t[2]); ARIA_M1(z[3],t[3]);
|
ARIA_M1(z[0],t[0]); ARIA_M1(z[1],t[1]); ARIA_M1(z[2],t[2]); ARIA_M1(z[3],t[3]);
|
||||||
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
||||||
a[0]=t[0]; a[1]=t[1]; a[2]=t[2]; a[3]=t[3];
|
// a[0]=t[0]; a[1]=t[1]; a[2]=t[2]; a[3]=t[3];
|
||||||
z[0]=s0; z[1]=s1; z[2]=s2; z[3]=s3;
|
// z[0]=s[0]; z[1]=s[1]; z[2]=s[2]; z[3]=s[3];
|
||||||
|
memcpy(a, t, 16);
|
||||||
|
memcpy(z, s, 16);
|
||||||
}
|
}
|
||||||
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
|
ARIA_M1(a[0],t[0]); ARIA_M1(a[1],t[1]); ARIA_M1(a[2],t[2]); ARIA_M1(a[3],t[3]);
|
||||||
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
ARIA_MM(t[0],t[1],t[2],t[3]); ARIA_P(t[0],t[1],t[2],t[3]); ARIA_MM(t[0],t[1],t[2],t[3]);
|
||||||
z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
|
// z[0]=t[0]; z[1]=t[1]; z[2]=t[2]; z[3]=t[3];
|
||||||
|
memcpy(z, t, 16);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -557,12 +565,21 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b
|
||||||
const byte *rk = reinterpret_cast<const byte*>(m_rk.data());
|
const byte *rk = reinterpret_cast<const byte*>(m_rk.data());
|
||||||
word32 *t = const_cast<word32*>(m_w.data()+20);
|
word32 *t = const_cast<word32*>(m_w.data()+20);
|
||||||
|
|
||||||
// Visual Studio is generating bad code within the SSSE3 code block. It is
|
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||||
// providing a NULL pointer or a pointer set to a constant like 0x1000.
|
if (HasSSSE3())
|
||||||
// It looks like some leftover garbage in the XMM register rather than
|
{
|
||||||
// the pointer loaded into the integer register for the non-SSE code path.
|
// 3 SSE instructions. 'inBlock' may be unaligned.
|
||||||
t[0] = LoadWord<true>(inBlock,0); t[1] = LoadWord<true>(inBlock,1);
|
const __m128i m = _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3);
|
||||||
t[2] = LoadWord<true>(inBlock,2); t[3] = LoadWord<true>(inBlock,3);
|
const __m128i w = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)(inBlock)), m);
|
||||||
|
_mm_store_si128((__m128i*)t, w);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||||
|
{
|
||||||
|
// 13 integer instructions
|
||||||
|
t[0] = LoadWord<true>(inBlock,0); t[1] = LoadWord<true>(inBlock,1);
|
||||||
|
t[2] = LoadWord<true>(inBlock,2); t[3] = LoadWord<true>(inBlock,3);
|
||||||
|
}
|
||||||
|
|
||||||
if (m_rounds > 12) {
|
if (m_rounds > 12) {
|
||||||
ARIA_KXL; rk+= 16; ARIA_FO;
|
ARIA_KXL; rk+= 16; ARIA_FO;
|
||||||
|
|
@ -622,9 +639,24 @@ void ARIA::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, b
|
||||||
ARIA_WORD(outBlock,3)^=LoadWord<true>(rk,3);
|
ARIA_WORD(outBlock,3)^=LoadWord<true>(rk,3);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (xorBlock)
|
#if CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||||
for (unsigned int n=0; n<16; ++n)
|
if (xorBlock != NULLPTR && HasSSSE3())
|
||||||
outBlock[n] ^= xorBlock[n];
|
{
|
||||||
|
// 3 SSE instructions
|
||||||
|
_mm_storeu_si128((__m128i*)(outBlock),
|
||||||
|
_mm_xor_si128(
|
||||||
|
// 'outBlock' and 'xorBlock' may be unaligned.
|
||||||
|
_mm_loadu_si128((const __m128i*)(outBlock)),
|
||||||
|
_mm_loadu_si128((const __m128i*)(xorBlock))));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif // CRYPTOPP_ENABLE_ARIA_INTRINSICS
|
||||||
|
{
|
||||||
|
// 15 integer instructions
|
||||||
|
if (xorBlock)
|
||||||
|
for (unsigned int n=0; n<16; ++n)
|
||||||
|
outBlock[n] ^= xorBlock[n];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END
|
NAMESPACE_END
|
||||||
|
|
|
||||||
4
aria.h
4
aria.h
|
|
@ -48,9 +48,9 @@ public:
|
||||||
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
void ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Reference implementation allocates a table for 17 sub-keys
|
// Reference implementation allocates a table of 17 sub-keys.
|
||||||
FixedSizeAlignedSecBlock<byte, 16*17> m_rk; // round keys
|
FixedSizeAlignedSecBlock<byte, 16*17> m_rk; // round keys
|
||||||
FixedSizeAlignedSecBlock<word32, 4*6> m_w; // w0, w1, w2, w3 and t
|
FixedSizeAlignedSecBlock<word32, 4*7> m_w; // w0, w1, w2, w3, t and u
|
||||||
unsigned int m_rounds;
|
unsigned int m_rounds;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
2
config.h
2
config.h
|
|
@ -402,7 +402,7 @@ NAMESPACE_END
|
||||||
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
|
#define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSSE3__) && defined(__SSSE3__)))
|
#if !defined(CRYPTOPP_DISABLE_SSSE3) && (_MSC_VER >= 1500 || (defined(__SSE3__) && defined(__SSSE3__)))
|
||||||
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1
|
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 1
|
||||||
#else
|
#else
|
||||||
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
|
#define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue