Enable x25519 SSE2 for MS compilers (GH #761)

pull/765/head
Jeffrey Walton 2018-12-14 13:43:01 -05:00
parent 17d7a70501
commit 1b49bdc420
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 18 additions and 10 deletions

12
donna.h
View File

@@ -54,15 +54,23 @@ int curve25519(byte sharedKey[32], const byte secretKey[32], const byte othersKe
# define CRYPTOPP_CURVE25519_32BIT 1
#endif
// Benchmarking on a modern 64-bit Core i5-6400 shows SSE2 on Linux
// Benchmarking on a modern 64-bit Core i5-6400 @2.7 GHz shows SSE2 on Linux
// is not profitable. Here are the numbers in milliseconds/operation:
//
// * Langley, C++, 0.050
// * Moon, C++: 0.040
// * Moon, SSE2: 0.061
// * Moon, native: 0.045
//
// However, a modern 64-bit Core i5-3200 @2.3 GHz shows SSE2 is profitable
// for MS compilers. Here are the numbers in milliseconds/operation:
//
// * x86, no SSE2, 0.294
// * x86, SSE2, 0.097
// * x64, no SSE2, 0.081
// * x64, SSE2, 0.071
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) && 0
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) && defined(_MSC_VER)
# define CRYPTOPP_CURVE25519_SSE2 1
#endif

View File

@@ -214,7 +214,7 @@ curve25519_contract(byte out[32], const bignum25519 in) {
*/
inline void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {
const word32 swap = (word32)(-(int32_t)iswap);
const word32 swap = (word32)(-(sword32)iswap);
xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
xmmi mask = _mm_cvtsi32_si128(swap);
mask = _mm_shuffle_epi32(mask, 0);
@@ -1114,16 +1114,16 @@ int curve25519_SSE2(byte sharedKey[32], const byte secretKey[32], const byte oth
packed32bignum25519 qx, qz, pqz, pqx;
packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
bignum25519mulprecomp preq;
size_t bit, lastbit;
size_t i=0, bit=0, lastbit=0;
curve25519_expand(nqpqx, othersKey);
curve25519_mul_precompute(&preq, nqpqx);
/* do bits 254..3 */
for (int i = 254, lastbit = 0; i >= 3; i--) {
for (i = 254, lastbit=0; i >= 3; i--) {
bit = (e[i/8] >> (i & 7)) & 1;
curve25519_swap_conditional(nqx, nqpqx, bit ^ lastbit);
curve25519_swap_conditional(nqz, nqpqz, bit ^ lastbit);
curve25519_swap_conditional(nqx, nqpqx, (word32)(bit ^ lastbit));
curve25519_swap_conditional(nqz, nqpqz, (word32)(bit ^ lastbit));
lastbit = bit;
curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */
@@ -1149,11 +1149,11 @@ int curve25519_SSE2(byte sharedKey[32], const byte secretKey[32], const byte oth
/* it's possible to get rid of this swap with the swap in the above loop
at the bottom instead of the top, but compilers seem to optimize better this way */
curve25519_swap_conditional(nqx, nqpqx, bit);
curve25519_swap_conditional(nqz, nqpqz, bit);
curve25519_swap_conditional(nqx, nqpqx, (word32)bit);
curve25519_swap_conditional(nqz, nqpqz, (word32)bit);
/* do bits 2..0 */
for (size_t i = 0; i < 3; i++) {
for (i = 0; i < 3; i++) {
curve25519_compute_nq(nq, nqx, nqz);
curve25519_square_packed64(sq, nq); /* sq = nq^2 */
curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */