Enable x25519 SSE2 for MS compilers (GH #761)
parent
17d7a70501
commit
1b49bdc420
12
donna.h
12
donna.h
|
|
@ -54,15 +54,23 @@ int curve25519(byte sharedKey[32], const byte secretKey[32], const byte othersKe
|
||||||
# define CRYPTOPP_CURVE25519_32BIT 1
|
# define CRYPTOPP_CURVE25519_32BIT 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Benchmarking on a modern 64-bit Core i5-6400 shows SSE2 on Linux
|
// Benchmarking on a modern 64-bit Core i5-6400 @2.7 GHz shows SSE2 on Linux
|
||||||
// is not profitable. Here are the numbers in milliseconds/operation:
|
// is not profitable. Here are the numbers in milliseconds/operation:
|
||||||
//
|
//
|
||||||
// * Langley, C++, 0.050
|
// * Langley, C++, 0.050
|
||||||
// * Moon, C++: 0.040
|
// * Moon, C++: 0.040
|
||||||
// * Moon, SSE2: 0.061
|
// * Moon, SSE2: 0.061
|
||||||
// * Moon, native: 0.045
|
// * Moon, native: 0.045
|
||||||
|
//
|
||||||
|
// However, a modern 64-bit Core i5-3200 @2.3 GHz shows SSE2 is profitable
|
||||||
|
// for MS compilers. Here are the numbers in milliseconds/operation:
|
||||||
|
//
|
||||||
|
// * x86, no SSE2, 0.294
|
||||||
|
// * x86, SSE2, 0.097
|
||||||
|
// * x64, no SSE2, 0.081
|
||||||
|
// * x64, SSE2, 0.071
|
||||||
|
|
||||||
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) && 0
|
#if (CRYPTOPP_SSE2_INTRIN_AVAILABLE) && defined(_MSC_VER)
|
||||||
# define CRYPTOPP_CURVE25519_SSE2 1
|
# define CRYPTOPP_CURVE25519_SSE2 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -214,7 +214,7 @@ curve25519_contract(byte out[32], const bignum25519 in) {
|
||||||
*/
|
*/
|
||||||
inline void
|
inline void
|
||||||
curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {
|
curve25519_swap_conditional(bignum25519 a, bignum25519 b, word32 iswap) {
|
||||||
const word32 swap = (word32)(-(int32_t)iswap);
|
const word32 swap = (word32)(-(sword32)iswap);
|
||||||
xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
|
xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
|
||||||
xmmi mask = _mm_cvtsi32_si128(swap);
|
xmmi mask = _mm_cvtsi32_si128(swap);
|
||||||
mask = _mm_shuffle_epi32(mask, 0);
|
mask = _mm_shuffle_epi32(mask, 0);
|
||||||
|
|
@ -1114,16 +1114,16 @@ int curve25519_SSE2(byte sharedKey[32], const byte secretKey[32], const byte oth
|
||||||
packed32bignum25519 qx, qz, pqz, pqx;
|
packed32bignum25519 qx, qz, pqz, pqx;
|
||||||
packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
|
packed64bignum25519 nq, sq, sqscalar, prime, primex, primez, nqpq;
|
||||||
bignum25519mulprecomp preq;
|
bignum25519mulprecomp preq;
|
||||||
size_t bit, lastbit;
|
size_t i=0, bit=0, lastbit=0;
|
||||||
|
|
||||||
curve25519_expand(nqpqx, othersKey);
|
curve25519_expand(nqpqx, othersKey);
|
||||||
curve25519_mul_precompute(&preq, nqpqx);
|
curve25519_mul_precompute(&preq, nqpqx);
|
||||||
|
|
||||||
/* do bits 254..3 */
|
/* do bits 254..3 */
|
||||||
for (int i = 254, lastbit = 0; i >= 3; i--) {
|
for (i = 254, lastbit=0; i >= 3; i--) {
|
||||||
bit = (e[i/8] >> (i & 7)) & 1;
|
bit = (e[i/8] >> (i & 7)) & 1;
|
||||||
curve25519_swap_conditional(nqx, nqpqx, bit ^ lastbit);
|
curve25519_swap_conditional(nqx, nqpqx, (word32)(bit ^ lastbit));
|
||||||
curve25519_swap_conditional(nqz, nqpqz, bit ^ lastbit);
|
curve25519_swap_conditional(nqz, nqpqz, (word32)(bit ^ lastbit));
|
||||||
lastbit = bit;
|
lastbit = bit;
|
||||||
|
|
||||||
curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */
|
curve25519_tangle32(qx, nqx, nqpqx); /* qx = [nqx,nqpqx] */
|
||||||
|
|
@ -1149,11 +1149,11 @@ int curve25519_SSE2(byte sharedKey[32], const byte secretKey[32], const byte oth
|
||||||
|
|
||||||
/* it's possible to get rid of this swap with the swap in the above loop
|
/* it's possible to get rid of this swap with the swap in the above loop
|
||||||
at the bottom instead of the top, but compilers seem to optimize better this way */
|
at the bottom instead of the top, but compilers seem to optimize better this way */
|
||||||
curve25519_swap_conditional(nqx, nqpqx, bit);
|
curve25519_swap_conditional(nqx, nqpqx, (word32)bit);
|
||||||
curve25519_swap_conditional(nqz, nqpqz, bit);
|
curve25519_swap_conditional(nqz, nqpqz, (word32)bit);
|
||||||
|
|
||||||
/* do bits 2..0 */
|
/* do bits 2..0 */
|
||||||
for (size_t i = 0; i < 3; i++) {
|
for (i = 0; i < 3; i++) {
|
||||||
curve25519_compute_nq(nq, nqx, nqz);
|
curve25519_compute_nq(nq, nqx, nqz);
|
||||||
curve25519_square_packed64(sq, nq); /* sq = nq^2 */
|
curve25519_square_packed64(sq, nq); /* sq = nq^2 */
|
||||||
curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
|
curve25519_121665_packed64(sqscalar, sq); /* sqscalar = sq * [121666,121665] */
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue