From 77923a291a5c993cca5c11c9b7dc00891d88bd8e Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Tue, 11 Dec 2018 16:17:56 -0500 Subject: [PATCH] Add Langley's curve25519 (GH #761, PR# 762) --- Filelist.txt | 6 + TestData/x25519.dat | 1 + bench3.cpp | 4 + cryptest.nmake | 88 ++-- cryptlib.vcxproj | 5 + cryptlib.vcxproj.filters | 21 + donna.h | 65 +++ donna_32.cpp | 899 +++++++++++++++++++++++++++++++++++++++ donna_64.cpp | 509 ++++++++++++++++++++++ test.cpp | 7 +- validat0.cpp | 80 +++- validat3.cpp | 2 + validat7.cpp | 10 + validate.h | 2 + xed25519.cpp | 143 +++++++ xed25519.h | 44 ++ 16 files changed, 1825 insertions(+), 61 deletions(-) create mode 100644 TestData/x25519.dat create mode 100644 donna.h create mode 100644 donna_32.cpp create mode 100644 donna_64.cpp create mode 100644 xed25519.cpp create mode 100644 xed25519.h diff --git a/Filelist.txt b/Filelist.txt index 5d786837..cae750c7 100644 --- a/Filelist.txt +++ b/Filelist.txt @@ -101,6 +101,9 @@ dlltest.vcxproj dlltest.vcxproj.filters dmac.h drbg.h +donna.h +donna_32.cpp +donna_64.cpp dsa.cpp dsa.h eax.cpp @@ -371,6 +374,8 @@ whrlpool.h words.h x64dll.asm x64masm.asm +xed25519.h +xed25519.cpp xtr.cpp xtr.h xtrcrypt.cpp @@ -455,6 +460,7 @@ TestData/skipjack.dat TestData/squareva.dat TestData/twofishv.dat TestData/usage.dat +TestData/x25519.dat TestData/xtrdh171.dat TestData/xtrdh342.dat TestVectors/Readme.txt diff --git a/TestData/x25519.dat b/TestData/x25519.dat new file mode 100644 index 00000000..52685977 --- /dev/null +++ b/TestData/x25519.dat @@ -0,0 +1 @@ +30440320984E4ED42CF68631C0FF27A4AB8D3EA7AFDCFE3C0087A847C4FAD054A9C7756C0420CF293063A1807A96AA89929564B1695AFBEC2546BCD048AACBD6C741CE3B5221 \ No newline at end of file diff --git a/bench3.cpp b/bench3.cpp index be19ab0b..470ec251 100644 --- a/bench3.cpp +++ b/bench3.cpp @@ -29,6 +29,7 @@ #include "ec2n.h" #include "asn.h" #include "dh.h" +#include "xed25519.h" #include "mqv.h" #include "hmqv.h" #include "fhmqv.h" @@ -390,6 +391,7 @@ void Benchmark3(double t, double hertz) ECGDSA::Verifier spub3(spriv3); ECDH::Domain ecdhc(ASN1::secp256k1()); ECMQV::Domain ecmqvc(ASN1::secp256k1()); + x25519 x25519ka(Test::GlobalRNG()); BenchMarkEncryption("ECIES over GF(p) 256", cpub, t); BenchMarkDecryption("ECIES over GF(p) 256", cpriv, cpub, t); @@ -399,6 +401,8 @@ void Benchmark3(double t, double hertz) BenchMarkVerification("ECDSA-RFC6979 over GF(p) 256", spriv2, spub2, t); BenchMarkSigning("ECGDSA over GF(p) 256", spriv3, t); BenchMarkVerification("ECGDSA over GF(p) 256", spriv3, spub3, t); + BenchMarkKeyGen("x25519", x25519ka, t); + BenchMarkAgreement("x25519", x25519ka, t); BenchMarkKeyGen("ECDHC over GF(p) 256", ecdhc, t); BenchMarkAgreement("ECDHC over GF(p) 256", ecdhc, t); BenchMarkKeyGen("ECMQVC over GF(p) 256", ecmqvc, t); diff --git a/cryptest.nmake b/cryptest.nmake index dc987a55..13591113 100644 --- a/cryptest.nmake +++ b/cryptest.nmake @@ -52,59 +52,63 @@ LIB_SRCS = \ cryptlib.cpp cpu.cpp integer.cpp 3way.cpp adler32.cpp algebra.cpp \ - algparam.cpp arc4.cpp aria_simd.cpp aria.cpp ariatab.cpp asn.cpp \ - authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2s_simd.cpp \ - blake2b_simd.cpp blake2.cpp blowfish.cpp blumshub.cpp camellia.cpp cast.cpp casts.cpp \ - cbcmac.cpp ccm.cpp chacha_avx.cpp chacha_simd.cpp chacha.cpp cham_simd.cpp cham.cpp \ - channels.cpp cmac.cpp crc_simd.cpp crc.cpp darn.cpp default.cpp des.cpp dessp.cpp \ - dh.cpp dh2.cpp dll.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp ecp.cpp elgamal.cpp \ - emsa2.cpp eprecomp.cpp esign.cpp files.cpp filters.cpp fips140.cpp \ - fipstest.cpp gcm_simd.cpp gcm.cpp gf256.cpp gf2_32.cpp gf2n.cpp \ - gfpcrypt.cpp gost.cpp gzip.cpp hc128.cpp hc256.cpp hex.cpp hight.cpp \ - hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp kalyna.cpp \ - kalynatab.cpp keccak.cpp keccakc.cpp lea_simd.cpp lea.cpp luc.cpp \ - mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp mqueue.cpp \ - mqv.cpp nbtheory.cpp neon_simd.cpp oaep.cpp osrng.cpp padlkrng.cpp \ - panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp ppc_simd.cpp pssr.cpp \ + algparam.cpp arc4.cpp aria.cpp aria_simd.cpp ariatab.cpp asn.cpp \ + authenc.cpp base32.cpp base64.cpp basecode.cpp bfinit.cpp blake2.cpp \ + blake2b_simd.cpp blake2s_simd.cpp blowfish.cpp blumshub.cpp camellia.cpp \ + cast.cpp casts.cpp cbcmac.cpp ccm.cpp chacha.cpp chacha_avx.cpp \ + chacha_simd.cpp cham.cpp cham_simd.cpp channels.cpp cmac.cpp crc.cpp \ + crc_simd.cpp darn.cpp default.cpp des.cpp dessp.cpp dh.cpp dh2.cpp \ + dll.cpp donna_32.cpp donna_64.cpp dsa.cpp eax.cpp ec2n.cpp eccrypto.cpp \ + ecp.cpp elgamal.cpp emsa2.cpp eprecomp.cpp esign.cpp files.cpp \ + filters.cpp fips140.cpp fipstest.cpp gcm.cpp gcm_simd.cpp gf256.cpp \ + gf2_32.cpp gf2n.cpp gfpcrypt.cpp gost.cpp gzip.cpp hc128.cpp hc256.cpp \ + hex.cpp hight.cpp hmac.cpp hrtimer.cpp ida.cpp idea.cpp iterhash.cpp \ + kalyna.cpp kalynatab.cpp keccak.cpp keccakc.cpp lea.cpp lea_simd.cpp \ + luc.cpp mars.cpp marss.cpp md2.cpp md4.cpp md5.cpp misc.cpp modes.cpp \ + mqueue.cpp mqv.cpp nbtheory.cpp neon_simd.cpp oaep.cpp osrng.cpp \ + padlkrng.cpp panama.cpp pkcspad.cpp poly1305.cpp polynomi.cpp \ + ppc_power7.cpp ppc_power8.cpp ppc_power9.cpp ppc_simd.cpp pssr.cpp \ pubkey.cpp queue.cpp rabbit.cpp rabin.cpp randpool.cpp rc2.cpp rc5.cpp \ - rc6.cpp rdrand.cpp rdtables.cpp rijndael_simd.cpp rijndael.cpp ripemd.cpp \ + rc6.cpp rdrand.cpp rdtables.cpp rijndael.cpp rijndael_simd.cpp ripemd.cpp \ rng.cpp rsa.cpp rw.cpp safer.cpp salsa.cpp scrypt.cpp seal.cpp seed.cpp \ - serpent.cpp sha_simd.cpp sha.cpp sha3.cpp shacal2_simd.cpp shacal2.cpp \ - shark.cpp sharkbox.cpp simeck_simd.cpp simeck.cpp simon.cpp \ - simon128_simd.cpp simon64_simd.cpp skipjack.cpp sm3.cpp sm4_simd.cpp \ - sm4.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp speck64_simd.cpp \ + serpent.cpp sha.cpp sha3.cpp sha_simd.cpp shacal2.cpp shacal2_simd.cpp \ + shark.cpp sharkbox.cpp simeck.cpp simeck_simd.cpp simon.cpp \ + simon128_simd.cpp simon64_simd.cpp skipjack.cpp sm3.cpp sm4.cpp \ + sm4_simd.cpp sosemanuk.cpp speck.cpp speck128_simd.cpp speck64_simd.cpp \ square.cpp squaretb.cpp sse_simd.cpp strciphr.cpp tea.cpp tftables.cpp \ threefish.cpp tiger.cpp tigertab.cpp ttmac.cpp tweetnacl.cpp twofish.cpp \ - vmac.cpp wake.cpp whrlpool.cpp xtr.cpp xtrcrypt.cpp zdeflate.cpp \ - zinflate.cpp zlib.cpp + vmac.cpp wake.cpp whrlpool.cpp xed25519.cpp xtr.cpp xtrcrypt.cpp \ + zdeflate.cpp zinflate.cpp zlib.cpp LIB_OBJS = \ cryptlib.obj cpu.obj integer.obj 3way.obj adler32.obj algebra.obj \ - algparam.obj arc4.obj aria_simd.obj aria.obj ariatab.obj asn.obj \ - authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2s_simd.obj \ - blake2b_simd.obj blake2.obj blowfish.obj blumshub.obj camellia.obj cast.obj casts.obj \ - cbcmac.obj ccm.obj chacha_avx.obj chacha_simd.obj chacha.obj cham_simd.obj cham.obj \ - channels.obj cmac.obj crc_simd.obj crc.obj darn.obj default.obj des.obj dessp.obj \ - dh.obj dh2.obj dll.obj dsa.obj eax.obj ec2n.obj eccrypto.obj ecp.obj elgamal.obj \ - emsa2.obj eprecomp.obj esign.obj files.obj filters.obj fips140.obj \ - fipstest.obj gcm_simd.obj gcm.obj gf256.obj gf2_32.obj gf2n.obj \ - gfpcrypt.obj gost.obj gzip.obj hc128.obj hc256.obj hex.obj hight.obj \ - hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj kalyna.obj \ - kalynatab.obj keccak.obj keccakc.obj lea_simd.obj lea.obj luc.obj \ - mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj mqueue.obj \ - mqv.obj nbtheory.obj neon_simd.obj oaep.obj osrng.obj padlkrng.obj \ - panama.obj pkcspad.obj poly1305.obj polynomi.obj ppc_simd.obj pssr.obj \ + algparam.obj arc4.obj aria.obj aria_simd.obj ariatab.obj asn.obj \ + authenc.obj base32.obj base64.obj basecode.obj bfinit.obj blake2.obj \ + blake2b_simd.obj blake2s_simd.obj blowfish.obj blumshub.obj camellia.obj \ + cast.obj casts.obj cbcmac.obj ccm.obj chacha.obj chacha_avx.obj \ + chacha_simd.obj cham.obj cham_simd.obj channels.obj cmac.obj crc.obj \ + crc_simd.obj darn.obj default.obj des.obj dessp.obj dh.obj dh2.obj \ + dll.obj donna_32.obj donna_64.obj dsa.obj eax.obj ec2n.obj eccrypto.obj \ + ecp.obj elgamal.obj emsa2.obj eprecomp.obj esign.obj files.obj \ + filters.obj fips140.obj fipstest.obj gcm.obj gcm_simd.obj gf256.obj \ + gf2_32.obj gf2n.obj gfpcrypt.obj gost.obj gzip.obj hc128.obj hc256.obj \ + hex.obj hight.obj hmac.obj hrtimer.obj ida.obj idea.obj iterhash.obj \ + kalyna.obj kalynatab.obj keccak.obj keccakc.obj lea.obj lea_simd.obj \ + luc.obj mars.obj marss.obj md2.obj md4.obj md5.obj misc.obj modes.obj \ + mqueue.obj mqv.obj nbtheory.obj neon_simd.obj oaep.obj osrng.obj \ + padlkrng.obj panama.obj pkcspad.obj poly1305.obj polynomi.obj \ + ppc_power7.obj ppc_power8.obj ppc_power9.obj ppc_simd.obj pssr.obj \ pubkey.obj queue.obj rabbit.obj rabin.obj randpool.obj rc2.obj rc5.obj \ - rc6.obj rdrand.obj rdtables.obj rijndael_simd.obj rijndael.obj ripemd.obj \ + rc6.obj rdrand.obj rdtables.obj rijndael.obj rijndael_simd.obj ripemd.obj \ rng.obj rsa.obj rw.obj safer.obj salsa.obj scrypt.obj seal.obj seed.obj \ - serpent.obj sha_simd.obj sha.obj sha3.obj shacal2_simd.obj shacal2.obj \ - shark.obj sharkbox.obj simeck_simd.obj simeck.obj simon.obj \ - simon128_simd.obj simon64_simd.obj skipjack.obj sm3.obj sm4_simd.obj \ - sm4.obj sosemanuk.obj speck.obj speck128_simd.obj speck64_simd.obj \ + serpent.obj sha.obj sha3.obj sha_simd.obj shacal2.obj shacal2_simd.obj \ + shark.obj sharkbox.obj simeck.obj simeck_simd.obj simon.obj \ + simon128_simd.obj simon64_simd.obj skipjack.obj sm3.obj sm4.obj \ + sm4_simd.obj sosemanuk.obj speck.obj speck128_simd.obj speck64_simd.obj \ square.obj squaretb.obj sse_simd.obj strciphr.obj tea.obj tftables.obj \ threefish.obj tiger.obj tigertab.obj ttmac.obj tweetnacl.obj twofish.obj \ - vmac.obj wake.obj whrlpool.obj xtr.obj xtrcrypt.obj zdeflate.obj \ - zinflate.obj zlib.obj + vmac.obj wake.obj whrlpool.obj xed25519.obj xtr.obj xtrcrypt.obj \ + zdeflate.obj zinflate.obj zlib.obj TEST_SRCS = \ test.cpp bench1.cpp bench2.cpp bench3.cpp datatest.cpp \ diff --git a/cryptlib.vcxproj b/cryptlib.vcxproj index 3de7acfd..31c4d11c 100644 --- a/cryptlib.vcxproj +++ b/cryptlib.vcxproj @@ -213,6 +213,8 @@ + + @@ -335,6 +337,7 @@ + @@ -413,6 +416,7 @@ + @@ -529,6 +533,7 @@ + diff --git a/cryptlib.vcxproj.filters b/cryptlib.vcxproj.filters index 104cf051..8d10ba5a 100644 --- a/cryptlib.vcxproj.filters +++ b/cryptlib.vcxproj.filters @@ -119,6 +119,9 @@ Source Files + + Source Files + Source Files @@ -137,6 +140,12 @@ Source Files + + Source Files + + + Source Files + Source Files @@ -482,6 +491,9 @@ Source Files + + Source Files + Source Files @@ -597,6 +609,9 @@ Header Files + + Header Files + Header Files @@ -618,6 +633,9 @@ Header Files + + Header Files + Header Files @@ -963,6 +981,9 @@ Header Files + + Header Files + Header Files diff --git a/donna.h b/donna.h new file mode 100644 index 00000000..926a4521 --- /dev/null +++ b/donna.h @@ -0,0 +1,65 @@ +// donna.h - written and placed in public domain by Jeffrey Walton +// This is a port of Adam Langley's curve25519-donna +// located at https://github.com/agl/curve25519-donna + +/* Copyright 2008, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * curve25519-donna: Curve25519 elliptic curve, public key function + * + * http://code.google.com/p/curve25519-donna/ + * + * Adam Langley + * + * Derived from public domain C code by Daniel J. Bernstein + * + * More information about curve25519 can be found here + * http://cr.yp.to/ecdh.html + * + * djb's sample implementation of curve25519 is written in a special assembly + * language called qhasm and uses the floating point registers. + * + * This is, almost, a clean room reimplementation from the curve25519 paper. It + * uses many of the tricks described therein. Only the crecip function is taken + * from the sample implementation. */ + +#ifndef CRYPTOPP_DONNA_H +#define CRYPTOPP_DONNA_H + +#include "cryptlib.h" + +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(Donna) + +int curve25519(byte pubkey[32], const byte seckey[32], const byte basepoint[32]); + +NAMESPACE_END // Donna +NAMESPACE_END // CryptoPP + +#endif // CRYPTOPP_DONNA_H diff --git a/donna_32.cpp b/donna_32.cpp new file mode 100644 index 00000000..08bff4dd --- /dev/null +++ b/donna_32.cpp @@ -0,0 +1,899 @@ +// donna_32.cpp - written and placed in public domain by Jeffrey Walton +// This is a port of Adam Langley's curve25519-donna +// located at https://github.com/agl/curve25519-donna + +/* Copyright 2008, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * curve25519-donna: Curve25519 elliptic curve, public key function + * + * http://code.google.com/p/curve25519-donna/ + * + * Adam Langley + * + * Derived from public domain C code by Daniel J. Bernstein + * + * More information about curve25519 can be found here + * http://cr.yp.to/ecdh.html + * + * djb's sample implementation of curve25519 is written in a special assembly + * language called qhasm and uses the floating point registers. + * + * This is, almost, a clean room reimplementation from the curve25519 paper. It + * uses many of the tricks described therein. Only the crecip function is taken + * from the sample implementation. */ + +#include "pch.h" + +#include "config.h" +#include "donna.h" +#include "stdcpp.h" + +// This macro is not in a header like config.h because +// we don't want it exposed to user code. We also need +// a standard header like or . +// Langley uses uint128_t in the 64-bit code paths so +// we further restrict 64-bit code. +#if (UINTPTR_MAX == 0xffffffff) || !defined(CRYPTOPP_WORD128_AVAILABLE) +# define CRYPTOPP_32BIT 1 +#else +# define CRYPTOPP_64BIT 1 +#endif + +// Squash MS LNK4221 and libtool warnings +extern const char DONNA32_FNAME[] = __FILE__; + +#if defined(CRYPTOPP_32BIT) + +ANONYMOUS_NAMESPACE_BEGIN + +using std::memcpy; +using CryptoPP::byte; +using CryptoPP::word32; +using CryptoPP::word64; +using CryptoPP::sword32; +using CryptoPP::sword64; + +typedef sword64 limb; + +/* Field element representation: + * + * Field elements are written as an array of signed, 64-bit limbs, least + * significant first. The value of the field element is: + * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... + * + * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */ + +/* Sum two numbers: output += in */ +void fsum(limb *output, const limb *in) +{ + for (unsigned int i = 0; i < 10; i += 2) { + output[0+i] = output[0+i] + in[0+i]; + output[1+i] = output[1+i] + in[1+i]; + } +} + +/* Find the difference of two numbers: output = in - output + * (note the order of the arguments!). */ +void fdifference(limb *output, const limb *in) +{ + for (unsigned int i = 0; i < 10; ++i) { + output[i] = in[i] - output[i]; + } +} + +/* Multiply a number by a scalar: output = in * scalar */ +void fscalar_product(limb *output, const limb *in, const limb scalar) +{ + for (unsigned int i = 0; i < 10; ++i) { + output[i] = in[i] * scalar; + } +} + +/* Multiply two numbers: output = in2 * in + * + * output must be distinct to both inputs. The inputs are reduced coefficient + * form, the output is not. + * + * output[x] <= 14 * the largest product of the input limbs. */ +void fproduct(limb *output, const limb *in2, const limb *in) +{ + output[0] = ((limb) ((sword32) in2[0])) * ((sword32) in[0]); + output[1] = ((limb) ((sword32) in2[0])) * ((sword32) in[1]) + + ((limb) ((sword32) in2[1])) * ((sword32) in[0]); + output[2] = 2 * ((limb) ((sword32) in2[1])) * ((sword32) in[1]) + + ((limb) ((sword32) in2[0])) * ((sword32) in[2]) + + ((limb) ((sword32) in2[2])) * ((sword32) in[0]); + output[3] = ((limb) ((sword32) in2[1])) * ((sword32) in[2]) + + ((limb) ((sword32) in2[2])) * ((sword32) in[1]) + + ((limb) ((sword32) in2[0])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[3])) * ((sword32) in[0]); + output[4] = ((limb) ((sword32) in2[2])) * ((sword32) in[2]) + + 2 * (((limb) ((sword32) in2[1])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[3])) * ((sword32) in[1])) + + ((limb) ((sword32) in2[0])) * ((sword32) in[4]) + + ((limb) ((sword32) in2[4])) * ((sword32) in[0]); + output[5] = ((limb) ((sword32) in2[2])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[3])) * ((sword32) in[2]) + + ((limb) ((sword32) in2[1])) * ((sword32) in[4]) + + ((limb) ((sword32) in2[4])) * ((sword32) in[1]) + + ((limb) ((sword32) in2[0])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[5])) * ((sword32) in[0]); + output[6] = 2 * (((limb) ((sword32) in2[3])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[1])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[5])) * ((sword32) in[1])) + + ((limb) ((sword32) in2[2])) * ((sword32) in[4]) + + ((limb) ((sword32) in2[4])) * ((sword32) in[2]) + + ((limb) ((sword32) in2[0])) * ((sword32) in[6]) + + ((limb) ((sword32) in2[6])) * ((sword32) in[0]); + output[7] = ((limb) ((sword32) in2[3])) * ((sword32) in[4]) + + ((limb) ((sword32) in2[4])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[2])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[5])) * ((sword32) in[2]) + + ((limb) ((sword32) in2[1])) * ((sword32) in[6]) + + ((limb) ((sword32) in2[6])) * ((sword32) in[1]) + + ((limb) ((sword32) in2[0])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[7])) * ((sword32) in[0]); + output[8] = ((limb) ((sword32) in2[4])) * ((sword32) in[4]) + + 2 * (((limb) ((sword32) in2[3])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[5])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[1])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[7])) * ((sword32) in[1])) + + ((limb) ((sword32) in2[2])) * ((sword32) in[6]) + + ((limb) ((sword32) in2[6])) * ((sword32) in[2]) + + ((limb) ((sword32) in2[0])) * ((sword32) in[8]) + + ((limb) ((sword32) in2[8])) * ((sword32) in[0]); + output[9] = ((limb) ((sword32) in2[4])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[5])) * ((sword32) in[4]) + + ((limb) ((sword32) in2[3])) * ((sword32) in[6]) + + ((limb) ((sword32) in2[6])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[2])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[7])) * ((sword32) in[2]) + + ((limb) ((sword32) in2[1])) * ((sword32) in[8]) + + ((limb) ((sword32) in2[8])) * ((sword32) in[1]) + + ((limb) ((sword32) in2[0])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[0]); + output[10] = 2 * (((limb) ((sword32) in2[5])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[3])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[7])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[1])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[1])) + + ((limb) ((sword32) in2[4])) * ((sword32) in[6]) + + ((limb) ((sword32) in2[6])) * ((sword32) in[4]) + + ((limb) ((sword32) in2[2])) * ((sword32) in[8]) + + ((limb) ((sword32) in2[8])) * ((sword32) in[2]); + output[11] = ((limb) ((sword32) in2[5])) * ((sword32) in[6]) + + ((limb) ((sword32) in2[6])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[4])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[7])) * ((sword32) in[4]) + + ((limb) ((sword32) in2[3])) * ((sword32) in[8]) + + ((limb) ((sword32) in2[8])) * ((sword32) in[3]) + + ((limb) ((sword32) in2[2])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[2]); + output[12] = ((limb) ((sword32) in2[6])) * ((sword32) in[6]) + + 2 * (((limb) ((sword32) in2[5])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[7])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[3])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[3])) + + ((limb) ((sword32) in2[4])) * ((sword32) in[8]) + + ((limb) ((sword32) in2[8])) * ((sword32) in[4]); + output[13] = ((limb) ((sword32) in2[6])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[7])) * ((sword32) in[6]) + + ((limb) ((sword32) in2[5])) * ((sword32) in[8]) + + ((limb) ((sword32) in2[8])) * ((sword32) in[5]) + + ((limb) ((sword32) in2[4])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[4]); + output[14] = 2 * (((limb) ((sword32) in2[7])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[5])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[5])) + + ((limb) ((sword32) in2[6])) * ((sword32) in[8]) + + ((limb) ((sword32) in2[8])) * ((sword32) in[6]); + output[15] = ((limb) ((sword32) in2[7])) * ((sword32) in[8]) + + ((limb) ((sword32) in2[8])) * ((sword32) in[7]) + + ((limb) ((sword32) in2[6])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[6]); + output[16] = ((limb) ((sword32) in2[8])) * ((sword32) in[8]) + + 2 * (((limb) ((sword32) in2[7])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[7])); + output[17] = ((limb) ((sword32) in2[8])) * ((sword32) in[9]) + + ((limb) ((sword32) in2[9])) * ((sword32) in[8]); + output[18] = 2 * ((limb) ((sword32) in2[9])) * ((sword32) in[9]); +} + +/* Reduce a long form to a short form by taking the input mod 2^255 - 19. + * + * On entry: |output[i]| < 14*2^54 + * On exit: |output[0..8]| < 280*2^54 */ +void freduce_degree(limb *output) +{ + /* Each of these shifts and adds ends up multiplying the value by 19. + * + * For output[0..8], the absolute entry value is < 14*2^54 and we add, at + * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */ + output[8] += output[18] << 4; + output[8] += output[18] << 1; + output[8] += output[18]; + output[7] += output[17] << 4; + output[7] += output[17] << 1; + output[7] += output[17]; + output[6] += output[16] << 4; + output[6] += output[16] << 1; + output[6] += output[16]; + output[5] += output[15] << 4; + output[5] += output[15] << 1; + output[5] += output[15]; + output[4] += output[14] << 4; + output[4] += output[14] << 1; + output[4] += output[14]; + output[3] += output[13] << 4; + output[3] += output[13] << 1; + output[3] += output[13]; + output[2] += output[12] << 4; + output[2] += output[12] << 1; + output[2] += output[12]; + output[1] += output[11] << 4; + output[1] += output[11] << 1; + output[1] += output[11]; + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; +} + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +/* return v / 2^26, using only shifts and adds. + * + * On entry: v can take any value. */ +inline limb div_by_2_26(const limb v) +{ + /* High word of v; no shift needed. */ + const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const int32_t sign = ((int32_t) highword) >> 31; + /* Set to 0x3ffffff if v was negative; else set to 0. */ + const int32_t roundoff = ((uint32_t) sign) >> 6; + /* Should return v / (1<<26) */ + return (v + roundoff) >> 26; +} + +/* return v / (2^25), using only shifts and adds. + * + * On entry: v can take any value. */ +inline limb div_by_2_25(const limb v) +{ + /* High word of v; no shift needed*/ + const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const int32_t sign = ((int32_t) highword) >> 31; + /* Set to 0x1ffffff if v was negative; else set to 0. */ + const int32_t roundoff = ((uint32_t) sign) >> 7; + /* Should return v / (1<<25) */ + return (v + roundoff) >> 25; +} + +/* Reduce all coefficients of the short form input so that |x| < 2^26. + * + * On entry: |output[i]| < 280*2^54 */ +void freduce_coefficients(limb *output) +{ + output[10] = 0; + + for (unsigned int i = 0; i < 10; i += 2) { + limb over = div_by_2_26(output[i]); + /* The entry condition (that |output[i]| < 280*2^54) means that over is, at + * most, 280*2^28 in the first iteration of this loop. This is added to the + * next limb and we can approximate the resulting bound of that limb by + * 281*2^54. */ + output[i] -= over << 26; + output[i+1] += over; + + /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < + * 281*2^29. When this is added to the next limb, the resulting bound can + * be approximated as 281*2^54. + * + * For subsequent iterations of the loop, 281*2^54 remains a conservative + * bound and no overflow occurs. */ + over = div_by_2_25(output[i+1]); + output[i+1] -= over << 25; + output[i+2] += over; + } + + /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; + + output[10] = 0; + + /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 + * So |over| will be no more than 2^16. */ + { + limb over = div_by_2_26(output[0]); + output[0] -= over << 26; + output[1] += over; + } + + /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The + * bound on |output[1]| is sufficient to meet our needs. */ +} + +/* A helpful wrapper around fproduct: output = in * in2. + * + * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. + * + * output must be distinct to both inputs. The output is reduced degree + * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */ +void fmul(limb *output, const limb *in, const limb *in2) +{ + limb t[19]; + fproduct(t, in, in2); + /* |t[i]| < 14*2^54 */ + freduce_degree(t); + freduce_coefficients(t); + /* |t[i]| < 2^26 */ + memcpy(output, t, sizeof(limb) * 10); +} + +/* Square a number: output = in**2 + * + * output must be distinct from the input. The inputs are reduced coefficient + * form, the output is not. + * + * output[x] <= 14 * the largest product of the input limbs. */ +void fsquare_inner(limb *output, const limb *in) +{ + output[0] = ((limb) ((sword32) in[0])) * ((sword32) in[0]); + output[1] = 2 * ((limb) ((sword32) in[0])) * ((sword32) in[1]); + output[2] = 2 * (((limb) ((sword32) in[1])) * ((sword32) in[1]) + + ((limb) ((sword32) in[0])) * ((sword32) in[2])); + output[3] = 2 * (((limb) ((sword32) in[1])) * ((sword32) in[2]) + + ((limb) ((sword32) in[0])) * ((sword32) in[3])); + output[4] = ((limb) ((sword32) in[2])) * ((sword32) in[2]) + + 4 * ((limb) ((sword32) in[1])) * ((sword32) in[3]) + + 2 * ((limb) ((sword32) in[0])) * ((sword32) in[4]); + output[5] = 2 * (((limb) ((sword32) in[2])) * ((sword32) in[3]) + + ((limb) ((sword32) in[1])) * ((sword32) in[4]) + + ((limb) ((sword32) in[0])) * ((sword32) in[5])); + output[6] = 2 * (((limb) ((sword32) in[3])) * ((sword32) in[3]) + + ((limb) ((sword32) in[2])) * ((sword32) in[4]) + + ((limb) ((sword32) in[0])) * ((sword32) in[6]) + + 2 * ((limb) ((sword32) in[1])) * ((sword32) in[5])); + output[7] = 2 * (((limb) ((sword32) in[3])) * ((sword32) in[4]) + + ((limb) ((sword32) in[2])) * ((sword32) in[5]) + + ((limb) ((sword32) in[1])) * ((sword32) in[6]) + + ((limb) ((sword32) in[0])) * ((sword32) in[7])); + output[8] = ((limb) ((sword32) in[4])) * ((sword32) in[4]) + + 2 * (((limb) ((sword32) in[2])) * ((sword32) in[6]) + + ((limb) ((sword32) in[0])) * ((sword32) in[8]) + + 2 * (((limb) ((sword32) in[1])) * ((sword32) in[7]) + + ((limb) ((sword32) in[3])) * ((sword32) in[5]))); + output[9] = 2 * (((limb) ((sword32) in[4])) * ((sword32) in[5]) + + ((limb) ((sword32) in[3])) * ((sword32) in[6]) + + ((limb) ((sword32) in[2])) * ((sword32) in[7]) + + ((limb) ((sword32) in[1])) * ((sword32) in[8]) + + ((limb) ((sword32) in[0])) * ((sword32) in[9])); + output[10] = 2 * (((limb) ((sword32) in[5])) * ((sword32) in[5]) + + ((limb) ((sword32) in[4])) * ((sword32) in[6]) + + ((limb) ((sword32) in[2])) * ((sword32) in[8]) + + 2 * (((limb) ((sword32) in[3])) * ((sword32) in[7]) + + ((limb) ((sword32) in[1])) * ((sword32) in[9]))); + output[11] = 2 * (((limb) ((sword32) in[5])) * ((sword32) in[6]) + + ((limb) ((sword32) in[4])) * ((sword32) in[7]) + + ((limb) ((sword32) in[3])) * ((sword32) in[8]) + + ((limb) ((sword32) in[2])) * ((sword32) in[9])); + output[12] = ((limb) ((sword32) in[6])) * ((sword32) in[6]) + + 2 * (((limb) ((sword32) in[4])) * ((sword32) in[8]) + + 2 * (((limb) ((sword32) in[5])) * ((sword32) in[7]) + + ((limb) ((sword32) in[3])) * ((sword32) in[9]))); + output[13] = 2 * (((limb) ((sword32) in[6])) * ((sword32) in[7]) + + ((limb) ((sword32) in[5])) * ((sword32) in[8]) + + ((limb) ((sword32) in[4])) * ((sword32) in[9])); + output[14] = 2 * (((limb) ((sword32) in[7])) * ((sword32) in[7]) + + ((limb) ((sword32) in[6])) * ((sword32) in[8]) + + 2 * ((limb) ((sword32) in[5])) * ((sword32) in[9])); + output[15] = 2 * (((limb) ((sword32) in[7])) * ((sword32) in[8]) + + ((limb) ((sword32) in[6])) * ((sword32) in[9])); + output[16] = ((limb) ((sword32) in[8])) * ((sword32) in[8]) + + 4 * ((limb) ((sword32) in[7])) * ((sword32) in[9]); + output[17] = 2 * ((limb) ((sword32) in[8])) * ((sword32) in[9]); + output[18] = 2 * ((limb) ((sword32) in[9])) * ((sword32) in[9]); +} + +/* fsquare sets output = in^2. + * + * On entry: The |in| argument is in reduced coefficients form and |in[i]| < + * 2^27. + * + * On exit: The |output| argument is in reduced coefficients form (indeed, one + * need only provide storage for 10 limbs) and |out[i]| < 2^26. */ +void +fsquare(limb *output, const limb *in) +{ + limb t[19]; + fsquare_inner(t, in); + /* |t[i]| < 14*2^54 because the largest product of two limbs will be < + * 2^(27+27) and fsquare_inner adds together, at most, 14 of those + * products. */ + freduce_degree(t); + freduce_coefficients(t); + /* |t[i]| < 2^26 */ + memcpy(output, t, sizeof(limb) * 10); +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ +void fexpand(limb *output, const byte *input) +{ +#define F(n,start,shift,mask) \ + output[n] = ((((limb) input[start + 0]) | \ + ((limb) input[start + 1]) << 8 | \ + ((limb) input[start + 2]) << 16 | \ + ((limb) input[start + 3]) << 24) >> shift) & mask; + F(0, 0, 0, 0x3ffffff); + F(1, 3, 2, 0x1ffffff); + F(2, 6, 3, 0x3ffffff); + F(3, 9, 5, 0x1ffffff); + F(4, 12, 6, 0x3ffffff); + F(5, 16, 0, 0x1ffffff); + F(6, 19, 1, 0x3ffffff); + F(7, 22, 3, 0x1ffffff); + F(8, 25, 4, 0x3ffffff); + F(9, 28, 6, 0x1ffffff); +#undef F +} + +#if (-32 >> 1) != -16 +#error "This code only works when >> does sign-extension on negative numbers" +#endif + +/* sword32_eq returns 0xffffffff iff a == b and zero otherwise. */ +sword32 sword32_eq(sword32 a, sword32 b) +{ + a = ~(a ^ b); + a &= a << 16; + a &= a << 8; + a &= a << 4; + a &= a << 2; + a &= a << 1; + return a >> 31; +} + +/* sword32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are + * both non-negative. */ +sword32 sword32_gte(sword32 a, sword32 b) +{ + a -= b; + /* a >= 0 iff a >= b. */ + return ~(a >> 31); +} + +/* Take a fully reduced polynomial form number and contract it into a + * little-endian, 32-byte array. + * + * On entry: |input_limbs[i]| < 2^26 */ +void fcontract(byte *output, limb *input_limbs) +{ + int i, j; + sword32 input[10]; + sword32 mask; + + /* |input_limbs[i]| < 2^26, so it's valid to convert to an sword32. */ + for (i = 0; i < 10; i++) { + input[i] = (sword32)input_limbs[i]; + } + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 9; ++i) { + if ((i & 1) == 1) { + /* This calculation is a time-invariant way to make input[i] + * non-negative by borrowing from the next-larger limb. */ + const sword32 mask = input[i] >> 31; + const sword32 carry = -((input[i] & mask) >> 25); + input[i] = input[i] + (carry << 25); + input[i+1] = input[i+1] - carry; + } else { + const sword32 mask = input[i] >> 31; + const sword32 carry = -((input[i] & mask) >> 26); + input[i] = input[i] + (carry << 26); + input[i+1] = input[i+1] - carry; + } + } + + /* There's no greater limb for input[9] to borrow from, but we can multiply + * by 19 and borrow from input[0], which is valid mod 2^255-19. */ + { + const sword32 mask = input[9] >> 31; + const sword32 carry = -((input[9] & mask) >> 25); + input[9] = input[9] + (carry << 25); + input[0] = input[0] - (carry * 19); + } + + /* After the first iteration, input[1..9] are non-negative and fit within + * 25 or 26 bits, depending on position. However, input[0] may be + * negative. */ + } + + /* The first borrow-propagation pass above ended with every limb + except (possibly) input[0] non-negative. + + If input[0] was negative after the first pass, then it was because of a + carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, + one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. + + In the second pass, each limb is decreased by at most one. Thus the second + borrow-propagation pass could only have wrapped around to decrease + input[0] again if the first pass left input[0] negative *and* input[1] + through input[9] were all zero. In that case, input[1] is now 2^25 - 1, + and this last borrow-propagation step will leave input[1] non-negative. */ + { + const sword32 mask = input[0] >> 31; + const sword32 carry = -((input[0] & mask) >> 26); + input[0] = input[0] + (carry << 26); + input[1] = input[1] - carry; + } + + /* All input[i] are now non-negative. However, there might be values between + * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */ + for (j = 0; j < 2; j++) { + for (i = 0; i < 9; i++) { + if ((i & 1) == 1) { + const sword32 carry = input[i] >> 25; + input[i] &= 0x1ffffff; + input[i+1] += carry; + } else { + const sword32 carry = input[i] >> 26; + input[i] &= 0x3ffffff; + input[i+1] += carry; + } + } + + { + const sword32 carry = input[9] >> 25; + input[9] &= 0x1ffffff; + input[0] += 19*carry; + } + } + + /* If the first carry-chain pass, just above, ended up with a carry from + * input[9], and that caused input[0] to be out-of-bounds, then input[0] was + * < 2^26 + 2*19, because the carry was, at most, two. + * + * If the second pass carried from input[9] again then input[0] is < 2*19 and + * the input[9] -> input[0] carry didn't push input[0] out of bounds. */ + + /* It still remains the case that input might be between 2^255-19 and 2^255. + * In this case, input[1..9] must take their maximum value and input[0] must + * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */ + mask = sword32_gte(input[0], 0x3ffffed); + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + mask &= sword32_eq(input[i], 0x1ffffff); + } else { + mask &= sword32_eq(input[i], 0x3ffffff); + } + } + + /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus + * this conditionally subtracts 2^255-19. */ + input[0] -= mask & 0x3ffffed; + + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + input[i] -= mask & 0x1ffffff; + } else { + input[i] -= mask & 0x3ffffff; + } + } + + input[1] <<= 2; + input[2] <<= 3; + input[3] <<= 5; + input[4] <<= 6; + input[6] <<= 1; + input[7] <<= 3; + input[8] <<= 4; + input[9] <<= 6; +#define F(i, s) \ + output[s+0] |= input[i] & 0xff; \ + output[s+1] = (input[i] >> 8) & 0xff; \ + output[s+2] = (input[i] >> 16) & 0xff; \ + output[s+3] = (input[i] >> 24) & 0xff; + output[0] = 0; + output[16] = 0; + F(0,0); + F(1,3); + F(2,6); + F(3,9); + F(4,12); + F(5,16); + F(6,19); + F(7,22); + F(8,25); + F(9,28); +#undef F +} + +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * + * x2 z3: long form + * x3 z3: long form + * x z: short form, destroyed + * xprime zprime: short form, destroyed + * qmqp: short form, preserved + * + * On entry and exit, the absolute value of the limbs of all inputs and outputs + * are < 2^26. */ +void fmonty(limb *x2, limb *z2, /* output 2Q */ + limb *x3, limb *z3, /* output Q + Q' */ + limb *x, limb *z, /* input Q */ + limb *xprime, limb *zprime, /* input Q' */ + const limb *qmqp /* input Q - Q' */) +{ + limb origx[10], origxprime[10], zzz[19], xx[19], zz[19]; + limb xxprime[19], zzprime[19], zzzprime[19], xxxprime[19]; + + memcpy(origx, x, 10 * sizeof(limb)); + fsum(x, z); + /* |x[i]| < 2^27 */ + fdifference(z, origx); /* does x - z */ + /* |z[i]| < 2^27 */ + + memcpy(origxprime, xprime, sizeof(limb) * 10); + fsum(xprime, zprime); + /* |xprime[i]| < 2^27 */ + fdifference(zprime, origxprime); + /* |zprime[i]| < 2^27 */ + fproduct(xxprime, xprime, z); + /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < + * 2^(27+27) and fproduct adds together, at most, 14 of those products. + * (Approximating that to 2^58 doesn't work out.) */ + fproduct(zzprime, x, zprime); + /* |zzprime[i]| < 14*2^54 */ + freduce_degree(xxprime); + freduce_coefficients(xxprime); + /* |xxprime[i]| < 2^26 */ + freduce_degree(zzprime); + freduce_coefficients(zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy(origxprime, xxprime, sizeof(limb) * 10); + fsum(xxprime, zzprime); + /* |xxprime[i]| < 2^27 */ + fdifference(zzprime, origxprime); + /* |zzprime[i]| < 2^27 */ + fsquare(xxxprime, xxprime); + /* |xxxprime[i]| < 2^26 */ + fsquare(zzzprime, zzprime); + /* |zzzprime[i]| < 2^26 */ + fproduct(zzprime, zzzprime, qmqp); + /* |zzprime[i]| < 14*2^52 */ + freduce_degree(zzprime); + freduce_coefficients(zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy(x3, xxxprime, sizeof(limb) * 10); + memcpy(z3, zzprime, sizeof(limb) * 10); + + fsquare(xx, x); + /* |xx[i]| < 2^26 */ + fsquare(zz, z); + /* |zz[i]| < 2^26 */ + fproduct(x2, xx, zz); + /* |x2[i]| < 14*2^52 */ + freduce_degree(x2); + freduce_coefficients(x2); + /* |x2[i]| < 2^26 */ + fdifference(zz, xx); // does zz = xx - zz + /* |zz[i]| < 2^27 */ + memset(zzz + 10, 0, sizeof(limb) * 9); + fscalar_product(zzz, zz, 121665); + /* |zzz[i]| < 2^(27+17) */ + /* No need to call freduce_degree here: + fscalar_product doesn't increase the degree of its input. */ + freduce_coefficients(zzz); + /* |zzz[i]| < 2^26 */ + fsum(zzz, xx); + /* |zzz[i]| < 2^27 */ + fproduct(z2, zz, zzz); + /* |z2[i]| < 14*2^(26+27) */ + freduce_degree(z2); + freduce_coefficients(z2); + /* |z2|i| < 2^26 */ +} + +/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave + * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid + * side-channel attacks. + * + * NOTE that this function requires that 'iswap' be 1 or 0; other values give + * wrong results. Also, the two limb arrays must be in reduced-coefficient, + * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, + * and all all values in a[0..9],b[0..9] must have magnitude less than + * INT32_MAX. */ +void swap_conditional(limb a[19], limb b[19], limb iswap) +{ + const sword32 swap = (sword32) -iswap; + + for (unsigned int i = 0; i < 10; ++i) { + const sword32 x = swap & ( ((sword32)a[i]) ^ ((sword32)b[i]) ); + a[i] = ((sword32)a[i]) ^ x; + b[i] = ((sword32)b[i]) ^ x; + } +} + +/* Calculates nQ where Q is the x-coordinate of a point on the curve + * + * resultx/resultz: the x coordinate of the resulting curve point (short form) + * n: a little endian, 32-byte number + * q: a point of the curve (short form) */ +void +cmult(limb *resultx, limb *resultz, const byte *n, const limb *q) +{ + limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + memcpy(nqpqx, q, sizeof(limb) * 10); + + for (unsigned int i = 0; i < 32; ++i) { + byte b = n[31 - i]; + for (unsigned int j = 0; j < 8; ++j) { + const limb bit = b >> 7; + + swap_conditional(nqx, nqpqx, bit); + swap_conditional(nqz, nqpqz, bit); + fmonty(nqx2, nqz2, + nqpqx2, nqpqz2, + nqx, nqz, + nqpqx, nqpqz, + q); + swap_conditional(nqx2, nqpqx2, bit); + swap_conditional(nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + b <<= 1; + } + } + + memcpy(resultx, nqx, sizeof(limb) * 10); + memcpy(resultz, nqz, sizeof(limb) * 10); +} + +// ----------------------------------------------------------------------------- +// Shamelessly copied from djb's code +// ----------------------------------------------------------------------------- +void crecip(limb *out, const limb *z) +{ + limb z2[10]; + limb z9[10]; + limb z11[10]; + limb z2_5_0[10]; + limb z2_10_0[10]; + limb z2_20_0[10]; + limb z2_50_0[10]; + limb z2_100_0[10]; + limb t0[10]; + limb t1[10]; + int i; + + /* 2 */ fsquare(z2,z); + /* 4 */ fsquare(t1,z2); + /* 8 */ fsquare(t0,t1); + /* 9 */ fmul(z9,t0,z); + /* 11 */ fmul(z11,z9,z2); + /* 22 */ fsquare(t0,z11); + /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9); + + /* 2^6 - 2^1 */ fsquare(t0,z2_5_0); + /* 2^7 - 2^2 */ fsquare(t1,t0); + /* 2^8 - 2^3 */ fsquare(t0,t1); + /* 2^9 - 2^4 */ fsquare(t1,t0); + /* 2^10 - 2^5 */ fsquare(t0,t1); + /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0); + + /* 2^11 - 2^1 */ fsquare(t0,z2_10_0); + /* 2^12 - 2^2 */ fsquare(t1,t0); + /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0); + + /* 2^21 - 2^1 */ fsquare(t0,z2_20_0); + /* 2^22 - 2^2 */ fsquare(t1,t0); + /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0); + + /* 2^41 - 2^1 */ fsquare(t1,t0); + /* 2^42 - 2^2 */ fsquare(t0,t1); + /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } + /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0); + + /* 2^51 - 2^1 */ fsquare(t0,z2_50_0); + /* 2^52 - 2^2 */ fsquare(t1,t0); + /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0); + + /* 2^101 - 2^1 */ fsquare(t1,z2_100_0); + /* 2^102 - 2^2 */ fsquare(t0,t1); + /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } + /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0); + + /* 2^201 - 2^1 */ fsquare(t0,t1); + /* 2^202 - 2^2 */ fsquare(t1,t0); + /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0); + + /* 2^251 - 2^1 */ fsquare(t1,t0); + /* 2^252 - 2^2 */ fsquare(t0,t1); + /* 2^253 - 2^3 */ fsquare(t1,t0); + /* 2^254 - 2^4 */ fsquare(t0,t1); + /* 2^255 - 2^5 */ fsquare(t1,t0); + /* 2^255 - 21 */ fmul(out,t1,z11); +} + +ANONYMOUS_NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(Donna) + +int curve25519(byte pubkey[32], const byte seckey[32], const byte basepoint[32]) +{ + limb bp[10], x[10], z[11], zmone[10]; + byte e[32]; int i; + + for (i = 0; i < 32; ++i) + e[i] = seckey[i]; + + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + fexpand(bp, basepoint); + cmult(x, z, e, bp); + crecip(zmone, z); + fmul(z, x, zmone); + fcontract(pubkey, z); + return 0; +} + +NAMESPACE_END // Donna +NAMESPACE_END // CryptoPP + +#endif // CRYPTOPP_32BIT diff --git a/donna_64.cpp b/donna_64.cpp new file mode 100644 index 00000000..16bff906 --- /dev/null +++ b/donna_64.cpp @@ -0,0 +1,509 @@ +// donna_64.cpp - written and placed in public domain by Jeffrey Walton +// This is a port of Adam Langley's curve25519-donna +// located at https://github.com/agl/curve25519-donna + +/* Copyright 2008, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * curve25519-donna: Curve25519 elliptic curve, public key function + * + * http://code.google.com/p/curve25519-donna/ + * + * Adam Langley + * + * Derived from public domain C code by Daniel J. Bernstein + * + * More information about curve25519 can be found here + * http://cr.yp.to/ecdh.html + * + * djb's sample implementation of curve25519 is written in a special assembly + * language called qhasm and uses the floating point registers. + * + * This is, almost, a clean room reimplementation from the curve25519 paper. It + * uses many of the tricks described therein. Only the crecip function is taken + * from the sample implementation. */ + +#include "pch.h" + +#include "config.h" +#include "donna.h" +#include "stdcpp.h" + +// This macro is not in a header like config.h because +// we don't want it exposed to user code. We also need +// a standard header like or . +// Langley uses uint128_t in the 64-bit code paths so +// we further restrict 64-bit code. +#if (UINTPTR_MAX == 0xffffffff) || !defined(CRYPTOPP_WORD128_AVAILABLE) +# define CRYPTOPP_32BIT 1 +#else +# define CRYPTOPP_64BIT 1 +#endif + +// Squash MS LNK4221 and libtool warnings +extern const char DONNA64_FNAME[] = __FILE__; + +#if defined(CRYPTOPP_64BIT) + +ANONYMOUS_NAMESPACE_BEGIN + +using std::memcpy; +using CryptoPP::byte; +using CryptoPP::word32; +using CryptoPP::word64; +using CryptoPP::sword32; +using CryptoPP::sword64; +using CryptoPP::word128; + +typedef word64 limb; +typedef limb felem[5]; + +// This is a special gcc mode for 128-bit integers. It's implemented on 64-bit +// platforms only as far as I know. +//typedef unsigned uint128_t __attribute__((mode(TI))); + +/* Sum two numbers: output += in */ +inline void fsum(limb *output, const limb *in) +{ + output[0] += in[0]; + output[1] += in[1]; + output[2] += in[2]; + output[3] += in[3]; + output[4] += in[4]; +} + +/* Find the difference of two numbers: output = in - output + * (note the order of the arguments!) + * + * Assumes that out[i] < 2**52 + * On return, out[i] < 2**55 + */ +inline void fdifference_backwards(felem out, const felem in) +{ + /* 152 is 19 << 3 */ + const limb two54m152 = (((limb)1) << 54) - 152; + const limb two54m8 = (((limb)1) << 54) - 8; + + out[0] = in[0] + two54m152 - out[0]; + out[1] = in[1] + two54m8 - out[1]; + out[2] = in[2] + two54m8 - out[2]; + out[3] = in[3] + two54m8 - out[3]; + out[4] = in[4] + two54m8 - out[4]; +} + +/* Multiply a number by a scalar: output = in * scalar */ +inline void fscalar_product(felem output, const felem in, const limb scalar) +{ + word128 a; + + a = ((word128) in[0]) * scalar; + output[0] = ((limb)a) & 0x7ffffffffffff; + + a = ((word128) in[1]) * scalar + ((limb) (a >> 51)); + output[1] = ((limb)a) & 0x7ffffffffffff; + + a = ((word128) in[2]) * scalar + ((limb) (a >> 51)); + output[2] = ((limb)a) & 0x7ffffffffffff; + + a = ((word128) in[3]) * scalar + ((limb) (a >> 51)); + output[3] = ((limb)a) & 0x7ffffffffffff; + + a = ((word128) in[4]) * scalar + ((limb) (a >> 51)); + output[4] = ((limb)a) & 0x7ffffffffffff; + + output[0] += (a >> 51) * 19; +} + +/* Multiply two numbers: output = in2 * in + * + * output must be distinct to both inputs. The inputs are reduced coefficient + * form, the output is not. + * + * Assumes that in[i] < 2**55 and likewise for in2. + * On return, output[i] < 2**52 + */ +inline void fmul(felem output, const felem in2, const felem in) +{ + word128 t[5]; + limb r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c; + + r0 = in[0]; + r1 = in[1]; + r2 = in[2]; + r3 = in[3]; + r4 = in[4]; + + s0 = in2[0]; + s1 = in2[1]; + s2 = in2[2]; + s3 = in2[3]; + s4 = in2[4]; + + t[0] = ((word128) r0) * s0; + t[1] = ((word128) r0) * s1 + ((word128) r1) * s0; + t[2] = ((word128) r0) * s2 + ((word128) r2) * s0 + ((word128) r1) * s1; + t[3] = ((word128) r0) * s3 + ((word128) r3) * s0 + ((word128) r1) * s2 + ((word128) r2) * s1; + t[4] = ((word128) r0) * s4 + ((word128) r4) * s0 + ((word128) r3) * s1 + ((word128) r1) * s3 + ((word128) r2) * s2; + + r4 *= 19; + r1 *= 19; + r2 *= 19; + r3 *= 19; + + t[0] += ((word128) r4) * s1 + ((word128) r1) * s4 + ((word128) r2) * s3 + ((word128) r3) * s2; + t[1] += ((word128) r4) * s2 + ((word128) r2) * s4 + ((word128) r3) * s3; + t[2] += ((word128) r4) * s3 + ((word128) r3) * s4; + t[3] += ((word128) r4) * s4; + + r0 = (limb)t[0] & 0x7ffffffffffff; c = (limb)(t[0] >> 51); + t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffff; c = (limb)(t[1] >> 51); + t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffff; c = (limb)(t[2] >> 51); + t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffff; c = (limb)(t[3] >> 51); + t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffff; c = (limb)(t[4] >> 51); + r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffff; + r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffff; + r2 += c; + + output[0] = r0; + output[1] = r1; + output[2] = r2; + output[3] = r3; + output[4] = r4; +} + +inline void fsquare_times(felem output, const felem in, limb count) +{ + word128 t[5]; + limb r0,r1,r2,r3,r4,c; + limb d0,d1,d2,d4,d419; + + r0 = in[0]; + r1 = in[1]; + r2 = in[2]; + r3 = in[3]; + r4 = in[4]; + + do { + d0 = r0 * 2; + d1 = r1 * 2; + d2 = r2 * 2 * 19; + d419 = r4 * 19; + d4 = d419 * 2; + + t[0] = ((word128) r0) * r0 + ((word128) d4) * r1 + (((word128) d2) * (r3 )); + t[1] = ((word128) d0) * r1 + ((word128) d4) * r2 + (((word128) r3) * (r3 * 19)); + t[2] = ((word128) d0) * r2 + ((word128) r1) * r1 + (((word128) d4) * (r3 )); + t[3] = ((word128) d0) * r3 + ((word128) d1) * r2 + (((word128) r4) * (d419 )); + t[4] = ((word128) d0) * r4 + ((word128) d1) * r3 + (((word128) r2) * (r2 )); + + r0 = (limb)t[0] & 0x7ffffffffffff; c = (limb)(t[0] >> 51); + t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffff; c = (limb)(t[1] >> 51); + t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffff; c = (limb)(t[2] >> 51); + t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffff; c = (limb)(t[3] >> 51); + t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffff; c = (limb)(t[4] >> 51); + r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffff; + r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffff; + r2 += c; + } while(--count); + + output[0] = r0; + output[1] = r1; + output[2] = r2; + output[3] = r3; + output[4] = r4; +} + +/* Load a little-endian 64-bit number */ +limb load_limb(const byte *in) +{ + return + ((limb)in[0]) | + (((limb)in[1]) << 8) | + (((limb)in[2]) << 16) | + (((limb)in[3]) << 24) | + (((limb)in[4]) << 32) | + (((limb)in[5]) << 40) | + (((limb)in[6]) << 48) | + (((limb)in[7]) << 56); +} + +void store_limb(byte *out, limb in) +{ + out[0] = in & 0xff; + out[1] = (in >> 8) & 0xff; + out[2] = (in >> 16) & 0xff; + out[3] = (in >> 24) & 0xff; + out[4] = (in >> 32) & 0xff; + out[5] = (in >> 40) & 0xff; + out[6] = (in >> 48) & 0xff; + out[7] = (in >> 56) & 0xff; +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ +void fexpand(limb *output, const byte *in) +{ + output[0] = load_limb(in) & 0x7ffffffffffff; + output[1] = (load_limb(in+6) >> 3) & 0x7ffffffffffff; + output[2] = (load_limb(in+12) >> 6) & 0x7ffffffffffff; + output[3] = (load_limb(in+19) >> 1) & 0x7ffffffffffff; + output[4] = (load_limb(in+24) >> 12) & 0x7ffffffffffff; +} + +/* Take a fully reduced polynomial form number and contract it into a + * little-endian, 32-byte array + */ +void fcontract(byte *output, const felem input) +{ + word128 t[5]; + + t[0] = input[0]; + t[1] = input[1]; + t[2] = input[2]; + t[3] = input[3]; + t[4] = input[4]; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff; + + /* now t is between 0 and 2^255-1, properly carried. */ + /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ + + t[0] += 19; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff; + + /* now between 19 and 2^255-1 in both cases, and offset by 19. */ + + t[0] += 0x8000000000000 - 19; + t[1] += 0x8000000000000 - 1; + t[2] += 0x8000000000000 - 1; + t[3] += 0x8000000000000 - 1; + t[4] += 0x8000000000000 - 1; + + /* now between 2^255 and 2^256-20, and offset by 2^255. */ + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; + t[4] &= 0x7ffffffffffff; + + store_limb(output, t[0] | (t[1] << 51)); + store_limb(output+8, (t[1] >> 13) | (t[2] << 38)); + store_limb(output+16, (t[2] >> 26) | (t[3] << 25)); + store_limb(output+24, (t[3] >> 39) | (t[4] << 12)); +} + +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * + * x2 z3: long form + * x3 z3: long form + * x z: short form, destroyed + * xprime zprime: short form, destroyed + * qmqp: short form, preserved + */ +void fmonty(limb *x2, limb *z2, /* output 2Q */ + limb *x3, limb *z3, /* output Q + Q' */ + limb *x, limb *z, /* input Q */ + limb *xprime, limb *zprime, /* input Q' */ + const limb *qmqp /* input Q - Q' */) +{ + limb origx[5], origxprime[5], zzz[5], xx[5], zz[5]; + limb xxprime[5], zzprime[5], zzzprime[5]; + + memcpy(origx, x, 5 * sizeof(limb)); + fsum(x, z); + fdifference_backwards(z, origx); // does x - z + + memcpy(origxprime, xprime, sizeof(limb) * 5); + fsum(xprime, zprime); + fdifference_backwards(zprime, origxprime); + fmul(xxprime, xprime, z); + fmul(zzprime, x, zprime); + memcpy(origxprime, xxprime, sizeof(limb) * 5); + fsum(xxprime, zzprime); + fdifference_backwards(zzprime, origxprime); + fsquare_times(x3, xxprime, 1); + fsquare_times(zzzprime, zzprime, 1); + fmul(z3, zzzprime, qmqp); + + fsquare_times(xx, x, 1); + fsquare_times(zz, z, 1); + fmul(x2, xx, zz); + fdifference_backwards(zz, xx); // does zz = xx - zz + fscalar_product(zzz, zz, 121665); + fsum(zzz, xx); + fmul(z2, zz, zzz); +} + +// ----------------------------------------------------------------------------- +// Maybe swap the contents of two limb arrays (@a and @b), each @len elements +// long. Perform the swap iff @swap is non-zero. +// +// This function performs the swap without leaking any side-channel +// information. +// ----------------------------------------------------------------------------- +void swap_conditional(limb a[5], limb b[5], limb iswap) +{ + const limb swap = -iswap; + + for (unsigned int i = 0; i < 5; ++i) { + const limb x = swap & (a[i] ^ b[i]); + a[i] ^= x; + b[i] ^= x; + } +} + +/* Calculates nQ where Q is the x-coordinate of a point on the curve + * + * resultx/resultz: the x coordinate of the resulting curve point (short form) + * n: a little endian, 32-byte number + * q: a point of the curve (short form) + */ +void cmult(limb *resultx, limb *resultz, const byte *n, const limb *q) +{ + limb a[5] = {0}, b[5] = {1}, c[5] = {1}, d[5] = {0}; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[5] = {0}, f[5] = {1}, g[5] = {0}, h[5] = {1}; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + memcpy(nqpqx, q, sizeof(limb) * 5); + + for (unsigned int i = 0; i < 32; ++i) { + byte b = n[31 - i]; + for (unsigned int j = 0; j < 8; ++j) { + const limb bit = b >> 7; + + swap_conditional(nqx, nqpqx, bit); + swap_conditional(nqz, nqpqz, bit); + fmonty(nqx2, nqz2, + nqpqx2, nqpqz2, + nqx, nqz, + nqpqx, nqpqz, + q); + swap_conditional(nqx2, nqpqx2, bit); + swap_conditional(nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + b <<= 1; + } + } + + memcpy(resultx, nqx, sizeof(limb) * 5); + memcpy(resultz, nqz, sizeof(limb) * 5); +} + + +// ----------------------------------------------------------------------------- +// Shamelessly copied from djb's code, tightened a little +// ----------------------------------------------------------------------------- +void crecip(felem out, const felem z) +{ + felem a,t0,b,c; + + /* 2 */ fsquare_times(a, z, 1); // a = 2 + /* 8 */ fsquare_times(t0, a, 2); + /* 9 */ fmul(b, t0, z); // b = 9 + /* 11 */ fmul(a, b, a); // a = 11 + /* 22 */ fsquare_times(t0, a, 1); + /* 2^5 - 2^0 = 31 */ fmul(b, t0, b); + /* 2^10 - 2^5 */ fsquare_times(t0, b, 5); + /* 2^10 - 2^0 */ fmul(b, t0, b); + /* 2^20 - 2^10 */ fsquare_times(t0, b, 10); + /* 2^20 - 2^0 */ fmul(c, t0, b); + /* 2^40 - 2^20 */ fsquare_times(t0, c, 20); + /* 2^40 - 2^0 */ fmul(t0, t0, c); + /* 2^50 - 2^10 */ fsquare_times(t0, t0, 10); + /* 2^50 - 2^0 */ fmul(b, t0, b); + /* 2^100 - 2^50 */ fsquare_times(t0, b, 50); + /* 2^100 - 2^0 */ fmul(c, t0, b); + /* 2^200 - 2^100 */ fsquare_times(t0, c, 100); + /* 2^200 - 2^0 */ fmul(t0, t0, c); + /* 2^250 - 2^50 */ fsquare_times(t0, t0, 50); + /* 2^250 - 2^0 */ fmul(t0, t0, b); + /* 2^255 - 2^5 */ fsquare_times(t0, t0, 5); + /* 2^255 - 21 */ fmul(out, t0, a); +} + +ANONYMOUS_NAMESPACE_END + +NAMESPACE_BEGIN(CryptoPP) +NAMESPACE_BEGIN(Donna) + +int curve25519(byte pubkey[32], const byte seckey[32], const byte basepoint[32]) +{ + limb bp[5], x[5], z[5], zmone[5]; + uint8_t e[32]; + int i; + + for (i = 0;i < 32;++i) + e[i] = seckey[i]; + + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + fexpand(bp, basepoint); + cmult(x, z, e, bp); + crecip(zmone, z); + fmul(z, x, zmone); + fcontract(pubkey, z); + return 0; +} + +NAMESPACE_END // Donna +NAMESPACE_END // CryptoPP + +#endif // CRYPTOPP_64BIT diff --git a/test.cpp b/test.cpp index f4e70746..7f8809b9 100644 --- a/test.cpp +++ b/test.cpp @@ -829,9 +829,10 @@ bool Validate(int alg, bool thorough, const char *seedInput) case 12: result = ValidateThreeWay(); break; case 13: result = ValidateBBS(); break; case 14: result = ValidateDH(); break; - case 15: result = ValidateRSA(); break; - case 16: result = ValidateElGamal(); break; - case 17: result = ValidateDSA(thorough); break; + case 15: result = ValidateX25519(); break; + case 16: result = ValidateRSA(); break; + case 17: result = ValidateElGamal(); break; + case 18: result = ValidateDSA(thorough); break; // case 18: result = ValidateHAVAL(); break; case 19: result = ValidateSAFER(); break; case 20: result = ValidateLUC(); break; diff --git a/validat0.cpp b/validat0.cpp index 1e2fc9b5..e4138bd5 100644 --- a/validat0.cpp +++ b/validat0.cpp @@ -21,6 +21,11 @@ #include "gzip.h" #include "zlib.h" +//curve25519 +#include "xed25519.h" +#include "donna.h" +#include "naclite.h" + #include #include #include @@ -41,12 +46,11 @@ NAMESPACE_BEGIN(Test) // Issue 64: "PolynomialMod2::operator<<=", http://github.com/weidai11/cryptopp/issues/64 bool TestPolynomialMod2() { + std::cout << "\nTesting PolynomialMod2 bit operations...\n\n"; bool pass1 = true, pass2 = true, pass3 = true; - std::cout << "\nTesting PolynomialMod2 bit operations...\n\n"; - - static const unsigned int start = 0; - static const unsigned int stop = 4 * WORD_BITS + 1; + const unsigned int start = 0; + const unsigned int stop = 4 * WORD_BITS + 1; for (unsigned int i = start; i < stop; i++) { @@ -424,10 +428,54 @@ bool TestCompressors() return !fail1 && !fail2 && !fail3; } +bool TestCurve25519() +{ + std::cout << "\nTesting curve25519 Key Agreements...\n\n"; + const unsigned int AGREE_COUNT = 64; + bool pass = true; + + SecByteBlock priv1(32), priv2(32), pub1(32), pub2(32), share1(32), share2(32); + for (unsigned int i=0; i m_sk, m_pk; +}; + +NAMESPACE_END // CryptoPP + +#endif // CRYPTOPP_XED25519_H