From 57109b3120b8286cdd7b4d143854f2d27873368c Mon Sep 17 00:00:00 2001
From: weidai <weidai11@users.noreply.github.com>
Date: Fri, 25 Jul 2003 00:15:52 +0000
Subject: [PATCH] fix bugs in 64-bit CPU support

---
 blumshub.cpp |   4 +-
 blumshub.h   |   2 +-
 config.h     |  86 +++---
 cryptlib.cpp |   2 +
 cryptlib.dsp |  16 +-
 cryptlib.h   |   2 +-
 gf2n.cpp     |   2 +-
 integer.cpp  | 747 ++++++++++++++++++++++++++++++++-------------------
 integer.h    |   3 +
 misc.h       |   7 +-
 modes.cpp    |   2 +-
 modes.h      |   2 +-
 nbtheory.cpp |  40 +--
 nbtheory.h   |   2 +-
 seal.cpp     |   2 +-
 seal.h       |   2 +-
 strciphr.cpp |   2 +-
 strciphr.h   |   4 +-
 validat1.cpp |  14 +-
 19 files changed, 554 insertions(+), 387 deletions(-)

diff --git a/blumshub.cpp b/blumshub.cpp
index 40c654af..6e1854d8 100644
--- a/blumshub.cpp
+++ b/blumshub.cpp
@@ -39,9 +39,9 @@ BlumBlumShub::BlumBlumShub(const Integer &p, const Integer &q, const Integer &se
 {
 }
 
-void BlumBlumShub::Seek(dword index)
+void BlumBlumShub::Seek(lword index)
 {
-	Integer i(Integer::POSITIVE, HIGH_WORD(index), word(index));
+	Integer i(Integer::POSITIVE, index);
 	i *= 8;
 	Integer e = a_exp_b_mod_c (2, i / maxBits + 1, (p-1)*(q-1));
 	current = modn.Exponentiate(x0, e);
diff --git a/blumshub.h b/blumshub.h
index dbbb8be4..6583e886 100644
--- a/blumshub.h
+++ b/blumshub.h
@@ -46,7 +46,7 @@ public:
 	BlumBlumShub(const Integer &p, const Integer &q, const Integer &seed);
 	
 	bool IsRandomAccess() const {return true;}
-	void Seek(dword index);
+	void Seek(lword index);
 
 protected:
 	const Integer p, q;
diff --git a/config.h b/config.h
index d4f5cb83..09774e49 100644
--- a/config.h
+++ b/config.h
@@ -91,77 +91,63 @@
 #	define __USE_W32_SOCKETS
 #endif
 
-typedef unsigned char byte;     // moved outside namespace for Borland C++Builder 5
+typedef unsigned char byte;		// put in global namespace to avoid ambiguity with other byte typedefs
 
 NAMESPACE_BEGIN(CryptoPP)
 
 typedef unsigned short word16;
-	typedef unsigned int word32;
+typedef unsigned int word32;
 
 #if defined(__GNUC__) || defined(__MWERKS__)
-#	define WORD64_AVAILABLE
+	#define WORD64_AVAILABLE
 	typedef unsigned long long word64;
-#	define W64LIT(x) x##LL
+	#define W64LIT(x) x##LL
 #elif defined(_MSC_VER) || defined(__BCPLUSPLUS__)
-#	define WORD64_AVAILABLE
+	#define WORD64_AVAILABLE
 	typedef unsigned __int64 word64;
-#	define W64LIT(x) x##ui64
+	#define W64LIT(x) x##ui64
 #endif
 
-#if defined(__alpha__) || defined(__ia64__) || defined(_ARCH_PPC64) || defined(__x86_64__) || defined(__mips64) || defined(__sparc_v9__) || defined(__sparcv9) || defined(__sparc_v8__) || defined(__sparcv8)
-#	define CRYPTOPP_64BIT_CPU
-#endif
-
-// defined this if your CPU is not 64-bit to use alternative code that avoids word64
-#if defined(WORD64_AVAILABLE) && !defined(CRYPTOPP_64BIT_CPU)
-#	define CRYPTOPP_SLOW_WORD64
-#endif
-
-// word should have the same size as your CPU registers
-// dword should be twice as big as word
-
-#if (defined(__GNUC__) && !defined(__alpha)) || defined(__MWERKS__)
-	typedef unsigned long word;
-	typedef unsigned long long dword;
-#elif defined(_MSC_VER) || defined(__BCPLUSPLUS__)
-	typedef unsigned __int32 word;
-	typedef unsigned __int64 dword;
+// define largest word type
+#ifdef WORD64_AVAILABLE
+	typedef word64 lword;
 #else
-	typedef unsigned int word;
-	typedef unsigned long dword;
+	typedef word32 lword;
+#endif
+
+#if defined(__alpha__) || defined(__ia64__) || defined(_ARCH_PPC64) || defined(__x86_64__) || defined(__mips64)
+	// These platforms have 64-bit CPU registers. Unfortunately most C++ compilers don't
+	// provide any way to access the 64-bit by 64-bit multiply instruction without using
+	// assembly, so in order to use word64 as word, the assembly instruction must be defined
+	// in DWord::Multiply() (integer.cpp). No dword type is defined for these platforms.
+	typedef word32 hword;
+	typedef word64 word;
+#else
+	#define CRYPTOPP_NATIVE_DWORD_AVAILABLE	// dword is a built-in integer type twice as wide as word
+	#ifdef WORD64_AVAILABLE
+		#define CRYPTOPP_SLOW_WORD64 // define this if your CPU is not 64-bit to use alternative code that avoids word64
+		typedef word16 hword;
+		typedef word32 word;
+		typedef word64 dword;
+	#else
+		typedef byte hword;	// 8-bit half word; no word8 type is declared here, byte is the 8-bit typedef
+		typedef word16 word;
+		typedef word32 dword;
+	#endif
 #endif
 
 const unsigned int WORD_SIZE = sizeof(word);
 const unsigned int WORD_BITS = WORD_SIZE * 8;
 
-#define LOW_WORD(x) (word)(x)
-
-union dword_union
-{
-	dword_union (const dword &dw) : dw(dw) {}
-	dword dw;
-	word w[2];
-};
-
-#ifdef IS_LITTLE_ENDIAN
-#	define HIGH_WORD(x) (dword_union(x).w[1])
-#else
-#	define HIGH_WORD(x) (dword_union(x).w[0])
-#endif
-
-// if the above HIGH_WORD macro doesn't work (if you are not sure, compile it
-// and run the validation tests), try this:
-// #define HIGH_WORD(x) (word)((x)>>WORD_BITS)
-
 #if defined(_MSC_VER) || defined(__BCPLUSPLUS__)
-#	define INTEL_INTRINSICS
-#	define FAST_ROTATE
+	#define INTEL_INTRINSICS
+	#define FAST_ROTATE
 #elif defined(__MWERKS__) && TARGET_CPU_PPC
-#	define PPC_INTRINSICS
-#	define FAST_ROTATE
+	#define PPC_INTRINSICS
+	#define FAST_ROTATE
 #elif defined(__GNUC__) && defined(__i386__)
 	// GCC does peephole optimizations which should result in using rotate instructions
-#	define FAST_ROTATE
+	#define FAST_ROTATE
 #endif
 
 NAMESPACE_END
diff --git a/cryptlib.cpp b/cryptlib.cpp
index cafaaa71..a4f972e7 100644
--- a/cryptlib.cpp
+++ b/cryptlib.cpp
@@ -22,7 +22,9 @@ CRYPTOPP_COMPILE_ASSERT(sizeof(word32) == 4);
 #ifdef WORD64_AVAILABLE
 CRYPTOPP_COMPILE_ASSERT(sizeof(word64) == 8);
 #endif
+#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
 CRYPTOPP_COMPILE_ASSERT(sizeof(dword) == 2*sizeof(word));
+#endif
 
 const std::string BufferedTransformation::NULL_CHANNEL;
 const NullNameValuePairs g_nullNameValuePairs;
diff --git a/cryptlib.dsp b/cryptlib.dsp
index 1205ba2b..1935989a 100644
--- a/cryptlib.dsp
+++ b/cryptlib.dsp
@@ -282,14 +282,6 @@ SOURCE=.\dh2.cpp
 # End Source File
 # Begin Source File
 
-SOURCE=.\diamond.cpp
-# End Source File
-# Begin Source File
-
-SOURCE=.\diamondt.cpp
-# End Source File
-# Begin Source File
-
 SOURCE=.\dll.cpp
 # SUBTRACT CPP /YX /Yc /Yu
 # End Source File
@@ -748,10 +740,6 @@ SOURCE=.\dh2.h
 # End Source File
 # Begin Source File
 
-SOURCE=.\diamond.h
-# End Source File
-# Begin Source File
-
 SOURCE=.\dmac.h
 # End Source File
 # Begin Source File
@@ -760,6 +748,10 @@ SOURCE=.\dsa.h
 # End Source File
 # Begin Source File
 
+SOURCE=.\dword.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\ec2n.h
 # End Source File
 # Begin Source File
diff --git a/cryptlib.h b/cryptlib.h
index 4c0e24aa..35231047 100644
--- a/cryptlib.h
+++ b/cryptlib.h
@@ -496,7 +496,7 @@ public:
 	//! returns whether this cipher supports random access
 	virtual bool IsRandomAccess() const =0;
 	//! for random access ciphers, seek to an absolute position
-	virtual void Seek(dword n)
+	virtual void Seek(lword n)
 	{
 		assert(!IsRandomAccess());
 		throw NotImplemented("StreamTransformation: this object doesn't support random access");
diff --git a/gf2n.cpp b/gf2n.cpp
index b7b4bf9c..93d5edeb 100644
--- a/gf2n.cpp
+++ b/gf2n.cpp
@@ -143,7 +143,7 @@ void PolynomialMod2::Decode(BufferedTransformation &bt, unsigned int inputLen)
 	{
 		byte b;
 		bt.Get(b);
-		reg[(i-1)/WORD_SIZE] |= b << ((i-1)%WORD_SIZE)*8;
+		reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
 	}
 }
 
diff --git a/integer.cpp b/integer.cpp
index deb60f95..52042dd5 100644
--- a/integer.cpp
+++ b/integer.cpp
@@ -60,8 +60,6 @@ void AlignedAllocator<T>::deallocate(void *p, size_type n)
 }
 #endif
 
-#define MAKE_DWORD(lowWord, highWord) ((dword(highWord)<<WORD_BITS) | (lowWord))
-
 static int Compare(const word *A, const word *B, unsigned int N)
 {
 	while (N--)
@@ -106,31 +104,303 @@ static void TwosComplement(word *A, unsigned int N)
 		A[i] = ~A[i];
 }
 
-static word LinearMultiply(word *C, const word *A, word B, unsigned int N)
+static word AtomicInverseModPower2(word A)
 {
-	word carry=0;
-	for(unsigned i=0; i<N; i++)
-	{
-		dword p = (dword)A[i] * B + carry;
-		C[i] = LOW_WORD(p);
-		carry = HIGH_WORD(p);
-	}
-	return carry;
-}
+	assert(A%2==1);
 
-static void AtomicInverseModPower2(word *C, word A0, word A1)
-{
-	assert(A0%2==1);
+	word R=A%8;
 
-	dword A=MAKE_DWORD(A0, A1), R=A0%8;
-
-	for (unsigned i=3; i<2*WORD_BITS; i*=2)
+	for (unsigned i=3; i<WORD_BITS; i*=2)
 		R = R*(2-R*A);
 
 	assert(R*A==1);
+	return R;
+}
 
-	C[0] = LOW_WORD(R);
-	C[1] = HIGH_WORD(R);
+// ********************************************************
+
+class DWord
+{
+public:
+	DWord() {}
+
+#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+	explicit DWord(word low)
+	{
+		m_whole = low;
+	}
+#else
+	explicit DWord(word low)
+	{
+		m_halfs.low = low;
+		m_halfs.high = 0;
+	}
+#endif
+
+	DWord(word low, word high)
+	{
+		m_halfs.low = low;
+		m_halfs.high = high;
+	}
+
+	static DWord Multiply(word a, word b)	// returns the full double-width product a*b
+	{
+		DWord r;
+		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+			r.m_whole = (dword)a * b;
+		#elif defined(__alpha__)
+			r.m_halfs.low = a*b; __asm__("umulh %1,%2,%0" : "=r" (r.m_halfs.high) : "r" (a), "r" (b));
+		#elif defined(__ia64__)
+			r.m_halfs.low = a*b; __asm__("xmpy.hu %0=%1,%2" : "=f" (r.m_halfs.high) : "f" (a), "f" (b));
+		#elif defined(_ARCH_PPC64)
+			r.m_halfs.low = a*b; __asm__("mulhdu %0,%1,%2" : "=r" (r.m_halfs.high) : "r" (a), "r" (b) : "cc");
+		#elif defined(__x86_64__)	// mulq: high half of the product lands in rdx ("=d"), low half in rax ("=a")
+			__asm__("mulq %3" : "=d" (r.m_halfs.high), "=a" (r.m_halfs.low) : "a" (a), "rm" (b) : "cc");
+		#elif defined(__mips64)
+			__asm__("dmultu %2,%3" : "=h" (r.m_halfs.high), "=l" (r.m_halfs.low) : "r" (a), "r" (b));
+		#elif defined(_M_IX86)
+			// for testing the non-native path on 32-bit x86: split a 64-bit product into halves
+			word64 t = (word64)a * b;
+			r.m_halfs.high = ((word32 *)(&t))[1];
+			r.m_halfs.low = (word32)t;
+		#else
+			#error can not implement DWord
+		#endif
+		return r;
+	}
+
+	static DWord MultiplyAndAdd(word a, word b, word c)
+	{
+		DWord r = Multiply(a, b);
+		return r += c;
+	}
+
+	DWord & operator+=(word a)
+	{
+		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+			m_whole = m_whole + a;
+		#else
+			m_halfs.low += a;
+			m_halfs.high += (m_halfs.low < a);
+		#endif
+		return *this;
+	}
+
+	DWord operator+(word a)
+	{
+		DWord r;
+		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+			r.m_whole = m_whole + a;
+		#else
+			r.m_halfs.low = m_halfs.low + a;
+			r.m_halfs.high = m_halfs.high + (r.m_halfs.low < a);
+		#endif
+		return r;
+	}
+
+	DWord operator-(DWord a)
+	{
+		DWord r;
+		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+			r.m_whole = m_whole - a.m_whole;
+		#else
+			r.m_halfs.low = m_halfs.low - a.m_halfs.low;
+			r.m_halfs.high = m_halfs.high - a.m_halfs.high - (r.m_halfs.low > m_halfs.low);
+		#endif
+		return r;
+	}
+
+	DWord operator-(word a)
+	{
+		DWord r;
+		#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+			r.m_whole = m_whole - a;
+		#else
+			r.m_halfs.low = m_halfs.low - a;
+			r.m_halfs.high = m_halfs.high - (r.m_halfs.low > m_halfs.low);
+		#endif
+		return r;
+	}
+
+	// returns quotient, which must fit in a word
+	word operator/(word divisor);
+
+	word operator%(word a);
+
+	bool operator!() const
+	{
+	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+		return !m_whole;
+	#else
+		return !m_halfs.high && !m_halfs.low;
+	#endif
+	}
+
+	word GetLowHalf() const {return m_halfs.low;}
+	word GetHighHalf() const {return m_halfs.high;}
+	word GetHighHalfAsBorrow() const {return 0-m_halfs.high;}
+
+private:
+	union
+	{
+	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+		dword m_whole;
+	#endif
+		struct
+		{
+		#ifdef IS_LITTLE_ENDIAN
+			word low;
+			word high;
+		#else
+			word high;
+			word low;
+		#endif
+		} m_halfs;
+	};
+};
+
+class Word
+{
+public:
+	Word() {}
+
+	Word(word value)
+	{
+		m_whole = value;
+	}
+
+	Word(hword low, hword high)
+	{
+		m_whole = low | (word(high) << (WORD_BITS/2));
+	}
+
+	static Word Multiply(hword a, hword b)
+	{
+		Word r;
+		r.m_whole = (word)a * b;
+		return r;
+	}
+
+	Word operator-(Word a)
+	{
+		Word r;
+		r.m_whole = m_whole - a.m_whole;
+		return r;
+	}
+
+	Word operator-(hword a)
+	{
+		Word r;
+		r.m_whole = m_whole - a;
+		return r;
+	}
+
+	// returns quotient, which must fit in a hword
+	hword operator/(hword divisor)
+	{
+		return hword(m_whole / divisor);
+	}
+
+	bool operator!() const
+	{
+		return !m_whole;
+	}
+
+	word GetWhole() const {return m_whole;}
+	hword GetLowHalf() const {return hword(m_whole);}
+	hword GetHighHalf() const {return hword(m_whole>>(WORD_BITS/2));}
+	hword GetHighHalfAsBorrow() const {return 0-hword(m_whole>>(WORD_BITS/2));}
+	
+private:
+	word m_whole;
+};
+
+// do a 3 word by 2 word divide, returns quotient and leaves remainder in A
+template <class S, class D>
+S DivideThreeWordsByTwo(S *A, S B0, S B1, D *dummy=NULL)
+{
+	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a S
+	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
+
+	// estimate the quotient: do a 2 S by 1 S divide
+	S Q;
+	if (S(B1+1) == 0)
+		Q = A[2];
+	else
+		Q = D(A[1], A[2]) / S(B1+1);
+
+	// now subtract Q*B from A
+	D p = D::Multiply(B0, Q);
+	D u = (D) A[0] - p.GetLowHalf();
+	A[0] = u.GetLowHalf();
+	u = (D) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - D::Multiply(B1, Q);
+	A[1] = u.GetLowHalf();
+	A[2] += u.GetHighHalf();
+
+	// Q <= actual quotient, so fix it
+	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
+	{
+		u = (D) A[0] - B0;
+		A[0] = u.GetLowHalf();
+		u = (D) A[1] - B1 - u.GetHighHalfAsBorrow();
+		A[1] = u.GetLowHalf();
+		A[2] += u.GetHighHalf();
+		Q++;
+		assert(Q);	// shouldn't overflow
+	}
+
+	return Q;
+}
+
+// do a 4 word by 2 word divide of {Ah,Al} by B, returns the 2 word quotient as a D; T[0..3] is work space
+template <class S, class D>
+inline D DivideFourWordsByTwo(S *T, const D &Al, const D &Ah, const D &B)
+{
+	if (!B) // if divisor is 0, we assume divisor==2**(2*WORD_BITS)
+		return D(Ah.GetLowHalf(), Ah.GetHighHalf());	// quotient is the high two words of the dividend; T is not written
+	else
+	{
+		S Q[2];
+		T[0] = Al.GetLowHalf();	// load the dividend into T, least significant word first
+		T[1] = Al.GetHighHalf(); 
+		T[2] = Ah.GetLowHalf();
+		T[3] = Ah.GetHighHalf();
+		Q[1] = DivideThreeWordsByTwo<S, D>(T+1, B.GetLowHalf(), B.GetHighHalf());	// high quotient word; partial remainder stays in T[1..2]
+		Q[0] = DivideThreeWordsByTwo<S, D>(T, B.GetLowHalf(), B.GetHighHalf());	// low quotient word; final remainder stays in T[0..1]
+		return D(Q[0], Q[1]);
+	}
+}
+
+// returns quotient, which must fit in a word
+inline word DWord::operator/(word a)
+{
+	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+		return word(m_whole / a);
+	#else
+		hword r[4];
+		return DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a).GetWhole();
+	#endif
+}
+
+inline word DWord::operator%(word a)	// returns the remainder of this double word divided by a
+{
+	#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
+		return word(m_whole % a);
+	#else
+		if (a < (word(1) << (WORD_BITS/2)))	// divisor fits in a hword: reduce half a word at a time, most significant first
+		{
+			hword h = hword(a);
+			word r = m_halfs.high % h;	// remainder of the high word
+			r = ((m_halfs.low >> (WORD_BITS/2)) + (r << (WORD_BITS/2))) % h;	// bring in the upper half of the low word
+			return hword((hword(m_halfs.low) + (r << (WORD_BITS/2))) % h);	// bring in the lower half of the low word
+		}
+		else
+		{
+			hword r[4];	// work space; receives the remainder, least significant hword first
+			DivideFourWordsByTwo<hword, Word>(r, m_halfs.low, m_halfs.high, a);	// quotient is discarded
+			return Word(r[0], r[1]).GetWhole();
+		}
+	#endif
 }
 
 // ********************************************************
@@ -162,69 +432,30 @@ word Portable::Add(word *C, const word *A, const word *B, unsigned int N)
 {
 	assert (N%2 == 0);
 
-#ifdef IS_LITTLE_ENDIAN
-	if (sizeof(dword) == sizeof(size_t))	// dword is only register size
+	DWord u(0, 0);
+	for (unsigned int i = 0; i < N; i+=2)
 	{
-		dword carry = 0;
-		N >>= 1;
-		for (unsigned int i = 0; i < N; i++)
-		{
-			dword a = ((const dword *)A)[i] + carry;
-			dword c = a + ((const dword *)B)[i];
-			((dword *)C)[i] = c;
-			carry = (a < carry) | (c < a);
-		}
-		return (word)carry;
-	}
-	else
-#endif
-	{
-		word carry = 0;
-		for (unsigned int i = 0; i < N; i+=2)
-		{
-			dword u = (dword) carry + A[i] + B[i];
-			C[i] = LOW_WORD(u);
-			u = (dword) HIGH_WORD(u) + A[i+1] + B[i+1];
-			C[i+1] = LOW_WORD(u);
-			carry = HIGH_WORD(u);
-		}
-		return carry;
+		u = DWord(A[i]) + B[i] + u.GetHighHalf();
+		C[i] = u.GetLowHalf();
+		u = DWord(A[i+1]) + B[i+1] + u.GetHighHalf();
+		C[i+1] = u.GetLowHalf();
 	}
+	return u.GetHighHalf();
 }
 
 word Portable::Subtract(word *C, const word *A, const word *B, unsigned int N)
 {
 	assert (N%2 == 0);
 
-#ifdef IS_LITTLE_ENDIAN
-	if (sizeof(dword) == sizeof(size_t))	// dword is only register size
+	DWord u(0, 0);
+	for (unsigned int i = 0; i < N; i+=2)
 	{
-		dword borrow = 0;
-		N >>= 1;
-		for (unsigned int i = 0; i < N; i++)
-		{
-			dword a = ((const dword *)A)[i];
-			dword b = a - borrow;
-			dword c = b - ((const dword *)B)[i];
-			((dword *)C)[i] = c;
-			borrow = (b > a) | (c > b);
-		}
-		return (word)borrow;
-	}
-	else
-#endif
-	{
-		word borrow=0;
-		for (unsigned i = 0; i < N; i+=2)
-		{
-			dword u = (dword) A[i] - B[i] - borrow;
-			C[i] = LOW_WORD(u);
-			u = (dword) A[i+1] - B[i+1] - (word)(0-HIGH_WORD(u));
-			C[i+1] = LOW_WORD(u);
-			borrow = 0-HIGH_WORD(u);
-		}
-		return borrow;
+		u = (DWord) A[i] - B[i] - u.GetHighHalfAsBorrow();
+		C[i] = u.GetLowHalf();
+		u = (DWord) A[i+1] - B[i+1] - u.GetHighHalfAsBorrow();
+		C[i+1] = u.GetLowHalf();
 	}
+	return 0-u.GetHighHalf();
 }
 
 void Portable::Multiply2(word *C, const word *A, const word *B)
@@ -261,38 +492,28 @@ void Portable::Multiply2(word *C, const word *A, const word *B)
 	unsigned int ai = A[1] < A[0];
 	unsigned int bi = B[0] < B[1];
 	unsigned int di = ai & bi;
-	dword d = (dword)D[di]*D[di+2];
+	DWord d = DWord::Multiply(D[di], D[di+2]);
 	D[1] = D[3] = 0;
 	unsigned int si = ai + !bi;
 	word s = D[si];
 
-	dword A0B0 = (dword)A[0]*B[0];
-	C[0] = LOW_WORD(A0B0);
+	DWord A0B0 = DWord::Multiply(A[0], B[0]);
+	C[0] = A0B0.GetLowHalf();
 
-	dword A1B1 = (dword)A[1]*B[1];
-	dword t = (dword) HIGH_WORD(A0B0) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1);
-	C[1] = LOW_WORD(t);
+	DWord A1B1 = DWord::Multiply(A[1], B[1]);
+	DWord t = (DWord) A0B0.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf();
+	C[1] = t.GetLowHalf();
 
-	t = A1B1 + HIGH_WORD(t) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s;
-	C[2] = LOW_WORD(t);
-	C[3] = HIGH_WORD(t);
+	t = A1B1 + t.GetHighHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s;
+	C[2] = t.GetLowHalf();
+	C[3] = t.GetHighHalf();
 }
 
 inline void Portable::Multiply2Bottom(word *C, const word *A, const word *B)
 {
-#ifdef IS_LITTLE_ENDIAN
-	if (sizeof(dword) == sizeof(size_t))
-	{
-		dword a = *(const dword *)A, b = *(const dword *)B;
-		((dword *)C)[0] = a*b;
-	}
-	else
-#endif
-	{
-		dword t = (dword)A[0]*B[0];
-		C[0] = LOW_WORD(t);
-		C[1] = HIGH_WORD(t) + A[0]*B[1] + A[1]*B[0];
-	}
+	DWord t = DWord::Multiply(A[0], B[0]);
+	C[0] = t.GetLowHalf();
+	C[1] = t.GetHighHalf() + A[0]*B[1] + A[1]*B[0];
 }
 
 word Portable::Multiply2Add(word *C, const word *A, const word *B)
@@ -301,77 +522,77 @@ word Portable::Multiply2Add(word *C, const word *A, const word *B)
 	unsigned int ai = A[1] < A[0];
 	unsigned int bi = B[0] < B[1];
 	unsigned int di = ai & bi;
-	dword d = (dword)D[di]*D[di+2];
+	DWord d = DWord::Multiply(D[di], D[di+2]);
 	D[1] = D[3] = 0;
 	unsigned int si = ai + !bi;
 	word s = D[si];
 
-	dword A0B0 = (dword)A[0]*B[0];
-	dword t = A0B0 + C[0];
-	C[0] = LOW_WORD(t);
+	DWord A0B0 = DWord::Multiply(A[0], B[0]);
+	DWord t = A0B0 + C[0];
+	C[0] = t.GetLowHalf();
 
-	dword A1B1 = (dword)A[1]*B[1];
-	t = (dword) HIGH_WORD(t) + LOW_WORD(A0B0) + LOW_WORD(d) + LOW_WORD(A1B1) + C[1];
-	C[1] = LOW_WORD(t);
+	DWord A1B1 = DWord::Multiply(A[1], B[1]);
+	t = (DWord) t.GetHighHalf() + A0B0.GetLowHalf() + d.GetLowHalf() + A1B1.GetLowHalf() + C[1];
+	C[1] = t.GetLowHalf();
 
-	t = (dword) HIGH_WORD(t) + LOW_WORD(A1B1) + HIGH_WORD(A0B0) + HIGH_WORD(d) + HIGH_WORD(A1B1) - s + C[2];
-	C[2] = LOW_WORD(t);
+	t = (DWord) t.GetHighHalf() + A1B1.GetLowHalf() + A0B0.GetHighHalf() + d.GetHighHalf() + A1B1.GetHighHalf() - s + C[2];
+	C[2] = t.GetLowHalf();
 
-	t = (dword) HIGH_WORD(t) + HIGH_WORD(A1B1) + C[3];
-	C[3] = LOW_WORD(t);
-	return HIGH_WORD(t);
+	t = (DWord) t.GetHighHalf() + A1B1.GetHighHalf() + C[3];
+	C[3] = t.GetLowHalf();
+	return t.GetHighHalf();
 }
 
 #define MulAcc(x, y)								\
-	p = (dword)A[x] * B[y] + c; 					\
-	c = LOW_WORD(p);								\
-	p = (dword)d + HIGH_WORD(p);					\
-	d = LOW_WORD(p);								\
-	e += HIGH_WORD(p);
+	p = DWord::MultiplyAndAdd(A[x], B[y], c);		\
+	c = p.GetLowHalf();								\
+	p = (DWord) d + p.GetHighHalf();					\
+	d = p.GetLowHalf();								\
+	e += p.GetHighHalf();
 
 #define SaveMulAcc(s, x, y) 						\
 	R[s] = c;										\
-	p = (dword)A[x] * B[y] + d; 					\
-	c = LOW_WORD(p);								\
-	p = (dword)e + HIGH_WORD(p);					\
-	d = LOW_WORD(p);								\
-	e = HIGH_WORD(p);
+	p = DWord::MultiplyAndAdd(A[x], B[y], d);				\
+	c = p.GetLowHalf();								\
+	p = (DWord) e + p.GetHighHalf();					\
+	d = p.GetLowHalf();								\
+	e = p.GetHighHalf();
 
 #define SquAcc(x, y)								\
-	q = (dword)A[x] * A[y];	\
+	q = DWord::Multiply(A[x], A[y]);	\
 	p = q + c; 					\
-	c = LOW_WORD(p);								\
-	p = (dword)d + HIGH_WORD(p);					\
-	d = LOW_WORD(p);								\
-	e += HIGH_WORD(p);			\
+	c = p.GetLowHalf();								\
+	p = (DWord) d + p.GetHighHalf();					\
+	d = p.GetLowHalf();								\
+	e += p.GetHighHalf();			\
 	p = q + c; 					\
-	c = LOW_WORD(p);								\
-	p = (dword)d + HIGH_WORD(p);					\
-	d = LOW_WORD(p);								\
-	e += HIGH_WORD(p);
+	c = p.GetLowHalf();								\
+	p = (DWord) d + p.GetHighHalf();					\
+	d = p.GetLowHalf();								\
+	e += p.GetHighHalf();
 
 #define SaveSquAcc(s, x, y) 						\
 	R[s] = c;										\
-	q = (dword)A[x] * A[y];	\
+	q = DWord::Multiply(A[x], A[y]);	\
 	p = q + d; 					\
-	c = LOW_WORD(p);								\
-	p = (dword)e + HIGH_WORD(p);					\
-	d = LOW_WORD(p);								\
-	e = HIGH_WORD(p);			\
+	c = p.GetLowHalf();								\
+	p = (DWord) e + p.GetHighHalf();					\
+	d = p.GetLowHalf();								\
+	e = p.GetHighHalf();			\
 	p = q + c; 					\
-	c = LOW_WORD(p);								\
-	p = (dword)d + HIGH_WORD(p);					\
-	d = LOW_WORD(p);								\
-	e += HIGH_WORD(p);
+	c = p.GetLowHalf();								\
+	p = (DWord) d + p.GetHighHalf();					\
+	d = p.GetLowHalf();								\
+	e += p.GetHighHalf();
 
 void Portable::Multiply4(word *R, const word *A, const word *B)
 {
-	dword p;
+	DWord p;
 	word c, d, e;
 
-	p = (dword)A[0] * B[0];
-	R[0] = LOW_WORD(p);
-	c = HIGH_WORD(p);
+	p = DWord::Multiply(A[0], B[0]);
+	R[0] = p.GetLowHalf();
+	c = p.GetHighHalf();
 	d = e = 0;
 
 	MulAcc(0, 1);
@@ -394,38 +615,38 @@ void Portable::Multiply4(word *R, const word *A, const word *B)
 	MulAcc(3, 2);
 
 	R[5] = c;
-	p = (dword)A[3] * B[3] + d;
-	R[6] = LOW_WORD(p);
-	R[7] = e + HIGH_WORD(p);
+	p = DWord::MultiplyAndAdd(A[3], B[3], d);
+	R[6] = p.GetLowHalf();
+	R[7] = e + p.GetHighHalf();
 }
 
 void Portable::Square2(word *R, const word *A)
 {
-	dword p, q;
+	DWord p, q;
 	word c, d, e;
 
-	p = (dword)A[0] * A[0];
-	R[0] = LOW_WORD(p);
-	c = HIGH_WORD(p);
+	p = DWord::Multiply(A[0], A[0]);
+	R[0] = p.GetLowHalf();
+	c = p.GetHighHalf();
 	d = e = 0;
 
 	SquAcc(0, 1);
 
 	R[1] = c;
-	p = (dword)A[1] * A[1] + d;
-	R[2] = LOW_WORD(p);
-	R[3] = e + HIGH_WORD(p);
+	p = DWord::MultiplyAndAdd(A[1], A[1], d);
+	R[2] = p.GetLowHalf();
+	R[3] = e + p.GetHighHalf();
 }
 
 void Portable::Square4(word *R, const word *A)
 {
 	const word *B = A;
-	dword p, q;
+	DWord p, q;
 	word c, d, e;
 
-	p = (dword)A[0] * A[0];
-	R[0] = LOW_WORD(p);
-	c = HIGH_WORD(p);
+	p = DWord::Multiply(A[0], A[0]);
+	R[0] = p.GetLowHalf();
+	c = p.GetHighHalf();
 	d = e = 0;
 
 	SquAcc(0, 1);
@@ -442,19 +663,19 @@ void Portable::Square4(word *R, const word *A)
 	SaveSquAcc(4, 2, 3);
 
 	R[5] = c;
-	p = (dword)A[3] * A[3] + d;
-	R[6] = LOW_WORD(p);
-	R[7] = e + HIGH_WORD(p);
+	p = DWord::MultiplyAndAdd(A[3], A[3], d);
+	R[6] = p.GetLowHalf();
+	R[7] = e + p.GetHighHalf();
 }
 
 void Portable::Multiply8(word *R, const word *A, const word *B)
 {
-	dword p;
+	DWord p;
 	word c, d, e;
 
-	p = (dword)A[0] * B[0];
-	R[0] = LOW_WORD(p);
-	c = HIGH_WORD(p);
+	p = DWord::Multiply(A[0], B[0]);
+	R[0] = p.GetLowHalf();
+	c = p.GetHighHalf();
 	d = e = 0;
 
 	MulAcc(0, 1);
@@ -533,19 +754,19 @@ void Portable::Multiply8(word *R, const word *A, const word *B)
 	MulAcc(7, 6);
 
 	R[13] = c;
-	p = (dword)A[7] * B[7] + d;
-	R[14] = LOW_WORD(p);
-	R[15] = e + HIGH_WORD(p);
+	p = DWord::MultiplyAndAdd(A[7], B[7], d);
+	R[14] = p.GetLowHalf();
+	R[15] = e + p.GetHighHalf();
 }
 
 void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
 {
-	dword p;
+	DWord p;
 	word c, d, e;
 
-	p = (dword)A[0] * B[0];
-	R[0] = LOW_WORD(p);
-	c = HIGH_WORD(p);
+	p = DWord::Multiply(A[0], B[0]);
+	R[0] = p.GetLowHalf();
+	c = p.GetHighHalf();
 	d = e = 0;
 
 	MulAcc(0, 1);
@@ -561,12 +782,12 @@ void Portable::Multiply4Bottom(word *R, const word *A, const word *B)
 
 void Portable::Multiply8Bottom(word *R, const word *A, const word *B)
 {
-	dword p;
+	DWord p;
 	word c, d, e;
 
-	p = (dword)A[0] * B[0];
-	R[0] = LOW_WORD(p);
-	c = HIGH_WORD(p);
+	p = DWord::Multiply(A[0], B[0]);
+	R[0] = p.GetLowHalf();
+	c = p.GetHighHalf();
 	d = e = 0;
 
 	MulAcc(0, 1);
@@ -620,6 +841,7 @@ class PentiumOptimized : public Portable
 public:
 	static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
 	static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
+// TODO test this with .NET #if _MSC_VER < 1300
 	static inline void Square4(word *R, const word *A)
 	{
 		// VC60 workaround: MSVC 6.0 has an optimization bug that makes
@@ -628,6 +850,7 @@ public:
 		// bug is fixed.
 		Multiply4(R, A, A);
 	}
+//#endif
 };
 
 typedef PentiumOptimized LowLevel;
@@ -1703,88 +1926,7 @@ void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
 	);
 }
 
-#elif defined(__GNUC__) && defined(CRYPTOPP_64BIT_CPU)
-
-#ifdef __alpha__
-#define MUL64x64(a, b, c, d) c = a*b; __asm__("umulh %1,%2,%0" : "=r" (d) : "r" (a), "r" (b))
-#elif defined(__ia64__)
-#define MUL64x64(a, b, c, d) c = a*b; __asm__("xmpy.hu %0=%1,%2" : "=f" (d) : "f" (a), "f" (b))
-#elif defined(_ARCH_PPC64)
-#define MUL64x64(a, b, c, d) c = a*b; __asm__("mulhdu %0,%1,%2" : "=r" (d) : "r" (a), "r" (b) : "cc")
-#elif defined(__x86_64__)
-#define MUL64x64(a, b, c, d) __asm__("mulq %3" : "=d" (d), "=a" (c) : "a" (a), "rm" (b) : "cc")
-#elif defined(__mips64)
-#define MUL64x64(a, b, c, d) __asm__("dmultu %2,%3" : "=h" (d), "=l" (c) : "r" (a), "r" (b))
-#elif defined(__sparc_v9__) || defined(__sparcv9) || defined(__sparc_v8__) || defined(__sparcv8)
-#define MUL64x64(a, b, c, d) __asm__("umul %2,%3,%1;rd %%y,%0" : "=r" (d), "=r" (c) : "r" (a), "r" (b))
-#endif
-
-class OptimizedFor64BitCPU : public Portable
-{
-public:
-	static inline void Multiply2(word *C, const word *A, const word *B);
-	static inline word Multiply2Add(word *C, const word *A, const word *B);
-	static inline void Multiply4(word *C, const word *A, const word *B);
-	static inline unsigned int MultiplyRecursionLimit() {return 4;}
-
-	static inline void Multiply4Bottom(word *C, const word *A, const word *B);
-	static inline unsigned int MultiplyBottomRecursionLimit() {return 4;}
-
-	static inline void Square4(word *R, const word *A)
-	{
-		Multiply4(R, A, A);
-	}
-};
-
-typedef OptimizedFor64BitCPU LowLevel;
-
-inline void OptimizedFor64BitCPU::Multiply2(word *C, const word *A, const word *B)
-{
-	register dword c, d, a = *(const dword *)A, b = *(const dword *)B;
-	MUL64x64(a, b, c, d);
-	((dword *)C)[0] = c;
-	((dword *)C)[1] = d;
-}
-
-inline word OptimizedFor64BitCPU::Multiply2Add(word *C, const word *A, const word *B)
-{
-	register dword c, d, e, a = *(const dword *)A, b = *(const dword *)B;
-	c = ((dword *)C)[0];
-	MUL64x64(a, b, d, e);
-	d += c;
-	((dword *)C)[0] = d;
-	d = (d < c);
-	c = ((dword *)C)[1] + d;
-	d = (c < d);
-	c += e;
-	((dword *)C)[1] = c;
-	d |= (c < e);
-	return d;
-}
-
-inline void OptimizedFor64BitCPU::Multiply4(word *R, const word *A, const word *B)
-{
-	Multiply2(R, A, B);
-	Multiply2(R+4, A+2, B+2);
-	word carry = Multiply2Add(R+2, A+0, B+2);
-	carry += Multiply2Add(R+2, A+2, B+0);
-	Increment(R+6, 2, carry);
-}
-
-static inline void Multiply2BottomAdd(word *C, const word *A, const word *B)
-{
-	register dword a = *(const dword *)A, b = *(const dword *)B;
-	((dword *)C)[0] = a*b + ((dword *)C)[0];
-}
-
-inline void OptimizedFor64BitCPU::Multiply4Bottom(word *R, const word *A, const word *B)
-{
-	Multiply2(R, A, B);
-	Multiply2BottomAdd(R+2, A+0, B+2);
-	Multiply2BottomAdd(R+2, A+2, B+0);
-}
-
-#else	// no processor specific code available
+#else	// no processor specific code at this layer
 
 typedef Portable LowLevel;
 
@@ -1970,13 +2112,12 @@ void RecursiveMultiplyTop(word *R, word *T, const word *L, const word *A, const
 	if (N==4)
 	{
 		P::Multiply4(T, A, B);
-		((dword *)R)[0] = ((dword *)T)[2];
-		((dword *)R)[1] = ((dword *)T)[3];
+		memcpy(R, T+4, 4*WORD_SIZE);
 	}
 	else if (N==2)
 	{
 		P::Multiply2(T, A, B);
-		((dword *)R)[0] = ((dword *)T)[1];
+		memcpy(R, T+2, 2*WORD_SIZE);
 	}
 	else
 	{
@@ -2088,6 +2229,18 @@ inline void MultiplyTop(word *R, word *T, const word *L, const word *A, const wo
 		RecursiveMultiplyTop<LowLevel>(R, T, L, A, B, N);
 }
 
+static word LinearMultiply(word *C, const word *A, word B, unsigned int N)	// C[0..N-1] = low N words of A[0..N-1]*B; returns the carry-out word
+{
+	word carry=0;
+	for(unsigned i=0; i<N; i++)
+	{
+		DWord p = DWord::MultiplyAndAdd(A[i], B, carry);	// A[i]*B plus the carry from the previous word
+		C[i] = p.GetLowHalf();
+		carry = p.GetHighHalf();	// high half carries into the next word
+	}
+	return carry;
+}
+
 // R[NA+NB] - result = A*B
 // T[NA+NB] - temporary work space
 // A[NA] ---- multiplier
@@ -2153,7 +2306,14 @@ void AsymmetricMultiply(word *R, word *T, const word *A, unsigned int NA, const
 void RecursiveInverseModPower2(word *R, word *T, const word *A, unsigned int N)
 {
 	if (N==2)
-		AtomicInverseModPower2(R, A[0], A[1]);
+	{
+		T[0] = AtomicInverseModPower2(A[0]);
+		T[1] = 0;
+		LowLevel::Multiply2Bottom(T+2, T, A);
+		TwosComplement(T+2, 2);
+		Increment(T+2, 2, 2);
+		LowLevel::Multiply2Bottom(R, T, T+2);
+	}
 	else
 	{
 		const unsigned int N2 = N/2;
@@ -2255,37 +2415,36 @@ void HalfMontgomeryReduce(word *R, word *T, const word *X, const word *M, const
 #undef R2
 #undef R3
 
+/*
 // do a 3 word by 2 word divide, returns quotient and leaves remainder in A
 static word SubatomicDivide(word *A, word B0, word B1)
 {
 	// assert {A[2],A[1]} < {B1,B0}, so quotient can fit in a word
 	assert(A[2] < B1 || (A[2]==B1 && A[1] < B0));
 
-	dword p, u;
-	word Q;
-
 	// estimate the quotient: do a 2 word by 1 word divide
+	word Q;
 	if (B1+1 == 0)
 		Q = A[2];
 	else
-		Q = word(MAKE_DWORD(A[1], A[2]) / (B1+1));
+		Q = DWord(A[1], A[2]).DividedBy(B1+1);
 
 	// now subtract Q*B from A
-	p = (dword) B0*Q;
-	u = (dword) A[0] - LOW_WORD(p);
-	A[0] = LOW_WORD(u);
-	u = (dword) A[1] - HIGH_WORD(p) - (word)(0-HIGH_WORD(u)) - (dword)B1*Q;
-	A[1] = LOW_WORD(u);
-	A[2] += HIGH_WORD(u);
+	DWord p = DWord::Multiply(B0, Q);
+	DWord u = (DWord) A[0] - p.GetLowHalf();
+	A[0] = u.GetLowHalf();
+	u = (DWord) A[1] - p.GetHighHalf() - u.GetHighHalfAsBorrow() - DWord::Multiply(B1, Q);
+	A[1] = u.GetLowHalf();
+	A[2] += u.GetHighHalf();
 
 	// Q <= actual quotient, so fix it
 	while (A[2] || A[1] > B1 || (A[1]==B1 && A[0]>=B0))
 	{
-		u = (dword) A[0] - B0;
-		A[0] = LOW_WORD(u);
-		u = (dword) A[1] - B1 - (word)(0-HIGH_WORD(u));
-		A[1] = LOW_WORD(u);
-		A[2] += HIGH_WORD(u);
+		u = (DWord) A[0] - B0;
+		A[0] = u.GetLowHalf();
+		u = (DWord) A[1] - B1 - u.GetHighHalfAsBorrow();
+		A[1] = u.GetLowHalf();
+		A[2] += u.GetHighHalf();
 		Q++;
 		assert(Q);	// shouldn't overflow
 	}
@@ -2318,6 +2477,27 @@ static inline void AtomicDivide(word *Q, const word *A, const word *B)
 #endif
 	}
 }
+*/
+
+static inline void AtomicDivide(word *Q, const word *A, const word *B)	// Q[0..1] = quotient of A[0..3] divided by B[0..1]
+{
+	word T[4];	// work space for the division; holds the remainder afterwards when B is nonzero
+	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
+	Q[0] = q.GetLowHalf();
+	Q[1] = q.GetHighHalf();
+
+#ifndef NDEBUG
+	if (B[0] || B[1])	// skip the check for a zero divisor: DivideFourWordsByTwo treats it specially and leaves T unwritten
+	{
+		// multiply quotient and divisor and add remainder, make sure it equals dividend
+		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
+		word P[4];
+		Portable::Multiply2(P, Q, B);
+		Add(P, P, T, 4);
+		assert(memcmp(P, A, 4*WORD_SIZE)==0);
+	}
+#endif
+}
 
 // for use by Divide(), corrects the underestimated quotient {Q1,Q0}
 static void CorrectQuotientEstimate(word *R, word *T, word *Q, const word *B, unsigned int N)
@@ -2570,6 +2750,13 @@ Integer::Integer(const Integer& t)
 	CopyWords(reg, t.reg, reg.size());
 }
 
+Integer::Integer(Sign s, lword value)	// builds a two-word Integer with sign s and magnitude value
+	: reg(2), sign(s)
+{
+	reg[0] = word(value);	// low word: value converted (truncated if wider) to word
+	reg[1] = word(SafeRightShift<WORD_BITS>(value));	// next word: value shifted right by WORD_BITS; presumably SafeRightShift yields 0 when the shift covers the whole type -- confirm in misc.h
+}
+
 Integer::Integer(signed long value)
 	: reg(2)
 {
@@ -2581,7 +2768,7 @@ Integer::Integer(signed long value)
 		value = -value;
 	}
 	reg[0] = word(value);
-	reg[1] = word(SafeRightShift<WORD_BITS, unsigned long>(value));
+	reg[1] = word(SafeRightShift<WORD_BITS>((unsigned long)value));
 }
 
 Integer::Integer(Sign s, word high, word low)
@@ -2877,13 +3064,13 @@ void Integer::Decode(BufferedTransformation &bt, unsigned int inputLen, Signedne
 	for (unsigned int i=inputLen; i > 0; i--)
 	{
 		bt.Get(b);
-		reg[(i-1)/WORD_SIZE] |= b << ((i-1)%WORD_SIZE)*8;
+		reg[(i-1)/WORD_SIZE] |= word(b) << ((i-1)%WORD_SIZE)*8;
 	}
 
 	if (sign == NEGATIVE)
 	{
 		for (unsigned i=inputLen; i<reg.size()*WORD_SIZE; i++)
-			reg[i/WORD_SIZE] |= 0xff << (i%WORD_SIZE)*8;
+			reg[i/WORD_SIZE] |= word(0xff) << (i%WORD_SIZE)*8;
 		TwosComplement(reg, reg.size());
 	}
 }
@@ -3598,8 +3785,8 @@ void Integer::Divide(word &remainder, Integer &quotient, const Integer &dividend
 	remainder = 0;
 	while (i--)
 	{
-		quotient.reg[i] = word(MAKE_DWORD(dividend.reg[i], remainder) / divisor);
-		remainder = word(MAKE_DWORD(dividend.reg[i], remainder) % divisor);
+		quotient.reg[i] = DWord(dividend.reg[i], remainder) / divisor;
+		remainder = DWord(dividend.reg[i], remainder) % divisor;
 	}
 
 	if (dividend.NotNegative())
@@ -3640,16 +3827,16 @@ word Integer::Modulo(word divisor) const
 
 		if (divisor <= 5)
 		{
-			dword sum=0;
+			DWord sum(0, 0);
 			while (i--)
 				sum += reg[i];
-			remainder = word(sum%divisor);
+			remainder = sum % divisor;
 		}
 		else
 		{
 			remainder = 0;
 			while (i--)
-				remainder = word(MAKE_DWORD(reg[i], remainder) % divisor);
+				remainder = DWord(reg[i], remainder) % divisor;
 		}
 	}
 
diff --git a/integer.h b/integer.h
index b8284614..72a962e3 100644
--- a/integer.h
+++ b/integer.h
@@ -100,6 +100,9 @@ public:
 		//! convert from signed long
 		Integer(signed long value);
 
+		//! convert from lword
+		Integer(Sign s, lword value);
+
 		//! convert from two words
 		Integer(Sign s, word highWord, word lowWord);
 
diff --git a/misc.h b/misc.h
index f6918b83..790e459b 100644
--- a/misc.h
+++ b/misc.h
@@ -100,6 +100,11 @@ inline unsigned int BitsToWords(unsigned int bitCount)
 	return ((bitCount+WORD_BITS-1)/(WORD_BITS));
 }
 
+inline unsigned int BitsToDwords(unsigned int bitCount)	// number of (2*WORD_BITS)-bit units needed to hold bitCount bits, rounded up
+{
+	return ((bitCount+2*WORD_BITS-1)/(2*WORD_BITS));	// NOTE(review): the unsigned addition wraps when bitCount is within 2*WORD_BITS-1 of the maximum unsigned int
+}
+
 CRYPTOPP_DLL void xorbuf(byte *buf, const byte *mask, unsigned int count);
 CRYPTOPP_DLL void xorbuf(byte *output, const byte *input, const byte *mask, unsigned int count);
 
@@ -142,7 +147,7 @@ inline unsigned int GetAlignment(T *dummy=NULL)	// VC60 workaround
 
 inline bool IsAlignedOn(const void *p, unsigned int alignment)
 {
-	return IsPowerOf2(alignment) ? ModPowerOf2((unsigned int)p, alignment) == 0 : (unsigned int)p % alignment == 0;
+	return IsPowerOf2(alignment) ? ModPowerOf2((size_t)p, alignment) == 0 : (size_t)p % alignment == 0;	// the address is converted to size_t rather than unsigned int, so a pointer wider than unsigned int is not cut down before the remainder test
 }
 
 template <class T>
diff --git a/modes.cpp b/modes.cpp
index 0d163cb2..d0fc7d2a 100644
--- a/modes.cpp
+++ b/modes.cpp
@@ -38,7 +38,7 @@ void CipherModeBase::GetNextIV(byte *IV)
 	memcpy(IV, m_register, BlockSize());
 }
 
-void CTR_ModePolicy::SeekToIteration(dword iterationCount)
+void CTR_ModePolicy::SeekToIteration(lword iterationCount)
 {
 	int carry=0;
 	for (int i=BlockSize()-1; i>=0; i--)
diff --git a/modes.h b/modes.h
index 34dd21ab..8cdea99a 100644
--- a/modes.h
+++ b/modes.h
@@ -157,7 +157,7 @@ private:
 	bool CanOperateKeystream() const {return true;}
 	void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, unsigned int iterationCount);
 	void CipherResynchronize(byte *keystreamBuffer, const byte *iv);
-	void SeekToIteration(dword iterationCount);
+	void SeekToIteration(lword iterationCount);
 
 	inline void ProcessMultipleBlocks(byte *output, const byte *input, unsigned int n);
 
diff --git a/nbtheory.cpp b/nbtheory.cpp
index 8c2e0423..013e4419 100644
--- a/nbtheory.cpp
+++ b/nbtheory.cpp
@@ -15,12 +15,12 @@ NAMESPACE_BEGIN(CryptoPP)
 
 const word s_lastSmallPrime = 32719;
 
-std::vector<word> * NewPrimeTable()
+std::vector<word16> * NewPrimeTable()
 {
 	const unsigned int maxPrimeTableSize = 3511;
 
-	std::auto_ptr<std::vector<word> > pPrimeTable(new std::vector<word>);
-	std::vector<word> &primeTable = *pPrimeTable;
+	std::auto_ptr<std::vector<word16> > pPrimeTable(new std::vector<word16>);
+	std::vector<word16> &primeTable = *pPrimeTable;
 	primeTable.reserve(maxPrimeTableSize);
 
 	primeTable.push_back(2);
@@ -42,9 +42,9 @@ std::vector<word> * NewPrimeTable()
 	return pPrimeTable.release();
 }
 
-const word * GetPrimeTable(unsigned int &size)
+const word16 * GetPrimeTable(unsigned int &size)	// returns a pointer to the first table entry and stores the number of entries in size
 {
-	std::vector<word> &primeTable = StaticObject<std::vector<word> >(&NewPrimeTable);
+	std::vector<word16> &primeTable = StaticObject<std::vector<word16> >(&NewPrimeTable);	// table is obtained through StaticObject, with NewPrimeTable passed as its argument
 	size = primeTable.size();
 	return &primeTable[0];
 }
@@ -52,10 +52,10 @@ const word * GetPrimeTable(unsigned int &size)
 bool IsSmallPrime(const Integer &p)
 {
 	unsigned int primeTableSize;
-	const word * primeTable = GetPrimeTable(primeTableSize);
+	const word16 * primeTable = GetPrimeTable(primeTableSize);
 
 	if (p.IsPositive() && p <= primeTable[primeTableSize-1])
-		return std::binary_search(primeTable, primeTable+primeTableSize, (word)p.ConvertToLong());
+		return std::binary_search(primeTable, primeTable+primeTableSize, (word16)p.ConvertToLong());	// the narrowing cast is guarded by the test above: p is positive and no larger than the last table entry, itself a word16
 	else
 		return false;
 }
@@ -63,7 +63,7 @@ bool IsSmallPrime(const Integer &p)
 bool TrialDivision(const Integer &p, unsigned bound)
 {
 	unsigned int primeTableSize;
-	const word * primeTable = GetPrimeTable(primeTableSize);
+	const word16 * primeTable = GetPrimeTable(primeTableSize);
 
 	assert(primeTable[primeTableSize-1] >= bound);
 
@@ -81,7 +81,7 @@ bool TrialDivision(const Integer &p, unsigned bound)
 bool SmallDivisorsTest(const Integer &p)
 {
 	unsigned int primeTableSize;
-	const word * primeTable = GetPrimeTable(primeTableSize);
+	const word16 * primeTable = GetPrimeTable(primeTableSize);	// the last table entry is used as the bound passed to TrialDivision below
 	return !TrialDivision(p, primeTable[primeTableSize-1]);
 }
 
@@ -278,7 +278,7 @@ public:
 	bool NextCandidate(Integer &c);
 
 	void DoSieve();
-	static void SieveSingle(std::vector<bool> &sieve, word p, const Integer &first, const Integer &step, word stepInv);
+	static void SieveSingle(std::vector<bool> &sieve, word16 p, const Integer &first, const Integer &step, word16 stepInv);
 
 	Integer m_first, m_last, m_step;
 	signed int m_delta;
@@ -315,12 +315,12 @@ bool PrimeSieve::NextCandidate(Integer &c)
 	}
 }
 
-void PrimeSieve::SieveSingle(std::vector<bool> &sieve, word p, const Integer &first, const Integer &step, word stepInv)
+void PrimeSieve::SieveSingle(std::vector<bool> &sieve, word16 p, const Integer &first, const Integer &step, word16 stepInv)
 {
 	if (stepInv)
 	{
 		unsigned int sieveSize = sieve.size();
-		word j = word((dword(p-(first%p))*stepInv) % p);
+		word j = word((word32(p-(first%p))*stepInv) % p);
 		// if the first multiple of p is p, skip it
 		if (first.WordCount() <= 1 && first + step*j == p)
 			j += p;
@@ -332,7 +332,7 @@ void PrimeSieve::SieveSingle(std::vector<bool> &sieve, word p, const Integer &fi
 void PrimeSieve::DoSieve()
 {
 	unsigned int primeTableSize;
-	const word * primeTable = GetPrimeTable(primeTableSize);
+	const word16 * primeTable = GetPrimeTable(primeTableSize);
 
 	const unsigned int maxSieveSize = 32768;
 	unsigned int sieveSize = STDMIN(Integer(maxSieveSize), (m_last-m_first)/m_step+1).ConvertToLong();
@@ -352,11 +352,11 @@ void PrimeSieve::DoSieve()
 		Integer halfStep = m_step >> 1;
 		for (unsigned int i = 0; i < primeTableSize; ++i)
 		{
-			word p = primeTable[i];
-			word stepInv = m_step.InverseMod(p);
+			word16 p = primeTable[i];
+			word16 stepInv = m_step.InverseMod(p);
 			SieveSingle(m_sieve, p, m_first, m_step, stepInv);
 
-			word halfStepInv = 2*stepInv < p ? 2*stepInv : 2*stepInv-p;
+			word16 halfStepInv = 2*stepInv < p ? 2*stepInv : 2*stepInv-p;
 			SieveSingle(m_sieve, p, qFirst, halfStep, halfStepInv);
 		}
 	}
@@ -380,11 +380,11 @@ bool FirstPrime(Integer &p, const Integer &max, const Integer &equiv, const Inte
 	}
 
 	unsigned int primeTableSize;
-	const word * primeTable = GetPrimeTable(primeTableSize);
+	const word16 * primeTable = GetPrimeTable(primeTableSize);
 
 	if (p <= primeTable[primeTableSize-1])
 	{
-		const word *pItr;
+		const word16 *pItr;
 
 		--p;
 		if (p.IsPositive())
@@ -441,7 +441,7 @@ static bool ProvePrime(const Integer &p, const Integer &q)
 		return false;
 
 	unsigned int primeTableSize;
-	const word * primeTable = GetPrimeTable(primeTableSize);
+	const word16 * primeTable = GetPrimeTable(primeTableSize);
 
 	assert(primeTableSize >= 50);
 	for (int i=0; i<50; i++) 
@@ -499,7 +499,7 @@ Integer MaurerProvablePrime(RandomNumberGenerator &rng, unsigned int bits)
 	Integer p;
 
 	unsigned int primeTableSize;
-	const word * primeTable = GetPrimeTable(primeTableSize);
+	const word16 * primeTable = GetPrimeTable(primeTableSize);
 
 	if (bits < smallPrimeBound)
 	{
diff --git a/nbtheory.h b/nbtheory.h
index cb953f24..c731c508 100644
--- a/nbtheory.h
+++ b/nbtheory.h
@@ -9,7 +9,7 @@
 NAMESPACE_BEGIN(CryptoPP)
 
 // obtain pointer to small prime table and get its size
-CRYPTOPP_DLL const word * GetPrimeTable(unsigned int &size);
+CRYPTOPP_DLL const word16 * GetPrimeTable(unsigned int &size);
 
 // ************ primality testing ****************
 
diff --git a/seal.cpp b/seal.cpp
index a478f9a5..0962c67f 100644
--- a/seal.cpp
+++ b/seal.cpp
@@ -75,7 +75,7 @@ void SEAL_Policy<B>::CipherResynchronize(byte *keystreamBuffer, const byte *IV)
 }
 
 template <class B>
-void SEAL_Policy<B>::SeekToIteration(dword iterationCount)
+void SEAL_Policy<B>::SeekToIteration(lword iterationCount)
 {
 	m_outsideCounter = m_startCount + (unsigned int)(iterationCount / m_iterationsPerCount);
 	m_insideCounter = (unsigned int)(iterationCount % m_iterationsPerCount);
diff --git a/seal.h b/seal.h
index 2e190026..9157720a 100644
--- a/seal.h
+++ b/seal.h
@@ -23,7 +23,7 @@ protected:
 	void OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, unsigned int iterationCount);
 	void CipherResynchronize(byte *keystreamBuffer, const byte *IV);
 	bool IsRandomAccess() const {return true;}
-	void SeekToIteration(dword iterationCount);
+	void SeekToIteration(lword iterationCount);
 
 private:
 	FixedSizeSecBlock<word32, 512> m_T;
diff --git a/strciphr.cpp b/strciphr.cpp
index 1f03d4c5..3394b204 100644
--- a/strciphr.cpp
+++ b/strciphr.cpp
@@ -91,7 +91,7 @@ void AdditiveCipherTemplate<S>::Resynchronize(const byte *iv)
 }
 
 template <class BASE>
-void AdditiveCipherTemplate<BASE>::Seek(dword position)
+void AdditiveCipherTemplate<BASE>::Seek(lword position)
 {
 	PolicyInterface &policy = AccessPolicy();
 	unsigned int bytesPerIteration = policy.GetBytesPerIteration();
diff --git a/strciphr.h b/strciphr.h
index 3e0a739a..eb1d22fe 100644
--- a/strciphr.h
+++ b/strciphr.h
@@ -66,7 +66,7 @@ struct CRYPTOPP_DLL CRYPTOPP_NO_VTABLE AdditiveCipherAbstractPolicy
 	virtual void CipherSetKey(const NameValuePairs &params, const byte *key, unsigned int length) =0;
 	virtual void CipherResynchronize(byte *keystreamBuffer, const byte *iv) {throw NotImplemented("StreamTransformation: this object doesn't support resynchronization");}
 	virtual bool IsRandomAccess() const =0;
-	virtual void SeekToIteration(dword iterationCount) {assert(!IsRandomAccess()); throw NotImplemented("StreamTransformation: this object doesn't support random access");}
+	virtual void SeekToIteration(lword iterationCount) {assert(!IsRandomAccess()); throw NotImplemented("StreamTransformation: this object doesn't support random access");}
 };
 
 template <typename WT, unsigned int W, unsigned int X = 1, class BASE = AdditiveCipherAbstractPolicy>
@@ -130,7 +130,7 @@ public:
 	bool IsSelfInverting() const {return true;}
 	bool IsForwardTransformation() const {return true;}
 	bool IsRandomAccess() const {return GetPolicy().IsRandomAccess();}
-	void Seek(dword position);
+	void Seek(lword position);
 
 	typedef typename BASE::PolicyInterface PolicyInterface;
 
diff --git a/validat1.cpp b/validat1.cpp
index 1342de53..6c0aee3f 100644
--- a/validat1.cpp
+++ b/validat1.cpp
@@ -188,7 +188,7 @@ bool TestSettings()
 		pass = false;
 	}
 	cout << "sizeof(word64) == " << sizeof(word64) << endl;
-#else
+#elif defined(CRYPTOPP_NATIVE_DWORD_AVAILABLE)	// test with defined(): a bare macro name in #elif fails to preprocess when the macro is defined without a value
 	if (sizeof(dword) >= 8)
 	{
 		cout << "FAILED:  sizeof(dword) >= 8, but WORD64_AVAILABLE not defined" << endl;
@@ -198,6 +198,7 @@ bool TestSettings()
 		cout << "passed:  word64 not available" << endl;
 #endif
 
+#ifdef CRYPTOPP_NATIVE_DWORD_AVAILABLE
 	if (sizeof(dword) == 2*sizeof(word))
 		cout << "passed:  ";
 	else
@@ -206,16 +207,7 @@ bool TestSettings()
 		pass = false;
 	}
 	cout << "sizeof(word) == " << sizeof(word) << ", sizeof(dword) == " << sizeof(dword) << endl;
-
-	dword test = (dword(1)<<WORD_BITS) + 2;
-	if (HIGH_WORD(test) == 1 && LOW_WORD(test) == 2)
-		cout << "passed:  ";
-	else
-	{
-		cout << "FAILED:  ";
-		pass = false;
-	}
-	cout << "HIGH_WORD() and LOW_WORD() macros\n";
+#endif
 
 	if (!pass)
 	{