Merge branch 'master' into hmqv
commit
932801bd5f
465
cryptest.sh
465
cryptest.sh
|
|
@ -115,6 +115,9 @@ if [[ "$IS_SOLARIS" -ne "0" ]]; then
|
|||
else
|
||||
AWK=nawk;
|
||||
fi
|
||||
|
||||
DISASS=dis
|
||||
DISASSARGS=()
|
||||
fi
|
||||
|
||||
# Fixup
|
||||
|
|
@ -138,7 +141,9 @@ done
|
|||
# We need to use the C++ compiler to determine feature availablility. Otherwise
|
||||
# mis-detections occur on a number of platforms.
|
||||
if [[ ((-z "$CXX") || ("$CXX" == "gcc")) ]]; then
|
||||
if [[ "$IS_DARWIN" -ne "0" ]]; then
|
||||
if [[ ("$CXX" == "gcc") ]]; then
|
||||
CXX=g++
|
||||
elif [[ "$IS_DARWIN" -ne "0" ]]; then
|
||||
CXX=c++
|
||||
elif [[ "$IS_SOLARIS" -ne "0" ]]; then
|
||||
if [[ (-e "/opt/developerstudio12.5/bin/CC") ]]; then
|
||||
|
|
@ -182,7 +187,7 @@ if [[ ("$SUN_COMPILER" -eq "0") ]]; then
|
|||
fi
|
||||
|
||||
# Now that the compiler is fixed, determine the compiler version for fixups
|
||||
CLANG_37_OR_ABOVE=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'clang version (3\.[7-9]|[5-9])')
|
||||
CLANG_37_OR_ABOVE=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'clang version (3\.[7-9]|[4-9]\.[0-9])')
|
||||
GCC_60_OR_ABOVE=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'gcc version (6\.[0-9]|[7-9])')
|
||||
GCC_51_OR_ABOVE=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'gcc version (5\.[1-9]|[6-9])')
|
||||
GCC_48_COMPILER=$("$CXX" -v 2>&1 | "$EGREP" -i -c 'gcc version 4\.8')
|
||||
|
|
@ -521,7 +526,7 @@ HAVE_X86_AES=0
|
|||
HAVE_X86_RDRAND=0
|
||||
HAVE_X86_RDSEED=0
|
||||
HAVE_X86_PCLMUL=0
|
||||
if [[ (("$IS_X86" -ne "0") || ("$IS_X64" -ne "0")) && ("$SUN_COMPILER" -eq "0") ]]; then
|
||||
if [[ ("$IS_X86" -ne "0" || "$IS_X64" -ne "0") && ("$SUN_COMPILER" -eq "0") ]]; then
|
||||
rm -f "$TMP/adhoc.exe" > /dev/null 2>&1
|
||||
"$CXX" -DCRYPTOPP_ADHOC_MAIN -maes adhoc.cpp -o "$TMP/adhoc.exe" > /dev/null 2>&1
|
||||
if [[ "$?" -eq "0" ]]; then
|
||||
|
|
@ -569,7 +574,11 @@ fi
|
|||
|
||||
# ARMv7 and ARMv8, including NEON, CRC32 and Crypto extensions
|
||||
if [[ ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0") ]]; then
|
||||
ARM_FEATURES=$(cat /proc/cpuinfo 2>&1 | "$AWK" '{IGNORECASE=1}{if ($1 == "Features") print}' | cut -f 2 -d ':')
|
||||
if [[ ("$IS_DARWIN" -ne "0") ]]; then
|
||||
ARM_FEATURES=$(sysctl machdep.cpu.features 2>&1 | cut -f 2 -d ':')
|
||||
else
|
||||
ARM_FEATURES=$(cat /proc/cpuinfo 2>&1 | "$AWK" '{IGNORECASE=1}{if ($1 == "Features") print}' | cut -f 2 -d ':')
|
||||
fi
|
||||
|
||||
if [[ (-z "$HAVE_ARMV7A" && "$IS_ARM32" -ne "0") ]]; then
|
||||
HAVE_ARMV7A=$(echo "$ARM_FEATURES" | "$GREP" -i -c 'neon')
|
||||
|
|
@ -646,7 +655,7 @@ fi
|
|||
# Used to disassemble object modules so we can verify some aspects of code generation
|
||||
if [[ (-z "$HAVE_DISASS") ]]; then
|
||||
echo "int main(int argc, char* argv[]) {return 0;}" > "$TMP/test.cc"
|
||||
"$CXX" -x c "$TMP/test.cc" -o "$TMP/test.exe" > /dev/null 2>&1
|
||||
"$CXX" "$TMP/test.cc" -o "$TMP/test.exe" > /dev/null 2>&1
|
||||
if [[ "$?" -eq "0" ]]; then
|
||||
"$DISASS" "${DISASSARGS[@]}" "$TMP/test.exe" > /dev/null 2>&1
|
||||
if [[ "$?" -eq "0" ]]; then
|
||||
|
|
@ -1014,99 +1023,387 @@ echo "Start time: $TEST_BEGIN" | tee -a "$TEST_RESULTS"
|
|||
|
||||
############################################
|
||||
# Test AES-NI code generation
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_X86_AES" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: AES-NI code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; then
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
# This works for SunCC, but we need something like:
|
||||
# /opt/solarisstudio12.4/bin/CC -DNDEBUG -g2 -O2 -xarch=aes -m64 -D__SSE2__ -D__SSE3__ \
|
||||
# -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -c rijndael.cpp
|
||||
|
||||
OBJFILE=rijndael.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesenc)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesenc instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesenclast)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesenclast instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesdec)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesdec instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesdeclast)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesdeclast instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesimc)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesimc instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aeskeygenassist)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aeskeygenassist instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified aesenc, aesenclast, aesdec, aesdeclast, aesimc, aeskeygenassist machine instruction generation" | tee -a "$TEST_RESULTS"
|
||||
if [[ ("$IS_DARWIN" -ne "0") ]]; then
|
||||
X86_AESNI=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c aes)
|
||||
elif [[ ("$IS_SOLARIS" -ne "0") ]]; then
|
||||
X86_AESNI=$(isainfo -v 2>/dev/null | "$GREP" -i -c aes)
|
||||
else
|
||||
if [[ ("$CLANG_COMPILER" -ne "0" && "$CLANG_37_OR_ABOVE" -eq "0") ]]; then
|
||||
echo "This could be due to Clang and lack of expected support for SSSE3 in some versions of the compiler. If so, try Clang 3.7 or above"
|
||||
X86_AESNI=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c aes)
|
||||
fi
|
||||
|
||||
if [[ ("$X86_AESNI" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: X86 AES-NI code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
|
||||
OBJFILE=rijndael.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesenc)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesenc instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesenclast)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesenclast instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesdec)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesdec instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesdeclast)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesdeclast instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aesimc)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aesimc instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c aeskeygenassist)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate aeskeygenassist instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified aesenc, aesenclast, aesdec, aesdeclast, aesimc, aeskeygenassist machine instructions" | tee -a "$TEST_RESULTS"
|
||||
else
|
||||
if [[ ("$CLANG_COMPILER" -ne "0" && "$CLANG_37_OR_ABOVE" -eq "0") ]]; then
|
||||
echo "This could be due to Clang and lack of expected support for SSSE3 in some versions of the compiler. If so, try Clang 3.7 or above"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
############################################
|
||||
# ARM 64x64→128-bit multiply code generation
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && "$HAVE_ARM_CRYPTO" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: ARM 64x64→128-bit multiply code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
# X86 carryless multiply code generation
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; then
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
# This works for SunCC, but we need something like:
|
||||
# /opt/solarisstudio12.4/bin/CC -DNDEBUG -g2 -O2 -xarch=aes -m64 -D__SSE2__ -D__SSE3__ \
|
||||
# -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -D__RDRND__ -D__RDSEED__ -c gcm.cpp
|
||||
|
||||
OBJFILE=gcm.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -v pmull2 | "$GREP" -i -c pmull)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate pmull instruction" | tee -a "$TEST_RESULTS"
|
||||
if [[ ("$IS_DARWIN" -ne "0") ]]; then
|
||||
X86_PCLMUL=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c pclmulq)
|
||||
elif [[ ("$IS_SOLARIS" -ne "0") ]]; then
|
||||
X86_PCLMUL=$(isainfo -v 2>/dev/null | "$GREP" -i -c pclmulq)
|
||||
else
|
||||
X86_PCLMUL=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c pclmulq)
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c pmull2)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate pmull2 instruction" | tee -a "$TEST_RESULTS"
|
||||
if [[ ("$X86_PCLMUL" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: X86 carryless multiply code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
|
||||
OBJFILE=gcm.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$EGREP" -i -c '(pclmullqh|vpclmulqdq)')
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate pclmullqh instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$EGREP" -i -c '(pclmullql|vpclmulqdq)')
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate pclmullql instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified pclmullqh and pclmullql machine instructions" | tee -a "$TEST_RESULTS"
|
||||
else
|
||||
if [[ ("$CLANG_COMPILER" -ne "0") ]]; then
|
||||
echo "This is probably due to Clang and its integrated assembler. The integrated assembler cannot consume a lot of ASM used by the library"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
############################################
|
||||
# Test RDRAND and RDSEED code generation
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; then
|
||||
|
||||
# This works for SunCC, but we need something like:
|
||||
# /opt/solarisstudio12.4/bin/CC -DNDEBUG -g2 -O2 -xarch=avx_i -m64 -D__SSE2__ -D__SSE3__ \
|
||||
# -D__SSE4_1__ -D__SSE4_2__ -D__AES__ -D__PCLMUL__ -D__RDRND__ -D__RDSEED__ -c rdrand.cpp
|
||||
|
||||
if [[ ("$IS_DARWIN" -ne "0") ]]; then
|
||||
X86_RDRAND=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c rdrand)
|
||||
X86_RDSEED=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c rdseed)
|
||||
elif [[ ("$IS_SOLARIS" -ne "0") ]]; then
|
||||
X86_RDRAND=$(isainfo -v 2>/dev/null | "$GREP" -i -c rdrand)
|
||||
X86_RDSEED=$(isainfo -v 2>/dev/null | "$GREP" -i -c rdseed)
|
||||
else
|
||||
X86_RDRAND=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c rdrand)
|
||||
X86_RDSEED=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c rdseed)
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified pmull and pmull2 machine instruction generation" | tee -a "$TEST_RESULTS"
|
||||
if [[ ("$X86_RDRAND" -ne "0" || "$X86_RDSEED" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: X86 RDRAND and RDSEED code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
|
||||
OBJFILE=rdrand.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
if [[ "$X86_RDRAND" -ne "0" ]]; then
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c rdrand)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate rdrand instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$X86_RDSEED" -ne "0" ]]; then
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c rdseed)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate rdseed instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified rdrand and rdseed machine instructions" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
############################################
|
||||
# X86 CRC32 code generation
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_X86" -ne "0" || "$IS_X64" -ne "0")) ]]; then
|
||||
|
||||
# This works for SunCC, but we need something like:
|
||||
# /opt/solarisstudio12.3/bin/CC -DNDEBUG -g2 -O2 -xarch=sse4_2 -m64 -D__SSE2__ -D__SSE3__ \
|
||||
# -D__SSE4_1__ -D__SSE4_2__ -c crc.cpp
|
||||
|
||||
if [[ ("$IS_DARWIN" -ne "0") ]]; then
|
||||
X86_CRC32=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c sse4.2)
|
||||
elif [[ ("$IS_SOLARIS" -ne "0") ]]; then
|
||||
X86_CRC32=$(isainfo -v 2>/dev/null | "$GREP" -i -c sse4_2)
|
||||
else
|
||||
X86_CRC32=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c sse4_2)
|
||||
fi
|
||||
|
||||
if [[ ("$X86_CRC32" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: X86 CRC32 code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
|
||||
OBJFILE=crc.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32l)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate crc32l instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32b)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified crc32l and crc32b machine instructions" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
############################################
|
||||
# ARM NEON code generation
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]]; then
|
||||
|
||||
if [[ ("$IS_DARWIN" -ne "0") ]]; then
|
||||
ARM_NEON=$(sysctl machdep.cpu.features 2>/dev/null | "$EGREP" -i -c '(neon|asimd)')
|
||||
else
|
||||
ARM_NEON=$(cat /proc/cpuinfo 2>/dev/null | "$EGREP" -i -c '(neon|asimd)')
|
||||
fi
|
||||
|
||||
if [[ ("$ARM_NEON" -ne "0" || "$HAVE_ARM_NEON" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: ARM NEON code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
|
||||
OBJFILE=blake2.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
# BLAKE2_NEON_Compress32: 40 each vld1q_u8 and vld1q_u64
|
||||
# BLAKE2_NEON_Compress64: 22 each vld1q_u8 and vld1q_u64
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$EGREP" -i -c 'ldr.*q')
|
||||
if [[ ("$COUNT" -lt "62") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate expected vector load instructions" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
# BLAKE2_NEON_Compress{32|64}: 6 each vst1q_u32 and vst1q_u64
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$EGREP" -i -c 'str.*q')
|
||||
if [[ ("$COUNT" -lt "6") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate expected vector store instructions" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified vector load and store machine instructions" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
############################################
|
||||
# ARM carryless multiply code generation
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]]; then
|
||||
|
||||
if [[ ("$IS_DARWIN" -ne "0") ]]; then
|
||||
ARM_PMULL=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c pmull)
|
||||
else
|
||||
ARM_PMULL=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c pmull)
|
||||
fi
|
||||
|
||||
if [[ ("$ARM_PMULL" -ne "0" || "$HAVE_ARM_CRYPTO" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: ARM carryless multiply code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
|
||||
OBJFILE=gcm.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -v pmull2 | "$GREP" -i -c pmull)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate pmull instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c pmull2)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate pmull2 instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified pmull and pmull2 machine instructions" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
############################################
|
||||
# ARM CRC32 code generation
|
||||
if [[ ("$HAVE_DISASS" -ne "0" && ("$IS_ARM32" -ne "0" || "$IS_ARM64" -ne "0")) ]]; then
|
||||
|
||||
if [[ ("$IS_DARWIN" -ne "0") ]]; then
|
||||
ARM_CRC32=$(sysctl machdep.cpu.features 2>/dev/null | "$GREP" -i -c crc)
|
||||
else
|
||||
ARM_CRC32=$(cat /proc/cpuinfo 2>/dev/null | "$GREP" -i -c crc32)
|
||||
fi
|
||||
|
||||
if [[ ("$ARM_CRC32" -ne "0") ]]; then
|
||||
echo
|
||||
echo "************************************" | tee -a "$TEST_RESULTS"
|
||||
echo "Testing: ARM CRC32 code generation" | tee -a "$TEST_RESULTS"
|
||||
echo
|
||||
|
||||
"$MAKE" clean > /dev/null 2>&1
|
||||
rm -f adhoc.cpp > /dev/null 2>&1
|
||||
|
||||
OBJFILE=crc.o
|
||||
CXX="$CXX" CXXFLAGS="$RELEASE_CXXFLAGS ${PLATFORM_CXXFLAGS[@]}" "$MAKE" "${MAKEARGS[@]}" $OBJFILE 2>&1 | tee -a "$TEST_RESULTS"
|
||||
|
||||
COUNT=0
|
||||
FAILED=0
|
||||
DISASS_TEXT=$("$DISASS" "${DISASSARGS[@]}" "$OBJFILE" 2>/dev/null)
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32cb)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate crc32cb instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32cw)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate crc32cw instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32b)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate crc32b instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
COUNT=$(echo "$DISASS_TEXT" | "$GREP" -i -c crc32w)
|
||||
if [[ ("$COUNT" -eq "0") ]]; then
|
||||
FAILED=1
|
||||
echo "ERROR: failed to generate crc32w instruction" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
|
||||
if [[ ("$FAILED" -eq "0") ]];then
|
||||
echo "Verified crc32cb, crc32cw, crc32b and crc32w machine instructions" | tee -a "$TEST_RESULTS"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
|
|||
216
gcm.cpp
216
gcm.cpp
|
|
@ -12,7 +12,7 @@
|
|||
#ifndef CRYPTOPP_IMPORTS
|
||||
#ifndef CRYPTOPP_GENERATE_X64_MASM
|
||||
|
||||
// Clang 3.3 integrated assembler crash on Linux. MacPorts GCC compile error. SunCC crash under Sun Studio 12.5 and below.
|
||||
// Clang 3.3 integrated assembler crash on Linux. MacPorts GCC compile error. SunCC crash under SunCC 5.14 and below.
|
||||
#if (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION < 30400)) || defined(CRYPTOPP_CLANG_INTEGRATED_ASSEMBLER) || (defined(__SUNPRO_CC) && __SUNPRO_CC <= 0x5140)
|
||||
# undef CRYPTOPP_X86_ASM_AVAILABLE
|
||||
# undef CRYPTOPP_X32_ASM_AVAILABLE
|
||||
|
|
@ -20,11 +20,9 @@
|
|||
# undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
|
||||
# undef CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
|
||||
# undef CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
|
||||
# undef CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
|
||||
# define CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE 0
|
||||
# define CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE 0
|
||||
# define CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 0
|
||||
# define CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE 0
|
||||
#endif
|
||||
|
||||
#include "gcm.h"
|
||||
|
|
@ -94,6 +92,16 @@ inline static void SSE2_Xor16(byte *a, const byte *b, const byte *c)
|
|||
}
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
|
||||
inline static void NEON_Xor16(byte *a, const byte *b, const byte *c)
|
||||
{
|
||||
assert(IsAlignedOn(a,GetAlignmentOf<uint64x2_t>()));
|
||||
assert(IsAlignedOn(b,GetAlignmentOf<uint64x2_t>()));
|
||||
assert(IsAlignedOn(c,GetAlignmentOf<uint64x2_t>()));
|
||||
*(uint64x2_t*)a = veorq_u64(*(uint64x2_t*)b, *(uint64x2_t*)c);
|
||||
}
|
||||
#endif
|
||||
|
||||
inline static void Xor16(byte *a, const byte *b, const byte *c)
|
||||
{
|
||||
assert(IsAlignedOn(a,GetAlignmentOf<word64>()));
|
||||
|
|
@ -109,6 +117,7 @@ static const word64 s_clmulConstants64[] = {
|
|||
W64LIT(0xe100000000000000), W64LIT(0xc200000000000000),
|
||||
W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607),
|
||||
W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f)};
|
||||
|
||||
static const __m128i *s_clmulConstants = (const __m128i *)(const void *)s_clmulConstants64;
|
||||
static const unsigned int s_clmulTableSizeInBlocks = 8;
|
||||
|
||||
|
|
@ -146,14 +155,56 @@ inline __m128i CLMUL_Reduce(__m128i c0, __m128i c1, __m128i c2, const __m128i &r
|
|||
|
||||
inline __m128i CLMUL_GF_Mul(const __m128i &x, const __m128i &h, const __m128i &r)
|
||||
{
|
||||
__m128i c0 = _mm_clmulepi64_si128(x,h,0);
|
||||
__m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10));
|
||||
__m128i c2 = _mm_clmulepi64_si128(x,h,0x11);
|
||||
const __m128i c0 = _mm_clmulepi64_si128(x,h,0);
|
||||
const __m128i c1 = _mm_xor_si128(_mm_clmulepi64_si128(x,h,1), _mm_clmulepi64_si128(x,h,0x10));
|
||||
const __m128i c2 = _mm_clmulepi64_si128(x,h,0x11);
|
||||
|
||||
return CLMUL_Reduce(c0, c1, c2, r);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
|
||||
|
||||
CRYPTOPP_ALIGN_DATA(16)
|
||||
static const word64 s_clmulConstants64[] = {
|
||||
W64LIT(0xe100000000000000), W64LIT(0xc200000000000000), // Used for ARM and x86; polynomial coefficients
|
||||
W64LIT(0x08090a0b0c0d0e0f), W64LIT(0x0001020304050607), // Unused for ARM; used for x86 _mm_shuffle_epi8
|
||||
W64LIT(0x0001020304050607), W64LIT(0x08090a0b0c0d0e0f) // Unused for ARM; used for x86 _mm_shuffle_epi8
|
||||
};
|
||||
|
||||
static const uint64x2_t *s_clmulConstants = (const uint64x2_t *)s_clmulConstants64;
|
||||
static const unsigned int s_clmulTableSizeInBlocks = 8;
|
||||
|
||||
inline uint64x2_t PMULL_Reduce(uint64x2_t c0, uint64x2_t c1, uint64x2_t c2, const uint64x2_t &r)
|
||||
{
|
||||
// See comments fo CLMUL_Reduce
|
||||
|
||||
c1 = veorq_u64(c1, (uint64x2_t)vextq_u8(vdupq_n_u8(0), (uint8x16_t)c0, 8));
|
||||
c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(c0, 0), vgetq_lane_u64(r, 1)));
|
||||
c0 = (uint64x2_t)vextq_u8((uint8x16_t)c0, vdupq_n_u8(0), 8);
|
||||
c0 = veorq_u64(c0, c1);
|
||||
c0 = vshlq_n_u64(c0, 1);
|
||||
c0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(c0, 0), vgetq_lane_u64(r, 0));
|
||||
c2 = veorq_u64(c2, c0);
|
||||
c2 = veorq_u64(c2, (uint64x2_t)vextq_u8((uint8x16_t)c1, vdupq_n_u8(0), 8));
|
||||
c1 = vcombine_u64(vget_low_u64(c1), vget_low_u64(c2));
|
||||
c1 = vshrq_n_u64(c1, 63);
|
||||
c2 = vshlq_n_u64(c2, 1);
|
||||
|
||||
return veorq_u64(c2, c1);
|
||||
}
|
||||
|
||||
inline uint64x2_t PMULL_GF_Mul(const uint64x2_t &x, const uint64x2_t &h, const uint64x2_t &r)
|
||||
{
|
||||
const uint64x2_t c0 = (uint64x2_t)vmull_p64(vgetq_lane_u64(x, 0), vgetq_lane_u64(h, 0));
|
||||
const uint64x2_t c1 = veorq_u64((uint64x2_t)vmull_p64(vgetq_lane_u64(x, 1), vgetq_lane_u64(h,0)),
|
||||
(uint64x2_t)vmull_p64(vgetq_lane_u64(x, 0), vgetq_lane_u64(h, 1)));
|
||||
const uint64x2_t c2 = (uint64x2_t)vmull_high_p64((poly64x2_t)x, (poly64x2_t)h);
|
||||
|
||||
return PMULL_Reduce(c0, c1, c2, r);
|
||||
}
|
||||
#endif
|
||||
|
||||
void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const NameValuePairs ¶ms)
|
||||
{
|
||||
BlockCipher &blockCipher = AccessBlockCipher();
|
||||
|
|
@ -172,6 +223,14 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
|||
tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
|
||||
}
|
||||
else
|
||||
#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
|
||||
if (HasPMULL())
|
||||
{
|
||||
// Avoid "parameter not used" error and suppress Coverity finding
|
||||
(void)params.GetIntValue(Name::TableSize(), tableSize);
|
||||
tableSize = s_clmulTableSizeInBlocks * REQUIRED_BLOCKSIZE;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
if (params.GetIntValue(Name::TableSize(), tableSize))
|
||||
|
|
@ -208,6 +267,32 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
|||
h = CLMUL_GF_Mul(h1, h0, r);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
|
||||
if (HasPMULL())
|
||||
{
|
||||
const uint64x2_t r = s_clmulConstants[0];
|
||||
const uint64x2_t t = vld1q_u64((uint64_t *)hashKey);
|
||||
const uint64x2_t h0 = (uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t), vget_low_u64(t)));
|
||||
|
||||
uint64x2_t h = h0;
|
||||
for (i=0; i<tableSize-32; i+=32)
|
||||
{
|
||||
const uint64x2_t h1 = PMULL_GF_Mul(h, h0, r);
|
||||
vst1_u64((uint64_t *)(table+i), vget_low_u64(h));
|
||||
vst1q_u64((uint64_t *)(table+i+16), h1);
|
||||
vst1q_u64((uint64_t *)(table+i+8), h);
|
||||
vst1_u64((uint64_t *)(table+i+8), vget_low_u64(h1));
|
||||
h = PMULL_GF_Mul(h1, h0, r);
|
||||
}
|
||||
|
||||
const uint64x2_t h1 = PMULL_GF_Mul(h, h0, r);
|
||||
vst1_u64((uint64_t *)(table+i), vget_low_u64(h));
|
||||
vst1q_u64((uint64_t *)(table+i+16), h1);
|
||||
vst1q_u64((uint64_t *)(table+i+8), h);
|
||||
vst1_u64((uint64_t *)(table+i+8), vget_low_u64(h1));
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
|
@ -237,6 +322,12 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
|||
for (k=1; k<j; k++)
|
||||
SSE2_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
|
||||
else
|
||||
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
|
||||
if (HasNEON())
|
||||
for (j=2; j<=0x80; j*=2)
|
||||
for (k=1; k<j; k++)
|
||||
NEON_Xor16(table+i*256*16+(j+k)*16, table+i*256*16+j*16, table+i*256*16+k*16);
|
||||
else
|
||||
#endif
|
||||
for (j=2; j<=0x80; j*=2)
|
||||
for (k=1; k<j; k++)
|
||||
|
|
@ -286,6 +377,15 @@ void GCM_Base::SetKeyWithoutResync(const byte *userKey, size_t keylength, const
|
|||
SSE2_Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
|
||||
}
|
||||
else
|
||||
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
|
||||
if (HasNEON())
|
||||
for (j=2; j<=8; j*=2)
|
||||
for (k=1; k<j; k++)
|
||||
{
|
||||
NEON_Xor16(table+i*256+(j+k)*16, table+i*256+j*16, table+i*256+k*16);
|
||||
NEON_Xor16(table+1024+i*256+(j+k)*16, table+1024+i*256+j*16, table+1024+i*256+k*16);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
for (j=2; j<=8; j*=2)
|
||||
for (k=1; k<j; k++)
|
||||
|
|
@ -305,6 +405,15 @@ inline void GCM_Base::ReverseHashBufferIfNeeded()
|
|||
__m128i &x = *(__m128i *)(void *)HashBuffer();
|
||||
x = _mm_shuffle_epi8(x, s_clmulConstants[1]);
|
||||
}
|
||||
#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
|
||||
if (HasPMULL())
|
||||
{
|
||||
if (GetNativeByteOrder() != BIG_ENDIAN_ORDER)
|
||||
{
|
||||
const uint8x16_t x = vrev64q_u8(vld1q_u8(HashBuffer()));
|
||||
vst1q_u8(HashBuffer(), (uint8x16_t)vcombine_u64(vget_high_u64((uint64x2_t)x), vget_low_u64((uint64x2_t)x)));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -358,6 +467,8 @@ unsigned int GCM_Base::OptimalDataAlignment() const
|
|||
return
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
|
||||
HasSSE2() ? 16 :
|
||||
#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
|
||||
HasNEON() ? 16 :
|
||||
#endif
|
||||
GetBlockCipher().OptimalDataAlignment();
|
||||
}
|
||||
|
|
@ -384,12 +495,12 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
|||
{
|
||||
const __m128i *table = (const __m128i *)(const void *)MulTable();
|
||||
__m128i x = _mm_load_si128((__m128i *)(void *)HashBuffer());
|
||||
const __m128i r = s_clmulConstants[0], bswapMask = s_clmulConstants[1], bswapMask2 = s_clmulConstants[2];
|
||||
const __m128i r = s_clmulConstants[0], mask1 = s_clmulConstants[1], mask2 = s_clmulConstants[2];
|
||||
|
||||
while (len >= 16)
|
||||
{
|
||||
size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
|
||||
__m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), bswapMask2);;
|
||||
__m128i d, d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-1)*16)), mask2);;
|
||||
__m128i c0 = _mm_setzero_si128();
|
||||
__m128i c1 = _mm_setzero_si128();
|
||||
__m128i c2 = _mm_setzero_si128();
|
||||
|
|
@ -398,41 +509,41 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
|||
{
|
||||
__m128i h0 = _mm_load_si128(table+i);
|
||||
__m128i h1 = _mm_load_si128(table+i+1);
|
||||
__m128i h01 = _mm_xor_si128(h0, h1);
|
||||
__m128i h2 = _mm_xor_si128(h0, h1);
|
||||
|
||||
if (++i == s)
|
||||
{
|
||||
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), bswapMask);
|
||||
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
|
||||
d = _mm_xor_si128(d, x);
|
||||
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0));
|
||||
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
|
||||
d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0));
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0));
|
||||
break;
|
||||
}
|
||||
|
||||
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), bswapMask2);
|
||||
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask2);
|
||||
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d2, h0, 1));
|
||||
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 1));
|
||||
d2 = _mm_xor_si128(d2, d);
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h01, 1));
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d2, h2, 1));
|
||||
|
||||
if (++i == s)
|
||||
{
|
||||
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), bswapMask);
|
||||
d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)data), mask1);
|
||||
d = _mm_xor_si128(d, x);
|
||||
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
|
||||
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d, h1, 0x11));
|
||||
d = _mm_xor_si128(d, _mm_shuffle_epi32(d, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0x10));
|
||||
break;
|
||||
}
|
||||
|
||||
d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), bswapMask);
|
||||
d2 = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(const void *)(data+(s-i)*16-8)), mask1);
|
||||
c0 = _mm_xor_si128(c0, _mm_clmulepi64_si128(d, h0, 0x10));
|
||||
c2 = _mm_xor_si128(c2, _mm_clmulepi64_si128(d2, h1, 0x10));
|
||||
d = _mm_xor_si128(d, d2);
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h01, 0x10));
|
||||
c1 = _mm_xor_si128(c1, _mm_clmulepi64_si128(d, h2, 0x10));
|
||||
}
|
||||
data += s*16;
|
||||
len -= s*16;
|
||||
|
|
@ -444,6 +555,75 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
|||
_mm_store_si128((__m128i *)(void *)HashBuffer(), x);
|
||||
return len;
|
||||
}
|
||||
#elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
|
||||
if (HasPMULL())
|
||||
{
|
||||
const uint64x2_t *table = (const uint64x2_t *)MulTable();
|
||||
uint64x2_t x = vld1q_u64((const uint64_t*)HashBuffer());
|
||||
const uint64x2_t r = s_clmulConstants[0];
|
||||
|
||||
while (len >= 16)
|
||||
{
|
||||
size_t s = UnsignedMin(len/16, s_clmulTableSizeInBlocks), i=0;
|
||||
uint64x2_t d, d2 = (uint64x2_t)vrev64q_u8((uint8x16_t)vld1q_u64((const uint64_t *)(data+(s-1)*16)));
|
||||
uint64x2_t c0 = vdupq_n_u64(0);
|
||||
uint64x2_t c1 = vdupq_n_u64(0);
|
||||
uint64x2_t c2 = vdupq_n_u64(0);
|
||||
|
||||
while (true)
|
||||
{
|
||||
const uint64x2_t h0 = vld1q_u64((const uint64_t*)(table+i));
|
||||
const uint64x2_t h1 = vld1q_u64((const uint64_t*)(table+i+1));
|
||||
const uint64x2_t h2 = veorq_u64(h0, h1);
|
||||
|
||||
if (++i == s)
|
||||
{
|
||||
const uint64x2_t t1 = vld1q_u64((const uint64_t *)data);
|
||||
d = veorq_u64((uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t1), vget_low_u64(t1))), x);
|
||||
c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 0)));
|
||||
c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 1), vgetq_lane_u64(h1, 0)));
|
||||
d = veorq_u64(d, (uint64x2_t)vcombine_u32(vget_high_u32((uint32x4_t)d), vget_low_u32((uint32x4_t)d)));
|
||||
c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 0)));
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
d = (uint64x2_t)vrev64q_u8((uint8x16_t)vld1q_u64((const uint64_t *)(data+(s-i)*16-8)));
|
||||
c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 1), vgetq_lane_u64(h0, 0)));
|
||||
c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 1), vgetq_lane_u64(h1, 0)));
|
||||
d2 = veorq_u64(d2, d);
|
||||
c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 1), vgetq_lane_u64(h2, 0)));
|
||||
|
||||
if (++i == s)
|
||||
{
|
||||
|
||||
const uint64x2_t t2 = vld1q_u64((const uint64_t *)data);
|
||||
d = veorq_u64((uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t2), vget_low_u64(t2))), x);
|
||||
c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 1)));
|
||||
c2 = veorq_u64(c2, (uint64x2_t)vmull_high_p64((poly64x2_t)d, (poly64x2_t)h1));
|
||||
d = veorq_u64(d, (uint64x2_t)vcombine_u32(vget_high_u32((uint32x4_t)d), vget_low_u32((uint32x4_t)d)));
|
||||
c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 1)));
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
const uint64x2_t t3 = vld1q_u64((uint64_t *)(data+(s-i)*16-8));
|
||||
d2 = (uint64x2_t)vrev64q_u8((uint8x16_t)vcombine_u64(vget_high_u64(t3), vget_low_u64(t3)));
|
||||
c0 = veorq_u64(c0, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h0, 1)));
|
||||
c2 = veorq_u64(c2, (uint64x2_t)vmull_p64(vgetq_lane_u64(d2, 0), vgetq_lane_u64(h1, 1)));
|
||||
d = veorq_u64(d, d2);
|
||||
c1 = veorq_u64(c1, (uint64x2_t)vmull_p64(vgetq_lane_u64(d, 0), vgetq_lane_u64(h2, 1)));
|
||||
}
|
||||
data += s*16;
|
||||
len -= s*16;
|
||||
|
||||
c1 = veorq_u64(veorq_u64(c1, c0), c2);
|
||||
x = PMULL_Reduce(c0, c1, c2, r);
|
||||
}
|
||||
|
||||
vst1q_u64((uint64_t *)HashBuffer(), x);
|
||||
return len;
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef BlockGetAndPut<word64, NativeByteOrder> Block;
|
||||
|
|
@ -453,6 +633,8 @@ size_t GCM_Base::AuthenticateBlocks(const byte *data, size_t len)
|
|||
switch (2*(m_buffer.size()>=64*1024)
|
||||
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
|
||||
+ HasSSE2()
|
||||
//#elif CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
|
||||
// + HasNEON()
|
||||
#endif
|
||||
)
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue