Improved SHA-256 performance on ppc64 by 4x
Results: I removed the other hash algorithms from test.cpp and used a ~700MB file as input.

Upstream:

    $ time ./cryptest.exe m ~/ubuntu-16.10-server-ppc64el.iso
    SHA-256: d14bdb413ea6cdc8d9354fcbc37a834b7de0c23f992deb0c6764d0fd5d65408e

    real    0m18.811s
    user    0m18.456s
    sys     0m0.356s

This patch:

    $ time ./cryptest.exe m ~/ubuntu-16.10-server-ppc64el.iso
    SHA-256: d14bdb413ea6cdc8d9354fcbc37a834b7de0c23f992deb0c6764d0fd5d65408e

    real    0m4.158s
    user    0m3.992s
    sys     0m0.168s

This approach uses AltiVec + VSX instructions found on POWER8 systems and newer. If unwanted on PPC, "cmake" it with -DDISABLE_ALTIVEC=1 or "make" it with USE_ALTIVEC=0.
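For convenience, here is a minimal sketch of the two opt-out knobs named above as used from a shell (target names follow the stock Crypto++ build files; the timing command is the same one used for the results):

    # CMake build with the PPC64 SHA-2 acceleration disabled
    cmake -DDISABLE_ALTIVEC=1 . && make

    # GNUmakefile build with the acceleration disabled
    make USE_ALTIVEC=0

    # Timing a large file, as in the results above
    time ./cryptest.exe m ~/ubuntu-16.10-server-ppc64el.iso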
parent 01cea29692
commit 3f128e4667
branch pull/394/head
CMakeLists.txt

@@ -45,6 +45,7 @@ endif()
 option(DISABLE_ASM "Disable ASM" OFF)
 option(DISABLE_SSSE3 "Disable SSSE3" OFF)
 option(DISABLE_AESNI "Disable AES-NI" OFF)
+option(DISABLE_ALTIVEC "Disable PPC64's Altivec registers for SHA-2" OFF)
 option(DISABLE_NATIVE_ARCH "Disable the addition of -march=native" OFF)
 set(CRYPTOPP_DATA_DIR "" CACHE PATH "Crypto++ test data directory")

@@ -97,6 +98,9 @@ endif()
 if(DISABLE_AESNI)
   add_definitions(-DCRYPTOPP_DISABLE_AESNI)
 endif()
+if(DISABLE_ALTIVEC)
+  add_definitions(-DCRYPTOPP_DISABLE_ALTIVEC)
+endif()
 if(NOT CRYPTOPP_DATA_DIR STREQUAL "")
   add_definitions(-DCRYPTOPP_DATA_DIR="${CRYPTOPP_DATA_DIR}")
 endif()
@@ -242,6 +246,15 @@ if(MSVC AND NOT DISABLE_ASM)
 endif()
 endif()

+if (${UNAME_MACHINE} MATCHES "ppc64le" AND NOT ${DISABLE_ALTIVEC})
+  message(STATUS "Enabling SHA-2 acceleration for ppc64le.")
+  set(cryptopp_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/sha256_compress_ppc.s
+    ${cryptopp_SOURCES}
+  )
+  enable_language(ASM)
+endif()
+
 #============================================================================
 # Compile targets
 #============================================================================
GNUmakefile  (18 changed lines)
@@ -57,6 +57,9 @@ IS_X32 ?= 0
 # Set to 1 if you used NASM to build rdrand-{x86|x32|x64}
 USE_NASM ?= 0

+# Set to 0 if you don't want acceleration on SHA on PPC64le
+USE_ALTIVEC ?= $(IS_PPC)
+
 # Fixup for X32
 ifeq ($(IS_X32),1)
 IS_X86 = 0
@@ -480,6 +483,10 @@ OBJS += rdrand-x86.o
 endif
 endif # Nasm

+ifeq ($(USE_ALTIVEC),1)
+OBJS += sha256_compress_ppc.o
+endif
+
 # List test.cpp first to tame C++ static initialization problems.
 TESTSRCS := adhoc.cpp test.cpp bench1.cpp bench2.cpp validat0.cpp validat1.cpp validat2.cpp validat3.cpp datatest.cpp regtest.cpp fipsalgt.cpp dlltest.cpp
 TESTOBJS := $(TESTSRCS:.cpp=.o)
@@ -774,6 +781,17 @@ rdrand-x32.o: ;
 rdrand-x64.o: ;
 endif

+ifeq ($(IS_PPC),1)
+ifeq ($(USE_ALTIVEC),1)
+sha256_compress_ppc.o: sha256_compress_ppc.s
+	$(CXX) $(strip $(CXXFLAGS)) -c $<
+else
+# PPC without altivec. Correctly fallback to C implementation
+sha.o : sha.cpp
+	$(CXX) $(strip $(CXXFLAGS)) -c $< -DCRYPTOPP_DISABLE_ALTIVEC
+endif
+endif
+
 # Only use CRYPTOPP_DATA_DIR if its not set in CXXFLAGS
 ifeq ($(findstring -DCRYPTOPP_DATA_DIR, $(strip $(CXXFLAGS))),)
 ifneq ($(strip $(CRYPTOPP_DATA_DIR)),)
config.h  (13 changed lines)
@@ -474,6 +474,12 @@ NAMESPACE_END
 #define CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE 0
 #endif

+#if !defined(CRYPTOPP_BOOL_ALTIVEC_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ALTIVEC)
+# if defined(__powerpc64__) && defined(__POWER8_VECTOR__) && __POWER8_VECTOR__ == 1
+# define CRYPTOPP_BOOL_ALTIVEC_AVAILABLE 1
+# endif
+#endif
+
 // Requires ARMv7 and ACLE 1.0. Testing shows ARMv7 is really ARMv7a under most toolchains.
 #if !defined(CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE) && !defined(CRYPTOPP_DISABLE_ASM)
 # if defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(_M_ARM)
@@ -597,6 +603,13 @@ NAMESPACE_END
 #define CRYPTOPP_BOOL_ARM64 0
 #endif

+// Using a SIMD altivec approach on machines that supports so
+#if defined(__powerpc64__) && defined(__POWER8_VECTOR__) && __POWER8_VECTOR__ == 1
+#define CRYPTOPP_BOOL_PPC64LE 1
+#else
+#define CRYPTOPP_BOOL_PPC64LE 0
+#endif
+
 #if !defined(CRYPTOPP_NO_UNALIGNED_DATA_ACCESS) && !defined(CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS)
 #if (CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32 || defined(__powerpc__) || (__ARM_FEATURE_UNALIGNED >= 1))
 #define CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
sha.cpp  (31 changed lines)
@@ -28,6 +28,7 @@
 # undef CRYPTOPP_X32_ASM_AVAILABLE
 # undef CRYPTOPP_X64_ASM_AVAILABLE
 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
+# undef CRYPTOPP_BOOL_ALTIVEC_AVAILABLE
 #endif

 NAMESPACE_BEGIN(CryptoPP)
@@ -518,7 +519,7 @@ void SHA256::InitState(HashWordType *state)
 	memcpy(state, s, sizeof(s));
 }

-#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
+#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE || CRYPTOPP_BOOL_ALTIVEC_AVAILABLE
 CRYPTOPP_ALIGN_DATA(16) extern const word32 SHA256_K[64] CRYPTOPP_SECTION_ALIGN16 = {
 #else
 extern const word32 SHA256_K[64] = {
@@ -878,6 +879,8 @@ void CRYPTOPP_FASTCALL X86_SHA256_HashBlocks(word32 *state, const word32 *data,
 static void CRYPTOPP_FASTCALL SHA256_SSE_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
 #elif CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE
 static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
+#elif CRYPTOPP_BOOL_ALTIVEC_AVAILABLE
+static void CRYPTOPP_FASTCALL SHA256_Altivec_SHA_HashBlocks(word32 *state, const word32 *data, size_t length);
 #endif

 #if (defined(CRYPTOPP_X86_ASM_AVAILABLE) || defined(CRYPTOPP_X32_ASM_AVAILABLE) || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_SHA_ASM)
@@ -895,6 +898,10 @@ pfnSHAHashBlocks InitializeSHA256HashBlocks()
 	else
 #endif

+#if CRYPTOPP_BOOL_ALTIVEC_AVAILABLE
+	return &SHA256_Altivec_SHA_HashBlocks;
+#endif
+
 	return &X86_SHA256_HashBlocks;
 }

@@ -1065,6 +1072,13 @@ static void SHA256_ARM_SHA_Transform(word32 *state, const word32 *data)
 }
 #endif // CRYPTOPP_BOOL_ARM_CRYPTO_INTRINSICS_AVAILABLE

+#if CRYPTOPP_BOOL_ALTIVEC_AVAILABLE
+static void SHA256_Altivec_SHA_Transform(word32 *state, const word32 *data)
+{
+	return SHA256_Altivec_SHA_HashBlocks(state, data, SHA256::BLOCKSIZE);
+}
+#endif // CRYPTOPP_BOOL_ALTIVEC_AVAILABLE
+
 ///////////////////////////////////
 // start of Walton/Gulley's code //
 ///////////////////////////////////
@@ -1438,6 +1452,18 @@ static void CRYPTOPP_FASTCALL SHA256_ARM_SHA_HashBlocks(word32 *state, const wor
 // end of Walton/Schneiders/O'Rourke/Hovsmith's code //
 ///////////////////////////////////////////////////////

+#if CRYPTOPP_BOOL_ALTIVEC_AVAILABLE
+
+// Function to be found on sha256_compress_ppc.s
+extern "C" void sha256_compress_ppc(word32 *STATE, const word32 *input, const uint32_t *k);
+
+static void CRYPTOPP_FASTCALL SHA256_Altivec_SHA_HashBlocks(word32 *state, const word32 *data, size_t length)
+{
+	for (size_t i = 0; i < length; i += SHA256::BLOCKSIZE)
+		sha256_compress_ppc(state, data + i, SHA256_K);
+}
+#endif
+
 pfnSHATransform InitializeSHA256Transform()
 {
 #if CRYPTOPP_BOOL_SSE_SHA_INTRINSICS_AVAILABLE
@@ -1455,6 +1481,9 @@ pfnSHATransform InitializeSHA256Transform()
 		return &SHA256_ARM_SHA_Transform;
 	else
 #endif
+#if CRYPTOPP_BOOL_ALTIVEC_AVAILABLE
+	return &SHA256_Altivec_SHA_Transform;
+#endif

 	return &SHA256_CXX_Transform;
 }
sha256_compress_ppc.s  (new file, 744 lines)

# This file was generated using the sha2-le project from:
# https://github.com/PPC64/sha2-le

# Configured as:
# STATE_MEM_16BYTE_ALIGNED = 0
# INPUT_MEM_16BYTE_ALIGNED = 0
# K_MEM_16BYTE_ALIGNED = 1
# SWAP_BYTES = 0

.file "sha256_compress_ppc.s"

# Keep in mind that vector loading/store is/would-be done directly by dealing
# with 16-bytes:
# +-------+----+----+----+----+
# |       |   Vector Words:   |
# | Name  | #1 | #2 | #3 | #4 |
# +-------+----+----+----+----+
# | Va    | a  | b  | c  | d  |
# +-------+----+----+----+----+
#
# But this algorithm will mainly deal with each data (a-h) separately on a
# vector:
# +-------+----+----+----+----+
# |       |   Vector Words:   |
# | Name  | #1 | #2 | #3 | #4 |
# +-------+----+----+----+----+
# | Va    | a  | -  | -  | -  |
# | Vb    | b  | -  | -  | -  |
# | Vc    | c  | -  | -  | -  |
# | Vd    | d  | -  | -  | -  |
# +-------+----+----+----+----+

.text
.align 4
# void sha256_compress_ppc(uint32_t *STATE, const uint8_t *input, const uint32_t *k)
.globl sha256_compress_ppc
.type sha256_compress_ppc,%function
sha256_compress_ppc:

# Saving non-volatile registers
li 0, -176; stvx 29, 1, 0
li 0, -160; stvx 28, 1, 0
li 0, -144; stvx 27, 1, 0
li 0, -128; stvx 26, 1, 0
li 0, -112; stvx 25, 1, 0
li 0, -96; stvx 24, 1, 0
li 0, -80; stvx 23, 1, 0
li 0, -64; stvx 22, 1, 0
li 0, -48; stvx 21, 1, 0
li 0, -32; stvx 20, 1, 0

# Load hash STATE to registers

# load unaligned
lvx 9, 0, 3
addi 6, 3, 16
lvsr 7, 0, 6
lvx 13, 0, 6
vperm 9, 13, 9, 7 # a = {a,b,c,d}
addi 6, 6, 16
lvx 22, 0, 6
vperm 13, 22, 13, 7 # e = {e,f,g,h}
# Unpack a-h data from the packed vector to a vector register each
vsldoi 10, 9, 9, 12
vsldoi 11, 9, 9, 8
vsldoi 12, 9, 9, 4

vsldoi 14, 13, 13, 12
vsldoi 15, 13, 13, 8
vsldoi 16, 13, 13, 4
# Load 16 elements from w out of the loop

# set vrb according to alignment
lvsr 21, 0, 4

# unaligned load
lvx 17, 0, 4
addi 6, 4, 16
lvx 18, 0, 6

# w0 = w[j-16] to w[j-13]
vperm 17, 18, 17, 21
addi 6, 4, 32
lvx 19, 0, 6

# w1 = w[j-12] to w[j-9]
vperm 18, 19, 18, 21
addi 6, 4, 48
lvx 20, 0, 6

# w2 = w[j-8] to w[j-5]
vperm 19, 20, 19, 21
addi 6, 4, 64
lvx 4, 0, 6

# w3 = w[j-4] to w[j-1]
vperm 20, 4, 20, 21
# aligned load
# vt0 = K[j-16] to K[j-13]
lvx 4, 0, 5
addi 6, 5, 16
# vt1 = K[j-12] to K[j-9]
lvx 5, 0, 6
addi 6, 5, 32
# vt2 = K[j-8] to K[j-5]
lvx 6, 0, 6
addi 6, 5, 48
# vt3 = K[j-4] to K[j-1]
lvx 7, 0, 6
# Add _w to K
vadduwm 26, 4, 17
vadduwm 27, 5, 18
vadduwm 28, 6, 19
vadduwm 29, 7, 20

vsldoi 23, 26, 26, 12
vsldoi 24, 26, 26, 8
vsldoi 25, 26, 26, 4
# iteration: #1
vsel 0, 15, 14, 13       # ch = Ch(e,f,g)
vxor 1, 9, 10            # intermediate Maj
vshasigmaw 3, 13, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 26         # vt2 = ch + kpw
vadduwm 5, 16, 3         # vt1 = h + bse
vsel 1, 10, 11, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 9, 1, 0    # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 12, 12, 7        # d = d + vt3
vadduwm 16, 7, 8         # h = vt3 + vt4

# iteration: #2
vsel 0, 14, 13, 12       # ch = Ch(e,f,g)
vxor 1, 16, 9            # intermediate Maj
vshasigmaw 3, 12, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 23         # vt2 = ch + kpw
vadduwm 5, 15, 3         # vt1 = h + bse
vsel 1, 9, 10, 1         # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 16, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 11, 11, 7        # d = d + vt3
vadduwm 15, 7, 8         # h = vt3 + vt4

# iteration: #3
vsel 0, 13, 12, 11       # ch = Ch(e,f,g)
vxor 1, 15, 16           # intermediate Maj
vshasigmaw 3, 11, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 24         # vt2 = ch + kpw
vadduwm 5, 14, 3         # vt1 = h + bse
vsel 1, 16, 9, 1         # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 15, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 10, 10, 7        # d = d + vt3
vadduwm 14, 7, 8         # h = vt3 + vt4

# iteration: #4
vsel 0, 12, 11, 10       # ch = Ch(e,f,g)
vxor 1, 14, 15           # intermediate Maj
vshasigmaw 3, 10, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 25         # vt2 = ch + kpw
vadduwm 5, 13, 3         # vt1 = h + bse
vsel 1, 15, 16, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 14, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 9, 9, 7          # d = d + vt3
vadduwm 13, 7, 8         # h = vt3 + vt4
vsldoi 23, 27, 27, 12
vsldoi 24, 27, 27, 8
vsldoi 25, 27, 27, 4
# iteration: #5
vsel 0, 11, 10, 9        # ch = Ch(e,f,g)
vxor 1, 13, 14           # intermediate Maj
vshasigmaw 3, 9, 1, 0xF  # bse = BigSigma1(e)
vadduwm 6, 0, 27         # vt2 = ch + kpw
vadduwm 5, 12, 3         # vt1 = h + bse
vsel 1, 14, 15, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 13, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 16, 16, 7        # d = d + vt3
vadduwm 12, 7, 8         # h = vt3 + vt4

# iteration: #6
vsel 0, 10, 9, 16        # ch = Ch(e,f,g)
vxor 1, 12, 13           # intermediate Maj
vshasigmaw 3, 16, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 23         # vt2 = ch + kpw
vadduwm 5, 11, 3         # vt1 = h + bse
vsel 1, 13, 14, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 12, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 15, 15, 7        # d = d + vt3
vadduwm 11, 7, 8         # h = vt3 + vt4

# iteration: #7
vsel 0, 9, 16, 15        # ch = Ch(e,f,g)
vxor 1, 11, 12           # intermediate Maj
vshasigmaw 3, 15, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 24         # vt2 = ch + kpw
vadduwm 5, 10, 3         # vt1 = h + bse
vsel 1, 12, 13, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 11, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 14, 14, 7        # d = d + vt3
vadduwm 10, 7, 8         # h = vt3 + vt4

# iteration: #8
vsel 0, 16, 15, 14       # ch = Ch(e,f,g)
vxor 1, 10, 11           # intermediate Maj
vshasigmaw 3, 14, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 25         # vt2 = ch + kpw
vadduwm 5, 9, 3          # vt1 = h + bse
vsel 1, 11, 12, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 10, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 13, 13, 7        # d = d + vt3
vadduwm 9, 7, 8          # h = vt3 + vt4
vsldoi 23, 28, 28, 12
vsldoi 24, 28, 28, 8
vsldoi 25, 28, 28, 4
# iteration: #9
vsel 0, 15, 14, 13       # ch = Ch(e,f,g)
vxor 1, 9, 10            # intermediate Maj
vshasigmaw 3, 13, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 28         # vt2 = ch + kpw
vadduwm 5, 16, 3         # vt1 = h + bse
vsel 1, 10, 11, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 9, 1, 0    # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 12, 12, 7        # d = d + vt3
vadduwm 16, 7, 8         # h = vt3 + vt4

# iteration: #10
vsel 0, 14, 13, 12       # ch = Ch(e,f,g)
vxor 1, 16, 9            # intermediate Maj
vshasigmaw 3, 12, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 23         # vt2 = ch + kpw
vadduwm 5, 15, 3         # vt1 = h + bse
vsel 1, 9, 10, 1         # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 16, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 11, 11, 7        # d = d + vt3
vadduwm 15, 7, 8         # h = vt3 + vt4

# iteration: #11
vsel 0, 13, 12, 11       # ch = Ch(e,f,g)
vxor 1, 15, 16           # intermediate Maj
vshasigmaw 3, 11, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 24         # vt2 = ch + kpw
vadduwm 5, 14, 3         # vt1 = h + bse
vsel 1, 16, 9, 1         # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 15, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 10, 10, 7        # d = d + vt3
vadduwm 14, 7, 8         # h = vt3 + vt4

# iteration: #12
vsel 0, 12, 11, 10       # ch = Ch(e,f,g)
vxor 1, 14, 15           # intermediate Maj
vshasigmaw 3, 10, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 25         # vt2 = ch + kpw
vadduwm 5, 13, 3         # vt1 = h + bse
vsel 1, 15, 16, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 14, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 9, 9, 7          # d = d + vt3
vadduwm 13, 7, 8         # h = vt3 + vt4
vsldoi 23, 29, 29, 12
vsldoi 24, 29, 29, 8
vsldoi 25, 29, 29, 4
# iteration: #13
vsel 0, 11, 10, 9        # ch = Ch(e,f,g)
vxor 1, 13, 14           # intermediate Maj
vshasigmaw 3, 9, 1, 0xF  # bse = BigSigma1(e)
vadduwm 6, 0, 29         # vt2 = ch + kpw
vadduwm 5, 12, 3         # vt1 = h + bse
vsel 1, 14, 15, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 13, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 16, 16, 7        # d = d + vt3
vadduwm 12, 7, 8         # h = vt3 + vt4

# iteration: #14
vsel 0, 10, 9, 16        # ch = Ch(e,f,g)
vxor 1, 12, 13           # intermediate Maj
vshasigmaw 3, 16, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 23         # vt2 = ch + kpw
vadduwm 5, 11, 3         # vt1 = h + bse
vsel 1, 13, 14, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 12, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 15, 15, 7        # d = d + vt3
vadduwm 11, 7, 8         # h = vt3 + vt4

# iteration: #15
vsel 0, 9, 16, 15        # ch = Ch(e,f,g)
vxor 1, 11, 12           # intermediate Maj
vshasigmaw 3, 15, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 24         # vt2 = ch + kpw
vadduwm 5, 10, 3         # vt1 = h + bse
vsel 1, 12, 13, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 11, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 14, 14, 7        # d = d + vt3
vadduwm 10, 7, 8         # h = vt3 + vt4

# iteration: #16
vsel 0, 16, 15, 14       # ch = Ch(e,f,g)
vxor 1, 10, 11           # intermediate Maj
vshasigmaw 3, 14, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 25         # vt2 = ch + kpw
vadduwm 5, 9, 3          # vt1 = h + bse
vsel 1, 11, 12, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 10, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 13, 13, 7        # d = d + vt3
vadduwm 9, 7, 8          # h = vt3 + vt4
# j will be a multiple of 4 for loading words.
# Whenever read, advance the pointer (e.g.: when j is used in CALC_4W)
li 8, 16*4

# Rolling rounds 16 to 64
li 6, (64-16)/8
mtctr 6

.align 4
.Loop1:
# Load aligned of K[j]
lvx 4, 8, 5

# Advance j
addi 8, 8, 16

# b = w[j-15], w[j-14], w[j-13], w[j-12]
vsldoi 5, 18, 17, 12

# c = w[j-7], w[j-6], w[j-5], w[j-4]
vsldoi 6, 20, 19, 12

# d = w[j-2], w[j-1], w[j-4], w[j-3]
vsldoi 7, 20, 20, 8

# b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
vshasigmaw 5, 5, 0, 0

# d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
vshasigmaw 7, 7, 0, 0xf

# c = s0(w[j-15]) + w[j-7],
#     s0(w[j-14]) + w[j-6],
#     s0(w[j-13]) + w[j-5],
#     s0(w[j-12]) + w[j-4]
vadduwm 6, 5, 6

# c = s0(w[j-15]) + w[j-7] + w[j-16],
#     s0(w[j-14]) + w[j-6] + w[j-15],
#     s0(w[j-13]) + w[j-5] + w[j-14],
#     s0(w[j-12]) + w[j-4] + w[j-13]
vadduwm 6, 6, 17

# e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]),  // w[j]
#     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]),  // w[j+1]
#     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]),  // UNDEFINED
#     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])   // UNDEFINED
vadduwm 8, 6, 7

# At this point, e[0] and e[1] are the correct values to be stored at w[j]
# and w[j+1].
# e[2] and e[3] are not considered.
# b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
vshasigmaw 5, 8, 0, 0xf

# v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
xxmrgld 39, 37, 39

# c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]),  // w[j]
#     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]),  // w[j+1]
#     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),    // w[j+2]
#     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])   // w[j+3]
vadduwm 6, 6, 7

# Updating w0 to w3 to hold the new previous 16 values from w.
vmr 17, 18
vmr 18, 19
vmr 19, 20
vmr 20, 6

# Store K + w to v9 (4 values at once)
vadduwm 22, 6, 4

vsldoi 23, 22, 22, 12
vsldoi 24, 22, 22, 8
vsldoi 25, 22, 22, 4
# iteration: #17 #25 #33 #41 #49 #57
vsel 0, 15, 14, 13       # ch = Ch(e,f,g)
vxor 1, 9, 10            # intermediate Maj
vshasigmaw 3, 13, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 22         # vt2 = ch + kpw
vadduwm 5, 16, 3         # vt1 = h + bse
vsel 1, 10, 11, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 9, 1, 0    # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 12, 12, 7        # d = d + vt3
vadduwm 16, 7, 8         # h = vt3 + vt4

# iteration: #18 #26 #34 #42 #50 #58
vsel 0, 14, 13, 12       # ch = Ch(e,f,g)
vxor 1, 16, 9            # intermediate Maj
vshasigmaw 3, 12, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 23         # vt2 = ch + kpw
vadduwm 5, 15, 3         # vt1 = h + bse
vsel 1, 9, 10, 1         # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 16, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 11, 11, 7        # d = d + vt3
vadduwm 15, 7, 8         # h = vt3 + vt4

# iteration: #19 #27 #35 #43 #51 #59
vsel 0, 13, 12, 11       # ch = Ch(e,f,g)
vxor 1, 15, 16           # intermediate Maj
vshasigmaw 3, 11, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 24         # vt2 = ch + kpw
vadduwm 5, 14, 3         # vt1 = h + bse
vsel 1, 16, 9, 1         # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 15, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 10, 10, 7        # d = d + vt3
vadduwm 14, 7, 8         # h = vt3 + vt4

# iteration: #20 #28 #36 #44 #52 #60
vsel 0, 12, 11, 10       # ch = Ch(e,f,g)
vxor 1, 14, 15           # intermediate Maj
vshasigmaw 3, 10, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 25         # vt2 = ch + kpw
vadduwm 5, 13, 3         # vt1 = h + bse
vsel 1, 15, 16, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 14, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 9, 9, 7          # d = d + vt3
vadduwm 13, 7, 8         # h = vt3 + vt4
# Load aligned of K[j]
lvx 4, 8, 5

# Advance j
addi 8, 8, 16

# b = w[j-15], w[j-14], w[j-13], w[j-12]
vsldoi 5, 18, 17, 12

# c = w[j-7], w[j-6], w[j-5], w[j-4]
vsldoi 6, 20, 19, 12

# d = w[j-2], w[j-1], w[j-4], w[j-3]
vsldoi 7, 20, 20, 8

# b = s0(w[j-15]) , s0(w[j-14]) , s0(w[j-13]) , s0(w[j-12])
vshasigmaw 5, 5, 0, 0

# d = s1(w[j-2]) , s1(w[j-1]) , s1(w[j-4]) , s1(w[j-3])
vshasigmaw 7, 7, 0, 0xf

# c = s0(w[j-15]) + w[j-7],
#     s0(w[j-14]) + w[j-6],
#     s0(w[j-13]) + w[j-5],
#     s0(w[j-12]) + w[j-4]
vadduwm 6, 5, 6

# c = s0(w[j-15]) + w[j-7] + w[j-16],
#     s0(w[j-14]) + w[j-6] + w[j-15],
#     s0(w[j-13]) + w[j-5] + w[j-14],
#     s0(w[j-12]) + w[j-4] + w[j-13]
vadduwm 6, 6, 17

# e = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]),  // w[j]
#     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]),  // w[j+1]
#     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j-4]),  // UNDEFINED
#     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j-3])   // UNDEFINED
vadduwm 8, 6, 7

# At this point, e[0] and e[1] are the correct values to be stored at w[j]
# and w[j+1].
# e[2] and e[3] are not considered.
# b = s1(w[j]) , s1(w[j+1]) , UNDEFINED , UNDEFINED
vshasigmaw 5, 8, 0, 0xf

# v5 = s1(w[j-2]) , s1(w[j-1]) , s1(w[j]) , s1(w[j+1])
xxmrgld 39, 37, 39

# c = s0(w[j-15]) + w[j-7] + w[j-16] + s1(w[j-2]),  // w[j]
#     s0(w[j-14]) + w[j-6] + w[j-15] + s1(w[j-1]),  // w[j+1]
#     s0(w[j-13]) + w[j-5] + w[j-14] + s1(w[j]),    // w[j+2]
#     s0(w[j-12]) + w[j-4] + w[j-13] + s1(w[j+1])   // w[j+3]
vadduwm 6, 6, 7

# Updating w0 to w3 to hold the new previous 16 values from w.
vmr 17, 18
vmr 18, 19
vmr 19, 20
vmr 20, 6

# Store K + w to v9 (4 values at once)
vadduwm 22, 6, 4

vsldoi 23, 22, 22, 12
vsldoi 24, 22, 22, 8
vsldoi 25, 22, 22, 4
# iteration: #21 #29 #37 #45 #53 #61
vsel 0, 11, 10, 9        # ch = Ch(e,f,g)
vxor 1, 13, 14           # intermediate Maj
vshasigmaw 3, 9, 1, 0xF  # bse = BigSigma1(e)
vadduwm 6, 0, 22         # vt2 = ch + kpw
vadduwm 5, 12, 3         # vt1 = h + bse
vsel 1, 14, 15, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 13, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 16, 16, 7        # d = d + vt3
vadduwm 12, 7, 8         # h = vt3 + vt4

# iteration: #22 #30 #38 #46 #54 #62
vsel 0, 10, 9, 16        # ch = Ch(e,f,g)
vxor 1, 12, 13           # intermediate Maj
vshasigmaw 3, 16, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 23         # vt2 = ch + kpw
vadduwm 5, 11, 3         # vt1 = h + bse
vsel 1, 13, 14, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 12, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 15, 15, 7        # d = d + vt3
vadduwm 11, 7, 8         # h = vt3 + vt4

# iteration: #23 #31 #39 #47 #55 #63
vsel 0, 9, 16, 15        # ch = Ch(e,f,g)
vxor 1, 11, 12           # intermediate Maj
vshasigmaw 3, 15, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 24         # vt2 = ch + kpw
vadduwm 5, 10, 3         # vt1 = h + bse
vsel 1, 12, 13, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 11, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 14, 14, 7        # d = d + vt3
vadduwm 10, 7, 8         # h = vt3 + vt4

# iteration: #24 #32 #40 #48 #56 #64
vsel 0, 16, 15, 14       # ch = Ch(e,f,g)
vxor 1, 10, 11           # intermediate Maj
vshasigmaw 3, 14, 1, 0xF # bse = BigSigma1(e)
vadduwm 6, 0, 25         # vt2 = ch + kpw
vadduwm 5, 9, 3          # vt1 = h + bse
vsel 1, 11, 12, 1        # maj = Maj(a,b,c)
vadduwm 7, 5, 6          # vt3 = h + bse + ch + kpw
vshasigmaw 2, 10, 1, 0   # bsa = BigSigma0(a)
vadduwm 8, 2, 1          # vt4 = bsa + maj
vadduwm 13, 13, 7        # d = d + vt3
vadduwm 9, 7, 8          # h = vt3 + vt4

bdnz .Loop1
# Update hash STATE

li 6, 16
li 7, 32

lvsr 21, 0, 3
lvx 4, 0, 3        # vt0 = STATE[0]..STATE[3]
lvx 3, 6, 3        # vt5 = STATE[4]..STATE[8]
vperm 4, 3, 4, 21
lvx 2, 7, 3        # vt5 = STATE[4]..STATE[8]
vperm 3, 2, 3, 21

vmrglw 5, 10, 9    # vt1 = {a, b, ?, ?}
vmrglw 6, 12, 11   # vt2 = {c, d, ?, ?}
vmrglw 7, 14, 13   # vt3 = {e, f, ?, ?}
vmrglw 8, 16, 15   # vt4 = {g, h, ?, ?}
xxmrgld 37, 38, 37 # vt1 = {a, b, c, d}
xxmrgld 39, 40, 39 # vt3 = {e, f, g, h}
# vt0 = {a+STATE[0], b+STATE[1], c+STATE[2], d+STATE[3]}
vadduwm 4, 4, 5

# vt5 = {e+STATE[4], f+STATE[5], g+STATE[6], h+STATE[7]}
vadduwm 3, 3, 7

mfvrwz 22, 4       # aux = a+STATE[0]
stw 22, 8(3)       # update h[3]

# vt6 = {b+STATE[1], c+STATE[2], d+STATE[3], a+STATE[0]}
vsldoi 2, 4, 4, 12
mfvrwz 22, 2       # aux = b+STATE[1]
stw 22, 12(3)      # update h[2]

# vt6 = {c+STATE[2], d+STATE[3], a+STATE[0], b+STATE[1]}
vsldoi 2, 2, 2, 12
mfvrwz 22, 2       # aux = c+STATE[2]
stw 22, 0(3)       # update h[1]

# vt6 = {d+STATE[3], a+STATE[0], b+STATE[1], c+STATE[2]}
vsldoi 2, 2, 2, 12
mfvrwz 22, 2       # aux = d+STATE[3]
stw 22, 4(3)       # update h[0]

mfvrwz 22, 3       # aux = e+STATE[4]
stw 22, 24(3)      # update h[7]

# vt6 = {f+STATE[5], g+STATE[6], h+STATE[7], e+STATE[4]}
vsldoi 2, 3, 3, 12
mfvrwz 22, 2       # aux = f+STATE[5]
stw 22, 28(3)      # update h[6]

# vt6 = {g+STATE[6], h+STATE[7], e+STATE[4], f+STATE[5]}
vsldoi 2, 2, 2, 12
mfvrwz 22, 2       # aux = g+STATE[6]
stw 22, 16(3)      # update h[5]

# vt6 = {h+STATE[7], e+STATE[4], f+STATE[5], g+STATE[6]}
vsldoi 2, 2, 2, 12
mfvrwz 22, 2       # aux = h+STATE[7]
stw 22, 20(3)      # update h[4]
# Restoring non-volatile registers
li 0, -176; lvx 29, 1, 0
li 0, -160; lvx 28, 1, 0
li 0, -144; lvx 27, 1, 0
li 0, -128; lvx 26, 1, 0
li 0, -112; lvx 25, 1, 0
li 0, -96; lvx 24, 1, 0
li 0, -80; lvx 23, 1, 0
li 0, -64; lvx 22, 1, 0
li 0, -48; lvx 21, 1, 0
li 0, -32; lvx 20, 1, 0

blr
.size sha256_compress_ppc, . - sha256_compress_ppc
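For readers following the generated assembly: each unrolled "iteration:" block above is one standard FIPS 180-4 SHA-256 round, and the loop's four-word section computes the standard message schedule w[j] = s1(w[j-2]) + w[j-7] + s0(w[j-15]) + w[j-16] four elements at a time. Below is a minimal C++ sketch of a single round, reusing the names from the comments (ch, maj, bse, bsa, and kpw = K[j] + w[j]); it is illustrative only and is not part of the patch:

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
        return (x >> n) | (x << (32 - n));
    }

    // One SHA-256 round. s[0..7] = {a,b,c,d,e,f,g,h}; kpw = K[j] + w[j].
    static void sha256_round(uint32_t s[8], uint32_t kpw)
    {
        const uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        const uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        const uint32_t ch  = (e & f) ^ (~e & g);                           // vsel: Ch(e,f,g)
        const uint32_t maj = (a & b) ^ (a & c) ^ (b & c);                  // vxor+vsel: Maj(a,b,c)
        const uint32_t bse = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); // vshasigmaw: BigSigma1(e)
        const uint32_t bsa = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); // vshasigmaw: BigSigma0(a)

        const uint32_t vt3 = h + bse + ch + kpw; // T1 in FIPS 180-4
        const uint32_t vt4 = bsa + maj;          // T2 in FIPS 180-4

        // The assembly's "d = d + vt3" and "h = vt3 + vt4" are these same
        // updates: the unrolled code renames registers instead of rotating,
        // so old d becomes the new e and the new a is T1 + T2.
        s[7] = g; s[6] = f; s[5] = e; s[4] = d + vt3;
        s[3] = c; s[2] = b; s[1] = a; s[0] = vt3 + vt4;
    }

In the vectorized schedule, s0 = rotr7 ^ rotr18 ^ shr3 and s1 = rotr17 ^ rotr19 ^ shr10, which is what the two vshasigmaw forms (last operand 0 and 0xf) select.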