From f9773d29618fb400f45c3acce30d1acb61e501f9 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Tue, 14 Mar 2017 06:07:37 -0400 Subject: [PATCH] Updated RDRAND and RDSEED under NASM Port rdrand.S to Solaris Port rdrand.S to X32 The X32 port is responsible for the loop unwinding. The unwind generates a 32-byte block (X64 and X32) or 16-byte block (X86). On X32, it increases throughput by 100% (doubles it). On X86 and X64, throughput increases by about 6%. Anything over 4 machine words slows things down. --- rdrand-nasm.sh | 36 +++++--- rdrand.S | 229 ++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 207 insertions(+), 58 deletions(-) diff --git a/rdrand-nasm.sh b/rdrand-nasm.sh index 615b8c1e..c0b50588 100755 --- a/rdrand-nasm.sh +++ b/rdrand-nasm.sh @@ -1,26 +1,42 @@ #!/usr/bin/env bash IS_LINUX=$(uname -s | grep -i -c linux) +IS_SOLARIS=$(uname -s | grep -i -c sunos) IS_DARWIN=$(uname -s | grep -i -c darwin) IS_CYGWIN=$(uname -s | grep -i -c cygwin) rm -f rdrand-x86.o rdrand-x32.o rdrand-x64.o &>/dev/null +NASM=$(which nasm 2>&1) +if [[ ! 
-f "$NASM" ]]; then + echo "Unable to locate Nasm" + [ "$0" = "$BASH_SOURCE" ] && exit 1 || return 1 +fi + if [[ "$IS_LINUX" -eq "1" ]]; then - echo "Building rdrand and rdseed modules for Linux" - nasm -f elf32 rdrand.S -DX86 -g -o rdrand-x86.o - nasm -f elfx32 rdrand.S -DX32 -g -o rdrand-x32.o - nasm -f elf64 rdrand.S -DX64 -g -o rdrand-x64.o + echo "Building rdrand and rdseed modules for Linux" + nasm -f elf32 rdrand.S -DX86 -g -o rdrand-x86.o + nasm -f elfx32 rdrand.S -DX32 -g -o rdrand-x32.o + nasm -f elf64 rdrand.S -DX64 -g -o rdrand-x64.o +fi + +if [[ "$IS_SOLARIS" -eq "1" ]]; then + echo "Building rdrand and rdseed modules for Solaris" + nasm -f elf32 rdrand.S -DX86 -o rdrand-x86.o + nasm -f elfx32 rdrand.S -DX32 -o rdrand-x32.o + nasm -f elf64 rdrand.S -DX64 -o rdrand-x64.o fi if [[ "$IS_DARWIN" -eq "1" ]]; then - echo "Building rdrand and rdseed modules for Darwin" - nasm -f macho32 rdrand.S -DDARWIN -DX86 -g -o rdrand-x86.o - nasm -f macho64 rdrand.S -DDARWIN -DX64 -g -o rdrand-x64.o + echo "Building rdrand and rdseed modules for Darwin" + nasm -f macho32 rdrand.S -DDARWIN -DX86 -g -o rdrand-x86.o + nasm -f macho64 rdrand.S -DDARWIN -DX64 -g -o rdrand-x64.o fi if [[ "$IS_CYGWIN" -eq "1" ]]; then - echo "Building rdrand and rdseed modules for Cygwin" - nasm -f win32 rdrand.S -DCYGWIN -DX86 -g -o rdrand-x86.o - nasm -f win64 rdrand.S -DCYGWIN -DX64 -g -o rdrand-x64.o + echo "Building rdrand and rdseed modules for Cygwin" + nasm -f win32 rdrand.S -DCYGWIN -DX86 -g -o rdrand-x86.o + nasm -f win64 rdrand.S -DCYGWIN -DX64 -g -o rdrand-x64.o fi + +[ "$0" = "$BASH_SOURCE" ] && exit 0 || return 0 diff --git a/rdrand.S b/rdrand.S index 111a6d11..474a6832 100644 --- a/rdrand.S +++ b/rdrand.S @@ -24,27 +24,26 @@ %ifdef X86 ;; Set via the command line %define arg1 [esp+04h] %define arg2 [esp+08h] -%ifdef CYGWIN ;; Cygwin follows Windows ABI here, not Linux ABI %define buffer ecx %define bsize edx -%else -%define buffer edi -%define bsize esi -%endif +%define lsize dl 
;; Used for tail bytes, 1-byte constants %define MWSIZE 04h ;; machine word size %elifdef X32 ;; Set via the command line -%define buffer edi -%define bsize esi +%define buffer edi ;; Linux ABI +%define bsize esi ;; Linux ABI +%define lsize si %define MWSIZE 04h ;; machine word size %elifdef X64 ;; Set via the command line %ifdef CYGWIN ;; Cygwin follows Windows ABI here, not Linux ABI -%define buffer rcx -%define bsize rdx +%define buffer rcx ;; Windows ABI +%define bsize rdx ;; Windows ABI +%define lsize dx ;; Used for tail bytes, 2-byte constants %else -%define buffer rdi -%define bsize rsi +%define buffer rdi ;; Linux ABI +%define bsize rsi ;; Linux ABI +%define lsize si ;; Used for tail bytes, 2-byte constants %endif %define MWSIZE 08h ;; machine word size @@ -62,10 +61,12 @@ %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock %endif -%ifdef CYGWIN and X86 +%ifdef CYGWIN +%ifdef X86 %define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock %define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock %endif +%endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -83,9 +84,43 @@ NASM_RDRAND_GenerateBlock: mov buffer, arg1 mov bsize, arg2 -.GenerateBlock_Top: + ;; A block of 16-bytes appears to be optimal. Adding + ;; more rdrand calls degrades performance. 
+ cmp bsize, 16 + jb .GenerateBlock_4 - cmp bsize, 0 +.GenerateBlock_16: + +.Call_RDRAND_EAX_4: + rdrand eax + jnc .Call_RDRAND_EAX_4 + mov [buffer+0], eax + +.Call_RDRAND_EAX_3: + rdrand eax + jnc .Call_RDRAND_EAX_3 + mov [buffer+4], eax + +.Call_RDRAND_EAX_2: + rdrand eax + jnc .Call_RDRAND_EAX_2 + mov [buffer+8], eax + +.Call_RDRAND_EAX_1: + rdrand eax + jnc .Call_RDRAND_EAX_1 + mov [buffer+12], eax + + sub bsize, 16 + add buffer, 16 + + cmp bsize, 16 + jae .GenerateBlock_16 + + ;; Fewer than 16 bytes remain +.GenerateBlock_4: + + cmp lsize, 0 je .GenerateBlock_Return .Call_RDRAND_EAX_0: @@ -93,25 +128,23 @@ NASM_RDRAND_GenerateBlock: rdrand eax jnc .Call_RDRAND_EAX_0 -.RDRAND_succeeded: - - cmp bsize, MWSIZE + cmp lsize, MWSIZE jb .Partial_Machine_Word .Full_Machine_Word: mov [buffer], eax add buffer, MWSIZE - sub bsize, MWSIZE + sub lsize, MWSIZE ;; Continue - jmp .GenerateBlock_Top + jmp .GenerateBlock_4 - ;; 1,2,3 bytes remain for X86 + ;; 1,2,3 bytes remain .Partial_Machine_Word: ;; Test bit 1 to see if size is at least 2 - test bsize, 2 + test lsize, 2 jz .Bit_1_Not_Set mov [buffer], ax @@ -121,7 +154,7 @@ NASM_RDRAND_GenerateBlock: .Bit_1_Not_Set: ;; Test bit 0 to see if size is at least 1 - test bsize, 1 + test lsize, 1 jz .Bit_0_Not_Set mov [buffer], al @@ -150,32 +183,66 @@ NASM_RDRAND_GenerateBlock: ;; No need for Load_Arguments due to fastcall -.GenerateBlock_Top: + ;; A block of 32-bytes appears to be optimal. Adding + ;; more rdrand calls degrades performance. 
+ cmp bsize, 32 + jb .GenerateBlock_8 - cmp bsize, 0 +.GenerateBlock_32: + +.Call_RDRAND_RAX_4: + rdrand rax + jnc .Call_RDRAND_RAX_4 + mov [buffer+0], rax + +.Call_RDRAND_RAX_3: + rdrand rax + jnc .Call_RDRAND_RAX_3 + mov [buffer+8], rax + +.Call_RDRAND_RAX_2: + rdrand rax + jnc .Call_RDRAND_RAX_2 + mov [buffer+16], rax + +.Call_RDRAND_RAX_1: + rdrand rax + jnc .Call_RDRAND_RAX_1 + mov [buffer+24], rax + + sub bsize, 32 + add buffer, 32 + + cmp bsize, 32 + jae .GenerateBlock_32 + + ;; Fewer than 32 bytes remain +.GenerateBlock_8: + + cmp lsize, 0 je .GenerateBlock_Return .Call_RDRAND_RAX_0: rdrand rax jnc .Call_RDRAND_RAX_0 - cmp bsize, MWSIZE + cmp lsize, MWSIZE jb .Partial_Machine_Word .Full_Machine_Word: mov [buffer], rax add buffer, MWSIZE - sub bsize, MWSIZE + sub lsize, MWSIZE ;; Continue - jmp .GenerateBlock_Top + jmp .GenerateBlock_8 ;; 1,2,3,4,5,6,7 bytes remain .Partial_Machine_Word: ;; Test bit 2 to see if size is at least 4 - test bsize, 4 + test lsize, 4 jz .Bit_2_Not_Set mov [buffer], eax @@ -185,7 +252,7 @@ NASM_RDRAND_GenerateBlock: .Bit_2_Not_Set: ;; Test bit 1 to see if size is at least 2 - test bsize, 2 + test lsize, 2 jz .Bit_1_Not_Set mov [buffer], ax @@ -195,7 +262,7 @@ NASM_RDRAND_GenerateBlock: .Bit_1_Not_Set: ;; Test bit 0 to see if size is at least 1 - test bsize, 1 + test lsize, 1 jz .Bit_0_Not_Set mov [buffer], al @@ -227,9 +294,43 @@ NASM_RDSEED_GenerateBlock: mov buffer, arg1 mov bsize, arg2 -.GenerateBlock_Top: + ;; A block of 16-bytes appears to be optimal. Adding + ;; more rdseed calls degrades performance. 
+ cmp bsize, 16 + jb .GenerateBlock_4 - cmp bsize, 0 +.GenerateBlock_16: + +.Call_RDSEED_EAX_4: + rdseed eax + jnc .Call_RDSEED_EAX_4 + mov [buffer+0], eax + +.Call_RDSEED_EAX_3: + rdseed eax + jnc .Call_RDSEED_EAX_3 + mov [buffer+4], eax + +.Call_RDSEED_EAX_2: + rdseed eax + jnc .Call_RDSEED_EAX_2 + mov [buffer+8], eax + +.Call_RDSEED_EAX_1: + rdseed eax + jnc .Call_RDSEED_EAX_1 + mov [buffer+12], eax + + sub bsize, 16 + add buffer, 16 + + cmp bsize, 16 + jae .GenerateBlock_16 + + ;; Fewer than 16 bytes remain +.GenerateBlock_4: + + cmp lsize, 0 je .GenerateBlock_Return .Call_RDSEED_EAX_0: @@ -237,25 +338,23 @@ NASM_RDSEED_GenerateBlock: rdseed eax jnc .Call_RDSEED_EAX_0 -.RDSEED_succeeded: - - cmp bsize, MWSIZE + cmp lsize, MWSIZE jb .Partial_Machine_Word .Full_Machine_Word: mov [buffer], eax add buffer, MWSIZE - sub bsize, MWSIZE + sub lsize, MWSIZE ;; Continue - jmp .GenerateBlock_Top + jmp .GenerateBlock_4 - ;; 1,2,3 bytes remain for X86 + ;; 1,2,3 bytes remain .Partial_Machine_Word: ;; Test bit 1 to see if size is at least 2 - test bsize, 2 + test lsize, 2 jz .Bit_1_Not_Set mov [buffer], ax @@ -265,7 +364,7 @@ NASM_RDSEED_GenerateBlock: .Bit_1_Not_Set: ;; Test bit 0 to see if size is at least 1 - test bsize, 1 + test lsize, 1 jz .Bit_0_Not_Set mov [buffer], al @@ -294,32 +393,66 @@ NASM_RDSEED_GenerateBlock: ;; No need for Load_Arguments due to fastcall -.GenerateBlock_Top: + ;; A block of 32-bytes appears to be optimal. Adding + ;; more rdseed calls degrades performance. 
+ cmp bsize, 32 + jb .GenerateBlock_8 - cmp bsize, 0 +.GenerateBlock_32: + +.Call_RDSEED_RAX_4: + rdseed rax + jnc .Call_RDSEED_RAX_4 + mov [buffer+0], rax + +.Call_RDSEED_RAX_3: + rdseed rax + jnc .Call_RDSEED_RAX_3 + mov [buffer+8], rax + +.Call_RDSEED_RAX_2: + rdseed rax + jnc .Call_RDSEED_RAX_2 + mov [buffer+16], rax + +.Call_RDSEED_RAX_1: + rdseed rax + jnc .Call_RDSEED_RAX_1 + mov [buffer+24], rax + + sub bsize, 32 + add buffer, 32 + + cmp bsize, 32 + jae .GenerateBlock_32 + + ;; Fewer than 32 bytes remain +.GenerateBlock_8: + + cmp lsize, 0 je .GenerateBlock_Return .Call_RDSEED_RAX_0: rdseed rax jnc .Call_RDSEED_RAX_0 - cmp bsize, MWSIZE + cmp lsize, MWSIZE jb .Partial_Machine_Word .Full_Machine_Word: mov [buffer], rax add buffer, MWSIZE - sub bsize, MWSIZE + sub lsize, MWSIZE ;; Continue - jmp .GenerateBlock_Top + jmp .GenerateBlock_8 ;; 1,2,3,4,5,6,7 bytes remain .Partial_Machine_Word: ;; Test bit 2 to see if size is at least 4 - test bsize, 4 + test lsize, 4 jz .Bit_2_Not_Set mov [buffer], eax @@ -329,7 +462,7 @@ NASM_RDSEED_GenerateBlock: .Bit_2_Not_Set: ;; Test bit 1 to see if size is at least 2 - test bsize, 2 + test lsize, 2 jz .Bit_1_Not_Set mov [buffer], ax @@ -339,7 +472,7 @@ NASM_RDSEED_GenerateBlock: .Bit_1_Not_Set: ;; Test bit 0 to see if size is at least 1 - test bsize, 1 + test lsize, 1 jz .Bit_0_Not_Set mov [buffer], al