Updated RDRAND and RDSEED under NASM

Port rdrand.S to Solaris
Port rdrand.S to X32
The X32 port is responsible for the loop unrolling. The unrolled loop generates a 32-byte block (X64 and X32) or a 16-byte block (X86) per iteration. On X32, it increases throughput by 100% (doubles it). On X86 and X64, throughput increases by about 6%. Anything over 4 machine words slows things down.
pull/392/head
Jeffrey Walton 2017-03-14 06:07:37 -04:00
parent 4bcaabbe26
commit f9773d2961
No known key found for this signature in database
GPG Key ID: B36AB348921B1838
2 changed files with 207 additions and 58 deletions

View File

@ -1,26 +1,42 @@
#!/usr/bin/env bash
# Assembles rdrand.S with NASM into per-architecture object files
# (rdrand-x86.o, rdrand-x32.o, rdrand-x64.o) for the detected host platform.
# The script ends with "exit" when executed directly and "return" when
# sourced, so it can be used either way.

# Each IS_* flag holds the number of `uname -s` output lines that match the
# platform name (case-insensitive); the branches below run when it equals 1.
IS_LINUX=$(uname -s | grep -i -c linux)
IS_SOLARIS=$(uname -s | grep -i -c sunos)
IS_DARWIN=$(uname -s | grep -i -c darwin)
IS_CYGWIN=$(uname -s | grep -i -c cygwin)
# Remove object files left over from a previous run; all output is discarded.
rm -f rdrand-x86.o rdrand-x32.o rdrand-x64.o &>/dev/null
# Locate the assembler. The check below only verifies that `which` printed the
# path of an existing file; the build commands themselves invoke plain `nasm`.
NASM=$(which nasm 2>&1)
if [[ ! -f "$NASM" ]]; then
echo "Unable to locate Nasm"
# $0 equals $BASH_SOURCE only when the script is executed directly: exit then,
# otherwise return to the sourcing shell.
[ "$0" = "$BASH_SOURCE" ] && exit 1 || return 1
fi
# Linux: elf32, elfx32 and elf64 objects, selected in rdrand.S via
# -DX86 / -DX32 / -DX64, assembled with -g.
if [[ "$IS_LINUX" -eq "1" ]]; then
echo "Building rdrand and rdseed modules for Linux"
nasm -f elf32 rdrand.S -DX86 -g -o rdrand-x86.o
nasm -f elfx32 rdrand.S -DX32 -g -o rdrand-x32.o
nasm -f elf64 rdrand.S -DX64 -g -o rdrand-x64.o
# NOTE(review): the next four lines repeat the four above verbatim. This looks
# like the old and new sides of a diff rendered together - confirm before
# deduplicating.
echo "Building rdrand and rdseed modules for Linux"
nasm -f elf32 rdrand.S -DX86 -g -o rdrand-x86.o
nasm -f elfx32 rdrand.S -DX32 -g -o rdrand-x32.o
nasm -f elf64 rdrand.S -DX64 -g -o rdrand-x64.o
fi
# Solaris: same three ELF output formats as Linux, but without -g.
if [[ "$IS_SOLARIS" -eq "1" ]]; then
echo "Building rdrand and rdseed modules for Solaris"
nasm -f elf32 rdrand.S -DX86 -o rdrand-x86.o
nasm -f elfx32 rdrand.S -DX32 -o rdrand-x32.o
nasm -f elf64 rdrand.S -DX64 -o rdrand-x64.o
fi
# Darwin: Mach-O objects for X86 and X64 only (no X32 object is built);
# -DDARWIN is passed to rdrand.S in addition to the architecture define.
if [[ "$IS_DARWIN" -eq "1" ]]; then
echo "Building rdrand and rdseed modules for Darwin"
nasm -f macho32 rdrand.S -DDARWIN -DX86 -g -o rdrand-x86.o
nasm -f macho64 rdrand.S -DDARWIN -DX64 -g -o rdrand-x64.o
# NOTE(review): the next three lines repeat the three above verbatim - see the
# same caveat about diff rendering; confirm before deduplicating.
echo "Building rdrand and rdseed modules for Darwin"
nasm -f macho32 rdrand.S -DDARWIN -DX86 -g -o rdrand-x86.o
nasm -f macho64 rdrand.S -DDARWIN -DX64 -g -o rdrand-x64.o
fi
# Cygwin: win32/win64 objects for X86 and X64 only; -DCYGWIN is passed to
# rdrand.S in addition to the architecture define.
if [[ "$IS_CYGWIN" -eq "1" ]]; then
echo "Building rdrand and rdseed modules for Cygwin"
nasm -f win32 rdrand.S -DCYGWIN -DX86 -g -o rdrand-x86.o
nasm -f win64 rdrand.S -DCYGWIN -DX64 -g -o rdrand-x64.o
# NOTE(review): the next three lines repeat the three above verbatim - likely
# a diff-rendering artifact; confirm before deduplicating.
echo "Building rdrand and rdseed modules for Cygwin"
nasm -f win32 rdrand.S -DCYGWIN -DX86 -g -o rdrand-x86.o
nasm -f win64 rdrand.S -DCYGWIN -DX64 -g -o rdrand-x64.o
fi
# Finish with status 0: exit when executed directly, return when sourced.
# The exit status of the nasm invocations above is not checked.
[ "$0" = "$BASH_SOURCE" ] && exit 0 || return 0

229
rdrand.S
View File

@ -24,27 +24,26 @@
%ifdef X86 ;; Set via the command line
%define arg1 [esp+04h]
%define arg2 [esp+08h]
%ifdef CYGWIN ;; Cygwin follows Windows ABI here, not Linux ABI
%define buffer ecx
%define bsize edx
%else
%define buffer edi
%define bsize esi
%endif
%define lsize dl ;; Used for tail bytes, 1-byte constants
%define MWSIZE 04h ;; machine word size
%elifdef X32 ;; Set via the command line
%define buffer edi
%define bsize esi
%define buffer edi ;; Linux ABI
%define bsize esi ;; Linux ABI
%define lsize si
%define MWSIZE 04h ;; machine word size
%elifdef X64 ;; Set via the command line
%ifdef CYGWIN ;; Cygwin follows Windows ABI here, not Linux ABI
%define buffer rcx
%define bsize rdx
%define buffer rcx ;; Windows ABI
%define bsize rdx ;; Windows ABI
%define lsize dx ;; Used for tail bytes, 2-byte constants
%else
%define buffer rdi
%define bsize rsi
%define buffer rdi ;; Linux ABI
%define bsize rsi ;; Linux ABI
%define lsize si ;; Used for tail bytes, 2-byte constants
%endif
%define MWSIZE 08h ;; machine word size
@ -62,10 +61,12 @@
%define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
%endif
%ifdef CYGWIN and X86
%ifdef CYGWIN
%ifdef X86
%define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
%define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
%endif
%endif
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -83,9 +84,43 @@ NASM_RDRAND_GenerateBlock:
mov buffer, arg1
mov bsize, arg2
.GenerateBlock_Top:
;; A block of 16-bytes appears to be optimal. Adding
;; more rdrand calls degrades performance.
cmp bsize, 16
jb .GenerateBlock_4
cmp bsize, 0
.GenerateBlock_16:
.Call_RDRAND_EAX_4:
rdrand eax
jnc .Call_RDRAND_EAX_4
mov [buffer+0], eax
.Call_RDRAND_EAX_3:
rdrand eax
jnc .Call_RDRAND_EAX_3
mov [buffer+4], eax
.Call_RDRAND_EAX_2:
rdrand eax
jnc .Call_RDRAND_EAX_2
mov [buffer+8], eax
.Call_RDRAND_EAX_1:
rdrand eax
jnc .Call_RDRAND_EAX_1
mov [buffer+12], eax
sub bsize, 16
add buffer, 16
cmp bsize, 16
jae .GenerateBlock_16
;; Fewer than 16 bytes remain
.GenerateBlock_4:
cmp lsize, 0
je .GenerateBlock_Return
.Call_RDRAND_EAX_0:
@ -93,25 +128,23 @@ NASM_RDRAND_GenerateBlock:
rdrand eax
jnc .Call_RDRAND_EAX_0
.RDRAND_succeeded:
cmp bsize, MWSIZE
cmp lsize, MWSIZE
jb .Partial_Machine_Word
.Full_Machine_Word:
mov [buffer], eax
add buffer, MWSIZE
sub bsize, MWSIZE
sub lsize, MWSIZE
;; Continue
jmp .GenerateBlock_Top
jmp .GenerateBlock_4
;; 1,2,3 bytes remain for X86
;; 1,2,3 bytes remain
.Partial_Machine_Word:
;; Test bit 1 to see if size is at least 2
test bsize, 2
test lsize, 2
jz .Bit_1_Not_Set
mov [buffer], ax
@ -121,7 +154,7 @@ NASM_RDRAND_GenerateBlock:
.Bit_1_Not_Set:
;; Test bit 0 to see if size is at least 1
test bsize, 1
test lsize, 1
jz .Bit_0_Not_Set
mov [buffer], al
@ -150,32 +183,66 @@ NASM_RDRAND_GenerateBlock:
;; No need for Load_Arguments due to fastcall
.GenerateBlock_Top:
;; A block of 32-bytes appears to be optimal. Adding
;; more rdrand calls degrades performance.
cmp bsize, 32
jb .GenerateBlock_8
cmp bsize, 0
.GenerateBlock_32:
.Call_RDRAND_RAX_4:
rdrand rax
jnc .Call_RDRAND_RAX_4
mov [buffer+0], rax
.Call_RDRAND_RAX_3:
rdrand rax
jnc .Call_RDRAND_RAX_3
mov [buffer+8], rax
.Call_RDRAND_RAX_2:
rdrand rax
jnc .Call_RDRAND_RAX_2
mov [buffer+16], rax
.Call_RDRAND_RAX_1:
rdrand rax
jnc .Call_RDRAND_RAX_1
mov [buffer+24], rax
sub bsize, 32
add buffer, 32
cmp bsize, 32
jae .GenerateBlock_32
;; Fewer than 32 bytes remain
.GenerateBlock_8:
cmp lsize, 0
je .GenerateBlock_Return
.Call_RDRAND_RAX_0:
rdrand rax
jnc .Call_RDRAND_RAX_0
cmp bsize, MWSIZE
cmp lsize, MWSIZE
jb .Partial_Machine_Word
.Full_Machine_Word:
mov [buffer], rax
add buffer, MWSIZE
sub bsize, MWSIZE
sub lsize, MWSIZE
;; Continue
jmp .GenerateBlock_Top
jmp .GenerateBlock_8
;; 1,2,3,4,5,6,7 bytes remain
.Partial_Machine_Word:
;; Test bit 2 to see if size is at least 4
test bsize, 4
test lsize, 4
jz .Bit_2_Not_Set
mov [buffer], eax
@ -185,7 +252,7 @@ NASM_RDRAND_GenerateBlock:
.Bit_2_Not_Set:
;; Test bit 1 to see if size is at least 2
test bsize, 2
test lsize, 2
jz .Bit_1_Not_Set
mov [buffer], ax
@ -195,7 +262,7 @@ NASM_RDRAND_GenerateBlock:
.Bit_1_Not_Set:
;; Test bit 0 to see if size is at least 1
test bsize, 1
test lsize, 1
jz .Bit_0_Not_Set
mov [buffer], al
@ -227,9 +294,43 @@ NASM_RDSEED_GenerateBlock:
mov buffer, arg1
mov bsize, arg2
.GenerateBlock_Top:
;; A block of 16-bytes appears to be optimal. Adding
;; more rdseed calls degrades performance.
cmp bsize, 16
jb .GenerateBlock_4
cmp bsize, 0
.GenerateBlock_16:
.Call_RDSEED_EAX_4:
rdseed eax
jnc .Call_RDSEED_EAX_4
mov [buffer+0], eax
.Call_RDSEED_EAX_3:
rdseed eax
jnc .Call_RDSEED_EAX_3
mov [buffer+4], eax
.Call_RDSEED_EAX_2:
rdseed eax
jnc .Call_RDSEED_EAX_2
mov [buffer+8], eax
.Call_RDSEED_EAX_1:
rdseed eax
jnc .Call_RDSEED_EAX_1
mov [buffer+12], eax
sub bsize, 16
add buffer, 16
cmp bsize, 16
jae .GenerateBlock_16
;; Fewer than 16 bytes remain
.GenerateBlock_4:
cmp lsize, 0
je .GenerateBlock_Return
.Call_RDSEED_EAX_0:
@ -237,25 +338,23 @@ NASM_RDSEED_GenerateBlock:
rdseed eax
jnc .Call_RDSEED_EAX_0
.RDSEED_succeeded:
cmp bsize, MWSIZE
cmp lsize, MWSIZE
jb .Partial_Machine_Word
.Full_Machine_Word:
mov [buffer], eax
add buffer, MWSIZE
sub bsize, MWSIZE
sub lsize, MWSIZE
;; Continue
jmp .GenerateBlock_Top
jmp .GenerateBlock_4
;; 1,2,3 bytes remain for X86
;; 1,2,3 bytes remain
.Partial_Machine_Word:
;; Test bit 1 to see if size is at least 2
test bsize, 2
test lsize, 2
jz .Bit_1_Not_Set
mov [buffer], ax
@ -265,7 +364,7 @@ NASM_RDSEED_GenerateBlock:
.Bit_1_Not_Set:
;; Test bit 0 to see if size is at least 1
test bsize, 1
test lsize, 1
jz .Bit_0_Not_Set
mov [buffer], al
@ -294,32 +393,66 @@ NASM_RDSEED_GenerateBlock:
;; No need for Load_Arguments due to fastcall
.GenerateBlock_Top:
;; A block of 32-bytes appears to be optimal. Adding
;; more rdseed calls degrades performance.
cmp bsize, 32
jb .GenerateBlock_8
cmp bsize, 0
.GenerateBlock_32:
.Call_RDSEED_RAX_4:
rdseed rax
jnc .Call_RDSEED_RAX_4
mov [buffer+0], rax
.Call_RDSEED_RAX_3:
rdseed rax
jnc .Call_RDSEED_RAX_3
mov [buffer+8], rax
.Call_RDSEED_RAX_2:
rdseed rax
jnc .Call_RDSEED_RAX_2
mov [buffer+16], rax
.Call_RDSEED_RAX_1:
rdseed rax
jnc .Call_RDSEED_RAX_1
mov [buffer+24], rax
sub bsize, 32
add buffer, 32
cmp bsize, 32
jae .GenerateBlock_32
;; Fewer than 32 bytes remain
.GenerateBlock_8:
cmp lsize, 0
je .GenerateBlock_Return
.Call_RDSEED_RAX_0:
rdseed rax
jnc .Call_RDSEED_RAX_0
cmp bsize, MWSIZE
cmp lsize, MWSIZE
jb .Partial_Machine_Word
.Full_Machine_Word:
mov [buffer], rax
add buffer, MWSIZE
sub bsize, MWSIZE
sub lsize, MWSIZE
;; Continue
jmp .GenerateBlock_Top
jmp .GenerateBlock_8
;; 1,2,3,4,5,6,7 bytes remain
.Partial_Machine_Word:
;; Test bit 2 to see if size is at least 4
test bsize, 4
test lsize, 4
jz .Bit_2_Not_Set
mov [buffer], eax
@ -329,7 +462,7 @@ NASM_RDSEED_GenerateBlock:
.Bit_2_Not_Set:
;; Test bit 1 to see if size is at least 2
test bsize, 2
test lsize, 2
jz .Bit_1_Not_Set
mov [buffer], ax
@ -339,7 +472,7 @@ NASM_RDSEED_GenerateBlock:
.Bit_1_Not_Set:
;; Test bit 0 to see if size is at least 1
test bsize, 1
test lsize, 1
jz .Bit_0_Not_Set
mov [buffer], al