Updated RDRAND and RDSEED under NASM
Port rdrand.S to Solaris. Port rdrand.S to X32. The X32 port is responsible for the loop unwinding. The unwind generates a 32-byte block (X64 and X32) or a 16-byte block (X86). On X32, it increases throughput by 100% (doubles it). On X86 and X64, throughput increases by about 6%. Anything over 4 machine words slows things down. pull/392/head
parent
4bcaabbe26
commit
f9773d2961
|
|
@ -1,26 +1,42 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
IS_LINUX=$(uname -s | grep -i -c linux)
|
||||
IS_SOLARIS=$(uname -s | grep -i -c sunos)
|
||||
IS_DARWIN=$(uname -s | grep -i -c darwin)
|
||||
IS_CYGWIN=$(uname -s | grep -i -c cygwin)
|
||||
|
||||
rm -f rdrand-x86.o rdrand-x32.o rdrand-x64.o &>/dev/null
|
||||
|
||||
NASM=$(which nasm 2>&1)
|
||||
if [[ ! -f "$NASM" ]]; then
|
||||
echo "Unable to locate Nasm"
|
||||
[ "$0" = "$BASH_SOURCE" ] && exit 1 || return 1
|
||||
fi
|
||||
|
||||
if [[ "$IS_LINUX" -eq "1" ]]; then
|
||||
echo "Building rdrand and rdseed modules for Linux"
|
||||
nasm -f elf32 rdrand.S -DX86 -g -o rdrand-x86.o
|
||||
nasm -f elfx32 rdrand.S -DX32 -g -o rdrand-x32.o
|
||||
nasm -f elf64 rdrand.S -DX64 -g -o rdrand-x64.o
|
||||
echo "Building rdrand and rdseed modules for Linux"
|
||||
nasm -f elf32 rdrand.S -DX86 -g -o rdrand-x86.o
|
||||
nasm -f elfx32 rdrand.S -DX32 -g -o rdrand-x32.o
|
||||
nasm -f elf64 rdrand.S -DX64 -g -o rdrand-x64.o
|
||||
fi
|
||||
|
||||
if [[ "$IS_SOLARIS" -eq "1" ]]; then
|
||||
echo "Building rdrand and rdseed modules for Solaris"
|
||||
nasm -f elf32 rdrand.S -DX86 -o rdrand-x86.o
|
||||
nasm -f elfx32 rdrand.S -DX32 -o rdrand-x32.o
|
||||
nasm -f elf64 rdrand.S -DX64 -o rdrand-x64.o
|
||||
fi
|
||||
|
||||
if [[ "$IS_DARWIN" -eq "1" ]]; then
|
||||
echo "Building rdrand and rdseed modules for Darwin"
|
||||
nasm -f macho32 rdrand.S -DDARWIN -DX86 -g -o rdrand-x86.o
|
||||
nasm -f macho64 rdrand.S -DDARWIN -DX64 -g -o rdrand-x64.o
|
||||
echo "Building rdrand and rdseed modules for Darwin"
|
||||
nasm -f macho32 rdrand.S -DDARWIN -DX86 -g -o rdrand-x86.o
|
||||
nasm -f macho64 rdrand.S -DDARWIN -DX64 -g -o rdrand-x64.o
|
||||
fi
|
||||
|
||||
if [[ "$IS_CYGWIN" -eq "1" ]]; then
|
||||
echo "Building rdrand and rdseed modules for Cygwin"
|
||||
nasm -f win32 rdrand.S -DCYGWIN -DX86 -g -o rdrand-x86.o
|
||||
nasm -f win64 rdrand.S -DCYGWIN -DX64 -g -o rdrand-x64.o
|
||||
echo "Building rdrand and rdseed modules for Cygwin"
|
||||
nasm -f win32 rdrand.S -DCYGWIN -DX86 -g -o rdrand-x86.o
|
||||
nasm -f win64 rdrand.S -DCYGWIN -DX64 -g -o rdrand-x64.o
|
||||
fi
|
||||
|
||||
[ "$0" = "$BASH_SOURCE" ] && exit 0 || return 0
|
||||
|
|
|
|||
229
rdrand.S
229
rdrand.S
|
|
@ -24,27 +24,26 @@
|
|||
%ifdef X86 ;; Set via the command line
|
||||
%define arg1 [esp+04h]
|
||||
%define arg2 [esp+08h]
|
||||
%ifdef CYGWIN ;; Cygwin follows Windows ABI here, not Linux ABI
|
||||
%define buffer ecx
|
||||
%define bsize edx
|
||||
%else
|
||||
%define buffer edi
|
||||
%define bsize esi
|
||||
%endif
|
||||
%define lsize dl ;; Used for tail bytes, 1-byte constants
|
||||
%define MWSIZE 04h ;; machine word size
|
||||
|
||||
%elifdef X32 ;; Set via the command line
|
||||
%define buffer edi
|
||||
%define bsize esi
|
||||
%define buffer edi ;; Linux ABI
|
||||
%define bsize esi ;; Linux ABI
|
||||
%define lsize si
|
||||
%define MWSIZE 04h ;; machine word size
|
||||
|
||||
%elifdef X64 ;; Set via the command line
|
||||
%ifdef CYGWIN ;; Cygwin follows Windows ABI here, not Linux ABI
|
||||
%define buffer rcx
|
||||
%define bsize rdx
|
||||
%define buffer rcx ;; Windows ABI
|
||||
%define bsize rdx ;; Windows ABI
|
||||
%define lsize dx ;; Used for tail bytes, 2-byte constants
|
||||
%else
|
||||
%define buffer rdi
|
||||
%define bsize rsi
|
||||
%define buffer rdi ;; Linux ABI
|
||||
%define bsize rsi ;; Linux ABI
|
||||
%define lsize si ;; Used for tail bytes, 2-byte constants
|
||||
%endif
|
||||
%define MWSIZE 08h ;; machine word size
|
||||
|
||||
|
|
@ -62,10 +61,12 @@
|
|||
%define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
|
||||
%endif
|
||||
|
||||
%ifdef CYGWIN and X86
|
||||
%ifdef CYGWIN
|
||||
%ifdef X86
|
||||
%define NASM_RDRAND_GenerateBlock _NASM_RDRAND_GenerateBlock
|
||||
%define NASM_RDSEED_GenerateBlock _NASM_RDSEED_GenerateBlock
|
||||
%endif
|
||||
%endif
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
|
@ -83,9 +84,43 @@ NASM_RDRAND_GenerateBlock:
|
|||
mov buffer, arg1
|
||||
mov bsize, arg2
|
||||
|
||||
.GenerateBlock_Top:
|
||||
;; A block of 16-bytes appears to be optimal. Adding
|
||||
;; more rdrand calls degrades performance.
|
||||
cmp bsize, 16
|
||||
jb .GenerateBlock_4
|
||||
|
||||
cmp bsize, 0
|
||||
.GenerateBlock_16:
|
||||
|
||||
.Call_RDRAND_EAX_4:
|
||||
rdrand eax
|
||||
jnc .Call_RDRAND_EAX_4
|
||||
mov [buffer+0], eax
|
||||
|
||||
.Call_RDRAND_EAX_3:
|
||||
rdrand eax
|
||||
jnc .Call_RDRAND_EAX_3
|
||||
mov [buffer+4], eax
|
||||
|
||||
.Call_RDRAND_EAX_2:
|
||||
rdrand eax
|
||||
jnc .Call_RDRAND_EAX_2
|
||||
mov [buffer+8], eax
|
||||
|
||||
.Call_RDRAND_EAX_1:
|
||||
rdrand eax
|
||||
jnc .Call_RDRAND_EAX_1
|
||||
mov [buffer+12], eax
|
||||
|
||||
sub bsize, 16
|
||||
add buffer, 16
|
||||
|
||||
cmp bsize, 16
|
||||
jae .GenerateBlock_16
|
||||
|
||||
;; Fewer than 16 bytes remain
|
||||
.GenerateBlock_4:
|
||||
|
||||
cmp lsize, 0
|
||||
je .GenerateBlock_Return
|
||||
|
||||
.Call_RDRAND_EAX_0:
|
||||
|
|
@ -93,25 +128,23 @@ NASM_RDRAND_GenerateBlock:
|
|||
rdrand eax
|
||||
jnc .Call_RDRAND_EAX_0
|
||||
|
||||
.RDRAND_succeeded:
|
||||
|
||||
cmp bsize, MWSIZE
|
||||
cmp lsize, MWSIZE
|
||||
jb .Partial_Machine_Word
|
||||
|
||||
.Full_Machine_Word:
|
||||
|
||||
mov [buffer], eax
|
||||
add buffer, MWSIZE
|
||||
sub bsize, MWSIZE
|
||||
sub lsize, MWSIZE
|
||||
|
||||
;; Continue
|
||||
jmp .GenerateBlock_Top
|
||||
jmp .GenerateBlock_4
|
||||
|
||||
;; 1,2,3 bytes remain for X86
|
||||
;; 1,2,3 bytes remain
|
||||
.Partial_Machine_Word:
|
||||
|
||||
;; Test bit 1 to see if size is at least 2
|
||||
test bsize, 2
|
||||
test lsize, 2
|
||||
jz .Bit_1_Not_Set
|
||||
|
||||
mov [buffer], ax
|
||||
|
|
@ -121,7 +154,7 @@ NASM_RDRAND_GenerateBlock:
|
|||
.Bit_1_Not_Set:
|
||||
|
||||
;; Test bit 0 to see if size is at least 1
|
||||
test bsize, 1
|
||||
test lsize, 1
|
||||
jz .Bit_0_Not_Set
|
||||
|
||||
mov [buffer], al
|
||||
|
|
@ -150,32 +183,66 @@ NASM_RDRAND_GenerateBlock:
|
|||
|
||||
;; No need for Load_Arguments due to fastcall
|
||||
|
||||
.GenerateBlock_Top:
|
||||
;; A block of 32-bytes appears to be optimal. Adding
|
||||
;; more rdrand calls degrades performance.
|
||||
cmp bsize, 32
|
||||
jb .GenerateBlock_8
|
||||
|
||||
cmp bsize, 0
|
||||
.GenerateBlock_32:
|
||||
|
||||
.Call_RDRAND_RAX_4:
|
||||
rdrand rax
|
||||
jnc .Call_RDRAND_RAX_4
|
||||
mov [buffer+0], rax
|
||||
|
||||
.Call_RDRAND_RAX_3:
|
||||
rdrand rax
|
||||
jnc .Call_RDRAND_RAX_3
|
||||
mov [buffer+8], rax
|
||||
|
||||
.Call_RDRAND_RAX_2:
|
||||
rdrand rax
|
||||
jnc .Call_RDRAND_RAX_2
|
||||
mov [buffer+16], rax
|
||||
|
||||
.Call_RDRAND_RAX_1:
|
||||
rdrand rax
|
||||
jnc .Call_RDRAND_RAX_1
|
||||
mov [buffer+24], rax
|
||||
|
||||
sub bsize, 32
|
||||
add buffer, 32
|
||||
|
||||
cmp bsize, 32
|
||||
jae .GenerateBlock_32
|
||||
|
||||
;; Fewer than 32 bytes remain
|
||||
.GenerateBlock_8:
|
||||
|
||||
cmp lsize, 0
|
||||
je .GenerateBlock_Return
|
||||
|
||||
.Call_RDRAND_RAX_0:
|
||||
rdrand rax
|
||||
jnc .Call_RDRAND_RAX_0
|
||||
|
||||
cmp bsize, MWSIZE
|
||||
cmp lsize, MWSIZE
|
||||
jb .Partial_Machine_Word
|
||||
|
||||
.Full_Machine_Word:
|
||||
|
||||
mov [buffer], rax
|
||||
add buffer, MWSIZE
|
||||
sub bsize, MWSIZE
|
||||
sub lsize, MWSIZE
|
||||
|
||||
;; Continue
|
||||
jmp .GenerateBlock_Top
|
||||
jmp .GenerateBlock_8
|
||||
|
||||
;; 1,2,3,4,5,6,7 bytes remain
|
||||
.Partial_Machine_Word:
|
||||
|
||||
;; Test bit 2 to see if size is at least 4
|
||||
test bsize, 4
|
||||
test lsize, 4
|
||||
jz .Bit_2_Not_Set
|
||||
|
||||
mov [buffer], eax
|
||||
|
|
@ -185,7 +252,7 @@ NASM_RDRAND_GenerateBlock:
|
|||
.Bit_2_Not_Set:
|
||||
|
||||
;; Test bit 1 to see if size is at least 2
|
||||
test bsize, 2
|
||||
test lsize, 2
|
||||
jz .Bit_1_Not_Set
|
||||
|
||||
mov [buffer], ax
|
||||
|
|
@ -195,7 +262,7 @@ NASM_RDRAND_GenerateBlock:
|
|||
.Bit_1_Not_Set:
|
||||
|
||||
;; Test bit 0 to see if size is at least 1
|
||||
test bsize, 1
|
||||
test lsize, 1
|
||||
jz .Bit_0_Not_Set
|
||||
|
||||
mov [buffer], al
|
||||
|
|
@ -227,9 +294,43 @@ NASM_RDSEED_GenerateBlock:
|
|||
mov buffer, arg1
|
||||
mov bsize, arg2
|
||||
|
||||
.GenerateBlock_Top:
|
||||
;; A block of 16-bytes appears to be optimal. Adding
|
||||
;; more rdseed calls degrades performance.
|
||||
cmp bsize, 16
|
||||
jb .GenerateBlock_4
|
||||
|
||||
cmp bsize, 0
|
||||
.GenerateBlock_16:
|
||||
|
||||
.Call_RDSEED_EAX_4:
|
||||
rdseed eax
|
||||
jnc .Call_RDSEED_EAX_4
|
||||
mov [buffer+0], eax
|
||||
|
||||
.Call_RDSEED_EAX_3:
|
||||
rdseed eax
|
||||
jnc .Call_RDSEED_EAX_3
|
||||
mov [buffer+4], eax
|
||||
|
||||
.Call_RDSEED_EAX_2:
|
||||
rdseed eax
|
||||
jnc .Call_RDSEED_EAX_2
|
||||
mov [buffer+8], eax
|
||||
|
||||
.Call_RDSEED_EAX_1:
|
||||
rdseed eax
|
||||
jnc .Call_RDSEED_EAX_1
|
||||
mov [buffer+12], eax
|
||||
|
||||
sub bsize, 16
|
||||
add buffer, 16
|
||||
|
||||
cmp bsize, 16
|
||||
jae .GenerateBlock_16
|
||||
|
||||
;; Fewer than 16 bytes remain
|
||||
.GenerateBlock_4:
|
||||
|
||||
cmp lsize, 0
|
||||
je .GenerateBlock_Return
|
||||
|
||||
.Call_RDSEED_EAX_0:
|
||||
|
|
@ -237,25 +338,23 @@ NASM_RDSEED_GenerateBlock:
|
|||
rdseed eax
|
||||
jnc .Call_RDSEED_EAX_0
|
||||
|
||||
.RDSEED_succeeded:
|
||||
|
||||
cmp bsize, MWSIZE
|
||||
cmp lsize, MWSIZE
|
||||
jb .Partial_Machine_Word
|
||||
|
||||
.Full_Machine_Word:
|
||||
|
||||
mov [buffer], eax
|
||||
add buffer, MWSIZE
|
||||
sub bsize, MWSIZE
|
||||
sub lsize, MWSIZE
|
||||
|
||||
;; Continue
|
||||
jmp .GenerateBlock_Top
|
||||
jmp .GenerateBlock_4
|
||||
|
||||
;; 1,2,3 bytes remain for X86
|
||||
;; 1,2,3 bytes remain
|
||||
.Partial_Machine_Word:
|
||||
|
||||
;; Test bit 1 to see if size is at least 2
|
||||
test bsize, 2
|
||||
test lsize, 2
|
||||
jz .Bit_1_Not_Set
|
||||
|
||||
mov [buffer], ax
|
||||
|
|
@ -265,7 +364,7 @@ NASM_RDSEED_GenerateBlock:
|
|||
.Bit_1_Not_Set:
|
||||
|
||||
;; Test bit 0 to see if size is at least 1
|
||||
test bsize, 1
|
||||
test lsize, 1
|
||||
jz .Bit_0_Not_Set
|
||||
|
||||
mov [buffer], al
|
||||
|
|
@ -294,32 +393,66 @@ NASM_RDSEED_GenerateBlock:
|
|||
|
||||
;; No need for Load_Arguments due to fastcall
|
||||
|
||||
.GenerateBlock_Top:
|
||||
;; A block of 32-bytes appears to be optimal. Adding
|
||||
;; more rdseed calls degrades performance.
|
||||
cmp bsize, 32
|
||||
jb .GenerateBlock_8
|
||||
|
||||
cmp bsize, 0
|
||||
.GenerateBlock_32:
|
||||
|
||||
.Call_RDSEED_RAX_4:
|
||||
rdseed rax
|
||||
jnc .Call_RDSEED_RAX_4
|
||||
mov [buffer+0], rax
|
||||
|
||||
.Call_RDSEED_RAX_3:
|
||||
rdseed rax
|
||||
jnc .Call_RDSEED_RAX_3
|
||||
mov [buffer+8], rax
|
||||
|
||||
.Call_RDSEED_RAX_2:
|
||||
rdseed rax
|
||||
jnc .Call_RDSEED_RAX_2
|
||||
mov [buffer+16], rax
|
||||
|
||||
.Call_RDSEED_RAX_1:
|
||||
rdseed rax
|
||||
jnc .Call_RDSEED_RAX_1
|
||||
mov [buffer+24], rax
|
||||
|
||||
sub bsize, 32
|
||||
add buffer, 32
|
||||
|
||||
cmp bsize, 32
|
||||
jae .GenerateBlock_32
|
||||
|
||||
;; Fewer than 32 bytes remain
|
||||
.GenerateBlock_8:
|
||||
|
||||
cmp lsize, 0
|
||||
je .GenerateBlock_Return
|
||||
|
||||
.Call_RDSEED_RAX_0:
|
||||
rdseed rax
|
||||
jnc .Call_RDSEED_RAX_0
|
||||
|
||||
cmp bsize, MWSIZE
|
||||
cmp lsize, MWSIZE
|
||||
jb .Partial_Machine_Word
|
||||
|
||||
.Full_Machine_Word:
|
||||
|
||||
mov [buffer], rax
|
||||
add buffer, MWSIZE
|
||||
sub bsize, MWSIZE
|
||||
sub lsize, MWSIZE
|
||||
|
||||
;; Continue
|
||||
jmp .GenerateBlock_Top
|
||||
jmp .GenerateBlock_8
|
||||
|
||||
;; 1,2,3,4,5,6,7 bytes remain
|
||||
.Partial_Machine_Word:
|
||||
|
||||
;; Test bit 2 to see if size is at least 4
|
||||
test bsize, 4
|
||||
test lsize, 4
|
||||
jz .Bit_2_Not_Set
|
||||
|
||||
mov [buffer], eax
|
||||
|
|
@ -329,7 +462,7 @@ NASM_RDSEED_GenerateBlock:
|
|||
.Bit_2_Not_Set:
|
||||
|
||||
;; Test bit 1 to see if size is at least 2
|
||||
test bsize, 2
|
||||
test lsize, 2
|
||||
jz .Bit_1_Not_Set
|
||||
|
||||
mov [buffer], ax
|
||||
|
|
@ -339,7 +472,7 @@ NASM_RDSEED_GenerateBlock:
|
|||
.Bit_1_Not_Set:
|
||||
|
||||
;; Test bit 0 to see if size is at least 1
|
||||
test bsize, 1
|
||||
test lsize, 1
|
||||
jz .Bit_0_Not_Set
|
||||
|
||||
mov [buffer], al
|
||||
|
|
|
|||
Loading…
Reference in New Issue