What are the costs of a failed store-to-load forwarding on recent x86 architectures?
In particular, store-to-load forwarding that fails because the load partly overlaps an earlier store, or because the earlier load or store cross some alignment boundary that causes the forwarding to fail.
Certainly there is a latency cost: how big is it? Is there also a throughput cost, e.g., does a failed store-to-load forwarding use additional resources that are then unavailable to other loads and stores, or even other non-memory operations?
Is there a difference when all the parts of the store come from the store buffer, versus the case where it's a mix of the store buffer and L1?
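For concreteness, here is a minimal sketch (my illustration, NASM-style syntax, not taken from any of the benchmarks below) of the two failure shapes in question, assuming rdi points at writable scratch memory:

    ; Partial overlap: the load is wider than the store, so part of its data
    ; is still in the store buffer and part is in L1 -- forwarding fails.
    mov     [rdi], al
    mov     eax, [rdi]

    ; Boundary crossing: a store/reload pair that straddles a 64-byte line
    ; may also defeat forwarding on some microarchitectures.
    mov     [rdi+62], eax
    mov     ecx, [rdi+62]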
This is not really a full answer, but it is still evidence that the penalty is visible.
A benchmark built with MSVC 2022, compiled with /std:c++latest:
#include <chrono>
#include <iostream>
struct alignas(16) S
{
    char* a;
    int* b;
};
extern "C" void init_fused_copy_unfused(int n, S & s2, S & s1);
extern "C" void init_fused_copy_fused(int n, S & s2, S & s1);
extern "C" void init_unfused_copy_unfused(int n, S & s2, S & s1);
extern "C" void init_unfused_copy_fused(int n, S & s2, S & s1);
int main()
{
    using namespace std::chrono;
    S s1, s2;
    constexpr int N = 1'000'000'000;
    auto t1 = system_clock::now();
    init_fused_copy_fused(N, s2, s1);
    auto t2 = system_clock::now();
    init_fused_copy_unfused(N, s2, s1);
    auto t3 = system_clock::now();
    init_unfused_copy_fused(N, s2, s1);
    auto t4 = system_clock::now();
    init_unfused_copy_unfused(N, s2, s1);
    auto t5 = system_clock::now();
    std::cout
        << "init fused copy fused     " << duration_cast<duration<double>>(t2 - t1) << "\n"
        << "init fused copy unfused   " << duration_cast<duration<double>>(t3 - t2) << "\n"
        << "init unfused copy fused   " << duration_cast<duration<double>>(t4 - t3) << "\n"
        << "init unfused copy unfused " << duration_cast<duration<double>>(t5 - t4) << "\n";
}
.code
c db 0
i dd 0

; the two pointer values stored contiguously, so one 16-byte load can fetch both
s dq byte ptr [c], dword ptr [i]

; Windows x64 calling convention: ecx = n, rdx = &s2, r8 = &s1

ALIGN 16
; 16-byte store of both pointers, 16-byte reload: sizes match, forwarding succeeds
init_fused_copy_fused PROC
    movups  xmm0, xmmword ptr [s]
    movups  xmmword ptr [r8], xmm0
    movups  xmm1, xmmword ptr [r8]
    movups  xmmword ptr [rdx], xmm1
    dec     ecx
    jnz     init_fused_copy_fused
    ret
init_fused_copy_fused ENDP

ALIGN 16
; two 8-byte stores, one 16-byte reload: the wide load overlaps both pending
; stores, so forwarding fails
init_unfused_copy_fused PROC
    lea     rax, byte ptr [c]
    mov     qword ptr [r8], rax
    lea     rax, dword ptr [i]
    mov     qword ptr [r8 + 8], rax
    movups  xmm1, xmmword ptr [r8]
    movups  xmmword ptr [rdx], xmm1
    dec     ecx
    jnz     init_unfused_copy_fused
    ret
init_unfused_copy_fused ENDP

ALIGN 16
; one 16-byte store, two 8-byte reloads: each narrow load is fully contained
; in the wide store, so forwarding can succeed
init_fused_copy_unfused PROC
    movups  xmm0, xmmword ptr [s]
    movups  xmmword ptr [r8], xmm0
    mov     rax, qword ptr [r8]
    mov     qword ptr [rdx], rax
    mov     rax, qword ptr [r8 + 8]
    mov     qword ptr [rdx + 8], rax
    dec     ecx
    jnz     init_fused_copy_unfused
    ret
init_fused_copy_unfused ENDP

ALIGN 16
; two 8-byte stores, two matching 8-byte reloads: sizes match, forwarding succeeds
init_unfused_copy_unfused PROC
    lea     rax, byte ptr [c]
    mov     qword ptr [r8], rax
    lea     rax, dword ptr [i]
    mov     qword ptr [r8 + 8], rax
    mov     rax, qword ptr [r8]
    mov     qword ptr [rdx], rax
    mov     rax, qword ptr [r8 + 8]
    mov     qword ptr [rdx + 8], rax
    dec     ecx
    jnz     init_unfused_copy_unfused
    ret
init_unfused_copy_unfused ENDP
END
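For reference, assuming the C++ part is in main.cpp and the MASM part in slf.asm (hypothetical file names), the benchmark can be built with something like:

    ml64 /c slf.asm
    cl /std:c++latest main.cpp slf.obj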
init fused copy fused     0.664739s
init fused copy unfused   0.935631s
init unfused copy fused   4.34326s
init unfused copy unfused 1.02741s
CPU: Intel(R) Core(TM) i7-8750H @ 2.20GHz
I interpret the results as follows: the slow case is init unfused, copy fused, where the 16-byte copy load overlaps two separate 8-byte stores that are still in the store buffer, so forwarding fails and that variant is over 4x slower than any other. In the other three variants every load either exactly matches a preceding store, or (init fused, copy unfused) is fully contained within one, and forwarding succeeds.
On the Intel Sandy Bridge family, store-forwarding stalls can't pipeline with other store-forwarding stalls: even on independent addresses, they conflict for throughput. See Store forwarding by example ("One more interesting experiment") for Ivy Bridge, and Alex's answer above for Coffee Lake (a Skylake derivative).
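As a sketch of what "conflict for throughput" means here (my illustration, NASM-style syntax, not taken from the linked experiments): two failed store-forwardings whose addresses and registers are completely independent, so their slow paths could in principle overlap, but on Sandy Bridge-family they serialize instead:

    mov     [rdi], al           ; narrow store
    mov     ebx, [rdi]          ; wider reload: forwarding fails (chain 1)
    mov     [rdi+128], cl       ; narrow store, independent address
    mov     edx, [rdi+128]      ; wider reload: forwarding fails (chain 2)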
But a failed (slow-path) store-forwarding doesn't seem to block successful store-forwarding. Testing on Skylake (i7-6700k), I made a test loop that includes two dependency chains: a dword store / qword reload at [rdi+64] that always fails forwarding, and same-size dword store/reload pairs at [rdi] that always succeed:
;; nasm -felf64 testloop.asm
;; ld -o testloop testloop.o
;; taskset -c 3 perf stat -etask-clock:u,context-switches:u,cpu-migrations:u,page-faults:u,cycles:u,branches:u,instructions:u,uops_issued.any:u,uops_executed.thread:u,idq.dsb_uops:u -r1 ./testloop
default rel
%ifdef __YASM_VER__
;    CPU Conroe AMD
    CPU Skylake AMD
%else
%use smartalign
alignmode p6, 64
%endif

global _start
_start:
    lea     rdi, [buf]
    mov     ebp, 100000000

align 64
.loop:
    mov     [rdi+64], ecx       ; dword store; the qword reload below always fails forwarding
;    mov     rcx, [rdi+64]      ; reload here: 16c. Or 16.8 if we *also* reload after the %rep block
%rep 3
    mov     [rdi], eax          ; same-size store/reload: forwarding succeeds
    mov     eax, [rdi]
%endrep
    mov     rcx, [rdi+64]       ; wider reload, forwarding fails. reload here: 15c
    dec     ebp
    jnz     .loop
.end:

;;NASM-only, not YASM: %if __BITS__ == 32
%ifidn __OUTPUT_FORMAT__, elf32
    mov     eax, 1
    xor     ebx, ebx
    int     0x80                ; sys_exit(0) 32-bit ABI
%else
    xor     edi, edi
    mov     eax, 231            ; __NR_exit_group from /usr/include/asm/unistd_64.h
    syscall                     ; sys_exit_group(0)
%endif

section .bss
align 4096
buf: resb 4096
Performance results:
$ t=testloop; asm-link -dn "$t".asm && taskset -c 3 perf stat --all-user -etask-clock,context-switches,cpu-migrations,page-faults,cycles,instructions,uops_issued.any,uops_executed.thread,ld_blocks.store_forward,resource_stalls.sb -r2 ./"$t"
+ nasm -felf64 -Worphan-labels testloop.asm
+ ld -o testloop testloop.o
testloop: file format elf64-x86-64
Disassembly of section .text:
0000000000401000 <_start>:
401000: 48 8d 3d f9 0f 00 00 lea rdi,[rip+0xff9] # 402000 <__bss_start>
401007: bd 00 e1 f5 05 mov ebp,0x5f5e100
40100c: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
401014: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
40101c: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
401024: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
40102c: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
401034: 0f 1f 84 00 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
40103c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
0000000000401040 <_start.loop>:
401040: 89 4f 40 mov DWORD PTR [rdi+0x40],ecx
401043: 89 07 mov DWORD PTR [rdi],eax
401045: 8b 07 mov eax,DWORD PTR [rdi]
401047: 89 07 mov DWORD PTR [rdi],eax
401049: 8b 07 mov eax,DWORD PTR [rdi]
40104b: 89 07 mov DWORD PTR [rdi],eax
40104d: 8b 07 mov eax,DWORD PTR [rdi]
40104f: 48 8b 4f 40 mov rcx,QWORD PTR [rdi+0x40]
401053: ff cd dec ebp
401055: 75 e9 jne 401040 <_start.loop>
0000000000401057 <_start.end>:
401057: 31 ff xor edi,edi
401059: b8 e7 00 00 00 mov eax,0xe7
40105e: 0f 05 syscall
Performance counter stats for './testloop' (2 runs):
385.85 msec task-clock # 0.999 CPUs utilized ( +- 0.02% )
0 context-switches # 0.000 /sec
0 cpu-migrations # 0.000 /sec
2 page-faults # 5.183 /sec
1,503,701,305 cycles # 3.897 GHz ( +- 0.01% )
1,000,000,130 instructions # 0.67 instructions per cycle ( +- 0.00% )
900,084,383 uops_issued.any # 2.333 G/sec ( +- 0.00% )
1,300,091,135 uops_executed.thread # 3.369 G/sec ( +- 0.00% )
99,933,928 ld_blocks.store_forward # 258.998 M/sec ( +- 0.02% )
443,686,304 resource_stalls.sb # 1.150 G/sec ( +- 4.87% )
0.386139 +- 0.000119 seconds time elapsed ( +- 0.03% )
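Reading the counters: 1,503.7M cycles for 100M iterations is ~15 cycles per iteration, matching the "reload here: 15c" comment, and ld_blocks.store_forward is almost exactly 1 per iteration: only the qword reload of the dword store at [rdi+64] fails. uops_issued.any is 9 per iteration (10 instructions, with dec/jnz macro-fused) and uops_executed.thread is 13, because each of the 4 stores splits into separate store-address and store-data uops in the executed domain. The key point is that the three successful store/reload pairs, themselves a serial dependency chain of roughly 4-5 cycles each, complete in the shadow of the single failed forwarding, so the loop runs at about the latency of the failing chain alone.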