I am experimenting with a program to see if its caching behaviour is consistent with my conceptual understanding.
To do this I am using the Perf command:
perf stat -e cache-misses ./a.out
to record the cache-miss ratio of the following simple C program:
#include <stdlib.h>

/*
 * Fills a 10000 x 10000 array of doubles in row-major order.
 * Used to measure cache-miss behaviour under `perf stat -e cache-misses`.
 *
 * Fixes vs. the original snippet:
 *  - #include <stdlib.h>: without it, malloc is implicitly declared
 *    (invalid since C99) and assumed to return int, which is UB on
 *    LP64 platforms where pointers are wider than int.
 *  - The malloc result is checked before use (an 800 MB request can
 *    legitimately fail).
 *  - The buffer is freed before exit.
 *
 * Returns 0 on success, 1 if the allocation fails.
 */
int main() {
    int N = 10000;
    /* Cast one factor to size_t so the multiplication is done in
       size_t from the start; sizeof(double) * 10000 * 10000 = 800 MB. */
    double *arr = malloc(sizeof(double) * (size_t)N * (size_t)N);
    if (arr == NULL) {
        return 1;  /* allocation failed — nothing to measure */
    }
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            /* i*N + j stays below INT_MAX for N = 10000 (max ~1e8). */
            arr[i * N + j] = 10.0;
        }
    }
    free(arr);
    return 0;
}
I get a cache-miss ratio of 50.212%. If I change the array access pattern as follows:
arr[j * N + i]
I get that the cache-miss ratio is 22.206%.
These results are surprising to me.
The answer is pretty simple: the compiler optimized out your assignments. Here is how the disassembly looks for your code:
10 int main() {
0x00000000004004d6 <+0>: mov $0x2710,%edx
0x00000000004004db <+5>: jmp 0x4004e7 <main+17>
15 for(int j = 0; j < N; j++){
0x00000000004004dd <+7>: sub $0x1,%eax
0x00000000004004e0 <+10>: jne 0x4004dd <main+7>
14 for(int i = 0; i < N; i++) {
0x00000000004004e2 <+12>: sub $0x1,%edx
0x00000000004004e5 <+15>: je 0x4004ee <main+24>
10 int main() {
0x00000000004004e7 <+17>: mov $0x2710,%eax
0x00000000004004ec <+22>: jmp 0x4004dd <main+7>
16 arr[i * N + j] = 10.0;
17 }
18 }
19 return 0;
20 }
0x00000000004004ee <+24>: mov $0x0,%eax
0x00000000004004f3 <+29>: retq
As you can see there are no assembler instructions generated for the line arr[i * N + j] = 10.0;
, so those cache misses you observe with perf are unrelated.
The fix is quite easy. Simply add volatile
to the pointer declaration, forcing the compiler to generate the assignments, i.e.:
volatile double *arr = malloc(sizeof(double) * N * N);
The disassembly now:
10 int main() {
0x0000000000400526 <+0>: sub $0x8,%rsp
11 int N = 10000;
12 volatile double *arr = malloc(sizeof(double) * N * N);
0x000000000040052a <+4>: mov $0x2faf0800,%edi
0x000000000040052f <+9>: callq 0x400410 <malloc@plt>
0x0000000000400534 <+14>: mov $0x0,%edx
16 arr[i * N + j] = 10.0;
0x0000000000400539 <+19>: movsd 0xc7(%rip),%xmm0 # 0x400608
0x0000000000400541 <+27>: jmp 0x40055f <main+57>
0x0000000000400543 <+29>: movslq %edx,%rcx
0x0000000000400546 <+32>: lea (%rax,%rcx,8),%rcx
0x000000000040054a <+36>: movsd %xmm0,(%rcx)
0x000000000040054e <+40>: add $0x1,%edx
15 for(int j = 0; j < N; j++){
0x0000000000400551 <+43>: cmp %esi,%edx
0x0000000000400553 <+45>: jne 0x400543 <main+29>
0x0000000000400555 <+47>: mov %esi,%edx
14 for(int i = 0; i < N; i++) {
0x0000000000400557 <+49>: cmp $0x5f5e100,%esi
0x000000000040055d <+55>: je 0x400567 <main+65>
0x000000000040055f <+57>: lea 0x2710(%rdx),%esi
0x0000000000400565 <+63>: jmp 0x400543 <main+29>
17 }
18 }
19 return 0;
20 }
0x0000000000400567 <+65>: mov $0x0,%eax
0x000000000040056c <+70>: add $0x8,%rsp
0x0000000000400570 <+74>: retq
And there are many more cache misses as well.
Running your test just once can give very noisy results. You should run your measurements a few times and take the median. There are benchmarking frameworks, such as Google Benchmark, that automate this — please have a look.
And the last one. Both of your patterns, like i * N + j
and j * N + i
are easily recognizable by the CPU's hardware prefetcher, so the cache-miss ratio in both cases should be quite similar...
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With