I have 700 items and I loop through the 700 items for each I obtain the item' three attributes and perform some basic calculations. I have implemented this using two techniques:
1) Three 700-element arrays, one array for each of the three attributes. So:
item0.a = array1[0]
item0.b = array2[0]
item0.e = array3[0]
2) One 2100-element array containing data for the three attributes consecutively. So:
item0.a = array[(0*3)+0]
item0.b = array[(0*3)+1]
item0.e = array[(0*3)+2]
Now the three item attributes a
, b
and e
are used together within the loop- therefore it would make sense that if you store them in one array the performance should be better than if you use the three-array technique (due to spatial locality). However:
Here is the code for the 2100-array technique:
unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array[2100];
//I have left out code for simplicity. You can assume by now the array is populated.
start = __rdtscp(&x);
for(int i=0; i < 700; i++){
unsigned short j = i * 3;
unsigned int a = array[j + 0];
unsigned int b = array[j + 1];
data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
unsigned int e = array[j + 2];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
and here is the code for the three 700-element arrays technique:
unsigned int x;
unsigned int y;
double c = 0;
double d = 0;
bool data_for_all_items = true;
unsigned long long start = 0;
unsigned long long finish = 0;
unsigned int array1[700];
unsigned int array2[700];
unsigned int array3[700];
//I have left out code for simplicity. You can assume by now the arrays are populated.
start = __rdtscp(&x);
for(int i=0; i < 700; i++){
unsigned int a= array1[i]; //Array 1
unsigned int b= array2[i]; //Array 2
data_for_all_items = data_for_all_items & (a!= -1 & b != -1);
unsigned int e = array3[i]; //Array 3
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
Why isn't the technique using one-2100 element array faster? It should be because the three attributes are used together, per each 700 item.
I used MSVC 2012, Win 7 64
Assembly for 3x 700-element array technique:
start = __rdtscp(&x);
rdtscp
shl rdx,20h
lea r8,[this]
or rax,rdx
mov dword ptr [r8],ecx
mov r8d,8ch
mov r9,rax
lea rdx,[rbx+0Ch]
for(int i=0; i < 700; i++){
sub rdi,rbx
unsigned int a = array1[i];
unsigned int b = array2[i];
data_for_all_items = data_for_all_items & (a != -1 & b != -1);
cmp dword ptr [rdi+rdx-0Ch],0FFFFFFFFh
lea rdx,[rdx+14h]
setne cl
cmp dword ptr [rdi+rdx-1Ch],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-18h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-10h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdi+rdx-14h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-20h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-1Ch],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-18h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-10h],0FFFFFFFFh
setne al
and cl,al
cmp dword ptr [rdx-14h],0FFFFFFFFh
setne al
and cl,al
and r15b,cl
dec r8
jne 013F26DA53h
unsigned int e = array3[i];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
Assembler for the 2100-element array technique:
start = __rdtscp(&x);
rdtscp
lea r8,[this]
shl rdx,20h
or rax,rdx
mov dword ptr [r8],ecx
for(int i=0; i < 700; i++){
xor r8d,r8d
mov r10,rax
unsigned short j = i*3;
movzx ecx,r8w
add cx,cx
lea edx,[rcx+r8]
unsigned int a = array[j + 0];
unsigned int b = array[j + 1];
data_for_all_items = data_for_all_items & (best_ask != -1 & best_bid != -1);
movzx ecx,dx
cmp dword ptr [r9+rcx*4+4],0FFFFFFFFh
setne dl
cmp dword ptr [r9+rcx*4],0FFFFFFFFh
setne al
inc r8d
and dl,al
and r14b,dl
cmp r8d,2BCh
jl 013F05DA10h
unsigned int e = array[pos + 2];
c += (a * e);
d += (b * e);
}
finish = __rdtscp(&y);
rdtscp
shl rdx,20h
lea r8,[y]
or rax,rdx
mov dword ptr [r8],ecx
Edit: Given your assembly code, the second loop is five times unrolled. The unrolled version could run faster on an out-of-order execution CPU such as any modern x86/x86-64 CPU.
The second code is vectorisable - two elements of each array could be loaded at each iteration in one XMM register each. Since modern CPUs use SSE for both scalar and vector FP arithmetic, this cuts the number of cycles roughly in half. With an AVX-capable CPU four doubles could be loaded in an YMM register and therefore the number of cycles should be cut in four.
The first loop is not vectorisable along i
since the value of a
in iteration i+1
comes from a location 3 elements after the one where the value of a
in iteration i
comes from. In that case vectorisation requires gathered vector loads are those are only supported in the AVX2 instruction set.
Using proper data structures is crucial when programming CPUs with vector capabilities. Converting codes like your first loop into something like your second loop is 90% of the job that one has to do in order to get good performance on Intel Xeon Phi which has very wide vector registers but awfully slow in-order execution engine.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With