Suppose I'm using AVX2's VGATHERDPS - this should load 8 single-precision floats using 8 DWORD indices.
What happens when the data to be loaded exists in different cache-lines? Is the instruction implemented as a hardware loop which fetches cache-lines one by one? Or, can it issue a load to multiple cache-lines at once?
I read a couple of papers which state the former (and that's the one which makes more sense to me), but I would like to know a bit more about this.
Link to one paper: http://arxiv.org/pdf/1401.7494.pdf
I did some benchmarking of the AVX gather instructions (on a Haswell CPU) and it seems to be a fairly simple brute force implementation - even when the elements to be loaded are contiguous it seems that there is still one read cycle per element, so performance is really no better than just doing scalar loads.
NB: this answer is now obsolete as things have changed considerably since Haswell. See the accepted answer for full details (unless you happen to be targeting Haswell CPUs).
Gather was first implemented with Haswell but was not optimized until Broadwell (the first generation after Haswell).
I wrote my own code to test gather (see below). Here is a summary on Skylake, SkylakeX (with a dedicated AVX512 port), and KNL systems.
scalar auto AVX2 AVX512
Skylake GCC 0.47 0.38 0.38 NA
SkylakeX GCC 0.56 0.23 0.35 0.24
KNL GCC 3.95 1.37 2.11 1.16
KNL ICC 3.92 1.17 2.31 1.17
From the table it's clear that in all cases gather loads are faster than scalar loads (for the benchmark I used).
I'm not sure how Intel implements gather internally. The masks don't seem to have an effect on performance for gather. That's one thing Intel could optimize (if you only read one scalar value to due the mask it should be faster than gathering all values and then using the mask.
The Intel manual shows some nice figures on gather
https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
DCU = L1 Data Cache Unit. MCU = mid-level = L2 cache. LLC = last-level = L3 cache. L3 is shared, L2 and L1d are per-core private.
Intel is just benchmarking gathers, not using the result for anything.
//gather.c
#include <stdio.h>
#include <omp.h>
#include <stdlib.h>
#define N 1024
#define R 1000000
void foo_auto(double * restrict a, double * restrict b, int *idx, int n);
void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n);
void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n);
void foo1(double * restrict a, double * restrict b, int *idx, int n);
void foo2(double * restrict a, double * restrict b, int *idx, int n);
void foo3(double * restrict a, double * restrict b, int *idx, int n);
double test(int *idx, void (*fp)(double * restrict a, double * restrict b, int *idx, int n)) {
double a[N];
double b[N];
double dtime;
for(int i=0; i<N; i++) a[i] = 1.0*N;
for(int i=0; i<N; i++) b[i] = 1.0;
fp(a, b, idx, N);
dtime = -omp_get_wtime();
for(int i=0; i<R; i++) fp(a, b, idx, N);
dtime += omp_get_wtime();
return dtime;
}
int main(void) {
//for(int i=0; i<N; i++) idx[i] = N - i - 1;
//for(int i=0; i<N; i++) idx[i] = i;
//for(int i=0; i<N; i++) idx[i] = rand()%N;
//for(int i=0; i<R; i++) foo2(a, b, idx, N);
int idx[N];
double dtime;
int ntests=2;
void (*fp[4])(double * restrict a, double * restrict b, int *idx, int n);
fp[0] = foo_auto;
fp[1] = foo_AVX2;
#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
fp[2] = foo_AVX512;
ntests=3;
#endif
for(int i=0; i<ntests; i++) {
for(int i=0; i<N; i++) idx[i] = 0;
test(idx, fp[i]);
dtime = test(idx, fp[i]);
printf("%.2f ", dtime);
for(int i=0; i<N; i++) idx[i] = i;
test(idx, fp[i]);
dtime = test(idx, fp[i]);
printf("%.2f ", dtime);
for(int i=0; i<N; i++) idx[i] = N-i-1;
test(idx, fp[i]);
dtime = test(idx, fp[i]);
printf("%.2f ", dtime);
for(int i=0; i<N; i++) idx[i] = rand()%N;
test(idx, fp[i]);
dtime = test(idx, fp[i]);
printf("%.2f\n", dtime);
}
for(int i=0; i<N; i++) idx[i] = 0;
test(idx, foo1);
dtime = test(idx, foo1);
printf("%.2f ", dtime);
for(int i=0; i<N; i++) idx[i] = i;
test(idx, foo2);
dtime = test(idx, foo2);
printf("%.2f ", dtime);
for(int i=0; i<N; i++) idx[i] = N-i-1;
test(idx, foo3);
dtime = test(idx, foo3);
printf("%.2f ", dtime);
printf("NA\n");
}
//foo2.c
#include <x86intrin.h>
void foo_auto(double * restrict a, double * restrict b, int *idx, int n) {
for(int i=0; i<n; i++) b[i] = a[idx[i]];
}
void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n) {
for(int i=0; i<n; i+=4) {
__m128i vidx = _mm_loadu_si128((__m128i*)&idx[i]);
__m256d av = _mm256_i32gather_pd(&a[i], vidx, 8);
_mm256_storeu_pd(&b[i],av);
}
}
#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n) {
for(int i=0; i<n; i+=8) {
__m256i vidx = _mm256_loadu_si256((__m256i*)&idx[i]);
__m512d av = _mm512_i32gather_pd(vidx, &a[i], 8);
_mm512_storeu_pd(&b[i],av);
}
}
#endif
void foo1(double * restrict a, double * restrict b, int *idx, int n) {
for(int i=0; i<n; i++) b[i] = a[0];
}
void foo2(double * restrict a, double * restrict b, int *idx, int n) {
for(int i=0; i<n; i++) b[i] = a[i];
}
void foo3(double * restrict a, double * restrict b, int *idx, int n) {
for(int i=0; i<n; i++) b[i] = a[n-i-1];
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With