I'm learning how to optimize code for GPUs. I read about the importance of memory locality, and I've also seen tutorials and examples of GPU convolution. Based on that I wrote and tested several kernels of my own. Surprisingly, I found that the simplest naive kernel is the fastest, and it is just <10x faster than the CPU (yes, I amortized upload/download time by running the kernel 64x).
What am I doing wrong? I would expect convolution to be exactly the kind of operation GPUs are optimized for. If I can get a 100x speed-up on matrix multiplication, why is convolution so slow?
Performance [CPU ticks/pixel] (lower is better). EDIT: I added GPU-scanline_async later, after reading the advice about async_work_group_copy.
I wonder about two things: why is the naive kernel the fastest of my GPU kernels, and why is the overall speed-up over the CPU so small (<10x)?
The test was done on my laptop with an Intel Core i7-6700HQ (Skylake) CPU and an NVIDIA GeForce GTX 960M GPU, running the kernels 64x per frame on a floating-point array of 256x256 pixels. The full code can be seen here.
=========== Kernel codes ===========
kernel GPU-Naive 2D global=(256,256) local=(16,16)
__kernel void blur2D_naive(
__global float* I,
__global float* O
){
const int ix = get_global_id (0)+1;
const int iy = get_global_id (1)+1;
const int nx = get_global_size(0)+2;
int i = iy * nx + ix;
// 1.6 ticks/pixel
O[i] =( I[i-nx-1] + I[i-nx] + I[i-nx+1] +
I[i -1] + I[i ] + I[i +1] +
I[i+nx-1] + I[i+nx] + I[i+nx+1] ) * 0.11111111111;
// modified with gaussian mask 4.9 ticks/pixel
//O[i] =( 0.0625*I[i-nx-1] + 0.125*I[i-nx] + 0.0625*I[i-nx+1] +
// 0.125 *I[i -1] + 0.25 *I[i ] + 0.125 *I[i +1] +
// 0.0625*I[i+nx-1] + 0.125*I[i+nx] + 0.0625*I[i+nx+1] );
}
kernel GPU-local 2D global=(256,256) local=(16,16)
#define NBx 18 // tile size including borders [halo] 16+2
#define NBy 18
// seems to be slower than naive method
__kernel void blur2D_local(
__global float* I,
__global float* O
){
__local float L[NBx*NBy];
const int2 iG = (int2)(get_global_id (0)+1 , get_global_id (1)+1 );
const int2 nG = (int2)(get_global_size(0)+2 , get_global_size(1)+2 );
const int2 iL = (int2)(get_local_id (0)+1 , get_local_id (1)+1 );
const int2 nL = (int2)(get_local_size (0)+2 , get_local_size (1)+2 );
const int2 iGR = (int2)(get_group_id (0) , get_group_id (1) );
// copy boundary pixels to local memory
switch( get_local_id(1) ){ // some threads copy one extra boundary (halo) pixel
case 4:
switch( get_local_id(0) ){ // copy corner points
case 0: L[ 0 ] = I[ nG.x* get_group_id(1)*get_local_size(1) + get_group_id(0)*get_local_size(0) ]; break; // upper-left
case 1: L[ NBx-1 ] = I[ nG.x* get_group_id(1)*get_local_size(1) + get_group_id(0)*get_local_size(0)+(NBx-1) ]; break; // upper-right
case 2: L[ (NBy-1)*NBx ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1)) + get_group_id(0)*get_local_size(0) ]; break; // lower-left
case 3: L[ NBy* NBx-1 ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1)) + get_group_id(0)*get_local_size(0)+(NBx-1) ]; break; // lower-right
} break; // corners done; do not fall through to the border-line cases
// copy border lines
case 0: L[ iL.x ] = I[ nG.x* get_group_id(1)*get_local_size(1) + iG.x ]; break; // top line
case 1: L[ NBx*(NBy-1) + iL.x ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+(NBy-1) ) + iG.x ]; break; // bottom line
case 2: L[ NBx*iL.x ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+get_local_id(0) ) + get_group_id(0)*get_local_size(0) ]; break; // left line
case 3: L[ NBx*iL.x + (NBx-1) ] = I[ nG.x*(get_group_id(1)*get_local_size(1)+get_local_id(0) ) + (get_group_id(0)*get_local_size(0)+(NBx-1)) ]; break; // right line
} // each thread copied at most 1 border pixel
int ig = iG.y*nG.x + iG.x;
int il = iL.y*nL.x + iL.x;
L[il] = I[ig]; // each thread copies its own pixel to local memory
barrier(CLK_LOCAL_MEM_FENCE);
const float renorm = 1.0/9.0;
O[ig] =( L[il-NBx-1] + L[il-NBx] + L[il-NBx+1] +
L[il -1] + L[il ] + L[il +1] +
L[il+NBx-1] + L[il+NBx] + L[il+NBx+1] ) * renorm;
}
kernel GPU-local_async 2D global=(256,16) local=(16,16)
#define nTiles 16
#define NBx 18
#define NBy 18
#define copy_tile(event,ig0,I,L) { int ig_=ig0; int il_=0; for(int i=0; i<NBy; i++){ event = async_work_group_copy( L+il_, I+ig_, NBx, event ); ig_+=nx; il_+=NBx; } }
// https://streamcomputing.eu/blog/2014-06-19/using-async_work_group_copy-on-2d-data/
__kernel void blur2D_local_async(
__global float* I,
__global float* O
){
const int nx = get_global_size(0)+2;
__local float LI[NBx*NBy*2];
int iL0 = 0;
int iL1 = NBx*NBy;
event_t event = 0;
int ig0 = get_group_id(0)*get_local_size(0);
copy_tile(event,ig0,I,LI);
for( int it=0; it<nTiles; it++ ){
int ig = ig0 + (get_local_id(1)+1)*nx + get_local_id(0)+1;
int il = (get_local_id(1)+1)*NBx + get_local_id(0) + iL0;
ig0 += get_local_size(1)*nx;
event_t event_ = 0;
copy_tile(event_,ig0,I,LI+iL1);
wait_group_events(1, &event);
//barrier(CLK_LOCAL_MEM_FENCE);
O[ig] =( LI[il-NBx] + LI[il-NBx+1] + LI[il-NBx+2] +
LI[il ] + LI[il +1] + LI[il +2] +
LI[il+NBx] + LI[il+NBx+1] + LI[il+NBx+2] ) * 0.11111111111;
int iLtmp=iL0; iL0=iL1; iL1=iLtmp; // swap the two halves of the double buffer
event = event_;
barrier(CLK_LOCAL_MEM_FENCE); // make sure all threads finished reading this tile before the next copy overwrites it
}
}
kernel GPU-scanline_private 1D global=(256) local=(32)
__kernel void blur2D_scanline_priv(
int nx, int ny,
__global float* I,
__global float* O
){
int ig = get_global_id(0)+1;
float3 Lm = (float3)( I[ig-1], I[ig], I[ig+1] ); ig += nx;
float3 L0 = (float3)( I[ig-1], I[ig], I[ig+1] );
for(int iy=1; iy<(ny-1); iy++ ){
ig += nx;
float3 Lp= (float3)( I[ig-1], I[ig], I[ig+1] );
O[ig-nx] =
( Lm.x + Lm.y + Lm.z +
L0.x + L0.y + L0.z +
Lp.x + Lp.y + Lp.z ) * 0.11111111111;
Lm=L0; L0=Lp;
}
}
kernel GPU-scanline_async 1D global=(256) local=(32)
#define NB 34
__kernel void blur2D_scanline_async(
int nx, int ny,
__global float* I,
__global float* O
){
__local float L[NB*4];
int i0=0;
int i1=NB;
int i2=NB*2;
int i3=NB*3;
event_t event = 0;
int ig0 = get_group_id(0)*get_local_size(0);
event = async_work_group_copy( L , I+ig0, NB, event ); ig0 += nx;
event = async_work_group_copy( L+NB , I+ig0, NB, event ); ig0 += nx;
event = async_work_group_copy( L+NB*2, I+ig0, NB, event ); ig0 += nx;
const int il = get_local_id(0);
int ig = get_global_id(0)+1;
for(int iy=1; iy<(ny-2); iy++ ){
wait_group_events(1, &event);
event = async_work_group_copy( L+i3, I+ig0, NB, event ); ig0 += nx;
ig += nx;
O[ig] =
( L[i0+il] + L[i0+il+1] + L[i0+il+2] +
L[i1+il] + L[i1+il+1] + L[i1+il+2] +
L[i2+il] + L[i2+il+1] + L[i2+il+2] ) * 0.11111111111;
int itmp=i0; i0=i1; i1=i2; i2=i3; i3=itmp; // rotate the 4 row buffers
barrier(CLK_LOCAL_MEM_FENCE); // make sure all threads finished reading the oldest row before it is overwritten
}
}
kernel CPU-naive
void blur(int nx, int ny, float * I, float * O ){
float renorm = 1.0/9.0;
for(int iy=1;iy<ny-1;iy++){ for(int ix=1;ix<nx-1;ix++){
int i = iy*nx+ix;
O[i] =( I[i-nx-1] + I[i-nx] + I[i-nx+1] +
I[i -1] + I[i ] + I[i +1] +
I[i+nx-1] + I[i+nx] + I[i+nx+1] ) * renorm;
} }
}
In matrix multiplication, each sub-matrix (tile) is re-used against every tile in the corresponding row or column of the other matrix. With 2x2 tiles and a 20x20 main matrix, each tile is used 10 times. GPUs generally use 16x16 or 32x32 tiles, which means that for a 2k x 2k multiplication each 16x16 tile is re-used at least 2048/16 = 128 times:
MM reuse = 128
Add on top of that the re-use inside each sub-matrix x sub-matrix multiplication, and it is enough to push the GPU to its limits.
In a 3x3 convolution, the 3x3 patch is not re-used across a whole scanline or the whole picture; only its pixels are re-used by neighbouring stencils:
3x3 stencil: each pixel is re-used by the 8 neighbouring stencils.
5x5 stencil: each pixel is re-used by the 24 neighbouring stencils.
To catch up with matrix multiplication it would need an
11x11 stencil to have a reuse of 120
which is even more local than matrix multiplication and should reach more GFLOPS, but it does not do equal amounts of multiplications and additions.
It is doing 9 additions + 1 multiplication.
The 8 potential multiplications that could be fused with those additions are lost, so nearly half of the GFLOPS limit is lost.
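In numbers (assuming the usual convention that peak GFLOPS figures count a fused multiply-add as 2 FLOPs):
per pixel, FMA-capable hardware could retire: 10 instructions x 2 FLOPs = 20 FLOPs
per pixel, the box blur actually does:         9 adds + 1 mul           = 10 FLOPs
so even a perfectly compute-bound box blur tops out at roughly 50% of the advertised GFLOPS.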
You should try async workgroup copies.
Matrix multiplication (with 16x16 sub-matrices) vs convolution (17x17 brush size):
Matrix: L2 re-use ratio increases with the main matrix size; L1 re-use ratio increases with the sub-matrix size
Matrix: 16*16*16 multiplications + 16*16*16 additions per workgroup for each tile pair
Matrix: uniform thread usage, no if-else, all local memory is re-used
Matrix: increasing the patch size increases the re-use inside the sub-matrix multiplications at a cubic rate (but decreases L2 re-use because there are fewer patches per line, so the total re-use grows roughly at a square rate)
Matrix: local memory must hold at least 2x the tile area (one tile from each operand of the sub mat-mat mul)
Matrix: each thread can do 4x4 sub-sub-multiplications in private memory (which re-uses each element 4 times), i.e. 4x4 registers = 64 adds + 64 muls
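For reference, here is a minimal sketch of the tiled matrix-multiplication pattern those points describe; it is not taken from the question's code. Assumptions: 16x16 tiles, square matrices of size n divisible by 16, launched with global=(n,n), local=(16,16).
#define TS 16
__kernel void matmul_tiled(
    int n,
    __global const float* A,
    __global const float* B,
    __global       float* C
){
    __local float Asub[TS][TS];
    __local float Bsub[TS][TS];
    const int lx = get_local_id(0);   // column inside the tile
    const int ly = get_local_id(1);   // row    inside the tile
    const int gx = get_global_id(0);  // column of C
    const int gy = get_global_id(1);  // row    of C
    float acc = 0.0f;
    for( int t=0; t<n; t+=TS ){
        // each thread loads one element of the A tile and one of the B tile ...
        Asub[ly][lx] = A[ gy*n + (t+lx) ];
        Bsub[ly][lx] = B[ (t+ly)*n + gx ];
        barrier(CLK_LOCAL_MEM_FENCE);
        // ... and every loaded element is then re-used TS times from local memory
        for( int k=0; k<TS; k++ ){ acc += Asub[ly][k] * Bsub[k][lx]; }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    C[ gy*n + gx ] = acc;  // 16x16 threads x 16 FMAs per tile pair = 16*16*16 muls + adds
}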
Having an addition-heavy kernel leaves room for another, multiplication-heavy kernel to work concurrently or asynchronously within the same kernel. If you are using this for image processing, maybe you can add some "blend" or "resize" kernels so they work together?
The scanline version loads 3 elements, does 9 adds + 1 mul, then repeats. The loaded elements stay for 3 iterations, which means they are re-used only 3 times, and their neighbours (in the x or y direction) may not fall into a neighbouring thread or even a neighbouring workgroup. Also, 3 loads versus 1 store is unbalanced: if memory bandwidth is 100 GB/s, it would use something like 50 GB/s for loads and 15 GB/s for stores, unless they come from L1.
You can decrease the add/mul imbalance by using an accumulator:
store = (accumulator) * 0.1111111
accumulator += new vector  // 3 adds
accumulator -= old vector  // 3 adds
so now it is 6 adds + 1 mul, which is more balanced: a 1 TFLOPS GPU would then have about 500 GFLOPS for the adds and 90 GFLOPS for the muls.
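As a rough, untested sketch of that idea, applied to the blur2D_scanline_priv kernel from the question (the kernel name is just illustrative):
__kernel void blur2D_scanline_accum(   // same launch configuration as blur2D_scanline_priv
    int nx, int ny,
    __global float* I,
    __global float* O
){
    int ig = get_global_id(0)+1;
    float3 Lm = (float3)( I[ig-1], I[ig], I[ig+1] );  ig += nx;   // row iy-1
    float3 L0 = (float3)( I[ig-1], I[ig], I[ig+1] );              // row iy
    float acc = Lm.x + Lm.y + Lm.z + L0.x + L0.y + L0.z;          // partial 3x3 window sum
    for(int iy=1; iy<(ny-1); iy++ ){
        ig += nx;
        float3 Lp = (float3)( I[ig-1], I[ig], I[ig+1] );          // incoming row iy+1
        acc      += Lp.x + Lp.y + Lp.z;                           // 3 adds: take in the new row
        O[ig-nx]  = acc * 0.11111111111f;                         // 1 mul
        acc      -= Lm.x + Lm.y + Lm.z;                           // 3 adds: drop the old row
        Lm = L0;  L0 = Lp;
    }
    // note: the running add/subtract accumulates rounding error over long columns
}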
The naive version doesn't use local memory, leaving more room for wavefronts in flight. The local-memory version actually breaks the L1 access pattern and lets fewer wavefronts stay in flight, which reduces VALU occupancy.
You can decrease local memory usage by doing the scanline at the workgroup level instead of the thread level. What I mean is something like:
load a row from memory:                   x x x x x x x x x x
do a 1-D scanline on it (left to right):  a b c d e f g h i j
now use it for the scanline at workgroup level (top to bottom):
    a c c u m u l a t o r   (+ new row)
    z x z x z x z x z x     (- old row)
calculate the frontline 1-D scanline: 30 additions for each new row
calculate the wide-vector 2-D scanline: 30*30 additions
each pixel gets 1 value added instead of 3
storing: 16x16 multiplications
much less local memory used, more balanced (~8 adds per 1 mul)
This has a 1-D scanline that is single-threaded for N cycles, or a multi-threaded reduction for log N cycles (given enough threads in a compute unit).
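One possible (untested) reading of this suggestion, specialized back to the 3x3 blur; the kernel name, the 1-D work-group layout and all identifiers are only illustrative. Each work-group sweeps a 16-column strip from top to bottom: every input row is loaded once into local memory, turned into horizontal 3-wide sums (the 1-D scanline), and each thread then combines the last three row sums of its column (the top-to-bottom pass).
// launch: 1D, global=(nx-2), local=(16), with (nx-2) divisible by 16
__kernel void blur2D_scanline_wg(
    int nx, int ny,
    __global const float* I,
    __global       float* O
){
    __local float row[16+2];                               // one input row of this strip, incl. halo
    const int lx = get_local_id(0);                        // 0..15
    const int x0 = get_group_id(0)*get_local_size(0);      // first (halo) column of the strip
    const int gx = x0 + lx + 1;                            // output column of this thread
    float hM = 0.0f, h0 = 0.0f;                            // horizontal sums of rows iy-1 and iy
    for(int iy=0; iy<ny; iy++){
        row[lx] = I[ iy*nx + x0 + lx ];                    // cooperative load: 18 pixels by 16 threads
        if( lx < 2 ) row[16+lx] = I[ iy*nx + x0 + 16 + lx ];
        barrier(CLK_LOCAL_MEM_FENCE);
        float hP = row[lx] + row[lx+1] + row[lx+2];        // 1-D scanline of the new row: 2 adds
        if( iy >= 2 ) O[ (iy-1)*nx + gx ] = (hM + h0 + hP) * 0.11111111111f;  // 2 adds + 1 mul
        hM = h0;  h0 = hP;
        barrier(CLK_LOCAL_MEM_FENCE);                      // row[] is overwritten in the next iteration
    }
}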