I have the following OpenCL kernel:
kernel void ndft(
    global float *re, global float *im, int num_values,
    global float *spectrum_re, global float *spectrum_im,
    global float *spectrum_abs,
    global float *sin_array, global float *cos_array,
    float sqrt_num_values_reciprocal)
{
    // MATH MAGIC - DISREGARD FROM HERE -----------
    float x;
    float y;
    float sum_re = 0;
    float sum_im = 0;
    size_t thread_id = get_global_id(0);
    //size_t local_id = get_local_id(0);

    // num_values = 24 (live environment), 48 (test)
    for (int i = 0; i < num_values; i++)
    {
        x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
        y = sin_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
        sum_re = sum_re + re[i] * x + im[i] * y;
        sum_im = sum_im - re[i] * y + x * im[i];
    }
    // MATH MAGIC DONE ----------------------------

    //spectrum_re[thread_id] = sum_re;
    //spectrum_im[thread_id] = sum_im;
    //spectrum_abs[thread_id] = hypot(sum_re, sum_im);
    float asdf = hypot(sum_re, sum_im); // this is just a dummy calculation
}
Like this, the execution time is about 15 us (work group size = 567, 14 work groups, for a total of 7938 threads).
However, I obviously need to retrieve the results of the operation somehow, which is what the last few (commented-out) lines are for. As soon as I perform even a single one of those memory operations (and it does not matter whether spectrum_X is global, as in the example, or local), the execution time of the kernel increases to ~1.4 to 1.5 ms.
I thought the increase in execution time was some kind of fixed overhead, so I could just accumulate more data to minimize the relative amount of time lost to that effect. But when I double the number of threads (i.e. twice the amount of data), the execution time also doubles (to ~2.8-3.0 ms).
I found out that even if I uncomment only one of those lines, I get the same execution time as if I uncommented all three. Even if I guard the write with if (thread_id == 0) and run it, the execution time stays the same. However, it is just way too slow this way (the upper limit for my application is about 30 us). It even performs about 5 times faster when I run it as ordinary C code on my CPU.
Now I am obviously doing something wrong but I am not sure where to start looking for a solution.
As I commented on talonmies' answer, I also did the following:
From the above code, I made the last 4 lines look like this:
//spectrum_re[thread_id] = sum_re;
//spectrum_im[thread_id] = sum_im;
spectrum_abs[thread_id] = hypot(sum_re, sum_im);
//float asdf = hypot(sum_re, sum_im);
As expected, the execution time is ~1.8 ms. The generated assembler code for my system is:
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Tue Apr 03 12:42:39 2012 (1333449759)
// Driver
//
.version 3.0
.target sm_21, texmode_independent
.address_size 32
.entry ndft(
.param .u32 .ptr .global .align 4 ndft_param_0,
.param .u32 .ptr .global .align 4 ndft_param_1,
.param .u32 ndft_param_2,
.param .u32 .ptr .global .align 4 ndft_param_3,
.param .u32 .ptr .global .align 4 ndft_param_4,
.param .u32 .ptr .global .align 4 ndft_param_5,
.param .u32 .ptr .global .align 4 ndft_param_6,
.param .u32 .ptr .global .align 4 ndft_param_7,
.param .f32 ndft_param_8
)
{
.reg .f32 %f;
.reg .pred %p;
.reg .s32 %r;
ld.param.u32 %r3, [ndft_param_2];
// inline asm
mov.u32 %r18, %envreg3;
// inline asm
// inline asm
mov.u32 %r19, %ntid.x;
// inline asm
// inline asm
mov.u32 %r20, %ctaid.x;
// inline asm
// inline asm
mov.u32 %r21, %tid.x;
// inline asm
add.s32 %r22, %r21, %r18;
mad.lo.s32 %r11, %r20, %r19, %r22;
setp.gt.s32 %p1, %r3, 0;
@%p1 bra BB0_2;
mov.f32 %f46, 0f00000000;
mov.f32 %f45, %f46;
bra.uni BB0_4;
BB0_2:
ld.param.u32 %r38, [ndft_param_2];
mul.lo.s32 %r27, %r38, %r11;
shl.b32 %r28, %r27, 2;
ld.param.u32 %r40, [ndft_param_6];
add.s32 %r12, %r40, %r28;
ld.param.u32 %r41, [ndft_param_7];
add.s32 %r13, %r41, %r28;
mov.f32 %f46, 0f00000000;
mov.f32 %f45, %f46;
mov.u32 %r43, 0;
mov.u32 %r42, %r43;
BB0_3:
add.s32 %r29, %r13, %r42;
ld.global.f32 %f18, [%r29];
ld.param.f32 %f44, [ndft_param_8];
mul.f32 %f19, %f18, %f44;
add.s32 %r30, %r12, %r42;
ld.global.f32 %f20, [%r30];
mul.f32 %f21, %f20, %f44;
ld.param.u32 %r35, [ndft_param_0];
add.s32 %r31, %r35, %r42;
ld.global.f32 %f22, [%r31];
fma.rn.f32 %f23, %f22, %f19, %f46;
ld.param.u32 %r36, [ndft_param_1];
add.s32 %r32, %r36, %r42;
ld.global.f32 %f24, [%r32];
fma.rn.f32 %f46, %f24, %f21, %f23;
neg.f32 %f25, %f22;
fma.rn.f32 %f26, %f25, %f21, %f45;
fma.rn.f32 %f45, %f24, %f19, %f26;
add.s32 %r42, %r42, 4;
add.s32 %r43, %r43, 1;
ld.param.u32 %r37, [ndft_param_2];
setp.lt.s32 %p2, %r43, %r37;
@%p2 bra BB0_3;
BB0_4:
// inline asm
abs.f32 %f27, %f46;
// inline asm
// inline asm
abs.f32 %f29, %f45;
// inline asm
setp.gt.f32 %p3, %f27, %f29;
selp.f32 %f8, %f29, %f27, %p3;
selp.f32 %f32, %f27, %f29, %p3;
// inline asm
abs.f32 %f31, %f32;
// inline asm
setp.gt.f32 %p4, %f31, 0f7E800000;
mov.f32 %f47, %f32;
@%p4 bra BB0_6;
mov.f32 %f48, %f8;
bra.uni BB0_7;
BB0_6:
mov.f32 %f33, 0f3E800000;
mul.rn.f32 %f10, %f8, %f33;
mul.rn.f32 %f47, %f32, %f33;
mov.f32 %f48, %f10;
BB0_7:
mov.f32 %f13, %f48;
// inline asm
div.approx.f32 %f34, %f13, %f47;
// inline asm
mul.rn.f32 %f39, %f34, %f34;
add.f32 %f38, %f39, 0f3F800000;
// inline asm
sqrt.approx.f32 %f37, %f38; // <-- this is part of hypot()
// inline asm
mul.rn.f32 %f40, %f32, %f37;
add.f32 %f41, %f32, %f8;
setp.eq.f32 %p5, %f32, 0f00000000;
selp.f32 %f42, %f41, %f40, %p5;
setp.eq.f32 %p6, %f32, 0f7F800000;
setp.eq.f32 %p7, %f8, 0f7F800000;
or.pred %p8, %p6, %p7;
selp.f32 %f43, 0f7F800000, %f42, %p8;
shl.b32 %r33, %r11, 2;
ld.param.u32 %r39, [ndft_param_5];
add.s32 %r34, %r39, %r33;
st.global.f32 [%r34], %f43; // <-- stores the hypot's result in spectrum_abs
ret;
}
Indeed, all my calculation operations are there: lots of adds/mults as well as a sqrt for the hypot function. From the above asm code, I removed the second-to-last line:
st.global.f32 [%r34], %f43;
which is the line that actually stores the data in the global array spectrum_abs. Then I used clCreateProgramWithBinary with the modified asm code file as input. Execution time went down to 20 us.
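The binary-loading step is essentially a call to clCreateProgramWithBinary followed by clBuildProgram. A minimal sketch of that step (the helper name, the file path and the plain <CL/cl.h> header are illustrative only, and error handling is omitted):

#include <fstream>
#include <iterator>
#include <string>
#include <CL/cl.h>   // assumption: standard Khronos header; adjust for your platform

// Read a hand-edited PTX file and build it as an OpenCL program.
// On the NVIDIA platform, clCreateProgramWithBinary accepts the PTX text
// as an opaque device "binary".
cl_program load_ptx_program(cl_context context, cl_device_id device_id,
                            const char *path, cl_int *err)
{
    std::ifstream file(path, std::ios::binary);
    std::string ptx((std::istreambuf_iterator<char>(file)),
                    std::istreambuf_iterator<char>());

    const unsigned char *binary = reinterpret_cast<const unsigned char *>(ptx.data());
    size_t binary_size = ptx.size();
    cl_int binary_status;

    cl_program program = clCreateProgramWithBinary(
        context, 1, &device_id, &binary_size, &binary, &binary_status, err);
    *err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    return program;
}

The kernel is then created and enqueued exactly as before.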
I would guess you are seeing the effects of compiler optimization.
The NVIDIA compiler is very aggressive at eliminating "dead code" which does not directly participate in a write to global memory. So in your kernel, if you don't write sum_re or sum_im, the compiler will optimize away the whole computation loop (and probably everything else) and leave you with an empty kernel containing nothing more than a no-op. The 15 microsecond execution time you are seeing is mostly just kernel launch overhead and not much else. When you uncomment a global memory write, the compiler leaves all the computation code in place, and you see the true execution time of your code.
So the real question you should probably be asking is how to optimize that kernel to reduce its execution time from the 1.5 milliseconds it currently takes towards your (very ambitious) 30 microsecond target.
Despite the skepticism expressed in response to the original answer, here is a complete repro case which supports the assertion that this is a compiler-related effect:
#include <iostream>
#include <OpenCL/opencl.h>
size_t source_size;
const char * source_str =
"kernel void ndft( \n" \
" global float *re, global float *im, int num_values, \n" \
" global float *spectrum_re, global float *spectrum_im, \n" \
" global float *spectrum_abs, \n" \
" global float *sin_array, global float *cos_array, \n" \
" float sqrt_num_values_reciprocal) \n" \
"{ \n" \
" // MATH MAGIC - DISREGARD FROM HERE ----------- \n" \
" \n" \
" float x; \n" \
" float y; \n" \
" float sum_re = 0; \n" \
" float sum_im = 0; \n" \
" \n" \
" size_t thread_id = get_global_id(0); \n" \
" \n" \
" for (int i = 0; i < num_values; i++) \n" \
" { \n" \
" x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal; \n" \
" y = sin_array[thread_id * num_values + i] * sqrt_num_values_reciprocal; \n" \
" sum_re += re[i] * x + im[i] * y; \n" \
" sum_im -= re[i] * y + x * im[i]; \n" \
" } \n" \
" \n" \
" // MATH MAGIC DONE ---------------------------- \n" \
" \n" \
" //spectrum_re[thread_id] = sum_re; \n" \
" //spectrum_im[thread_id] = sum_im; \n" \
" //spectrum_abs[thread_id] = hypot(sum_re, sum_im); \n" \
"} \n";
int main(void)
{
    int err;
    cl_device_id device_id;
    clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);

    cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &err);
    err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    cl_uint program_num_devices;
    clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &program_num_devices, NULL);

    size_t * binaries_sizes = new size_t[program_num_devices];
    clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, program_num_devices*sizeof(size_t), binaries_sizes, NULL);

    char **binaries = new char*[program_num_devices];
    for (size_t i = 0; i < program_num_devices; i++)
        binaries[i] = new char[binaries_sizes[i]+1];

    clGetProgramInfo(program, CL_PROGRAM_BINARIES, program_num_devices*sizeof(size_t), binaries, NULL);

    for (size_t i = 0; i < program_num_devices; i++)
    {
        binaries[i][binaries_sizes[i]] = '\0';
        std::cout << "Program " << i << ":" << std::endl;
        std::cout << binaries[i];
    }

    return 0;
}
When compiled and run, it emits the following PTX code from the OpenCL runtime:
Program 0:
bplist00?^clBinaryDriver\clBinaryData_clBinaryVersionWCLH 1.0O!.version 1.5
.target sm_12
.target texmode_independent
.reg .b32 r<126>; /* define r0..125 */
.reg .b64 x<126>; /* define r0..125 */
.reg .b32 f<128>; /* define f0..127 */
.reg .pred p<32>; /* define p0..31 */
.reg .u32 sp;
.reg .b8 wb0,wb1,wb2,wb3; /* 8-bit write buffer */
.reg .b16 ws0,ws1,ws2,ws3; /* 16-bit write buffer */
.reg .b32 tb0,tb1,tb2,tb3; /* read tex buffer */
.reg .b64 vl0,vl1; /* 64-bit vector buffer */
.reg .b16 cvt16_0,cvt16_1; /* tmps for conversions */
.const .align 1 .b8 ndft_gid_base[52];
.local .align 16 .b8 ndft_stack[8];
.entry ndft(
.param.b32 ndft_0 /* re */,
.param.b32 ndft_1 /* im */,
.param.b32 ndft_2 /* num_values */,
.param.b32 ndft_3 /* spectrum_re */,
.param.b32 ndft_4 /* spectrum_im */,
.param.b32 ndft_5 /* spectrum_abs */,
.param.b32 ndft_6 /* sin_array */,
.param.b32 ndft_7 /* cos_array */,
.param.f32 ndft_8 /* sqrt_num_values_reciprocal */
) {
mov.u32 sp, ndft_stack;
mov.u32 r0, 4294967295;
ld.param.u32 r1, [ndft_2 + 0];
LBB1_1:
add.u32 r0, r0, 1;
setp.lt.s32 p0, r0, r1;
@p0 bra LBB1_1;
LBB1_2:
ret;
}
i.e. a kernel stub containing none of the computation loop. When the three global memory writes in the last three lines of the kernel are uncommented, it emits this:
Program 0:
S.version 1.5inaryDriver\clBinaryData_clBinaryVersionWCLH 1.0O
.target sm_12
.target texmode_independent
.reg .b32 r<126>; /* define r0..125 */
.reg .b64 x<126>; /* define r0..125 */
.reg .b32 f<128>; /* define f0..127 */
.reg .pred p<32>; /* define p0..31 */
.reg .u32 sp;
.reg .b8 wb0,wb1,wb2,wb3; /* 8-bit write buffer */
.reg .b16 ws0,ws1,ws2,ws3; /* 16-bit write buffer */
.reg .b32 tb0,tb1,tb2,tb3; /* read tex buffer */
.reg .b64 vl0,vl1; /* 64-bit vector buffer */
.reg .b16 cvt16_0,cvt16_1; /* tmps for conversions */
.const .align 1 .b8 ndft_gid_base[52];
.local .align 16 .b8 ndft_stack[8];
.entry ndft(
.param.b32 ndft_0 /* re */,
.param.b32 ndft_1 /* im */,
.param.b32 ndft_2 /* num_values */,
.param.b32 ndft_3 /* spectrum_re */,
.param.b32 ndft_4 /* spectrum_im */,
.param.b32 ndft_5 /* spectrum_abs */,
.param.b32 ndft_6 /* sin_array */,
.param.b32 ndft_7 /* cos_array */,
.param.f32 ndft_8 /* sqrt_num_values_reciprocal */
) {
mov.u32 sp, ndft_stack;
cvt.u32.u16 r0, %tid.x;
cvt.u32.u16 r1, %ntid.x;
cvt.u32.u16 r2, %ctaid.x;
mad24.lo.u32 r0, r2, r1, r0;
mov.u32 r1, 0;
shl.b32 r2, r1, 2;
mov.u32 r3, ndft_gid_base;
add.u32 r2, r2, r3;
ld.const.u32 r2, [r2 + 40];
add.u32 r0, r0, r2;
ld.param.u32 r2, [ndft_2 + 0];
mul.lo.u32 r3, r0, r2;
shl.b32 r3, r3, 2;
mov.f32 f0, 0f00000000 /* 0.000000e+00 */;
ld.param.f32 f1, [ndft_8 + 0];
ld.param.u32 r4, [ndft_7 + 0];
ld.param.u32 r5, [ndft_6 + 0];
ld.param.u32 r6, [ndft_5 + 0];
ld.param.u32 r7, [ndft_4 + 0];
ld.param.u32 r8, [ndft_3 + 0];
ld.param.u32 r9, [ndft_1 + 0];
ld.param.u32 r10, [ndft_0 + 0];
mov.u32 r11, r1;
mov.f32 f2, f0;
LBB1_1:
setp.ge.s32 p0, r11, r2;
@!p0 bra LBB1_7;
LBB1_2:
shl.b32 r1, r0, 2;
add.u32 r2, r8, r1;
st.global.f32 [r2+0], f0;
add.u32 r1, r7, r1;
st.global.f32 [r1+0], f2;
abs.f32 f1, f2;
abs.f32 f0, f0;
setp.gt.f32 p0, f0, f1;
selp.f32 f2, f0, f1, p0;
abs.f32 f3, f2;
mov.f32 f4, 0f7E800000 /* 8.507059e+37 */;
setp.gt.f32 p1, f3, f4;
selp.f32 f0, f1, f0, p0;
shl.b32 r0, r0, 2;
add.u32 r0, r6, r0;
@!p1 bra LBB1_8;
LBB1_3:
mul.rn.f32 f3, f2, 0f3E800000 /* 2.500000e-01 */;
mul.rn.f32 f1, f0, 0f3E800000 /* 2.500000e-01 */;
LBB1_4:
mov.f32 f4, 0f00000000 /* 0.000000e+00 */;
setp.eq.f32 p0, f2, f4;
@!p0 bra LBB1_9;
LBB1_5:
add.f32 f1, f2, f0;
LBB1_6:
mov.f32 f3, 0f7F800000 /* inf */;
setp.eq.f32 p0, f0, f3;
setp.eq.f32 p1, f2, f3;
or.pred p0, p1, p0;
selp.f32 f0, f3, f1, p0;
st.global.f32 [r0+0], f0;
ret;
LBB1_7:
add.u32 r12, r3, r1;
add.u32 r13, r4, r12;
ld.global.f32 f3, [r13+0];
mul.rn.f32 f3, f3, f1;
add.u32 r13, r9, r1;
ld.global.f32 f4, [r13+0];
mul.rn.f32 f5, f3, f4;
add.u32 r12, r5, r12;
ld.global.f32 f6, [r12+0];
mul.rn.f32 f6, f6, f1;
add.u32 r12, r10, r1;
ld.global.f32 f7, [r12+0];
mul.rn.f32 f8, f7, f6;
add.f32 f5, f8, f5;
sub.f32 f2, f2, f5;
mul.rn.f32 f4, f4, f6;
mul.rn.f32 f3, f7, f3;
add.f32 f3, f3, f4;
add.f32 f0, f0, f3;
add.u32 r11, r11, 1;
add.u32 r1, r1, 4;
bra LBB1_1;
LBB1_8:
mov.f32 f1, f0;
mov.f32 f3, f2;
bra LBB1_4;
LBB1_9:
div.approx.f32 f1, f1, f3;
mul.rn.f32 f1, f1, f1;
add.f32 f1, f1, 0f3F800000 /* 1.000000e+00 */;
sqrt.approx.ftz.f32 f1, f1;
mul.rn.f32 f1, f2, f1;
bra LBB1_6;
}
I think this is pretty irrefutable evidence that it is compiler optimisation which causes the difference in runtime, and that the difference depends only on whether memory writes are included in the kernel code or not.
I guess the final question then becomes why this is so slow (irrespective of the debate about whether it is caused by compiler optimization or not). The 1.5 millisecond runtime you are seeing is a true reflection of the performance of the code, and the real question is why. From my reading of your kernel code, the answer looks to lie in memory access patterns which are pretty horrible for the GPU. Inside the compute loop you have two global memory reads with very large strides, like this one:
x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
According to the comment in your code, num_values is either 24 or 48. That means the memory reads can't possibly coalesce, and the L1 cache on a Fermi GPU isn't going to be much help either. This will have a huge negative impact on memory bandwidth utilisation and make the code very slow. If you are stuck with that input data ordering, then a faster solution would be to use a warp to compute one output (i.e. do a warp-wide reduction to the final sum). This reduces the read stride from 24 or 48 to 1 and coalesces the global memory reads from those two large input arrays.
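Something along these lines (an untested sketch only; the kernel name, the local scratch arguments and the assumption that the work group size is a power of two are mine, not taken from your code):

kernel void ndft_one_output_per_group(
    global const float *re, global const float *im, int num_values,
    global float *spectrum_abs,
    global const float *sin_array, global const float *cos_array,
    float sqrt_num_values_reciprocal,
    local float *scratch_re, local float *scratch_im)
{
    size_t out_idx = get_group_id(0);   // one output element per work group
    size_t lane    = get_local_id(0);
    size_t lsize   = get_local_size(0); // e.g. 32 (one warp); assumed a power of two

    float sum_re = 0.0f;
    float sum_im = 0.0f;

    // Consecutive work items read consecutive addresses -> coalesced loads.
    for (int i = (int)lane; i < num_values; i += (int)lsize)
    {
        float x = cos_array[out_idx * num_values + i] * sqrt_num_values_reciprocal;
        float y = sin_array[out_idx * num_values + i] * sqrt_num_values_reciprocal;
        sum_re += re[i] * x + im[i] * y;
        sum_im += x * im[i] - re[i] * y;
    }

    scratch_re[lane] = sum_re;
    scratch_im[lane] = sum_im;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory to the final sums.
    for (size_t offset = lsize / 2; offset > 0; offset /= 2)
    {
        if (lane < offset)
        {
            scratch_re[lane] += scratch_re[lane + offset];
            scratch_im[lane] += scratch_im[lane + offset];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lane == 0)
        spectrum_abs[out_idx] = hypot(scratch_re[0], scratch_im[0]);
}

You would launch this with a global size of (number of outputs) x (work group size) and pass the two scratch buffers as local allocations of one float per work item via clSetKernelArg.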
Inside the loop there are also repeated fetches from global memory of the same 24 or 48 elements of re and im:
sum_re += re[i] * x + im[i] * y;
sum_im -= re[i] * y + x * im[i];
This is unnecessary, and wastes a lot of global memory bandwidth or cache efficiency (the GPU doesn't have enough registers to let the compiler hold the whole of each array in registers). It would be far better to have each work group read those two arrays into __local memory once and use the local copy inside the compute loop. If you have each work group compute multiple outputs, rather than just one, then you can potentially save a lot of global memory bandwidth and amortise the initial read until it is almost free.
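A sketch of that idea, keeping your original one-work-item-per-output layout (again untested; the kernel name and the local buffer arguments are mine, and the host has to pass local allocations of at least num_values floats each):

kernel void ndft_staged(
    global const float *re, global const float *im, int num_values,
    global float *spectrum_abs,
    global const float *sin_array, global const float *cos_array,
    float sqrt_num_values_reciprocal,
    local float *re_local, local float *im_local)
{
    size_t thread_id = get_global_id(0);
    size_t lane      = get_local_id(0);
    size_t lsize     = get_local_size(0);

    // The whole work group cooperatively copies the two small input
    // vectors to local memory exactly once.
    for (int i = (int)lane; i < num_values; i += (int)lsize)
    {
        re_local[i] = re[i];
        im_local[i] = im[i];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float sum_re = 0.0f;
    float sum_im = 0.0f;

    for (int i = 0; i < num_values; i++)
    {
        float x = cos_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
        float y = sin_array[thread_id * num_values + i] * sqrt_num_values_reciprocal;
        sum_re += re_local[i] * x + im_local[i] * y;
        sum_im += x * im_local[i] - re_local[i] * y;
    }

    spectrum_abs[thread_id] = hypot(sum_re, sum_im);
}

Note that this only removes the redundant re/im traffic; the strided sin/cos reads are still there, so ideally you would combine it with the coalescing approach sketched above.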