I'm trying to launch multiple CUDA FFT kernels asynchronously using streams. To do that, I'm creating my streams and my cuFFT forward and inverse plans as follows:
streams = (cudaStream_t*) malloc(sizeof(cudaStream_t)*streamNum);
plansF = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
plansI = (cufftHandle *) malloc(sizeof(cufftHandle)*streamNum);
for(int i = 0; i < streamNum; i++)
{
    cudaStreamCreate(&streams[i]);
    CHECK_ERROR(5)
    cufftPlan1d(&plansF[i], ticks, CUFFT_R2C, 1);
    CHECK_ERROR(5)
    cufftPlan1d(&plansI[i], ticks, CUFFT_C2R, 1);
    CHECK_ERROR(5)
    cufftSetStream(plansF[i], streams[i]);
    CHECK_ERROR(5)
    cufftSetStream(plansI[i], streams[i]);
    CHECK_ERROR(5)
}
In the main function, I'm launching forward FFTs as follows:
for(w = 1; w < q; w++)
{
    cufftExecR2C(plansF[w], gpuMem1 + k, gpuMem2 + j);
    CHECK_ERROR(8)
    k += rect_small_real;
    j += rect_small_complex;
}
I also have other kernels that I launch asynchronously with the same streams.
When I profile my application using Visual Profiler 5.0, I see that all kernels except the CUDA FFT kernels (both forward and inverse) run in parallel and overlap. The FFT kernels do run in different streams, but they do not overlap; they actually run sequentially. Can anyone tell me what my problem is?
My environment is VS 2008, 64 bit, Windows 7.
Thanks.
This is a worked example of cuFFT execution and memcopies using streams in CUDA on the Kepler architecture.
Here is the code:
#include <stdio.h>
#include <cufft.h>
#define NUM_STREAMS 3
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/********/
/* MAIN */
/********/
int main()
{
const int N = 5000;
// --- Host input data initialization
float2 *h_in1 = new float2[N];
float2 *h_in2 = new float2[N];
float2 *h_in3 = new float2[N];
for (int i = 0; i < N; i++) {
    h_in1[i].x = 1.f;
    h_in1[i].y = 0.f;
    h_in2[i].x = 1.f;
    h_in2[i].y = 0.f;
    h_in3[i].x = 1.f;
    h_in3[i].y = 0.f;
}
// --- Host output data initialization
float2 *h_out1 = new float2[N];
float2 *h_out2 = new float2[N];
float2 *h_out3 = new float2[N];
for (int i = 0; i < N; i++) {
    h_out1[i].x = 0.f;
    h_out1[i].y = 0.f;
    h_out2[i].x = 0.f;
    h_out2[i].y = 0.f;
    h_out3[i].x = 0.f;
    h_out3[i].y = 0.f;
}
// --- Registers host memory as page-locked (required for async cudaMemcpyAsync)
gpuErrchk(cudaHostRegister(h_in1, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in2, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_in3, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out1, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out2, N*sizeof(float2), cudaHostRegisterPortable));
gpuErrchk(cudaHostRegister(h_out3, N*sizeof(float2), cudaHostRegisterPortable));
// --- Device input data allocation
float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2)));
float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2)));
float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2)));
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
// --- Creates CUDA streams
cudaStream_t streams[NUM_STREAMS];
for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i]));
// --- Creates cuFFT plans and assigns them to streams
cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS);
for (int i = 0; i < NUM_STREAMS; i++) {
    cufftPlan1d(&plans[i], N, CUFFT_C2C, 1);
    cufftSetStream(plans[i], streams[i]);
}
// --- Async memcopies and computations
gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, N*sizeof(float2), cudaMemcpyHostToDevice, streams[0]));
gpuErrchk(cudaMemcpyAsync(d_in2, h_in2, N*sizeof(float2), cudaMemcpyHostToDevice, streams[1]));
gpuErrchk(cudaMemcpyAsync(d_in3, h_in3, N*sizeof(float2), cudaMemcpyHostToDevice, streams[2]));
cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD);
cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD);
cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD);
gpuErrchk(cudaMemcpyAsync(h_out1, d_out1, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[0]));
gpuErrchk(cudaMemcpyAsync(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[1]));
gpuErrchk(cudaMemcpyAsync(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost, streams[2]));
for(int i = 0; i < NUM_STREAMS; i++)
    gpuErrchk(cudaStreamSynchronize(streams[i]));
// --- Releases resources
gpuErrchk(cudaHostUnregister(h_in1));
gpuErrchk(cudaHostUnregister(h_in2));
gpuErrchk(cudaHostUnregister(h_in3));
gpuErrchk(cudaHostUnregister(h_out1));
gpuErrchk(cudaHostUnregister(h_out2));
gpuErrchk(cudaHostUnregister(h_out3));
gpuErrchk(cudaFree(d_in1));
gpuErrchk(cudaFree(d_in2));
gpuErrchk(cudaFree(d_in3));
gpuErrchk(cudaFree(d_out1));
gpuErrchk(cudaFree(d_out2));
gpuErrchk(cudaFree(d_out3));
for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i]));
delete[] h_in1;
delete[] h_in2;
delete[] h_in3;
delete[] h_out1;
delete[] h_out2;
delete[] h_out3;
cudaDeviceReset();
return 0;
}
Please add cuFFT error checking according to CUFFT error handling.
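For instance, a minimal sketch of such a check, modeled after the gpuErrchk macro above, could look like this (the cufftErrchk/cufftAssert names are just illustrative, and since cuFFT provides no built-in error-to-string function, the sketch simply prints the numeric cufftResult code):
// Hypothetical cuFFT error-check macro, analogous to gpuErrchk above
#define cufftErrchk(ans) { cufftAssert((ans), __FILE__, __LINE__); }
inline void cufftAssert(cufftResult code, const char *file, int line, bool abort=true)
{
    if (code != CUFFT_SUCCESS)
    {
        fprintf(stderr, "CUFFTassert: error code %d %s %d\n", (int)code, file, line);
        if (abort) exit(code);
    }
}
You would then wrap the cuFFT calls, e.g. cufftErrchk(cufftPlan1d(&plans[i], N, CUFFT_C2C, 1)); and cufftErrchk(cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD));.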
Below is some profiling information from testing the above code on a Kepler K20c card. As you will see, true overlap between computation and memory transfers is achieved only when N is sufficiently large.
[Profiler timelines for N = 5000, N = 50000, and N = 500000]
The problem is in the hardware you use.
All CUDA-capable GPUs can execute a kernel and copy data in both directions concurrently. However, only devices with compute capability 3.5 have the feature named Hyper-Q.
Briefly, these GPUs implement several (16, I suppose) hardware kernel queues. On previous GPUs, only one hardware queue is available.
This means that CUDA streams are only virtual, and on older hardware they are useful only for overlapping computation with memory copies. Of course, this applies not only to cuFFT but to your own kernels as well!
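For example, here is a minimal, self-contained sketch (not the poster's code; dummyKernel and the buffer names are placeholders) of the breadth-first issue order that still allows copy/compute overlap on GPUs with a single hardware kernel queue:
// Minimal sketch: breadth-first issuing across streams so that H2D copies,
// kernel launches, and D2H copies from different streams can overlap,
// even on pre-Hyper-Q GPUs with a single hardware kernel queue.
// dummyKernel and the buffer names are placeholders, not code from this question.
#include <cuda_runtime.h>
#define NUM_STREAMS 3
__global__ void dummyKernel(float *d, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) d[i] *= 2.0f;
}
int main()
{
    const int N = 1 << 20;
    float *h[NUM_STREAMS], *d[NUM_STREAMS];
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaMallocHost((void**)&h[i], N * sizeof(float));   // pinned host memory, required for async copies
        cudaMalloc((void**)&d[i], N * sizeof(float));
        cudaStreamCreate(&streams[i]);
    }
    // All host-to-device copies first ...
    for (int i = 0; i < NUM_STREAMS; i++)
        cudaMemcpyAsync(d[i], h[i], N * sizeof(float), cudaMemcpyHostToDevice, streams[i]);
    // ... then all kernel launches ...
    for (int i = 0; i < NUM_STREAMS; i++)
        dummyKernel<<<(N + 255) / 256, 256, 0, streams[i]>>>(d[i], N);
    // ... then all device-to-host copies.
    for (int i = 0; i < NUM_STREAMS; i++)
        cudaMemcpyAsync(h[i], d[i], N * sizeof(float), cudaMemcpyDeviceToHost, streams[i]);
    cudaDeviceSynchronize();
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaFreeHost(h[i]);
        cudaFree(d[i]);
        cudaStreamDestroy(streams[i]);
    }
    return 0;
}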
Please look carefully at the output of Visual Profiler. You may be tempted to read the timeline visualization as exact GPU execution data, but it is not that simple. Some rows show data that refers to the point in time at which the kernel launch call was executed on the host (usually the orange ones), while the actual execution of the corresponding kernel on the GPU is shown as blue rectangles. The same goes for memory transfers (the exact time is shown as light brown rectangles).
I hope this helps you solve your problem.