Having defined the following helper to deal with CUDA errors:
#include <cstdio>
#include <cstdlib>

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
Normally, if the results array d_results, of type double and size N, can be allocated in GPU memory all at once, we can transfer the data from the device to the host like so:
double *d_results;
HANDLE_ERROR( cudaMalloc(&d_results, N * sizeof(double)) );
// Launch our kernel to do some computations and store the results in d_results
.....
// and transfer our data from the device to the host
std::vector<double> results(N);
cudaMemcpy(results.data(), d_results, N * sizeof(double), cudaMemcpyDeviceToHost);
But what if the second line fails because there is not enough memory to store all the results at once? How can I do the computations and transfer the results to the host properly? Is it mandatory to do the computation in batches? I would rather avoid manual batching. What is the standard approach to this situation in CUDA?
Batching is the best way to go. You can automate most of the batching process if you do something like this:
#include <assert.h>
#include <iostream>

int main()
{
    // Allocate a 4 GiB array on the host
    const size_t N = 1 << 30;
    int * data = new int[N];

    // Allocate as much memory as will fit on the GPU
    size_t total_mem, free_mem;
    cudaMemGetInfo(&free_mem, &total_mem);

    const size_t MB = 1 << 20;
    cudaError_t status;
    int *buffer;
    size_t buffer_size = free_mem;
    for(; buffer_size > MB; buffer_size -= MB) {
        status = cudaMalloc((void **)&buffer, buffer_size);
        if (status == cudaSuccess)
            break;
    }
    std::cout << "Allocated " << buffer_size << " bytes on GPU" << std::endl;

    // Loop through host source data in batches
    std::cout << N << " items require processing" << std::endl;
    size_t batchN = buffer_size / sizeof(int);
    size_t remainN = N;
    int * dp = data;
    std::cout << "Using batch size " << batchN << std::endl;

    for(; remainN > 0; remainN -= batchN) {
        batchN = (remainN < batchN) ? remainN : batchN;
        size_t worksize = batchN * sizeof(int);
        std::cout << "Processing batch of size " << batchN;
        std::cout << "," << remainN << " items remaining" << std::endl;
        cudaMemcpy(buffer, dp, worksize, cudaMemcpyHostToDevice);
        cudaMemset(buffer, 0xff, worksize);
        cudaMemcpy(dp, buffer, worksize, cudaMemcpyDeviceToHost);
        dp += batchN;
    }

    for(size_t i = 0; i < N; i++) {
        assert(data[i] == 0xffffffff);
    }

    cudaDeviceReset();
    return 0;
}
In the above code I have used cudaMemset as a proxy for a real kernel, but it gives you an idea of what is required. If you want to get fancier, you could use two buffers and streams (with registered/pinned host memory) and copy asynchronously to overlap compute and copy, which will improve the overall performance in non-trivial cases.
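For illustration, here is a minimal sketch of what that double-buffered variant could look like, assuming pinned host memory, two device buffers, two streams, and cudaMemsetAsync standing in for the real kernel; the sizes are placeholders and error checking is omitted for brevity:
#include <assert.h>
#include <cuda_runtime.h>

int main()
{
    const size_t N = 1 << 26;           // total items (placeholder size)
    const size_t batchN = 1 << 22;      // items per batch (placeholder size)
    const size_t batchBytes = batchN * sizeof(int);

    // Pinned host memory so cudaMemcpyAsync can truly overlap with compute
    int *data;
    cudaMallocHost(&data, N * sizeof(int));

    // Two device buffers and two streams for copy/compute overlap
    int *buffer[2];
    cudaStream_t stream[2];
    for (int i = 0; i < 2; i++) {
        cudaMalloc(&buffer[i], batchBytes);
        cudaStreamCreate(&stream[i]);
    }

    size_t remainN = N;
    int *dp = data;
    int slot = 0;
    while (remainN > 0) {
        size_t thisN = (remainN < batchN) ? remainN : batchN;
        size_t worksize = thisN * sizeof(int);

        // Queue H2D copy, proxy "kernel" (cudaMemsetAsync), and D2H copy in one
        // stream; the other stream can be copying/computing its batch meanwhile.
        cudaMemcpyAsync(buffer[slot], dp, worksize, cudaMemcpyHostToDevice, stream[slot]);
        cudaMemsetAsync(buffer[slot], 0xff, worksize, stream[slot]);
        cudaMemcpyAsync(dp, buffer[slot], worksize, cudaMemcpyDeviceToHost, stream[slot]);

        dp += thisN;
        remainN -= thisN;
        slot ^= 1;   // alternate between the two buffers/streams
    }
    cudaDeviceSynchronize();

    // Same check as above: every byte was set to 0xff by the proxy "kernel"
    for (size_t i = 0; i < N; i++) {
        assert(data[i] == 0xffffffff);
    }

    for (int i = 0; i < 2; i++) {
        cudaStreamDestroy(stream[i]);
        cudaFree(buffer[i]);
    }
    cudaFreeHost(data);
    cudaDeviceReset();
    return 0;
}
Because operations issued to the same stream execute in order, each buffer is not reused until its previous device-to-host copy has finished, while the batch queued on the other stream can be in flight at the same time.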