I am creating a test program that will create a device and a host array of size n and then launch a kernel that creates n threads which assign the constant value 0.95f to each location in the device array. After completion, the device array is copied to the host array and all entries are totaled and a final total is displayed.
The program below seems to work fine for array sizes up to around 60 million floats and returns the correct results very quickly, but upon reaching 70 million the program seems to hang for a while and eventually returns a NAN result for the total. Inspecting the host array after a 60 million run shows it is correctly populated with 0.95f, but inspecting it after a 70 million run shows it is populated with NAN. As far as I am aware none of the CUDA calls return errors.
I am using a 2GB GT640m (Compute 3.0), giving me a max block size of 1024 and a max grid dimension of 2147483647.
I am sure there are better ways of achieving something similar, and I would like to hear suggestions. But I would also like to understand what has gone wrong here so I can learn from it.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <fstream>
// Report a failed CUDA runtime call with a human-readable description.
// status: return code from any CUDA runtime API call.
// Note: this does not abort the program, so later calls may still fail
// with the same "sticky" error left behind by an earlier failure.
void cudaErrorHandler(cudaError_t status)
{
    // Cuda call returned an error; say WHICH error, not just that one occurred
    if(status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }
}
// Write the constant 0.95f into every element of _Results covered by this
// launch. Expected layout: a 1D grid of 2D (blockDim.x x blockDim.y) blocks.
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // Flatten (blockIdx.x, threadIdx.y, threadIdx.x) into one global index
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;
    const unsigned int localId  = threadIdx.y * blockDim.x + threadIdx.x;
    const unsigned int globalId = blockIdx.x * threadsPerBlock + localId;
    // Threads past the end of the array do nothing
    if(globalId < _TotalCombinations)
    {
        _Results[globalId] = 0.95f;
    }
}
#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `n / BLOCK_SIZE` would otherwise expand to `n / 32 * 32`).
#define BLOCK_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y) // Static block size of 32*32 (1024)
#define CUDA_CALL(x) cudaErrorHandler(x)
// Fill a device array of totalCombinations floats with 0.95f on the GPU,
// copy it back, and print the sum (expected: totalCombinations * 0.95).
int main()
{
    // The number of simulations to run
    unsigned int totalCombinations = 45000000;
    // Ceiling division: how many BLOCK_SIZE-thread blocks cover all combinations
    // (replaces the original O(n) counting loop)
    int gridsize = (int)((totalCombinations + (BLOCK_SIZE) - 1) / (BLOCK_SIZE));
    // Allocate and zero host memory
    float* host_results = new float[totalCombinations];
    memset(host_results, 0, sizeof(float) * totalCombinations);
    float* dev_results = 0;
    // Check device selection too -- this was previously unchecked
    CUDA_CALL(cudaSetDevice(0));
    // Allocate device memory
    CUDA_CALL(cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float)));
    // 2D thread block; the grid itself is 1D (gridsize blocks)
    dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    // Launch kernel
    addKernel<<<gridsize, block>>>(dev_results, totalCombinations);
    // Kernel launches return no status directly: launch-configuration errors
    // (such as a grid dimension beyond the device limit) only surface here
    CUDA_CALL(cudaGetLastError());
    // Wait for completion and surface asynchronous execution errors
    CUDA_CALL(cudaDeviceSynchronize());
    // Copy device data back to host
    CUDA_CALL(cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost));
    double total = 0.0;
    // Total the results in the host array (double accumulator avoids
    // float precision loss over tens of millions of additions)
    for(unsigned int i = 0; i < totalCombinations; i++)
        total += host_results[i];
    // Print results to screen
    printf("Total %f\n", total);
    // Release device and host memory (device buffer was previously leaked)
    CUDA_CALL(cudaFree(dev_results));
    delete[] host_results;
    return 0;
}
As you've discovered, your error handling method is not working. Below I have pasted a version of your code with an error checking method that I use frequently. The reason things are not working at your failure point is that your gridsize (you are launching a 1D grid) is exceeding the maximum grid size in the X dimension (65535 by default, i.e., for compute capability up to 2.x). If you want to take advantage of a larger grid dimension (2^31 - 1 is the limit with compute capability 3.0), you need to compile with the -arch=sm_30 switch.
Just for reference here is a version of your code which shows an error-checking method that I use frequently.
#include <stdio.h>
#include <fstream>
// Check-and-abort error macro: reads (and clears) the last CUDA runtime
// error with cudaGetLastError(). Call it after any CUDA API call or kernel
// launch; on failure it prints the caller-supplied msg plus the CUDA error
// string and the file/line of the check, then exits with status 1.
// Wrapped in do { ... } while (0) so it behaves as a single statement
// (safe inside unbraced if/else).
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// One thread per array element: store the constant 0.95f at this thread's
// slot. Assumes a 1D grid of (blockDim.x * blockDim.y)-thread 2D blocks.
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // Global element index for this thread within the whole launch
    unsigned int globalIdx = blockIdx.x * (blockDim.x * blockDim.y)
                           + threadIdx.y * blockDim.x
                           + threadIdx.x;
    // Guard the tail: the grid may cover more threads than elements
    if(globalIdx < _TotalCombinations)
        _Results[globalIdx] = 0.95f;
}
#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// Parenthesized so the macro expands safely inside larger expressions
// (e.g. `n / BLOCK_SIZE` would otherwise expand to `n / 32 * 32`).
#define BLOCK_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y) // Static block size of 32*32 (1024)
// Fill a device array of totalCombinations floats with 0.95f on the GPU,
// copy it back, and print the sum (expected: totalCombinations * 0.95).
// Every CUDA step is checked with cudaCheckErrors so failures are loud.
int main()
{
    // The number of simulations to run
    unsigned int totalCombinations = 65000000;
    // Ceiling division: how many BLOCK_SIZE-thread blocks cover all
    // combinations (replaces the original O(n) counting loop)
    int gridsize = (int)((totalCombinations + (BLOCK_SIZE) - 1) / (BLOCK_SIZE));
    printf("gridsize = %d, blocksize = %d\n", gridsize, BLOCK_SIZE);
    // Allocate and zero host memory
    float* host_results = new float[totalCombinations];
    memset(host_results, 0, sizeof(float) * totalCombinations);
    float* dev_results = 0;
    // Check device selection too -- this was previously unchecked
    cudaSetDevice(0);
    cudaCheckErrors("cudaSetDevice fail");
    // Allocate device memory
    cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float));
    cudaCheckErrors("cudaMalloc fail");
    // 2D thread block; the grid itself is 1D (gridsize blocks)
    dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    // Launch kernel; the check right after catches launch-configuration
    // errors such as a grid dimension beyond the device limit
    addKernel<<<gridsize, block>>>(dev_results, totalCombinations);
    cudaCheckErrors("kernel fail");
    // Wait for completion and surface asynchronous execution errors
    cudaDeviceSynchronize();
    cudaCheckErrors("sync fail");
    // Copy device data back to host
    cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");
    double total = 0.0;
    // Total the results in the host array (double accumulator avoids
    // float precision loss over tens of millions of additions)
    for(unsigned int i = 0; i < totalCombinations; i++)
        total += host_results[i];
    // Print results to screen
    printf("Total %f\n", total);
    // Release device and host memory (device buffer was previously leaked)
    cudaFree(dev_results);
    cudaCheckErrors("cudaFree fail");
    delete[] host_results;
    return 0;
}
If you found this helpful, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate via: