I'm trying the "hello world" program of CUDA programming: adding two vectors together. Here's the program I have tried:
#include <cuda.h>
#include <stdio.h>

#define SIZE 10

// Each thread adds one pair of elements.
__global__ void vecAdd(float* A, float* B, float* C)
{
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}

int main()
{
    float A[SIZE], B[SIZE], C[SIZE];
    float *devPtrA, *devPtrB, *devPtrC;
    size_t memsize = SIZE * sizeof(float);

    for (int i = 0; i < SIZE; i++) {
        A[i] = i;
        B[i] = i;
    }

    // Allocate device buffers and copy the inputs over.
    cudaMalloc(&devPtrA, memsize);
    cudaMalloc(&devPtrB, memsize);
    cudaMalloc(&devPtrC, memsize);
    cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice);
    cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice);

    // Launch one block of SIZE threads.
    vecAdd<<<1, SIZE>>>(devPtrA, devPtrB, devPtrC);

    cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost);
    for (int i = 0; i < SIZE; i++)
        printf("C[%d]: %f + %f => %f\n", i, A[i], B[i], C[i]);

    cudaFree(devPtrA);
    cudaFree(devPtrB);
    cudaFree(devPtrC);
}
Compiled with:
nvcc cuda.cu
Output is this:
C[0]: 0.000000 + 0.000000 => 0.000000
C[1]: 1.000000 + 1.000000 => 0.000000
C[2]: 2.000000 + 2.000000 => 0.000000
C[3]: 3.000000 + 3.000000 => 0.000000
C[4]: 4.000000 + 4.000000 => 0.000000
C[5]: 5.000000 + 5.000000 => 0.000000
C[6]: 6.000000 + 6.000000 => 0.000000
C[7]: 7.000000 + 7.000000 => 0.000000
C[8]: 8.000000 + 8.000000 => 366987238703104.000000
C[9]: 9.000000 + 9.000000 => 0.000000
Every time I run it, I get a different answer for C[8], but the results for all the other elements are always 0.000000.
The Ubuntu 11.04 system is a 64-bit Xeon server with 4 cores, running the latest NVIDIA drivers (downloaded on Oct 4, 2012). The card is an EVGA GeForce GT 430 with 96 cores and 1 GB of RAM.
What should I do to figure out what's going on?
It seems that your drivers are not initialized. Also, not checking the CUDA return codes is always bad practice and should be avoided. Here is a simple function plus macro that you can use for CUDA calls (quoted from CUDA by Example):
static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
Now wrap each of your CUDA calls like this:
HANDLE_ERROR(cudaMemcpy(...));
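As a sketch (not part of the original answer), here is how the host code from the question might look with every runtime call checked. Note that a kernel launch itself returns no status, so the usual pattern is to call cudaGetLastError() right after the launch, and optionally cudaDeviceSynchronize() to catch errors raised while the kernel runs:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define SIZE 10

static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

__global__ void vecAdd(float* A, float* B, float* C)
{
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}

int main()
{
    float A[SIZE], B[SIZE], C[SIZE];
    float *devPtrA, *devPtrB, *devPtrC;
    size_t memsize = SIZE * sizeof(float);

    for (int i = 0; i < SIZE; i++) { A[i] = i; B[i] = i; }

    // Every runtime call now aborts with a readable message on failure.
    // If the driver is not initialized, the first cudaMalloc will report it.
    HANDLE_ERROR( cudaMalloc(&devPtrA, memsize) );
    HANDLE_ERROR( cudaMalloc(&devPtrB, memsize) );
    HANDLE_ERROR( cudaMalloc(&devPtrC, memsize) );
    HANDLE_ERROR( cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice) );
    HANDLE_ERROR( cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice) );

    vecAdd<<<1, SIZE>>>(devPtrA, devPtrB, devPtrC);
    // A kernel launch returns no error code directly; query it afterwards.
    HANDLE_ERROR( cudaGetLastError() );        // launch-time errors
    HANDLE_ERROR( cudaDeviceSynchronize() );   // errors during kernel execution

    HANDLE_ERROR( cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost) );

    for (int i = 0; i < SIZE; i++)
        printf("C[%d]: %f + %f => %f\n", i, A[i], B[i], C[i]);

    HANDLE_ERROR( cudaFree(devPtrA) );
    HANDLE_ERROR( cudaFree(devPtrB) );
    HANDLE_ERROR( cudaFree(devPtrC) );
    return 0;
}

With this in place, a failure like the one in the question produces a specific message and line number instead of silently returning garbage.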
Most likely cause: the NVIDIA drivers weren't loaded. On a headless Linux system, X Windows isn't running, so the drivers aren't loaded at boot time.
Run nvidia-smi -a as root to load them; a successful run confirms this by printing a full status report.
Although the drivers are now loaded, they still need to be initialized every time a CUDA program is run. Put the drivers into persistence mode with nvidia-smi -pm 1 so they remain initialized all the time, and add that command to a boot script (such as rc.local) so it happens at every boot.
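To verify that the driver and runtime are actually reachable before debugging anything else, a minimal standalone check along these lines can help (this is a sketch, not from the original answer; it only uses the standard CUDA runtime API):

#include <cuda_runtime.h>
#include <stdio.h>

// Minimal sanity check: if the driver is missing or not loaded,
// cudaGetDeviceCount() is typically the first call to fail.
int main()
{
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        printf("CUDA initialization failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("Found %d CUDA-capable device(s).\n", count);
    return 0;
}

If this program fails or reports zero devices, the problem is in the driver setup rather than in your vector-addition code.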