suppose I have this class :
class Particle
{
double *_w;
};
And I want to send nParticles objects of Particle
to my kernel. Allocating space for these objects is easy :
Particle *dev_p;
cudaStatus = cudaMalloc((void**)&dev_P, nParticles * sizeof(Particle));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
Also suppose that nParticles is 100. Now I need to allocate 300 double for each _w
in a Particle
object. How can I do this? I tried this code :
for( int i = 0; i < nParticles; i++){
cudaStatus = cudaMalloc((void**)&(dev_P[i]._w), 300 * sizeof(double));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
}
But debugging with Nsight stops when I access dev_p[i]._w[j] .
Perhaps you should include a complete simple example. (If I compile your code above and run it by itself, on linux, I get a seg fault at the second cudaMalloc operation). One wrinkle I see is that since you have in the first step allocated the particle objects in device memory, when you go to allocate the _w
pointers, you are passing a pointer to cudaMalloc that is already in device memory. You're supposed to pass a host-based pointer to cudaMalloc, which it will then assign to the allocated area in device (global) memory.
One possible solution that I think conforms to what I see in yoru example is like this:
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
class Particle
{
public:
double *_w;
};
__global__ void test(Particle *p){
int idx=threadIdx.x + blockDim.x*blockIdx.x;
if (idx == 2){
printf("dev_p[2]._w[2] = %f\n", p[idx]._w[2]);
}
}
int main() {
int nParticles=100;
Particle *dev_p;
double *w[nParticles];
cudaMalloc((void**)&dev_p, nParticles * sizeof(Particle));
cudaCheckErrors("cudaMalloc1 fail");
for( int i = 0; i < nParticles; i++){
cudaMalloc((void**)&(w[i]), 300 * sizeof(double));
cudaCheckErrors("cudaMalloc2 fail");
cudaMemcpy(&(dev_p[i]._w), &(w[i]), sizeof(double *), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
}
double testval = 32.7;
cudaMemcpy(w[2]+2, &testval, sizeof(double), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy2 fail");
test<<<1, 32>>>(dev_p);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
printf("Done!\n");
}
Here we are creating a separate set of pointers on the host to use for cudaMalloc purposes, then copying those allocated pointers down to the device for use as device pointers (this is legal with UVA).
Another approach would be to allocate the _w pointers on the device side. This may serve your purposes as well.
All of the above I am assuming cc 2.0 or greater.
Using a methodology similar to what is described here, it may be possible to collapse the device-side allocations done in a loop down to a single allocation:
cudaMalloc(&(w[0]), nParticles*300*sizeof(double));
cudaCheckErrors("cudaMalloc2 fail");
cudaMemcpy(&(dev_p[0]._w), &(w[0]), sizeof(double *), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
for( int i = 1; i < nParticles; i++){
w[i] = w[i-1] + 300;
cudaMemcpy(&(dev_p[i]._w), &(w[i]), sizeof(double *), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
}
The cudaMemcpy
operations still have to be done individually.
There are two ways of doing it. First one - you allocate the memory on the host filling up host array of particle objects. Once complete, you copy the host array to the device through cudaMemcpy
.
Second way - on Fermi and higher you can call malloc
in the kernel, filling the dev_P
array from the kernel.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With