I am working on a project with CUDA. To get the hang of it, I have the following code.
#include <iostream>
using namespace std;
// Kernel: each launched thread increments the int pointed to by foo.
// NOTE(review): ++(*foo) is a non-atomic read-modify-write, so with
// 100 blocks x 25 threads all hitting the same address the updates
// race — at least one write lands, but the final value is undefined.
__global__ void inc(int *foo) {
++(*foo);
}
int main() {
// Host counter and its device-side mirror pointer.
int count = 0, *cuda_count;
// Allocate one int on the device and copy the initial value across.
// NOTE(review): neither call's cudaError_t return is checked.
cudaMalloc((void**)&cuda_count, sizeof(int));
cudaMemcpy(cuda_count, &count, sizeof(int), cudaMemcpyHostToDevice);
cout << "count: " << count << '\n';
// BUG: &count is a HOST address — the kernel should be given the
// device pointer cuda_count. Dereferencing a host pointer on the
// device faults, so the device int is never touched.
inc <<< 100, 25 >>> (&count);
// Copies back the still-zero device value, hence "count: 0" twice.
cudaMemcpy(&count, cuda_count, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(cuda_count);
cout << "count: " << count << '\n';
return 0;
}
Output is
count: 0
count: 0
What's the problem?
Thanks in advance!
You should pass cuda_count (the device pointer) to your kernel function, not &count, which is a host address. Apart from that, all your threads are trying to increment the same memory location without synchronization. The effect of that isn't well-defined: at least one write will succeed, but there is no guarantee how many of the 2500 increments will be reflected in the final value.
You need to prevent that by only letting one thread perform the work:
// Kernel: increment *foo exactly once per launch by letting only the
// first thread of the first block do the write; every other thread
// falls through and does nothing, so no two threads race on foo.
__global__ void inc(int *foo) {
    bool isFirstThread = (threadIdx.x == 0) && (blockIdx.x == 0);
    if (isFirstThread) {
        (*foo)++;
    }
}
(untested)
I found the solution. I just had to use an atomic function, i.e., a function that executes without interference from other threads — no other thread can access the target address until the operation completes.
Code:
#include <iostream>
using namespace std;
// Kernel: every thread adds 1 to the shared counter via atomicAdd,
// so concurrent increments serialize safely instead of racing.
__global__ void inc(int *foo) {
    atomicAdd(&foo[0], 1);
}
// Host driver: allocates a device int, launches 100 blocks x 25 threads
// of inc (2500 atomic increments), and copies the result back.
// Every CUDA runtime call is checked — a silent failure here is exactly
// how "count: 0" bugs hide. Returns 0 on success, 1 on any CUDA error.
int main() {
    int count = 0;
    int *cuda_count = nullptr;

    cudaError_t err = cudaMalloc((void**)&cuda_count, sizeof(int));
    if (err != cudaSuccess) {
        cerr << "cudaMalloc: " << cudaGetErrorString(err) << '\n';
        return 1;
    }
    err = cudaMemcpy(cuda_count, &count, sizeof(int), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        cerr << "cudaMemcpy (host->device): " << cudaGetErrorString(err) << '\n';
        cudaFree(cuda_count);
        return 1;
    }
    cout << "count: " << count << '\n';

    inc <<< 100, 25 >>> (cuda_count);
    // Launch-configuration errors surface here, not from the launch itself.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        cerr << "kernel launch: " << cudaGetErrorString(err) << '\n';
        cudaFree(cuda_count);
        return 1;
    }

    // Blocking copy: also synchronizes with the kernel, so any execution
    // error (e.g. an illegal address) is reported by this call.
    err = cudaMemcpy(&count, cuda_count, sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        cerr << "cudaMemcpy (device->host): " << cudaGetErrorString(err) << '\n';
        cudaFree(cuda_count);
        return 1;
    }
    cudaFree(cuda_count);
    cout << "count: " << count << '\n';
    return 0;
}
Output:
count: 0
count: 2500
Thank you for making me realize the error that I was committing.
If you found this helpful, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate to us with