I am writing a simpled code about the addition of the elements of 2 matrices A and B; the code is quite simple and it is inspired on the example given in chapter 2 of the CUDA C Programming Guide.
#include <stdio.h>
#include <stdlib.h>
#define N 2
__global__ void MatAdd(int A[][N], int B[][N], int C[][N]){
int i = threadIdx.x;
int j = threadIdx.y;
C[i][j] = A[i][j] + B[i][j];
}
int main(){
int A[N][N] = {{1,2},{3,4}};
int B[N][N] = {{5,6},{7,8}};
int C[N][N] = {{0,0},{0,0}};
int (*pA)[N], (*pB)[N], (*pC)[N];
cudaMalloc((void**)&pA, (N*N)*sizeof(int));
cudaMalloc((void**)&pB, (N*N)*sizeof(int));
cudaMalloc((void**)&pC, (N*N)*sizeof(int));
cudaMemcpy(pA, A, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(pB, B, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(pC, C, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
int numBlocks = 1;
dim3 threadsPerBlock(N,N);
MatAdd<<<numBlocks,threadsPerBlock>>>(A,B,C);
cudaMemcpy(C, pC, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);
int i, j; printf("C = \n");
for(i=0;i<N;i++){
for(j=0;j<N;j++){
printf("%d ", C[i][j]);
}
printf("\n");
}
cudaFree(pA);
cudaFree(pB);
cudaFree(pC);
printf("\n");
return 0;
}
when i run it i keep getting the initial matrix C = [0 0 ; 0 0] instead of the addition of the elements(i,j) of the 2 matrices A and B; i have previously done another example about the addition of the elements of two arrays and it seems to work fine; however this time i don't know why it does not work.
I believe there's something wrong with the cudaMalloc command by i don't really know what else could it be.
Any ideas?
MatAdd<<<numBlocks,threadsPerBlock>>>(pA,pB,pC);
instead of MatAdd<<<numBlocks,threadsPerBlock>>>(A,B,C);
solves the problem.
The reason is that A,B
and C
are allocated on the CPU, while pA,pB
and pC
are allocated of the GPU, using CudaMalloc()
. Once pA,pB
and pC
are allocated, the values are sent from the CPU to GPU by cudaMemcpy(pA, A, (N*N)*sizeof(int), cudaMemcpyHostToDevice);
Then, the addition is performed on the GPU, that is with pA,pB
and pC
. To use printf
, the result pC
is sent from the GPU to the CPU via cudaMemcpy(C, pC, (N*N)*sizeof(int), cudaMemcpyDeviceToHost);
Think as if the CPU cannot see pA
and the GPU cannot see A
.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With