I am trying to loop over a 2-dimensional array in CUDA efficiently. In host code I have:
double **h_matrix; // Matrix on host of size Nx by Ny
double tmp;
...
for (i = 0; i < Nx; i++) {
    for (j = 0; j < Ny; j++) {
        tmp = h_matrix[i][j];
        ... // Perform some operation on tmp
        h_matrix[i][j] = tmp;
    }
}
To perform a similar task efficiently in CUDA, I understand that I have to use cudaMallocPitch() to allocate memory for the 2D array, as shown in the CUDA Programming Guide (scroll down a bit for the example). That example doesn't really help much, since the kernel doesn't use any information about the grid, block, or thread executing it, even though it is launched as <<<100, 512>>>.

NVIDIA's Parallel Forall blog suggests using grid-stride loops to write flexible and scalable kernels; however, their examples use only 1D arrays. How can I write grid-stride loops for 2D arrays allocated using cudaMallocPitch() to parallelize the code shown above? Should I use a 2D dimGrid and dimBlock, and if so, how?
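For reference, my understanding of pitched memory access (based on the Programming Guide; the variable names here are my own) is that row j of a pitched allocation starts j * pitch bytes past the base pointer, so element (j, i) is reached like this:

// Assumption: d_ptr was allocated with cudaMallocPitch(), pitch is in bytes
float *row = (float *)((char *)d_ptr + j * pitch); // start of row j
float element = row[i];                            // element (j, i)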
Grid-stride loops: In cases where the number of blocks per grid would exceed the hardware limit but the array fits in memory, instead of using one thread per array element we can use one thread to process several elements, using a technique called grid-stride loops.
As the name suggests, the stride of the loop is the total number of threads in the grid (i.e. blockDim.x * gridDim.x).
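For reference, the 1D grid-stride pattern from that blog post looks along these lines (a saxpy-style kernel, shown here as a minimal sketch):

__global__ void saxpy(int n, float a, float *x, float *y)
{
    // Each thread starts at its global index and strides by the total
    // number of threads in the grid, so any grid size covers all n elements
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += blockDim.x * gridDim.x) {
        y[i] = a * x[i] + y[i];
    }
}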
Perhaps an extension of the grid-stride loop concept to the 2D case, for 2D matrices allocated by cudaMallocPitch, could look like this:
#define N 11 // number of rows
#define M 3  // number of columns

__global__ void kernel(float *d_matrix, size_t pitch)
{
    for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < N; j += blockDim.y * gridDim.y) {
        // Rows are pitch bytes apart, so the row pointer must be computed
        // with byte arithmetic using the loop variable j
        float *row_d_matrix = (float *)((char *)d_matrix + j * pitch);
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
            row_d_matrix[i] = ...; // Perform some operation on the element
        }
    }
}
int main()
{
    float *d_matrix;
    size_t pitch;
    cudaMallocPitch((void **)&d_matrix, &pitch, M * sizeof(float), N);
    kernel<<<GridSize, BlockSize>>>(d_matrix, pitch);
    // Other stuff
}
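GridSize and BlockSize are left unspecified in the sketch above; one reasonable choice (my assumption, not part of the original answer) is a 2D block with a grid sized to cover the matrix:

dim3 BlockSize(16, 16); // 256 threads per block; any 2D shape works
dim3 GridSize((M + BlockSize.x - 1) / BlockSize.x,  // columns
              (N + BlockSize.y - 1) / BlockSize.y); // rows

Because the kernel uses grid-stride loops, a smaller grid (even <<<1, BlockSize>>>) would still produce correct results, just with less parallelism.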
Here is a complete compilable example I created based on JackOLantern's answer.
#include <stdio.h>
#include <assert.h>

#define N 11
#define M 3

__global__ void kernel(float *d_matrix, size_t pitch)
{
    for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < N; j += blockDim.y * gridDim.y) {
        float *row_d_matrix = (float *)((char *)d_matrix + j * pitch);
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
            row_d_matrix[i] = (j * M + i) + (j * M + i);
        }
    }
}
void verify(float *h, float *d, int size)
{
    for (int i = 0; i < size; i++) {
        assert(h[i] == d[i]);
    }
    printf("Results match\n");
}
int main()
{
    float *h_matrix;
    float *d_matrix;
    float *dc_matrix;

    h_matrix = (float *) malloc(M * N * sizeof(float));
    dc_matrix = (float *) malloc(M * N * sizeof(float));

    // Compute the expected result on the host
    for (int j = 0; j < N; j++) {
        for (int i = 0; i < M; i++) {
            h_matrix[j * M + i] = (j * M + i) + (j * M + i);
        }
    }

    size_t pitch;
    cudaMallocPitch((void **)&d_matrix, &pitch, M * sizeof(float), N);

    dim3 grid(1, 1, 1);
    dim3 block(3, 3, 1);
    kernel<<<grid, block>>>(d_matrix, pitch);

    // cudaMemcpy2D handles the pitched rows; the host copy is densely packed
    cudaMemcpy2D(dc_matrix, M * sizeof(float), d_matrix, pitch,
                 M * sizeof(float), N, cudaMemcpyDeviceToHost);

    verify(h_matrix, dc_matrix, M * N);

    free(h_matrix);
    cudaFree(d_matrix);
    free(dc_matrix);
}
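The example above omits error checking for brevity. A common way to add it is a small macro that wraps every runtime call (CHECK here is my own hypothetical helper, not part of the CUDA API):

// Hypothetical helper: abort with a message if a CUDA runtime call fails
#define CHECK(call)                                                   \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage inside main():
CHECK(cudaMallocPitch((void **)&d_matrix, &pitch, M * sizeof(float), N));
kernel<<<grid, block>>>(d_matrix, pitch);
CHECK(cudaGetLastError()); // catch kernel launch errors
CHECK(cudaMemcpy2D(dc_matrix, M * sizeof(float), d_matrix, pitch,
                   M * sizeof(float), N, cudaMemcpyDeviceToHost));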