Parallel batched small matrices in CUDA not working with for loop

Question

I have a number (say a million) of small matrices 4 x 3. I would like to do several simple operations with them and I would like my CUDA kernel to parallelize the matrices index only, (not the row/column operations). Let me explain better: I pass as an input to my GPU Kernel an array of matrices A[MatrixNumb][row][col] and I would like operation parallelization to be only on the MatrixNumb (therefore I want to force the operation in one dimension. The example below is with 3 Matrices only, for simplicity. It compiles and runs, however it gives me the wrong results. Basically, it returns the same matrices I give it as an input. I cannot understand why and if I am making any mistake, how can I re-write/think the code? I wrote the code using also CudaMallocManaged, in order to have shared memory between host and device, however it gives me the same results using the classic CudaMalloc and using memcpy.

Source.cpp

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <assert.h>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>

#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>

using namespace std;


__global__ void SVD(double*** a, const int m, const int n, const int numMatrices, double** w)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // I would like that each thread runs these loops independently
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14;
    }
  }
  for (int j = 0; j < n; j++) {
    w[idx][j] = 3.14 * a[idx][1][j]* a[idx][1][j];
  }

}


int main()
{
  const int n = 3;
  const int m = 4;
  const int lda = m;
  const int numMatrices = 3;

  random_device device;
  mt19937 generator(device());
  uniform_real_distribution<double> distribution(1., 5.);

  // create pointers
  double*** A = new double** [numMatrices];
  double** w = new double* [numMatrices];

  //ALLOCATE SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    A[nm] = new double* [lda];
    w[nm] = new double[n];

    for (int i = 0; i < lda; i++) {
      A[nm][i] = new double[n];

      for (int j = 0; j < n; j++) {
        cudaMallocManaged((void**)&A[nm][i][j], sizeof(double));
        cudaMallocManaged((void**)&w[nm][j], sizeof(double));
      }
    }
  }

  cout << " memory allocated" << endl;


  //FILL MATRICES INTO SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    A[nm] = new double* [lda];
    w[nm] = new double[n];                                   

    for (int i = 0; i < lda; i++) {
      A[nm][i] = new double[n];

      for (int j = 0; j < n; j++) {
        A[nm][i][j] = distribution(generator);
        w[nm][j] = 0.0;
      }
    }
  }
  cout << " matrix filled " << endl;


  // PRINT MATRICES BEFORE CUDA OPERATION
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  //KERNEL ----------------------------------------------------------------------
  int NThreads = 3;   
  int NBlocks = int(numMatrices / NThreads + 1);
 
  SVD << <NBlocks, NThreads >> > (A, n, m, numMatrices, w);
  cudaDeviceSynchronize();
  cout << " Kernel done " << endl << endl;

  cout << " --- GPU --- " << endl;
  cout << " NEW MATRIX: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  cout << " NEW VECTOR RESULTS: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < n; i++) {
      cout << w[nm][i] << " ";
    }
    cout << endl;
  }

  cout << endl;

  //FREE THE DEVICE'S MEMORY -----------------------------------------------------
  cudaFree(A);
  cudaFree(w);
  cout << " Cuda free " << endl << endl;

  return 0;
}

The (wrong) output I get is the following:

memory allocated
 matrix filled
1.28689 3.76588 3.88649
1.52547 4.42371 2.62566
1.48002 3.33719 1.58413
3.78243 2.8394 3.0249

1.14322 1.70261 2.02784
2.86852 2.87918 3.2896
4.87268 3.52447 1.58414
3.52306 3.84931 3.18212

1.76397 1.41317 4.9765
1.63338 4.79316 2.64009
1.99873 1.72617 1.15974
1.18922 4.21513 1.6695

 Kernel done

 --- GPU ---
 NEW MATRIX:
1.28689 3.76588 3.88649
1.52547 4.42371 2.62566
1.48002 3.33719 1.58413
3.78243 2.8394 3.0249

1.14322 1.70261 2.02784
2.86852 2.87918 3.2896
4.87268 3.52447 1.58414
3.52306 3.84931 3.18212

1.76397 1.41317 4.9765
1.63338 4.79316 2.64009
1.99873 1.72617 1.15974
1.18922 4.21513 1.6695

 NEW VECTOR RESULTS:
0 0 0
0 0 0
0 0 0

 Cuda free

I expected to se the new matrices and the vectors modified by the operations of: a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14; however, It looks like the code does not see the kernel or the kernel does not work properly.

Robert Crovella · Accepted Answer

You had a several issues:

When using managed memory with double or triple pointer access, every pointer in the tree must be allocated using a managed allocator
Your allocation schemes had too many levels, and you were allocating some pointers twice (memory leak).
The order of arguments you are passing to your kernel does not match the order of arguments your kernel expects (n, m were backwards).
Since you are potentially launching more blocks/threads than necessary, your kernel requires a thread check (if-test).
Your code should be in a .cu file, not a .cpp file.

The following code has the above issues addressed, and seems to run without runtime error.

$ cat t61.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <assert.h>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>


using namespace std;


__global__ void SVD(double*** a, const int m, const int n, const int numMatrices, double** w)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < numMatrices){
  // I would like that each thread runs these loops independently
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14;
    }
  }
  for (int j = 0; j < n; j++) {
    w[idx][j] = 3.14 * a[idx][1][j]* a[idx][1][j];
  }
  }
}


int main()
{
  const int n = 3;
  const int m = 4;
  const int lda = m;
  const int numMatrices = 3;

  random_device device;
  mt19937 generator(device());
  uniform_real_distribution<double> distribution(1., 5.);

  // create pointers
  double*** A;
  cudaMallocManaged(&A, sizeof(double**)*numMatrices);
  double** w;
  cudaMallocManaged(&w, sizeof(double*)* numMatrices);

  //ALLOCATE SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    cudaMallocManaged(&(A[nm]), sizeof(double*)*lda);
    cudaMallocManaged(&(w[nm]), sizeof(double)*n);

    for (int i = 0; i < lda; i++) {
      cudaMallocManaged(&(A[nm][i]), sizeof(double)*n);
      }
    }

  cout << " memory allocated" << endl;


  //FILL MATRICES INTO SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        A[nm][i][j] = distribution(generator);
        w[nm][j] = 0.0;
      }
    }
  }
  cout << " matrix filled " << endl;


  // PRINT MATRICES BEFORE CUDA OPERATION
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  //KERNEL ----------------------------------------------------------------------
  int NThreads = 3;
  int NBlocks = int(numMatrices / NThreads + 1);

  SVD << <NBlocks, NThreads >> > (A, m, n, numMatrices, w);
  cudaDeviceSynchronize();
  cout << " Kernel done " << endl << endl;

  cout << " --- GPU --- " << endl;
  cout << " NEW MATRIX: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  cout << " NEW VECTOR RESULTS: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < n; i++) {
      cout << w[nm][i] << " ";
    }
    cout << endl;
  }

  cout << endl;

  //FREE THE DEVICE'S MEMORY -----------------------------------------------------
  cudaFree(A);
  cudaFree(w);
  cout << " Cuda free " << endl << endl;

  return 0;
}
$ nvcc -o t61 t61.cu
$ cuda-memcheck ./t61
========= CUDA-MEMCHECK
 memory allocated
 matrix filled
3.73406 3.51919 3.249
1.52374 2.678 2.50944
3.67358 1.15831 3.26327
2.58468 1.49937 2.67133

1.72144 2.99183 3.11156
1.06247 3.34983 4.23568
3.49749 3.07641 3.42827
4.09607 2.00557 2.12049

3.65427 3.98966 4.73428
1.68397 4.3746 2.95533
2.1914 4.96086 1.7165
3.10095 2.61781 4.52626

 Kernel done

 --- GPU ---
 NEW MATRIX:
43.7816 38.888 33.1458
7.29041 22.5191 19.7735
42.375 4.2129 33.4376
20.977 7.05908 22.407

9.30494 28.1062 30.4008
3.54453 35.2351 56.3348
38.41 29.7179 36.9045
52.6821 12.6301 14.1189

41.9306 49.9807 70.3782
8.90432 60.0905 27.4247
15.079 77.2757 9.25165
30.1939 21.5182 64.3294

 NEW VECTOR RESULTS:
166.891 1592.32 1227.71
39.4501 3898.35 9965.14
248.961 11338.1 2361.64

 Cuda free

========= ERROR SUMMARY: 0 errors
$

Parallel batched small matrices in CUDA not working with for loop

Tags:

c++

cuda

gpu

Kosuke88kk

1 Answers

Robert Crovella

Recent Activity

Donate For Us

Parallel batched small matrices in CUDA not working with for loop

Tags:

c++

cuda

gpu

Kosuke88kk

1 Answers

Robert Crovella

Related questions

Recent Activity

Donate For Us