Dealing with matrices in CUDA: understanding basic concepts

Question

I'm building a CUDA kernel to compute the numerical N*N jacobian of a function, using finite differences; in the example I provided, it is the square function (each entry of the vector is squared). The host coded allocates in linear memory, while I'm using a 2-dimensional indexing in the kernel.

My issue is that I haven't found a way to sum on the diagonal of the matrices cudaMalloc'ed. My attempt has been to use the statement threadIdx.x == blockIdx.x as a condition for the diagonal, but instead it evaluates to true only for them both at 0.

Here is the kernel and EDIT: I posted the whole code as an answer, based on the suggestions in the comments (the main() is basically the same, while the kernel is not)

template <typename T>
__global__ void jacobian_kernel (
                T * J,
                const T t0, 
                const T tn,
                const T h,
                const T * u0, 
                const T * un, 
                const T * un_old)
{
    T cgamma = 2 - sqrtf(2);
    const unsigned int t = threadIdx.x;
    const unsigned int b = blockIdx.x;
    const unsigned int tid = t + b * blockDim.x;
    /*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
    /*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ T sm_temp_du[BLOCK_SIZE];

    T* temp_du = &sm_temp_du[0];

    if (tid < N )
    {
        temp_sx[b][t] = un[t]; 
        temp_dx[b][t] = un[t];

        if ( t == b )
        {
            if ( tn == t0 )
            {   
                temp_du[t] = u0[t]*0.001; 

                temp_sx[b][t] += temp_du[t]; //(*)
                temp_dx[b][t] -= temp_du[t];

                temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
                temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );

                temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
                temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );

            }

            else
            {
                temp_du[t] = MAX( un[t] - un_old[t], 10e-6 );
                temp_sx[b][t] += temp_du[t];
                temp_dx[b][t] -= temp_du[t];
            }
        }
        __syncthreads();

        //J = f(tn, un + du)
        d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
        d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);

        __syncthreads();
        J[tid] = (temp_sx[b][t] - temp_dx[b][t]) * powf((2 * temp_du[t]), -1);

        //J[tid]*= - h*cgamma/2;
        //J[tid]+= ( t == b ? 1 : 0);
        //J[tid] = temp_J[tid];
    }
}

The general procedure for computing the jacobian is

Copy un into every row of temp_sx and temp_dx
Compute du as a 0.01 magnitude from u0
Sum du to the diagonal of temp_sx, subtract du from the diagonal of temp_dx
Compute the square function on each entry of temp_sx and temp_dx
Subtract them and divide every entry by 2*du

This procedure can be summarized with (f(un + du*e_i) - f(un - du*e_i))/2*du.

My problem is to sum du to the diagonal of the matrices of temp_sx and temp_dx like I tried in (*). How can I achieve that?

EDIT: Now calling 1D blocks and threads; in fact, .y axis wasn't used at all in the kernel. I'm calling the kernel with a fixed amount of shared memory

Note that in int main() I'm calling the kernel with

#define REAL sizeof(float)
#define N 32
#define BLOCK_SIZE 16
#define NUM_BLOCKS ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)
...
dim3 dimGrid(NUM_BLOCKS,); 
dim3 dimBlock(BLOCK_SIZE); 
size_t shm_size = N*N*REAL;
jacobian_kernel <<< dimGrid, dimBlock, size_t shm_size >>> (...);

So that I attempt to deal with block-splitting the function calls. In the kernel to sum on the diagonal I used if(threadIdx.x == blockIdx.x){...}. Why isn't this correct? I'm asking it because while debugging and making the code print the statement, It only evaluates true if they both are 0. Thus du[0] is the only numerical value and the matrix becomes nan. Note that this approach worked with the first code I built, where instead I called the kernel with

jacobian_kernel <<< N, N >>> (...)

So that when threadIdx.x == blockIdx.x the element is on the diagonal. This approach doesn't fit anymore though, since now I need to deal with larger N (possibly larger than 1024, which is the maximum number of threads per block).

What statement should I put there that works even if the matrices are split into blocks and threads?

Let me know if I should share some other info.

Eugenio · Accepted Answer

Here is how I managed to solve my problem, based on the suggestion in the comments on the answer. The example is compilable, provided you put helper_cuda.h and helper_string.h in the same directory or you add -I directive to the CUDA examples include path, installed along with the CUDA toolkit. The relevant changes are only in the kernel; there's a minor change in the main() though, since I was calling double the resources to execute the kernel, but the .y axis of the grid of thread blocks wasn't even used at all, so it didn't generate any error.

#include <stdio.h> 
#include <stdlib.h> 
#include <iostream>
#include <math.h>
#include <assert.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include "helper_cuda.h"
#include "helper_string.h"
#include <fstream>

#ifndef MAX
    #define MAX(a,b) ((a > b) ? a : b)
#endif
#define REAL sizeof(float)
#define N       128
#define BLOCK_SIZE  128
#define NUM_BLOCKS  ((N*N + BLOCK_SIZE - 1)/ BLOCK_SIZE)

template <typename T>
inline void printmatrix( T mat, int rows, int cols);
template <typename T>
__global__ void jacobian_kernel ( const T * A, T * J, const T t0, const T tn, const T h, const T * u0, const T * un, const T * un_old);
template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h = 1);
template<typename T>

int main ()
{   
    float t0    = 0.; //float tn = 0.;
    float h     = 0.1;

    float* u0 = (float*)malloc(REAL*N); for(int i = 0; i < N; ++i){u0[i] = i+1;}
    float* un = (float*)malloc(REAL*N); memcpy(un, u0, REAL*N);
    float* un_old = (float*)malloc(REAL*N); memcpy(un_old, u0, REAL*N);
    float* J = (float*)malloc(REAL*N*N);
    float* A = (float*)malloc(REAL*N*N); host_heat_matrix(A);

    float *d_u0;
    float *d_un;
    float *d_un_old;
    float *d_J;
    float *d_A;

    checkCudaErrors(cudaMalloc((void**)&d_u0,   REAL*N)); //printf("1: %p
", d_u0);
    checkCudaErrors(cudaMalloc((void**)&d_un,   REAL*N)); //printf("2: %p
", d_un);
    checkCudaErrors(cudaMalloc((void**)&d_un_old,   REAL*N)); //printf("3: %p
", d_un_old);
    checkCudaErrors(cudaMalloc((void**)&d_J,    REAL*N*N)); //printf("4: %p
", d_J);
    checkCudaErrors(cudaMalloc((void**)&d_A,    REAL*N*N)); //printf("4: %p
", d_J);
    checkCudaErrors(cudaMemcpy(d_u0, u0,        REAL*N, cudaMemcpyHostToDevice)); assert(d_u0 != NULL);
    checkCudaErrors(cudaMemcpy(d_un, un,        REAL*N, cudaMemcpyHostToDevice)); assert(d_un != NULL);
    checkCudaErrors(cudaMemcpy(d_un_old, un_old,    REAL*N, cudaMemcpyHostToDevice)); assert(d_un_old != NULL);
    checkCudaErrors(cudaMemcpy(d_J, J,      REAL*N*N, cudaMemcpyHostToDevice)); assert(d_J != NULL);
    checkCudaErrors(cudaMemcpy(d_A, A, REAL*N*N, cudaMemcpyHostToDevice)); assert(d_A != NULL);

    dim3 dimGrid(NUM_BLOCKS); std::cout << "NUM_BLOCKS 	" << dimGrid.x << "
";
    dim3 dimBlock(BLOCK_SIZE); std::cout << "BLOCK_SIZE 	" << dimBlock.x << "
";
    size_t shm_size = N*REAL; //std::cout << shm_size << "
";

    //HERE IS A RELEVANT CHANGE OF THE MAIN, SINCE I WAS CALLING 
    //THE KERNEL WITH A 2D GRID BUT WITHOUT USING THE .y AXIS,
    //WHILE NOW THE GRID IS 1D
    jacobian_kernel <<< dimGrid, dimBlock, shm_size >>> (d_A, d_J, t0, t0, h, d_u0, d_un, d_un_old);

    checkCudaErrors(cudaMemcpy(J, d_J, REAL*N*N, cudaMemcpyDeviceToHost)); //printf("4: %p
", d_J);

    printmatrix( J, N, N);

    checkCudaErrors(cudaDeviceReset());
    free(u0);
    free(un);
    free(un_old);
    free(J);

}

template <typename T>
__global__ void jacobian_kernel (
                            const T * A,
                            T * J,
                            const T t0, 
                            const T tn,
                            const T h,
                            const T * u0, 
                            const T * un, 
                            const T * un_old)
{
    T cgamma = 2 - sqrtf(2);
    const unsigned int t = threadIdx.x;
    const unsigned int b = blockIdx.x;
    const unsigned int tid = t + b * blockDim.x;
    /*__shared__*/ T temp_sx[BLOCK_SIZE][BLOCK_SIZE];
    /*__shared__*/ T temp_dx[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ T sm_temp_du;
    T* temp_du = &sm_temp_du;

    //HERE IS A RELEVANT CHANGE (*)
    if ( t < BLOCK_SIZE && b < NUM_BLOCKS )
    {
        temp_sx[b][t] = un[t]; //printf("temp_sx[%d] = %f
", t,(temp_sx[b][t]));
        temp_dx[b][t] = un[t];
        //printf("t = %d, b = %d, t + b * blockDim.x = %d 
",t, b, tid);

        //HERE IS A NOTE (**)
        if ( t == b )
        {
            //printf("t = %d, b = %d 
",t, b);
            if ( tn == t0 )
            {   
                *temp_du = u0[t]*0.001;

                temp_sx[b][t] += *temp_du;
                temp_dx[b][t] -= *temp_du;

                temp_sx[b][t] += ( abs( temp_sx[b][t] ) < 10e-6 ? 0.1 : 0 );
                temp_dx[b][t] += ( abs( temp_dx[b][t] ) < 10e-6 ? 0.1 : 0 );

                temp_sx[b][t] = ( temp_sx[b][t] == 0 ? 0.1 : temp_sx[b][t] );
                temp_dx[b][t] = ( temp_dx[b][t] == 0 ? 0.1 : temp_dx[b][t] );

            }

            else
            {
                *temp_du = MAX( un[t] - un_old[t], 10e-6 );
                temp_sx[b][t] += *temp_du;
                temp_dx[b][t] -= *temp_du;
            }
        ;
        }
//printf("du[%d] %f
", tid, (*temp_du));
        __syncthreads();
        //printf("temp_sx[%d][%d] = %f
", b, t, temp_sx[b][t]);
        //printf("temp_dx[%d][%d] = %f
", b, t, temp_dx[b][t]);

        //d_func(tn, (temp_sx[b]), (temp_sx[b]), 1.f);
        //d_func(tn, (temp_dx[b]), (temp_dx[b]), 1.f);
        matvec_dev( tn, A, (temp_sx[b]), (temp_sx[b]), N, N, 1.f );
        matvec_dev( tn, A, (temp_dx[b]), (temp_dx[b]), N, N, 1.f );
        __syncthreads();
        //printf("temp_sx_later[%d][%d] = %f
", b, t, (temp_sx[b][t]));
        //printf("temp_sx_later[%d][%d] - temp_dx_later[%d][%d] = %f
", b,t,b,t, (temp_sx[b][t] - temp_dx[b][t]) / 2 * *temp_du);
        //if (t == b ) printf( "2du[%d]^-1 = %f
",t, powf((2 * *temp_du), -1));
        J[tid] = (temp_sx[b][t] - temp_dx[b][t]) / (2 * *temp_du);
    }
}                           

template<typename T>
__device__ void d_func(const T t, const T u[], T res[], const T h )
{
    __shared__ float temp_u;
    temp_u = u[threadIdx.x];
    res[threadIdx.x] = h*powf( (temp_u), 2);
}

template <typename T>
inline void printmatrix( T mat, int rows, int cols)
{
    std::ofstream matrix_out;
    matrix_out.open( "heat_matrix.txt", std::ofstream::out);
    for( int i = 0; i < rows; i++)
    {
        for( int j = 0;  j <cols; j++)
        {
            double next = mat[i + N*j];
            matrix_out << ( (next >= 0) ? " " : "") << next << " ";
        }
        matrix_out << "
";
    }   
}

The relevant change is on (*). Before I used if (tid < N) which has two downsides:

First, it is wrong, since it should be tid < N*N, as my data is 2D, while tid is a global index which tracks all the data.
Even if I wrote tid < N*N, since I'm splitting the function calls into blocks, the t < BLOCK_SIZE && b < NUM_BLOCKS seems clearer to me in how the indexing is arranged in the code.

Moreover, the statement t == b in (**) is actually the right one to operate on the diagonal elements of the matrix. The fact that it was evaluated true only on 0 was because of my error right above.

Thanks for the suggestions!

Dealing with matrices in CUDA: understanding basic concepts

Tags:

matrix

cuda

Eugenio

1 Answers

Eugenio

Recent Activity

Donate For Us

Dealing with matrices in CUDA: understanding basic concepts

Tags:

matrix

cuda

Eugenio

1 Answers

Eugenio

Related questions

Recent Activity

Donate For Us