Reducing matrix rows or columns in CUDA

Question

I'm using CUDA with cuBLAS to perform matrix operations.

I need to sum the rows (or columns) of a matrix. Currently I'm doing it by multiplying the matrix with a ones vector but this doesn't seem so efficient.

Is there any better way? Couldn't find anything in cuBLAS.

6 revs · Accepted Answer

Actually multiplying the matrix with a ones vector using cublas_gemv() is a very efficient way, unless you are considering write your own kernel by hand.

You can easily profile the mem bandwidth of cublas_gemv(). It's very close to that of simply reading the whole matrix data once, which can be seen as the theoretical peak performance of matrix row/col summation.

The extra operation "x1.0" won't lead to much performance reduction because:

cublas_gemv() is basically a mem bandwidth bound operation, extra arithmetic instructions won't be the bottleneck;
FMA instruction further reduce the instruction throughput;
mem of ones vector is usually much smaller than that of the matrix, and can be easily cache by GPU to reduce to mem bandwidth.

cublas_gemv() also help you deal with the matrix layout problem. It works on row/col-major and arbitrary padding.

I also asked a similar question about this. My experiment shows cublas_gemv() is better than segmented reduce using Thrust::reduce_by_key, which is another approach of matrix row summation.

Vitality · Answer

Posts related to this one containing useful answers on the same topic are available at

Reduce matrix rows with CUDA

and

Reduce matrix columns with CUDA.

Here I just want to point out how the approach of reducing the columns of a matrix through the multiplication of a row by the same matrix can be generalized to perform the linear combination of an ensemble of vectors. In other words, if one wants to calculate the following vector basis expansion

enter image description here

where f(x_m) are samples of the function f(x), while the \psi_n's are basis functions and the c_n's are expansion coefficients, then the \psi_n's can be organized in a N x M matrix and the coefficients c_n's in a row vector and then compute the vector x matrix multiplication using cublas<t>gemv.

Below, I'm reporting a fully worked example:

#include <cublas_v2.h>

#include <thrust/device_vector.h>
#include <thrust/random.h>

#include <stdio.h>
#include <iostream>

#include "Utilities.cuh"

/********************************************/
/* LINEAR COMBINATION FUNCTION - FLOAT CASE */
/********************************************/
void linearCombination(const float * __restrict__ d_coeff, const float * __restrict__ d_basis_functions_real, float * __restrict__ d_linear_combination,
                       const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) {

    float alpha = 1.f;
    float beta  = 0.f;
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points, 
                               d_coeff, 1, &beta, d_linear_combination, 1));

}

void linearCombination(const double * __restrict__ d_coeff, const double * __restrict__ d_basis_functions_real, double * __restrict__ d_linear_combination,
                       const int N_basis_functions, const int N_sampling_points, const cublasHandle_t handle) {

    double alpha = 1.;
    double beta  = 0.;
    cublasSafeCall(cublasDgemv(handle, CUBLAS_OP_N, N_sampling_points, N_basis_functions, &alpha, d_basis_functions_real, N_sampling_points, 
                               d_coeff, 1, &beta, d_linear_combination, 1));

}

/********/
/* MAIN */
/********/
int main()
{
    const int N_basis_functions = 5;     // --- Number of rows                  -> Number of basis functions
    const int N_sampling_points = 8;     // --- Number of columns               -> Number of sampling points of the basis functions

    // --- Random uniform integer distribution between 10 and 99
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(10, 99);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_basis_functions_real(N_basis_functions * N_sampling_points);
    for (size_t i = 0; i < d_basis_functions_real.size(); i++) d_basis_functions_real[i] = (float)dist(rng);

    thrust::device_vector<double> d_basis_functions_double_real(N_basis_functions * N_sampling_points);
    for (size_t i = 0; i < d_basis_functions_double_real.size(); i++) d_basis_functions_double_real[i] = (double)dist(rng);

    /************************************/
    /* COMPUTING THE LINEAR COMBINATION */
    /************************************/
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    thrust::device_vector<float>  d_linear_combination_real(N_sampling_points);
    thrust::device_vector<double> d_linear_combination_double_real(N_sampling_points);
    thrust::device_vector<float>  d_coeff_real(N_basis_functions, 1.f);
    thrust::device_vector<double> d_coeff_double_real(N_basis_functions, 1.);

    linearCombination(thrust::raw_pointer_cast(d_coeff_real.data()), thrust::raw_pointer_cast(d_basis_functions_real.data()), thrust::raw_pointer_cast(d_linear_combination_real.data()),
                      N_basis_functions, N_sampling_points, handle);
    linearCombination(thrust::raw_pointer_cast(d_coeff_double_real.data()), thrust::raw_pointer_cast(d_basis_functions_double_real.data()), thrust::raw_pointer_cast(d_linear_combination_double_real.data()),
                      N_basis_functions, N_sampling_points, handle);

    /*************************/
    /* DISPLAYING THE RESULT */
    /*************************/
    std::cout << "Real case 

";
    for(int j = 0; j < N_sampling_points; j++) {
        std::cout << "Column " << j << " - [ ";
        for(int i = 0; i < N_basis_functions; i++)
            std::cout << d_basis_functions_real[i * N_sampling_points + j] << " ";
        std::cout << "] = " << d_linear_combination_real[j] << "
";
    }

    std::cout << "

Double real case 

";
    for(int j = 0; j < N_sampling_points; j++) {
        std::cout << "Column " << j << " - [ ";
        for(int i = 0; i < N_basis_functions; i++)
            std::cout << d_basis_functions_double_real[i * N_sampling_points + j] << " ";
        std::cout << "] = " << d_linear_combination_double_real[j] << "
";
    }

    return 0;
}

Reducing matrix rows or columns in CUDA

Tags:

cuda

cublas

Ran

2 Answers

6 revs

Vitality

Recent Activity

Donate For Us

Reducing matrix rows or columns in CUDA

Tags:

cuda

cublas

Ran

2 Answers

6 revs

Vitality

Related questions

Recent Activity

Donate For Us