We are experiencing problems while using cuSOLVER's cusolverSpScsrlsvchol function, probably due to a misunderstanding of the cuSOLVER library.
Motivation: we are solving the Poisson equation -div grad x = b on a rectangular grid. In two dimensions with a 5-point stencil (1, 1, -4, 1, 1), the Laplacian on the grid provides a (quite sparse) matrix A. Moreover, the charge distribution on the grid gives a (dense) vector b. A is positive definite and symmetric.
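For reference, the discretisation we have in mind (a sketch, written for unit grid spacing h = 1, which matches the raw stencil values in the code below) is, at an interior node (i, j):

4*x[i,j] - x[i-1,j] - x[i+1,j] - x[i,j-1] - x[i,j+1] = h^2 * b[i,j]

Boundary nodes are handled as identity rows (Dirichlet boundary condition), as in the assembly code below.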
Now we solve A * x = b for x using NVIDIA's new cuSOLVER library that comes with CUDA 7.0. It provides a function cusolverSpScsrlsvchol, which should do the sparse Cholesky factorisation for floats.
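For reference, this is the prototype we are calling (as we understand it from the CUDA 7.0 cuSOLVER documentation; shown here only so the argument list in the code below is easier to follow):

cusolverStatus_t cusolverSpScsrlsvchol(cusolverSpHandle_t handle, int m, int nnz,
                                       const cusparseMatDescr_t descrA,
                                       const float *csrVal, const int *csrRowPtr,
                                       const int *csrColInd, const float *b,
                                       float tol, int reorder,
                                       float *x, int *singularity);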
Note: we are able to solve the system correctly with the alternative sparse QR factorisation function cusolverSpScsrlsvqr. For a 4 x 4 grid with all b entries on the edge being 1 and the rest 0, we get for x:

1 1 0.999999 1 1 1 0.999999 1 1 1 1 1 1 1 1 1
Our problems:

1. cusolverSpScsrlsvchol returns wrong results for x:

1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1

2. (solved, see answer below) Converting the CSR matrix A to a dense matrix and printing the output gives weird numbers (10^-44 and the like). The corresponding data in CSR format are correct and were validated with Python/NumPy.

3. (solved, see answer below) The alternative sparse LU factorisation with partial pivoting, cusolverSpScsrlsvlu, cannot even be found:

$ nvcc -std=c++11 cusparse_test3.cu -o cusparse_test3 -lcusparse -lcusolver
cusparse_test3.cu(208): error: identifier "cusolverSpScsrlsvlu" is undefined

What are we doing wrong? Thanks for your help!
Our C++ CUDA code:
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <vector>
#include <cassert>
// create poisson matrix with Dirichlet bc. of a rectangular grid with
// dimension NxN
void assemble_poisson_matrix_coo(std::vector<float>& vals, std::vector<int>& row, std::vector<int>& col,
std::vector<float>& rhs, int Nrows, int Ncols) {
//nnz: 5 entries per row (node) for nodes in the interior
// 1 entry per row (node) for nodes on the boundary, since we set them explicitly to 1.
int nnz = 5*Nrows*Ncols - (2*(Ncols-1) + 2*(Nrows-1))*4;
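// e.g. for a 4x4 grid: 5*16 - (2*3 + 2*3)*4 = 80 - 48 = 32 nonzeros
// (the 12 boundary nodes contribute 1 entry each, the 4 interior nodes 5 each)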
vals.resize(nnz);
row.resize(nnz);
col.resize(nnz);
rhs.resize(Nrows*Ncols);
int counter = 0;
for(int i = 0; i < Nrows; ++i) {
for (int j = 0; j < Ncols; ++j) {
int idx = j + Ncols*i;
if (i == 0 || j == 0 || j == Ncols-1 || i == Nrows-1) {
vals[counter] = 1.;
row[counter] = idx;
col[counter] = idx;
counter++;
rhs[idx] = 1.;
// if (i == 0) {
// rhs[idx] = 3.;
// }
} else { // -laplace stencil
// above
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx-Ncols;
counter++;
// left
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx-1;
counter++;
// center
vals[counter] = 4.;
row[counter] = idx;
col[counter] = idx;
counter++;
// right
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx+1;
counter++;
// below
vals[counter] = -1.;
row[counter] = idx;
col[counter] = idx+Ncols;
counter++;
rhs[idx] = 0;
}
}
}
assert(counter == nnz);
}
int main() {
// --- create library handles:
cusolverSpHandle_t cusolver_handle;
cusolverStatus_t cusolver_status;
cusolver_status = cusolverSpCreate(&cusolver_handle);
std::cout << "status create cusolver handle: " << cusolver_status << std::endl;
cusparseHandle_t cusparse_handle;
cusparseStatus_t cusparse_status;
cusparse_status = cusparseCreate(&cusparse_handle);
std::cout << "status create cusparse handle: " << cusparse_status << std::endl;
// --- prepare matrix:
int Nrows = 4;
int Ncols = 4;
std::vector<float> csrVal;
std::vector<int> cooRow;
std::vector<int> csrColInd;
std::vector<float> b;
assemble_poisson_matrix_coo(csrVal, cooRow, csrColInd, b, Nrows, Ncols);
int nnz = csrVal.size();
int m = Nrows * Ncols;
std::vector<int> csrRowPtr(m+1);
// --- prepare solving and copy to GPU:
std::vector<float> x(m);
float tol = 1e-5;
int reorder = 0;
int singularity = 0;
float *db, *dcsrVal, *dx;
int *dcsrColInd, *dcsrRowPtr, *dcooRow;
cudaMalloc((void**)&db, m*sizeof(float));
cudaMalloc((void**)&dx, m*sizeof(float));
cudaMalloc((void**)&dcsrVal, nnz*sizeof(float));
cudaMalloc((void**)&dcsrColInd, nnz*sizeof(int));
cudaMalloc((void**)&dcsrRowPtr, (m+1)*sizeof(int));
cudaMalloc((void**)&dcooRow, nnz*sizeof(int));
cudaMemcpy(db, b.data(), b.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dcsrVal, csrVal.data(), csrVal.size()*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dcsrColInd, csrColInd.data(), csrColInd.size()*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dcooRow, cooRow.data(), cooRow.size()*sizeof(int), cudaMemcpyHostToDevice);
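// cusparseXcoo2csr compresses the (row-sorted) COO row indices into CSR row
// pointers of length m+1; indices are zero-based here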
cusparse_status = cusparseXcoo2csr(cusparse_handle, dcooRow, nnz, m,
dcsrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
std::cout << "status cusparse coo2csr conversion: " << cusparse_status << std::endl;
cudaDeviceSynchronize(); // matrix format conversion has to be finished!
// --- everything ready for computation:
cusparseMatDescr_t descrA;
cusparse_status = cusparseCreateMatDescr(&descrA);
std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;
// optional: print dense matrix that has been allocated on GPU
std::vector<float> A(m*m, 0);
float *dA;
cudaMalloc((void**)&dA, A.size()*sizeof(float));
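// note: cusparseScsr2dense writes the dense matrix in column-major order with
// leading dimension m, so the row-major print loop below effectively shows the
// transpose of A; for a truly symmetric matrix the printout would be the same.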
cusparseScsr2dense(cusparse_handle, m, m, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, dA, m);
cudaMemcpy(A.data(), dA, A.size()*sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "A: \n";
for (int i = 0; i < m; ++i) {
for (int j = 0; j < m; ++j) {
std::cout << A[i*m + j] << " ";
}
std::cout << std::endl;
}
cudaFree(dA);
std::cout << "b: \n";
cudaMemcpy(b.data(), db, m*sizeof(float), cudaMemcpyDeviceToHost); // b holds floats
for (auto a : b) {
std::cout << a << ",";
}
std::cout << std::endl;
// --- solving!!!!
// cusolver_status = cusolverSpScsrlsvchol(cusolver_handle, m, nnz, descrA, dcsrVal,
// dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
// &singularity);
cusolver_status = cusolverSpScsrlsvqr(cusolver_handle, m, nnz, descrA, dcsrVal,
dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
&singularity);
cudaDeviceSynchronize();
std::cout << "singularity (should be -1): " << singularity << std::endl;
std::cout << "status cusolver solving (!): " << cusolver_status << std::endl;
cudaMemcpy(x.data(), dx, m*sizeof(float), cudaMemcpyDeviceToHost);
// relocated these 2 lines from above to solve (2):
cusparse_status = cusparseDestroy(cusparse_handle);
std::cout << "status destroy cusparse handle: " << cusparse_status << std::endl;
cusolver_status = cusolverSpDestroy(cusolver_handle);
std::cout << "status destroy cusolver handle: " << cusolver_status << std::endl;
for (auto a : x) {
std::cout << a << " ";
}
std::cout << std::endl;
cudaFree(db);
cudaFree(dx);
cudaFree(dcsrVal);
cudaFree(dcsrColInd);
cudaFree(dcsrRowPtr);
cudaFree(dcooRow);
return 0;
}
1. cusolverSpScsrlsvchol returns wrong results for x: 1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1
You said:

A is positive definite and symmetric.

No, it is not. It is not symmetric.

cusolverSpScsrlsvqr() has no requirement that the A matrix be symmetric. cusolverSpScsrlsvchol() does have that requirement; the documentation states:

A is an m×m symmetric positive definite sparse matrix
This is the printout your code provides for the A matrix:
A:
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 -1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 4 -1 0 0 -1 0 0 0 0 0 0
0 0 0 0 0 -1 4 0 0 0 -1 0 0 0 0 0
0 0 0 0 0 0 -1 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 1 -1 0 0 0 0 0 0
0 0 0 0 0 -1 0 0 0 4 -1 0 0 0 0 0
0 0 0 0 0 0 -1 0 0 -1 4 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 -1 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 -1 0 0 0 1 0 0
0 0 0 0 0 0 0 0 0 0 -1 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
If that were symmetric, I would expect the second row:
0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0
to match the second column:
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
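A quick way to confirm this from the COO triplets themselves is a host-side symmetry check (a minimal sketch, not part of the code above; it assumes the vals/row/col vectors produced by assemble_poisson_matrix_coo):

// Host-side symmetry check on COO triplets: a symmetric matrix must contain,
// for every entry (r, c, v), a matching entry (c, r, v).
#include <cmath>
#include <cstddef>
#include <map>
#include <utility>
#include <vector>

bool is_symmetric_coo(const std::vector<float>& vals,
                      const std::vector<int>& row,
                      const std::vector<int>& col) {
    std::map<std::pair<int, int>, float> entries;
    for (std::size_t k = 0; k < vals.size(); ++k)
        entries[{row[k], col[k]}] = vals[k];
    for (const auto& e : entries) {
        auto it = entries.find({e.first.second, e.first.first});
        if (it == entries.end() || std::fabs(it->second - e.second) > 1e-6f)
            return false; // transposed entry missing or mismatched
    }
    return true;
}

Running this on the matrix assembled above returns false: the triplets contain, for example, the entry (row 5, col 1) = -1 (the coupling of interior node 5 to the boundary node above it), but no matching (row 1, col 5) entry, because boundary rows are pure identity rows. That asymmetry is why cusolverSpScsrlsvchol is unsuitable here while cusolverSpScsrlsvqr still works.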
By the way, a suggestion about Stack Overflow: if you answer your own question, my suggestion is to make it a complete answer. Some people might see an answered question and skip it. It is probably better to edit such content into your question, thus focusing your question down to a single question. SO also doesn't work as well, in my opinion, when you ask multiple questions per question; that sort of behavior makes the question unnecessarily more difficult to answer, and I don't think it is serving you well here.