While trying to compute eigenvalues and eigenvectors of several matrices in parallel, I found that LAPACK's dsyevr function does not seem to be thread-safe.
Here is a minimal code example in C which demonstrates the problem:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <omp.h>
#include "lapacke.h"
#define M 8 /* number of matrices to be diagonalized */
#define N 1000 /* size of each matrix (real, symmetric) */
typedef double vec_t[N]; /* type for length N vector */
typedef double mtx_t[N][N]; /* type for N x N matrices */
void
init(int m, int n, mtx_t *A){
    /* init m symmetric n x n matrices */
    srand(0);
    for (int i = 0; i < m; ++i){
        for (int j = 0; j < n; ++j){
            for (int k = 0; k <= j; ++k){
                A[i][j][k] = A[i][k][j] = (rand() % 100 - 50) / 100.;
            }
        }
    }
}
void
solve(int n, double *A, double *E, double *Q){
    /* diagonalize one matrix:
     * 'V' = compute eigenvalues and eigenvectors,
     * 'A' = all eigenvalues, 'L' = use the lower triangle */
    double tol = 0.;
    int *isuppz = malloc(2*n*sizeof(int)); assert(isuppz);
    int k; /* number of eigenvalues found */
    int info = LAPACKE_dsyevr(LAPACK_COL_MAJOR, 'V', 'A', 'L',
                              n, A, n, 0., 0., 0, 0, tol, &k, E, Q, n, isuppz);
    assert(!info);
    free(isuppz);
}
void
s_solve(int m, int n, mtx_t *A, vec_t *E, mtx_t *Q){
    /* serial solve */
    for (int i = 0; i < m; ++i){
        solve(n, (double *)A[i], (double *)E[i], (double *)Q[i]);
    }
}
void
p_solve(int m, int n, mtx_t *A, vec_t *E, mtx_t *Q, int nt){
    /* parallel solve */
    int i;
    #pragma omp parallel for schedule(static) num_threads(nt) \
        private(i) \
        shared(m, n, A, E, Q)
    for (i = 0; i < m; ++i){
        solve(n, (double *)A[i], (double *)E[i], (double *)Q[i]);
    }
}
void
analyze_results(int m, int n, vec_t *E0, vec_t *E1, mtx_t *Q0, mtx_t *Q1){
    /* compare eigenvalues */
    printf("\nmax. abs. diff. of eigenvalues:\n");
    for (int i = 0; i < m; ++i){
        double t, dE = 0.;
        for (int j = 0; j < n; ++j){
            t = fabs(E0[i][j] - E1[i][j]);
            if (t > dE) dE = t;
        }
        printf("%i: %5.1e\n", i, dE);
    }
    /* compare eigenvectors (ignoring sign) */
    printf("\nmax. abs. diff. of eigenvectors (ignoring sign):\n");
    for (int i = 0; i < m; ++i){
        double t, dQ = 0.;
        for (int j = 0; j < n; ++j){
            for (int k = 0; k < n; ++k){
                t = fabs(fabs(Q0[i][j][k]) - fabs(Q1[i][j][k]));
                if (t > dQ) dQ = t;
            }
        }
        printf("%i: %5.1e\n", i, dQ);
    }
}
int main(void){
    mtx_t *A = malloc(M*N*N*sizeof(double)); assert(A);
    init(M, N, A);

    /* allocate space for matrices, eigenvalues and eigenvectors */
    mtx_t *s_A = malloc(M*N*N*sizeof(double)); assert(s_A);
    vec_t *s_E = malloc(M*N*sizeof(double)); assert(s_E);
    mtx_t *s_Q = malloc(M*N*N*sizeof(double)); assert(s_Q);

    /* copy the initial matrices */
    memcpy(s_A, A, M*N*N*sizeof(double));

    /* solve serially */
    s_solve(M, N, s_A, s_E, s_Q);

    /* allocate space for matrices, eigenvalues and eigenvectors */
    mtx_t *p_A = malloc(M*N*N*sizeof(double)); assert(p_A);
    vec_t *p_E = malloc(M*N*sizeof(double)); assert(p_E);
    mtx_t *p_Q = malloc(M*N*N*sizeof(double)); assert(p_Q);

    /* copy the initial matrices */
    memcpy(p_A, A, M*N*N*sizeof(double));

    /* use one thread, to check that the algorithm is deterministic */
    p_solve(M, N, p_A, p_E, p_Q, 1);
    analyze_results(M, N, s_E, p_E, s_Q, p_Q);

    /* copy the initial matrices */
    memcpy(p_A, A, M*N*N*sizeof(double));

    /* use several threads, and see what happens */
    p_solve(M, N, p_A, p_E, p_Q, 4);
    analyze_results(M, N, s_E, p_E, s_Q, p_Q);

    free(A);
    free(s_A);
    free(s_E);
    free(s_Q);
    free(p_A);
    free(p_E);
    free(p_Q);
    return 0;
}
and this is what you get (note the difference in the last output block, which tells you that the eigenvectors are wrong, although the eigenvalues are OK):
max. abs. diff. of eigenvalues:
0: 0.0e+00
1: 0.0e+00
2: 0.0e+00
3: 0.0e+00
4: 0.0e+00
5: 0.0e+00
6: 0.0e+00
7: 0.0e+00
max. abs. diff. of eigenvectors (ignoring sign):
0: 0.0e+00
1: 0.0e+00
2: 0.0e+00
3: 0.0e+00
4: 0.0e+00
5: 0.0e+00
6: 0.0e+00
7: 0.0e+00
max. abs. diff. of eigenvalues:
0: 0.0e+00
1: 0.0e+00
2: 0.0e+00
3: 0.0e+00
4: 0.0e+00
5: 0.0e+00
6: 0.0e+00
7: 0.0e+00
max. abs. diff. of eigenvectors (ignoring sign):
0: 0.0e+00
1: 1.2e-01
2: 1.6e-01
3: 1.4e-01
4: 2.3e-01
5: 1.8e-01
6: 2.6e-01
7: 2.6e-01
The code was compiled with gcc 4.4.5 and linked against OpenBLAS (which contains LAPACK; libopenblas_sandybridge-r0.2.8.so), but it was also tested with another LAPACK version. Calling LAPACK directly from C (without LAPACKE) was also tested, with the same results. Substituting dsyevr with dsyevd (and adjusting the arguments accordingly) also had no effect.
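For reference, here is a rough sketch of what the direct Fortran call looks like. This is an illustration, not the exact test code: it assumes gfortran-style underscore name mangling, passes all arguments by pointer, omits the hidden string-length arguments (which works on the common x86_64 ABIs for single-character strings), and does the usual workspace query with lwork = liwork = -1.

#include <stdlib.h>
#include <assert.h>

/* Fortran prototype of dsyevr (sketch, assuming underscore mangling) */
extern void dsyevr_(const char *jobz, const char *range, const char *uplo,
                    const int *n, double *a, const int *lda,
                    const double *vl, const double *vu,
                    const int *il, const int *iu, const double *abstol,
                    int *m, double *w, double *z, const int *ldz, int *isuppz,
                    double *work, const int *lwork,
                    int *iwork, const int *liwork, int *info);

void
solve_fortran(int n, double *A, double *E, double *Q){
    double vl = 0., vu = 0., abstol = 0., wkopt;
    int il = 0, iu = 0, m, info, iwkopt, lwork = -1, liwork = -1;
    int *isuppz = malloc(2*n*sizeof(int)); assert(isuppz);
    /* workspace query: optimal sizes come back in wkopt/iwkopt */
    dsyevr_("V", "A", "L", &n, A, &n, &vl, &vu, &il, &iu, &abstol,
            &m, E, Q, &n, isuppz, &wkopt, &lwork, &iwkopt, &liwork, &info);
    lwork = (int)wkopt; liwork = iwkopt;
    double *work = malloc(lwork*sizeof(double)); assert(work);
    int *iwork = malloc(liwork*sizeof(int)); assert(iwork);
    /* actual solve */
    dsyevr_("V", "A", "L", &n, A, &n, &vl, &vu, &il, &iu, &abstol,
            &m, E, Q, &n, isuppz, work, &lwork, iwork, &liwork, &info);
    assert(!info);
    free(work); free(iwork); free(isuppz);
}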
Finally, here is the compilation command I used:
gcc -std=c99 -fopenmp -L/path/to/openblas/lib -Wl,-R/path/to/openblas/lib/ \
-lopenblas -lgomp -I/path/to/openblas/include main.c -o main
Unfortunately, Google did not answer my question, so any hint is welcome!
EDIT: To make sure that everything is OK with the BLAS and LAPACK versions, I took the reference LAPACK (including BLAS and LAPACKE) from http://www.netlib.org/lapack/ (version 3.4.2). Compiling the example code was a bit tricky, but it finally worked with separate compilation and linking:
gcc -c -std=c99 -fopenmp -I../lapack-3.4.2/lapacke/include \
netlib_dsyevr.c -o netlib_main.o
gfortran netlib_main.o ../lapack-3.4.2/liblapacke.a \
../lapack-3.4.2/liblapack.a ../lapack-3.4.2/librefblas.a \
-lgomp -o netlib_main
The netlib LAPACK/BLAS and the example program were built on both a Darwin 12.4.0 x86_64 and a Linux 3.2.0-0.bpo.4-amd64 x86_64 platform. The misbehavior of the program is consistent on both.
I finally received the explanation from the LAPACK team, which I would like to quote (with permission):
I think the problem you are seeing may be caused by how the FORTRAN version of the LAPACK library you are using was compiled. Using gfortran 4.8.0 (on Mac OS 10.8), I could reproduce the problem you saw if I compile LAPACK with the -O3 option for gfortran. If I recompile the LAPACK and reference BLAS library with -fopenmp -O3, the problem goes away. There is a note in the gfortran manual stating "-fopenmp implies -frecursive, i.e., all local arrays will be allocated on the stack," so there may be local arrays used in some auxiliary routines called by dsyevr for which the default setting of the compiler is causing them to be allocated in a non-thread safe manner. In any case, allocating these on the stack, which -fopenmp seems to do, will address this issue.
I can confirm that this solves the problem for netlib BLAS/LAPACK. One should keep in mind that the stack size is limited and may have to be adjusted if the matrices get big and/or numerous.
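For the record, with the netlib distribution this amounts to adjusting the compiler flags in make.inc roughly as follows (a sketch; the variable names are those of the LAPACK 3.4.2 make.inc, adapt to your setup):

# make.inc (lapack-3.4.2) -- sketch, per the explanation quoted above
FORTRAN = gfortran -fopenmp
OPTS    = -O3 -fopenmp
NOOPT   = -O0 -fopenmp
LOADER  = gfortran -fopenmp

If the stack does become the limit, it can be raised for the main thread in the shell and for the OpenMP worker threads via the standard OMP_STACKSIZE environment variable (its default is implementation-defined), e.g.:

ulimit -s unlimited        # main thread (bash; may be capped by the system)
export OMP_STACKSIZE=512M  # stack of each OpenMP worker thread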
OpenBLAS must be compiled with USE_OPENMP=1 and USE_THREAD=1 to get a single-threaded and thread-safe library.
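For example (a sketch; see the OpenBLAS Makefile for the full set of build options):

make USE_OPENMP=1 USE_THREAD=1
make PREFIX=/path/to/openblas install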
With these compiler and make flags, the sample program runs correctly with all libraries. It remains an open question how to make sure that the user to whom one eventually hands one's code links against a correctly compiled BLAS/LAPACK library. If the user got a segmentation fault, one could add a note to a README file; but since the error is more subtle, it is not even guaranteed that the user will recognize it (users don't read README files by default ;-) ). Shipping a BLAS/LAPACK with one's code is not a good idea either, since the whole point of BLAS/LAPACK is that everyone has a version specifically optimized for their machine. Ideas are welcome...
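One possibility, in the spirit of the test program above, is a cheap startup self-test: diagonalize copies of one small random symmetric matrix once serially and once from several OpenMP threads, and warn (or refuse to run) if the eigenvectors disagree. A rough sketch follows; the sizes TN and TM and the tolerance are hypothetical, and TN may need to be large enough that dsyevr takes the code paths that trigger the problem:

#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include "lapacke.h"

#define TN 128 /* hypothetical test matrix size */
#define TM 8   /* hypothetical number of parallel copies */

static void
test_solve(double *A, double *E, double *Q){
    int k, isuppz[2*TN];
    int info = LAPACKE_dsyevr(LAPACK_COL_MAJOR, 'V', 'A', 'L', TN, A, TN,
                              0., 0., 0, 0, 0., &k, E, Q, TN, isuppz);
    assert(!info);
}

/* returns 1 if the parallel eigenvectors match the serial reference
 * (up to sign), 0 if the linked LAPACK shows the problem */
int
lapack_selftest(void){
    static double A0[TN*TN], T[TN*TN], E0[TN], Q0[TN*TN];
    static double E[TM][TN], Q[TM][TN*TN], W[TM][TN*TN];
    srand(1);
    for (int j = 0; j < TN; ++j)
        for (int k = 0; k <= j; ++k)
            A0[j*TN+k] = A0[k*TN+j] = (rand() % 100 - 50) / 100.;
    memcpy(T, A0, sizeof(T));
    test_solve(T, E0, Q0); /* serial reference */
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < TM; ++i){
        memcpy(W[i], A0, sizeof(A0));
        test_solve(W[i], E[i], Q[i]);
    }
    for (int i = 0; i < TM; ++i)
        for (int j = 0; j < TN*TN; ++j)
            if (fabs(fabs(Q[i][j]) - fabs(Q0[j])) > 1e-8) return 0;
    return 1;
}

Of course this proves nothing in general; it only catches a miscompiled library if the test happens to trigger the race.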
Regarding another library: GSL is thread-safe, but that means you have to create a workspace for each thread and make sure each thread uses its own workspace, e.g., by indexing pointers by thread number.
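A minimal sketch of that pattern with GSL's symmetric eigensolver gsl_eigen_symmv (the GSL calls are the real API; the wrapper gsl_p_solve and its argument layout are made up for illustration):

#include <stdlib.h>
#include <omp.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_eigen.h>

/* diagonalize m matrices of size n x n in parallel with nt threads,
 * using one gsl_eigen_symmv workspace per thread (A[i] is destroyed) */
void
gsl_p_solve(int m, int n, gsl_matrix **A,
            gsl_vector **eval, gsl_matrix **evec, int nt){
    gsl_eigen_symmv_workspace **w = malloc(nt*sizeof(*w));
    for (int t = 0; t < nt; ++t)
        w[t] = gsl_eigen_symmv_alloc(n);
    #pragma omp parallel for schedule(static) num_threads(nt)
    for (int i = 0; i < m; ++i){
        /* each thread touches only its own workspace */
        gsl_eigen_symmv(A[i], eval[i], evec[i], w[omp_get_thread_num()]);
    }
    for (int t = 0; t < nt; ++t)
        gsl_eigen_symmv_free(w[t]);
    free(w);
}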