Problem with MPI matrix-matrix multiply: Cluster slower than single computer

I wrote a small program that uses MPI to parallelize matrix-matrix multiplication. The problem: when running the program on my computer, it takes about 10 seconds to complete, but about 75 seconds on a cluster. I think I have some synchronization problem, but I cannot figure it out (yet).

Here's my source code:

/*matrix.c
mpicc -o out matrix.c
mpirun -np 11 out
*/

#include <mpi.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define N 1000

#define DATA_TAG 10
#define B_SENT_TAG 20
#define FINISH_TAG 30

int master(int);
int worker(int, int);

int main(int argc, char **argv) {
    int myrank, p;
    double s_time, f_time;

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    if (myrank == 0) {
        s_time = MPI_Wtime();
        master(p);
        f_time = MPI_Wtime();
        printf("Complete in %1.2f seconds\n", f_time - s_time);
        fflush(stdout);
    }
    else {
        worker(myrank, p);
    }
    MPI_Finalize();
    return 0;
}

int *read_matrix_row();
int *read_matrix_col();
int send_row(int *, int);
int recv_row(int *, int, MPI_Status *);
int send_tag(int, int);
int write_matrix(int *);
int wait();
int calc(int *, int *, int *);
int exchange(int *, int, int);

int master(int p) {
    MPI_Status status;
    int *a; int *b;
    int *c = (int *)malloc(N * sizeof(int));
    int i, j; int num_of_finish_row = 0;

    while (1) {
        for (i = 1; i < p; i++) {
            a = read_matrix_row();
            b = read_matrix_col();
            send_row(a, i);
            send_row(b, i);
            //printf("Master - Send data to worker %d\n", i);fflush(stdout);
        }
        wait();
        for (i = 1; i < N / (p - 1); i++) {
            for (j = 1; j < p; j++) {
                //printf("Master - Send next row to worker[%d]\n", j);fflush(stdout);
                b = read_matrix_col();
                send_row(b, j);
            }
        }
        for (i = 1; i < p; i++) {
            //printf("Master - Announce all row of B sent to worker[%d]\n", i);fflush(stdout);
            send_tag(i, B_SENT_TAG);
        }
        //MPI_Barrier(MPI_COMM_WORLD);
        for (i = 1; i < p; i++) {
            recv_row(c, MPI_ANY_SOURCE, &status);
            //printf("Master - Receive result\n");fflush(stdout);
            num_of_finish_row++;
        }
        //printf("Master - Finish %d rows\n", num_of_finish_row);fflush(stdout);
        if (num_of_finish_row >= N)
            break;
    }
    //printf("Master - Finish multiply two matrix\n");fflush(stdout);
    for (i = 1; i < p; i++) {
        send_tag(i, FINISH_TAG);
    }
    //write_matrix(c);
    return 0;
}

int worker(int myrank, int p) {
    int *a = (int *)malloc(N * sizeof(int));
    int *b = (int *)malloc(N * sizeof(int));
    int *c = (int *)malloc(N * sizeof(int));
    int i;
    for (i = 0; i < N; i++) {
        c[i] = 0;
    }
    MPI_Status status;
    int next = (myrank == (p - 1)) ? 1 : myrank + 1;
    int prev = (myrank == 1) ? p - 1 : myrank - 1;
    while (1) {
        recv_row(a, 0, &status);
        if (status.MPI_TAG == FINISH_TAG)
            break;
        recv_row(b, 0, &status);
        wait();
        //printf("Worker[%d] - Receive data from master\n", myrank);fflush(stdout);
        while (1) {
            for (i = 1; i < p; i++) {
                //printf("Worker[%d] - Start calculation\n", myrank);fflush(stdout);
                calc(c, a, b);
                //printf("Worker[%d] - Exchange data with %d, %d\n", myrank, next, prev);fflush(stdout);
                exchange(b, next, prev);
            }
            //printf("Worker %d- Request for more B's row\n", myrank);fflush(stdout);
            recv_row(b, 0, &status);
            //printf("Worker %d - Receive tag %d\n", myrank, status.MPI_TAG);fflush(stdout);
            if (status.MPI_TAG == B_SENT_TAG) {
                break;
                //printf("Worker[%d] - Finish calc one row\n", myrank);fflush(stdout);
            }
        }
        //wait();
        //printf("Worker %d - Send result\n", myrank);fflush(stdout);
        send_row(c, 0);
        for (i = 0; i < N; i++) {
            c[i] = 0;
        }
    }
    return 0;
}

int *read_matrix_row() {
    int *row = (int *)malloc(N * sizeof(int));
    int i;
    for (i = 0; i < N; i++) {
        row[i] = 1;
    }
    return row;
}
int *read_matrix_col() {
    int *col = (int *)malloc(N * sizeof(int));
    int i;
    for (i = 0; i < N; i++) {
        col[i] = 1;
    }
    return col;
}

int send_row(int *row, int dest) {
    MPI_Send(row, N, MPI_INT, dest, DATA_TAG, MPI_COMM_WORLD);
    return 0;
}

int recv_row(int *row, int src, MPI_Status *status) {
    MPI_Recv(row, N, MPI_INT, src, MPI_ANY_TAG, MPI_COMM_WORLD, status);
    return 0;
}

int wait() {
    MPI_Barrier(MPI_COMM_WORLD);
    return 0;
}
int calc(int *c_row, int *a_row, int *b_row) {
    int i;
    for (i = 0; i < N; i++) {
        c_row[i] = c_row[i] + a_row[i] * b_row[i];
        //printf("%d ", c_row[i]);
    }
    //printf("\n");fflush(stdout);
    return 0;
}

int exchange(int *row, int next, int prev) {
    MPI_Request request; MPI_Status status;
    MPI_Isend(row, N, MPI_INT, next, DATA_TAG, MPI_COMM_WORLD, &request);
    MPI_Irecv(row, N, MPI_INT, prev, MPI_ANY_TAG, MPI_COMM_WORLD, &request);
    MPI_Wait(&request, &status);
    return 0;
}

int send_tag(int worker, int tag) {
    MPI_Send(0, 0, MPI_INT, worker, tag, MPI_COMM_WORLD);
    return 0;
}

int write_matrix(int *matrix) {
    int i;
    for (i = 0; i < N; i++) {
        printf("%d ", matrix[i]);
    }
    printf("\n");
    fflush(stdout);
    return 0;
}
— genienin, asked Apr 18 '11


3 Answers

Well, first, you have a fairly small matrix (N = 1000), so on a cluster the per-message communication overhead dominates the actual computation; and second, you distribute the work on a row/column basis rather than in blocks.

For a more realistic version using better algorithms, you might want to acquire an optimized BLAS library (e.g. GotoBLAS, which is free), test single-thread performance with that, then get PBLAS, link it against your optimized BLAS, and compare the MPI parallel performance of the PBLAS version.
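
As a concrete single-thread baseline, something like this (a minimal sketch, assuming a CBLAS interface such as the one shipped with GotoBLAS or OpenBLAS; the file name and link flag are illustrative and depend on your installation):

    /* baseline.c - single-thread BLAS baseline (a sketch)
       gcc -o baseline baseline.c -lopenblas   (link flag depends on the install) */
    #include <stdio.h>
    #include <stdlib.h>
    #include <cblas.h>

    #define N 1000

    int main(void) {
        double *a = malloc(N * N * sizeof(double));
        double *b = malloc(N * N * sizeof(double));
        double *c = malloc(N * N * sizeof(double));
        int i;
        for (i = 0; i < N * N; i++) { a[i] = 1.0; b[i] = 1.0; c[i] = 0.0; }

        /* C = 1.0 * A * B + 0.0 * C, row-major, no transposes */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    N, N, N, 1.0, a, N, b, N, 0.0, c, N);

        printf("c[0] = %f\n", c[0]); /* every entry should equal N = 1000 */
        free(a); free(b); free(c);
        return 0;
    }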

— janneb


I see some errors in your program:

  1. First, why are you calling the wait function at all? Its implementation is simply a call to MPI_Barrier, which is a synchronization primitive that blocks every process in the communicator until all of them have reached the barrier. My question is: do you really want the master synchronized with the workers? Here that is not optimal, because a worker doesn't need to wait for the master before beginning its calculation (see the sketch after these two points).

  2. Second, there are two unnecessary for loops:

    for (i = 1; i < N / (p - 1); i++) {
        for (j = 1; j < p; j++) {
            b = read_matrix_col();
            send_row(b, j);
        }
    }
    
    for (i = 1; i < p; i++) {
        send_tag(i, B_SENT_TAG);
    }
    

In the first i-loop, the loop variable i is never used in the body. Since the j-loop and the second i-loop iterate over the same workers, you could merge them:

for (i = 1; i < p; i++) {
    b = read_matrix_col();
    send_row(b, i);
    send_tag(i, B_SENT_TAG);
}
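
To illustrate point 1 (a minimal sketch, separate from your program; the file name is mine): a blocking receive already orders a worker after the master's send, so an extra barrier adds nothing but waiting. Note that MPI_Barrier is collective, so if you remove it you must remove it from both the master's and the workers' code paths, otherwise the ranks that still call it will block forever.

    /* barrier_demo.c - why the extra barrier is redundant (a sketch)
       mpicc -o barrier_demo barrier_demo.c && mpirun -np 2 barrier_demo */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv) {
        int rank, x = 42;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        if (rank == 0) {
            MPI_Send(&x, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
            /* no barrier needed: the send/recv pair already orders the ranks */
        } else if (rank == 1) {
            MPI_Recv(&x, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("rank 1 has its data and can start computing immediately\n");
        }
        MPI_Finalize();
        return 0;
    }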

In terms of data transfer, your program is not optimized: you send an array of 1000 integers in every single message. There are better ways to optimize the data transfer, but I will let you look into that. Make the corrections above and tell us what your new performance is.
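
One possibility, just as an illustration (CHUNK is a parameter I am inventing here, not part of your program): batch several columns into one message, so the per-message latency is paid once per CHUNK columns instead of once per column. Drop-in style helpers alongside your send_row/recv_row could look like:

    /* A sketch: one message of CHUNK*N ints instead of CHUNK messages of N ints.
       Relies on N and DATA_TAG as defined in matrix.c above. */
    #define CHUNK 50

    int send_cols(int *cols, int dest) {
        MPI_Send(cols, CHUNK * N, MPI_INT, dest, DATA_TAG, MPI_COMM_WORLD);
        return 0;
    }

    int recv_cols(int *cols, int src, MPI_Status *status) {
        MPI_Recv(cols, CHUNK * N, MPI_INT, src, MPI_ANY_TAG, MPI_COMM_WORLD, status);
        return 0;
    }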

And as @janneb said, you can use a BLAS library for better matrix-multiplication performance. Good luck!

— Dimitri


I did not look over your code, but I can provide some hints about why your result may not be unexpected:

  1. As already mentioned, N = 1000 may be too small. Run more tests to see how your program scales (try N = 100, 500, 1000, 5000, 10000, etc.) and compare the results on your system and on the cluster.

  2. Compare results between your system (a single processor, I presume) and a single processor of the cluster. A single processor in a production environment such as a server or cluster is usually less powerful than the best processors designed for desktop use; it trades raw speed for the stability, reliability, and other features needed in an environment running at full capacity 24 hours a day.

  3. If your processor has multiple cores, more than one MPI process can run on it at the same time, and synchronization between those processes is negligible compared to synchronization between the nodes of a cluster.

  4. Are the cluster's nodes statically assigned to you? Other users' programs may be scheduled on the same nodes while yours is running.

  5. Read documentation about the cluster's architecture. Some architectures may be more suitable for particular classes of problems.

  6. Assess the latency of the cluster's network. Pinging from each node to the others many times and averaging the results gives a rough estimate; measuring from within MPI itself is more relevant (see the sketch after this list).

  7. Last but perhaps most important: your algorithm may not be optimal. Read a book or two on matrix multiplication (I can recommend "Matrix Computations" by Golub and Van Loan).
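
For point 6, a small MPI ping-pong benchmark measures the latency your program actually sees (a minimal sketch; run it with one rank per node):

    /* pingpong.c - rough MPI latency estimate between ranks 0 and 1 (a sketch)
       mpicc -o pingpong pingpong.c && mpirun -np 2 pingpong */
    #include <mpi.h>
    #include <stdio.h>

    #define REPS 1000

    int main(int argc, char **argv) {
        int rank, i, x = 0;
        double t0, t1;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        for (i = 0; i < REPS; i++) {
            if (rank == 0) {
                MPI_Send(&x, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
                MPI_Recv(&x, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            } else if (rank == 1) {
                MPI_Recv(&x, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                MPI_Send(&x, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
            }
        }
        t1 = MPI_Wtime();
        if (rank == 0) /* round trip / 2 = one-way latency estimate */
            printf("approx one-way latency: %g us\n",
                   (t1 - t0) / REPS / 2 * 1e6);
        MPI_Finalize();
        return 0;
    }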

— gheorghe1800