I wrote a program that performs matrix multiplication (without optimization).
for(i=0; i<a_r; i++)
for(j=0;j<b_c; j++)
for(k=0;k<a_c; k++)
c[i][j]=c[i][j]+a[i][k]*b[k][j];
Depending on how I allocate the memory, the time of computation is different.
It is important to notice three points:
I test this program with three different allocations:
Test 1: Global allocation
int a[2500][2500], b[2500][2500], c[2500][2500];
Main {matrix multiplication a*b=c}
The computing time is quite constant (around 41s).
Test 2: dynamic allocation (arrays)
Main{
int **a,**b,**c;
a=(int **) malloc(sizeof(int *)*2500);
for( i=0;i<a_c; i++)
a[i]=(int *) malloc(sizeof(int)*2500);
…
matrix multiplication a*b=c
}
When I execute the program several times in a row I obtain these running times: 260 seconds, 180, 110, 110… If I wait around 5 seconds and launch the program again, I obtain the same results.
Test 3: dynamic allocation (lines)
Main{
int *a, *b, *c;
a=(int *) malloc(sizeof(int)*2500*2500);
… (same for b and c)
matrix multiplication a*b=c
}
The computing time is quite constant (around 44s).
I think test 2 is less efficient because of the way the data are stored in memory, as explained in this article (the website is down today) or in this question. Some ways of traversing memory are more efficient than others, and as a result some ways of allocating memory give you a more efficient program.
But (in test 2) I cannot figure out why the program gets faster over successive runs. Does anyone have an idea that could explain this phenomenon? Thanks in advance.
P.S. I did these tests on an Intel Xeon E5-2625 with CentOS 6.3 and kernel Linux 3.11.1. EDIT : Frequency scaling is disabled on my computer. The frequency is constant.
Here is the code:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <float.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>
/****************************************************
* Random generate a random int between _Min and _Max
*/
int Random(int _iMin,int _iMax)
{
    /* Number of distinct values in the inclusive range [_iMin, _iMax]. */
    int span = _iMax - _iMin + 1;

    /* Map rand() into the range via modulo (slight modulo bias is
     * acceptable for this benchmark's test data). */
    return _iMin + rand() % span;
}
/****************************************************
* Return the struct of getrusage.
*/
struct rusage give_ru()
{
    /* Snapshot of resource usage (user/system time, fault counts, ...)
     * for the calling process only; children are deliberately excluded. */
    struct rusage usage;
    getrusage(RUSAGE_SELF, &usage);
    return usage;
}
/****************************************************
* Do a matrix multiplication a*b=c
* This program isn't optimized.
*/
/****************************************************
 * Allocate a rows x cols matrix of int as an array of row pointers.
 * name is used only in the error message. Exits the process on failure.
 */
static int **alloc_matrix(int rows, int cols, const char *name)
{
    int **m = (int **) malloc(sizeof(int *) * rows);
    if (m == NULL)
    {
        printf("Allocation of matrix %s failed \n", name);
        exit(EXIT_FAILURE); /* was exit(0): a failed run must not report success */
    }
    /* BUG FIX: the original row loops iterated over the COLUMN counts
     * (a_c, b_c), which corrupts the heap whenever rows != cols. */
    for (int i = 0; i < rows; i++)
    {
        m[i] = (int *) malloc(sizeof(int) * cols);
        if (m[i] == NULL)
        {
            printf("Allocation of matrix %s[%d] failed \n", name, i);
            exit(EXIT_FAILURE);
        }
    }
    return m;
}

/****************************************************
 * Fill a rows x cols matrix with random integers in [0, 100].
 */
static void fill_random(int **m, int rows, int cols)
{
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            m[i][j] = Random(0, 100);
}

/****************************************************
 * Free a matrix allocated by alloc_matrix (rows row pointers).
 */
static void free_matrix(int **m, int rows)
{
    for (int i = 0; i < rows; i++)
    {
        free(m[i]);
        m[i] = NULL;
    }
    free(m);
}

/****************************************************
 * Do a matrix multiplication a*b=c
 * This program isn't optimized.
 */
int main()
{
    /* Dimensions: a is a_r x a_c, b is b_r x b_c, c is a_r x b_c. */
    int a_r = 2500, a_c = 2500, b_r = 2500, b_c = 2500;
    clock_t start, end;         /* CPU time via clock() */
    struct rusage start1, end1; /* user/system time via getrusage() */
    double dif;                 /* seconds between two clock() samples */

    printf("\nenter Taille de la matrice : 2500");

    /* Multiplication is defined only when a's columns match b's rows. */
    if (a_c != b_r)
    {
        printf("\ncan not multiply");
        return 1; /* BUG FIX: original `goto again;` had no matching label */
    }

    /*
     * Memory allocation (timed together with initialization below).
     */
    start = clock();
    int **a = alloc_matrix(a_r, a_c, "a");
    int **b = alloc_matrix(b_r, b_c, "b");
    int **c = alloc_matrix(a_r, b_c, "c"); /* c has a_r rows, not b_c */

    /* Lock every current and future page in RAM so page faults do not
     * perturb the timing measurement. */
    printf("Starting mlockall.\n");
    if (mlockall(MCL_CURRENT | MCL_FUTURE) < 0)
    {
        perror("mlockall");
        return 1;
    }

    /*
     * Matrix initialization (operands random, product zeroed).
     */
    printf("Initializing matrices...\n");
    srand(time(NULL));
    fill_random(a, a_r, a_c);
    fill_random(b, b_r, b_c);
    /* BUG FIX: c was seeded with Random(0,100); the accumulation below then
     * computed garbage + a*b. The product matrix must start at zero. */
    for (int i = 0; i < a_r; i++)
        for (int j = 0; j < b_c; j++)
            c[i][j] = 0;

    end = clock(); // end the timer
    dif = ((double) (end - start)) / CLOCKS_PER_SEC;
    printf("Malloc and initialization took %f sec. time.\n", dif);

    /*
     * Matrix Multiplication: c = a * b (naive triple loop, unoptimized).
     */
    start = clock();     // start the timer
    start1 = give_ru();  // resource-usage snapshot before the loop
    for (int i = 0; i < a_r; i++)
        for (int j = 0; j < b_c; j++)     /* BUG FIX: was j < a_c */
            for (int k = 0; k < a_c; k++) /* BUG FIX: was k < b_c */
                c[i][j] = c[i][j] + a[i][k] * b[k][j];
    end1 = give_ru();    // resource-usage snapshot after the loop
    end = clock();       // end the timer
    dif = ((double) (end - start)) / CLOCKS_PER_SEC;

    /* Convert the rusage timevals into floating-point seconds. */
    double utimeStart = (double) start1.ru_utime.tv_sec + (double) start1.ru_utime.tv_usec / 1000000.0;
    double stimeStart = (double) start1.ru_stime.tv_sec + (double) start1.ru_stime.tv_usec / 1000000.0;
    double utimeEnd = (double) end1.ru_utime.tv_sec + (double) end1.ru_utime.tv_usec / 1000000.0;
    double stimeEnd = (double) end1.ru_stime.tv_sec + (double) end1.ru_stime.tv_usec / 1000000.0;
    double difuTime = utimeEnd - utimeStart;
    double difsTime = stimeEnd - stimeStart;
    /* NOTE: these are process-lifetime totals, not deltas over the loop. */
    long pageReclaims = end1.ru_minflt;
    long pageFaults = end1.ru_majflt;

    printf("Multiplication took %f sec. time.\n", dif); /* was mislabelled "Parallelization" */
    printf("Time User : %f .\n", difuTime);
    printf("Time System : %f .\n", difsTime);
    printf("Page reclaims : %ld .\n", pageReclaims);
    printf("Page faults : %ld .\n", pageFaults);

    /*
     * Free memory.
     */
    printf("Finished, cleaning up \n");
    munlockall();
    free_matrix(a, a_r);
    free_matrix(b, b_r); /* BUG FIX: free loop used a_c instead of b_r */
    free_matrix(c, a_r); /* BUG FIX: free loop used b_c instead of a_r */
    return 0;
}
This is a major open question in theoretical computer science. As of December 2020, the matrix multiplication algorithm with best asymptotic complexity runs in O(n2.3728596) time, given by Josh Alman and Virginia Vassilevska Williams.
Time Complexity of above method is O(N3). Divide and Conquer : Following is simple Divide and Conquer method to multiply two square matrices.
When is matrix multiplication defined? In order for matrix multiplication to be defined, the number of columns in the first matrix must be equal to the number of rows in the second matrix.
In terms of serial complexity, the matrix-vector multiplication is qualified as a quadratic complexity algorithm (or a bilinear complexity algorithm if the matrix is rectangular).
An array of pointer require an extra level of indirection, and thus will be slower. Moreover, it might cause more cache misses.
So I tried with 1000*1000 matrix :
$ perf stat ./test2
Performance counter stats for './test1':
8561,688348 task-clock (msec) # 1,000 CPUs utilized
13 context-switches # 0,002 K/sec
9 cpu-migrations # 0,001 K/sec
3 058 page-faults # 0,357 K/sec
24 844 304 630 cycles # 2,902 GHz [83,32%]
21 302 837 742 stalled-cycles-frontend # 85,75% frontend cycles idle [83,32%]
2 110 745 845 stalled-cycles-backend # 8,50% backend cycles idle [66,68%]
7 030 427 722 instructions # 0,28 insns per cycle
# 3,03 stalled cycles per insn [83,36%]
1 004 889 984 branches # 117,371 M/sec [83,37%]
1 077 360 branch-misses # 0,11% of all branches [83,32%]
8,561758966 seconds time elapsed
$ perf stat ./test3
Performance counter stats for './test2':
1367,856713 task-clock (msec) # 0,995 CPUs utilized
195 context-switches # 0,143 K/sec
3 cpu-migrations # 0,002 K/sec
3 001 page-faults # 0,002 M/sec
3 977 759 335 cycles # 2,908 GHz [83,41%]
975 477 913 stalled-cycles-frontend # 24,52% frontend cycles idle [83,41%]
179 003 530 stalled-cycles-backend # 4,50% backend cycles idle [66,81%]
7 017 803 848 instructions # 1,76 insns per cycle
# 0,14 stalled cycles per insn [83,41%]
1 002 321 021 branches # 732,768 M/sec [83,42%]
1 046 066 branch-misses # 0,10% of all branches [82,97%]
1,374613079 seconds time elapsed
As we can see, there are a lot more cycles in test2.
I also measured the cache misses:
$ perf stat -e 'L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses' ./test2
Performance counter stats for './test1':
3 009 028 415 L1-dcache-loads
1 259 622 058 L1-dcache-load-misses # 41,86% of all L1-dcache hits
6 427 561 L1-dcache-stores
1 141 461 L1-dcache-store-misses
8,650905202 seconds time elapsed
$ perf stat -e 'L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores,L1-dcache-store-misses' ./test3
Performance counter stats for './test2':
2 004 807 223 L1-dcache-loads
596 388 192 L1-dcache-load-misses # 29,75% of all L1-dcache hits
2 111 264 L1-dcache-stores
384 198 L1-dcache-store-misses
1,384352257 seconds time elapsed
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With