Problem
I am learning about HPC and code optimization. I attempt to replicate the results in Goto's seminal matrix multiplication paper (http://www.cs.utexas.edu/users/pingali/CS378/2008sp/papers/gotoPaper.pdf). Despite my best efforts, I cannot get over ~50% maximum theoretical CPU performance.
Background
See related issues here (Optimized 2x2 matrix multiplication: Slow assembly versus fast SIMD), including info about my hardware
What I have attempted
This related paper (http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf) has a good description of Goto's algorithmic structure. I provide my source code below.
My question
I am asking for general help. I have been working on this for far too long, have tried many different algorithms, inline assembly, inner kernels of various sizes (2x2, 4x4, 2x8, ..., mxn with m and n large), yet I cannot seem to break 50% CPU Gflops. This is purely for education purposes and not a homework.
Source Code
Hopefully is understandable. Please ask if not. I set up the macro structure (for loops) as described in the 2nd paper above. I pack the matrices as discussed in either paper and shown graphically in Figure 11 here (http://www.cs.utexas.edu/users/flame/pubs/BLISTOMSrev2.pdf). My inner kernel computes 2x8 blocks, as this seems to be the optimal computation for Nehalem architecture (see GotoBLAS source code - kernels). The inner kernel is based on the concept of calculating rank-1 updates as described here (http://code.google.com/p/blis/source/browse/config/template/kernels/3/bli_gemm_opt_mxn.c)
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
// define some prefetch functions
#define PREFETCHNTA(addr,nrOfBytesAhead) \
_mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_NTA)
#define PREFETCHT0(addr,nrOfBytesAhead) \
_mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
#define PREFETCHT1(addr,nrOfBytesAhead) \
_mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T1)
#define PREFETCHT2(addr,nrOfBytesAhead) \
_mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T2)
// define a min function
#ifndef min
#define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
#endif
// zero a matrix
void zeromat(double *C, int n)
{
int i = n;
while (i--) {
int j = n;
while (j--) {
*(C + i*n + j) = 0.0;
}
}
}
// compute a 2x8 block from (2 x kc) x (kc x 8) matrices
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) dgemm_2x8_sse(
int k,
const double* restrict a1, const int cs_a,
const double* restrict b1, const int rs_b,
double* restrict c11, const int rs_c
)
{
register __m128d xmm1, xmm4, //
r8, r9, r10, r11, r12, r13, r14, r15; // accumulators
// 10 registers declared here
r8 = _mm_xor_pd(r8,r8); // ab
r9 = _mm_xor_pd(r9,r9);
r10 = _mm_xor_pd(r10,r10);
r11 = _mm_xor_pd(r11,r11);
r12 = _mm_xor_pd(r12,r12); // ab + 8
r13 = _mm_xor_pd(r13,r13);
r14 = _mm_xor_pd(r14,r14);
r15 = _mm_xor_pd(r15,r15);
// PREFETCHT2(b1,0);
// PREFETCHT2(b1,64);
//int l = k;
while (k--) {
//PREFETCHT0(a1,0); // fetch 64 bytes from a1
// i = 0
xmm1 = _mm_load1_pd(a1);
xmm4 = _mm_load_pd(b1);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r8 = _mm_add_pd(r8,xmm4);
xmm4 = _mm_load_pd(b1 + 2);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r9 = _mm_add_pd(r9,xmm4);
xmm4 = _mm_load_pd(b1 + 4);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r10 = _mm_add_pd(r10,xmm4);
xmm4 = _mm_load_pd(b1 + 6);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r11 = _mm_add_pd(r11,xmm4);
//
// i = 1
xmm1 = _mm_load1_pd(a1 + 1);
xmm4 = _mm_load_pd(b1);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r12 = _mm_add_pd(r12,xmm4);
xmm4 = _mm_load_pd(b1 + 2);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r13 = _mm_add_pd(r13,xmm4);
xmm4 = _mm_load_pd(b1 + 4);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r14 = _mm_add_pd(r14,xmm4);
xmm4 = _mm_load_pd(b1 + 6);
xmm4 = _mm_mul_pd(xmm1,xmm4);
r15 = _mm_add_pd(r15,xmm4);
a1 += cs_a;
b1 += rs_b;
//PREFETCHT2(b1,0);
//PREFETCHT2(b1,64);
}
// copy result into C
PREFETCHT0(c11,0);
xmm1 = _mm_load_pd(c11);
xmm1 = _mm_add_pd(xmm1,r8);
_mm_store_pd(c11,xmm1);
xmm1 = _mm_load_pd(c11 + 2);
xmm1 = _mm_add_pd(xmm1,r9);
_mm_store_pd(c11 + 2,xmm1);
xmm1 = _mm_load_pd(c11 + 4);
xmm1 = _mm_add_pd(xmm1,r10);
_mm_store_pd(c11 + 4,xmm1);
xmm1 = _mm_load_pd(c11 + 6);
xmm1 = _mm_add_pd(xmm1,r11);
_mm_store_pd(c11 + 6,xmm1);
c11 += rs_c;
PREFETCHT0(c11,0);
xmm1 = _mm_load_pd(c11);
xmm1 = _mm_add_pd(xmm1,r12);
_mm_store_pd(c11,xmm1);
xmm1 = _mm_load_pd(c11 + 2);
xmm1 = _mm_add_pd(xmm1,r13);
_mm_store_pd(c11 + 2,xmm1);
xmm1 = _mm_load_pd(c11 + 4);
xmm1 = _mm_add_pd(xmm1,r14);
_mm_store_pd(c11 + 4,xmm1);
xmm1 = _mm_load_pd(c11 + 6);
xmm1 = _mm_add_pd(xmm1,r15);
_mm_store_pd(c11 + 6,xmm1);
}
// packs a matrix into rows of slivers
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) rpack( double* restrict dst,
const double* restrict src,
const int kc, const int mc, const int mr, const int n)
{
double tmp[mc*kc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < mc; ++i)
for (int j = 0; j < kc; ++j)
*ptr++ = *(src + i*n + j);
ptr = &tmp[0];
//const int inc_dst = mr*kc;
for (int k = 0; k < mc; k+=mr)
for (int j = 0; j < kc; ++j)
for (int i = 0; i < mr*kc; i+=kc)
*dst++ = *(ptr + k*kc + j + i);
}
// packs a matrix into columns of slivers
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) cpack(double* restrict dst,
const double* restrict src,
const int nc,
const int kc,
const int nr,
const int n)
{
double tmp[kc*nc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < kc; ++i)
for (int j = 0; j < nc; ++j)
*ptr++ = *(src + i*n + j);
ptr = &tmp[0];
// const int inc_k = nc/nr;
for (int k = 0; k < nc; k+=nr)
for (int j = 0; j < kc*nc; j+=nc)
for (int i = 0; i < nr; ++i)
*dst++ = *(ptr + k + i + j);
}
void blis_dgemm_ref(
const int n,
const double* restrict A,
const double* restrict B,
double* restrict C,
const int mc,
const int nc,
const int kc
)
{
int mr = 2;
int nr = 8;
double locA[mc*kc] __attribute__ ((aligned(64)));
double locB[kc*nc] __attribute__ ((aligned(64)));
int ii,jj,kk,i,j;
#pragma omp parallel num_threads(4) shared(A,B,C) private(ii,jj,kk,i,j,locA,locB)
{//use all threads in parallel
#pragma omp for
// partitions C and B into wide column panels
for ( jj = 0; jj < n; jj+=nc) {
// A and the current column of B are partitioned into col and row panels
for ( kk = 0; kk < n; kk+=kc) {
cpack(locB, B + kk*n + jj, nc, kc, nr, n);
// partition current panel of A into blocks
for ( ii = 0; ii < n; ii+=mc) {
rpack(locA, A + ii*n + kk, kc, mc, mr, n);
for ( i = 0; i < min(n-ii,mc); i+=mr) {
for ( j = 0; j < min(n-jj,nc); j+=nr) {
// inner kernel that compues 2 x 8 block
dgemm_2x8_sse( kc,
locA + i*kc , mr,
locB + j*kc , nr,
C + (i+ii)*n + (j+jj), n );
}
}
}
}
}
}
}
double compute_gflops(const double time, const int n)
{
// computes the gigaflops for a square matrix-matrix multiplication
double gflops;
gflops = (double) (2.0*n*n*n)/time/1.0e9;
return(gflops);
}
// ******* MAIN ********//
void main() {
clock_t time1, time2;
double time3;
double gflops;
const int trials = 10;
int nmax = 4096;
printf("%10s %10s\n","N","Gflops/s");
int mc = 128;
int kc = 256;
int nc = 128;
for (int n = kc; n <= nmax; n+=kc) { //assuming kc is the max dim
double *A = NULL;
double *B = NULL;
double *C = NULL;
A = _mm_malloc (n*n * sizeof(*A),64);
B = _mm_malloc (n*n * sizeof(*B),64);
C = _mm_malloc (n*n * sizeof(*C),64);
srand(time(NULL));
// Create the matrices
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
A[i*n + j] = (double) rand()/RAND_MAX;
B[i*n + j] = (double) rand()/RAND_MAX;
//D[j*n + i] = B[i*n + j]; // Transpose
C[i*n + j] = 0.0;
}
}
// warmup
zeromat(C,n);
blis_dgemm_ref(n,A,B,C,mc,nc,kc);
zeromat(C,n);
time2 = 0;
for (int count = 0; count < trials; count++){// iterations per experiment here
time1 = clock();
blis_dgemm_ref(n,A,B,C,mc,nc,kc);
time2 += clock() - time1;
zeromat(C,n);
}
time3 = (double)(time2)/CLOCKS_PER_SEC/trials;
gflops = compute_gflops(time3, n);
printf("%10d %10f\n",n,gflops);
_mm_free(A);
_mm_free(B);
_mm_free(C);
}
printf("tests are done\n");
}
EDIT 1
OS = Win 7 64 bit
Compiler = gcc 4.8.1, but 32 bit and mingw (32 bit also. I am working to get a "non-install" version of mingw64 so I can generate faster code/work with more XMM registers, etc. If anyone has a link to a mingw64 installation that is similar in fashion to mingw-get
please post. My work computer has way too many admin restrictions.
Packing
You appear to be packing the block of the A
matrix too often. You do
rpack(locA, A + ii*n + kk, kc, mc, mr, n);
But this only depends on ii
and kk
and not on jj
but it's inside the inner loop on jj
so you repack the same thing for each iteration of jj
. I don't think that's necessary. In my code I do the packing before the matrix multiplication. Probably it's more efficient to pack inside the the matrix multiplication while the values are still in the cache but it's trickier to do that. But packing is a O(n^2) operation and matrix multiplication is a O(n^3) operation so it's not very inefficient to pack outside of the matrix multiplication for large matrices (I know that from testing as well - commenting out the packing only changes the efficiency by a few percent). However, by repacking with rpack
each jj
iteration you have effectively made it an O(n^3) operation.
Wall Time
You want the wall time. On Unix the clock() function does not return the wall time (though it does on Windows with MSVC). It returns the cumulative time for each thread. This is one of the most common errors I have seen on SO for OpenMP.
Use omp_get_wtime()
to get the wall time.
Note that I don't know how the clock()
function works with MinGW or MinGW-w64 (they are separate projects). MinGW links to MSVCRT so I would guess that clock()
with MinGW returns the wall time as it does with MSVC. However, MinGW-w64 does not link to MSVCRT (as far as I understand it links to something like glibc). It's possible that clock()
in MinGW-w64 performs the same as clock()
does with Unix.
Hyper Threading
Hyper threading works well for code that stalls the CPU often. That's actually the majority of code because it's very difficult to write code that does not stall the CPU. That's why Intel invented Hyper Threading. It's easier to task switch and give the CPU something else to do than to optimize the code. However, for code that is highly optimized hyper-threading can actually give worse results. In my own matrix multiplication code that's certainly the case. Set the number of threads to the number of physical cores you have (two in your case).
My Code
Below is my code. I did not include the inner64
function here. You can find it at Difference in performance between MSVC and GCC for highly optimized matrix multplication code (with the obnoxious and misleading name of AddDot4x4_vec_block_8wide
)
I wrote this code before reading the Goto paper and also before reading Agner Fog's optimization manuals. You appear to reorder/pack the matrices in the main loop. That probably makes more sense. I don't think I reorder them the same way you do and also I only reorder one of the input matrices (B) and not both as you do.
The performance of this code on my system (Xeon [email protected]) with Linux and GCC is about 75% of the peak for this matrix size (4096x4096). Intel's MKL get's about 94% of the peak on my system for this matrix size so there is clearly room for improvement.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <immintrin.h>
extern "C" void inner64(const float *a, const float *b, float *c);
void (*fp)(const float *a, const float *b, float *c) = inner64;
void reorder(float * __restrict a, float * __restrict b, int n, int bs) {
int nb = n/bs;
#pragma omp parallel for
for(int i=0; i<nb; i++) {
for(int j=0; j<nb; j++) {
for(int i2=0; i2<bs; i2++) {
for(int j2=0; j2<bs; j2++) {
b[bs*bs*(nb*i+j) + bs*i2+j2]= a[bs*(i*n+j) + i2*n + j2];
}
}
}
}
}
inline void gemm_block(float * __restrict a, float * __restrict b, float * __restrict c, int n, int n2) {
for(int i=0; i<n2; i++) {
fp(&a[i*n], b, &c[i*n]);
}
}
void gemm(float * __restrict a, float * __restrict b, float * __restrict c, int n, int bs) {
int nb = n/bs;
float *b2 = (float*)_mm_malloc(sizeof(float)*n*n,64);
reorder(b,b2,n,bs);
#pragma omp parallel for
for(int i=0; i<nb; i++) {
for(int j=0; j<nb; j++) {
for(int k=0; k<nb; k++) {
gemm_block(&a[bs*(i*n+k)],&b2[bs*bs*(k*nb+j)],&c[bs*(i*n+j)], n, bs);
}
}
}
_mm_free(b2);
}
int main() {
float peak = 1.0f*8*4*2*3.69f;
const int n = 4096;
float flop = 2.0f*n*n*n*1E-9f;
omp_set_num_threads(4);
float *a = (float*)_mm_malloc(sizeof(float)*n*n,64);
float *b = (float*)_mm_malloc(sizeof(float)*n*n,64);
float *c = (float*)_mm_malloc(sizeof(float)*n*n,64);
for(int i=0; i<n*n; i++) {
a[i] = 1.0f*rand()/RAND_MAX;
b[i] = 1.0f*rand()/RAND_MAX;
}
gemm(a,b,c,n,64); //warm OpenMP up
while(1) {
for(int i=0; i<n*n; i++) c[i] = 0;
double dtime = omp_get_wtime();
gemm(a,b,c,n,64);
dtime = omp_get_wtime() - dtime;
printf("time %.2f s, efficiency %.2f%%\n", dtime, 100*flop/dtime/peak);
}
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With