Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

SIMD vs OMP in vector multiplication

In my project I have to do several vector multiplications, done on either double *a-vectors or float *a-vectors. In order to accelerate that, I wanted to use either SIMD-operations or omp. For getting the fastest result, I wrote a benchmark program:

#include <iostream>
#include <memory>
#include <vector>
#include <omp.h>
#include <immintrin.h>
#include <stdlib.h>
#include <chrono>


#define SIZE 32768
#define ROUNDS 1e5

void multiply_singular(float *a, float *b, float *d)
{
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_omp(float *a, float *b, float *d)
{
#pragma omp parallel for
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_avx(float *a, float *b, float *d)
{
    __m256 a_a, b_a, c_a;
    for(int i = 0; i < SIZE/8; i++)
    {
        a_a = _mm256_loadu_ps(a+8*i);
        b_a = _mm256_loadu_ps(b+8*i);
        c_a = _mm256_mul_ps(a_a, b_a);
        _mm256_storeu_ps (d+i*8, c_a);
    }
}

void multiply_avx_omp(float *a, float *b, float *d)
{
    __m256 a_a, b_a, c_a;
#pragma omp for
    for(int i = 0; i < SIZE/8; i++)
    {
        a_a = _mm256_loadu_ps(a+8*i);
        b_a = _mm256_loadu_ps(b+8*i);
        c_a = _mm256_mul_ps(a_a, b_a);
        _mm256_storeu_ps (d+i*8, c_a);
    }
}

void multiply_singular_double(double *a, double *b, double *d)
{
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_omp_double(double *a, double *b, double *d)
{
#pragma omp parallel for
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_avx_double(double *a, double *b, double *d)
{
    __m256d a_a, b_a, c_a;
    for(int i = 0; i < SIZE/4; i++)
    {
        a_a = _mm256_loadu_pd(a+4*i);
        b_a = _mm256_loadu_pd(b+4*i);
        c_a = _mm256_mul_pd(a_a, b_a);
        _mm256_storeu_pd (d+i*4, c_a);
    }
}

void multiply_avx_double_omp(double *a, double *b, double *d)
{
    __m256d a_a, b_a, c_a;
#pragma omp parallel for
    for(int i = 0; i < SIZE/4; i++)
    {
        a_a = _mm256_loadu_pd(a+4*i);
        b_a = _mm256_loadu_pd(b+4*i);
        c_a = _mm256_mul_pd(a_a, b_a);
        _mm256_storeu_pd (d+i*4, c_a);
    }
}


int main()
{
    float *a, *b, *c, *d, *e, *f;
    double *a_d, *b_d, *c_d, *d_d, *e_d, *f_d;
    a = new float[SIZE] {0};
    b = new float[SIZE] {0};
    c = new float[SIZE] {0};
    d = new float[SIZE] {0};
    e = new float[SIZE] {0};
    f = new float[SIZE] {0};
    a_d = new double[SIZE] {0};
    b_d = new double[SIZE] {0};
    c_d = new double[SIZE] {0};
    d_d = new double[SIZE] {0};
    e_d = new double[SIZE] {0};
    f_d = new double[SIZE] {0};
    for(int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        a_d[i] = i;
        b_d[i] = i;
    };
    std::cout << "Now doing the single float rounds!\n";
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        multiply_singular(a, b, c);
    }
    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
    auto duration_ss = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the omp float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_omp(a, b, d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_so = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx(a, b, e);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_sa = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx omp float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_omp(a, b, e);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_sao = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the single double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        multiply_singular_double(a_d, b_d, c_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_ds = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the omp double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_omp_double(a_d, b_d, d_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_do = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_double(a_d, b_d, e_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_da = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx omp double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_double_omp(a_d, b_d, f_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_dao = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Finished\n";
    std::cout << "Elapsed time for functions:\n";
    std::cout << "Function\ttime[ms]\n";
    std::cout << "Singular float:\t" << duration_ss/ROUNDS << '\n';
    std::cout << "OMP float:\t" << duration_so/(ROUNDS*10) << '\n';
    std::cout << "AVX float avx:\t" << duration_sa/(ROUNDS*10) << '\n';
    std::cout << "OMP AVX float avx omp:\t" << duration_sao/(ROUNDS*10) << '\n';
    std::cout << "Singular double:\t" << duration_ds/ROUNDS << '\n';
    std::cout << "OMP double:\t" << duration_do/(ROUNDS*10) << '\n';
    std::cout << "AVX double:\t" << duration_da/(ROUNDS*10) << '\n';
    std::cout << "OMP AVX double:\t" << duration_dao/(ROUNDS*10) << '\n';
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    delete[] e;
    delete[] f;
    delete[] a_d;
    delete[] b_d;
    delete[] c_d;
    delete[] d_d;
    delete[] e_d;
    delete[] f_d;
    return 0;
}

When compiling it with g++-5 -fopenmp -std=c++14 -march=native test_new.cpp -o test -lgomp, I get

Elapsed time for functions:
Function    time[ms]
Singular float: 117.979
OMP float:  40.5385
AVX float avx:  60.2964
OMP AVX float avx omp:  61.4206
Singular double:    129.59
OMP double: 200.745
AVX double: 136.715
OMP AVX double: 122.176

or in a second run

Elapsed time for functions:
Function    time[ms]
Singular float: 113.932
OMP float:  39.2581
AVX float avx:  58.3029
OMP AVX float avx omp:  60.0023
Singular double:    123.575
OMP double: 66.0327
AVX double: 124.293
OMP AVX double: 318.038

Here obviously the pure omp-function is faster than the other functions, even as the AVX function. When adding the -O3-switch to the compiling line, I get the following result:

Elapsed time for functions:
Function    time[ms]
Singular float: 12.7361
OMP float:  4.82436
AVX float avx:  14.7514
OMP AVX float avx omp:  14.7225
Singular double:    27.9976
OMP double: 8.50957
AVX double: 32.5175
OMP AVX double: 257.219

Here again omp is significantly faster than everything else, while AVX is slowest, even slower than the linear approach. Why is that? Is my AVX function implementation just crappy, or are there other problems?

Executed on Ubuntu 14.04.1, i7 Sandy Bridge, gcc version 5.3.0.

Edit: I found one mistake: I should move the declarations of the temporary variables in the avx-functions inside the for-loop, that gets me nearly to the omp-level (and delivers correct results).

Edit 2: When disabling the -O3-switch, the OMP-AVX-instructions are faster than the OMP-functions, with the switch they are nearly on par.

Edit 3: When filling the arrays with random data every time before executing the next loop, I get (with -O3):

Elapsed time for functions:
Function    time[ms]
Singular float: 30.742
Cilk float: 24.0769
OMP float:  17.2415
AVX float avx:  33.0217
OMP AVX float avx omp:  10.1934
Singular double:    60.412
Cilk double:    34.6458
OMP double: 19.0739
AVX double: 66.8676
OMP AVX double: 22.3586

and without:

Elapsed time for functions:
Function    time[ms]
Singular float: 274.402
Cilk float: 88.258
OMP float:  66.2124
AVX float avx:  117.066
OMP AVX float avx omp:  35.0313
Singular double:    238.652
Cilk double:    91.1667
OMP double: 127.621
AVX double: 249.516
OMP AVX double: 116.24

(and I added a cilk_for()-loop for comparison, too).

Update: I added (as suggested in the answer) also a function using the #pragma omp parallel for simd. That resulted in:

Elapsed time for functions:
Function                time[ms]
Singular float:         106.081
Cilk float:             33.2761
OMP float:              17.0651
AVX float avx:          65.1129
OMP AVX float:          19.1496
SIMD OMP float:         2.6095
Aligned AVX OMP float:  18.1165
Singular double:        118.939
Cilk double:            53.1102
OMP double:             35.652
AVX double:             131.24
OMP AVX double:         39.4377
SIMD OMP double:        7.0748
Aligned AVX OMP double: 38.4474
like image 615
arc_lupus Avatar asked May 25 '16 07:05

arc_lupus


People also ask

When should I use SIMD OpenMP?

A simple answer: OpenMP only used to exploit multiple threads for multiple cores. This new simd extention allows you to explicitly use SIMD instructions on modern CPUs, such as Intel's AVX/SSE and ARM's NEON. (Note that a SIMD instruction is executed in a single thread and a single core, by design.

What is SIMD loop?

A SIMD loop has logical iterations numbered 0,1,...,N-1 where N is the number of loop iterations, and the logical numbering denotes the sequence in which the iterations would be executed if the associated loop(s) were executed with no SIMD instructions.


1 Answers

With compilers supporting OpenMP4.x you may want to start from something like this:

void multiply_singular_omp_for_simd(float *a, float *b, float *d)
{
    #pragma omp parallel for simd schedule (static,16)
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

It will give you both SIMD and Thread parallelism. The parallel decomposition will be done automatically, first having parallel tasks/chunks spread across threads/cores, secondly for every task/chunk spreading individual iterations "across" simd "lanes".

Read given couple articles if you feel concerned: Threading and SIMD in OpenMP4, ICC documentation.

Formally the way you phrased your question is slightly ambiguous, because staring from 4.0, OMP loop could be SIMD, Threading or SIMD+Threading parallel. So it's not about OMP vs. SIMD anymore. Instead it's about OMP SIMD vs. OMP Threading.

Not sure how good your given GCC implementation is, but ICC/IFORT can deal with omp parallel for simd for relatively long time now. GCC should also support it starting from 5.x (#pragma omp simd was supported by GCC for some time, but it's not necessary the case for #pragma omp parallel for simd).

For optimal compiler-driven implementation you may ideally prefer to do cache blocking and manually split iteration space to have outer loop driven by omp parallel for, while innermost loop driven by omp simd. But this is maybe slightly out of scope of original question.

like image 108
zam Avatar answered Oct 26 '22 19:10

zam