SIMD vs OMP in vector multiplication

Tags:

In my project I have to do several vector multiplications, done on either double *a-vectors or float *a-vectors. In order to accelerate that, I wanted to use either SIMD-operations or omp. For getting the fastest result, I wrote a benchmark program:

#include <iostream>
#include <memory>
#include <vector>
#include <omp.h>
#include <immintrin.h>
#include <stdlib.h>
#include <chrono>


#define SIZE 32768
#define ROUNDS 1e5

void multiply_singular(float *a, float *b, float *d)
{
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_omp(float *a, float *b, float *d)
{
#pragma omp parallel for
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_avx(float *a, float *b, float *d)
{
    __m256 a_a, b_a, c_a;
    for(int i = 0; i < SIZE/8; i++)
    {
        a_a = _mm256_loadu_ps(a+8*i);
        b_a = _mm256_loadu_ps(b+8*i);
        c_a = _mm256_mul_ps(a_a, b_a);
        _mm256_storeu_ps (d+i*8, c_a);
    }
}

void multiply_avx_omp(float *a, float *b, float *d)
{
    __m256 a_a, b_a, c_a;
#pragma omp for
    for(int i = 0; i < SIZE/8; i++)
    {
        a_a = _mm256_loadu_ps(a+8*i);
        b_a = _mm256_loadu_ps(b+8*i);
        c_a = _mm256_mul_ps(a_a, b_a);
        _mm256_storeu_ps (d+i*8, c_a);
    }
}

void multiply_singular_double(double *a, double *b, double *d)
{
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_omp_double(double *a, double *b, double *d)
{
#pragma omp parallel for
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

void multiply_avx_double(double *a, double *b, double *d)
{
    __m256d a_a, b_a, c_a;
    for(int i = 0; i < SIZE/4; i++)
    {
        a_a = _mm256_loadu_pd(a+4*i);
        b_a = _mm256_loadu_pd(b+4*i);
        c_a = _mm256_mul_pd(a_a, b_a);
        _mm256_storeu_pd (d+i*4, c_a);
    }
}

void multiply_avx_double_omp(double *a, double *b, double *d)
{
    __m256d a_a, b_a, c_a;
#pragma omp parallel for
    for(int i = 0; i < SIZE/4; i++)
    {
        a_a = _mm256_loadu_pd(a+4*i);
        b_a = _mm256_loadu_pd(b+4*i);
        c_a = _mm256_mul_pd(a_a, b_a);
        _mm256_storeu_pd (d+i*4, c_a);
    }
}


int main()
{
    float *a, *b, *c, *d, *e, *f;
    double *a_d, *b_d, *c_d, *d_d, *e_d, *f_d;
    a = new float[SIZE] {0};
    b = new float[SIZE] {0};
    c = new float[SIZE] {0};
    d = new float[SIZE] {0};
    e = new float[SIZE] {0};
    f = new float[SIZE] {0};
    a_d = new double[SIZE] {0};
    b_d = new double[SIZE] {0};
    c_d = new double[SIZE] {0};
    d_d = new double[SIZE] {0};
    e_d = new double[SIZE] {0};
    f_d = new double[SIZE] {0};
    for(int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        a_d[i] = i;
        b_d[i] = i;
    };
    std::cout << "Now doing the single float rounds!\n";
    std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        multiply_singular(a, b, c);
    }
    std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
    auto duration_ss = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the omp float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_omp(a, b, d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_so = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx(a, b, e);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_sa = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx omp float rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_omp(a, b, e);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_sao = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the single double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS; i++)
    {
        multiply_singular_double(a_d, b_d, c_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_ds = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the omp double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_omp_double(a_d, b_d, d_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_do = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_double(a_d, b_d, e_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_da = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Now doing the avx omp double rounds!\n";
    t1 = std::chrono::high_resolution_clock::now();
    for(int i = 0; i < ROUNDS*10; i++)
    {
        multiply_avx_double_omp(a_d, b_d, f_d);
    };
    t2 = std::chrono::high_resolution_clock::now();
    auto duration_dao = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
    std::cout << "Finished\n";
    std::cout << "Elapsed time for functions:\n";
    std::cout << "Function\ttime[ms]\n";
    std::cout << "Singular float:\t" << duration_ss/ROUNDS << '\n';
    std::cout << "OMP float:\t" << duration_so/(ROUNDS*10) << '\n';
    std::cout << "AVX float avx:\t" << duration_sa/(ROUNDS*10) << '\n';
    std::cout << "OMP AVX float avx omp:\t" << duration_sao/(ROUNDS*10) << '\n';
    std::cout << "Singular double:\t" << duration_ds/ROUNDS << '\n';
    std::cout << "OMP double:\t" << duration_do/(ROUNDS*10) << '\n';
    std::cout << "AVX double:\t" << duration_da/(ROUNDS*10) << '\n';
    std::cout << "OMP AVX double:\t" << duration_dao/(ROUNDS*10) << '\n';
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    delete[] e;
    delete[] f;
    delete[] a_d;
    delete[] b_d;
    delete[] c_d;
    delete[] d_d;
    delete[] e_d;
    delete[] f_d;
    return 0;
}

When compiling it with g++-5 -fopenmp -std=c++14 -march=native test_new.cpp -o test -lgomp, I get

Elapsed time for functions:
Function    time[ms]
Singular float: 117.979
OMP float:  40.5385
AVX float avx:  60.2964
OMP AVX float avx omp:  61.4206
Singular double:    129.59
OMP double: 200.745
AVX double: 136.715
OMP AVX double: 122.176

or in a second run

Elapsed time for functions:
Function    time[ms]
Singular float: 113.932
OMP float:  39.2581
AVX float avx:  58.3029
OMP AVX float avx omp:  60.0023
Singular double:    123.575
OMP double: 66.0327
AVX double: 124.293
OMP AVX double: 318.038

Here obviously the pure omp-function is faster than the other functions, even as the AVX function. When adding the -O3-switch to the compiling line, I get the following result:

Elapsed time for functions:
Function    time[ms]
Singular float: 12.7361
OMP float:  4.82436
AVX float avx:  14.7514
OMP AVX float avx omp:  14.7225
Singular double:    27.9976
OMP double: 8.50957
AVX double: 32.5175
OMP AVX double: 257.219

Here again omp is significantly faster than everything else, while AVX is slowest, even slower than the linear approach. Why is that? Is my AVX function implementation just crappy, or are there other problems?

Executed on Ubuntu 14.04.1, i7 Sandy Bridge, gcc version 5.3.0.

Edit: I found one mistake: I should move the declarations of the temporary variables in the avx-functions inside the for-loop, that gets me nearly to the omp-level (and delivers correct results).

Edit 2: When disabling the -O3-switch, the OMP-AVX-instructions are faster than the OMP-functions, with the switch they are nearly on par.

Edit 3: When filling the arrays with random data every time before executing the next loop, I get (with -O3):

Elapsed time for functions:
Function    time[ms]
Singular float: 30.742
Cilk float: 24.0769
OMP float:  17.2415
AVX float avx:  33.0217
OMP AVX float avx omp:  10.1934
Singular double:    60.412
Cilk double:    34.6458
OMP double: 19.0739
AVX double: 66.8676
OMP AVX double: 22.3586

and without:

Elapsed time for functions:
Function    time[ms]
Singular float: 274.402
Cilk float: 88.258
OMP float:  66.2124
AVX float avx:  117.066
OMP AVX float avx omp:  35.0313
Singular double:    238.652
Cilk double:    91.1667
OMP double: 127.621
AVX double: 249.516
OMP AVX double: 116.24

(and I added a cilk_for()-loop for comparison, too).

Update: I added (as suggested in the answer) also a function using the #pragma omp parallel for simd. That resulted in:

Elapsed time for functions:
Function                time[ms]
Singular float:         106.081
Cilk float:             33.2761
OMP float:              17.0651
AVX float avx:          65.1129
OMP AVX float:          19.1496
SIMD OMP float:         2.6095
Aligned AVX OMP float:  18.1165
Singular double:        118.939
Cilk double:            53.1102
OMP double:             35.652
AVX double:             131.24
OMP AVX double:         39.4377
SIMD OMP double:        7.0748
Aligned AVX OMP double: 38.4474

615

asked May 25 '16 07:05

arc_lupus

1 Answers

With compilers supporting OpenMP4.x you may want to start from something like this:

void multiply_singular_omp_for_simd(float *a, float *b, float *d)
{
    #pragma omp parallel for simd schedule (static,16)
    for(int i = 0; i < SIZE; i++)
        d[i] = a[i]*b[i];
}

It will give you both SIMD and Thread parallelism. The parallel decomposition will be done automatically, first having parallel tasks/chunks spread across threads/cores, secondly for every task/chunk spreading individual iterations "across" simd "lanes".

Read given couple articles if you feel concerned: Threading and SIMD in OpenMP4, ICC documentation.

Formally the way you phrased your question is slightly ambiguous, because staring from 4.0, OMP loop could be SIMD, Threading or SIMD+Threading parallel. So it's not about OMP vs. SIMD anymore. Instead it's about OMP SIMD vs. OMP Threading.

Not sure how good your given GCC implementation is, but ICC/IFORT can deal with omp parallel for simd for relatively long time now. GCC should also support it starting from 5.x (#pragma omp simd was supported by GCC for some time, but it's not necessary the case for #pragma omp parallel for simd).

For optimal compiler-driven implementation you may ideally prefer to do cache blocking and manually split iteration space to have outer loop driven by omp parallel for, while innermost loop driven by omp simd. But this is maybe slightly out of scope of original question.

108

answered Oct 26 '22 19:10

zam

Related questions
                            
                                C++: passing the this pointer to another class
                            
                                How to use a boost::mutex as the mapped type in std::map?
                            
                                CMake : softlink resource ( such as GLSL shaders ) or copy each complilation
                            
                                How to create functor that wraps lambda with captured variable?
                            
                                Why are decimal and hexadecimal integer literals treated differently?
                            
                                How to catch key presses in editable QTableWidgetItem?
                            
                                Local static variable shared between all instantiations of a generic lambda expression
                            
                                How direct base is constructed in virtual inheritance?
                            
                                Does GNU/Flex C++ work at all?
                            
                                Static initialization with private constructor
                            
                                cmake: linking against STATIC IMPORTED library fails
                            
                                how to open partition with CreateFile API?
                            
                                c++ dll windows plugin architecture approach
                            
                                Is it safe to have pointers to elements in Data Structures? (c++ with QT)
                            
                                How to shrink the size of a shared library?
                            
                                Should I store references in classes in c++? [closed]
                            
                                creating Mat with openCV in python
                            
                                correctly set CMake variables when adding yaml-cpp to existing project
                            
                                How do you enable C++11 syntax in Eclipse Neon?
                            
                                Feature tracking using optical flow

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

SIMD vs OMP in vector multiplication

Tags:

c++

c++11

vector

avx

openmp

arc_lupus

People also ask

1 Answers

zam

Recent Activity

Donate For Us