Lazy vector access in parallel loops

Question

Inside a performance-critical, parallel code I have a vector whose elements are:

Very expensive to compute, and the result is deterministic (the value of the element at a given position will depend on the position only)
Random access (typically the number of accesses are larger or much larger than the size of the vector)
Clustered accesses (many accesses request the same value)
The vector is shared by different threads (race condition?)
To avoid heap defragmention, the object should never be recreated, but whenever possible resetted and recycled
The value to be placed in the vector will be provided by a polymorphic object

Currently, I precompute all possible values of the vectors, so race condition should not be an issue. In order to improve performances, I am considering to create a lazy vector, such that the code performs computations only when the element of the vector is requested. In a parallel region, it might happen that more than one thread are requesting, and perhaps calculating, the same element at the same time. How do I take care of this possible race condition?

Below is an example of what I want to achieve. It compiles and runs properly under Windows 10, Visual Studio 17. I use C++17.

// Lazy.cpp : Defines the entry point for the console application.
#include "stdafx.h"
#include <vector>
#include <iostream>
#include <stdlib.h>  
#include <chrono>
#include <math.h>
const double START_SUM = 1;
const double END_SUM = 1000;

//base object responsible for providing the values
class Evaluator
{
public:
    Evaluator() {};
    ~Evaluator() {};
    //Function with deterministic output, depending on the position
    virtual double expensiveFunction(int pos) const = 0;
};
//
class EvaluatorA: public Evaluator
{
public:
    //expensive evaluation
    virtual double expensiveFunction(int pos) const override {
        double t = 0;
        for (int j = START_SUM; j++ < END_SUM; j++)
            t += log(exp(log(exp(log(j + pos)))));
        return t;
    }
    EvaluatorA() {};
    ~EvaluatorA() {};
};
class EvaluatorB : public Evaluator
{
public:
    //even more expensive evaluation
    virtual double expensiveFunction(int pos) const override {
        double t = 0;
        for (int j = START_SUM; j++ < 10*END_SUM; j++)
            t += log(exp(log(exp(log(j + pos)))));
        return t;
    }
    EvaluatorB() {};
    ~EvaluatorB() {};
};

class LazyVectorTest //vector that contains N possible results
{
public:
    LazyVectorTest(int N,const Evaluator & eval) : N(N), innerContainer(N, 0), isThatComputed(N, false), eval_ptr(&eval)
    {};
    ~LazyVectorTest() {};

    //reset, to generate a new table of values
    //the size of the vector stays constant
    void reset(const Evaluator & eval) {
        this->eval_ptr = &eval;
        for (int i = 0; i<N; i++)
            isThatComputed[i] = false;
    }
    int size() { return N; }
    //accessing the same position should yield the same result
    //unless the object is resetted
    const inline double& operator[](int pos) {
        if (!isThatComputed[pos]) {
            innerContainer[pos] = eval_ptr->expensiveFunction(pos);
            isThatComputed[pos] = true;
        }
        return innerContainer[pos];
    }

private:
    const int N;
    const Evaluator* eval_ptr;
    std::vector<double> innerContainer;
    std::vector<bool> isThatComputed;
};
    //the parallel access will take place here

template <typename T>
double accessingFunction(T& A, const std::vector<int>& elementsToAccess) {
    double tsum = 0;
    int size = elementsToAccess.size();
//#pragma omp parallel for
    for (int i = 0; i < size; i++)
        tsum += A[elementsToAccess[i]];
    return tsum;
}

std::vector<int> randomPos(int sizePos, int N) {
    std::vector<int> elementsToAccess;
    for (int i = 0; i < sizePos; i++)
        elementsToAccess.push_back(rand() % N);
    return elementsToAccess;
}
int main()
{
    srand(time(0));
    int minAccessNumber = 1;
    int maxAccessNumber = 100;
    int sizeVector = 50;

    auto start = std::chrono::steady_clock::now();
    double res = 0;
    float numberTest = 100;
    typedef LazyVectorTest container;

    EvaluatorA eval;
    for (int i = 0; i < static_cast<int>(numberTest); i++) {
        res = eval.expensiveFunction(i);
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::milli>diff(end - start);
    double benchmark = diff.count() / numberTest;
    std::cout <<"Average time to compute expensive function:" <<benchmark<<" ms"<<std::endl;
    std::cout << "Value of the function:" << res<< std::endl;
    std::vector<std::vector<int>> indexs(numberTest);
    container A(sizeVector, eval);

    for (int accessNumber = minAccessNumber; accessNumber < maxAccessNumber; accessNumber++) {
        indexs.clear();
        for (int i = 0; i < static_cast<int>(numberTest); i++) {
            indexs.emplace_back(randomPos(accessNumber, sizeVector));
        }
        auto start_lazy = std::chrono::steady_clock::now();
        for (int i = 0; i < static_cast<int>(numberTest); i++) {
            A.reset(eval);
            double res_lazy = accessingFunction(A, indexs[i]);
        }
        auto end_lazy = std::chrono::steady_clock::now();
        std::chrono::duration<double, std::milli>diff_lazy(end_lazy - start_lazy);
        std::cout << accessNumber << "," << diff_lazy.count() / numberTest << ", " << diff_lazy.count() / (numberTest* benchmark) << std::endl;
    }
    return 0;
}

Caleth · Accepted Answer

Rather than roll you own locking, I'd first see if you get acceptable performance with std::call_once.

class LazyVectorTest //vector that contains N possible results
{
    //Function with deterministic output, depending on the position
    void expensiveFunction(int pos) {
        double t = 0;
        for (int j = START_SUM; j++ < END_SUM; j++)
            t += log(exp(log(exp(log(j+pos)))));
        values[pos] = t;
    }

public:
    LazyVectorTest(int N) : values(N), flags(N)
    {};

    int size() { return values.size(); }
    //accessing the same position should yield the same result
    double operator[](int pos) {
        std::call_once(flags[pos], &LazyVectorTest::expensiveFunction, this, pos);
        return values[pos];
    }
private:
    std::vector<double> values;
    std::vector<std::once_flag> flags;
};

call_once is pretty transparent. It allows exactly one thread to run a function to completion. The only potential drawback is that it will block a second thread waiting for a possible exception, rather than immediately do nothing. In this case that is desirable, as you want the modification values[pos] = t; to be sequenced before the read return values[pos];

Lazy vector access in parallel loops

Tags:

c++

parallel-processing

c++17

openmp

Enzo Ferrazzano

1 Answers

Caleth

Recent Activity

Donate For Us

Lazy vector access in parallel loops

Tags:

c++

parallel-processing

c++17

openmp

Enzo Ferrazzano

1 Answers

Caleth

Related questions

Recent Activity

Donate For Us