In C++11, this:
// Returns a reference to an immutable vector holding 1, 2, 3.
// The vector is a function-local static: it is constructed the first time
// control reaches its declaration, and every call returns that same object.
const std::vector<int>& f() {
    static const std::vector<int> values = { 1, 2, 3 };
    return values;
}
is thread-safe. However, is there an extra penalty for calling this function after the first call (i.e. once the variable has already been initialized) due to this extra thread-safe guarantee? I am wondering if the function will be slower than one using a global variable, because it has to acquire a mutex on every call to check whether it's being initialized by another thread, or something similar.
"The best intuition to be ever had is 'I should measure this.'" So let's find out:
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>
namespace {
// Minimal stopwatch: captures a time point on construction and reports how
// much time has passed since then.
class timer {
    // steady_clock is monotonic, so a measured interval cannot go negative
    // or jump if the wall clock is adjusted. high_resolution_clock is allowed
    // to be an alias for the non-steady system_clock, which makes it a poor
    // choice for interval timing.
    using clock_type = std::chrono::steady_clock;

    clock_type::time_point start;

    // Reads the clock with a full fence on either side of the read.
    static clock_type::time_point now() {
        // Prevent memory operations from reordering across the
        // time measurement. This is likely overkill, needs more
        // research to determine the correct fencing.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        const auto t = clock_type::now();
        std::atomic_thread_fence(std::memory_order_seq_cst);
        return t;
    }

public:
    // Starts timing immediately.
    timer() : start(now()) {}

    // Time since construction, in the clock's native duration type.
    clock_type::duration elapsed() const {
        return now() - start;
    }

    // Time since construction converted to Duration with duration_cast and
    // returned as a raw tick count, e.g. elapsed<std::chrono::milliseconds>().
    template <typename Duration>
    typename Duration::rep elapsed() const {
        return std::chrono::duration_cast<Duration>(elapsed()).count();
    }

    // Same as above, with the target duration spelled as its representation
    // and period, e.g. elapsed<double, std::milli>().
    template <typename Rep, typename Period>
    Rep elapsed() const {
        return elapsed<std::chrono::duration<Rep, Period>>();
    }
};
// "Local static" accessor under test: hands out a reference to a vector
// {1, 2, 3} stored as a function-local static, constructed on the first call.
const std::vector<int>& f() {
    static const std::vector<int> values{ 1, 2, 3 };
    return values;
}
// Namespace-scope vector {1, 2, 3}; the baseline the local-static version is
// compared against.
static const std::vector<int> y{ 1, 2, 3 };

// "Global static" accessor under test: simply returns a reference to y.
const std::vector<int>& g() {
    return y;
}
// Number of timed calls made in each benchmark run.
const unsigned long long n_iterations = 500000000;

// Times n_iterations calls of f and prints "<name>: <elapsed> ms (<sum>)".
//
//   name  label printed in front of the result
//   f     callable taking no arguments; its result must support operator[]
//         and begin()/end() (the callers here return const std::vector<int>&)
//
// Each iteration sums the returned container, and the running total is
// printed at the end so that the work done in the loop has an observable
// result.
template <typename F>
void test_one(const char* name, F f) {
    f(); // First call outside the timer.

    // Element type of the returned container, stripped of const/reference.
    using value_type = typename std::decay<decltype(f()[0])>::type;

    // Flush the label so it is visible while the loop is still running.
    std::cout << name << ": " << std::flush;

    const auto t = timer{};
    // <cstdint> only guarantees the std::-qualified name; the unqualified
    // global uint64_t may or may not be declared.
    std::uint64_t sum = 0;
    for (auto i = n_iterations; i > 0; --i) {
        const auto& vec = f();
        sum += std::accumulate(begin(vec), end(vec), value_type{});
    }
    const auto elapsed = t.elapsed<std::chrono::milliseconds>();

    std::cout << elapsed << " ms (" << sum << ")\n";
}
} // anonymous namespace
// Runs the benchmark once per accessor: first the function-local static,
// then the namespace-scope one.
int main() {
    test_one("local static", f);
    test_one("global static", g);
    return 0;
}
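Running at Coliru, the local version does 5e8 iterations in 4618 ms, the global version in 4392 ms. So yes, the local version is slower by approximately 0.452 nanoseconds per iteration. Although there's a measurable difference, it's too small to impact observed performance in most situations. The second program below repeats the measurement with function objects (classes with an `operator()`) in place of the plain functions, so that each accessor is passed to `test_one` as its own type rather than as a function pointer: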
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>
namespace {
// Minimal stopwatch: captures a time point on construction and reports how
// much time has passed since then.
class timer {
    // steady_clock is monotonic, so a measured interval cannot go negative
    // or jump if the wall clock is adjusted. high_resolution_clock is allowed
    // to be an alias for the non-steady system_clock, which makes it a poor
    // choice for interval timing.
    using clock_type = std::chrono::steady_clock;

    clock_type::time_point start;

    // Reads the clock with a full fence on either side of the read.
    static clock_type::time_point now() {
        // Prevent memory operations from reordering across the
        // time measurement. This is likely overkill.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        const auto t = clock_type::now();
        std::atomic_thread_fence(std::memory_order_seq_cst);
        return t;
    }

public:
    // Starts timing immediately.
    timer() : start(now()) {}

    // Time since construction, in the clock's native duration type.
    clock_type::duration elapsed() const {
        return now() - start;
    }

    // Time since construction converted to Duration with duration_cast and
    // returned as a raw tick count, e.g. elapsed<std::chrono::milliseconds>().
    template <typename Duration>
    typename Duration::rep elapsed() const {
        return std::chrono::duration_cast<Duration>(elapsed()).count();
    }

    // Same as above, with the target duration spelled as its representation
    // and period, e.g. elapsed<double, std::milli>().
    template <typename Rep, typename Period>
    Rep elapsed() const {
        return elapsed<std::chrono::duration<Rep, Period>>();
    }
};
// Function object for the "local static" case: calling it returns a
// reference to a vector {1, 2, 3} held in a function-local static inside
// operator().
class f {
public:
    // const-qualified because the call touches no instance state; this lets
    // const instances of the functor be invoked as well.
    const std::vector<int>& operator()() const {
        static const std::vector<int> values{ 1, 2, 3 };
        return values;
    }
};
// Function object for the "global static" case: calling it returns a
// reference to the class's static data member x, which is defined at
// namespace scope right after the class.
class g {
    static const std::vector<int> x;

public:
    // const-qualified because the call touches no instance state; this lets
    // const instances of the functor be invoked as well.
    const std::vector<int>& operator()() const {
        return x;
    }
};

// Out-of-class definition of the static member returned by g::operator().
const std::vector<int> g::x{ 1, 2, 3 };
// Number of timed calls made in each benchmark run.
const unsigned long long n_iterations = 500000000;

// Times n_iterations calls of f and prints "<name>: <elapsed> ms (<sum>)".
//
//   name  label printed in front of the result
//   f     callable taking no arguments; its result must support operator[]
//         and begin()/end() (the callers here return const std::vector<int>&)
//
// Each iteration sums the returned container, and the running total is
// printed at the end so that the work done in the loop has an observable
// result.
template <typename F>
void test_one(const char* name, F f) {
    f(); // First call outside the timer.

    // Element type of the returned container, stripped of const/reference.
    using value_type = typename std::decay<decltype(f()[0])>::type;

    // Flush the label so it is visible while the loop is still running.
    std::cout << name << ": " << std::flush;

    const auto t = timer{};
    // <cstdint> only guarantees the std::-qualified name; the unqualified
    // global uint64_t may or may not be declared.
    std::uint64_t sum = 0;
    for (auto i = n_iterations; i > 0; --i) {
        const auto& vec = f();
        sum += std::accumulate(begin(vec), end(vec), value_type{});
    }
    const auto elapsed = t.elapsed<std::chrono::milliseconds>();

    std::cout << elapsed << " ms (" << sum << ")\n";
}
} // anonymous namespace
// Runs the benchmark once per function object: first the one backed by a
// function-local static, then the one backed by a static data member.
int main() {
    test_one("local static", f{});
    test_one("global static", g{});
    return 0;
}
Not surprisingly, runtimes were faster under both g++ (3803ms local, 2323ms global) and clang (4183ms local, 3253ms global). The results affirm our intuition that the global technique should be faster than the local, with deltas of 2.96 nanoseconds (g++) and 1.86 nanoseconds (clang) per iteration.
Yes, there will be a cost to check whether the object has been initialised. This would typically test an atomic Boolean variable, rather than lock a mutex.
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With