Please consider the following piece of code:
// Invoked by method() when method_impl() reports false; defined elsewhere.
void error_handling();
// Does the real work; method() treats a false result as the error case.
bool method_impl();

// Runs method_impl() and forwards its result, calling error_handling()
// first when that result is false.
bool method()
{
    const bool succeeded = method_impl();
    if (succeeded) {
        return true;
    }
    error_handling();
    return false;
}
I know method_impl()
will return true
99.999% (yes, three decimal places) of the time, but my compiler doesn't. method()
is partially critical in terms of time consumption.
Should I rewrite method()
(and make it less readable) to ensure a jump may only occur when method_impl()
returns false
? If yes, how?

Following other answers' suggestions, I benchmarked the solutions. If you consider upvoting this answer, please upvote the others too.
#include <iostream>
#include <iomanip>
#include <string>
// solutions
#include <ctime>
// benchmark
#include <limits>
#include <random>
#include <chrono>
#include <algorithm>
#include <functional>
//
// Solutions
//
namespace
{
    // Value method_impl() compares the current time against. It is volatile so
    // that every call performs a real load and the comparison cannot be folded
    // into a constant at compile time.
    volatile std::time_t near_futur = -1;

    // Error path shared by every variant below: writes "error\n" to std::cerr.
    void error_handling() { std::cerr << "error\n"; }

    // Work function wrapped by every variant. Returns true whenever the
    // current calendar time differs from near_futur.
    bool method_impl() { return std::time(nullptr) != near_futur; }

    // NOTE: the four variants below are intentionally kept in their exact
    // control-flow shape, since that shape is what the benchmark compares.

    // Baseline: plain test for failure, no compiler hint.
    bool method_no_builtin()
    {
        const bool res = method_impl();
        if (res == false) {
            error_handling();
            return false;
        }
        return true;
    }

    // Same as the baseline, but tells GCC that `res` is expected to be 1
    // (true), i.e. that the error branch is the unlikely one.
    bool method_builtin()
    {
        const bool res = method_impl();
        if (__builtin_expect(res, 1) == false) {
            error_handling();
            return false;
        }
        return true;
    }

    // Same as method_builtin(), but with the expected value set to 0 (false),
    // which marks the error branch as the likely one. As the name says, this
    // is the wrong hint; it is kept to measure the cost of a bad hint.
    bool method_builtin_incorrect()
    {
        const bool res = method_impl();
        if (__builtin_expect(res, 0) == false) {
            error_handling();
            return false;
        }
        return true;
    }

    // No hint; the success case is written first and the error case sits in
    // the else branch.
    bool method_rewritten()
    {
        const bool res = method_impl();
        if (res == true) {
            return true;
        } else {
            error_handling();
            return false;
        }
    }
}
//
// benchmark
//
// Number of loop iterations executed by each benchmark section.
constexpr std::size_t BENCHSIZE = 10'000'000;

// Small stopwatch built on std::chrono::steady_clock. The start instant is
// captured at construction; end<Unit>() reports the time elapsed since then.
class Clock
{
public:
    // Current reading of the steady clock.
    static inline std::chrono::time_point<std::chrono::steady_clock> now()
    {
        return std::chrono::steady_clock::now();
    }

    // Starts timing immediately.
    Clock() : _start(now())
    {
    }

    // Elapsed time since construction, converted to DurationUnit (e.g.
    // std::chrono::nanoseconds) and returned as a tick count. Can be called
    // repeatedly; the start instant is never reset.
    template<class DurationUnit>
    std::size_t end()
    {
        const auto elapsed = now() - _start;
        return std::chrono::duration_cast<DurationUnit>(elapsed).count();
    }

private:
    std::chrono::time_point<std::chrono::steady_clock> _start;
};
//
// Entry point
//
// Benchmarks the four method_* variants one after another and prints one
// result line per variant to std::cout.
//
// Each section runs BENCHSIZE loop iterations, and every iteration makes ten
// back-to-back calls to the variant under test. The elapsed nanoseconds are
// divided by BENCHSIZE (the iteration count), so each printed figure is the
// average time of one iteration, i.e. of ten calls, not of a single call.
int main()
{
// --- Section 1: method_no_builtin() ---
{
// Timing starts here, when the Clock is constructed.
Clock clock;
// Accumulates the return values of the calls; not read after the loop.
bool result = true;
for (std::size_t i = 0 ; i < BENCHSIZE ; ++i)
{
result &= method_no_builtin();
result &= method_no_builtin();
result &= method_no_builtin();
result &= method_no_builtin();
result &= method_no_builtin();
result &= method_no_builtin();
result &= method_no_builtin();
result &= method_no_builtin();
result &= method_no_builtin();
result &= method_no_builtin();
}
// Average nanoseconds per loop iteration (ten calls).
const double unit_time = clock.end<std::chrono::nanoseconds>() / static_cast<double>(BENCHSIZE);
std::cout << std::setw(40) << "method_no_builtin(): " << std::setprecision(3) << unit_time << " ns\n";
}
// --- Section 2: method_builtin() ---
{
Clock clock;
bool result = true;
for (std::size_t i = 0 ; i < BENCHSIZE ; ++i)
{
result &= method_builtin();
result &= method_builtin();
result &= method_builtin();
result &= method_builtin();
result &= method_builtin();
result &= method_builtin();
result &= method_builtin();
result &= method_builtin();
result &= method_builtin();
result &= method_builtin();
}
// Average nanoseconds per loop iteration (ten calls).
const double unit_time = clock.end<std::chrono::nanoseconds>() / static_cast<double>(BENCHSIZE);
std::cout << std::setw(40) << "method_builtin(): " << std::setprecision(3) << unit_time << " ns\n";
}
// --- Section 3: method_builtin_incorrect() ---
{
Clock clock;
bool result = true;
for (std::size_t i = 0 ; i < BENCHSIZE ; ++i)
{
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
result &= method_builtin_incorrect();
}
// Average nanoseconds per loop iteration (ten calls).
const double unit_time = clock.end<std::chrono::nanoseconds>() / static_cast<double>(BENCHSIZE);
std::cout << std::setw(40) << "method_builtin_incorrect(): " << std::setprecision(3) << unit_time << " ns\n";
}
// --- Section 4: method_rewritten() ---
{
Clock clock;
bool result = true;
for (std::size_t i = 0 ; i < BENCHSIZE ; ++i)
{
result &= method_rewritten();
result &= method_rewritten();
result &= method_rewritten();
result &= method_rewritten();
result &= method_rewritten();
result &= method_rewritten();
result &= method_rewritten();
result &= method_rewritten();
result &= method_rewritten();
result &= method_rewritten();
}
// Average nanoseconds per loop iteration (ten calls).
const double unit_time = clock.end<std::chrono::nanoseconds>() / static_cast<double>(BENCHSIZE);
std::cout << std::setw(40) << "method_rewritten(): " << std::setprecision(3) << unit_time << " ns\n";
}
}
g++ -std=c++14 -O2 -Wall -Wextra -Werror main.cpp
method_no_builtin(): 42.8 ns
method_builtin(): 44.4 ns
method_builtin_incorrect(): 51.4 ns
method_rewritten(): 39.3 ns
Demo
g++ -std=c++14 -O3 -Wall -Wextra -Werror main.cpp
method_no_builtin(): 32.3 ns
method_builtin(): 31.1 ns
method_builtin_incorrect(): 35.6 ns
method_rewritten(): 30.5 ns
Demo
The differences between those optimizations are too small to come to any conclusion other than: if there is a performance gain to be found in optimizing a branch for a known more common path, this gain is too small to be worth the trouble and the loss in readability.
You could hint to the compiler that method_impl()
will return true:
// Error hook and work function; both are only declared here.
void error_handling();
bool method_impl();

// Runs method_impl() and forwards its result, calling error_handling() first
// when that result is false.
bool method()
{
    const bool res = method_impl();
    // The second argument of __builtin_expect is the value `res` is expected
    // to have. It must be 1 (true) to mark the success path as the likely
    // one; passing 0 would tell GCC that the error branch is the common case.
    if (__builtin_expect(res, 1) == false) {
        error_handling();
        return false;
    }
    return true;
}
This will work in GCC.
The underlying hardware already performs this optimization. It will "fail" to predict the branch the first few times, but after that it will pick the correct option (see en.wikipedia.org/wiki/Branch_predictor).
You can try applying the GCC extension and check whether it is faster with it or not, but I think you will barely see any difference with or without it. Branch prediction is always applied; it is not something you enable.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With