Why is `std::visit` so inefficient?

I found that using std::variant + std::visit is inefficient (the performance is almost the same as virtual function calls), but when I replace std::visit with dispatch on x.index(), the performance is as good as CRTP.

My code:

#include <benchmark/benchmark.h>
#include <memory>
#include <vector>
#include <variant>
#include <cmath>

namespace variant_shapes {
    // Circle implementation - no virtual functions
    class Circle {
    private:
        double radius_;
    public:
        explicit Circle(double radius) : radius_(radius) {}
        
        double area() const {
            return M_PI * radius_ * radius_;
        }
        
        double getRadius() const { return radius_; }
    };

    // Rectangle implementation - no virtual functions
    class Rectangle {
    private:
        double width_;
        double height_;
    public:
        Rectangle(double width, double height) : width_(width), height_(height) {}
        
        double area() const {
            return width_ * height_;
        }
        
        double getWidth() const { return width_; }
        double getHeight() const { return height_; }
    };
}

// Add a benchmark that uses an if constexpr visitor directly
static void BM_StaticDispatch(benchmark::State& state) {
    std::vector<std::variant<variant_shapes::Circle, variant_shapes::Rectangle>> shapes;
    shapes.reserve(1000);

    for (int i = 0; i < 1000; ++i) {
        if (i % 2 == 0) {
            shapes.emplace_back(variant_shapes::Circle(5.0));
        } else {
            shapes.emplace_back(variant_shapes::Rectangle(4.0, 6.0));
        }
    }
    auto visitor = [](auto &&arg) -> double
    {
        using T = std::decay_t<decltype(arg)>;

        if constexpr (std::is_same_v<T, variant_shapes::Circle>)
        {
            return arg.area();
        }
        else if constexpr (std::is_same_v<T, variant_shapes::Rectangle>)
        {
            return arg.area();
        }
        // both alternatives of the variant are handled above
    };

    for (auto _ : state) {
        double totalArea = 0.0;
        for (const auto& s : shapes) {
            totalArea += std::visit(visitor, s);
        }
        benchmark::DoNotOptimize(totalArea);
    }
}
BENCHMARK(BM_StaticDispatch);

// std::variant + std::visit benchmark
// Static dispatch function using if constexpr
template<typename T>
double get_area(const T& shape) {
    if constexpr (std::is_same_v<T, variant_shapes::Circle>) {
        return shape.area();
    } else if constexpr (std::is_same_v<T, variant_shapes::Rectangle>) {
        return shape.area();
    }
    return 0.0; // should never be reached
}
static void BM_StaticDispatch2(benchmark::State& state) {
    std::vector<std::variant<variant_shapes::Circle, variant_shapes::Rectangle>> shapes;
    shapes.reserve(1000);

    for (int i = 0; i < 1000; ++i) {
        if (i % 2 == 0) {
            shapes.emplace_back(variant_shapes::Circle(5.0));
        } else {
            shapes.emplace_back(variant_shapes::Rectangle(4.0, 6.0));
        }
    }
    // note: this visitor is unused below; the loop dispatches through get_area
    auto visitor = [](auto &&arg) -> double
    {
        using T = std::decay_t<decltype(arg)>;

        if constexpr (std::is_same_v<T, variant_shapes::Circle>)
        {
            return arg.area();
        }
        else if constexpr (std::is_same_v<T, variant_shapes::Rectangle>)
        {
            return arg.area();
        }
        // both alternatives of the variant are handled above
    };

    for (auto _ : state) {
        double totalArea = 0.0;
        for (const auto& s : shapes) {
            totalArea += s.index() == 0
                        ? get_area(std::get<0>(s))
                        : get_area(std::get<1>(s));
        }
        benchmark::DoNotOptimize(totalArea);
    }
}
BENCHMARK(BM_StaticDispatch2);

Performance test link: Quick C++ Benchmark.

asked Dec 21 '25 at 22:12 by yangfeng wang


2 Answers

What you are seeing is just a bad optimization that clang performed on this exact code, which made the std::visit version slower; both gcc and clang are able to inline both versions without a jump table, and in gcc's case both functions take the same time.

Overall this example is heavily influenced by the CPU and by the compilers' optimization choices for this exact code, and it shouldn't be taken as a general statement about std::variant performance. It is still nice to know, however, that both gcc and clang can inline std::visit with two alternatives.

For clang, the std::visit version's assembly is as follows (online compiler):

.LBB0_27:
        mov     qword ptr [rsp + 8], 0
        xorpd   xmm1, xmm1
        mov     rax, rbx
        jmp     .LBB0_28
.LBB0_30:
        mulsd   xmm2, xmm3 // xmm2 *= xmm3
        addsd   xmm1, xmm2 // totalArea += xmm2
        movsd   qword ptr [rsp + 8], xmm1
        add     rax, 24
        cmp     rax, r12
        je      .LBB0_31
.LBB0_28:
        cmp     byte ptr [rax + 16], 0
        movsd   xmm2, qword ptr [rax] // xmm2 = radius
        movapd  xmm3, xmm2 // xmm3 = radius
        mulsd   xmm3, xmm0 // xmm3 *= M_PI
        je      .LBB0_30
        movsd   xmm3, qword ptr [rax + 8] // xmm3 = height
        jmp     .LBB0_30

which roughly translates to

test_flag = index == 0;
xmm2 = radius_or_width;   // load from offset 0 (radius for Circle, width for Rectangle)
xmm3 = xmm2 * M_PI;       // always evaluated, even for Rectangle
if (!test_flag)
{
  xmm3 = height;          // overwrite for Rectangle
}
totalArea += xmm2 * xmm3; // radius*(radius*M_PI) or width*height

Even if the CPU could predict that branch, it still evaluates radius * M_PI unconditionally, and the cost of that extra dependent multiply could be drastically different between CPUs.


whereas the ternary-operator version turned into the following:

.LBB1_28:
        mov     qword ptr [rsp + 8], 0
        xorpd   xmm1, xmm1
        mov     rax, rbx
        jmp     .LBB1_29
.LBB1_56:
        movsd   xmm2, qword ptr [rax] // xmm2 = width;
        mulsd   xmm2, qword ptr [rax + 8] // xmm2 *= height;
.LBB1_57:
        addsd   xmm1, xmm2 // totalArea += xmm2
        movsd   qword ptr [rsp + 8], xmm1
        add     rax, 24
        cmp     rax, r12
        je      .LBB1_52
.LBB1_29:
        movzx   ecx, byte ptr [rax + 16]
        cmp     ecx, 1
        je      .LBB1_56
        test    ecx, ecx 
        jne     .LBB1_54 // bad_access
        movsd   xmm3, qword ptr [rax]
        movapd  xmm2, xmm3 // xmm2 = radius;
        mulsd   xmm2, xmm0 // xmm2 *= M_PI
        mulsd   xmm2, xmm3 // xmm2 *= radius;
        jmp     .LBB1_57

which roughly translates to

test_flag = index == 0;
if (test_flag)
{
  xmm2 = M_PI * radius * radius
}
else
{
  xmm2 = width * height;
}
totalArea += xmm2;

which the CPU can easily predict; modern CPUs can in fact predict an alternating 0101... pattern.
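One way to put that claim to the test (a minimal sketch; shuffle_shapes is a hypothetical helper, not part of the question's benchmark) is to randomize the order of the shapes so the alternating pattern disappears and the branch on s.index() starts mispredicting about half the time:

#include <algorithm>
#include <random>
#include <variant>
#include <vector>

// Hypothetical helper: destroy the predictable 0101... pattern by
// shuffling the vector with a fixed seed (so runs stay comparable).
template <typename Variant>
void shuffle_shapes(std::vector<Variant>& shapes) {
    std::mt19937 rng{42};
    std::shuffle(shapes.begin(), shapes.end(), rng);
}

Calling shuffle_shapes(shapes) right after the fill loop in either benchmark should make the branch-prediction effect show up (or vanish) directly in the timings.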


Lastly, gcc optimizes both versions to the following:

.L30:
        movsd   xmm3, QWORD PTR .LC4[rip]
        mov     rax, rbp
        pxor    xmm1, xmm1
        jmp     .L28
.L68:
        movsd   xmm2, QWORD PTR [rax]
        add     rax, 24
        movapd  xmm0, xmm2 // xmm0 = radius
        mulsd   xmm0, xmm3 // xmm0 *= M_PI
        mulsd   xmm0, xmm2 // xmm0 *= radius
        addsd   xmm1, xmm0 // result += xmm0
        cmp     r15, rax
        je      .L24
.L28:
        movzx   ecx, BYTE PTR [rax+16]
        test    cl, cl
        je      .L68
        cmp     cl, 1
        jne     .L63 // bad_access
        movsd   xmm0, QWORD PTR [rax] // xmm0 = width
        mulsd   xmm0, QWORD PTR [rax+8] // xmm0 *= height
        add     rax, 24
        addsd   xmm1, xmm0 // totalArea += xmm0
        cmp     r15, rax
        jne     .L28

which roughly translates to

if (index == 0)
{
  totalArea += M_PI * radius * radius;
}
else
{
  totalArea += width * height;
}

which the CPU can also predict.

answered Dec 24 '25 at 11:12 by Ahmed AEK


It is partially due to the standard's complexity requirement:

When the number of variants is zero or one, the invocation of the callable object is implemented in constant time; i.e., it does not depend on the number of types that can be stored in the variant.

So the implementation cannot chain checks like

if (s.index() == 0) return get_area(std::get<0>(s));
else if (s.index() == 1) return get_area(std::get<1>(s));
// ...

as that would be O(n) in the number of alternatives.

So the possible implementations are a switch, or a table of function pointers similar to a vtable.
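To illustrate the table approach (a minimal sketch for the two-alternative variant from the question; area_entry, dispatch_table, and visit_area are names made up here, not what libstdc++ or libc++ actually use), a conforming O(1) dispatch can index an array of function pointers by index():

#include <array>
#include <variant>

using Shape = std::variant<variant_shapes::Circle, variant_shapes::Rectangle>;

// One stamped-out entry per alternative, like the slots of a vtable.
template <std::size_t I>
double area_entry(const Shape& s) {
    return get_area(std::get<I>(s)); // reuses get_area from the question
}

// Constant-time dispatch: index() selects the slot, no if/else chain.
double visit_area(const Shape& s) {
    static constexpr std::array<double (*)(const Shape&), 2> dispatch_table{
        &area_entry<0>, &area_entry<1>};
    return dispatch_table[s.index()](s);
}

A real implementation must also reject a valueless_by_exception variant before indexing, but the indirect call through the table is the essential shape of the technique.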

Jump prediction (the indirect call through this "vtable") is generally worse than branch prediction.

answered Dec 24 '25 at 11:12 by Jarod42