The way std::variant dispatches to different visitor methods when std::visit is called is pretty reasonable when the variant alternatives are completely different types. Essentially, a visitor-specific vtable is built at compile time, and after some error checking1 the appropriate visitor function is looked up by indexing the table based on the current index(), which resolves to something like an indirect jump on most platforms.
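Conceptually, the dispatch amounts to something like the following hand-written sketch (my simplified illustration, hardcoded to two alternatives and an int-returning visitor, with the error check omitted - not the actual library code):

#include <variant>

// Simplified sketch of the table-based dispatch (not the real library code):
// one thunk per alternative, selected by indexing with index().
template <typename Visitor, typename Variant>
int visit_sketch(Visitor&& vis, Variant& var) {
    using Fn = int (*)(Visitor&, Variant&);
    static constexpr Fn table[] = {
        [](Visitor& v, Variant& va) { return v(std::get<0>(va)); },
        [](Visitor& v, Variant& va) { return v(std::get<1>(va)); },
    };
    // valueless_by_exception check omitted for brevity
    return table[var.index()](vis, var);  // indirect call through the table
}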
If the alternatives share a common base class, however, calling a (non-virtual) member function or accessing state on the base class with a visitor is conceptually much simpler: you are always calling the same method and usually using the same pointer2 to the base class.
Still, the implementation ends up just as slow. For example:
#include <variant>

struct Base {
    int m_base;
    int getBaseMember() { return m_base; }
};

struct Foo : public Base {
    int m_foo;
};

struct Bar : public Base {
    int m_bar;
};

using Foobar = std::variant<Foo, Bar>;

int getBaseMemVariant(Foobar& v) {
    return std::visit([](auto&& e){ return e.getBaseMember(); }, v);
}
The generated code on x86 for the most recent versions of gcc and clang is similar3 (clang shown):
getBaseMemVariant(std::__1::variant<Foo, Bar>&): # @getBaseMemVariant(std::__1::variant<Foo, Bar>&)
sub rsp, 24
mov rax, rdi
mov ecx, dword ptr [rax + 8]
mov edx, 4294967295
cmp rcx, rdx
je .LBB0_2
lea rdx, [rsp + 8]
mov qword ptr [rsp + 16], rdx
lea rdi, [rsp + 16]
mov rsi, rax
call qword ptr [8*rcx + decltype(auto) std::__1::__variant_detail::__visitation::__base::__visit_alt<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>, std::__1::__variant_detail::__impl<Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__impl<Foo, Bar>&)::__fmatrix]
add rsp, 24
ret
.LBB0_2:
mov edi, 8
call __cxa_allocate_exception
mov qword ptr [rax], vtable for std::bad_variant_access+16
mov esi, typeinfo for std::bad_variant_access
mov edx, std::exception::~exception()
mov rdi, rax
call __cxa_throw
decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&): # @"decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)"
mov eax, dword ptr [rsi]
ret
decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&): # @"decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)"
mov eax, dword ptr [rsi]
ret
decltype(auto) std::__1::__variant_detail::__visitation::__base::__visit_alt<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>, std::__1::__variant_detail::__impl<Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__impl<Foo, Bar>&)::__fmatrix:
.quad decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)
.quad decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)
The call qword ptr [8*rcx + ... is the actual indirect call to a function pointed to by the vtable (the vtable itself appears at the bottom of the listing). The code before that first checks the "is empty" state and then sets up the visit call (I'm not sure what the weirdness with rdi is; I guess it's setting up a pointer to the visitor as the first argument or something).
The actual methods pointed to by the vtable and executed by the call are very simple: a single mov to read the member. Critically, both are identical:
mov eax, dword ptr [rsi]
ret
So we have a huge mess. To execute that single mov we have a dozen setup instructions and, more importantly, an indirect branch, which will mispredict very badly if we process a series of Foobar variant objects holding different alternatives. Finally, the indirect call seems like an insurmountable barrier to further optimization: here we are looking at a simple call without any surrounding context, but in real use this might be inlined into a larger function with significant opportunities for further optimization - and I think the indirect call will block them.
You can play with the code yourself on godbolt.
The slowness isn't inherent: here's a very simple "discriminated union" struct that combines the two classes in a union, along with an isFoo discriminator that keeps track of which class is contained:
struct FoobarUnion {
    bool isFoo;
    union {
        Foo foo;
        Bar bar;
    };
    Base *asBase() { return isFoo ? (Base *)&foo : &bar; }
};

int getBaseMemUnion(FoobarUnion& v) {
    return v.asBase()->getBaseMember();
}
The corresponding getBaseMemUnion function compiles down to a single mov instruction on both gcc and clang:
getBaseMemUnion(FoobarUnion&): # @getBaseMemUnion(FoobarUnion&)
mov eax, dword ptr [rdi + 4]
ret
Granted, the discriminated union doesn't have to check the "is valueless" error condition, but that's not the main reason for the variant slowness, and in any case such a condition is impossible with Foo and Bar since none of their constructors throw4. Even if you wanted to support such a state, the resulting function with the union is still very efficient - only a small check is added, but the behavior of calling the base class is the same.
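For example, here's a sketch of what a valueless-aware version of the union might look like (FoobarUnionV is my invention for illustration, not part of the original example):

#include <variant>  // for std::bad_variant_access

// Hypothetical valueless-aware version of the hand-rolled union
// (for illustration only).
struct FoobarUnionV {
    signed char state;  // 0 = Foo, 1 = Bar, -1 = valueless
    union {
        Foo foo;
        Bar bar;
    };
    Base *asBase() { return state == 0 ? (Base *)&foo : (Base *)&bar; }
};

int getBaseMemUnionV(FoobarUnionV& v) {
    if (v.state < 0)
        throw std::bad_variant_access();  // the only addition: one predictable branch
    return v.asBase()->getBaseMember();
}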
Is there anything I can do to make this use of variant efficient in this case of calling a common base class function, or is the promise of a zero-cost abstraction just not panning out here? I'm open to a different call pattern, compiler options, whatever.
1 In particular, checking whether the variant is valueless_by_exception due to an earlier assignment that failed.
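For example (my illustration, not from the original post), a variant can end up in this state when constructing the new alternative throws partway through a type-changing emplace or assignment:

#include <stdexcept>
#include <variant>

struct Throws {
    Throws() { throw std::runtime_error("boom"); }
};

void makeValueless(std::variant<int, Throws>& v) {
    try {
        v.emplace<Throws>();  // the old int is destroyed, the new value never finishes construction
    } catch (const std::runtime_error&) {
    }
    // v.valueless_by_exception() is now true on typical implementations
}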
2 The pointer to the base class doesn't always have the same relationship to the most derived pointer for all alternatives, e.g., when multiple inheritance is involved.
3 Well, gcc is a bit worse since it seems to redundantly perform the "is valueless" check both upfront, prior to the call to visit, and also in each auto-generated method pointed to by the vtable; clang only does it upfront. Keep in mind that when I say "gcc" I really mean "gcc with libstdc++", while "clang" really means "clang with libc++". Some differences, like the redundant index() check in the generated visitor functions, are likely due to library differences rather than compiler optimization differences.
4 If the valueless state is problematic, one could also consider something like strict_variant, which never has an empty state but still uses local storage if the move constructor cannot throw.
For what it's worth, a totally handrolled visit with a switch does pretty well:
#include <cstddef>
#include <type_traits>
#include <utility>
#include <variant>

// use a code generator to write out all of these
// (one overload per variant size; the integral_constant tag selects the right one)
template <typename F, typename V>
auto custom_visit(F f, V&& v, std::integral_constant<std::size_t, 2>)
{
    switch (v.index()) {
    case 0: return f(std::get<0>(std::forward<V>(v)));
    case 1: return f(std::get<1>(std::forward<V>(v)));
#ifdef VALUELESS
    case std::variant_npos: {
        // throw from a cold, noinline lambda so the hot path stays small;
        // the lambda never returns, so no break is needed
        []() [[gnu::cold, gnu::noinline]] {
            throw std::bad_variant_access();
        }();
    }
#endif
    }
    __builtin_unreachable();
}

template <typename F, typename V>
auto custom_visit(F f, V&& v) {
    return custom_visit(f, std::forward<V>(v),
                        std::variant_size<std::decay_t<V>>{});
}
Which you'd use like:
int getBaseMemVariant2(Foobar& v) {
    return custom_visit([](Base& b){ return &b; }, v)->getBaseMember();
}
With VALUELESS defined, this emits:
getBaseMemVariant2(std::variant<Foo, Bar>&):
movzx eax, BYTE PTR [rdi+8]
cmp al, -1
je .L27
cmp al, 1
ja .L28
mov eax, DWORD PTR [rdi]
ret
.L27:
sub rsp, 8
call auto custom_visit<getBaseMemVariant2(std::variant<Foo, Bar>&)::{lambda(Base&)#1}, std::variant<Foo, Bar>&>(getBaseMemVariant2(std::variant<Foo, Bar>&)::{lambda(Base&)#1}, std::variant<Foo, Bar>&, std::integral_constant<unsigned long, 2ul>)::{lambda()#1}::operator()() const [clone .isra.1]
Which is pretty good. Without VALUELESS, this emits:
getBaseMemVariant2(std::variant<Foo, Bar>&):
mov eax, DWORD PTR [rdi]
ret
as desired.
I don't really know what conclusion, if any, to draw from this. Clearly, there's hope?
I am not competent enough to perform an assembly-level analysis, but I have written a small wrapper around std::variant explicitly for handling variants where all alternatives inherit from a common base class. Inside, I am essentially obtaining a pointer to where the variant stores its contents and then using this as a normal pointer to a base class. So once my new variant is created, I expect the actual function call to have about the same overhead as a regular virtual function call on a base class pointer.
pv::polymorphic_value< Base, Foo, Bar > variant;
variant->getBaseMember();
The library is freely available at https://github.com/Krzmbrzl/polymorphic_variant
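For anyone curious what the general idea might look like, here is a minimal sketch (my simplified illustration, which caches the base pointer via a one-time std::visit - not the actual implementation of polymorphic_variant, which obtains a pointer to the variant's storage directly as described above; re-assigning a different alternative is not supported in this sketch):

#include <utility>
#include <variant>

// Minimal sketch of the idea (illustration only): all alternatives derive
// from Base, so a Base* to the active alternative can be cached once at
// construction and used for every subsequent call.
template <typename Base, typename... Alternatives>
class base_variant {
    std::variant<Alternatives...> m_v;
    Base* m_base;  // points at the base subobject of the active alternative
public:
    base_variant(std::variant<Alternatives...> v)
        : m_v(std::move(v)),
          m_base(std::visit([](auto& alt) -> Base* { return &alt; }, m_v)) {}

    Base* operator->() { return m_base; }
};

// possible usage:
//   base_variant<Base, Foo, Bar> v{Foo{}};
//   v->getBaseMember();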