The way std::variant dispatches to different visitor methods when std::visit is called is pretty reasonable when the variant alternatives are completely different types. Essentially, a visitor-specific vtable is built at compile time, and after some error checking1 the appropriate visitor function is looked up by indexing the table based on the current index(), which resolves to something like an indirect jump on most platforms.
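Conceptually, the dispatch amounts to something like the following hand-written sketch (my simplified illustration, hardcoded to two alternatives and an int-returning visitor, with the error check omitted - not the actual library code):

#include <variant>

// Simplified sketch of the table-based dispatch (not the real library code):
// one thunk per alternative, selected by indexing with index().
template <typename Visitor, typename Variant>
int visit_sketch(Visitor&& vis, Variant& var) {
    using Fn = int (*)(Visitor&, Variant&);
    static constexpr Fn table[] = {
        [](Visitor& v, Variant& va) { return v(std::get<0>(va)); },
        [](Visitor& v, Variant& va) { return v(std::get<1>(va)); },
    };
    // valueless_by_exception check omitted for brevity
    return table[var.index()](vis, var);  // indirect call through the table
}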
If the alternatives share a common base class, however, calling a (non-virtual) member function or accessing state on the base class with a visitor is conceptually much simpler: you are always calling the same method and usually using the same pointer2 to the base class.
Still, the implementation ends up just as slow. For example:
#include <variant>

struct Base {
    int m_base;
    int getBaseMember() { return m_base; }
};

struct Foo : public Base {
    int m_foo;
};

struct Bar : public Base {
    int m_bar;
};

using Foobar = std::variant<Foo, Bar>;

int getBaseMemVariant(Foobar& v) {
    return std::visit([](auto&& e){ return e.getBaseMember(); }, v);
}
The generated code on x86 for the most recent versions of gcc and clang is similar3 (clang shown):
getBaseMemVariant(std::__1::variant<Foo, Bar>&): # @getBaseMemVariant(std::__1::variant<Foo, Bar>&)
sub rsp, 24
mov rax, rdi
mov ecx, dword ptr [rax + 8]
mov edx, 4294967295
cmp rcx, rdx
je .LBB0_2
lea rdx, [rsp + 8]
mov qword ptr [rsp + 16], rdx
lea rdi, [rsp + 16]
mov rsi, rax
call qword ptr [8*rcx + decltype(auto) std::__1::__variant_detail::__visitation::__base::__visit_alt<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>, std::__1::__variant_detail::__impl<Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__impl<Foo, Bar>&)::__fmatrix]
add rsp, 24
ret
.LBB0_2:
mov edi, 8
call __cxa_allocate_exception
mov qword ptr [rax], vtable for std::bad_variant_access+16
mov esi, typeinfo for std::bad_variant_access
mov edx, std::exception::~exception()
mov rdi, rax
call __cxa_throw
decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&): # @"decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)"
mov eax, dword ptr [rsi]
ret
decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&): # @"decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)"
mov eax, dword ptr [rsi]
ret
decltype(auto) std::__1::__variant_detail::__visitation::__base::__visit_alt<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>, std::__1::__variant_detail::__impl<Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__impl<Foo, Bar>&)::__fmatrix:
.quad decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<0ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)
.quad decltype(auto) std::__1::__variant_detail::__visitation::__base::__dispatcher<1ul>::__dispatch<std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&>(std::__1::__variant_detail::__visitation::__variant::__value_visitor<getBaseMemVariant(std::__1::variant<Foo, Bar>&)::$_0>&&, std::__1::__variant_detail::__base<(std::__1::__variant_detail::_Trait)0, Foo, Bar>&)
The call qword ptr [8*rcx + ... is the actual indirect call to a function pointed to by the vtable (the vtable itself appears at the bottom of the listing). The code before that first checks the "is empty" state and then sets up the visit call (I'm not sure what the weirdness with rdi is; I guess it's setting up a pointer to the visitor as the first argument or something).
The actual methods pointed to by the vtable and executed by the call are very simple: a single mov to read the member. Critically, both are identical:
mov eax, dword ptr [rsi]
ret
So we have a huge mess. To execute that single mov we have a dozen setup instructions and, more importantly, an indirect branch, which will mispredict very badly if we process a series of Foobar variant objects holding different alternatives. Finally, the indirect call seems like an insurmountable barrier to further optimization: here we are looking at a simple call without any surrounding context, but in real use this might be inlined into a larger function with significant opportunities for further optimization - and I think the indirect call will block them.
You can play with the code yourself on godbolt.
The slowness isn't inherent: here's a very simple "discriminated union" struct that combines the two classes in a union, along with an isFoo discriminator that keeps track of which class is contained:
struct FoobarUnion {
    bool isFoo;
    union {
        Foo foo;
        Bar bar;
    };
    Base *asBase() { return isFoo ? (Base *)&foo : &bar; }
};

int getBaseMemUnion(FoobarUnion& v) {
    return v.asBase()->getBaseMember();
}
The corresponding getBaseMemUnion function compiles down to a single mov instruction on both gcc and clang:
getBaseMemUnion(FoobarUnion&): # @getBaseMemUnion(FoobarUnion&)
mov eax, dword ptr [rdi + 4]
ret
Granted, the discriminated union doesn't have to check the "is valueless" error condition, but that's not the main reason for the variant slowness, and in any case such a condition is impossible with Foo and Bar since none of their constructors throw4. Even if you wanted to support such a state, the resulting function with the union is still very efficient - only a small check is added, but the behavior of calling the base class is the same.
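For example, here's a sketch of what a valueless-aware version of the union might look like (FoobarUnionV is my invention for illustration, not part of the original example):

#include <variant>  // for std::bad_variant_access

// Hypothetical valueless-aware version of the hand-rolled union
// (for illustration only).
struct FoobarUnionV {
    signed char state;  // 0 = Foo, 1 = Bar, -1 = valueless
    union {
        Foo foo;
        Bar bar;
    };
    Base *asBase() { return state == 0 ? (Base *)&foo : (Base *)&bar; }
};

int getBaseMemUnionV(FoobarUnionV& v) {
    if (v.state < 0)
        throw std::bad_variant_access();  // the only addition: one predictable branch
    return v.asBase()->getBaseMember();
}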
Is there anything I can do to make this use of variant efficient in this case of calling a common base class function, or is the promise of a zero-cost abstraction just not panning out here? I'm open to a different call pattern, compiler options, whatever.
1 In particular, checking whether the variant is valueless_by_exception due to an earlier assignment that failed.
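For example (my illustration, not from the original post), a variant can end up in this state when constructing the new alternative throws partway through a type-changing emplace or assignment:

#include <stdexcept>
#include <variant>

struct Throws {
    Throws() { throw std::runtime_error("boom"); }
};

void makeValueless(std::variant<int, Throws>& v) {
    try {
        v.emplace<Throws>();  // the old int is destroyed, the new value never finishes construction
    } catch (const std::runtime_error&) {
    }
    // v.valueless_by_exception() is now true on typical implementations
}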
2 The pointer to the base class doesn't always have the same relationship to the most derived pointer for all alternatives, e.g., when multiple inheritance is involved.
3 Well, gcc is a bit worse since it seems to redundantly perform the "is valueless" check both upfront, prior to the call to visit, and also in each auto-generated method pointed to by the vtable; clang only does it upfront. Keep in mind that when I say "gcc" I really mean "gcc with libstdc++", while "clang" really means "clang with libc++". Some differences, like the redundant index() check in the generated visitor functions, are likely due to library differences rather than compiler optimization differences.
4 If the valueless state is problematic, one could also consider something like strict_variant, which never has an empty state but still uses local storage if the move constructor cannot throw.
For what it's worth, a totally handrolled visit with a switch does pretty well:
#include <cstddef>
#include <type_traits>
#include <utility>
#include <variant>

// use a code generator to write out all of these
// (one overload per variant size; the integral_constant tag selects the right one)
template <typename F, typename V>
auto custom_visit(F f, V&& v, std::integral_constant<std::size_t, 2>)
{
    switch (v.index()) {
    case 0: return f(std::get<0>(std::forward<V>(v)));
    case 1: return f(std::get<1>(std::forward<V>(v)));
#ifdef VALUELESS
    case std::variant_npos: {
        // throw from a cold, noinline lambda so the hot path stays small;
        // the lambda never returns, so no break is needed
        []() [[gnu::cold, gnu::noinline]] {
            throw std::bad_variant_access();
        }();
    }
#endif
    }
    __builtin_unreachable();
}

template <typename F, typename V>
auto custom_visit(F f, V&& v) {
    return custom_visit(f, std::forward<V>(v),
                        std::variant_size<std::decay_t<V>>{});
}
Which you'd use like:
int getBaseMemVariant2(Foobar& v) {
    return custom_visit([](Base& b){ return &b; }, v)->getBaseMember();
}
With VALUELESS defined, this emits:
getBaseMemVariant2(std::variant<Foo, Bar>&):
movzx eax, BYTE PTR [rdi+8]
cmp al, -1
je .L27
cmp al, 1
ja .L28
mov eax, DWORD PTR [rdi]
ret
.L27:
sub rsp, 8
call auto custom_visit<getBaseMemVariant2(std::variant<Foo, Bar>&)::{lambda(Base&)#1}, std::variant<Foo, Bar>&>(getBaseMemVariant2(std::variant<Foo, Bar>&)::{lambda(Base&)#1}, std::variant<Foo, Bar>&, std::integral_constant<unsigned long, 2ul>)::{lambda()#1}::operator()() const [clone .isra.1]
Which is pretty good. Without VALUELESS, this emits:
getBaseMemVariant2(std::variant<Foo, Bar>&):
mov eax, DWORD PTR [rdi]
ret
as desired.
I don't really know what conclusion, if any, to draw from this. Clearly, there's hope?
I am not competent enough to perform an assembly-level analysis, but I have written a small wrapper around std::variant explicitly for handling variants where all alternatives inherit from a common base class. Inside, I am essentially obtaining a pointer to where the variant stores its contents and then using this as a normal pointer to a base class. So once my new variant is created, I expect the actual function call to have about the same overhead as a regular virtual function call on a base class pointer.
pv::polymorphic_value< Base, Foo, Bar > variant;
variant->getBaseMember();
The library is freely available at https://github.com/Krzmbrzl/polymorphic_variant
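For anyone curious what the general idea might look like, here is a minimal sketch (my simplified illustration, which caches the base pointer via a one-time std::visit - not the actual implementation of polymorphic_variant, which obtains a pointer to the variant's storage directly as described above; re-assigning a different alternative is not supported in this sketch):

#include <utility>
#include <variant>

// Minimal sketch of the idea (illustration only): all alternatives derive
// from Base, so a Base* to the active alternative can be cached once at
// construction and used for every subsequent call.
template <typename Base, typename... Alternatives>
class base_variant {
    std::variant<Alternatives...> m_v;
    Base* m_base;  // points at the base subobject of the active alternative
public:
    base_variant(std::variant<Alternatives...> v)
        : m_v(std::move(v)),
          m_base(std::visit([](auto& alt) -> Base* { return &alt; }, m_v)) {}

    Base* operator->() { return m_base; }
};

// possible usage:
//   base_variant<Base, Foo, Bar> v{Foo{}};
//   v->getBaseMember();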