I been experimenting a little with expressive templates and rope data structures to try and see what kind of gains can be achieved. So far it is working quite nicely. However, when attempting to concatenate more than 5 arguments together the compiler fails to optimize properly and generates unnecessary temporaries. Could someone enlighten me why this is happening? Is it a compiler limitation or do my optimization options require tweaking?
I am using g++ 4.4.1 (mingw32) with the following options: -O3 -Winline -Wextra -Wall -fno-exceptions -fno-rtti -fomit-frame-pointer -fexpensive-optimizations -fverbose-asm -S
The code is below, it is just an experimentation so it does not really follow any standards:
#include <stdio.h>
template<typename Derived>
struct rope_base {
const Derived & ref() const;
};
struct string {
size_t m_length;
char * m_value;
template<typename Derived>
string(const rope_base<Derived> & rope);
~string();
const char * data() const;
size_t length() const;
char * write_to(char * dst) const;
};
struct static_string {
const char * m_value;
const size_t m_length;
static_string(const char * value);
size_t length() const;
char * write_to(char * dst) const;
};
template<typename T>
struct rope_traits {
typedef const T type;
};
template<>
struct rope_traits<string> {
typedef const string & type;
};
template<>
struct rope_traits<static_string> {
typedef const static_string & type;
};
template<typename Left, typename Right>
struct rope : public rope_base<rope<Left, Right> > {
typename rope_traits<Left>::type m_left;
typename rope_traits<Right>::type m_right;
rope(const Left & left, const Right & right);
size_t length() const;
char * write_to(char * dst) const;
};
inline static_string::static_string(const char * value)
: m_value(value)
, m_length(__builtin_strlen(m_value)) {}
inline size_t static_string::length() const {
return m_length;
}
inline char * static_string::write_to(char * dst) const {
__builtin_memcpy(dst, m_value, m_length);
return dst + m_length;
}
template<typename Derived>
inline string::string(const rope_base<Derived> & rope)
: m_length(rope.ref().length())
, m_value(new char[m_length + 1]) {
*rope.ref().write_to(m_value) = 0;
}
inline string::~string() {
delete[] m_value;
}
inline const char * string::data() const {
return m_value;
}
inline size_t string::length() const {
return m_length;
}
template<typename Derived>
inline const Derived & rope_base<Derived>::ref() const {
return static_cast<const Derived &>(*this);
}
template<typename Left, typename Right>
inline rope<Left, Right>::rope(const Left & left, const Right & right)
: m_left(left)
, m_right(right) {}
template<typename Left, typename Right>
inline size_t rope<Left, Right>::length() const {
return m_left.length() + m_right.length();
}
template<typename Left, typename Right>
inline char * rope<Left, Right>::write_to(char * dst) const {
return m_right.write_to(m_left.write_to(dst));
}
inline rope<static_string, static_string> operator+(const static_string & left, const static_string & right) {
return rope<static_string, static_string>(left, right);
}
template<typename Left>
inline rope<Left, static_string> operator+(const rope_base<Left> & left, const static_string & right) {
return rope<Left, static_string>(left.ref(), right);
}
template<typename Right>
inline rope<static_string, Right> operator+(const static_string & left, const rope_base<Right> & right) {
return rope<static_string, Right>(left, right.ref());
}
template<typename Left, typename Right>
inline rope<Left, Right> operator+(const rope_base<Left> & left, const rope_base<Right> & right) {
return rope<Left, Right>(left.ref(), right.ref());
}
typedef static_string ss;
int main(int, char **)
{
// works up to 5
string s(ss("111111111111") + "222222222222" + "333333333333" + "444444444444" + "555555555555");
printf("%d %s\n", s.length(), s.data());
return 0;
}
The code above generates quite nice assembler output which is fully inlined and all arguments have been reduced to constants:
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC0:
.ascii "444444444444\0"
LC1:
.ascii "333333333333\0"
LC2:
.ascii "222222222222\0"
LC3:
.ascii "111111111111\0"
LC4:
.ascii "555555555555\0"
LC5:
.ascii "%d %s\12\0"
.text
.p2align 2,,3
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
pushl %ebp #
movl %esp, %ebp #,
andl $-16, %esp #,
pushl %edi #
pushl %esi #
pushl %ebx #
subl $20, %esp #,
call ___main #
movl $LC3, %esi #, D.2495
movl $61, (%esp) #,
call __Znaj #
movl %eax, %ebx #, D.3126
movl $3, %ecx #, tmp74
movl %eax, %edi # D.3126, D.3125
rep movsl
leal 12(%eax), %eax #, D.3180
movb $3, %cl #,
movl %eax, %edi # D.3180, D.3180
movl $LC2, %esi #, D.2496
rep movsl
leal 24(%ebx), %eax #, D.3186
movb $3, %cl #,
movl %eax, %edi # D.3186, D.3186
movl $LC1, %esi #, D.2502
rep movsl
leal 36(%ebx), %eax #, D.3192
movb $3, %cl #,
movl %eax, %edi # D.3192, D.3192
movl $LC0, %esi #, D.2539
rep movsl
leal 48(%ebx), %eax #, D.3198
movl $LC4, %esi #, tmp87
movb $3, %cl #,
movl %eax, %edi # D.3198, D.3198
rep movsl
movb $0, 12(%eax) #,
movl %ebx, 8(%esp) # D.3126,
movl $60, 4(%esp) #,
movl $LC5, (%esp) #,
call _printf #
testl %ebx, %ebx # D.3126
je L2 #,
movl %ebx, (%esp) # D.3126,
call __ZdaPv #
L2:
xorl %eax, %eax #
addl $20, %esp #,
popl %ebx #
popl %esi #
popl %edi #
leave
ret
.def __Znaj; .scl 2; .type 32; .endef
.def _printf; .scl 2; .type 32; .endef
.def __ZdaPv; .scl 2; .type 32; .endef
Inlining fails when adding one or more parameters to the concatenation, resulting in temporaries being copied around and parameters being treated as variables:
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC0:
.ascii "777777777777\0"
LC1:
.ascii "666666666666\0"
LC2:
.ascii "555555555555\0"
LC3:
.ascii "444444444444\0"
LC4:
.ascii "333333333333\0"
LC5:
.ascii "222222222222\0"
LC6:
.ascii "111111111111\0"
LC7:
.ascii "888888888888\0"
LC8:
.ascii "%d %s\12\0"
.text
.p2align 2,,3
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
pushl %ebp #
movl %esp, %ebp #,
andl $-16, %esp #,
pushl %edi #
pushl %esi #
pushl %ebx #
subl $228, %esp #,
call ___main #
movl $LC0, 168(%esp) #, D.2650.m_value
movl $12, 172(%esp) #, D.2650.m_length
movl $LC1, 176(%esp) #, D.2613.m_value
movl $12, 180(%esp) #, D.2613.m_length
movl $LC2, 184(%esp) #, D.2576.m_value
movl $12, 188(%esp) #, D.2576.m_length
movl $LC3, 192(%esp) #, D.2539.m_value
movl $12, 196(%esp) #, D.2539.m_length
movl $LC4, 200(%esp) #, D.2502.m_value
movl $12, 204(%esp) #, D.2502.m_length
movl $LC5, 208(%esp) #, D.2496.m_value
movl $12, 212(%esp) #, D.2496.m_length
movl $LC6, 216(%esp) #, D.2495.m_value
movl $12, 220(%esp) #, D.2495.m_length
leal 216(%esp), %eax #, tmp78
movl %eax, 152(%esp) # tmp78, D.2571.m_left.m_left.m_left
leal 208(%esp), %eax #, tmp79
movl %eax, 156(%esp) # tmp79, D.2571.m_left.m_left.m_right
leal 200(%esp), %eax #, tmp80
movl %eax, 160(%esp) # tmp80, D.2571.m_left.m_right
leal 192(%esp), %eax #, tmp81
movl %eax, 164(%esp) # tmp81, D.2571.m_right
leal 132(%esp), %edi #, tmp82
leal 152(%esp), %esi #, tmp83
movl $4, %ecx #, tmp84
rep movsl
leal 184(%esp), %eax #, tmp85
movl %eax, 148(%esp) # tmp85, D.2608.m_right
leal 108(%esp), %edi #, tmp86
leal 132(%esp), %esi #, tmp87
movb $5, %cl #,
rep movsl
leal 176(%esp), %eax #, tmp89
movl %eax, 128(%esp) # tmp89, D.2645.m_right
leal 80(%esp), %edi #, tmp90
leal 108(%esp), %esi #, tmp91
movb $6, %cl #,
rep movsl
leal 168(%esp), %eax #, tmp93
movl %eax, 104(%esp) # tmp93, D.2682.m_right
leal 48(%esp), %edi #, tmp94
leal 80(%esp), %esi #, tmp95
movb $7, %cl #,
rep movsl
movl 48(%esp), %ebx # D.2719.m_left.m_left.m_left.m_left.m_left.m_left.m_left, SR.35
movl 52(%esp), %edx # D.2719.m_left.m_left.m_left.m_left.m_left.m_left.m_right, SR.34
movl 56(%esp), %eax # D.2719.m_left.m_left.m_left.m_left.m_left.m_right,
movl %eax, 36(%esp) #, %sfp
movl 60(%esp), %eax # D.2719.m_left.m_left.m_left.m_left.m_right,
movl %eax, 32(%esp) #, %sfp
movl 64(%esp), %eax # D.2719.m_left.m_left.m_left.m_right,
movl %eax, 28(%esp) #, %sfp
movl 68(%esp), %eax # D.2719.m_left.m_left.m_right,
movl %eax, 24(%esp) #, %sfp
movl 72(%esp), %eax # D.2719.m_left.m_right,
movl %eax, 20(%esp) #, %sfp
movl 4(%ebx), %eax # <variable>.m_length, tmp97
addl 4(%edx), %eax # <variable>.m_length, tmp97
addl $12, %eax #,
movl %eax, 44(%esp) #, %sfp
movl 36(%esp), %eax # %sfp,
movl 4(%eax), %eax # <variable>.m_length,
addl %eax, 44(%esp) #, %sfp
movl 32(%esp), %eax # %sfp,
movl 4(%eax), %eax # <variable>.m_length,
addl %eax, 44(%esp) #, %sfp
movl 28(%esp), %eax # %sfp,
movl 4(%eax), %eax # <variable>.m_length,
addl %eax, 44(%esp) #, %sfp
movl 24(%esp), %eax # %sfp,
movl 4(%eax), %eax # <variable>.m_length,
addl %eax, 44(%esp) #, %sfp
movl 20(%esp), %eax # %sfp,
movl 4(%eax), %eax # <variable>.m_length,
addl %eax, 44(%esp) #, %sfp
movl 44(%esp), %eax # %sfp, tmp105
incl %eax # tmp105
movl %eax, (%esp) # tmp105,
movl %edx, 16(%esp) #,
call __Znaj #
movl %eax, 40(%esp) #, %sfp
movl (%ebx), %esi # <variable>.m_value, <variable>.m_value
movl 4(%ebx), %ecx # <variable>.m_length, <variable>.m_length
movl %eax, %edi #, D.3662
rep movsb
movl 40(%esp), %eax # %sfp, D.3735
addl 4(%ebx), %eax # <variable>.m_length, D.3735
movl 16(%esp), %edx #,
movl (%edx), %esi # <variable>.m_value, <variable>.m_value
movl 4(%edx), %ecx # <variable>.m_length, <variable>.m_length
movl %eax, %edi # D.3735, D.3735
rep movsb
addl 4(%edx), %eax # <variable>.m_length, D.3741
movl 36(%esp), %edx # %sfp,
movl (%edx), %esi # <variable>.m_value, <variable>.m_value
movl 4(%edx), %ecx # <variable>.m_length, <variable>.m_length
movl %eax, %edi # D.3741, D.3741
rep movsb
addl 4(%edx), %eax # <variable>.m_length, D.3747
movl 32(%esp), %edx # %sfp,
movl (%edx), %esi # <variable>.m_value, <variable>.m_value
movl 4(%edx), %ecx # <variable>.m_length, <variable>.m_length
movl %eax, %edi # D.3747, D.3747
rep movsb
addl 4(%edx), %eax # <variable>.m_length, D.3753
movl 28(%esp), %edx # %sfp,
movl (%edx), %esi # <variable>.m_value, <variable>.m_value
movl 4(%edx), %ecx # <variable>.m_length, <variable>.m_length
movl %eax, %edi # D.3753, D.3753
rep movsb
addl 4(%edx), %eax # <variable>.m_length, D.3759
movl 24(%esp), %edx # %sfp,
movl (%edx), %esi # <variable>.m_value, <variable>.m_value
movl 4(%edx), %ecx # <variable>.m_length, <variable>.m_length
movl %eax, %edi # D.3759, D.3759
rep movsb
addl 4(%edx), %eax # <variable>.m_length, D.3765
movl 20(%esp), %edx # %sfp,
movl (%edx), %esi # <variable>.m_value, <variable>.m_value
movl 4(%edx), %ecx # <variable>.m_length, <variable>.m_length
movl %eax, %edi # D.3765, D.3765
rep movsb
addl 4(%edx), %eax # <variable>.m_length, D.3771
movl $LC7, %esi #, tmp148
movb $3, %cl #,
movl %eax, %edi # D.3771, D.3771
rep movsl
movb $0, 12(%eax) #,
movl 40(%esp), %eax # %sfp,
movl %eax, 8(%esp) #,
movl 44(%esp), %edx # %sfp,
movl %edx, 4(%esp) #,
movl $LC8, (%esp) #,
call _printf #
movl 40(%esp), %eax # %sfp,
testl %eax, %eax #
je L2 #,
movl 40(%esp), %eax # %sfp,
movl %eax, (%esp) #,
call __ZdaPv #
L2:
xorl %eax, %eax #
addl $228, %esp #,
popl %ebx #
popl %esi #
popl %edi #
leave
ret
.def __Znaj; .scl 2; .type 32; .endef
.def _printf; .scl 2; .type 32; .endef
.def __ZdaPv; .scl 2; .type 32; .endef
A few things (per the discussion in comments):
Upgrade your gcc. A lot of optimisation improvements were made in the 4.6 series, including some improvements to inlining.
With a newer gcc -Winline warns that your string constructor will not be inlined:
warning: inlining failed in call to 'string::string(const rope_base<Derived>&) [with Derived = rope<rope<rope<rope<rope<static_string, static_string>, static_string> static_string>, static_string>, static_string>]': call is unlikely and code size would grow [-Winline]
I'm not quite sure why gcc gives this particular message (which is usually pretty clearly related to an inline call in an unlikely branch), but the root of the problem is the call to new() in that constructor. Using a static fixed size buffer produces nice compact asm. Using malloc, or wrapping new in a non-inlined function call allows the constructor (but not malloc/function) to be inlined.
Which approach to take really depends on your specific use case. If the length of the strings could be known at compile time, or given a maximum value, as in your example, you can obviously completely inline. In the general case though, you are always going to have to give something up.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With