Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

std::vector<uint8_t> manually copying instead of calling memcpy when C++11/14 enabled

Using gcc 4.9, cross-compiling for ARM using the Linaro toolchain, I have found the compiled result of vector.assign() changes when adding -std=c++14, in a way that creates significant performance problems.

I have tried several different ways of doing this allocation + copy, but all of them have this performance problem as long as I'm using std::vector to do it.

I can reproduce the problem with this toy example:

VectorTest.h

#include <stdint.h>
#include <stddef.h>
#include <vector>

// Minimal reproduction type: wraps a std::vector<uint8_t> that is filled
// from a raw byte pointer and a length by the constructor.
struct VectorWrapper_t
{
    // pData:  first byte of the source range.
    // length: number of bytes to copy from pData.
    VectorWrapper_t(uint8_t const* pData, size_t length);
    // Storage for the bytes copied in by the constructor.
    std::vector<uint8_t> data;
};

VectorTest.cpp

#include "VectorTest.h"

// Default-constructs `data`, then replaces its contents with the `length`
// bytes in the range [pData, pData + length).
VectorWrapper_t::VectorWrapper_t(uint8_t const* pData, size_t length)
{
    // Iterator-pair assign on raw uint8_t pointers; this is the call whose
    // generated code differs between the compiler configurations compared.
    data.assign(pData, pData + length);
}

gcc flags:

-std=c++14 \
-mthumb -march=armv7-a -mtune=cortex-a9 \
-mlittle-endian -mfloat-abi=hard -mfpu=neon -Wa,-mimplicit-it=thumb \
-O2 -g

Viewing the assembly, I can see why: the original version (C++03, I'm assuming?) calls memmove, whereas the C++14 version instead contains an extra loop that appears to copy the data manually, one byte at a time. Looking at the .loc tags gcc adds with -fverbose-asm, the instructions in this loop come from stl_construct.h and stl_uninitialized.h.

After changing to gcc 5.2.1 (still with C++14), the code compiles nearly identically to the C++03 example, except that it calls memcpy instead of memmove.

I'm able to get around this problem by using std::unique_ptr<uint8_t[]> instead of a vector here. However, I would like to get to the bottom of this issue to figure out if other places using vectors could have performance problems and how to potentially fix them (updating to gcc 5.2 isn't practical).

So my question is: Why does it compile differently under C++11/14?

For reference, gcc --version reports:
arm-linux-gnueabihf-gcc (Linaro GCC 4.9-2014.12) 4.9.3 20141205 (prerelease).

Here is the assembly gcc generated:

# C++03, gcc 4.9

    @ Generated constructor body for the C++03 build.
    @ On entry (per the compiler's own annotations): r0 = this, r1 = pData,
    @ r2 = length. The bytes are copied with a single call to memmove.
    push    {r3, r4, r5, r6, r7, lr}    @
    movs    r3, #0  @ tmp118,
    mov r4, r0  @ this, this
    @ Zero the vector's three pointers: _M_start, _M_finish, _M_end_of_storage.
    str r3, [r0]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_start
    mov r5, r2  @ length, length
    str r3, [r0, #4]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_finish
    str r3, [r0, #8]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_end_of_storage
    @ Non-zero length takes the allocate-and-copy path at .L19;
    @ otherwise return `this` with the vector left empty.
    cbnz    r2, .L19    @ length,
    mov r0, r4  @, this
    pop {r3, r4, r5, r6, r7, pc}    @
.L19:
    @ Allocate `length` bytes with _Znwj (operator new(unsigned int)).
    mov r0, r2  @, length
    mov r6, r1  @ pData, pData
    bl  _Znwj   @
    @ memmove(new buffer in r0, pData, length); r7 keeps the buffer pointer.
    mov r2, r5  @, length
    mov r1, r6  @, pData
    mov r7, r0  @ D.13516,
    bl  memmove @
    @ Release the previous _M_start via _ZdlPv (operator delete(void*)) if it
    @ is non-null.
    ldr r0, [r4]    @ D.13515, MEM[(struct vector *)this_1(D)].D.11902._M_impl._M_start
    cbz r0, .L3 @ D.13515,
    bl  _ZdlPv  @
.L3:
    @ r5 = buffer + length; publish start, and set finish and end_of_storage
    @ to that same end pointer.
    add r5, r5, r7  @ D.13515, D.13516
    str r7, [r4]    @ D.13516, MEM[(struct vector *)this_1(D)].D.11902._M_impl._M_start
    str r5, [r4, #4]    @ D.13515, MEM[(struct vector *)this_1(D)].D.11902._M_impl._M_finish
    mov r0, r4  @, this
    str r5, [r4, #8]    @ D.13515, MEM[(struct vector *)this_1(D)].D.11902._M_impl._M_end_of_storage
    pop {r3, r4, r5, r6, r7, pc}    @
.L6:
    @ No branch in this listing targets .L6; presumably the exception cleanup
    @ path (TODO confirm): frees _M_start if non-null, then calls
    @ __cxa_end_cleanup.
    ldr r0, [r4]    @ D.13515, MEM[(struct _Vector_base *)this_1(D)]._M_impl._M_start
    cbz r0, .L5 @ D.13515,
    bl  _ZdlPv  @
.L5:
    bl  __cxa_end_cleanup   @

# C++14, gcc 4.9

    @ Generated constructor body for the C++14 build with gcc 4.9.
    @ On entry (per the compiler's own annotations): r0 = this, r1 = pData,
    @ r2 = length. There is no memmove/memcpy call here: the bytes are copied
    @ by the loop at .L7, one byte per iteration.
    push    {r3, r4, r5, r6, r7, lr}    @
    movs    r3, #0  @ tmp157,
    mov r6, r0  @ this, this
    @ Zero the vector's three pointers: _M_start, _M_finish, _M_end_of_storage.
    str r3, [r0]    @ tmp157, MEM[(struct _Vector_impl *)this_1(D)]._M_start
    mov r5, r2  @ length, length
    str r3, [r0, #4]    @ tmp157, MEM[(struct _Vector_impl *)this_1(D)]._M_finish
    str r3, [r0, #8]    @ tmp157, MEM[(struct _Vector_impl *)this_1(D)]._M_end_of_storage
    @ Non-zero length takes the allocate-and-copy path at .L25;
    @ otherwise return `this` with the vector left empty.
    cbnz    r2, .L25    @ length,
    mov r0, r6  @, this
    pop {r3, r4, r5, r6, r7, pc}    @
.L25:
    @ Allocate `length` bytes with _Znwj (operator new(unsigned int)).
    mov r0, r2  @, length
    mov r4, r1  @ pData, pData
    bl  _Znwj   @
    @ r3 = pData + length (end of the source range); r7 = new buffer.
    adds    r3, r4, r5  @ D.20345, pData, length
    mov r7, r0  @ __result,
    @ If the source range is non-empty, set up the loop registers:
    @   r1 = pData - 1 (the load below pre-increments), r3 = destination
    @   cursor, r4 = destination end (buffer + length).
    @ If it is empty, skip the loop via .L26.
    cmp r4, r3  @ pData, D.20345
    ittt    ne
    addne   r1, r4, #-1 @ ivtmp.76, pData,
    movne   r3, r0  @ __result, __result
    addne   r4, r0, r5  @ D.20346, __result, length
    beq .L26    @,
.L7:
    @ Copy loop: load the next source byte (pre-indexed with writeback), and
    @ store it unless the destination pointer r3 is zero.
    ldrb    r2, [r1, #1]!   @ zero_extendqisi2  @ D.20348, MEM[base: _48, offset: 0]
    cbz r3, .L6 @ __result,
    strb    r2, [r3]    @ D.20348, MEM[base: __result_23, offset: 0B]
.L6:
    @ Advance the destination cursor; repeat until it reaches the end (r4).
    adds    r3, r3, #1  @ __result, __result,
    cmp r3, r4  @ __result, D.20346
    bne .L7 @,
.L8:
    @ Release the previous _M_start via _ZdlPv (operator delete(void*)) if it
    @ is non-null.
    ldr r0, [r6]    @ D.20346, MEM[(struct vector *)this_1(D)].D.18218._M_impl._M_start
    cbz r0, .L5 @ D.20346,
    bl  _ZdlPv  @
.L5:
    @ Publish the new buffer: start = r7, finish = end_of_storage = r4.
    str r7, [r6]    @ __result, MEM[(struct vector *)this_1(D)].D.18218._M_impl._M_start
    mov r0, r6  @, this
    str r4, [r6, #4]    @ D.20346, MEM[(struct vector *)this_1(D)].D.18218._M_impl._M_finish
    str r4, [r6, #8]    @ D.20346, MEM[(struct vector *)this_1(D)].D.18218._M_impl._M_end_of_storage
    pop {r3, r4, r5, r6, r7, pc}    @
.L26:
    @ Empty-range path: compute the end pointer (buffer + length) and rejoin
    @ the common tail at .L8.
    adds    r4, r0, r5  @ D.20346, __result, length
    b   .L8 @
.L11:
    @ No branch in this listing targets .L11; presumably the exception cleanup
    @ path (TODO confirm): frees _M_start if non-null, then calls
    @ __cxa_end_cleanup.
    ldr r0, [r6]    @ D.20346, MEM[(struct _Vector_base *)this_1(D)]._M_impl._M_start
    cbz r0, .L10    @ D.20346,
    bl  _ZdlPv  @
.L10:
    bl  __cxa_end_cleanup   @

# C++14, gcc 5.2

    @ Generated constructor body for the C++14 build with gcc 5.2.
    @ On entry (per the compiler's own annotations): r0 = this, r1 = pData,
    @ r2 = length. The bytes are copied with a single call to memcpy.
    push    {r3, r4, r5, r6, r7, lr}    @
    movs    r3, #0  @ tmp118,
    mov r4, r0  @ this, this
    @ Zero the vector's three pointers: _M_start, _M_finish, _M_end_of_storage.
    str r3, [r0]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_start
    str r3, [r0, #4]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_finish
    str r3, [r0, #8]    @ tmp118, MEM[(struct _Vector_impl *)this_1(D)]._M_end_of_storage
    @ Non-zero length takes the allocate-and-copy path at .L19;
    @ otherwise return `this` with the vector left empty.
    cbnz    r2, .L19    @ length,
    mov r0, r4  @, this
    pop {r3, r4, r5, r6, r7, pc}    @
.L19:
    @ Allocate `length` bytes with _Znwj (operator new(unsigned int)).
    mov r0, r2  @, length
    mov r6, r1  @ pData, pData
    mov r5, r2  @ length, length
    bl  _Znwj   @
    @ memcpy(new buffer in r0, pData, length); r7 keeps the buffer pointer.
    mov r2, r5  @, length
    mov r1, r6  @, pData
    mov r7, r0  @ D.20824,
    bl  memcpy  @
    @ Release the previous _M_start via _ZdlPv (operator delete(void*)) if it
    @ is non-null.
    ldr r0, [r4]    @ D.20823, MEM[(struct vector *)this_1(D)].D.18751._M_impl._M_start
    cbz r0, .L3 @ D.20823,
    bl  _ZdlPv  @
.L3:
    @ r5 = buffer + length; publish start, and set finish and end_of_storage
    @ to that same end pointer.
    add r5, r5, r7  @ D.20823, D.20824
    str r7, [r4]    @ D.20824, MEM[(struct vector *)this_1(D)].D.18751._M_impl._M_start
    str r5, [r4, #4]    @ D.20823, MEM[(struct vector *)this_1(D)].D.18751._M_impl._M_finish
    mov r0, r4  @, this
    str r5, [r4, #8]    @ D.20823, MEM[(struct vector *)this_1(D)].D.18751._M_impl._M_end_of_storage
    pop {r3, r4, r5, r6, r7, pc}    @
.L6:
    @ No branch in this listing targets .L6; presumably the exception cleanup
    @ path (TODO confirm): frees _M_start if non-null, then calls
    @ __cxa_end_cleanup.
    ldr r0, [r4]    @ D.20823, MEM[(struct _Vector_base *)this_1(D)]._M_impl._M_start
    cbz r0, .L5 @ D.20823,
    bl  _ZdlPv  @
.L5:
    bl  __cxa_end_cleanup   @
like image 772
Joel Geddert Avatar asked Feb 01 '17 21:02

Joel Geddert


1 Answer

This was a GCC bug in the 4.9.2 release; see PR 64476. The difference between the default -std=gnu++03 mode and -std=c++14 is that for C++11 and later it's possible to have trivial types that aren't assignable (because they can have a deleted assignment operator), which causes the implementation of std::uninitialized_copy to take a different (slower) code path. The check for assignability was wrong, meaning that we took the slow path when we didn't need to.

I fixed it two years ago for GCC 4.9.3, but your compiler is based on a snapshot made between the 4.9.2 and 4.9.3 releases and is a few weeks too old to have the fix.

You could ask Linaro to update their GCC 4.9 compiler to 4.9.4, or at least to apply the patch fixing this bug.

like image 168
Jonathan Wakely Avatar answered Oct 14 '22 19:10

Jonathan Wakely