Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why does direct access to structure members produce significantly more assembly code than indirect access in GCC?

There are many topics here on whether direct access or indirect access (via a pointer) is faster when accessing structure members in C.

One example: C pointers vs direct member access for structs

The general opinion is direct access will be faster (at least theoretically) since pointer dereferencing is not used.

So I gave it a try with a chunk of code in my system: GNU Embedded Tools GCC 4.7.4, generating code for ARM (actually ARM-Cortex-A15).

Surprisingly, direct access was much slower. Then I generated the assembly code for the object file.

Direct access code has 114 lines of assembly code, and indirect access code has 33 lines of assembly code. What is going on here?

Below are the C code and generated assembly code of the functions. The structures are all mapped to external memory, and the structure members are all one byte long (unsigned char type).

First function, with indirect access:

/*
 * Indirect-access variant: every structure is reached through a pointer
 * argument.
 *
 * When num_of is 0, stores UNUSED into output_ptr->curr_id and copies cnt,
 * _mode, type, size, allocation_type, allocation_localized and mode_enable
 * of *output_ptr into the matching curr_* members. If curr_cnt is then 1,
 * it also writes status/type/index/layer_cnt of *first_file_ptr,
 * status/index/redundancy_version of *second_file_ptr, and
 * total_layer_cnt of *output_ptr. Does nothing when num_of is non-zero.
 *
 * The three pointers are __restrict__-qualified, i.e. the caller promises
 * that the objects they point to do not overlap.
 *
 * LIKELY, UNUSED, BLOCK_IDLE, USER_DATA_TYPE, FIRST__WORD and DISABLED are
 * defined outside this snippet.
 */
void sub_func_1(unsigned int num_of, struct file_s *__restrict__ first_file_ptr, struct file_s  *__restrict__ second_file_ptr, struct output_s *__restrict__ output_ptr)
{
    /* LIKELY is presumably a branch-prediction hint macro -- TODO confirm. */
    if(LIKELY(num_of == 0))
     {
        /* Refresh the curr_* members from their non-current counterparts. */
        output_ptr->curr_id                         = UNUSED;
        output_ptr->curr_cnt                        = output_ptr->cnt;

        output_ptr->curr_mode                       = output_ptr->_mode;
        output_ptr->curr_type                       = output_ptr->type;
        output_ptr->curr_size                       = output_ptr->size;
        output_ptr->curr_allocation_type            = output_ptr->allocation_type;
        output_ptr->curr_allocation_localized       = output_ptr->allocation_localized;
        output_ptr->curr_mode_enable                = output_ptr->mode_enable;

        /* Tests the value that was just copied from output_ptr->cnt. */
        if(output_ptr->curr_cnt == 1)
         {
            first_file_ptr->status                  = BLOCK_IDLE;
            first_file_ptr->type                    = USER_DATA_TYPE;
            first_file_ptr->index                   = FIRST__WORD;
            first_file_ptr->layer_cnt               = output_ptr->layer_cnt;

            second_file_ptr->status                 = DISABLED;
            second_file_ptr->index                  = 0;
            second_file_ptr->redundancy_version     = 1;

            /* Reads back the layer_cnt stored through first_file_ptr above. */
            output_ptr->total_layer_cnt             = first_file_ptr->layer_cnt;
         }
     }
}
00000000 <sub_func_1>:
   0:   e3500000    cmp r0, #0
   4:   e92d01f0    push    {r4, r5, r6, r7, r8}
   8:   1a00001b    bne 7c <sub_func_1+0x7c>
   c:   e5d34007    ldrb    r4, [r3, #7]
  10:   e3a05008    mov r5, #8
  14:   e5d3c003    ldrb    ip, [r3, #3]
  18:   e5d38014    ldrb    r8, [r3, #20]
  1c:   e5c35001    strb    r5, [r3, #1]
  20:   e5d37015    ldrb    r7, [r3, #21]
  24:   e5d36018    ldrb    r6, [r3, #24]
  28:   e5c34008    strb    r4, [r3, #8]
  2c:   e5d35019    ldrb    r5, [r3, #25]
  30:   e35c0001    cmp ip, #1
  34:   e5c3c005    strb    ip, [r3, #5]
  38:   e5d34012    ldrb    r4, [r3, #18]
  3c:   e5c38010    strb    r8, [r3, #16]
  40:   e5c37011    strb    r7, [r3, #17]
  44:   e5c3601a    strb    r6, [r3, #26]
  48:   e5c3501b    strb    r5, [r3, #27]
  4c:   e5c34013    strb    r4, [r3, #19]
  50:   1a000009    bne 7c <sub_func_1+0x7c>
  54:   e5d3400b    ldrb    r4, [r3, #11]
  58:   e3a05005    mov r5, #5
  5c:   e5c1c000    strb    ip, [r1]
  60:   e5c10002    strb    r0, [r1, #2]
  64:   e5c15001    strb    r5, [r1, #1]
  68:   e5c20000    strb    r0, [r2]
  6c:   e5c14003    strb    r4, [r1, #3]
  70:   e5c20005    strb    r0, [r2, #5]
  74:   e5c2c014    strb    ip, [r2, #20]
  78:   e5c3400f    strb    r4, [r3, #15]
  7c:   e8bd01f0    pop {r4, r5, r6, r7, r8}
  80:   e12fff1e    bx  lr

Second function, with direct access:

/*
 * Direct-access variant: every structure is reached by indexing the
 * output_file array (declared outside this snippet) instead of through
 * pointer arguments.
 *
 * When num_of is 0, stores UNUSED into output_file[output_index].curr_id
 * and copies cnt, _mode, type, size, allocation_type, allocation_localized
 * and mode_enable of that element into the matching curr_* members. If
 * curr_cnt is then 1, it also writes status/type/index/layer_cnt of
 * cc_file[cc_index].file[0], status/index/redundancy_version of
 * cc_file[cc_index].file[1], and total_layer_cnt of the same output_file
 * element. Does nothing when num_of is non-zero.
 *
 * The statements mirror sub_func_1 field for field; only the way each
 * object is addressed differs.
 *
 * LIKELY, UNUSED, BLOCK_IDLE, USER_DATA_TYPE, FIRST__WORD and DISABLED are
 * defined outside this snippet.
 */
void sub_func_2(unsigned int output_index, unsigned int cc_index, unsigned int num_of)
{
    /* LIKELY is presumably a branch-prediction hint macro -- TODO confirm. */
    if(LIKELY(num_of == 0))
     {
        /* Refresh the curr_* members from their non-current counterparts. */
        output_file[output_index].curr_id                       = UNUSED;
        output_file[output_index].curr_cnt                      = output_file[output_index].cnt;

        output_file[output_index].curr_mode                     = output_file[output_index]._mode;
        output_file[output_index].curr_type                     = output_file[output_index].type;
        output_file[output_index].curr_size                     = output_file[output_index].size;
        output_file[output_index].curr_allocation_type          = output_file[output_index].allocation_type;
        output_file[output_index].curr_allocation_localized     = output_file[output_index].allocation_localized;
        output_file[output_index].curr_mode_enable              = output_file[output_index].mode_enable;

        /* Tests the value that was just copied from .cnt. */
        if(output_file[output_index].curr_cnt == 1)
         {
            /* file[0] of the selected cc_file entry. */
            output_file[output_index].cc_file[cc_index].file[0].status              = BLOCK_IDLE;
            output_file[output_index].cc_file[cc_index].file[0].type                = USER_DATA_TYPE;
            output_file[output_index].cc_file[cc_index].file[0].index               = FIRST__WORD;
            output_file[output_index].cc_file[cc_index].file[0].layer_cnt           = output_file[output_index].layer_cnt;

            /* file[1] of the selected cc_file entry. */
            output_file[output_index].cc_file[cc_index].file[1].status              = DISABLED;
            output_file[output_index].cc_file[cc_index].file[1].index               = 0;
            output_file[output_index].cc_file[cc_index].file[1].redundancy_version  = 1;

            /* Reads back the layer_cnt stored into file[0] above. */
            output_file[output_index].total_layer_cnt                               = output_file[output_index].cc_file[cc_index].file[0].layer_cnt;
         }
     }
}
00000084 <sub_func_2>:
  84:   e92d0ff0    push    {r4, r5, r6, r7, r8, r9, sl, fp}
  88:   e3520000    cmp r2, #0
  8c:   e24dd018    sub sp, sp, #24
  90:   e58d2004    str r2, [sp, #4]
  94:   1a000069    bne 240 <sub_func_2+0x1bc>
  98:   e3a03d61    mov r3, #6208   ; 0x1840
  9c:   e30dc0c0    movw    ip, #53440  ; 0xd0c0
  a0:   e340c001    movt    ip, #1
  a4:   e3002000    movw    r2, #0
  a8:   e0010193    mul r1, r3, r1
  ac:   e3402000    movt    r2, #0
  b0:   e3067490    movw    r7, #25744  ; 0x6490
  b4:   e3068488    movw    r8, #25736  ; 0x6488
  b8:   e3a0b008    mov fp, #8
  bc:   e3066498    movw    r6, #25752  ; 0x6498
  c0:   e02c109c    mla ip, ip, r0, r1
  c4:   e082c00c    add ip, r2, ip
  c8:   e28c3b19    add r3, ip, #25600  ; 0x6400
  cc:   e08c4007    add r4, ip, r7
  d0:   e5d39083    ldrb    r9, [r3, #131]  ; 0x83
  d4:   e08c5006    add r5, ip, r6
  d8:   e5d3a087    ldrb    sl, [r3, #135]  ; 0x87
  dc:   e5c3b081    strb    fp, [r3, #129]  ; 0x81
  e0:   e5c39085    strb    r9, [r3, #133]  ; 0x85
  e4:   e2833080    add r3, r3, #128    ; 0x80
  e8:   e7cca008    strb    sl, [ip, r8]
  ec:   e5d4a004    ldrb    sl, [r4, #4]
  f0:   e7cca007    strb    sl, [ip, r7]
  f4:   e5d47005    ldrb    r7, [r4, #5]
  f8:   e5c47001    strb    r7, [r4, #1]
  fc:   e7dc6006    ldrb    r6, [ip, r6]
 100:   e5d5c001    ldrb    ip, [r5, #1]
 104:   e5c56002    strb    r6, [r5, #2]
 108:   e5c5c003    strb    ip, [r5, #3]
 10c:   e5d4c002    ldrb    ip, [r4, #2]
 110:   e5c4c003    strb    ip, [r4, #3]
 114:   e5d33005    ldrb    r3, [r3, #5]
 118:   e3530001    cmp r3, #1
 11c:   1a000047    bne 240 <sub_func_2+0x1bc>
 120:   e30dc0c0    movw    ip, #53440  ; 0xd0c0
 124:   e30db0c0    movw    fp, #53440  ; 0xd0c0
 128:   e1a0700c    mov r7, ip
 12c:   e7dfc813    bfi ip, r3, #16, #16
 130:   e1a05007    mov r5, r7
 134:   e1a0900b    mov r9, fp
 138:   e02c109c    mla ip, ip, r0, r1
 13c:   e1a04005    mov r4, r5
 140:   e1a0a00b    mov sl, fp
 144:   e7df9813    bfi r9, r3, #16, #16
 148:   e7dfb813    bfi fp, r3, #16, #16
 14c:   e1a06007    mov r6, r7
 150:   e7dfa813    bfi sl, r3, #16, #16
 154:   e58dc008    str ip, [sp, #8]
 158:   e7df6813    bfi r6, r3, #16, #16
 15c:   e1a0c004    mov ip, r4
 160:   e7df4813    bfi r4, r3, #16, #16
 164:   e02b109b    mla fp, fp, r0, r1
 168:   e7df5813    bfi r5, r3, #16, #16
 16c:   e0291099    mla r9, r9, r0, r1
 170:   e7df7813    bfi r7, r3, #16, #16
 174:   e7dfc813    bfi ip, r3, #16, #16
 178:   e0261096    mla r6, r6, r0, r1
 17c:   e0241094    mla r4, r4, r0, r1
 180:   e082b00b    add fp, r2, fp
 184:   e0829009    add r9, r2, r9
 188:   e02a109a    mla sl, sl, r0, r1
 18c:   e28bbc65    add fp, fp, #25856  ; 0x6500
 190:   e58d600c    str r6, [sp, #12]
 194:   e2899c65    add r9, r9, #25856  ; 0x6500
 198:   e3a06005    mov r6, #5
 19c:   e58d4010    str r4, [sp, #16]
 1a0:   e59d4008    ldr r4, [sp, #8]
 1a4:   e0251095    mla r5, r5, r0, r1
 1a8:   e5cb3000    strb    r3, [fp]
 1ac:   e082a00a    add sl, r2, sl
 1b0:   e59db00c    ldr fp, [sp, #12]
 1b4:   e5c96001    strb    r6, [r9, #1]
 1b8:   e59d6004    ldr r6, [sp, #4]
 1bc:   e28aac65    add sl, sl, #25856  ; 0x6500
 1c0:   e58d5014    str r5, [sp, #20]
 1c4:   e0271097    mla r7, r7, r0, r1
 1c8:   e0825004    add r5, r2, r4
 1cc:   e30d40c0    movw    r4, #53440  ; 0xd0c0
 1d0:   e02c109c    mla ip, ip, r0, r1
 1d4:   e0855008    add r5, r5, r8
 1d8:   e7df4813    bfi r4, r3, #16, #16
 1dc:   e5ca6002    strb    r6, [sl, #2]
 1e0:   e5d59003    ldrb    r9, [r5, #3]
 1e4:   e082600b    add r6, r2, fp
 1e8:   e59db014    ldr fp, [sp, #20]
 1ec:   e0201094    mla r0, r4, r0, r1
 1f0:   e2866c65    add r6, r6, #25856  ; 0x6500
 1f4:   e59d1010    ldr r1, [sp, #16]
 1f8:   e306a53c    movw    sl, #25916  ; 0x653c
 1fc:   e0827007    add r7, r2, r7
 200:   e2877c65    add r7, r7, #25856  ; 0x6500
 204:   e082c00c    add ip, r2, ip
 208:   e5c69003    strb    r9, [r6, #3]
 20c:   e59d6004    ldr r6, [sp, #4]
 210:   e28ccc65    add ip, ip, #25856  ; 0x6500
 214:   e082500b    add r5, r2, fp
 218:   e0820000    add r0, r2, r0
 21c:   e0824001    add r4, r2, r1
 220:   e085500a    add r5, r5, sl
 224:   e0808008    add r8, r0, r8
 228:   e7c4600a    strb    r6, [r4, sl]
 22c:   e5c56005    strb    r6, [r5, #5]
 230:   e5c73050    strb    r3, [r7, #80]   ; 0x50
 234:   e5dc3003    ldrb    r3, [ip, #3]
 238:   e287704c    add r7, r7, #76 ; 0x4c
 23c:   e5c83007    strb    r3, [r8, #7]
 240:   e28dd018    add sp, sp, #24
 244:   e8bd0ff0    pop {r4, r5, r6, r7, r8, r9, sl, fp}
 248:   e12fff1e    bx  lr

And last part, my compile options are:

# Compile options.
C_OPTS =    -Wall \
-std=gnu99 \
-fgnu89-inline \
-Wcast-align \
-Werror=uninitialized \
-Werror=maybe-uninitialized \
-Werror=overflow \
-mcpu=cortex-a15 \
-mtune=cortex-a15 \
-mabi=aapcs \
-mfpu=neon \
-ftree-vectorize \
-ftree-slp-vectorize \
-ftree-vectorizer-verbose=4 \
-mfloat-abi=hard \
-O3 \
-flto \
-marm \
-ffat-lto-objects \
-fno-gcse \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \
-fno-strict-overflow \
-fuse-linker-plugin \
-falign-functions=4 \
-falign-loops=4 \
-falign-labels=4 \
-falign-jumps=4 

Update:

Note: I deleted the structure definitions because there were differences from the version in my own program. It is actually a huge structure, and it is not practical to put it here in full.

As suggested, I got rid of -fno-gcse, and the generated asm is not as huge as before.

Without -fno-gcse, sub_func_1 generates the same code as above.

For sub_func_2:

00000084 <sub_func_2>:
  84:   e3520000    cmp r2, #0
  88:   e92d0070    push    {r4, r5, r6}
  8c:   1a000030    bne 154 <sub_func_2+0xd0>
  90:   e30d30c0    movw    r3, #53440  ; 0xd0c0
  94:   e3a06008    mov r6, #8
  98:   e3403001    movt    r3, #1
  9c:   e0030093    mul r3, r3, r0
  a0:   e3a00d61    mov r0, #6208   ; 0x1840
  a4:   e0213190    mla r1, r0, r1, r3
  a8:   e59f30ac    ldr r3, [pc, #172]  ; 15c <sub_func_2+0xd8>
  ac:   e0831001    add r1, r3, r1
  b0:   e2813b19    add r3, r1, #25600  ; 0x6400
  b4:   e5d34083    ldrb    r4, [r3, #131]  ; 0x83
  b8:   e1a00003    mov r0, r3
  bc:   e5d35087    ldrb    r5, [r3, #135]  ; 0x87
  c0:   e5c36081    strb    r6, [r3, #129]  ; 0x81
  c4:   e5c34085    strb    r4, [r3, #133]  ; 0x85
  c8:   e3064488    movw    r4, #25736  ; 0x6488
  cc:   e2833080    add r3, r3, #128    ; 0x80
  d0:   e7c15004    strb    r5, [r1, r4]
  d4:   e5d05094    ldrb    r5, [r0, #148]  ; 0x94
  d8:   e0844006    add r4, r4, r6
  dc:   e7c15004    strb    r5, [r1, r4]
  e0:   e5d04095    ldrb    r4, [r0, #149]  ; 0x95
  e4:   e5d0c092    ldrb    ip, [r0, #146]  ; 0x92
  e8:   e5c04091    strb    r4, [r0, #145]  ; 0x91
  ec:   e3064498    movw    r4, #25752  ; 0x6498
  f0:   e7d15004    ldrb    r5, [r1, r4]
  f4:   e5c0c093    strb    ip, [r0, #147]  ; 0x93
  f8:   e5d04099    ldrb    r4, [r0, #153]  ; 0x99
  fc:   e5c0509a    strb    r5, [r0, #154]  ; 0x9a
 100:   e5c0409b    strb    r4, [r0, #155]  ; 0x9b
 104:   e5d33005    ldrb    r3, [r3, #5]
 108:   e3530001    cmp r3, #1
 10c:   1a000010    bne 154 <sub_func_2+0xd0>
 110:   e281cc65    add ip, r1, #25856  ; 0x6500
 114:   e3a06005    mov r6, #5
 118:   e2810b19    add r0, r1, #25600  ; 0x6400
 11c:   e1a0500c    mov r5, ip
 120:   e5cc3000    strb    r3, [ip]
 124:   e1a0400c    mov r4, ip
 128:   e5cc6001    strb    r6, [ip, #1]
 12c:   e5cc2002    strb    r2, [ip, #2]
 130:   e5d0608b    ldrb    r6, [r0, #139]  ; 0x8b
 134:   e5cc6003    strb    r6, [ip, #3]
 138:   e306c53c    movw    ip, #25916  ; 0x653c
 13c:   e7c1200c    strb    r2, [r1, ip]
 140:   e5c52041    strb    r2, [r5, #65]   ; 0x41
 144:   e285503c    add r5, r5, #60 ; 0x3c
 148:   e5c43050    strb    r3, [r4, #80]   ; 0x50
 14c:   e284404c    add r4, r4, #76 ; 0x4c
 150:   e5c0608f    strb    r6, [r0, #143]  ; 0x8f
 154:   e8bd0070    pop {r4, r5, r6}
 158:   e12fff1e    bx  lr
 15c:   00000000    .word   0x00000000
like image 356
tozak Avatar asked May 05 '16 12:05

tozak


1 Answers

TL:DR: can't reproduce that insane compiler output. Maybe the surrounding code + LTO did it?

I do have suggestions to improve the code: see the stuff below about copying whole structs instead of copying many individual members.


The question you linked is about accessing a value-type global directly vs. through a global pointer. On ARM, where it takes multiple instructions or a load from a nearby constant to get an arbitrary 32bit pointer into a register, passing around pointers is better than having each function reference a global directly.

See this example on the Godbolt Compiler Explorer (ARM gcc 4.8.2 -O3)

/* Three-int record used by the access examples. */
struct example {
  int a;
  int b;
  int c;
};

/* File-scope instance that load_global() reads directly. */
struct example global_example;

/* Direct access: returns member c of the file-scope object global_example. */
int load_global(void)
{
  return global_example.c;
}
        movw    r3, #:lower16:global_example    @ tmp113,
        movt    r3, #:upper16:global_example    @ tmp113,
        ldr     r0, [r3, #8]      @, global_example.c
        bx      lr  @
/* Indirect access: returns member c of the struct example that p points to. */
int load_pointer(struct example *p)
{
  return p->c;
}
        ldr     r0, [r0, #8]      @, p_2(D)->c
        bx      lr  @

(Apparently gcc is horrible at passing structs by val as function args, see the code for byval(struct example by_val) on the godbolt link.)

Even worse is if you have a global pointer: first you have to load the value of the pointer, then another load to dereference it. This is the indirection overhead that was being discussed in the question you linked. If both loads miss in cache, you're paying the round-trip latency twice. The load address for the 2nd load isn't available until the first load completes, so no pipelining of those memory requests is possible even on an out-of-order CPU.

If you already have a pointer as an arg, it will be in a register. Dereferencing it is the same as loading from a global. (But better, because you don't need to get the global's address into a register yourself.)


Your real code

I can't reproduce your massive asm output with ARM gcc 4.8.2 on Godbolt, or locally with ARM gcc 5.2.1. I'm not using LTO, though, since I don't have a complete test program.

All I can see is just slightly larger code to do some index math.

bfi is Bitfield Insert. I think 144: e7df9813 bfi r9, r3, #16, #16 is setting the top half of r9 = low half of r3. I don't see how that and mla (integer mul-accumulate) make much sense. Other than perverse results from -ftree-vectorize, all I can think of is maybe -fno-gcse has a really bad impact for the version of gcc you tested.

Is it manipulating constants that are going to be stored? The code you actually posted #defines everything to 0, which gcc takes advantage of. (It also takes advantage of the fact that it already has 1 in a register if curr_cnt == 1, and stores that register for the second_file_ptr->redundancy_version = 1;). ARM doesn't have a str [mem], immediate or anything like x86's mov [mem], imm.

If your compiler output is from code with different values for those constants, the compiler would be doing more work to store different things.

Unfortunately gcc is bad at merging narrow stores into a single wider store (long-standing missed-optimization bug). For x86, clang does this in at least one case, storing 0x0100 (256) instead of a 0 and a 1. (Check on Godbolt by flipping the compiler to clang 3.7.1 or something, and removing the ARM-specific compiler args.) There's a mov word ptr [rsi], 256 where gcc uses

    mov     BYTE PTR [rsi], 0 # *first_file_ptr_23(D).status,
    mov     BYTE PTR [rsi+1], 1       # *first_file_ptr_23(D).type,

If you arranged your structs carefully, there would be more opportunities for copying 4B blocks in this function.

It might also help to have two identical sub-structs of curr and not-curr, instead of curr_size and size. You might have to declare it packed to avoid padding after the sub-structs, though. Your two groups of members aren't in exactly the same order, which prevents compilers from doing much block-copying anyway when you do a bunch of assignments.


It helps gcc and clang copy multiple bytes at once if you do:

/*
 * Suggested layout: the seven settings are grouped into one packed
 * sub-struct type (struct stuff) that is instantiated twice, as curr and as
 * non_curr, with the members in the same order in both. A whole group can
 * then be copied with a single struct assignment instead of seven member
 * assignments.
 *
 * struct cc_file_s is declared outside this snippet.
 */
struct output_s_optimized {
   /* packed: no padding is inserted inside or after the 7-byte group */
   struct __attribute__((packed)) stuff {
       unsigned char cnt,
                    mode,
                    type,
                    size,
                    allocation_type,
                    allocation_localized,
                    mode_enable;
   } curr;  // 7B
   /* one extra byte after curr: curr + curr_id occupy 8 bytes together */
   unsigned char    curr_id;  // no non-curr id?

   /* second instance of the same 7-byte group */
   struct stuff non_curr;
   /* non_curr (7B) + layer_cnt (1B) make up the next 8 bytes */
   unsigned char layer_cnt;
                              // Another 8 byte boundary here
   unsigned char total_layer_cnt;

   struct cc_file_s cc_file[128];    
};

/*
 * Whole-struct version of bar(): clears curr_id and refreshes the "current"
 * settings from the stored ones with one struct assignment, which lets the
 * compiler copy the 7-byte group in a few wide moves.
 *
 * The copy goes non_curr -> curr, the same direction as the member-wise
 * assignments in bar(), so the two functions perform the same update and
 * their generated code can be compared directly.
 */
void foo(struct output_s_optimized *p) {
  p->curr_id = 0;
  p->curr = p->non_curr;
}
/*
 * Member-by-member version: clears curr_id, then copies each of the seven
 * members of output_ptr->non_curr into the same-named member of
 * output_ptr->curr with its own assignment statement.
 *
 * Kept deliberately as individual assignments; it is the counterpart to the
 * single struct assignment in foo() for comparing compiler output.
 */
void bar(struct output_s_optimized *output_ptr) {
  output_ptr->curr_id = 0;
  /* one 1-byte member per statement, in declaration order of struct stuff */
  output_ptr->curr.cnt                        = output_ptr->non_curr.cnt;
  output_ptr->curr.mode                       = output_ptr->non_curr.mode;
  output_ptr->curr.type                       = output_ptr->non_curr.type;
  output_ptr->curr.size                       = output_ptr->non_curr.size;
  output_ptr->curr.allocation_type            = output_ptr->non_curr.allocation_type;
  output_ptr->curr.allocation_localized       = output_ptr->non_curr.allocation_localized;
  output_ptr->curr.mode_enable                = output_ptr->non_curr.mode_enable;
}

gcc 4.8.2 compiles foo() to three copies: byte, 2B, and 4B, even on ARM. It compiles bar() to eight 1B copies, and so does clang-3.8 on x86. So copying whole structs can help your compiler a lot (as well as making sure the data to be copied is arranged in the same order in both locations).


the same code on x86: nothing new

You can use -fverbose-asm to put comments on each line. For x86, the compiler output from gcc 6.1 -O3 is very similar between versions, as you can see on the Godbolt Compiler Explorer. x86 addressing modes can index a global variable directly, so you see stuff like

movzx   edi, BYTE PTR [rcx+10]        # *output_ptr_7(D)._mode
# where rcx is the output_ptr arg, used directly

vs.

movzx   ecx, BYTE PTR output_file[rdi+10]     # output_file[output_index_7(D)]._mode
# where rdi = output_index * 1297  (sizeof(output_file[0])), calculated once at the start

(gcc apparently doesn't care that each instruction has a 4B displacement as part of the addressing mode, but this is an ARM question so I won't go into the tradeoffs between code-size and insn count with x86's variable-length insns.)

like image 105
Peter Cordes Avatar answered Oct 15 '22 20:10

Peter Cordes