Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Does arm-none-eabi-gcc produce slower code than Keil uVision

Tags:

c

gcc

arm

stm32

I have a simple blinking led program running on STM32f103C8 (without initialization boilerplate):

/* Busy-wait delay: counts i from 0 up to 2000000 with an empty loop body.
   i is declared volatile so every read and write of the counter has to be
   performed, which keeps the otherwise empty loop from being optimized out. */
void soft_delay(void) {
    for (volatile uint32_t i=0; i<2000000; ++i) { }
}

  uint32_t iters = 0;
  while (1)
  {
    LL_GPIO_TogglePin(LED_GPIO_Port, LED_Pin);
    soft_delay();
    ++iters;
  }

It was compiled with both Keil uVision v.5 (default compiler) and CLion using arm-none-eabi-gcc compiler. The surprise is that arm-none-eabi-gcc program runs 50% slower in Release mode (-O2 -flto) and 100% slower in Debug mode.

I suspect 3 reasons:

  • Keil over-optimization (unlikely, because the code is very simple)

  • arm-none-eabi-gcc under-optimization due to wrong compiler flags (I use the CLion Embedded plugin's CMakeLists.txt)

  • A bug in the initialization so that chip has lower clock frequency with arm-none-eabi-gcc (to be investigated)

I have not yet dived into the jungles of optimization and disassembling, I hope that there are many experienced embedded developers who already encountered this issue and have the answer.

UPDATE 1

Playing around with different optimization levels of Keil ArmCC, I see how it affects the generated code. And it affects drastically, especially execution time. Here are the benchmarks and disassembly of soft_delay() function for each optimization level (RAM and Flash amounts include initialization code).

-O0: RAM: 1032, Flash: 1444, Execution Time (20 iterations): 18.7 sec

soft_delay PROC
        PUSH     {r3,lr}
        MOVS     r0,#0
        STR      r0,[sp,#0]
        B        |L6.14|
|L6.8|
        LDR      r0,[sp,#0]
        ADDS     r0,r0,#1
        STR      r0,[sp,#0]
|L6.14|
        LDR      r1,|L6.24|
        LDR      r0,[sp,#0]
        CMP      r0,r1
        BCC      |L6.8|
        POP      {r3,pc}
        ENDP

-O1: RAM: 1032, Flash: 1216, Execution Time (20 iterations): 13.3 sec

soft_delay PROC
        PUSH     {r3,lr}
        MOVS     r0,#0
        STR      r0,[sp,#0]
        LDR      r0,|L6.24|
        B        |L6.16|
|L6.10|
        LDR      r1,[sp,#0]
        ADDS     r1,r1,#1
        STR      r1,[sp,#0]
|L6.16|
        LDR      r1,[sp,#0]
        CMP      r1,r0
        BCC      |L6.10|
        POP      {r3,pc}
        ENDP

-O2 -Otime: RAM: 1032, Flash: 1136, Execution Time (20 iterations): 9.8 sec

soft_delay PROC
        SUB      sp,sp,#4
        MOVS     r0,#0
        STR      r0,[sp,#0]
        LDR      r0,|L4.24|
|L4.8|
        LDR      r1,[sp,#0]
        ADDS     r1,r1,#1
        STR      r1,[sp,#0]
        CMP      r1,r0
        BCC      |L4.8|
        ADD      sp,sp,#4
        BX       lr
        ENDP

-O3: RAM: 1032, Flash: 1176, Execution Time (20 iterations): 9.9 sec

soft_delay PROC
        PUSH     {r3,lr}
        MOVS     r0,#0
        STR      r0,[sp,#0]
        LDR      r0,|L5.20|
|L5.8|
        LDR      r1,[sp,#0]
        ADDS     r1,r1,#1
        STR      r1,[sp,#0]
        CMP      r1,r0
        BCC      |L5.8|
        POP      {r3,pc}
        ENDP

TODO: benchmarking and disassembly for arm-none-eabi-gcc.

like image 273
Fedorov7890 Avatar asked Nov 16 '25 23:11

Fedorov7890


1 Answers

This second answer is a demonstration of the kinds of things that can affect the performance results the OP may be seeing, with examples of how to test for them on an STM32F103C8 blue pill.

Complete source code:

flash.ld

MEMORY
{
    rom : ORIGIN = 0x08000000, LENGTH = 0x1000
    ram : ORIGIN = 0x20000000, LENGTH = 0x1000
}
SECTIONS
{
    .text : { *(.text*) } > rom
    .rodata : { *(.rodata*) } > rom
    .bss : { *(.bss*) } > ram
}

flash.s

/* Startup code: a four-word table placed first in rom, the reset/hang
   handlers, and three tiny helpers that are called from C. */
.cpu cortex-m0
.thumb

/* Table at the start of the image (presumably the Cortex-M vector table:
   initial stack pointer, reset vector, then two exception vectors that
   both point at hang -- confirm against the architecture manual). */
.thumb_func
.global _start
_start:
/* 0x20001000 is the end of the ram region declared in flash.ld
   (ORIGIN 0x20000000 + LENGTH 0x1000). */
stacktop: .word 0x20001000
.word reset
.word hang
.word hang

/* reset: call the C entry point; if it ever returns, fall into hang. */
.thumb_func
reset:
    bl notmain
    b hang
/* hang: branch to self forever. */
.thumb_func
hang:   b .

.align

/* void PUT32(unsigned int addr, unsigned int value): store r1 at address r0. */
.thumb_func
.globl PUT32
PUT32:
    str r1,[r0]
    bx lr

/* unsigned int GET32(unsigned int addr): load the word at address r0 into r0. */
.thumb_func
.globl GET32
GET32:
    ldr r0,[r0]
    bx lr

/* void dummy(unsigned int): returns immediately; exists so C code has an
   external function to call. */
.thumb_func
.globl dummy
dummy:
    bx lr

test.s

.cpu cortex-m0
.thumb

.word 0,0,0
.word 0,0,0,0

.thumb_func
.globl TEST
TEST:
    bx lr

notmain.c

//PA9  TX
//PA10 RX

void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
void dummy ( unsigned int );

#define USART1_BASE 0x40013800
#define USART1_SR   (USART1_BASE+0x00)
#define USART1_DR   (USART1_BASE+0x04)
#define USART1_BRR  (USART1_BASE+0x08)
#define USART1_CR1  (USART1_BASE+0x0C)
#define USART1_CR2  (USART1_BASE+0x10)
#define USART1_CR3  (USART1_BASE+0x14)
//#define USART1_GTPR (USART1_BASE+0x18)
#define GPIOA_BASE  0x40010800
#define GPIOA_CRH   (GPIOA_BASE+0x04)
#define RCC_BASE    0x40021000
#define RCC_APB2ENR (RCC_BASE+0x18)

#define STK_CSR     0xE000E010
#define STK_RVR     0xE000E014
#define STK_CVR     0xE000E018
#define STK_MASK    0x00FFFFFF

static void uart_init ( void )
{
    //baud rate setup below assumes an 8MHz clock; target is 115200 8N1
    unsigned int reg;

    //set the GPIOA (bit 2) and USART1 (bit 14) bits in RCC_APB2ENR
    reg=GET32(RCC_APB2ENR);
    reg|=(1<<2)|(1<<14);
    PUT32(RCC_APB2ENR,reg);

    //GPIOA_CRH bits 4..11 are rewritten for pa9 and pa10:
    //pa9 TX  alternate function output push-pull
    //pa10 RX configure as input floating
    reg=GET32(GPIOA_CRH);
    reg=(reg&~(0xFF0))|0x490;
    PUT32(GPIOA_CRH,reg);

    PUT32(USART1_CR1,0x2000);
    PUT32(USART1_CR2,0x0000);
    PUT32(USART1_CR3,0x0000);
    //divider: 8000000/16 = 500000, 500000/115200 = 4.34
    //closest setting is 4 and 5/16 = 4.3125 (0x45)
    //4.3125 * 16 * 115200 = 7948800
    PUT32(USART1_BRR,0x0045);
    PUT32(USART1_CR1,0x200C);
}
static void uart_putc ( unsigned int c )
{
    //poll USART1_SR until bit 7 (0x80) is set, then write the character
    //to the data register
    while((GET32(USART1_SR)&0x80)==0) continue;
    PUT32(USART1_DR,c);
}
static void hexstrings ( unsigned int d )
{
    unsigned int shift;
    unsigned int nibble;

    //send d as eight hex digits, most significant nibble first,
    //followed by a single space
    for(shift=32;shift!=0;)
    {
        shift-=4;
        nibble=(d>>shift)&0xF;
        //0..9 map to '0'..'9' (+0x30), 10..15 map to 'A'..'F' (+0x37)
        uart_putc(nibble>9 ? nibble+0x37 : nibble+0x30);
    }
    uart_putc(0x20);
}
static void hexstring ( unsigned int d )
{
    //hex digits plus trailing space, then carriage return and line feed
    hexstrings(d);
    uart_putc('\r');
    uart_putc('\n');
}

//Busy-wait delay taken from the question: an empty loop counting i from 0
//up to 2000000. The volatile counter forces every access to be performed,
//so the loop is not removed by the optimizer.
void soft_delay(void) {
    for (volatile unsigned int i=0; i<2000000; ++i) { }
}

int notmain ( void )
{
    //systick setup: control=4, reload=0x00FFFFFF (full 24 bits),
    //current value cleared, then control=5
    //(presumably 4 picks the clock source with the counter off and 5
    //turns the counter on -- confirm against the systick documentation)
    PUT32(STK_CSR,4);
    PUT32(STK_RVR,0x00FFFFFF);
    PUT32(STK_CVR,0x00000000);
    PUT32(STK_CSR,5);

    uart_init();

    //known pattern first, then two back to back reads of the systick
    //current value register
    hexstring(0x12345678);
    hexstring(GET32(STK_CVR));
    hexstring(GET32(STK_CVR));

    return 0;
}

build

arm-none-eabi-as --warn --fatal-warnings -mcpu=cortex-m3 flash.s -o flash.o
arm-none-eabi-as --warn --fatal-warnings -mcpu=cortex-m3 test.s -o test.o
arm-none-eabi-gcc -Wall -Werror -O2 -nostdlib -nostartfiles -ffreestanding  -mthumb -mcpu=cortex-m0 -march=armv6-m -c notmain.c -o notmain.thumb.o
arm-none-eabi-ld -o notmain.thumb.elf -T flash.ld flash.o test.o notmain.thumb.o
arm-none-eabi-objdump -D notmain.thumb.elf > notmain.thumb.list
arm-none-eabi-objcopy notmain.thumb.elf notmain.thumb.bin -O binary
arm-none-eabi-gcc -Wall -Werror -O2 -nostdlib -nostartfiles -ffreestanding  -mthumb -mcpu=cortex-m3 -march=armv7-m -c notmain.c -o notmain.thumb2.o
arm-none-eabi-ld -o notmain.thumb2.elf -T flash.ld flash.o test.o notmain.thumb2.o
arm-none-eabi-objdump -D notmain.thumb2.elf > notmain.thumb2.list
arm-none-eabi-objcopy notmain.thumb2.elf notmain.thumb2.bin -O binary

uart output as shown

12345678 
00FFE445 
00FFC698

If I take your code, make it shorter, don't have all day.

/* Shortened version of the delay under test: same empty volatile loop,
   but only 0x2000 iterations so each measurement finishes quickly. */
void soft_delay(void) {
    for (volatile unsigned int i=0; i<0x2000; ++i) { }
}

arm-none-eabi-gcc -c -O0 -mthumb -mcpu=cortex-m0 hello.c -o hello.o

yes I know this is an m3

arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 5.4.0

gives

00000000 <soft_delay>:
   0:   b580        push    {r7, lr}
   2:   b082        sub sp, #8
   4:   af00        add r7, sp, #0
   6:   2300        movs    r3, #0
   8:   607b        str r3, [r7, #4]
   a:   e002        b.n 12 <soft_delay+0x12>
   c:   687b        ldr r3, [r7, #4]
   e:   3301        adds    r3, #1
  10:   607b        str r3, [r7, #4]
  12:   687b        ldr r3, [r7, #4]
  14:   4a03        ldr r2, [pc, #12]   ; (24 <soft_delay+0x24>)
  16:   4293        cmp r3, r2
  18:   d9f8        bls.n   c <soft_delay+0xc>
  1a:   46c0        nop         ; (mov r8, r8)
  1c:   46bd        mov sp, r7
  1e:   b002        add sp, #8
  20:   bd80        pop {r7, pc}
  22:   46c0        nop         ; (mov r8, r8)
  24:   00001fff    

first check the test infrastructure

.cpu cortex-m0
.thumb

.align 8
.word 0,0


/* unsigned int TEST(unsigned int timer_addr, unsigned int loops)
   r0 = address of the systick current value register
   r1 = number of times to call the code under test
   Returns (in r0) the start count minus the end count; the timer counts
   down, so that is the number of ticks spent in the loop. */
.thumb_func
.globl TEST
TEST:
    /* r4,r5,r6 are needed across the call; pushing them with lr keeps an
       even number of registers on the stack */
    push {r4,r5,r6,lr}
    /* r0-r3 may be clobbered by the callee, so keep the arguments in
       non-volatile registers */
    mov r4,r0
    mov r5,r1
    /* sample the timer immediately before the loop */
    ldr r6,[r4]
inner:
    bl soft_delay
    sub r5,#1
    bne inner
    /* sample the timer immediately after the loop */
    ldr r3,[r4]
    /* down counter: begin - end; no 24 bit mask is applied here, which
       assumes the counter did not roll over during the test */
    sub r0,r6,r3
    /* popping pc returns to the C caller */
    pop {r4,r5,r6,pc}

.align 8

soft_delay:
    bx lr

in the openocd telnet window

reset halt
flash write_image erase notmain.thumb.elf
reset

gives

12345678 
00001B59 

7001 clocks, assuming the systick matches the cpu, thats 7001 arm clocks, 4 instructions per loop.

Step back note I aligned some things

08000108 <TEST>:
 8000108:   b570        push    {r4, r5, r6, lr}
 800010a:   1c04        adds    r4, r0, #0
 800010c:   1c0d        adds    r5, r1, #0
 800010e:   6826        ldr r6, [r4, #0]

08000110 <inner>:
 8000110:   f000 f876   bl  8000200 <soft_delay>
 8000114:   3d01        subs    r5, #1
 8000116:   d1fb        bne.n   8000110 <inner>
 8000118:   6823        ldr r3, [r4, #0]
 800011a:   1af0        subs    r0, r6, r3
 800011c:   bd70        pop {r4, r5, r6, pc}

08000200 <soft_delay>:
 8000200:   4770        bx  lr

both loops are nicely aligned.

Now if I do this:

0800010a <TEST>:
 800010a:   b570        push    {r4, r5, r6, lr}
 800010c:   1c04        adds    r4, r0, #0
 800010e:   1c0d        adds    r5, r1, #0
 8000110:   6826        ldr r6, [r4, #0]

08000112 <inner>:
 8000112:   f000 f875   bl  8000200 <soft_delay>
 8000116:   3d01        subs    r5, #1
 8000118:   d1fb        bne.n   8000112 <inner>
 800011a:   6823        ldr r3, [r4, #0]
 800011c:   1af0        subs    r0, r6, r3
 800011e:   bd70        pop {r4, r5, r6, pc}

Simply changing the alignment of the code that is supposed to be testing the code under test I now get:

00001F40

8000 ticks to do that loop 1000 times with that call with the code function under test still being aligned

08000200 <soft_delay>:
 8000200:   4770        bx  lr

About the .align 8: in general, don't use .align with a number in the GNU assembler, because its behavior does not translate across targets; .balign is better. Anyway, I used it here. The two words are there because the .align made TEST aligned, but inner is what I wanted aligned, so I added two words to make it aligned.

.align 8
.word 0,0

nop

.thumb_func
.globl TEST
TEST:
    push {r4,r5,r6,lr}
    mov r4,r0
    mov r5,r1
    ldr r6,[r4]
inner:
    bl soft_delay
    sub r5,#1
    bne inner
    ldr r3,[r4]
    sub r0,r6,r3
    pop {r4,r5,r6,pc}

A little code review to make sure I didn't make a mistake here.

r0 is the systick current value register
r1 is the number of loops I want to run the code under test

The calling convention allows for r0-r3 to be clobbered so I need to move r0 and r1 to non-volatile registers (per the calling convention).

I want to sample the time the instruction before the loop and the instruction after.

so I need two registers for r0 and r1 and a register to store the begin time so r4,r5,r6 and that fits in nicely to have an even number of registers pushed on the stack. Have to preserve lr so we can return.

we can now safely call soft_delay in the loop, subtract the count, branch if not equal to inner, once the count is done read the timer in r3. from output above this is a down counter so subtract end from beginning, technically since this is a 24 bit counter I should and with 0x00FFFFFF to correctly do that subtraction, but because this isn't going to roll over I can assume out that operation. result/return value goes in r0, pop everything which includes popping the pc to do the return to the C calling function which prints out r0's value.

I think the test code is good.

reading the CPUID register

411FC231 

So that means r1p1, while the TRM I am using is written for r2p1 you have to be very careful to use the right document but also sometimes use the current document or all the ones in between if available to see what changed.

ICode memory interface

Instruction fetches from Code memory space 0x00000000 to 0x1FFFFFFF are performed over the 32-bit AHB-Lite bus. The Debugger cannot access this interface. All fetches are word-wide. The number of instructions fetched per word depends on the code running and the alignment of the code in memory.

Sometimes in ARM TRMs you see the fetch info up top near the processor features, this tells me what I wanted to know.

08000112 <inner>:
 8000112:   f000 f875   bl  8000200 <soft_delay>
 8000116:   3d01        subs    r5, #1
 8000118:   d1fb        bne.n   8000112 <inner>

this requires a fetch at 110, 114 and 118.

08000110 <inner>:
 8000110:   f000 f876   bl  8000200 <soft_delay>
 8000114:   3d01        subs    r5, #1
 8000116:   d1fb        bne.n   8000110 <inner>

This a fetch at 110 and 114, but not one at 118, so that extra fetch could be our added clock. the m3 was the first publicly available one and it has a lot of features in the core that went away and similar ones came back. Some of the smaller cores fetch differently and you don't see this alignment issue. with bigger cores like full sized ones they fetch sometimes 4 or 8 instructions at a time and you have to change your alignment even more to hit the boundary but you can hit the boundary and since it is 2 or 4 clocks plus bus overhead for the extra fetch you can see those.

If I put two nops

nop
nop

.thumb_func
.globl TEST
TEST:

gives

08000114 <inner>:
 8000114:   f000 f874   bl  8000200 <soft_delay>
 8000118:   3d01        subs    r5, #1
 800011a:   d1fb        bne.n   8000114 <inner>
 800011c:   6823        ldr r3, [r4, #0]
 800011e:   1af0        subs    r0, r6, r3
 8000120:   bd70        pop {r4, r5, r6, pc}

gives

00001B59 

So that's good we are back to that number, could try a few more to confirm but it appears that alignment is sensitive to our outer test loop, which is bad, but we can manage that, don't change it it won't affect the test. If I didn't care about alignment and had something like this:

/* Code under test: empty volatile loop of 0x2000 iterations. Defined in the
   same file as its caller, so the compiler is free to inline it there. */
void soft_delay(void) {
    for (volatile unsigned int i=0; i<0x2000; ++i) { }
}
int notmain ( void )
{
    unsigned int ra;
    unsigned int beg;
    unsigned int end;

    PUT32(STK_CSR,4);
    PUT32(STK_RVR,0x00FFFFFF);
    PUT32(STK_CVR,0x00000000);
    PUT32(STK_CSR,5);

    uart_init();
    hexstring(0x12345678);
    beg=GET32(STK_CVR);
    for(ra=0;ra<1000;ra++)
    {
        soft_delay();
    }
    end=GET32(STK_CVR);
    hexstring((beg-end)&0x00FFFFFF);
    return(0);
}

Then, as I played with optimization options and with different compilers, any change in the program/binary in front of the test loop would/could move the test loop and change its performance. In my simple example it was a 14% performance difference; that's massive if you are doing performance tests. If we let the compiler take care of all this without being in control ourselves, everything in front of the function under test can mess with the function under test. As written above, the compiler might also opt to inline the function rather than call it, which makes for an even more interesting situation: the test loop is probably not as clean as mine (certainly not if unoptimized), and now the code under test is itself dynamic as options or alignments change.

I'm very happy you happened to be using this core/chip...

If I re-align inner and now mess with this

.align 8
nop
soft_delay:
    bx lr

08000202 <soft_delay>:
 8000202:   4770        bx  lr

it's a single instruction which is fetched at 0x200 from what we have read and seem to be able to tell. wouldn't expect this to change anything and it didn't

00001B59

but now that we know what we know, we can use our experience to mess with this trivial Not interesting at all example.

.align 8
nop
soft_delay:
    nop
    bx lr

gives

00001F41

as expected. and we can have even more fun:

.align 8
.word 0,0
nop
.thumb_func
.globl TEST
TEST:

combined gives

08000112 <inner>:
 8000112:   f000 f876   bl  8000202 <soft_delay>
 8000116:   3d01        subs    r5, #1
 8000118:   d1fb        bne.n   8000112 <inner>

08000202 <soft_delay>:
 8000202:   46c0        nop         ; (mov r8, r8)
 8000204:   4770        bx  lr

no surprise if you know what you are doing:

00002328 

9000 clocks, 29% performance difference. we are literally talking about 5 (technically 6) instructions, same exact machine code and by simply changing alignment the performance can be 29% different, compiler and options have nothing to do with it, yet, have not even gotten there.

How can we expect to do any kind of performance evaluation of a program using the "time the code a bunch of times in a loop" method? We can't, unless we know what we are doing, have an understanding of the architecture, etc.

Now, as should be obvious from reading the documentation, I am using the internal 8 MHz clock; everything is derived from that, so the systick times are not going to vary from run to run as you might see with DRAM, for example. The LATENCY bits in the FLASH_ACR register should have defaulted to zero wait states, for 0 < SYSCLK <= 24 MHz. If I were to bump up the clock above 24 MHz, the processor is running faster but the flash is now slower relative to the processor.

Without messing with the clocks and simply adding a wait state by changing the FLASH_ACR register to 0x31.

000032C6 

12998 up from 9000, I didn't expect it to double necessarily and it didn't.

Hmm for fun make a PUT16 using strh, and

.thumb_func
.globl HOP
HOP:
    bx r2

and

PUT16(0x2000010a,0xb570); // 800010a:  b570        push    {r4, r5, r6, lr}
PUT16(0x2000010c,0x1c04); // 800010c:  1c04        adds    r4, r0, #0
PUT16(0x2000010e,0x1c0d); // 800010e:  1c0d        adds    r5, r1, #0
PUT16(0x20000110,0x6826); // 8000110:  6826        ldr r6, [r4, #0]

PUT16(0x20000112,0xf000); // 8000112:  f000 f876   bl  8000202 <soft_delay>
PUT16(0x20000114,0xf876); // 8000112:  f000 f876   bl  8000202 <soft_delay>

PUT16(0x20000116,0x3d01); // 8000116:  3d01        subs    r5, #1
PUT16(0x20000118,0xd1fb); // 8000118:  d1fb        bne.n   8000112 <inner>
PUT16(0x2000011a,0x6823); // 800011a:  6823        ldr r3, [r4, #0]
PUT16(0x2000011c,0x1af0); // 800011c:  1af0        subs    r0, r6, r3
PUT16(0x2000011e,0xbd70); // 800011e:  bd70        pop {r4, r5, r6, pc}

PUT16(0x20000202,0x46c0); // 8000202:  46c0        nop         ; (mov r8, r8)
PUT16(0x20000204,0x4770); // 8000204:  4770        bx  lr

    hexstring(HOP(STK_CVR,1000,0x2000010B));

gives 0000464B

and that was not at all expected. but is 18,000 basically

Putting ram to bed after this

// Same machine code as the previous RAM experiment, copied two bytes lower
// so that the loop at "inner" starts at 0x20000110, a word boundary.
// The trailing comments show the original flash listing for reference; the
// RAM address of each halfword is the first argument of PUT16.
PUT16(0x20000108,0xb570); // 800010a:  b570        push    {r4, r5, r6, lr}
PUT16(0x2000010a,0x1c04); // 800010c:  1c04        adds    r4, r0, #0
PUT16(0x2000010c,0x1c0d); // 800010e:  1c0d        adds    r5, r1, #0
PUT16(0x2000010e,0x6826); // 8000110:  6826        ldr r6, [r4, #0]

PUT16(0x20000110,0xf000); // 8000112:  f000 f876   bl  8000202 <soft_delay>
PUT16(0x20000112,0xf876); // 8000112:  f000 f876   bl  8000202 <soft_delay>

PUT16(0x20000114,0x3d01); // 8000116:  3d01        subs    r5, #1
PUT16(0x20000116,0xd1fb); // 8000118:  d1fb        bne.n   8000112 <inner>
PUT16(0x20000118,0x6823); // 800011a:  6823        ldr r3, [r4, #0]
PUT16(0x2000011a,0x1af0); // 800011c:  1af0        subs    r0, r6, r3
PUT16(0x2000011c,0xbd70); // 800011e:  bd70        pop {r4, r5, r6, pc}

// soft_delay moves back by two bytes as well: nop at 0x20000200 and bx lr
// at 0x20000202. The bx lr must not be written to 0x20000200, or it
// replaces the nop and the code under test is no longer the same.
// NOTE(review): re-measure after this correction; a timing taken with both
// stores aimed at 0x20000200 was for a soft_delay without the nop.
PUT16(0x20000200,0x46c0); // 8000202:  46c0        nop         ; (mov r8, r8)
PUT16(0x20000202,0x4770); // 8000204:  4770        bx  lr
// entry point is the RAM copy of TEST at 0x20000108 with bit 0 set
// (presumably the Thumb bit required by bx in HOP)
hexstring(HOP(STK_CVR,1000,0x20000109));

00002EDE

The machine code did not change because I moved both back by 2 so the relative address between them was the same. Note that bl is two separate instructions not one 32 bit one. You cant see this in the newer docs you need to go back to the original/early ARM ARM where it is explained. And it is easy to do experiments where you split the two instructions and put other stuff in between and they work just fine, because they are two separate instructions.

At this point the reader should be able to make a 2 instruction test loop, time it and dramatically change the performance of the execution of those two instructions on this platform using the same exact machine code.

So let's try the volatile loop that you wrote.

.align 8

soft_delay:
    push {r7, lr}
    sub sp, #8
    add r7, sp, #0
    mov r3, #0
    str r3, [r7, #4]
    b L12
 Lc:
    ldr r3, [r7, #4]
    add r3, #1
    str r3, [r7, #4]
 L12:
    ldr r3, [r7, #4]
    ldr r2, L24
    cmp r3, r2
    bls Lc
    nop
    mov sp, r7
    add sp, #8
    pop {r7, pc}
    nop

 .align
 L24:   .word 0x1FFF

this is I believe the unoptimized -O0 version. starting off with one test loop

hexstring(TEST(STK_CVR,1));

From experience: if the times we are measuring overflow our 24-bit counter, the results will be very strange or lead to false conclusions.

0001801F

98,000, quick check for safety:

.align
 L24:   .word 0x1F

0000019F

not bad that is on par with 256 times faster.

so we have some wiggle room in our test loop but not much try 10

hexstring(TEST(STK_CVR,10));

000F012D

98334 ticks per loop.

changing the alignment

08000202 <soft_delay>:
 8000202:   b580        push    {r7, lr}
 8000204:   b082        sub sp, #8

gave the same result

000F012D

not unheard of, you can examine the differences if you want count through each instruction check fetch cycles, etc.

had I made the test:

soft_delay:
  nop
  nop
  bx lr

it's two fetch cycles no matter what the alignment, just as when I had left it as bx lr with no nops, as we saw. So by simply having an odd number of instructions in the test, alignment won't affect the results from fetches alone. But note that, from what we know now, had some other code in the program moved the outer timing/test loop, that may have changed performance, and the results may show a difference between two tests that was purely due to the timing code and not the code under test (read Michael Abrash).

The cortex-m3 is based on the armv7-m architecture. If I change the compiler from -mcpu=cortex-m0 (all cortex-m compatible so far) to -mcpu=cortex-m3 (not all cortex-m compatible will break on half of them) it produces a little bit less code.

.align 8

soft_delay:
    push {r7}
    sub  sp, #12
    add  r7, sp, #0
    movs r3, #0
    str  r3, [r7, #4]
    b L12
Lc:
    ldr r3, [r7, #4]
    add r3, #1
    str r3, [r7, #4]
L12:
    ldr r3, [r7, #4]
    /*14:   f5b3 5f00   cmp.w   r3, #8192   ; 0x2000*/
    //cmp.w r3, #8192
    .word 0x5f00f5b3
    bcc Lc
    nop
    add r7, #12
    mov sp, r7
    pop {r7}
    bx  lr

000C80FB 81945 ticks for the code under test.

I hate unified syntax, that was a massive mistake, so I fumble along in legacy mode. thus the .word thing there in the middle.

As part of writing this I kinda messed up my system in order to demonstrate something. I was building a gcc 5.4.0 but overwrote my 9.2.0 so had to re-build both.

2.95 was the version I started using with ARM, and it didn't support thumb; gcc 3.x.x was the first to. And either gcc 4.x.x or gcc 5.x.x produced "slower" code for some of my projects. At work we are currently moving from Ubuntu 16.04 to 18.04 for our build systems, which, if you use the apt-get cross compiler for ARM, moves you from 5.x.x to 7.x.x, and it is making larger binaries for the same source code. Where we are tight on memory it is pushing us beyond what's available, so we have to either remove some code (easiest is to make the printed messages shorter, cut text out) or stick to the older compiler by building our own or apt-getting the older one. 19.10 no longer offers the 5.x.x version.

So both are now built.

  18:   d3f8        bcc.n   c <soft_delay+0xc>
  1a:   bf00        nop
  1c:   bf00        nop
  1e:   370c        adds    r7, #12

these nops after bcc are baffling to me...

  18:   d3f8        bcc.n   c <soft_delay+0xc>
  1a:   bf00        nop
  1c:   370c        adds    r7, #12

gcc 5.4.0 is putting one, gcc 9.2.0 is putting two nops, ARM doesn't have the branch shadow thing of MIPS (MIPS doesn't currently either).

000C80FB gcc 5.4.0
000C8105 gcc 9.2.0

I call the function 10 times, the nop is outside the code under tests loop so has a lesser effect.

Optimized all cortex-m variants (to date) using gcc 9.2.0

soft_delay:
    mov r3, #0
    mov r2, #128
    sub sp, #8
    str r3, [sp, #4]
    ldr r3, [sp, #4]
    lsl r2, r2, #6
    cmp r3, r2
    bcs L1c
L10:
    ldr r3, [sp, #4]
    add r3, #1
    str r3, [sp, #4]
    ldr r3, [sp, #4]
    cmp r3, r2
    bcc L10
L1c:
    add sp, #8
    bx  lr

(also understand that not all say gcc 9.2.0 builds produce the same code when you build the compiler you have options and those options can affect the output making different builds of 9.2.0 possibly producing different results)

000C80B5

gcc 9.2.0 built for cortex-m3:

soft_delay:
    mov r3, #0
    sub sp, #8
    str r3, [sp, #4]
    ldr r3, [sp, #4]
/*8:  f5b3 5f00   cmp.w   r3, #8192   ; 0x2000*/
    .word 0x5F00F5B3
    bcs L1c
Le:
    ldr r3, [sp, #4]
    add r3, #1
    str r3, [sp, #4]
    ldr r3, [sp, #4]
/*16: f5b3 5f00   cmp.w   r3, #8192   ; 0x2000*/
    .word 0x5F00F5B3
    bcc Le
L1c:
    add sp, #8
    bx  lr

000C80A1

That's in the noise. despite the code built has differences. they simply didn't gain in comparing the 0x2000 in fewer instructions. and note if you change that 0x2000 to some other number then that does not simply make the loop take that much longer it can change the generated code for architectures like this.

How I like to make these counted delay loops is to use a function outside the compile domain

/* Counted delay without volatile: dummy() is defined outside this
   compilation unit (a bare "bx lr" in flash.s), so the compiler cannot see
   into it and has to keep the loop and make the call on every one of the
   0x2000 iterations. */
extern void dummy ( unsigned int );
void soft_delay(void) {
    for (unsigned int i=0; i<0x2000; ++i) { dummy(i); }
}

soft_delay:
    push {r4, r5, r6, lr}
    mov r5, #128
    mov r4, #0
    lsl r5, r5, #6
L8:
    mov r0, r4
    add r4, #1
    bl dummy
    cmp r4, r5
    bne L8
    pop {r4, r5, r6, pc}

the feature there is you don't need the overhead of what volatile does you do have a call and clearly there is overhead as well due to the call but not as much

000B40C9

or even better:

soft_delay:
    sub r0,#1
    bne soft_delay
    bx lr

I would have to change the code wrapped around the code under test to make that function work.

Another note specific to these targets but also something you deal with

/* Small example for looking at the generated prologue/epilogue: a and b are
   both needed after the external call to more_fun(), so the compiler has to
   keep them somewhere that survives the call. */
unsigned int more_fun ( unsigned int, unsigned int );
unsigned int fun ( unsigned int a, unsigned int b )
{
    return(more_fun(a,b)+a+(b<<2));
}



00000000 <fun>:
   0:   b570        push    {r4, r5, r6, lr}
   2:   000c        movs    r4, r1
   4:   0005        movs    r5, r0
   6:   f7ff fffe   bl  0 <more_fun>
   a:   00a4        lsls    r4, r4, #2
   c:   1964        adds    r4, r4, r5
   e:   1820        adds    r0, r4, r0
  10:   bd70        pop {r4, r5, r6, pc}
  12:   46c0        nop         ; (mov r8, r8)

This is a question repeated here at SO on a periodic basis: why is it pushing r6 when it isn't using r6?

The compiler operates using what I call and used to be called a calling convention, now they use terms ABI, EABI, whatever either case it is the same thing it is a set of rules the compiler follows for a particular target. Arm added a rule to keep the stack aligned on a 64 bit address boundary instead of 32, this caused the extra item to keep the stack aligned, what register is used there can vary. If you use an older gcc vs a newer this can/will affect the performance of your code all by itself.

like image 155
old_timer Avatar answered Nov 18 '25 14:11

old_timer