 

Why does my empty loop run twice as fast if called as a function, on Intel Skylake CPUs?

I was running some tests to compare C to Java and ran into something interesting. Running exactly the same benchmark code at optimization level 1 (-O1) in a function called by main, rather than in main itself, resulted in roughly double the performance. I'm printing out sizeof(size_t) to verify beyond any doubt that the code is being compiled for x64.

I sent the executables to my friend who's running an i7-7700HQ and got similar results. I'm running an i7-6700.


Here's the slower code:

#include <stdio.h>
#include <time.h>
#include <stdint.h>

int main() {
    printf("Size = %I64u\n", sizeof(size_t));
    int start = clock();
    for(int64_t i = 0; i < 10000000000L; i++) {
        
    }
    printf("%ld\n", clock() - start);
    return 0;
}

And the faster:

#include <stdio.h>
#include <time.h>
#include <stdint.h>

void test() {
    printf("Size = %I64u\n", sizeof(size_t));
    int start = clock();
    for(int64_t i = 0; i < 10000000000L; i++) {
        
    }
    printf("%ld\n", clock() - start);
}

int main() {
    test();
    return 0;
}

I'll also provide the assembly code for you to dig into; I don't know assembly myself. Slower:

    .file   "dummy.c"
    .text
    .def    __main; .scl    2;  .type   32; .endef
    .section .rdata,"dr"
.LC0:
    .ascii "Size = %I64u\12\0"
.LC1:
    .ascii "%ld\12\0"
    .text
    .globl  main
    .def    main;   .scl    2;  .type   32; .endef
    .seh_proc   main
main:
    pushq   %rbx
    .seh_pushreg    %rbx
    subq    $32, %rsp
    .seh_stackalloc 32
    .seh_endprologue
    call    __main
    movl    $8, %edx
    leaq    .LC0(%rip), %rcx
    call    printf
    call    clock
    movl    %eax, %ebx
    movabsq $10000000000, %rax
.L2:
    subq    $1, %rax
    jne .L2
    call    clock
    subl    %ebx, %eax
    movl    %eax, %edx
    leaq    .LC1(%rip), %rcx
    call    printf
    movl    $0, %eax
    addq    $32, %rsp
    popq    %rbx
    ret
    .seh_endproc
    .ident  "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0"
    .def    printf; .scl    2;  .type   32; .endef
    .def    clock;  .scl    2;  .type   32; .endef

Faster:

    .file   "dummy.c"
    .text
    .section .rdata,"dr"
.LC0:
    .ascii "Size = %I64u\12\0"
.LC1:
    .ascii "%ld\12\0"
    .text
    .globl  test
    .def    test;   .scl    2;  .type   32; .endef
    .seh_proc   test
test:
    pushq   %rbx
    .seh_pushreg    %rbx
    subq    $32, %rsp
    .seh_stackalloc 32
    .seh_endprologue
    movl    $8, %edx
    leaq    .LC0(%rip), %rcx
    call    printf
    call    clock
    movl    %eax, %ebx
    movabsq $10000000000, %rax
.L2:
    subq    $1, %rax
    jne .L2
    call    clock
    subl    %ebx, %eax
    movl    %eax, %edx
    leaq    .LC1(%rip), %rcx
    call    printf
    nop
    addq    $32, %rsp
    popq    %rbx
    ret
    .seh_endproc
    .def    __main; .scl    2;  .type   32; .endef
    .globl  main
    .def    main;   .scl    2;  .type   32; .endef
    .seh_proc   main
main:
    subq    $40, %rsp
    .seh_stackalloc 40
    .seh_endprologue
    call    __main
    call    test
    movl    $0, %eax
    addq    $40, %rsp
    ret
    .seh_endproc
    .ident  "GCC: (x86_64-posix-seh-rev0, Built by MinGW-W64 project) 8.1.0"
    .def    printf; .scl    2;  .type   32; .endef
    .def    clock;  .scl    2;  .type   32; .endef

Here's my batch script for compilation:

@echo off
set /p file= File to compile: 
del compiled.exe
gcc -Wall -Wextra -std=c17 -O1 -o compiled.exe %file%.c
compiled.exe
PAUSE

And for compilation to assembly:

@echo off
set /p file= File to compile: 
del %file%.s
gcc -S -Wall -Wextra -std=c17 -O1 %file%.c
PAUSE
asked Jun 07 '21 by Anonymous Noob


1 Answer

The slow version:

[Image: disassembly of the slow version's loop, with the sub $1, %rax / jne pair straddling a 32-byte boundary]

Note that the sub rax, 1 / jne pair goes right across the ..80 boundary (which is a 32-byte boundary). This is one of the cases mentioned in Intel's document regarding this issue, namely the one shown in this diagram:

[Image: Intel's diagram of the jump/fused-jump placements affected by the JCC erratum]

So this op/branch pair is affected by the fix for the JCC erratum, which would cause it to not be cached in the µop cache. I'm not sure that is the whole reason (there are other things at play too), but it's a thing.
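
To make the geometry concrete, here is a rough sketch of the byte layout. The addresses below are made up purely for illustration (they are not taken from the actual binary); what matters is that the macro-fused sub/jne pair is 6 bytes, and in the slow build it happens to start 4 bytes before an address ending in 0x80:

# hypothetical addresses, for illustration only
# 0x140001a7c:  48 83 e8 01   subq $1, %rax   # last byte sits at ...7f
# 0x140001a80:  75 fa         jne  .L2        # starts exactly on the 32-byte boundary
#
# The fused sub/jne pair therefore straddles the ...80 boundary. In the fast
# build the same 6 bytes happen to land entirely inside one 32-byte chunk.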

In the fast version, the branch is not "touching" a 32-byte boundary, so it is not affected.

[Image: disassembly of the fast version's loop, with the sub/jne pair contained entirely within one 32-byte chunk]

There may be other effects that apply. Because it crosses a 32-byte boundary, in the slow case the loop is also spread across two chunks in the µop cache. Even without the fix for the JCC erratum, that may cause it to run at 2 cycles per iteration if the loop cannot execute from the Loop Stream Detector (which is disabled on some processors by another fix for another erratum, SKL150). See e.g. this answer about loop performance.
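
If you want to poke at this yourself, one crude experiment (a sketch, not something from the original build) is to edit the generated .s file and force the loop onto a 32-byte boundary with a .p2align directive, so the sub/jne pair can never straddle one, then feed the modified .s back through gcc:

    movl    %eax, %ebx
    movabsq $10000000000, %rax
    .p2align 5              # added by hand: pad to a 32-byte boundary (not in the compiler's output)
.L2:
    subq    $1, %rax
    jne .L2

Newer binutils also have an assembler option for exactly this (reachable from gcc as -Wa,-mbranches-within-32B-boundaries) that pads branches automatically so they neither cross nor end on a 32-byte boundary; whether the MinGW-w64 8.1.0 toolchain used here ships a new enough assembler is another question.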

To address the various comments saying they cannot reproduce this: yes, there are several ways that could happen:

  • Whichever effect was responsible for the slowdown, it was likely caused by the exact placement of the op/branch pair across a 32-byte boundary, which happened by pure accident. Compiling from source is unlikely to reproduce the same circumstances unless you use the same compiler with the same setup as the original poster (a way to check your own build is sketched after this list).
  • Even using the same binary, regardless of which of the effects is responsible, the weird effect would only happen on particular processors.
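
A quick way to check which situation your own build is in is to disassemble the binary and look at the addresses of the two loop instructions. Here is a sketch in the same batch style as the question (objdump is an assumption about your toolchain, though MinGW-w64 installs that provide gcc normally include it):

@echo off
rem Disassemble the benchmark into a text file, then search it for the
rem subq $1 / jne pair and check whether those bytes straddle an address
rem that is a multiple of 0x20 (i.e. ends in 00, 20, 40, 60, 80, a0, c0 or e0).
objdump -d compiled.exe > compiled.asm
PAUSE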
answered Oct 06 '22 by harold