How smart is my compiler?

Tags:

I am trying to estimate how many cycles my computer takes to perform different operations, so I perform each operation 100K times and calculate the average. I am using loop unwinding (unrolling) to be a little more accurate: I perform 10 basic operations in each iteration and increase my index by 10, resulting in fewer loop operations.

None of this really matters for my question: is there any way the compiler can understand that I'm doing the same operation several times and only perform it once? Here's my loop:

// Manually unrolled benchmark loop: every pass repeats the same store to
// `result` ten times and then advances `i` by LOOP_FACTOR, so the loop
// bookkeeping is paid once per group of stores rather than once per store.
// NOTE(review): presumably LOOP_FACTOR == 10 to match the ten statements
// below; its definition (and those of `i`, `iterations`, `result`) is not
// shown here, so confirm.
for (i=0; i<iterations; i+=LOOP_FACTOR)
{
    // Ten identical assignments; whether the compiler keeps all of them is
    // exactly what the question asks.
    result = -1;
    result = -1;
    result = -1;
    result = -1;
    result = -1;
    result = -1;
    result = -1;
    result = -1;
    result = -1;
    result = -1;
}

Also, I don't know if it matters - I am using Eclipse. I thought it might matter as there are different compilers out there.

365

asked Mar 12 '12 13:03

yotamoo

4 Answers

In GCC without optimization it's compiled as is:

(gdb) disas main
Dump of assembler code for function main:
   0x00000000004004e4 <+0>: push   rbp
   0x00000000004004e5 <+1>: mov    rbp,rsp
   0x00000000004004e8 <+4>: mov    DWORD PTR [rip+0x200482],0x0        # 0x600974 <i>
   0x00000000004004f2 <+14>:    jmp    0x400567 <main+131>
   0x00000000004004f4 <+16>:    mov    DWORD PTR [rip+0x200472],0xffffffff        # 0x600970 <result>
   0x00000000004004fe <+26>:    mov    DWORD PTR [rip+0x200468],0xffffffff        # 0x600970 <result>
   0x0000000000400508 <+36>:    mov    DWORD PTR [rip+0x20045e],0xffffffff        # 0x600970 <result>
   0x0000000000400512 <+46>:    mov    DWORD PTR [rip+0x200454],0xffffffff        # 0x600970 <result>
   0x000000000040051c <+56>:    mov    DWORD PTR [rip+0x20044a],0xffffffff        # 0x600970 <result>
   0x0000000000400526 <+66>:    mov    DWORD PTR [rip+0x200440],0xffffffff        # 0x600970 <result>
   0x0000000000400530 <+76>:    mov    DWORD PTR [rip+0x200436],0xffffffff        # 0x600970 <result>
   0x000000000040053a <+86>:    mov    DWORD PTR [rip+0x20042c],0xffffffff        # 0x600970 <result>
   0x0000000000400544 <+96>:    mov    DWORD PTR [rip+0x200422],0xffffffff        # 0x600970 <result>
   0x000000000040054e <+106>:   mov    DWORD PTR [rip+0x200418],0xffffffff        # 0x600970 <result>
   0x0000000000400558 <+116>:   mov    eax,DWORD PTR [rip+0x200416]        # 0x600974 <i>
   0x000000000040055e <+122>:   add    eax,0x1
   0x0000000000400561 <+125>:   mov    DWORD PTR [rip+0x20040d],eax        # 0x600974 <i>
   0x0000000000400567 <+131>:   mov    eax,DWORD PTR [rip+0x200407]        # 0x600974 <i>
   0x000000000040056d <+137>:   cmp    eax,0x3e7
   0x0000000000400572 <+142>:   jle    0x4004f4 <main+16>
   0x0000000000400574 <+144>:   mov    eax,DWORD PTR [rip+0x2003f6]        # 0x600970 <result>
   0x000000000040057a <+150>:   mov    esi,eax
   0x000000000040057c <+152>:   mov    edi,0x40067c
   0x0000000000400581 <+157>:   mov    eax,0x0
   0x0000000000400586 <+162>:   call   0x4003e0 <printf@plt>
   0x000000000040058b <+167>:   pop    rbp
   0x000000000040058c <+168>:   ret

But if you compile with basic optimization (gcc -O), the ten stores are collapsed into a single write (the now-empty counting loop is still emitted):

Dump of assembler code for function main:
   0x00000000004004e4 <+0>: sub    rsp,0x8
   0x00000000004004e8 <+4>: mov    eax,0x3e8
   0x00000000004004ed <+9>: sub    eax,0x1
   0x00000000004004f0 <+12>:    jne    0x4004ed <main+9>
   0x00000000004004f2 <+14>:    mov    DWORD PTR [rip+0x2003fc],0xffffffff        # 0x6008f8 <result>
   0x00000000004004fc <+24>:    mov    DWORD PTR [rip+0x2003f6],0x3e8        # 0x6008fc <i>
   0x0000000000400506 <+34>:    mov    esi,0xffffffff
   0x000000000040050b <+39>:    mov    edi,0x40060c
   0x0000000000400510 <+44>:    mov    eax,0x0
   0x0000000000400515 <+49>:    call   0x4003e0 <printf@plt>
   0x000000000040051a <+54>:    add    rsp,0x8
   0x000000000040051e <+58>:    ret

My testing code is:

#include <stdio.h>  /* declares printf, which main() calls below */

/* Number of loop iterations executed by the test. */
#define TIMES 1000

/* File-scope variables written by the loop: `result` receives the repeated
 * store, `i` is the loop counter. */
int result, i;

/*
 * Test program: stores -1 into `result` ten times per iteration for TIMES
 * iterations, then prints the final value of `result` with printf.
 * Compile it with and without optimization and compare the disassembly to
 * see how many of the stores the compiler keeps.
 */
int main() {
    for (i=0; i<TIMES; i++)
    {
        /* Ten identical stores, written out by hand. */
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
    }
    /* Use the final value so that `result` is actually read. */
    printf("%d", result);
}

116

answered Nov 09 '22 23:11

Hauleth

The compiler will probably optimize that code. If you want to profile the `result = -1` assignment itself, then you should compile with -O0. You should probably also generate some code without a loop to profile the individual instruction.

answered Nov 09 '22 21:11

alternative

There's not much sense in profiling code without optimization.

Instead, I'd suggest declaring result as volatile.

As it is now, your code will probably be optimized to:

result = -1;

RESULTS

Both versions were compiled with full optimization:

volatile int result = 10000;

00401000  mov         ecx,3E8h 
00401005  or          eax,0FFFFFFFFh 
00401008  jmp         wmain+10h (401010h) 
0040100A  lea         ebx,[ebx] 
00401010  sub         ecx,1 
    {
        result = -1;
00401013  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
00401018  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
0040101D  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
00401022  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
00401027  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
0040102C  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
00401031  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
00401036  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
0040103B  mov         dword ptr [result (40301Ch)],eax 
        result = -1;
00401040  mov         dword ptr [result (40301Ch)],eax 
00401045  jne         wmain+10h (401010h) 
    }
    cout << result;
00401047  mov         eax,dword ptr [result (40301Ch)] 
0040104C  mov         ecx,dword ptr [__imp_std::cout (402038h)] 
00401052  push        eax  
00401053  call        dword ptr [__imp_std::basic_ostream<char,std::char_traits<char> >::operator<< (40203Ch)]

int result = 10000;

    for (int i=0; i< 1000 ; i += 1)
    {
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
        result = -1;
    }
    cout << result;
00401000  mov         ecx,dword ptr [__imp_std::cout (402038h)] 
00401006  push        0FFFFFFFFh 
00401008  mov         dword ptr [result (40301Ch)],0FFFFFFFFh 
00401012  call        dword ptr [__imp_std::basic_ostream<char,std::char_traits<char> >::operator<< (40203Ch)]

answered Nov 09 '22 22:11

Luchian Grigore

It depends on the optimization level of your compiler. As such, it can be optimized in a few ways:

the repetitions will be folded into 1 expression (via elimination of dead/redundant assignments)
the loop itself will be folded, as nothing changes between doing 1 iteration and doing 1000 iterations; thus it can be reduced to a single instance of the inner expressions.

This gets a little more tricky if result is made volatile, which 'prevents' the compiler from assuming that its value will not change outside of the loop or sequence of expressions; it may even be enough to prevent the collapsing of the 10 inner statements into 1.

The best way to test this is to examine the output of the compiler, using something like objdump or a debugger.

answered Nov 09 '22 23:11

Necrolis

Related questions
                            
                                Omitting arguments in C++ Templates
                            
                                c++ how to create a directory from a path
                            
                                c++ multiple enums in one function argument using bitwise or "|"
                            
                                strcmpi renamed to _strcmpi?
                            
                                Why can't convert TCHAR* to char*
                            
                                Best way to implement globally scoped data
                            
                                How to avoid integer overflow?
                            
                                What happens when I throw an exception?
                            
                                C and C++ difference in sizeof('x') [duplicate]
                            
                                C++ unhandled exceptions
                            
                                Lightweight portable C++ threading
                            
                                Advantages of using initializer list? [duplicate]
                            
                                Non-Blocking File Reads
                            
                                Codeblocks comment out whole block
                            
                                Does gcc optimize my cycle with condition?
                            
                                Write a recursive function that reverses the input string
                            
                                count the number of distinct absolute values among the elements of the array
                            
                                constructing a Data Frame in Rcpp
                            
                                Retrieve RAM info on a Mac?
                            
                                Why are type_traits implemented with specialized template structs instead of constexpr?

Donate For Us

If you love us, you can donate via PayPal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With