Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

foreach vs for: please explain the assembly code difference

I've recently been testing the performance of the for loop vs the foreach loop in C#, and I've noticed that for summing an array of ints into a long, the foreach loop may actually come out faster. Here is the full test program; I used Visual Studio 2012, x86, release mode, optimizations on.

Here is the assembly code for both loops. The foreach:

            long sum = 0;
00000000  push        ebp 
00000001  mov         ebp,esp 
00000003  push        edi 
00000004  push        esi 
00000005  push        ebx 
00000006  xor         ebx,ebx 
00000008  xor         edi,edi 
            foreach (var i in collection) {
0000000a  xor         esi,esi 
0000000c  cmp         dword ptr [ecx+4],0 
00000010  jle         00000025 
00000012  mov         eax,dword ptr [ecx+esi*4+8] 
                sum += i;
00000016  mov         edx,eax 
00000018  sar         edx,1Fh 
0000001b  add         ebx,eax 
0000001d  adc         edi,edx 
0000001f  inc         esi 
            foreach (var i in collection) {
00000020  cmp         dword ptr [ecx+4],esi 
00000023  jg          00000012 
            }
            return sum;
00000025  mov         eax,ebx 
00000027  mov         edx,edi 
00000029  pop         ebx 
0000002a  pop         esi 
0000002b  pop         edi 
0000002c  pop         ebp 
0000002d  ret 

And the for:

    long sum = 0;
00000000  push        ebp 
00000001  mov         ebp,esp 
00000003  push        edi 
00000004  push        esi 
00000005  push        ebx 
00000006  push        eax 
00000007  xor         ebx,ebx 
00000009  xor         edi,edi 
            for (int i = 0; i < collection.Length; ++i) {
0000000b  xor         esi,esi 
0000000d  mov         eax,dword ptr [ecx+4] 
00000010  mov         dword ptr [ebp-10h],eax 
00000013  test        eax,eax 
00000015  jle         0000002A 
                sum += collection[i];
00000017  mov         eax,dword ptr [ecx+esi*4+8] 
0000001b  cdq 
0000001c  add         eax,ebx 
0000001e  adc         edx,edi 
00000020  mov         ebx,eax 
00000022  mov         edi,edx 
            for (int i = 0; i < collection.Length; ++i) {
00000024  inc         esi 
00000025  cmp         dword ptr [ebp-10h],esi 
00000028  jg          00000017 
            }
            return sum;
0000002a  mov         eax,ebx 
0000002c  mov         edx,edi 
0000002e  pop         ecx 
0000002f  pop         ebx 
00000030  pop         esi 
00000031  pop         edi 
00000032  pop         ebp 
00000033  ret

As you can see, the main loop is 8 instructions for "foreach" and 9 instructions for "for". This translates into approximately a 10% performance difference in my benchmarks.

I'm not very good at reading assembly code however and I don't understand why the for loop wouldn't be at least as efficient as the foreach. What is going on here?

like image 914
Asik Avatar asked Jan 10 '13 16:01

Asik


2 Answers

As the array is so big, the only relevant part is clearly the one inside the loop, this one:

// for loop
00000017  mov         eax,dword ptr [ecx+esi*4+8] 
0000001b  cdq 
0000001c  add         eax,ebx 
0000001e  adc         edx,edi 
00000020  mov         ebx,eax 
00000022  mov         edi,edx 

// foreach loop
00000012  mov         eax,dword ptr [ecx+esi*4+8] 
00000016  mov         edx,eax 
00000018  sar         edx,1Fh 
0000001b  add         ebx,eax 
0000001d  adc         edi,edx 

Since the sum is a long, it is stored in two different registers: ebx contains its least significant four bytes and edi its most significant four. The two loops differ in how collection[i] is (implicitly) cast from int to long:

// for loop
0000001b  cdq 

// foreach loop
00000016  mov         edx,eax 
00000018  sar         edx,1Fh 

Another important thing to notice is that the for-loop version does the sum in "reversed" order:

long temp = (long) collection[i];   // implicit cast, stored in edx:eax
temp += sum;                        // instead of "simply" sum += temp
sum = temp;                         // sum is stored back into edi:ebx

I can't tell you why the compiler preferred this way instead of sum += temp (@EricLippert could maybe tell us :) ) but I suspect that it is related to some instruction dependency issues that might arise.

like image 82
BlackBear Avatar answered Sep 28 '22 10:09

BlackBear


OK, so here's an annotated version of the assembly code. As you will see, the instructions in the two loops are very close.

            foreach (var i in collection) {
0000000a  xor         esi,esi                       clear index
0000000c  cmp         dword ptr [ecx+4],0           compare size of collection to 0
00000010  jle         00000025                      exit if empty
00000012  mov         eax,dword ptr [ecx+esi*4+8]   get item from collection
                sum += i;
00000016  mov         edx,eax                       move to edx:eax
00000018  sar         edx,1Fh                       shift 31 bits to keep sign only
0000001b  add         ebx,eax                       add to sum
0000001d  adc         edi,edx                       add with carry from previous add
0000001f  inc         esi                           increment index
            foreach (var i in collection) {
00000020  cmp         dword ptr [ecx+4],esi         compare size to index
00000023  jg          00000012                      loop if more
            }
            return sum;
00000025  mov         eax,ebx                       low 32 bits of result were in ebx
=================================================
            for (int i = 0; i < collection.Length; ++i) {
0000000b  xor         esi,esi                       clear index
0000000d  mov         eax,dword ptr [ecx+4]         get limit on for
00000010  mov         dword ptr [ebp-10h],eax       save limit
00000013  test        eax,eax                       test if limit is empty
00000015  jle         0000002A                      exit loop if empty
                sum += collection[i];
00000017  mov         eax,dword ptr [ecx+esi*4+8]   get item from collection
0000001b  cdq                                       convert eax to edx:eax
0000001c  add         eax,ebx                       add to sum
0000001e  adc         edx,edi                       add with carry from previous add
00000020  mov         ebx,eax                       put result in edi:ebx
00000022  mov         edi,edx 
            for (int i = 0; i < collection.Length; ++i) {
00000024  inc         esi                           increment index
00000025  cmp         dword ptr [ebp-10h],esi       compare to limit
00000028  jg          00000017                      loop if more
            }
            return sum;
0000002a  mov         eax,ebx                       low 32 bits of result were in ebx
like image 42
Peter Wooster Avatar answered Sep 28 '22 10:09

Peter Wooster