Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why is pointer access slower than vector::iterator access? (compiler code generation)

OK, the question title is a bit crappy, but I didn't really know how to phrase this better.

The problem I have is that given a std::vector<T> vs. a T* + size_t count my compiler (Visual Studio 2005 / VC++ 8) will actually generate worse code when looping over the pointer than when looping over the vector.

That is, I have a test struct containing a vector and another one containing a pointer + count. Now, when writing the semantically exact same looping construct, the version with the std::vector is significantly (which is to say > 10%) faster than the version with the pointer.

Below you will find the code as well as the generated assembly. It would be great if someone could explain what's going on here.

If you look at the assembly, you can note how the raw pointer version generates slightly more instructions. It would already be a very nice answer if anyone could explain how these versions differ semantically on the assembly level.

And please refrain from answers telling me I shouldn't care, premature optimization, root of all evil, etc. In this specific case I do care and anyway I think it is a rather interesting puzzle! :-)


Compiler settings:

  • Full Optimization (/Ox)
  • Whole Program Opt. = NO

Here comes the code:

stdafx.h

// Disable secure STL stuff!
#define _SECURE_SCL 0
#define _SECURE_SCL_THROWS 0
#include <iostream>
#include <iomanip>
#include <vector>
#include <mmsystem.h>

header file

// loop1.h
typedef int PodType;

const size_t container_size = 3;
extern volatile size_t g_read_size;

void side_effect();

struct RawX {
    PodType* pData;
    PodType wCount;

    RawX()
    : pData(NULL)
    , wCount(0)
    { }

    ~RawX() {
        delete[] pData;
        pData = NULL;
        wCount = 0;
    }

    void Resize(PodType n) {
        delete[] pData;
        wCount = n;
        pData = new PodType[wCount];
    }
private:
    RawX(RawX const&);
    RawX& operator=(RawX const&);
};

struct VecX {
    std::vector<PodType> vData;
};

void raw_loop(const int n, RawX* obj);
void raw_iterator_loop(const int n, RawX* obj);
void vector_loop(const int n, VecX* obj);
void vector_iterator_loop(const int n, VecX* obj);

implementation file

// loop1.cpp
void raw_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(int j=0, e=obj->wCount; j!=e; ++j) {
            g_read_size = obj->pData[j];
            side_effect();
        }
        side_effect();
    }
}

void raw_iterator_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();
    }
}

void vector_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(size_t j=0, e=obj->vData.size(); j!=e; ++j) {
            g_read_size = obj->vData[j];
            side_effect();
        }
        side_effect();
    }
}

void vector_iterator_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();      
    }
}

test main file

using namespace std;

volatile size_t g_read_size;
void side_effect()
{
    g_read_size = 0;
}

typedef size_t Value;

template<typename Container>
Value average(Container const& c)
{
    const Value sz = c.size();
    Value sum = 0;
    for(Container::const_iterator i=c.begin(), e=c.end(); i!=e; ++i)
        sum += *i;
    return sum/sz;

}

void take_timings()
{
    const int x = 10;
    const int n = 10*1000*1000;

    VecX vobj;
    vobj.vData.resize(container_size);
    RawX robj;
    robj.Resize(container_size);

    std::vector<DWORD> raw_times;
    std::vector<DWORD> vec_times;
    std::vector<DWORD> rit_times;
    std::vector<DWORD> vit_times;

    for(int i=0; i!=x; ++i) {
        const DWORD t1 = timeGetTime();
        raw_loop(n, &robj);
        const DWORD t2 = timeGetTime();
        vector_loop(n, &vobj);
        const DWORD t3 = timeGetTime();
        raw_iterator_loop(n, &robj);
        const DWORD t4 = timeGetTime();
        vector_iterator_loop(n, &vobj);
        const DWORD t5 = timeGetTime();
        raw_times.push_back(t2-t1);
        vec_times.push_back(t3-t2);
        rit_times.push_back(t4-t3);
        vit_times.push_back(t5-t4);
    }

    cout << "Average over " << x << " iterations for loops with count " << n << " ...\n";
    cout << "The PodType is '" << typeid(PodType).name() << "'\n";
    cout << "raw_loop: " << setw(10) << average(raw_times) << " ms \n";
    cout << "vec_loop: " << setw(10) << average(vec_times) << " ms \n";
    cout << "rit_loop: " << setw(10) << average(rit_times) << " ms \n";
    cout << "vit_loop: " << setw(10) << average(vit_times) << " ms \n";
}

int main()
{
    take_timings();
    return 0;
}

Here comes the generated assembly as displayed by the visual studio debugger (for the 2 functions with the "iterators".

*raw_iterator_loop*

void raw_iterator_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
00  mov         eax,dword ptr [esp+4] 
00  test        eax,eax 
00  je          raw_iterator_loop+53h (4028C3h) 
00  push        ebx  
00  mov         ebx,dword ptr [esp+0Ch] 
00  push        ebp  
00  push        esi  
00  push        edi  
00  mov         ebp,eax 
        side_effect();
00  call        side_effect (401020h) 
        for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) {
00  movzx       eax,word ptr [ebx+4] 
00  mov         esi,dword ptr [ebx] 
00  lea         edi,[esi+eax*2] 
00  cmp         esi,edi 
00  je          raw_iterator_loop+45h (4028B5h) 
00  jmp         raw_iterator_loop+30h (4028A0h) 
00  lea         esp,[esp] 
00  lea         ecx,[ecx] 
            g_read_size = *j;
00  movzx       ecx,word ptr [esi] 
00  mov         dword ptr [g_read_size (4060B0h)],ecx 
            side_effect();
00  call        side_effect (401020h) 
00  add         esi,2 
00  cmp         esi,edi 
00  jne         raw_iterator_loop+30h (4028A0h) 
        }
        side_effect();
00  call        side_effect (401020h) 
00  sub         ebp,1 
00  jne         raw_iterator_loop+12h (402882h) 
00  pop         edi  
00  pop         esi  
00  pop         ebp  
00  pop         ebx  
    }
}
00  ret              

*vector_iterator_loop*

void vector_iterator_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
00  mov         eax,dword ptr [esp+4] 
00  test        eax,eax 
00  je          vector_iterator_loop+43h (402813h) 
00  push        ebx  
00  mov         ebx,dword ptr [esp+0Ch] 
00  push        ebp  
00  push        esi  
00  push        edi  
00  mov         ebp,eax 
        side_effect();
00  call        side_effect (401020h) 
        for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) {
00  mov         esi,dword ptr [ebx+4] 
00  mov         edi,dword ptr [ebx+8] 
00  cmp         esi,edi 
00  je          vector_iterator_loop+35h (402805h) 
            g_read_size = *j;
00  movzx       eax,word ptr [esi] 
00  mov         dword ptr [g_read_size (4060B0h)],eax 
            side_effect();
00  call        side_effect (401020h) 
00  add         esi,2 
00  cmp         esi,edi 
00  jne         vector_iterator_loop+21h (4027F1h) 
        }
        side_effect();      
00  call        side_effect (401020h) 
00  sub         ebp,1 
00  jne         vector_iterator_loop+12h (4027E2h) 
00  pop         edi  
00  pop         esi  
00  pop         ebp  
00  pop         ebx  
    }
}
00  ret          
like image 634
Martin Ba Avatar asked Oct 14 '10 16:10

Martin Ba


People also ask

Are iterators faster than pointers?

Iterators is a generic concept. They work on all sorts of containers and have similar interface. Accessing the array element directly like arr_int[i] is definitely faster because it directly translates to pointer arithmetic.

Is std :: vector slower than array?

A std::vector can never be faster than an array, as it has (a pointer to the first element of) an array as one of its data members. But the difference in run-time speed is slim and absent in any non-trivial program. One reason for this myth to persist, are examples that compare raw arrays with mis-used std::vectors.

Why use iterators instead of pointers?

After all, iterators are invalidated at mostly the same times and the same ways as pointers, and one reason that iterators exist is to provide a way to "point" at a contained object. So, if you have a choice, prefer to use iterators into containers.


2 Answers

While my version of the generated machine code is different from yours (MSVC++ 2005), one difference between the two variants is pretty much the same as in your code:

  • In vector version of the code the "end iterator" value is pre-calculated and stored as a member of std::vector object, so the inner loop simply loads the readily available value.

  • In raw pointer version the "end iterator" value is calculated explicitly in the header of the inner cycle (by a lea instruction used to implement multiplication), meaning that each iteration of the outer cycle performs that calculation again and again.

If you re-implement your raw_iterator_loop as follows (i.e. pull the calculation of the end pointer out of the outer loop)

void raw_iterator_loop(const int n, RawX* obj)
{
    PodType *e = obj->pData+size_t(obj->wCount);

    for(int i=0; i!=n; ++i) {
        side_effect();
        for(PodType *j=obj->pData; j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();
    }
}

(or even store and maintain the end pointer in your class) you should end up with a more "fair" comparison.

like image 193
AnT Avatar answered Sep 21 '22 06:09

AnT


One concrete reason for the difference in instructions generated is that Visual C++ vector has members _Myfirst and _Mylast (corresponding to begin() and end()) that simplify the loop setup.

In the raw case, the compiler has to do actual pointer math to set up the required start and end locals.

It's feasible that this complicates register usage enough to make the vector code faster.

like image 33
Steve Townsend Avatar answered Sep 19 '22 06:09

Steve Townsend