OK, the question title is a bit crappy, but I didn't really know how to phrase this better. The problem I have is that given a <code>std::vector<T></code> vs. a <code>T*</code> + <code>size_t count</code> my compiler (Visual Studio 2005 / VC++ 8) will actually generate worse code when looping over the pointer than when looping over the vector. That is, I have a test struct containing a vector and another one containing a pointer + count. Now, when writing the semantically exact same looping construct, the version with the std::vector is significantly (which is to say > 10%) faster than the version with the pointer. Below you will find the code as well as the generated assembly. It would be great if someone could explain what's going on here. If you look at the assembly, you can note how the raw pointer version generates slightly more instructions. It would already be a very nice answer if anyone could explain how these versions differ semantically on the assembly level. And please refrain from answers telling me I shouldn't care, premature optimization, root of all evil, etc. In this specific case I do care and anyway I think it is a rather interesting puzzle! :-) <hr> Compiler settings: <ul> <li>Full Optimization (/Ox)</li> <li>Whole Program Opt. = NO</li> </ul> Here comes the code: stdafx.h <pre class="prettyprint"><code>// Disable secure STL stuff! 
#define _SECURE_SCL 0 #define _SECURE_SCL_THROWS 0 #include <iostream> #include <iomanip> #include <vector> #include <mmsystem.h> </code></pre> header file <pre class="prettyprint"><code>// loop1.h typedef int PodType; const size_t container_size = 3; extern volatile size_t g_read_size; void side_effect(); struct RawX { PodType* pData; PodType wCount; RawX() : pData(NULL) , wCount(0) { } ~RawX() { delete[] pData; pData = NULL; wCount = 0; } void Resize(PodType n) { delete[] pData; wCount = n; pData = new PodType[wCount]; } private: RawX(RawX const&); RawX& operator=(RawX const&); }; struct VecX { std::vector<PodType> vData; }; void raw_loop(const int n, RawX* obj); void raw_iterator_loop(const int n, RawX* obj); void vector_loop(const int n, VecX* obj); void vector_iterator_loop(const int n, VecX* obj); </code></pre> implementation file <pre class="prettyprint"><code>// loop1.cpp void raw_loop(const int n, RawX* obj) { for(int i=0; i!=n; ++i) { side_effect(); for(int j=0, e=obj->wCount; j!=e; ++j) { g_read_size = obj->pData[j]; side_effect(); } side_effect(); } } void raw_iterator_loop(const int n, RawX* obj) { for(int i=0; i!=n; ++i) { side_effect(); for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) { g_read_size = *j; side_effect(); } side_effect(); } } void vector_loop(const int n, VecX* obj) { for(int i=0; i!=n; ++i) { side_effect(); for(size_t j=0, e=obj->vData.size(); j!=e; ++j) { g_read_size = obj->vData[j]; side_effect(); } side_effect(); } } void vector_iterator_loop(const int n, VecX* obj) { for(int i=0; i!=n; ++i) { side_effect(); for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) { g_read_size = *j; side_effect(); } side_effect(); } } </code></pre> test main file <pre class="prettyprint"><code>using namespace std; volatile size_t g_read_size; void side_effect() { g_read_size = 0; } typedef size_t Value; template<typename Container> Value average(Container const& c) { const Value sz = 
c.size(); Value sum = 0; for(Container::const_iterator i=c.begin(), e=c.end(); i!=e; ++i) sum += *i; return sum/sz; } void take_timings() { const int x = 10; const int n = 10*1000*1000; VecX vobj; vobj.vData.resize(container_size); RawX robj; robj.Resize(container_size); std::vector<DWORD> raw_times; std::vector<DWORD> vec_times; std::vector<DWORD> rit_times; std::vector<DWORD> vit_times; for(int i=0; i!=x; ++i) { const DWORD t1 = timeGetTime(); raw_loop(n, &robj); const DWORD t2 = timeGetTime(); vector_loop(n, &vobj); const DWORD t3 = timeGetTime(); raw_iterator_loop(n, &robj); const DWORD t4 = timeGetTime(); vector_iterator_loop(n, &vobj); const DWORD t5 = timeGetTime(); raw_times.push_back(t2-t1); vec_times.push_back(t3-t2); rit_times.push_back(t4-t3); vit_times.push_back(t5-t4); } cout << "Average over " << x << " iterations for loops with count " << n << " ...\n"; cout << "The PodType is '" << typeid(PodType).name() << "'\n"; cout << "raw_loop: " << setw(10) << average(raw_times) << " ms \n"; cout << "vec_loop: " << setw(10) << average(vec_times) << " ms \n"; cout << "rit_loop: " << setw(10) << average(rit_times) << " ms \n"; cout << "vit_loop: " << setw(10) << average(vit_times) << " ms \n"; } int main() { take_timings(); return 0; } </code></pre> <hr> Here comes the generated assembly as displayed by the visual studio debugger (for the 2 functions with the "iterators". 
*raw_iterator_loop* <pre class="prettyprint"><code>void raw_iterator_loop(const int n, RawX* obj) { for(int i=0; i!=n; ++i) { 00 mov eax,dword ptr [esp+4] 00 test eax,eax 00 je raw_iterator_loop+53h (4028C3h) 00 push ebx 00 mov ebx,dword ptr [esp+0Ch] 00 push ebp 00 push esi 00 push edi 00 mov ebp,eax side_effect(); 00 call side_effect (401020h) for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) { 00 movzx eax,word ptr [ebx+4] 00 mov esi,dword ptr [ebx] 00 lea edi,[esi+eax*2] 00 cmp esi,edi 00 je raw_iterator_loop+45h (4028B5h) 00 jmp raw_iterator_loop+30h (4028A0h) 00 lea esp,[esp] 00 lea ecx,[ecx] g_read_size = *j; 00 movzx ecx,word ptr [esi] 00 mov dword ptr [g_read_size (4060B0h)],ecx side_effect(); 00 call side_effect (401020h) 00 add esi,2 00 cmp esi,edi 00 jne raw_iterator_loop+30h (4028A0h) } side_effect(); 00 call side_effect (401020h) 00 sub ebp,1 00 jne raw_iterator_loop+12h (402882h) 00 pop edi 00 pop esi 00 pop ebp 00 pop ebx } } 00 ret </code></pre> *vector_iterator_loop* <pre class="prettyprint"><code>void vector_iterator_loop(const int n, VecX* obj) { for(int i=0; i!=n; ++i) { 00 mov eax,dword ptr [esp+4] 00 test eax,eax 00 je vector_iterator_loop+43h (402813h) 00 push ebx 00 mov ebx,dword ptr [esp+0Ch] 00 push ebp 00 push esi 00 push edi 00 mov ebp,eax side_effect(); 00 call side_effect (401020h) for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) { 00 mov esi,dword ptr [ebx+4] 00 mov edi,dword ptr [ebx+8] 00 cmp esi,edi 00 je vector_iterator_loop+35h (402805h) g_read_size = *j; 00 movzx eax,word ptr [esi] 00 mov dword ptr [g_read_size (4060B0h)],eax side_effect(); 00 call side_effect (401020h) 00 add esi,2 00 cmp esi,edi 00 jne vector_iterator_loop+21h (4027F1h) } side_effect(); 00 call side_effect (401020h) 00 sub ebp,1 00 jne vector_iterator_loop+12h (4027E2h) 00 pop edi 00 pop esi 00 pop ebp 00 pop ebx } } 00 ret </code></pre>

While my version of the generated machine code is different from yours (MSVC++ 2005), one difference between the two variants is pretty much the same as in your code: <ul> <li>In vector version of the code the "end iterator" value is pre-calculated and stored as a member of <code>std::vector</code> object, so the inner loop simply loads the readily available value.</li> <li>In raw pointer version the "end iterator" value is calculated explicitly in the header of the inner cycle (by a <code>lea</code> instruction used to implement multiplication), meaning that each iteration of the outer cycle performs that calculation again and again.</li> </ul> If you re-implement your <code>raw_iterator_loop</code> as follows (i.e. pull the calculation of the end pointer out of the outer loop) <pre class="prettyprint"><code>void raw_iterator_loop(const int n, RawX* obj) { PodType *e = obj->pData+size_t(obj->wCount); for(int i=0; i!=n; ++i) { side_effect(); for(PodType *j=obj->pData; j!=e; ++j) { g_read_size = *j; side_effect(); } side_effect(); } } </code></pre> (or even store and maintain the end pointer in your class) you should end up with a more "fair" comparison.

One concrete reason for the difference in instructions generated is that Visual C++ <code>vector</code> has members <code>_Myfirst</code> and <code>_Mylast</code> (corresponding to <code>begin()</code> and <code>end()</code>) that simplify the loop setup. In the raw case, the compiler has to do actual pointer math to set up the required start and end locals. It's feasible that this complicates register usage enough to make the <code>vector</code> code faster.

Why is pointer access slower than vector::iterator access? (compiler code generation)

Tags:

c++

performance

assembly

debugging

compiler-construction

OK, the question title is a bit crappy, but I didn't really know how to phrase this better.

The problem I have is that, given a std::vector<T> versus a T* plus a size_t count, my compiler (Visual Studio 2005 / VC++ 8) actually generates worse code when looping over the pointer than when looping over the vector.

That is, I have a test struct containing a vector and another one containing a pointer + count. Now, when writing the semantically exact same looping construct, the version with the std::vector is significantly (which is to say > 10%) faster than the version with the pointer.

Below you will find the code as well as the generated assembly. It would be great if someone could explain what's going on here.

If you look at the assembly, you can note how the raw pointer version generates slightly more instructions. It would already be a very nice answer if anyone could explain how these versions differ semantically on the assembly level.

And please refrain from answers telling me I shouldn't care, premature optimization, root of all evil, etc. In this specific case I do care and anyway I think it is a rather interesting puzzle! :-)

Compiler settings:

Full Optimization (/Ox)
Whole Program Opt. = NO

Here comes the code:

stdafx.h

// Disable secure STL stuff!
#define _SECURE_SCL 0
#define _SECURE_SCL_THROWS 0
#include <iostream>
#include <iomanip>
#include <vector>
#include <mmsystem.h>

header file

// loop1.h
// Element type shared by both containers under test.
// NOTE(review): the disassembly shown further down uses word-sized loads and a
// pointer stride of 2, so it was presumably produced with a 16-bit PodType
// rather than int -- confirm before comparing numbers.
typedef int PodType;

// Number of elements each container is sized to by the test driver.
const size_t container_size = 3;
// Volatile sink written by the loops; defined in the test main file.
extern volatile size_t g_read_size;

// Defined in the test main file; called around and inside every inner loop.
void side_effect();

// Hand-rolled "pointer + count" container that the raw loops iterate over.
// Owns the array behind pData and releases it with delete[].
// Copying is disabled (declared private, never defined) because a memberwise
// copy would make two objects delete the same array.
struct RawX {
    PodType* pData;   // owned array; NULL until the first Resize()
    PodType wCount;   // number of elements reachable through pData

    RawX()
    : pData(NULL)
    , wCount(0)
    { }

    ~RawX() {
        delete[] pData;
        pData = NULL;
        wCount = 0;
    }

    // Replaces the storage with n zero-initialized elements. Previous contents
    // are discarded, not copied. A negative n is treated as 0.
    // If the allocation throws, the object is left exactly as it was.
    void Resize(PodType n) {
        const PodType newCount = (n < 0) ? 0 : n;
        // Allocate before releasing the old array: should new[] throw, pData
        // must not be left pointing at freed memory (the destructor would
        // otherwise delete it a second time). The trailing "()" zero-fills the
        // elements, matching what std::vector::resize does for VecX.
        PodType* const fresh = new PodType[newCount]();
        delete[] pData;
        pData = fresh;
        wCount = newCount;
    }
private:
    RawX(RawX const&);
    RawX& operator=(RawX const&);
};

// std::vector-based counterpart of RawX, holding the same PodType elements.
struct VecX {
    std::vector<PodType> vData;  // element storage; the test driver resizes it
};

// The four benchmarked variants. Each one runs n outer passes and, in every
// pass, stores each element of obj into g_read_size:
//   raw_loop / vector_loop                   - walk by index
//   raw_iterator_loop / vector_iterator_loop - walk by pointer / const_iterator
void raw_loop(const int n, RawX* obj);
void raw_iterator_loop(const int n, RawX* obj);
void vector_loop(const int n, VecX* obj);
void vector_iterator_loop(const int n, VecX* obj);

implementation file

// loop1.cpp
// Index-based walk over the raw array: n outer passes, each storing all
// obj->wCount elements of obj->pData into the volatile g_read_size.
// n and wCount are expected to be non-negative: both loops stop on `!=`,
// which a counter counting up from 0 does not reach for a negative bound.
void raw_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
        // side_effect() is declared here but defined in another file;
        // presumably this keeps the optimizer from collapsing the loops.
        side_effect();
        // The element count is read once per outer pass and kept in e.
        for(int j=0, e=obj->wCount; j!=e; ++j) {
            g_read_size = obj->pData[j];
            side_effect();
        }
        side_effect();
    }
}

// Pointer-based walk over the raw array: n outer passes, each storing every
// element in [pData, pData + wCount) into the volatile g_read_size.
// n is expected to be non-negative because the outer loop stops on `!=`.
void raw_iterator_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        // Begin and end pointers are rebuilt from pData and wCount on every
        // outer pass; the end pointer requires pointer arithmetic here.
        for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();
    }
}

// Index-based walk over the vector: n outer passes, each storing all
// obj->vData.size() elements into the volatile g_read_size.
// n is expected to be non-negative because the outer loop stops on `!=`.
void vector_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        // size() is read once per outer pass and kept in e.
        for(size_t j=0, e=obj->vData.size(); j!=e; ++j) {
            g_read_size = obj->vData[j];
            side_effect();
        }
        side_effect();
    }
}

// Iterator-based walk over the vector: n outer passes, each storing every
// element in [begin(), end()) into the volatile g_read_size.
// n is expected to be non-negative because the outer loop stops on `!=`.
void vector_iterator_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
        side_effect();
        // begin() and end() are fetched once per outer pass.
        for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();      
    }
}

test main file

using namespace std;

// Sink for the benchmark loops. It is volatile, so every store to it is an
// observable access that the compiler has to perform.
volatile size_t g_read_size;
// Resets the sink to 0. The loop functions call this before, inside and after
// their inner loop.
void side_effect()
{
    g_read_size = 0;
}

// Unsigned type used for the accumulated sum and the resulting mean.
typedef size_t Value;

// Returns the integer (truncating) arithmetic mean of the elements of c.
// Container must provide size(), begin(), end() and a nested const_iterator,
// and its elements must be addable to a Value.
// An empty container yields 0 instead of dividing by zero.
template<typename Container>
Value average(Container const& c)
{
    const Value sz = c.size();
    if (sz == 0)
        return 0;

    Value sum = 0;
    // const_iterator is a dependent name, so standard C++ requires `typename`
    // here; without it only lenient compilers accept the declaration.
    for(typename Container::const_iterator i=c.begin(), e=c.end(); i!=e; ++i)
        sum += *i;
    return sum/sz;
}

void take_timings()
{
    const int x = 10;
    const int n = 10*1000*1000;

    VecX vobj;
    vobj.vData.resize(container_size);
    RawX robj;
    robj.Resize(container_size);

    std::vector<DWORD> raw_times;
    std::vector<DWORD> vec_times;
    std::vector<DWORD> rit_times;
    std::vector<DWORD> vit_times;

    for(int i=0; i!=x; ++i) {
        const DWORD t1 = timeGetTime();
        raw_loop(n, &robj);
        const DWORD t2 = timeGetTime();
        vector_loop(n, &vobj);
        const DWORD t3 = timeGetTime();
        raw_iterator_loop(n, &robj);
        const DWORD t4 = timeGetTime();
        vector_iterator_loop(n, &vobj);
        const DWORD t5 = timeGetTime();
        raw_times.push_back(t2-t1);
        vec_times.push_back(t3-t2);
        rit_times.push_back(t4-t3);
        vit_times.push_back(t5-t4);
    }

    cout << "Average over " << x << " iterations for loops with count " << n << " ...\n";
    cout << "The PodType is '" << typeid(PodType).name() << "'\n";
    cout << "raw_loop: " << setw(10) << average(raw_times) << " ms \n";
    cout << "vec_loop: " << setw(10) << average(vec_times) << " ms \n";
    cout << "rit_loop: " << setw(10) << average(rit_times) << " ms \n";
    cout << "vit_loop: " << setw(10) << average(vit_times) << " ms \n";
}

// Program entry point: runs the benchmark once.
int main()
{
    take_timings();
    // Reaching the end of main is equivalent to `return 0;`.
}

Here comes the generated assembly as displayed by the Visual Studio debugger (for the 2 functions with the "iterators").

*raw_iterator_loop*

void raw_iterator_loop(const int n, RawX* obj)
{
    for(int i=0; i!=n; ++i) {
00  mov         eax,dword ptr [esp+4] 
00  test        eax,eax 
00  je          raw_iterator_loop+53h (4028C3h) 
00  push        ebx  
00  mov         ebx,dword ptr [esp+0Ch] 
00  push        ebp  
00  push        esi  
00  push        edi  
00  mov         ebp,eax 
        side_effect();
00  call        side_effect (401020h) 
        for(PodType *j=obj->pData, *e=obj->pData+size_t(obj->wCount); j!=e; ++j) {
00  movzx       eax,word ptr [ebx+4] 
00  mov         esi,dword ptr [ebx] 
00  lea         edi,[esi+eax*2] 
00  cmp         esi,edi 
00  je          raw_iterator_loop+45h (4028B5h) 
00  jmp         raw_iterator_loop+30h (4028A0h) 
00  lea         esp,[esp] 
00  lea         ecx,[ecx] 
            g_read_size = *j;
00  movzx       ecx,word ptr [esi] 
00  mov         dword ptr [g_read_size (4060B0h)],ecx 
            side_effect();
00  call        side_effect (401020h) 
00  add         esi,2 
00  cmp         esi,edi 
00  jne         raw_iterator_loop+30h (4028A0h) 
        }
        side_effect();
00  call        side_effect (401020h) 
00  sub         ebp,1 
00  jne         raw_iterator_loop+12h (402882h) 
00  pop         edi  
00  pop         esi  
00  pop         ebp  
00  pop         ebx  
    }
}
00  ret

*vector_iterator_loop*

void vector_iterator_loop(const int n, VecX* obj)
{
    for(int i=0; i!=n; ++i) {
00  mov         eax,dword ptr [esp+4] 
00  test        eax,eax 
00  je          vector_iterator_loop+43h (402813h) 
00  push        ebx  
00  mov         ebx,dword ptr [esp+0Ch] 
00  push        ebp  
00  push        esi  
00  push        edi  
00  mov         ebp,eax 
        side_effect();
00  call        side_effect (401020h) 
        for(std::vector<PodType>::const_iterator j=obj->vData.begin(), e=obj->vData.end(); j!=e; ++j) {
00  mov         esi,dword ptr [ebx+4] 
00  mov         edi,dword ptr [ebx+8] 
00  cmp         esi,edi 
00  je          vector_iterator_loop+35h (402805h) 
            g_read_size = *j;
00  movzx       eax,word ptr [esi] 
00  mov         dword ptr [g_read_size (4060B0h)],eax 
            side_effect();
00  call        side_effect (401020h) 
00  add         esi,2 
00  cmp         esi,edi 
00  jne         vector_iterator_loop+21h (4027F1h) 
        }
        side_effect();      
00  call        side_effect (401020h) 
00  sub         ebp,1 
00  jne         vector_iterator_loop+12h (4027E2h) 
00  pop         edi  
00  pop         esi  
00  pop         ebp  
00  pop         ebx  
    }
}
00  ret

634

asked Oct 14 '10 16:10

Martin Ba

2 Answers

While my version of the generated machine code is different from yours (MSVC++ 2005), one difference between the two variants is pretty much the same as in your code:

In the vector version of the code, the "end iterator" value is pre-calculated and stored as a member of the std::vector object, so the inner loop simply loads the readily available value.
In the raw pointer version, the "end iterator" value is calculated explicitly in the header of the inner loop (by a lea instruction used to implement the multiplication), meaning that each iteration of the outer loop performs that calculation again and again.

If you re-implement your raw_iterator_loop as follows (i.e. pull the calculation of the end pointer out of the outer loop)

// Variant of the question's raw_iterator_loop with the end pointer computed
// once, before the outer loop, instead of once per outer pass.
// NOTE(review): this assumes obj->pData and obj->wCount do not change while
// the loops run; side_effect() is opaque from here -- confirm.
void raw_iterator_loop(const int n, RawX* obj)
{
    PodType *e = obj->pData+size_t(obj->wCount);

    for(int i=0; i!=n; ++i) {
        side_effect();
        // Only the start pointer is still re-read from obj on every pass.
        for(PodType *j=obj->pData; j!=e; ++j) {
            g_read_size = *j;
            side_effect();
        }
        side_effect();
    }
}

(or even store and maintain the end pointer in your class) you should end up with a more "fair" comparison.

193

answered Sep 21 '22 06:09

AnT

One concrete reason for the difference in instructions generated is that Visual C++ vector has members _Myfirst and _Mylast (corresponding to begin() and end()) that simplify the loop setup.

In the raw case, the compiler has to do actual pointer math to set up the required start and end locals.

It's plausible that this complicates register usage enough to make the vector code faster.

answered Sep 19 '22 06:09

Steve Townsend

Related questions
                            
                                Does a Qt application work in Google Native Client?
                            
                                sprintf_s problem
                            
                                Is it possible to choose a C++ generic type parameter at runtime?
                            
                                Hashing function for four unsigned integers (C++)
                            
                                compiling a program to run in DOS mode
                            
                                what is auto_ptr_ref, what it achieves and how it achieves it
                            
                                C++ specific patterns due to language design
                            
                                C++: Getting incorrect file size
                            
                                Populate a vector with all multimap values with a given key
                            
                                Some clarification on rvalue references
                            
                                What is proper way to detect all available serial ports on Windows?
                            
                                OpenSSL certificate lacks key identifiers
                            
                                C++ warning: suggest parentheses around arithmetic in operand of |
                            
                                Resolving RVA's for Import and Export tables within a PE file
                            
                                Why do compilers allow string literals not to be const?
                            
                                How do I turn relational comparison of pointers into an error?
                            
                                Variadic templates for lambda expressions
                            
                                swprintf and vswprintf not declared?
                            
                                Disabling C++0x features in VC 2010?
                            
                                Pointers vs auto_ptr vs shared_ptr

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With