Is new[] faster than Win32's VirtualAlloc?

Tags:

I was testing the performance of some string pool allocators: I considered the one presented here that calls VirtualAlloc and then carves out sub-allocations, and a similar implementation using standard C++ (without directly calling any Win32 API) and new[].

I expected the VirtualAlloc version to be faster, since I thought there should be less overhead than C++ new[]; but the results I observed are the opposite: using new[] seems to result in faster code than using the lower-level VirtualAlloc.

I ran the test several times (the code is compiled with VS2010 SP1), and the output is something like this:

Click to copy

String pool using VirtualAlloc: 1280.07 ms
String pool using new[]: 799.193 ms

Why is this? Why does new[] seem to be faster than VirtualAlloc?

Test source code follows:

Click to copy

////////////////////////////////////////////////////////////////////////////
// Testing VirtualAlloc vs. new[].
////////////////////////////////////////////////////////////////////////////


#include <string.h>
#include <wchar.h>
#include <algorithm>
#include <exception>
#include <iostream>
#include <memory>
#include <new>
#include <ostream>
#include <random>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include <windows.h>
using namespace std;


//--------------------------------------------------------------------------
// String pool allocator using VirtualAlloc, based on this:
// http://blogs.msdn.com/oldnewthing/archive/2005/05/19/420038.aspx
//--------------------------------------------------------------------------
class StringPoolUsingVirtualAlloc
{
public:

    StringPoolUsingVirtualAlloc()
        : m_pchNext(nullptr), 
          m_pchLimit(nullptr), 
          m_phdrCur(nullptr)
    {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        m_dwGranularity = static_cast<DWORD>( 
            RoundUp( sizeof(HEADER) + MIN_CBCHUNK, si.dwAllocationGranularity 
            ));
    }

    ~StringPoolUsingVirtualAlloc()
    {
        HEADER* phdr = m_phdrCur;
        while (phdr) 
        {
            HEADER * phdrPrev = phdr->m_phdrPrev;
            VirtualFree(phdr, 0, MEM_RELEASE);
            phdr = phdrPrev;
        }
    }

    wchar_t* DuplicateString(const wstring& source)
    {
        return AllocString(source.c_str(), source.c_str() + source.length());
    }

private:
    union HEADER 
    {
        struct 
        {
            HEADER* m_phdrPrev;
            SIZE_T  m_cb;
        };
        wchar_t alignment;
    };

    enum 
    { 
        MIN_CBCHUNK = 32000,
        MAX_CHARALLOC = 1024*1024
    };

    wchar_t*  m_pchNext;
    wchar_t*  m_pchLimit;
    HEADER*   m_phdrCur;
    DWORD     m_dwGranularity;

    static SIZE_T RoundUp(SIZE_T cb, SIZE_T units)
    {
        return ((cb + units - 1) / units) * units;
    }

    wchar_t* AllocString(const wchar_t* pchBegin, const wchar_t* pchEnd)
    {
        SIZE_T cchTotal = pchEnd - pchBegin + 1;
        if (cchTotal > MAX_CHARALLOC) 
            throw length_error("String too big.");

        wchar_t* psz = m_pchNext;
        if (m_pchNext + cchTotal <= m_pchLimit) 
        {
            m_pchNext += cchTotal;
            lstrcpynW(psz, pchBegin, static_cast<int>(cchTotal));
            return psz;
        }

        SIZE_T cbAlloc = RoundUp(cchTotal * sizeof(wchar_t) + sizeof(HEADER), m_dwGranularity);
        BYTE* pbNext = reinterpret_cast<BYTE*>(
            VirtualAlloc(nullptr, cbAlloc, MEM_COMMIT, PAGE_READWRITE));
        if (pbNext == nullptr) 
            throw bad_alloc();

        m_pchLimit = reinterpret_cast<wchar_t*>(pbNext + cbAlloc);
        HEADER* phdrCur = reinterpret_cast<HEADER*>(pbNext);
        phdrCur->m_phdrPrev = m_phdrCur;
        phdrCur->m_cb = cbAlloc;
        m_phdrCur = phdrCur;
        m_pchNext = reinterpret_cast<wchar_t*>(phdrCur + 1);
        return AllocString(pchBegin, pchEnd);
    }

    StringPoolUsingVirtualAlloc(const StringPoolUsingVirtualAlloc &);
    StringPoolUsingVirtualAlloc & operator=(const StringPoolUsingVirtualAlloc &);
};


//--------------------------------------------------------------------------
// String pool allocator that uses standard C++ (no Win32 stuff) and new[].
//--------------------------------------------------------------------------
// Bump allocator for NUL-terminated wide strings built on new[].
//
// Strings are carved out of heap-allocated character chunks one after
// another.  Individual strings are never freed; all chunks are released when
// the pool is destroyed, at which point every pointer returned by
// DuplicateString becomes invalid.
class StringPoolUsingNew
{
public:

    // Creates an empty pool; the first chunk is allocated on first use.
    StringPoolUsingNew()
        : m_pchNext(nullptr),
          m_pchLimit(nullptr)
    {
    }

    // Returned pointers refer into the chunks, so the pool is not copyable.
    StringPoolUsingNew(const StringPoolUsingNew&) = delete;
    StringPoolUsingNew& operator=(const StringPoolUsingNew&) = delete;

    // Copies `source` (all source.length() characters, including any embedded
    // NULs) into the pool and appends a terminating NUL.
    //
    // Returns a pointer to the pooled copy, valid until the pool is destroyed.
    // Throws std::length_error if the string plus terminator exceeds
    // kMaxCharAlloc characters, std::bad_alloc if memory is exhausted.
    wchar_t* DuplicateString(const std::wstring& source)
    {
        return AllocString(source.c_str(), source.c_str() + source.length());
    }

private:

    // NOTE: these constants have no out-of-class definition, so they must
    // only be used by value.  Binding a reference to one (e.g. passing it to
    // std::max) is an ODR-use and fails to link on conforming toolchains.
    static const size_t kMinChunkCharCount = 16000;     // smallest chunk, in characters
    static const size_t kMaxCharAlloc = 1024*1024;      // largest single string, in characters

    wchar_t*  m_pchNext;    // first free character in the current chunk
    wchar_t*  m_pchLimit;   // one past the end of the current chunk
    std::vector<std::unique_ptr<wchar_t[]>> m_chunks;   // owns every chunk

    // Copies [pchBegin, pchEnd) into the pool followed by a NUL terminator,
    // starting a new chunk first if the current one is too small.
    wchar_t* AllocString(const wchar_t* pchBegin, const wchar_t* pchEnd)
    {
        const size_t copyCount = static_cast<size_t>(pchEnd - pchBegin);
        const size_t cchTotal = copyCount + 1;   // + terminating NUL
        if (cchTotal > kMaxCharAlloc)
            throw std::length_error("String too big.");

        // Compare against the remaining capacity rather than forming
        // m_pchNext + cchTotal: before the first chunk exists both pointers
        // are null, and arithmetic on a null pointer is undefined.
        if (cchTotal > static_cast<size_t>(m_pchLimit - m_pchNext))
            StartNewChunk(cchTotal);

        wchar_t* const dest = m_pchNext;
        if (copyCount != 0)
            wmemcpy(dest, pchBegin, copyCount);
        dest[copyCount] = L'\0';
        m_pchNext += cchTotal;
        return dest;
    }

    // Allocates a chunk of at least minCharCount characters (never smaller
    // than kMinChunkCharCount) and makes it the current chunk.  Whatever
    // space was left in the previous chunk is abandoned.
    // If allocation or bookkeeping throws, the pool is left unchanged and
    // nothing leaks.
    void StartNewChunk(size_t minCharCount)
    {
        size_t chunkCharCount = kMinChunkCharCount;
        if (minCharCount > chunkCharCount)
            chunkCharCount = minCharCount;

        std::unique_ptr<wchar_t[]> chunk(new wchar_t[chunkCharCount]);
        wchar_t* const chunkBegin = chunk.get();
        m_chunks.push_back(std::move(chunk));   // may throw; `chunk` still owns the memory then

        m_pchNext = chunkBegin;
        m_pchLimit = chunkBegin + chunkCharCount;
    }
};


//------------------------------------------------------------------------
//                          Perf Measurement
//------------------------------------------------------------------------

long long Counter() 
{
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return li.QuadPart;
}

long long Frequency() 
{
    LARGE_INTEGER li;
    QueryPerformanceFrequency(&li);
    return li.QuadPart;
}

// Writes "<s>: <elapsed> ms" to standard output, where <elapsed> is the time
// between two Counter() readings converted to milliseconds.
//
//   start, finish - Counter() values taken before and after the measured work
//   s             - label printed in front of the timing
void PrintTime(long long start, long long finish, const char * s)
{
    const long long elapsedTicks = finish - start;
    const double elapsedMs = elapsedTicks * 1000.0 / Frequency();
    std::cout << s << ": " << elapsedMs << " ms" << std::endl;
}


//--------------------------------------------------------------------------
// Test
//--------------------------------------------------------------------------
int main()
{
    static const int kExitOk = 0;
    static const int kExitError = 1;
    try
    {
        long long start = 0;
        long long finish = 0;

        const auto shuffled = []() -> vector<wstring> 
        {
            const wstring lorem[] = {
                L"Lorem ipsum dolor sit amet, consectetuer adipiscing elit.",
                L"Maecenas porttitor congue massa. Fusce posuere, magna sed",
                L"pulvinar ultricies, purus lectus malesuada libero,",
                L"sit amet commodo magna eros quis urna.",
                L"Nunc viverra imperdiet enim. Fusce est. Vivamus a tellus.",
                L"Pellentesque habitant morbi tristique senectus et netus et",
                L"malesuada fames ac turpis egestas. Proin pharetra nonummy pede.",
                L"Mauris et orci."
            };

            vector<wstring> v;
            for (long long i = 0; i < 400*1000; ++i) 
            {
                for (auto it = begin(lorem); it != end(lorem); ++it) 
                {
                    v.push_back((*it) + L" (#" + to_wstring(i) + L")");
                }
            }
            random_shuffle(v.begin(), v.end());

            return v;
        }();

        start = Counter();
        {
            StringPoolUsingVirtualAlloc pool;
            vector<const wchar_t*> v;
            for (auto it = shuffled.begin(); it != shuffled.end(); ++it)
            {
                v.push_back( pool.DuplicateString(*it) );
            }
        }
        finish = Counter();
        PrintTime(start, finish, "String pool using VirtualAlloc");

        start = Counter();
        {
            StringPoolUsingNew pool;
            vector<const wchar_t*> v;
            for (auto it = shuffled.begin(); it != shuffled.end(); ++it)
            {
                v.push_back( pool.DuplicateString(*it) );
            }
        }
        finish = Counter();
        PrintTime(start, finish, "String pool using new[]");

        return kExitOk;
    }
    catch (const exception& e)
    {
        cerr << "*** ERROR: " << e.what() << endl;
        return kExitError;
    }
}

////////////////////////////////////////////////////////////////////////////

630

asked Feb 01 '13 00:02

Mr.C64

1 Answer

Yes, calling new[] repeatedly is much faster than calling VirtualAlloc repeatedly.

First, it is important to understand what new T[N] does. The new operator allocates storage by calling operator new[]. At least since Visual C++ 2010, operator new[] simply calls malloc, which calls the Windows API HeapAlloc to allocate storage from the CRT heap. Prior to Visual C++ 2012, each CRT has its own heap, created via HeapCreate. In Visual C++ 2012, the CRT uses the process heap, obtained via GetProcessHeap. From a performance standpoint, it doesn't matter which heap is used.

VirtualAlloc is used to map pages of memory into the virtual address space of a process. This function is used when you need control over whole pages. For example, if you want to allocate storage to hold executable code, you need to use VirtualAlloc so that you can change the permissions on that storage to allow execution. VirtualAlloc is not optimized for general purpose memory allocation.

For that, you need a heap, which maps a large region of address space at a time, then services allocation requests from that mapped address space. A heap does not have to map and unmap virtual pages every time an allocation is requested (and, also importantly, a heap does not have to zero memory every time an allocation is performed).

When I run your original benchmark, I get the following result:

Click to copy

String pool using VirtualAlloc: 1162.45 ms
String pool using new[]: 625.842 ms

I replaced your usage of VirtualAlloc with HeapAlloc. To do this, I created a private heap for the allocator using HeapCreate(0, 0, 0), then replaced the calls to VirtualAlloc and VirtualFree with calls to HeapAlloc and HeapFree from this private heap. (Note that I did not use the process heap, because as I explained above, new[] uses that heap, so using that heap also here could change the performance of the new[] allocator.) The results of my modified allocator are as follows:

Click to copy

String pool using HeapAlloc: 919.853 ms
String pool using new[]: 636.515 ms

Well, that's extremely disappointing! We improved the performance of the custom allocator by 21%, but it's still much slower than new[]. What's up with that?

The profiler helpfully pointed out what the problem is: your benchmark is comparing apples and oranges. Your new[]-based allocator uses wmemcpy to copy strings, but your VirtualAlloc-based allocator uses lstrcpyn. wmemcpy simply calls memcpy, which has an intrinsic form, so it can be fully inlined with the insanely fast intrinsic form. lstrcpyn is a Windows API function that cannot be inlined. Your VirtualAlloc-based allocator doesn't stand a chance!

I replaced the use of lstrcpyn with wmemcpy. The results are as follows:

Click to copy

String pool using HeapAlloc: 636.149 ms
String pool using new[]: 655.479 ms

And these are the results that we expect: they perform roughly the same, with new[] being just a little bit slower, probably because of the small overhead of calling through operator new and malloc.

answered Sep 27 '22 22:09

James McNellis

Related questions
                            
                                How do I overload the operator * when my object is on the right side in C++?
                            
                                STL performance O(ln(n)) questions
                            
                                If a class might be inherited, should every function be virtual?
                            
                                c++ casting a union to one of its member types
                            
                                c++ comparing two floating point values
                            
                                how to set char * value from std string (c_str()) not working
                            
                                C, C++ Interface with Python
                            
                                expected unqualified-id before numeric constant
                            
                                Qt - How to get|compile Mysql driver
                            
                                C++ sort on vector using function object
                            
                                Converting float to 32-bit hexadecimal C++
                            
                                accessing std::list in the middle
                            
                                Overload ++ operator
                            
                                Idiom for doing something twice in C++
                            
                                How to remove border around QGraphicsItem when selected?
                            
                                Why does C++ still have a delete[] AND a delete operator? [closed]
                            
                                Finding NULL pointers in std vectors
                            
                                auto keyword strange behavior in C++11
                            
                                Why is it an infinite loop?
                            
                                Why isn't the member variable passed to a function modified?

Donate For Us

If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!

Donate Us With

Is new[] faster than Win32's VirtualAlloc?

Tags:

c++

memory-management

winapi

Mr.C64

People also ask

1 Answer

James McNellis

Recent Activity

Donate For Us