I was playing around the other day, trying to see how far I could optimize something. I decided to start from a simple map that just does a linear search to find whether an element is there, and then optimize it as much as I could. To compare, I do the same with a std::map and with a std::vector using std::find.
The results with std::map are the expected ones: slower creation and destruction than my map, but much faster lookups (actually, I have been unable to measure them; the timer always returns 0). The problem is with std::vector. I expected it to be slower than my implementation, but it is not, and I really don't understand how it can be the same speed or faster, since my implementation skips a worst case (the value isn't in the container) and uses a cache of recent results.
Can anyone shed some light here? I know the people behind the STL are semi-gods, but still, this doesn't make sense.
Benchmark results (i3, Windows 8.1 Pro 64, Visual Studio 2013):
std::vector :
Build : 85.0042 ms
Loop : 37.0011 ms
Find : 1.82259 ms -> First : Found, Second : Found, Third : Not Found
Release : 0 ms
--------------------
std::map :
Build : 6929.41 ms
Loop : 570.032 ms
Find : 0 ms -> First : Found, Second : Found, Third : Not Found
Release : 1425.08 ms
--------------------
Linear Map V0:
Build : 194.012 ms
Loop : 49.0052 ms
Find : 1.88915 ms -> First : Found, Second : Found, Third : Not Found
Release : 109.004 ms
Here's the code for the map:
#include <intrin.h>  // MSVC intrinsics: __m128i and the _mm_* functions used below
#include <cstdint>   // INT32_MAX
template<typename T>
class LinearMap0
{
public:
LinearMap0()
{
_end = _root = new Node;
_prebuffer = nullptr;
prebufferCapacity = 0;
_alive = true;
prebufferMarker = 0;
_cache = _mm_set1_epi32(-1);
cacheMarker = 0;
for (auto& ptr : _cacheBuffer) ptr = nullptr;
MinID = INT32_MAX - 1;
MaxID = -1;
}
void PreAllocate(int Count)
{
prebufferCapacity = Count;
_prebuffer = new Node[Count];
}
~LinearMap0()
{
if (_alive)
{
Release();
}
}
void Release()
{
Node* marker = _end;
while (marker->Prev)
{
marker = marker->Prev;
if (!marker->Next->IsPreAllocated) delete marker->Next;
}
if (!_root->IsPreAllocated) delete _root;
delete[] _prebuffer;
_alive = false;
}
void AddElement(int ID,T element)
{
Node* tmp = nullptr;
if (prebufferMarker < prebufferCapacity)
{
// Use a pre-allocated object
tmp = &_prebuffer[prebufferMarker];
prebufferMarker++;
tmp->IsPreAllocated = true;
}
else
{
tmp = new Node;
}
tmp->ID = ID;
tmp->Data = element;
// Update list
_end->Next = tmp;
Node* prevEnd = _end;
_end = tmp;
_end->Prev = prevEnd;
// Branchless update of the min/max ID bounds used by Find()
bool isMin = ID < MinID; MinID = ID * isMin + (1 - isMin) * MinID;
bool isMax = ID > MaxID; MaxID = ID * isMax + (1 - isMax) * MaxID;
}
void DeleteLast()
{
Node* tmp = _end;
_end = _end->Prev;
_end->Next = nullptr;
delete tmp;
}
template<class Function>
void Loop(Function&& f, bool Forward = true)
{
if (Forward)
{
Node* marker = _root;
while (marker->Next)
{
marker = marker->Next;
f(marker->Data);
}
}
else
{
Node* marker = _end;
while (marker->Prev)
{
marker = marker->Prev;
f(marker->Data);
}
}
}
T* Find(int ID)
{
// Bounds check
if (ID < MinID || ID > MaxID) return nullptr;
// Check if it's in the cache
// Compare the value to every value in the cache
__m128i idxSSE = _mm_set1_epi32(ID);
__m128i C = _mm_cmpeq_epi32(_cache, idxSSE);
// To change from -1 to 1
C = _mm_mullo_epi32(C, _mm_set1_epi32(-1));
// Now each of the 4 lanes of C holds 1 if it matched the ID or 0 if not; at most one lane should be set
__m128i tmp = _mm_set1_epi32(1);
__m128i S = _mm_sub_epi32(tmp, C);
// Now find the index
int i = S.m128i_i32[0] * (C.m128i_i32[1] + S.m128i_i32[1] * (2 * C.m128i_i32[2] + S.m128i_i32[2] * (3 * C.m128i_i32[3] + S.m128i_i32[3] * -1)));
if (i != -1)
return _cacheBuffer[i];
// Traverse the list
Node* marker0 = _root;
T* obj = nullptr;
while (true)
{
if (marker0->ID == ID)
{
obj = &marker0->Data;
}
if (marker0->Next) marker0 = marker0->Next; else break;
}
// Cache value and return
_cache.m128i_i32[cacheMarker] = ID;
_cacheBuffer[cacheMarker] = obj;
cacheMarker = (cacheMarker + 1) & 3; // x & 3 = x % 4
return obj;
}
private:
struct Node
{
Node()
{
Prev = nullptr;
Next = nullptr;
IsPreAllocated = false;
ID = -1;
}
T Data;
Node* Prev;
Node* Next;
bool IsPreAllocated;
int ID;
};
Node* _root;
Node* _end;
Node* _prebuffer;
int prebufferCapacity;
int prebufferMarker;
bool _alive;
__m128i _cache;
T* _cacheBuffer[4];
int cacheMarker;
int MinID, MaxID;
};
And here's the benchmark:
// Initialize seeds
const __int64 ecount = 5 * 1000*1000;
vector<__int64> seed(ecount);
for (__int64 i = 0; i < ecount; i++)
{
seed[i] = i;
}
random_shuffle(seed.begin(), seed.end());
///////////// std::vector
vector<__int64> v;
cout << "--------------------" << endl;
cout << "std::vector :" << endl;
cout << " Build : " << time_call([&]()
{
v.resize(ecount/2);
for (__int64 i = 0; i < ecount; i++)
{
if (i < (ecount / 2))
v[i] = seed[i];
else
v.push_back(seed[i]);
}
}) << " ms" << endl;
cout << " Loop : " << time_call([&]()
{
for (auto& n : v)
n /= 2;
}) << " ms" << endl;
bool found1, found2, found3;
cout << " Find : " << (((float)time_call([&]()
{
for (int i = 0; i < 15; i++)
{
// Should exist
found1 = find(v.begin(), v.end(), seed[5] / 2) != v.end();
found2 = find(v.begin(), v.end(), seed[1000] / 2) != v.end();
// Shouldn't exist
found3 = find(v.begin(), v.end(), -1234) != v.end();
}
})) / 15.0) / 3.0;
cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;
cout << " Release : " << time_call([&]()
{
v.clear();
}) << " ms" << endl;
///////////// std::map
map<__int64, __int64> m;
cout << "--------------------" << endl;
cout << "std::map :" << endl;
cout << " Build : " << time_call([&]()
{
for (__int64 i = 0; i < ecount; i++)
{
m[seed[i]] = seed[i];
}
}) << " ms" << endl;
cout << " Loop : " << time_call([&]()
{
for (auto& n : m)
n.second /= 2;
}) << " ms" << endl;
cout << " Find : " << (((float)time_call([&]()
{
for (int i = 0; i < 15; i++)
{
// Should exist
found1 = m.find(seed[5]) != m.end();
found2 = m.find(seed[1000]) != m.end();
// Shouldn't exist
found3 = m.find(-1234) != m.end();
}
})) / 15.0) / 3.0;
cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;
cout << " Release : " << time_call([&]()
{
m.clear();
}) << " ms" << endl;
///////////// Linear Map V0
LinearMap0<__int64> c;
cout << "--------------------" << endl;
cout << "Linear Map V0:" << endl;
cout << " Build : " << time_call([&]()
{
c.PreAllocate(ecount / 2);
for (__int64 i = 0; i < ecount; i++)
{
c.AddElement(seed[i],seed[i]);
}
}) << " ms" << endl;
cout << " Loop : " << time_call([&]()
{
c.Loop([](__int64& Data)
{
Data /= 2;
});
}) << " ms" << endl;
cout << " Find : " << (((float)time_call([&]()
{
for (int i = 0; i < 15; i++)
{
// Should exist
found1 = c.Find(seed[5]);
found2 = c.Find(seed[1000]);
// Shouldn't exist
found3 = c.Find(-1234);
}
})) / 15.0) / 3.0;
cout << " ms -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;
cout << " Release : " << time_call([&]()
{
c.Release();
}) << " ms" << endl;
EDIT: time_call is:
template <class Function>
double time_call(Function&& f)
{
chrono::time_point<chrono::high_resolution_clock> start, end;
start = chrono::high_resolution_clock::now();
f();
end = chrono::high_resolution_clock::now();
return ((double)(chrono::duration_cast<chrono::nanoseconds>(end - start).count())) / 1000000.0;
}
Your container is a linked list, whereas std::vector is a dynamically-sized array.
The linked list approach has benefits, such as being able to insert elements without any re-allocations.
However, the array approach has some significant advantages:
- If you compile with -O3, the compiler will likely use a vectorized version of std::find; it's impossible to vectorize a linked list scan due to the memory dependencies between nodes.
- Each node has to store a Prev and a Next pointer, which effectively makes your elements larger. Also, each non-preallocated Node has to pay the overhead of the allocator (i.e. accounting data for new and delete). That means you're hitting memory bandwidth limits sooner, and you can fit fewer elements in cache.
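To make the difference concrete, here is a minimal sketch (not the benchmark from the question; the element count, the ListNode type, and the timing code are my own assumptions) that times the same failed search over a contiguous std::vector and over a heap-allocated singly linked list:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <vector>
// Hypothetical node type, just for illustration
struct ListNode
{
    long long value;
    ListNode* next;
};
int main()
{
    const int count = 5 * 1000 * 1000;
    // Contiguous storage: one block of memory, scanned sequentially
    std::vector<long long> flat(count);
    for (int i = 0; i < count; i++) flat[i] = i;
    // Linked list: one heap allocation per element, scanned by chasing pointers
    ListNode* head = nullptr;
    for (int i = count - 1; i >= 0; i--)
    {
        ListNode* n = new ListNode;
        n->value = i;
        n->next = head;
        head = n;
    }
    auto t0 = std::chrono::high_resolution_clock::now();
    // Search for a value that is not present, so the whole range is scanned
    bool inVector = std::find(flat.begin(), flat.end(), -1234LL) != flat.end();
    auto t1 = std::chrono::high_resolution_clock::now();
    bool inList = false;
    for (ListNode* p = head; p; p = p->next)
        if (p->value == -1234) { inList = true; break; }
    auto t2 = std::chrono::high_resolution_clock::now();
    std::cout << "vector scan: " << std::chrono::duration<double, std::milli>(t1 - t0).count()
              << " ms, found = " << inVector << std::endl;
    std::cout << "list scan  : " << std::chrono::duration<double, std::milli>(t2 - t1).count()
              << " ms, found = " << inList << std::endl;
    // Free the list
    while (head) { ListNode* n = head->next; delete head; head = n; }
    return 0;
}
Both scans visit every element, but the vector scan walks one contiguous block that the hardware prefetcher and the auto-vectorizer can exploit, while the list scan cannot load a node until the previous node's Next pointer has been read, and every node also drags its pointer and allocator overhead through the cache.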