I have some timing code which I'm using to measure the runtime of a given snippet of code:
// Captures a start/end pair from steady_clock and formats the result.
// Fixes vs. original: `static friend` is ill-formed C++ (a friend declaration
// may not carry a storage-class specifier) — it is now a plain friend; the
// two time_points are brace-initialized so a default-constructed time_data
// has a well-defined (zero) duration instead of indeterminate values.
struct time_data {
std::chrono::steady_clock::time_point start{}, end{};
// Elapsed time between the two recorded points (steady_clock::duration).
auto get_duration() const {
return end - start;
}
// Writes the formatted report to an arbitrary stream.
void print_data(std::ostream & out) const {
out << str();
}
// Renders start, end, and duration in nanoseconds, using the user's
// preferred locale so large counts get digit grouping (e.g. 1,234,567).
std::string str() const {
static const std::locale loc{ "" };
std::stringstream ss;
ss.imbue(loc);
ss << "Start: " << std::setw(24) << std::chrono::nanoseconds{ start.time_since_epoch() }.count() << "ns\n";
ss << "End: " << std::setw(24) << std::chrono::nanoseconds{ end.time_since_epoch() }.count() << "ns\n";
ss << "Duration: " << std::setw(24) << std::chrono::nanoseconds{ get_duration() }.count() << "ns\n";
return ss.str();
}
// Stream-insertion support; delegates to str().
friend std::ostream & operator<<(std::ostream & out, const time_data & data) {
return out << data.str();
}
};
// Runs `func` once and returns a time_data recording the steady_clock
// timestamps taken immediately before and after the call.
template<typename T>
time_data time_function(T && func) {
const auto begun = std::chrono::steady_clock::now();
func();
const auto finished = std::chrono::steady_clock::now();
time_data data;
data.start = begun;
data.end = finished;
return data;
}
Then, to make use of it, I have the following code designed to perform an accumulation on a set of data, but instead of simply adding the numbers together, it first finds the Total Stopping Time from applying the Collatz Conjecture and accumulates that instead.
// Binary op for accumulate: returns `a` plus the Collatz total stopping
// time of `b` (number of steps for b to reach 1 under n -> n/2 or 3n+1).
// Values of b <= 1 contribute zero steps.
template<typename T>
T accumulation_function(T a, T b) {
T steps = 0;
for (T n = b; n > 1; ++steps) {
n = (n % 2 == 0) ? n / 2 : n * 3 + 1;
}
return a + steps;
}
template<typename IT>
auto std_sum(IT begin, IT end) {
auto sum = (*begin - *begin);
sum = std::accumulate(begin, end, sum, accumulation_function<decltype(sum)>);
return sum;
}
// Hand-rolled single-threaded equivalent of std_sum, kept as a baseline
// for the benchmark. Fix vs. original: the `*begin - *begin` seed was UB
// on an empty range; value-initialization gives the same zero safely.
template<typename IT>
auto single_thread_sum(IT begin, IT end) {
using value_type = typename std::iterator_traits<IT>::value_type;
value_type sum{};
for (IT current = begin; current != end; ++current) {
sum = accumulation_function(sum, *current);
}
return sum;
}
// Splits [begin, end) into N contiguous chunks (integer division puts any
// remainder in the later chunks) and sums each chunk's Collatz stopping
// times on its own thread, then reduces the per-thread partials.
// Fixes vs. original:
//  - seed was `*begin - *begin` (UB on an empty range) -> value_type{};
//  - each thread accumulated through a reference into the shared `sums`
//    array, so neighbouring threads wrote adjacent addresses on every
//    iteration (false sharing); now each thread folds into a local and
//    writes sums[i] exactly once at the end;
//  - threads.reserve(N) avoids reallocation while spawning.
template<typename IT, uint64_t N>
auto N_thread_smart_sum(IT begin, IT end) {
static_assert(N > 0, "N_thread_smart_sum requires at least one thread");
using value_type = typename std::iterator_traits<IT>::value_type;
value_type sum{};
const auto dist = std::distance(begin, end);
std::array<value_type, N> sums{};
std::vector<std::thread> threads;
threads.reserve(N);
for (uint64_t i = 0; i < N; i++) {
threads.emplace_back([=, &sums] {
IT first = begin;
IT last = begin;
std::advance(first, i * dist / N);
std::advance(last, (i + 1) * dist / N);
// Local accumulator: one write to the shared array per thread.
value_type tsum{};
for (; first != last; ++first) {
tsum = accumulation_function(tsum, *first);
}
sums[i] = tsum;
});
}
for (std::thread & thread : threads)
thread.join();
for (const auto & s : sums) {
sum += s;
}
return sum;
}
// Convenience forwarders so each thread count has a uniformly-named
// entry point for the benchmark harness. All delegate to
// N_thread_smart_sum<IT, N> with the thread count baked in.
template<typename IT>
auto two_thread_smart_sum(IT begin, IT end) { return N_thread_smart_sum<IT, 2>(begin, end); }

template<typename IT>
auto four_thread_smart_sum(IT begin, IT end) { return N_thread_smart_sum<IT, 4>(begin, end); }

template<typename IT>
auto eight_thread_smart_sum(IT begin, IT end) { return N_thread_smart_sum<IT, 8>(begin, end); }

template<typename IT>
auto sixteen_thread_smart_sum(IT begin, IT end) { return N_thread_smart_sum<IT, 16>(begin, end); }

template<typename IT>
auto thirty_two_thread_smart_sum(IT begin, IT end) { return N_thread_smart_sum<IT, 32>(begin, end); }

template<typename IT>
auto sixty_four_thread_smart_sum(IT begin, IT end) { return N_thread_smart_sum<IT, 64>(begin, end); }
And for reference's sake, here is the boilerplate main code that runs it all:
// Benchmark driver: fills a billion-element vector, then times each
// summing strategy and prints its label, result, and timing report.
// Fixes vs. original: `sum` is zero-initialized (it was an indeterminate
// int64_t), and the eight copies of identical time/print boilerplate are
// folded into one run_case helper with byte-identical labels and output.
int main() {
std::vector<int64_t> raw_data;
auto fill_data = time_function([&raw_data] {
constexpr uint64_t SIZE = 1'000'000'000ull;
raw_data.resize(SIZE);
std::vector<std::thread> threads;
for (int i = 0; i < 8; i++) {
threads.emplace_back([i, SIZE, &raw_data] {
uint64_t begin = i * SIZE / 8;
uint64_t end = (i + 1) * SIZE / 8;
for (uint64_t index = begin; index < end; index++) {
// NOTE(review): `begin % (20 + i)` is loop-invariant, so every
// element of this chunk gets the same value — was `index % (20 + i)`
// intended? Left unchanged so the published sums stay reproducible.
raw_data[index] = begin % (20 + i);
}
});
}
for (std::thread & t : threads)
t.join();
});
int64_t sum{};  // zero-init: never read an indeterminate value
std::cout << std::setw(25) << "Fill Data" << std::endl;
std::cout << fill_data << std::endl;
// Times one strategy over raw_data and prints the same two-line report
// (right-justified label + result, then the timing data) as before.
auto run_case = [&](const char * label, auto sum_fn) {
auto timing = time_function([&] {
sum = sum_fn(raw_data.begin(), raw_data.end());
});
std::cout << std::setw(25) << label << sum << std::endl;
std::cout << timing << std::endl;
};
run_case("STD Sum: ", [](auto b, auto e) { return std_sum(b, e); });
run_case("Single Sum: ", [](auto b, auto e) { return single_thread_sum(b, e); });
run_case("Two-Thread-Smart Sum: ", [](auto b, auto e) { return two_thread_smart_sum(b, e); });
run_case("Four-Thread-Smart Sum: ", [](auto b, auto e) { return four_thread_smart_sum(b, e); });
run_case("Eight-Thread-Smart Sum: ", [](auto b, auto e) { return eight_thread_smart_sum(b, e); });
run_case("Sixteen-Thread-Smart Sum: ", [](auto b, auto e) { return sixteen_thread_smart_sum(b, e); });
run_case("Thirty-Two-Thread-Smart Sum: ", [](auto b, auto e) { return thirty_two_thread_smart_sum(b, e); });
run_case("Sixty-Four-Thread-Smart Sum: ", [](auto b, auto e) { return sixty_four_thread_smart_sum(b, e); });
return 0;
}
This is the output of the program:
Fill Data
Start: 16,295,979,890,252ns
End: 16,300,523,805,484ns
Duration: 4,543,915,232ns
STD Sum: 7750000000
Start: 16,300,550,212,791ns
End: 16,313,216,607,890ns
Duration: 12,666,395,099ns
Single Sum: 7750000000
Start: 16,313,216,694,522ns
End: 16,325,774,379,684ns
Duration: 12,557,685,162ns
Two-Thread-Smart Sum: 7750000000
Start: 16,325,774,466,014ns
End: 16,334,441,032,868ns
Duration: 8,666,566,854ns
Four-Thread-Smart Sum: 7750000000
Start: 16,334,441,137,913ns
End: 16,342,188,642,721ns
Duration: 7,747,504,808ns
Eight-Thread-Smart Sum: 7750000000
Start: 16,342,188,770,706ns
End: 16,347,850,908,471ns
Duration: 5,662,137,765ns
Sixteen-Thread-Smart Sum: 7750000000
Start: 16,347,850,961,597ns
End: 16,352,187,838,584ns
Duration: 4,336,876,987ns
Thirty-Two-Thread-Smart Sum: 7750000000
Start: 16,352,187,891,710ns
End: 16,356,111,411,220ns
Duration: 3,923,519,510ns
Sixty-Four-Thread-Smart Sum: 7750000000
Start: 16,356,111,471,288ns
End: 16,359,988,028,812ns
Duration: 3,876,557,524ns
The first few results aren't surprising: my self-written accumulate code works about the same as the std::accumulate
function (I've seen both emerge as the "fastest" on consecutive runs, implying their implementation is probably similar). And when I move to Two threads, and Four threads, the code gets faster. This makes sense, because I'm using an Intel 4-core processor.
But the results beyond that point are confusing. My CPU only has 4 cores (8 if you consider the hyperthreading), but even if I excuse that the Hyperthreading is giving marginal performance increases at 8 threads, bumping the number up to 16, 32, and 64 threads all yield additional performance gains. Why is this? How are the additional threads yielding additional performance gains, when I've long since maxed out the number of threads the CPU can physically concurrently run?
Note: this is not the same as this linked question, as I'm dealing with a specific use case and code, whereas the linked question is dealing in generalities.
I made an adjustment to the code so that when threads are created, their priority is set to the highest priority Windows allows. Results, broadly speaking, seem to be the same.
Fill Data
Start: 18,950,798,175,057ns
End: 18,955,085,850,007ns
Duration: 4,287,674,950ns
STD Sum: 7750000000
Start: 18,955,086,975,013ns
End: 18,967,581,061,562ns
Duration: 12,494,086,549ns
Single Sum: 7750000000
Start: 18,967,581,136,724ns
End: 18,980,127,355,698ns
Duration: 12,546,218,974ns
Two-Thread-Smart Sum: 7750000000
Start: 18,980,127,426,332ns
End: 18,988,619,042,214ns
Duration: 8,491,615,882ns
Four-Thread-Smart Sum: 7750000000
Start: 18,988,619,135,487ns
End: 18,996,215,684,824ns
Duration: 7,596,549,337ns
Eight-Thread-Smart Sum: 7750000000
Start: 18,996,215,763,004ns
End: 19,002,055,977,730ns
Duration: 5,840,214,726ns
Sixteen-Thread-Smart Sum: 7750000000
Start: 19,002,056,055,608ns
End: 19,006,282,772,254ns
Duration: 4,226,716,646ns
Thirty-Two-Thread-Smart Sum: 7750000000
Start: 19,006,282,840,774ns
End: 19,010,539,676,914ns
Duration: 4,256,836,140ns
Sixty-Four-Thread-Smart Sum: 7750000000
Start: 19,010,539,758,113ns
End: 19,014,450,311,829ns
Duration: 3,910,553,716ns
I edited the program to make sure the tsum
variable was a local variable inside the lambda, rather than a reference to outside the lambda. The Multithreaded code sped up quite significantly, but it's still exhibiting the same behavior.
Fill Data
Start: 21,740,931,802,656ns
End: 21,745,429,501,480ns
Duration: 4,497,698,824ns
STD Sum: 7750000000
Start: 21,745,430,637,655ns
End: 21,758,206,612,632ns
Duration: 12,775,974,977ns
Single Sum: 7750000000
Start: 21,758,206,681,153ns
End: 21,771,260,233,797ns
Duration: 13,053,552,644ns
Two-Thread-Smart Sum: 7750000000
Start: 21,771,260,287,828ns
End: 21,777,845,764,595ns
Duration: 6,585,476,767ns
Four-Thread-Smart Sum: 7750000000
Start: 21,777,845,831,002ns
End: 21,784,011,543,159ns
Duration: 6,165,712,157ns
Eight-Thread-Smart Sum: 7750000000
Start: 21,784,011,628,584ns
End: 21,788,846,061,014ns
Duration: 4,834,432,430ns
Sixteen-Thread-Smart Sum: 7750000000
Start: 21,788,846,127,422ns
End: 21,791,921,652,246ns
Duration: 3,075,524,824ns
Thirty-Two-Thread-Smart Sum: 7750000000
Start: 21,791,921,747,330ns
End: 21,794,980,832,033ns
Duration: 3,059,084,703ns
Sixty-Four-Thread-Smart Sum: 7750000000
Start: 21,794,980,901,761ns
End: 21,797,937,401,426ns
Duration: 2,956,499,665ns
I ran the same tests again, but in reverse. Results are quite similar:
Fill Data
Start: 22,845,472,001,475ns
End: 22,850,746,219,215ns
Duration: 5,274,217,740ns
Sixty-Four-Thread-Smart Sum: 7750000000
Start: 22,850,747,328,223ns
End: 22,853,951,903,952ns
Duration: 3,204,575,729ns
Thirty-Two-Thread-Smart Sum: 7750000000
Start: 22,853,951,981,830ns
End: 22,857,239,237,507ns
Duration: 3,287,255,677ns
Sixteen-Thread-Smart Sum: 7750000000
Start: 22,857,239,307,839ns
End: 22,860,389,285,305ns
Duration: 3,149,977,466ns
Eight-Thread-Smart Sum: 7750000000
Start: 22,860,389,353,524ns
End: 22,864,828,560,484ns
Duration: 4,439,206,960ns
Four-Thread-Smart Sum: 7750000000
Start: 22,864,828,633,834ns
End: 22,870,640,982,096ns
Duration: 5,812,348,262ns
Two-Thread-Smart Sum: 7750000000
Start: 22,870,641,056,956ns
End: 22,877,032,284,384ns
Duration: 6,391,227,428ns
Single Sum: 7750000000
Start: 22,877,032,367,997ns
End: 22,889,501,695,960ns
Duration: 12,469,327,963ns
STD Sum: 7750000000
Start: 22,889,501,770,820ns
End: 22,902,014,633,229ns
Duration: 12,512,862,409ns
First of all, threads cannot speed up execution of code. They do not make the computer run faster. All they can do is increase the efficiency of the computer by using time that would otherwise be wasted.
Multi-threading: given concurrency, one can initiate parallel execution of multiple processes and achieve a faster runtime.
Threads and cores are two types of processing units inside the device. The number of threads and cores determines how many tasks can be executed simultaneously. A larger number of threads or cores means that more tasks can be completed at the same time, but this does not mean that those tasks will complete faster.
A hardware thread is not as fast as a full core. Hyperthreading (SMT) allows tasks to be scheduled more efficiently, meaning they can use parts of the core that are not actively processing another task. In a best-case scenario, hyperthreading provides around 50% more performance compared to a single physical core.
All of your tsum
writes are to the same array
, which will occupy consecutive addresses in memory. This will cause cache contention issues with all those writes. Having more threads spreads those writes out to different cache lines, so the CPU cores spend less time invalidating and reloading cache lines.
Try accumulating to a local tsum
variable (not a reference into sums
), then writing that to sums[i]
when the loop is done.
I suspect that you are seeing the speed up because the threads for your application represent a larger percentage of the total active threads running on the system, giving you a more slices of time overall. You might want to see if the numbers change for the better or worse when you have additional programs running.
If you found this helpful, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With