Consider the following C++ code:
#include "threadpool.hpp"
#include <chrono>
#include <list>
#include <iostream>
#include <cmath>
int loop_size;
void process(int num) {
double x = 0;
double sum = 0;
for(int i = 0; i < loop_size; ++i) {
x += 0.0001;
sum += sin(x) / cos(x) + cos(x) * cos(x);
}
}
int main(int argc, char* argv[]) {
if(argc < 3) {
std::cerr << argv[0] << " [thread_pool_size] [threads] [sleep_time]" << std::endl;
exit(0);
}
thread_pool* pool = nullptr;
int th_count = std::atoi(argv[1]);
if(th_count != 0) {
pool = new thread_pool(th_count);
}
loop_size = std::stoi(argv[3]);
int max = std::stoi(argv[2]);
auto then = std::chrono::steady_clock::now();
std::list<std::thread> ths;
if(th_count == 0) {
for(int i = 0; i < max; ++i) {
ths.emplace_back(&process, i);
}
for(std::thread& t : ths) {
t.join();
}
} else {
for(int i = 0; i < max; ++i) {
pool->enqueue(std::bind(&process, i));
}
delete pool;
}
int diff = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - then).count();
std::cerr << "Time: " << diff << '\n';
return 0;
}
And "threadpool.hpp"
is modified version of this github repo and it is available here
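For context, it follows the usual single-queue design: all worker threads share one task queue protected by a single mutex and condition variable. The sketch below is only a generic illustration of that design, not the actual header (all names are illustrative):

// Generic single-queue thread pool sketch: one mutex-protected queue
// shared by every worker thread. Not the real threadpool.hpp.
#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class thread_pool {
public:
    explicit thread_pool(std::size_t n) {
        for (std::size_t i = 0; i < n; ++i)
            workers_.emplace_back([this] { run(); });
    }

    void enqueue(std::function<void()> task) {
        {
            std::lock_guard<std::mutex> lock(mtx_);  // every enqueue takes the same lock...
            tasks_.push(std::move(task));
        }
        cv_.notify_one();
    }

    ~thread_pool() {  // drains the queue, then joins the workers
        {
            std::lock_guard<std::mutex> lock(mtx_);
            done_ = true;
        }
        cv_.notify_all();
        for (auto& w : workers_) w.join();
    }

private:
    void run() {
        for (;;) {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mtx_);  // ...and every worker contends for it as well
                cv_.wait(lock, [this] { return done_ || !tasks_.empty(); });
                if (done_ && tasks_.empty()) return;
                task = std::move(tasks_.front());
                tasks_.pop();
            }
            task();
        }
    }

    std::vector<std::thread> workers_;
    std::queue<std::function<void()>> tasks_;
    std::mutex mtx_;
    std::condition_variable cv_;
    bool done_ = false;
};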
I compiled the above code on my machine (Core i7-6700) and on an 88-core server (2x Xeon E5-2696 v4), and I got results I can't explain.
This is how I run the code:
tp <threadpool size> <number of threads> <iterations>
The very same code runs slower on faster machines! I have 8 cores on my local machine and 88 cores on the remote server, and these are the results (the last two columns show the average completion time in milliseconds on each machine):
+============+=========+============+=============+====================+
| Threadpool | Threads | Iterations | Corei7-6700 | 2x Xeon E5-2696 v4 |
+============+=========+============+=============+====================+
| 100 | 100000 | 1000 | 1300 | 6000 |
+------------+---------+------------+-------------+--------------------+
| 1000 | 100000 | 1000 | 1400 | 5000 |
+------------+---------+------------+-------------+--------------------+
| 10000 | 100000 | 1000 | 1470 | 3400 |
+------------+---------+------------+-------------+--------------------+
It seems that having more cores makes the code run slower. So I restricted the CPU affinity on the server to 8 cores with taskset (a programmatic alternative is sketched in the notes below) and ran the code again:
taskset -c 0-7 tp <threadpool size> <number of threads> <iterations>
This is the new data:
+============+=========+============+=============+====================+
| Threadpool | Threads | Iterations | Corei7-6700 | 2x Xeon E5-2696 v4 |
+============+=========+============+=============+====================+
| 100 | 100000 | 1000 | 1300 | 900 |
+------------+---------+------------+-------------+--------------------+
| 1000 | 100000 | 1000 | 1400 | 1000 |
+------------+---------+------------+-------------+--------------------+
| 10000 | 100000 | 1000 | 1470 | 1070 |
+------------+---------+------------+-------------+--------------------+
I have tested the same code on a 32-core Xeon and on an older 22-core Xeon machine, and the pattern is similar: having fewer cores makes the multi-threaded code run faster. But why?
IMPORTANT NOTE: This is an effort to solve my original problem here:
Why having more and faster cores makes my multithreaded software slower?
Notes:
g++ ./threadpool.cpp -o ./tp -lpthread
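The affinity restriction can also be applied from inside the program instead of via taskset; a minimal Linux-only sketch (the helper below is illustrative, not part of my code), to be called at the top of main() so that threads created later inherit the mask:

// Illustrative helper: confine the whole process to CPUs 0-7, the same
// effect as `taskset -c 0-7`. Linux/glibc only; error handling omitted.
#include <sched.h>

void pin_process_to_cpus_0_to_7() {
    cpu_set_t set;
    CPU_ZERO(&set);
    for (int cpu = 0; cpu < 8; ++cpu)
        CPU_SET(cpu, &set);
    sched_setaffinity(0, sizeof(set), &set);  // pid 0 = the calling thread; later threads inherit the mask
}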
You are enqueueing a ton of workers into the thread pool which take very little time to execute. Consequently you are bottlenecked by the implementation of the thread pool (not the actual work), specifically the way its mutex handles contention. I tried replacing thread_pool with folly::CPUThreadPoolExecutor, which kind of helped:
thread_pool version:
2180 ms | thread_pool_size=100 num_workers=100000 loop_size=1000 affinity=0-23
2270 ms | thread_pool_size=1000 num_workers=100000 loop_size=1000 affinity=0-23
2400 ms | thread_pool_size=10000 num_workers=100000 loop_size=1000 affinity=0-23
530 ms | thread_pool_size=100 num_workers=100000 loop_size=1000 affinity=0-7
1930 ms | thread_pool_size=1000 num_workers=100000 loop_size=1000 affinity=0-7
2300 ms | thread_pool_size=10000 num_workers=100000 loop_size=1000 affinity=0-7
folly::CPUThreadPoolExecutor version:
830 ms | thread_pool_size=100 num_workers=100000 loop_size=1000 affinity=0-23
780 ms | thread_pool_size=1000 num_workers=100000 loop_size=1000 affinity=0-23
800 ms | thread_pool_size=10000 num_workers=100000 loop_size=1000 affinity=0-23
880 ms | thread_pool_size=100 num_workers=100000 loop_size=1000 affinity=0-7
1130 ms | thread_pool_size=1000 num_workers=100000 loop_size=1000 affinity=0-7
1120 ms | thread_pool_size=10000 num_workers=100000 loop_size=1000 affinity=0-7
I would suggest that you (1) do more work in each thread; (2) use about as many threads as CPUs; (3) use a better thread pool. Let's set thread_pool_size to the number of CPUs, and multiply loop_size by 10:
thread_pool version:
1880 ms | thread_pool_size=24 num_workers=100000 loop_size=10000 affinity=0-23
4100 ms | thread_pool_size=8 num_workers=100000 loop_size=10000 affinity=0-7
folly::CPUThreadPoolExecutor version:
1520 ms | thread_pool_size=24 num_workers=100000 loop_size=10000 affinity=0-23
2310 ms | thread_pool_size=8 num_workers=100000 loop_size=10000 affinity=0-7
Notice that by increasing the amount of work per thread by 10x, we actually made the thread_pool version faster, and the folly::CPUThreadPoolExecutor version only took 2x as much time. Let's multiply the loop_size by 10x more:
thread_pool version:
28695 ms | thread_pool_size=24 num_workers=100000 loop_size=100000 affinity=0-23
81600 ms | thread_pool_size=8 num_workers=100000 loop_size=100000 affinity=0-7
folly::CPUThreadPoolExecutor version:
6830 ms | thread_pool_size=24 num_workers=100000 loop_size=100000 affinity=0-23
14400 ms | thread_pool_size=8 num_workers=100000 loop_size=100000 affinity=0-7
For folly::CPUThreadPoolExecutor the results speak for themselves: doing more work in each thread gets you closer to truly linear gains from parallelism. And thread_pool just seems not to be up to the task; it can't properly deal with this scale of mutex contention.
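One way to act on suggestion (1) without changing loop_size is to batch the work items: enqueue a handful of large tasks instead of 100000 tiny ones, so each mutex acquisition is amortized over many calls to process(). A rough sketch against the thread_pool interface used above (the batching helper and its name are my own, not part of either code base):

// Sketch: split `total_items` calls to process() into `batch_count` large
// tasks, so the pool's queue (and its mutex) is touched only batch_count
// times instead of once per item.
#include <algorithm>
#include "threadpool.hpp"   // for thread_pool, as in the question

void process(int num);       // the worker function defined above

void enqueue_batched(thread_pool& pool, int total_items, int batch_count) {
    int per_batch = (total_items + batch_count - 1) / batch_count;  // ceiling division
    for (int b = 0; b < batch_count; ++b) {
        int begin = b * per_batch;
        int end = std::min(total_items, begin + per_batch);
        if (begin >= end) break;
        pool.enqueue([begin, end]() {
            for (int i = begin; i < end; ++i)
                process(i);  // same total work, far fewer enqueue operations
        });
    }
}

Called as enqueue_batched(*pool, max, th_count) in place of the per-item enqueue loop, this keeps the total work identical while reducing the number of queue operations from 100000 to thread_pool_size.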
Here's the code I used to test (compiled with gcc 5.5, full optimization):
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#define USE_FOLLY 1

#if USE_FOLLY
#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/futures/Future.h>
#else
#include "threadpool.hpp"
#endif

int loop_size;
thread_local double dummy = 0.0;

void process(int num) {
    double x = 0;
    double sum = 0;
    for (int i = 0; i < loop_size; ++i) {
        x += 0.0001;
        sum += sin(x) / cos(x) + cos(x) * cos(x);
    }
    dummy += sum;  // prevent optimization
}

int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << argv[0] << " [thread_pool_size] [threads] [sleep_time]"
                  << std::endl;
        exit(0);
    }
    int th_count = std::atoi(argv[1]);
#if USE_FOLLY
    auto executor = std::make_unique<folly::CPUThreadPoolExecutor>(th_count);
#else
    auto pool = std::make_unique<thread_pool>(th_count);
#endif
    loop_size = std::stoi(argv[3]);
    int max = std::stoi(argv[2]);
    auto then = std::chrono::steady_clock::now();
#if USE_FOLLY
    std::vector<folly::Future<folly::Unit>> futs;
    for (int i = 0; i < max; ++i) {
        futs.emplace_back(folly::via(executor.get()).then([i]() { process(i); }));
    }
    folly::collectAll(futs).get();
#else
    for (int i = 0; i < max; ++i) {
        pool->enqueue([i]() { process(i); });
    }
    pool = nullptr;
#endif
    int diff = std::chrono::duration_cast<std::chrono::milliseconds>(
                   std::chrono::steady_clock::now() - then)
                   .count();
    std::cerr << "Time: " << diff << '\n';
    return 0;
}
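To reproduce the thread_pool numbers from the same file, set USE_FOLLY to 0 at the top. The program is invoked the same way as the original (binary name assumed to be tp, as in the question), e.g. for the 24-thread, loop_size=10000 row above:

./tp 24 100000 10000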