 

Why is this code getting faster when I'm using way more threads than my CPU has cores?

I have some timing code which I'm using to measure the runtime of a given snippet of code:

struct time_data {
    std::chrono::steady_clock::time_point start, end;

    auto get_duration() const {
        return end - start;
    }

    void print_data(std::ostream & out) const {
        out << str();
    }

    std::string str() const {
        static const std::locale loc{ "" };
        std::stringstream ss;
        ss.imbue(loc);
        ss << "Start:    " << std::setw(24) << std::chrono::nanoseconds{ start.time_since_epoch() }.count() << "ns\n";
        ss << "End:      " << std::setw(24) << std::chrono::nanoseconds{ end.time_since_epoch() }.count() << "ns\n";
        ss << "Duration: " << std::setw(24) << std::chrono::nanoseconds{ get_duration() }.count() << "ns\n";
        return ss.str();
    }

    friend std::ostream & operator<<(std::ostream & out, const time_data & data) {
        return out << data.str();
    }
};

template<typename T>
time_data time_function(T && func) {
    time_data data;
    data.start = std::chrono::steady_clock::now();
    func();
    data.end = std::chrono::steady_clock::now();
    return data;
}

Then, to make use of it, I have the following code designed to perform an accumulation over a set of data, but instead of simply adding the numbers together, it first computes each element's total stopping time under the Collatz conjecture and accumulates that instead.

template<typename T>
T accumulation_function(T a, T b) {
    T count = 0;
    while (b > 1) {
        if (b % 2 == 0) b /= 2;
        else b = b * 3 + 1;
        ++count;
    }
    return a + count;
}

template<typename IT>
auto std_sum(IT begin, IT end) {
    auto sum = (*begin - *begin);
    sum = std::accumulate(begin, end, sum, accumulation_function<decltype(sum)>);
    return sum;
}

template<typename IT>
auto single_thread_sum(IT begin, IT end) {
    auto sum = (*begin - *begin);
    IT current = begin;
    while (current != end) {
        sum = accumulation_function(sum, *current);
        ++current;
    }
    return sum;
}

template<typename IT, uint64_t N>
auto N_thread_smart_sum(IT begin, IT end) {
    auto sum = (*begin - *begin);
    std::vector<std::thread> threads;
    std::array<decltype(sum), N> sums;
    auto dist = std::distance(begin, end);
    for (uint64_t i = 0; i < N; i++) {
        threads.emplace_back([=, &sums] {
            IT first = begin;
            IT last = begin;
            auto & tsum = sums[i];
            tsum = 0;
            std::advance(first, i * dist / N);
            std::advance(last, (i + 1) * dist / N);
            while (first != last) {
                tsum = accumulation_function(tsum, *first);
                ++first;
            }
        });
    }
    for (std::thread & thread : threads)
        thread.join();
    for (const auto & s : sums) {
        sum += s;
    }
    return sum;
}

template<typename IT>
auto two_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 2>(begin, end);
}

template<typename IT>
auto four_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 4>(begin, end);
}

template<typename IT>
auto eight_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 8>(begin, end);
}

template<typename IT>
auto sixteen_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 16>(begin, end);
}

template<typename IT>
auto thirty_two_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 32>(begin, end);
}

template<typename IT>
auto sixty_four_thread_smart_sum(IT begin, IT end) {
    return N_thread_smart_sum<IT, 64>(begin, end);
}

And, for reference's sake, the boilerplate main code that runs it all:

int main() {
    std::vector<int64_t> raw_data;
    auto fill_data = time_function([&raw_data] {
        constexpr uint64_t SIZE = 1'000'000'000ull;
        raw_data.resize(SIZE);
        std::vector<std::thread> threads;
        for (int i = 0; i < 8; i++) {
            threads.emplace_back([i, SIZE, &raw_data] {
                uint64_t begin = i * SIZE / 8;
                uint64_t end = (i + 1) * SIZE / 8;
                for (uint64_t index = begin; index < end; index++) {
                    raw_data[index] = begin % (20 + i);
                }
            });
        }
        for (std::thread & t : threads) 
            t.join();
    });

    int64_t sum;

    std::cout << std::setw(25) << "Fill Data" << std::endl;
    std::cout << fill_data << std::endl;

    auto std_data = time_function([&] {
        sum = std_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "STD Sum: " << sum << std::endl;
    std::cout << std_data << std::endl;

    auto single_data = time_function([&] {
        sum = single_thread_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Single Sum: " << sum << std::endl;
    std::cout << single_data << std::endl;

    auto smart_2_data = time_function([&] {
        sum = two_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Two-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_2_data << std::endl;

    auto smart_4_data = time_function([&] {
        sum = four_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Four-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_4_data << std::endl;

    auto smart_8_data = time_function([&] {
        sum = eight_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Eight-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_8_data << std::endl;

    auto smart_16_data = time_function([&] {
        sum = sixteen_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Sixteen-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_16_data << std::endl;

    auto smart_32_data = time_function([&] {
        sum = thirty_two_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Thirty-Two-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_32_data << std::endl;

    auto smart_64_data = time_function([&] {
        sum = sixty_four_thread_smart_sum(raw_data.begin(), raw_data.end());
    });

    std::cout << std::setw(25) << "Sixty-Four-Thread-Smart Sum: " << sum << std::endl;
    std::cout << smart_64_data << std::endl;
    return 0;
}

This is the output of the program:

                Fill Data
Start:          16,295,979,890,252ns
End:            16,300,523,805,484ns
Duration:            4,543,915,232ns

                STD Sum: 7750000000
Start:          16,300,550,212,791ns
End:            16,313,216,607,890ns
Duration:           12,666,395,099ns

             Single Sum: 7750000000
Start:          16,313,216,694,522ns
End:            16,325,774,379,684ns
Duration:           12,557,685,162ns

   Two-Thread-Smart Sum: 7750000000
Start:          16,325,774,466,014ns
End:            16,334,441,032,868ns
Duration:            8,666,566,854ns

  Four-Thread-Smart Sum: 7750000000
Start:          16,334,441,137,913ns
End:            16,342,188,642,721ns
Duration:            7,747,504,808ns

 Eight-Thread-Smart Sum: 7750000000
Start:          16,342,188,770,706ns
End:            16,347,850,908,471ns
Duration:            5,662,137,765ns

Sixteen-Thread-Smart Sum: 7750000000
Start:          16,347,850,961,597ns
End:            16,352,187,838,584ns
Duration:            4,336,876,987ns

Thirty-Two-Thread-Smart Sum: 7750000000
Start:          16,352,187,891,710ns
End:            16,356,111,411,220ns
Duration:            3,923,519,510ns

Sixty-Four-Thread-Smart Sum: 7750000000
Start:          16,356,111,471,288ns
End:            16,359,988,028,812ns
Duration:            3,876,557,524ns

The first few results aren't surprising: my self-written accumulate code performs about the same as the std::accumulate function (I've seen both emerge as the "fastest" on consecutive runs, suggesting their implementations are probably similar). And when I move to two threads and four threads, the code gets faster. This makes sense, because I'm using an Intel 4-core processor.

But the results beyond that point are confusing. My CPU only has 4 cores (8 if you count hyperthreading), but even if I grant that hyperthreading gives a marginal performance increase at 8 threads, bumping the count up to 16, 32, and 64 threads all yield additional performance gains. Why is this? How are the additional threads yielding additional performance gains, when I've long since maxed out the number of threads the CPU can physically run concurrently?

Note: this is not the same as this linked question, as I'm dealing with a specific use case and code, whereas the linked question is dealing in generalities.

EDIT:

I made an adjustment to the code so that when threads are created, their priority is set to the highest priority Windows allows. Results, broadly speaking, seem to be the same.

                Fill Data
Start:          18,950,798,175,057ns
End:            18,955,085,850,007ns
Duration:            4,287,674,950ns

                STD Sum: 7750000000
Start:          18,955,086,975,013ns
End:            18,967,581,061,562ns
Duration:           12,494,086,549ns

             Single Sum: 7750000000
Start:          18,967,581,136,724ns
End:            18,980,127,355,698ns
Duration:           12,546,218,974ns

   Two-Thread-Smart Sum: 7750000000
Start:          18,980,127,426,332ns
End:            18,988,619,042,214ns
Duration:            8,491,615,882ns

  Four-Thread-Smart Sum: 7750000000
Start:          18,988,619,135,487ns
End:            18,996,215,684,824ns
Duration:            7,596,549,337ns

 Eight-Thread-Smart Sum: 7750000000
Start:          18,996,215,763,004ns
End:            19,002,055,977,730ns
Duration:            5,840,214,726ns

Sixteen-Thread-Smart Sum: 7750000000
Start:          19,002,056,055,608ns
End:            19,006,282,772,254ns
Duration:            4,226,716,646ns

Thirty-Two-Thread-Smart Sum: 7750000000
Start:          19,006,282,840,774ns
End:            19,010,539,676,914ns
Duration:            4,256,836,140ns

Sixty-Four-Thread-Smart Sum: 7750000000
Start:          19,010,539,758,113ns
End:            19,014,450,311,829ns
Duration:            3,910,553,716ns

DOUBLE EDIT COMBO:

I edited the program to make sure the tsum variable was a local variable inside the lambda, rather than a reference to a slot outside the lambda. The multithreaded code sped up quite significantly, but it still exhibits the same behavior.

                Fill Data
Start:          21,740,931,802,656ns
End:            21,745,429,501,480ns
Duration:            4,497,698,824ns

                STD Sum: 7750000000
Start:          21,745,430,637,655ns
End:            21,758,206,612,632ns
Duration:           12,775,974,977ns

             Single Sum: 7750000000
Start:          21,758,206,681,153ns
End:            21,771,260,233,797ns
Duration:           13,053,552,644ns

   Two-Thread-Smart Sum: 7750000000
Start:          21,771,260,287,828ns
End:            21,777,845,764,595ns
Duration:            6,585,476,767ns

  Four-Thread-Smart Sum: 7750000000
Start:          21,777,845,831,002ns
End:            21,784,011,543,159ns
Duration:            6,165,712,157ns

 Eight-Thread-Smart Sum: 7750000000
Start:          21,784,011,628,584ns
End:            21,788,846,061,014ns
Duration:            4,834,432,430ns

Sixteen-Thread-Smart Sum: 7750000000
Start:          21,788,846,127,422ns
End:            21,791,921,652,246ns
Duration:            3,075,524,824ns

Thirty-Two-Thread-Smart Sum: 7750000000
Start:          21,791,921,747,330ns
End:            21,794,980,832,033ns
Duration:            3,059,084,703ns

Sixty-Four-Thread-Smart Sum: 7750000000
Start:          21,794,980,901,761ns
End:            21,797,937,401,426ns
Duration:            2,956,499,665ns

TRIPLE EDIT WOMBO-COMBO:

I ran the same tests again, but in reverse. Results are quite similar:

                Fill Data
Start:          22,845,472,001,475ns
End:            22,850,746,219,215ns
Duration:            5,274,217,740ns

Sixty-Four-Thread-Smart Sum: 7750000000
Start:          22,850,747,328,223ns
End:            22,853,951,903,952ns
Duration:            3,204,575,729ns

Thirty-Two-Thread-Smart Sum: 7750000000
Start:          22,853,951,981,830ns
End:            22,857,239,237,507ns
Duration:            3,287,255,677ns

Sixteen-Thread-Smart Sum: 7750000000
Start:          22,857,239,307,839ns
End:            22,860,389,285,305ns
Duration:            3,149,977,466ns

 Eight-Thread-Smart Sum: 7750000000
Start:          22,860,389,353,524ns
End:            22,864,828,560,484ns
Duration:            4,439,206,960ns

  Four-Thread-Smart Sum: 7750000000
Start:          22,864,828,633,834ns
End:            22,870,640,982,096ns
Duration:            5,812,348,262ns

   Two-Thread-Smart Sum: 7750000000
Start:          22,870,641,056,956ns
End:            22,877,032,284,384ns
Duration:            6,391,227,428ns

             Single Sum: 7750000000
Start:          22,877,032,367,997ns
End:            22,889,501,695,960ns
Duration:           12,469,327,963ns

                STD Sum: 7750000000
Start:          22,889,501,770,820ns
End:            22,902,014,633,229ns
Duration:           12,512,862,409ns
Asked Oct 13 '16 by Xirema




2 Answers

All of your tsum writes are to the same array, whose elements occupy consecutive addresses in memory, so those writes cause cache contention (false sharing). Having more threads spreads the writes out across more cache lines, so the CPU cores spend less time invalidating and reloading each other's lines.

Try accumulating into a local tsum variable (not a reference into sums), then writing it to sums[i] once the loop is done.

Answered Sep 28 '22 by 1201ProgramAlarm


I suspect that you are seeing the speedup because your application's threads represent a larger percentage of the total active threads running on the system, giving you more time slices overall. You might want to see whether the numbers change for better or worse when you have additional programs running.

Answered Sep 28 '22 by Mikel F