community. I have this piece of code that finds closest pair of points in Euclidean 3D space. This question is neither about the algorithm nor its implementation or whatever. The problem is that it runs significantly slower when compiled with GCC rather than Clang. Most confusingly, it has comparable execution time on random samples and like 100 times slower on some specific one. I suspect there can be a bug in GCC as I cannot think of any other option.
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <cmath>
#include <vector>
#include <set>
#include <map>
#include <unordered_set>
#include <unordered_map>
#include <queue>
#include <ctime>
#include <fstream>
#include <cassert>
#include <complex>
#include <string>
#include <cstring>
#include <chrono>
#include <random>
#include <queue>
static std::mt19937 mmtw(std::chrono::steady_clock::now().time_since_epoch().count());
int64_t rng(int64_t x, int64_t y) {
static std::uniform_int_distribution<int64_t> d;
return d(mmtw) % (y - x + 1) + x;
}
constexpr static int MAXN = 1e5 + 10;
void solve(std::istream &in, std::ostream &out);
void generate(std::ostream &out) {
constexpr int N = 1e5;
out << N << '\n';
int MIN = -1e6;
int MAX = 1e6;
for (int i = 0; i < N; ++i) {
out << 0 << ' ';
out << i << ' ';
out << (i + 1) * int(1e4) << '\n';
}
}
int main() {
freopen("input.txt", "r", stdin);
std::ios_base::sync_with_stdio(false);
std::cin.tie(nullptr);
std::cout.tie(nullptr);
std::cerr.tie(nullptr);
std::ofstream fout("input.txt");
generate(fout);
fout.close();
solve(std::cin, std::cout);
return 0;
}
struct point_t {
int32_t x, y, z;
int id;
point_t() = default;
point_t(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z) {}
point_t operator +(const point_t &rhs) const {
return point_t(x + rhs.x, y + rhs.y, z + rhs.z);
}
point_t operator -(const point_t &rhs) const {
return point_t(x - rhs.x, y - rhs.y, z - rhs.z);
}
int64_t abs2() const {
return 1LL * x * x + 1LL * y * y + 1LL * z * z;
}
};
std::istream &operator >>(std::istream &in, point_t &pt) {
return in >> pt.x >> pt.y >> pt.z;
}
inline bool cmp_x(const point_t &lhs, const point_t &rhs) {
return lhs.x < rhs.x;
}
inline bool cmp_y(const point_t &lhs, const point_t &rhs) {
return lhs.y < rhs.y;
}
inline bool cmp_z(const point_t &lhs, const point_t &rhs) {
return lhs.z < rhs.z;
}
struct pair_t {
int64_t distance_sq;
point_t a {}, b {};
pair_t() : distance_sq(std::numeric_limits<int64_t>::max()) {};
pair_t(const point_t &a, const point_t &b) : distance_sq((a - b).abs2()), a(a), b(b) {}
bool operator<(const pair_t &rhs) const {
return distance_sq < rhs.distance_sq;
}
};
template <typename T> inline T sqr(T arg) { return arg * arg; }
point_t pts[MAXN];
static pair_t ans = pair_t();
void recur_2D(point_t pts[], int size, int64_t threshold_sq) {
if (size <= 3) {
for (int i = 0; i < size; ++i) {
for (int j = i + 1; j < size; ++j) {
ans = std::min(ans, pair_t(pts[i], pts[j]));
}
}
std::sort(pts, pts + size, cmp_y);
return;
}
int mid = size / 2;
int midx = pts[mid].x;
recur_2D(pts, mid, threshold_sq);
recur_2D(pts + mid, size - mid, threshold_sq);
static point_t buffer[MAXN];
std::merge(pts, pts + mid, pts + mid, pts + size, buffer, cmp_y);
std::copy(buffer, buffer + size, pts);
int buff_sz = 0;
for (int i = 0; i < size; ++i) {
if (sqr(pts[i].x - midx) >= threshold_sq) {
continue;
}
int64_t x_sqr = sqr(pts[i].x - midx);
for (int j = buff_sz - 1; j >= 0; --j) {
if (sqr(pts[i].y - buffer[j].y) + x_sqr >= threshold_sq) {
break;
}
ans = std::min(ans, pair_t(pts[i], buffer[j]));
}
buffer[buff_sz++] = pts[i];
}
}
void recur_3D(point_t pts[], int size) {
if (size <= 3) {
for (int i = 0; i < size; ++i) {
for (int j = i + 1; j < size; ++j) {
ans = std::min(ans, pair_t(pts[i], pts[j]));
}
}
std::sort(pts, pts + size, cmp_x);
return;
}
int mid = size / 2;
int midz = pts[mid].z;
recur_3D(pts, mid);
recur_3D(pts + mid, size - mid);
static point_t buffer[MAXN];
std::merge(pts, pts + mid, pts + mid, pts + size, buffer, cmp_x);
std::copy(buffer, buffer + size, pts);
int buff_sz = 0;
for (int i = 0; i < size; ++i) {
if (sqr(pts[i].z - midz) >= ans.distance_sq) {
continue;
}
buffer[buff_sz++] = pts[i];
}
recur_2D(buffer, buff_sz, ans.distance_sq);
}
void solve(std::istream &in, std::ostream &out) {
clock_t start = clock();
int num_of_points;
in >> num_of_points;
for (int i = 0; i < num_of_points; ++i) {
in >> pts[i];
pts[i].id = i + 1;
}
std::sort(pts, pts + num_of_points, cmp_z);
recur_3D(pts, num_of_points);
out << ans.distance_sq << '\n';
out << 1.0 * (clock() - start) / CLOCKS_PER_SEC << " s.\n";
}
Link to this code: https://code.re/2yfPzjkD
It generates the sample that makes the code very slow and then measures algorithm execution time.
I compile with
g++ -DLOCAL -std=c++1z -O3 -Wno-everything main.cpp
and with
clang++ -DLOCAL -std=c++1z -O3 -Wno-everything main.cpp
and run
./main
while having input.txt
in the same directory.
The Clang-copiled binary runs in 0.053798 s.
while the GCC one in 12.4276 s.
. These numbers are from program's output, you can see that function solve
.
I have also verified the difference on https://wandbox.org/ on different compiler versions. https://wandbox.org/permlink/YFEEWSKyos2dQf32 -- clang https://wandbox.org/permlink/XctarNHvd3I1B0x8 -- gcc
Note, I compressed input and thus had to change reading in solve
a bit.
On my local machine I have these compilers.
clang++ --version
clang version 7.0.0 (tags/RELEASE_700/final)
g++ --version
g++ (GCC) 8.2.1 20180831
It feels like I run GCC code without compiler optimizations. What can be the reason?
UPD.
Also, there is a version that calls std::sort
only once in the very beginning.
https://wandbox.org/permlink/i9Kd3GdewxSRwXsM
I have also tried to compile Clang with -stdlib=libstdc++
, shuffling data and think that different implementations of std::sort
are not cause.
This is simply undefined behavior. Your code has undefined behavior due to signed integer overflow at:
template <typename T> inline T sqr(T arg) { return arg * arg; }
You can replace that with:
template <typename T>
inline T sqr(T arg)
{
assert(double(arg)*arg <= std::numeric_limits<T>::max());
assert(double(arg)*arg >= std::numeric_limits<T>::min());
return arg * arg;
}
and catch the error in a debugger. It fails with arg=-60000
called from recur_3D
on the line:
if (sqr(pts[i].z - midz) >= ans.distance_sq) {
this happens with pts[i] = {x = 0, y = 0, z = 10000, id = 1}
and midz=70000
.
Because this is undefiend behavior, all bets are off. Different compiles utilize the assumption that "undefined behavior never happens" in different ways. This is why clang and gcc perform differently, and it is pure "luck".
Consider using UndefinedBehaviorSanitizer to catch these errors. I don't have it on my clang installation, but clang++ -fsanitize=signed-integer-overflow
should do the trick.
Fixing this function gives a comparable speed for both clang and gcc.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With