I wrote a function that multiplies Eigen matrices of dimension 10x10 together. Then I wrote a naive multiply function CustomMultiply
which was surprisingly 2x faster than Eigen's implementation.
I tried a couple of different compilation flags like -O2 and -O3, which did not make a difference.
#include <Eigen/Core>
constexpr int dimension = 10;
using Matrix = Eigen::Matrix<double, dimension, dimension>;
Matrix CustomMultiply(const Matrix& a, const Matrix& b) {
Matrix result = Matrix::Zero();
for (int bcol_idx = 0; bcol_idx < dimension; ++bcol_idx) {
for (int brow_idx = 0; brow_idx < dimension; ++brow_idx) {
result.col(bcol_idx).noalias() += a.col(brow_idx) * b(brow_idx, bcol_idx);
}
}
return result;
}
Matrix PairwiseMultiplyEachMatrixNoAlias(int num_repetitions, const std::vector<Matrix>& input) {
Matrix acc = Matrix::Zero();
for (int i = 0; i < num_repetitions; ++i) {
for (const auto& matrix_a : input) {
for (const auto& matrix_b : input) {
acc.noalias() += matrix_a * matrix_b;
}
}
}
return acc;
}
Matrix PairwiseMultiplyEachMatrixCustom(int num_repetitions, const std::vector<Matrix>& input) {
Matrix acc = Matrix::Zero();
for (int i = 0; i < num_repetitions; ++i) {
for (const auto& matrix_a : input) {
for (const auto& matrix_b : input) {
acc.noalias() += CustomMultiply(matrix_a, matrix_b);
}
}
}
return acc;
}
PairwiseMultiplyEachMatrixNoAlias
is 2x slower on PairwiseMultiplyEachMatrixCustom
on my machine when I pass in 100 random matrices as input
and use 100 as num_repetitions
.
My machine details: Intel Xeon CPU E5-2630 v4, Ubuntu 16.04, Eigen 3
Updates: Results are unchanged after the following modifications after helpful discussion in the comments
num_repetitions = 1
and input.size() = 1000
.lazyProduct()
and using .eval()
actually leads to further
slowdown -march=native -DNDEBUG
Updates 2:
Following up on @dtell's findings with Google Benchmark library, I found an interesting result. Multiplying 2 matrices with Eigen is faster than custom, but multiplying many matrices with Eigen is 2x slower, in line with the previous findings.
Here is my Google Benchmark code. (Note: There was an off-by one in the GenerateRandomMatrices()
function below which is now fixed.)
#include <Eigen/Core>
#include <Eigen/StdVector>
#include <benchmark/benchmark.h>
constexpr int dimension = 10;
constexpr int num_random_matrices = 10;
using Matrix = Eigen::Matrix<double, dimension, dimension>;
using Eigen_std_vector = std::vector<Matrix,Eigen::aligned_allocator<Matrix>>;
Eigen_std_vector GetRandomMatrices(int num_matrices) {
Eigen_std_vector matrices;
for (int i = 0; i < num_matrices; ++i) {
matrices.push_back(Matrix::Random());
}
return matrices;
}
Matrix CustomMultiply(const Matrix& a, const Matrix& b) {
Matrix result = Matrix::Zero();
for (int bcol_idx = 0; bcol_idx < dimension; ++bcol_idx) {
for (int brow_idx = 0; brow_idx < dimension; ++brow_idx) {
result.col(bcol_idx).noalias() += a.col(brow_idx) * b(brow_idx, bcol_idx);
}
}
return result;
}
Matrix PairwiseMultiplyEachMatrixNoAlias(int num_repetitions, const Eigen_std_vector& input) {
Matrix acc = Matrix::Zero();
for (int i = 0; i < num_repetitions; ++i) {
for (const auto& matrix_a : input) {
for (const auto& matrix_b : input) {
acc.noalias() += matrix_a * matrix_b;
}
}
}
return acc;
}
Matrix PairwiseMultiplyEachMatrixCustom(int num_repetitions, const Eigen_std_vector& input) {
Matrix acc = Matrix::Zero();
for (int i = 0; i < num_repetitions; ++i) {
for (const auto& matrix_a : input) {
for (const auto& matrix_b : input) {
acc.noalias() += CustomMultiply(matrix_a, matrix_b);
}
}
}
return acc;
}
void BM_PairwiseMultiplyEachMatrixNoAlias(benchmark::State& state) {
// Perform setup here
const auto random_matrices = GetRandomMatrices(num_random_matrices);
for (auto _ : state) {
benchmark::DoNotOptimize(PairwiseMultiplyEachMatrixNoAlias(1, random_matrices));
}
}
BENCHMARK(BM_PairwiseMultiplyEachMatrixNoAlias);
void BM_PairwiseMultiplyEachMatrixCustom(benchmark::State& state) {
// Perform setup here
const auto random_matrices = GetRandomMatrices(num_random_matrices);
for (auto _ : state) {
benchmark::DoNotOptimize(PairwiseMultiplyEachMatrixCustom(1, random_matrices));
}
}
BENCHMARK(BM_PairwiseMultiplyEachMatrixCustom);
void BM_MultiplySingle(benchmark::State& state) {
// Perform setup here
const auto random_matrices = GetRandomMatrices(2);
for (auto _ : state) {
benchmark::DoNotOptimize((random_matrices[0] * random_matrices[1]).eval());
}
}
BENCHMARK(BM_MultiplySingle);
void BM_MultiplySingleCustom(benchmark::State& state) {
// Perform setup here
const auto random_matrices = GetRandomMatrices(2);
for (auto _ : state) {
benchmark::DoNotOptimize(CustomMultiply(random_matrices[0], random_matrices[1]));
}
}
BENCHMARK(BM_MultiplySingleCustom);
double TestCustom() {
const Matrix a = Matrix::Random();
const Matrix b = Matrix::Random();
const Matrix c = a * b;
const Matrix custom_c = CustomMultiply(a, b);
const double err = (c - custom_c).squaredNorm();
return err;
}
// Just sanity check the multiplication
void BM_TestCustom(benchmark::State& state) {
if (TestCustom() > 1e-10) {
exit(-1);
}
}
BENCHMARK(BM_TestCustom);
This yields the following mysterious report
Run on (20 X 3100 MHz CPU s)
CPU Caches:
L1 Data 32K (x10)
L1 Instruction 32K (x10)
L2 Unified 256K (x10)
L3 Unified 25600K (x1)
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
----------------------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------------------
BM_PairwiseMultiplyEachMatrixNoAlias 28283 ns 28285 ns 20250
BM_PairwiseMultiplyEachMatrixCustom 14442 ns 14443 ns 48488
BM_MultiplySingle 791 ns 791 ns 876969
BM_MultiplySingleCustom 874 ns 874 ns 802052
BM_TestCustom 0 ns 0 ns 0
My current hypothesis is that the slowdown is attributable to instruction cache misses. It's possible Eigen's matrix multiply function does bad things to the instruction cache.
VTune output for custom:
VTune output for Eigen:
Maybe someone with more experience with VTune can tell me if I am interpreting this result correctly. The DSB is the decoded instruction cache and MITE has something to do with instruction decoder bandwidth. The Eigen version shows that most instructions are missing the DSB (66% miss rate) and a marked increase in stalling due to MITE bandwidth.
Update 3: After getting reports that the single version of custom was faster, I also reproduced it on my machine. This goes against @dtell's original findings on their machine.
CPU Caches:
L1 Data 32K (x10)
L1 Instruction 32K (x10)
L2 Unified 256K (x10)
L3 Unified 25600K (x1)
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
----------------------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------------------
BM_PairwiseMultiplyEachMatrixNoAlias 34787 ns 34789 ns 16477
BM_PairwiseMultiplyEachMatrixCustom 17901 ns 17902 ns 37759
BM_MultiplySingle 349 ns 349 ns 2054295
BM_MultiplySingleCustom 178 ns 178 ns 4624183
BM_TestCustom 0 ns 0 ns 0
I wonder if in my previous benchmark result I had left out an optimization flag. In any case, I think the issue is confirmed that Eigen incurs an overhead when multiplying small matrices. If anyone out there has a machine that does not use a uop cache, I would be interested in seeing if the slowdown is less severe.
(gdb) bt
#0 0x00005555555679e3 in Eigen::internal::gemm_pack_rhs<double, long, Eigen::internal::const_blas_data_mapper<double, long, 0>, 4, 0, false, false>::operator()(double*, Eigen::internal::const_blas_data_mapper<double, long, 0> const&, long, long, long, long) ()
#1 0x0000555555566654 in Eigen::internal::general_matrix_matrix_product<long, double, 0, false, double, 0, false, 0>::run(long, long, long, double const*, long, double const*, long, double*, long, double, Eigen::internal::level3_blocking<double, double>&, Eigen::internal::GemmParallelInfo<long>*) ()
#2 0x0000555555565822 in BM_PairwiseMultiplyEachMatrixNoAlias(benchmark::State&) ()
#3 0x000055555556d571 in benchmark::internal::(anonymous namespace)::RunInThread(benchmark::internal::Benchmark::Instance const*, unsigned long, int, benchmark::internal::ThreadManager*) ()
#4 0x000055555556b469 in benchmark::RunSpecifiedBenchmarks(benchmark::BenchmarkReporter*, benchmark::BenchmarkReporter*) ()
#5 0x000055555556a450 in main ()
From stack trace, eigen's matrix multiplication is using a generic multiply method and loop through a dynamic matrix size. For custom implementation, clang aggressively vectorize it and unroll loop, so there's much less branching.
Maybe there's some flag/option for eigen to generate code for this particular size to optimize.
However, if the matrix size is bigger, the Eigen version will perform much better than custom.
I've rewritten your code using a proper benchmark library, namely Google Benchmark and cannot reproduce your measurements.
My results for -O0
where the second template parameter is the matrix dimension:
Running ./benchmark
Run on (12 X 2900 MHz CPU s)
CPU Caches:
L1 Data 32K (x6)
L1 Instruction 32K (x6)
L2 Unified 262K (x6)
L3 Unified 12582K (x1)
---------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------
BM_CustomMultiply<double, 3> 5391 ns 5389 ns 105066
BM_CustomMultiply<double, 4> 9365 ns 9364 ns 73649
BM_CustomMultiply<double, 5> 15349 ns 15349 ns 44008
BM_CustomMultiply<double, 6> 20953 ns 20947 ns 32230
BM_CustomMultiply<double, 7> 33328 ns 33318 ns 21584
BM_CustomMultiply<double, 8> 44237 ns 44230 ns 15500
BM_CustomMultiply<double, 9> 57142 ns 57140 ns 11953
BM_CustomMultiply<double, 10> 69382 ns 69382 ns 9998
BM_EigenMultiply<double, 3> 2335 ns 2335 ns 295458
BM_EigenMultiply<double, 4> 1613 ns 1613 ns 457382
BM_EigenMultiply<double, 5> 4791 ns 4791 ns 142992
BM_EigenMultiply<double, 6> 3471 ns 3469 ns 206002
BM_EigenMultiply<double, 7> 9052 ns 9051 ns 78135
BM_EigenMultiply<double, 8> 8655 ns 8655 ns 81717
BM_EigenMultiply<double, 9> 11446 ns 11399 ns 67001
BM_EigenMultiply<double, 10> 15092 ns 15053 ns 46924
As you can see the number of iterations Google Benchmark uses is order of magnitudes higher that your benchmark. Micro-benchmarking is extremely hard especially when you deal with execution times of a few hundred nanoseconds.
To be fair, calling your custom function involves a copy and manually inlining it gives a few nanoseconds, but still not beating Eigen.
Measurement with manually inlined CustomMultiply
and -O2 -DNDEBUG -march=native
:
Running ./benchmark
Run on (12 X 2900 MHz CPU s)
CPU Caches:
L1 Data 32K (x6)
L1 Instruction 32K (x6)
L2 Unified 262K (x6)
L3 Unified 12582K (x1)
---------------------------------------------------------------------
Benchmark Time CPU Iterations
---------------------------------------------------------------------
BM_CustomMultiply<double, 3> 51 ns 51 ns 11108114
BM_CustomMultiply<double, 4> 88 ns 88 ns 7683611
BM_CustomMultiply<double, 5> 147 ns 147 ns 4642341
BM_CustomMultiply<double, 6> 213 ns 213 ns 3205627
BM_CustomMultiply<double, 7> 308 ns 308 ns 2246391
BM_CustomMultiply<double, 8> 365 ns 365 ns 1904860
BM_CustomMultiply<double, 9> 556 ns 556 ns 1254953
BM_CustomMultiply<double, 10> 661 ns 661 ns 1027825
BM_EigenMultiply<double, 3> 39 ns 39 ns 17918807
BM_EigenMultiply<double, 4> 69 ns 69 ns 9931755
BM_EigenMultiply<double, 5> 119 ns 119 ns 5801185
BM_EigenMultiply<double, 6> 178 ns 178 ns 3838772
BM_EigenMultiply<double, 7> 256 ns 256 ns 2692898
BM_EigenMultiply<double, 8> 385 ns 385 ns 1826598
BM_EigenMultiply<double, 9> 546 ns 546 ns 1271687
BM_EigenMultiply<double, 10> 644 ns 644 ns 1104798
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With