For practice with my C++ / Rcpp programming, I took a shot at implementing a (sample) standard deviation function:
#include <Rcpp.h>
#include <vector>
#include <cmath>
#include <numeric>
// Sample standard deviation: sqrt( sum((x - mean)^2) / (n - 1) ).
//
// Improvements over the original: no std::vector copy of the input and
// no destructive in-place squaring pass -- the squared deviations are
// accumulated directly from the R vector in a single loop after the
// mean pass.
// [[Rcpp::export]]
double cppSD(Rcpp::NumericVector rinVec)
{
    const int n = rinVec.size();
    const double total = std::accumulate(rinVec.begin(), rinVec.end(), 0.0);
    const double mean = total / n;

    // Second pass: sum of squared deviations from the mean.
    double ssq = 0.0;
    for (Rcpp::NumericVector::iterator it = rinVec.begin();
         it != rinVec.end(); ++it) {
        const double d = *it - mean;
        ssq += d * d;
    }

    // NOTE: n < 2 yields 0/0 (NaN) exactly as the original did
    // (R's sd() likewise returns NA for length-1 input).
    return std::sqrt(ssq / (n - 1));
}
I also decided to test out the stddev
function from the Armadillo library, considering that it can be called on a vector:
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
using namespace Rcpp;
// [[Rcpp::export]]
double armaSD(arma::colvec inVec)
{
return arma::stddev(inVec);
}
Then I benchmarked these two functions against the base R function sd()
for a few vectors of varying size:
Rcpp::sourceCpp('G:/CPP/armaSD.cpp')
Rcpp::sourceCpp('G:/CPP/cppSD.cpp')
require(microbenchmark)
##
## sample size = 1,000: armaSD() < cppSD() < sd()
X <- rexp(1000)
microbenchmark(armaSD(X),sd(X), cppSD(X))
#Unit: microseconds
# expr min lq median uq max neval
# armaSD(X) 4.181 4.562 4.942 5.322 12.924 100
# sd(X) 17.865 19.766 20.526 21.287 86.285 100
# cppSD(X) 4.561 4.941 5.321 5.701 29.269 100
##
## sample size = 10,000: armaSD() < cppSD() < sd()
X <- rexp(10000)
microbenchmark(armaSD(X),sd(X), cppSD(X))
#Unit: microseconds
# expr min lq median uq max neval
# armaSD(X) 24.707 25.847 26.4175 29.6490 52.455 100
# sd(X) 51.315 54.356 55.8760 61.1980 100.730 100
# cppSD(X) 26.608 28.128 28.8885 31.7395 114.413 100
##
## sample size = 25,000: armaSD() < cppSD() < sd()
X <- rexp(25000)
microbenchmark(armaSD(X),sd(X), cppSD(X))
#Unit: microseconds
# expr min lq median uq max neval
# armaSD(X) 66.900 67.6600 68.040 76.403 155.845 100
# sd(X) 108.332 111.5625 122.016 125.817 169.910 100
# cppSD(X) 70.320 71.0805 74.692 80.203 102.250 100
##
## sample size = 50,000: cppSD() < sd() < armaSD()
X <- rexp(50000)
microbenchmark(armaSD(X),sd(X), cppSD(X))
#Unit: microseconds
# expr min lq median uq max neval
# armaSD(X) 249.733 267.4085 297.8175 337.729 642.388 100
# sd(X) 203.740 229.3975 240.2300 260.186 303.709 100
# cppSD(X) 162.308 185.1140 239.6600 256.575 290.405 100
##
## sample size = 75,000: sd() < cppSD() < armaSD()
X <- rexp(75000)
microbenchmark(armaSD(X),sd(X), cppSD(X))
#Unit: microseconds
# expr min lq median uq max neval
# armaSD(X) 445.110 479.8900 502.5070 520.5625 642.388 100
# sd(X) 310.931 334.8780 354.0735 379.7310 429.146 100
# cppSD(X) 346.661 380.8715 400.6370 424.0140 501.747 100
I was not terribly surprised at the fact that my C++ function cppSD()
was faster than stats::sd()
for smaller samples, but slower for larger sized vectors since stats::sd()
is vectorized. However, I did not expect to see such a performance degradation from the arma::stddev()
function since it appears to also be operating in a vectorized manner. Is there a problem with the way that I'm using arma::stddev()
, or is it simply that stats::sd()
was written (in C, I'm assuming) in such a way that it can handle larger vectors much more efficiently? Any input would be appreciated.
Update:
Although my question was originally about the correct use of arma::stddev
, and not so much about trying find the most efficient way possible to calculate the sample standard deviation, it is very interesting to see that the Rcpp::sd
sugar function performs so well. To make things a little more interesting, I benchmarked the arma::stddev
and Rcpp::sd
functions below against an RcppParallel
version that I adapted from two of JJ Allaire's Rcpp Gallery posts - here and here:
library(microbenchmark)
set.seed(123)
x <- rnorm(5.5e06)
##
Res <- microbenchmark(
armaSD(x),
par_sd(x),
sd_sugar(x),
times=500L,
control=list(warmup=25))
##
R> print(Res)
Unit: milliseconds
expr min lq mean median uq max neval
armaSD(x) 24.486943 24.960966 26.994684 25.255584 25.874139 123.55804 500
par_sd(x) 8.130751 8.322682 9.136323 8.429887 8.624072 22.77712 500
sd_sugar(x) 13.713366 13.984638 14.628911 14.156142 14.401138 32.81684 500
This was on my laptop running 64-bit linux with a i5-4200U CPU @ 1.60GHz processor; but I'm guessing the difference between par_sd
and sd_sugar
would be less substantial on a Windows machine.
And the code for the RcppParallel
version (which is considerably longer, and requires a C++11 compatible compiler for the lambda expression used in overloaded operator()
of the InnerProduct
functor):
#include <Rcpp.h>
#include <RcppParallel.h>
// [[Rcpp::depends(RcppParallel)]]
// [[Rcpp::plugins(cpp11)]]
/*
* based on: http://gallery.rcpp.org/articles/parallel-vector-sum/
*/
// Parallel reduction worker: sums the elements of the wrapped vector.
// Used by par_sd() to obtain the total needed for the mean.
struct Sum : public RcppParallel::Worker {
    // Read-only, thread-safe view of the input vector.
    const RcppParallel::RVector<double> input;
    // Partial sum accumulated by this worker instance.
    double value;

    // Main constructor: wrap the R vector and start the tally at zero.
    Sum(const Rcpp::NumericVector input)
        : input(input), value(0) {}

    // Split constructor: a forked worker shares the view of the data
    // but accumulates into its own zero-initialized tally.
    Sum(const Sum& sum, RcppParallel::Split)
        : input(sum.input), value(0) {}

    // Add up input[begin, end) into this worker's running total.
    void operator()(std::size_t begin, std::size_t end) {
        double partial = 0.0;
        for (std::size_t i = begin; i < end; ++i) {
            partial += input[i];
        }
        value += partial;
    }

    // Merge a sibling worker's partial sum into this one.
    void join(const Sum& rhs) {
        value += rhs.value;
    }
};
/*
* based on: http://gallery.rcpp.org/articles/parallel-inner-product/
*/
// Parallel reduction worker: accumulates sum((x[i]-mean)*(y[i]-mean))
// over a range, i.e. the inner product of the two mean-centered vectors.
// par_sd() calls it with x and y referencing the same data, so the
// result is the sum of squared deviations.
struct InnerProduct : public RcppParallel::Worker {
// Read-only, thread-safe views of the two input vectors.
const RcppParallel::RVector<double> x;
const RcppParallel::RVector<double> y;
// Mean to subtract from every element (computed in a prior pass).
double mean;
// Partial result accumulated by this worker instance.
double product;
// Main constructor: wrap both R vectors and record the precomputed mean.
InnerProduct(const Rcpp::NumericVector x,
const Rcpp::NumericVector y,
const double mean)
: x(x), y(y), mean(mean), product(0) {}
// Split constructor: forked workers share the views and the mean but
// each starts with its own zero partial product.
InnerProduct(const InnerProduct& innerProduct,
RcppParallel::Split)
: x(innerProduct.x), y(innerProduct.y),
mean(innerProduct.mean), product(0) {}
// Accumulate over [begin, end): std::inner_product with addition as
// the outer operation and a C++11 lambda that multiplies the two
// mean-centered elements as the inner operation.
void operator()(std::size_t begin, std::size_t end) {
product += std::inner_product(x.begin() + begin,
x.begin() + end,
y.begin() + begin,
0.0, std::plus<double>(),
[&](double lhs, double rhs)->double {
return ( (lhs-mean)*(rhs-mean) );
});
}
// Merge a sibling worker's partial product into this one.
void join(const InnerProduct& rhs) {
product += rhs.product;
}
};
// Parallel sample standard deviation:
//   pass 1: parallel reduction of the element sum  -> mean
//   pass 2: parallel reduction of sum((x - mean)^2)
//   result: sqrt( ssq / (N - 1) )  (Bessel's correction)
//
// FIX: the intermediate Rcpp::NumericVector y_(x_) was removed -- an
// Rcpp vector copy is shallow (same underlying data), so the same
// vector is simply supplied for both InnerProduct arguments.
// [[Rcpp::export]]
double par_sd(const Rcpp::NumericVector& x_)
{
    const int N = x_.size();

    // First pass: total of all elements, reduced across workers.
    Sum sum(x_);
    RcppParallel::parallelReduce(0, N, sum);
    const double mean = sum.value / N;

    // Second pass: inner product of the centered vector with itself.
    InnerProduct innerProduct(x_, x_, mean);
    RcppParallel::parallelReduce(0, N, innerProduct);

    return std::sqrt( innerProduct.product / (N - 1) );
}
You made a subtle mistake in how you instantiate the Armadillo object -- which leads to copies and hence degraded performance.
Use an interface of const arma::colvec & invec
instead, and all is good:
R> sourceCpp("/tmp/sd.cpp")
R> library(microbenchmark)
R> X <- rexp(500)
R> microbenchmark(armaSD(X), armaSD2(X), sd(X), cppSD(X))
Unit: microseconds
expr min lq median uq max neval
armaSD(X) 3.745 4.0280 4.2055 4.5510 19.375 100
armaSD2(X) 3.305 3.4925 3.6400 3.9525 5.154 100
sd(X) 22.463 23.6985 25.1525 26.0055 52.457 100
cppSD(X) 3.640 3.9495 4.2030 4.8620 13.609 100
R> X <- rexp(5000)
R> microbenchmark(armaSD(X), armaSD2(X), sd(X), cppSD(X))
Unit: microseconds
expr min lq median uq max neval
armaSD(X) 18.627 18.9120 19.3245 20.2150 34.684 100
armaSD2(X) 14.583 14.9020 15.1675 15.5775 22.527 100
sd(X) 54.507 58.8315 59.8615 60.4250 84.857 100
cppSD(X) 18.585 19.0290 19.3970 20.5160 22.174 100
R> X <- rexp(50000)
R> microbenchmark(armaSD(X), armaSD2(X), sd(X), cppSD(X))
Unit: microseconds
expr min lq median uq max neval
armaSD(X) 186.307 187.180 188.575 191.825 405.775 100
armaSD2(X) 142.447 142.793 143.207 144.233 155.770 100
sd(X) 382.857 384.704 385.223 386.075 405.713 100
cppSD(X) 181.601 181.895 182.279 183.350 194.588 100
R>
which is based on my version of your code where everything is one file and armaSD2
is defined as I suggested -- leading to the winning performance.
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
#include <vector>
#include <cmath>
#include <numeric>
// Sample standard deviation: sqrt( sum((x - mean)^2) / (n - 1) ).
//
// Improvements over the original: no std::vector copy of the input and
// no destructive in-place squaring pass -- the squared deviations are
// accumulated directly from the R vector in a single loop after the
// mean pass.
// [[Rcpp::export]]
double cppSD(Rcpp::NumericVector rinVec) {
    const int n = rinVec.size();
    const double total = std::accumulate(rinVec.begin(), rinVec.end(), 0.0);
    const double mean = total / n;

    // Second pass: sum of squared deviations from the mean.
    double ssq = 0.0;
    for (Rcpp::NumericVector::iterator it = rinVec.begin();
         it != rinVec.end(); ++it) {
        const double d = *it - mean;
        ssq += d * d;
    }

    // NOTE: n < 2 yields 0/0 (NaN) exactly as the original did
    // (R's sd() likewise returns NA for length-1 input).
    return std::sqrt(ssq / (n - 1));
}
// NOTE: deliberately takes arma::colvec BY VALUE -- the exported wrapper
// must deep-copy the R data into a fresh Armadillo object on every call,
// which is the performance problem demonstrated in the benchmarks.
// Kept as-is for comparison with the fixed armaSD2 below.
// [[Rcpp::export]]
double armaSD(arma::colvec inVec) {
return arma::stddev(inVec);
}
// Const-reference interface: no deep copy of the input data is made,
// which is why this version wins the benchmark against armaSD.
// [[Rcpp::export]]
double armaSD2(const arma::colvec & inVec) {
    return arma::stddev(inVec);
}
/*** R
library(microbenchmark)
X <- rexp(500)
microbenchmark(armaSD(X), armaSD2(X), sd(X), cppSD(X))
X <- rexp(5000)
microbenchmark(armaSD(X), armaSD2(X), sd(X), cppSD(X))
X <- rexp(50000)
microbenchmark(armaSD(X), armaSD2(X), sd(X), cppSD(X))
*/
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With