I am trying to vectorize the following function to drop the sapply loop. I am calculating cumulative skewness.
cskewness <- function(.x) {
  # Cumulative sample skewness: element k is the skewness of .x[1:k],
  # defined as sqrt(k) * m3 / m2^(3/2) with m2/m3 the 2nd/3rd central sums.
  # First element is NaN (skewness of one point is 0/0).
  skewness <- function(.x) {
    sqrt(length(.x)) * sum((.x - mean(.x))^3) / (sum((.x - mean(.x))^2)^(3 / 2))
  }
  # vapply() instead of sapply(): guarantees a numeric(1) per prefix, so the
  # return type is stable (numeric(0), not list(), for empty input).
  vapply(seq_along(.x), function(k) skewness(.x[1:k]), numeric(1))
}
I'm not getting my algebra right. I have this attempt, which produces wrong results:
# BROKEN attempt at a vectorized cumulative skewness -- kept to show the bugs.
skewness2 <- function(.x) {
n <- length(.x)
csum <- cumsum(.x)
# Running mean of each prefix x[1:k].
cmu <- csum / 1:length(.x)
# BUG: the cube must be applied *before* cumsum, i.e. cumsum((.x - cmu)^3);
# here the exponent is applied to the cumulative sum instead.
# Also, .x - cmu centers each element only on its own prefix mean, whereas
# the k-th skewness needs *all* of x[1:k] centered on the k-th mean.
num <- cumsum(.x - cmu)^3
den <- cumsum((.x - cmu)^2)^(3/2)
# BUG: sqrt(n) uses the total length; the k-th term needs sqrt(k).
sqrt(n) * num / den
}
Correct code produces:
x <- c(1,2,4,5,8)
> cskewness(x)
[1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
> skewness2(x)
[1] NaN 1.000000 1.930591 3.882748 4.928973
Thanks to jblood94's effort in capturing the potential hazard caused by significant outliers, I had the opportunity to rethink and optimize the code.
We can use scale to pre-process the input vector so that the detrimental impact of outliers is minimized; since skewness is invariant under linear transformations, the result is unaffected.
cskewness_tic <- function(y) {
  # Standardize first: a linear transform leaves skewness unchanged, but it
  # keeps the raw moments small and so avoids catastrophic cancellation when
  # the data contain a large offset or outliers.
  y <- scale(y)
  # Prefix sizes 1..n.
  idx <- seq_along(y)
  # Running raw-moment sums sum(y), sum(y^2), sum(y^3) over each prefix.
  s3 <- cumsum(y^3)
  s2 <- cumsum(y^2)
  s1 <- cumsum(y)
  mu <- s1 / idx
  # Central moments expanded in terms of raw moments (binomial expansion).
  numerator <- (s3 - 3 * mu * s2 + 3 * mu^2 * s1 - idx * mu^3) / idx
  denominator <- sqrt((s2 + idx * mu^2 - 2 * mu * s1) / idx)^3
  # The single-point prefix is 0/0; report it explicitly as NaN.
  c(NaN, (numerator / denominator)[-1])
}
With list of functions for benchmarking (see details of cumskewness and cumskewnessCpp from @jblood94's answer)
set.seed(0)
# A huge offset (1e10) with small spread stresses catastrophic cancellation
# in any raw-moment-based formula.
x <- sample(1e6) + 1e10
# Relative timings; check = "equal" also verifies all three implementations
# agree numerically on this hard input.
microbenchmark(
cskewness_tic = cskewness_tic(x),
cumskewness = cumskewness(x),
cumskewnessCpp = cumskewnessCpp(x),
check = "equal",
unit = "relative",
times = 50L
)
shows
Unit: relative
expr min lq mean median uq max neval
cskewness_tic 3.144719 3.268279 3.461112 3.314326 3.846614 2.395820 50
cumskewness 4.647973 4.627621 4.455523 4.585634 4.448119 2.687571 50
cumskewnessCpp 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 50
First of all, non-vectorized approaches are not as bad as you might think, so *apply is also a good option if it fits your ultimate objective and simplifies your work. There is no need to face loops with fear.
If you want to benefit from vectorization (e.g., speed), you can expand the mathematical expression of skewness in terms of raw moments (see standard references on moment expansions), and then apply cumsum to each decoupled term:
cskewness_tic <- function(y) {
  # Vectorized cumulative skewness: element k is the skewness of y[1:k],
  # computed from cumulative raw moments rather than per-prefix loops.
  n_k <- seq_along(y)
  # Running raw-moment sums of orders 3, 2, 1.
  s3 <- cumsum(y^3)
  s2 <- cumsum(y^2)
  s1 <- cumsum(y)
  mean_k <- s1 / n_k
  # Central moments via binomial expansion:
  #   m3 = E[y^3] - 3*mu*E[y^2] + 3*mu^2*E[y] - mu^3
  #   m2 = E[y^2] - 2*mu*E[y] + mu^2
  num <- (s3 - 3 * mean_k * s2 + 3 * mean_k^2 * s1 - n_k * mean_k^3) / n_k
  den <- sqrt((s2 + n_k * mean_k^2 - 2 * mean_k * s1) / n_k)^3
  # First element is 0/0 = NaN by construction.
  num / den
}
such that
> cskewness_tic(x)
[1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
List of approaches
cskewness_origin <- function(.x) {
  # Reference (naive) implementation: recompute the full skewness of each
  # prefix x[1:k]. O(n^2) work, but trivially correct.
  skewness <- function(.x) {
    sqrt(length(.x)) * sum((.x - mean(.x))^3) / (sum((.x - mean(.x))^2)^(3 / 2))
  }
  # vapply() instead of sapply(): type-stable numeric result per prefix,
  # even for zero-length input.
  vapply(seq_along(.x), function(k) skewness(.x[1:k]), numeric(1))
}
cskewness_jblood94 <- function(.x) {
  # Cumulative skewness via an n x n deviation matrix: column k holds
  # x - mean(x[1:k]); the lower triangle is zeroed so each column sum only
  # covers the first k elements. O(n^2) memory -- fine for small n.
  # seq_along() instead of 1:length(.x): safe for zero-length input.
  k <- seq_along(.x)
  d <- outer(.x, cumsum(.x) / k, "-")
  d[lower.tri(d)] <- 0
  sqrt(k) * colSums(d^3) / colSums(d^2)^(3 / 2)
}
cskewness_tic <- function(y) {
  # Fully vectorized cumulative skewness from cumulative raw moments.
  sizes <- seq_along(y)
  raw3 <- cumsum(y^3)
  raw2 <- cumsum(y^2)
  raw1 <- cumsum(y)
  mu <- raw1 / sizes
  # Third and second central moments per prefix (binomial expansion of the
  # centered power sums in terms of the raw power sums).
  cm3 <- (raw3 - 3 * mu * raw2 + 3 * mu^2 * raw1 - sizes * mu^3) / sizes
  cm2 <- (raw2 + sizes * mu^2 - 2 * mu * raw1) / sizes
  # skewness_k = m3 / m2^(3/2); the k = 1 prefix yields 0/0 = NaN.
  cm3 / sqrt(cm2)^3
}
by running the following benchmarking
set.seed(0)
# Small, well-conditioned input: measures raw throughput of each approach.
x <- sample(1e3)
# unit = "relative" reports times relative to the fastest entry;
# check = "equivalent" verifies the three results agree.
microbenchmark(
origin = cskewness_origin(x),
jblood94 = cskewness_jblood94(x),
tic = cskewness_tic(x),
unit = "relative",
check = "equivalent",
times = 50L
)
we see
Unit: relative
expr min lq mean median uq max neval
origin 252.7504 247.1442 253.5965 254.8663 257.2905 239.0963 50
jblood94 302.8267 309.0854 326.7546 318.7141 310.2365 388.3786 50
tic 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 50
Using a single-pass for loop will be efficient:
cumskewness <- function(x) {
  # Single-pass cumulative skewness using Welford-style streaming updates of
  # the running mean (m1) and 2nd/3rd central sums (m2, m3). Numerically
  # stable: deviations are taken from the running mean, so a large constant
  # offset in x does not cause catastrophic cancellation.
  # out[k] = sqrt(k) * m3_k / m2_k^(3/2), matching the naive definition.
  n <- length(x)
  if (n == 0L) {
    return(x)
  } else if (n == 1L) {
    # Skewness of a single point is 0/0; return NaN for consistency with
    # out[1] below and with the reference implementation (was 0).
    return(NaN)
  }
  m2 <- m3 <- 0
  out <- numeric(n)
  out[1] <- NaN
  m1 <- x[1]
  for (i in 2:n) {
    n0 <- i - 1                       # count before adding x[i]
    delta <- x[i] - m1
    delta_n <- delta / i
    m1 <- m1 + delta_n                # updated running mean
    term1 <- delta * delta_n * n0
    # m3 must be updated *before* m2: its formula uses the old m2.
    m3 <- m3 + term1 * delta_n * (n0 - 1) - 3 * delta_n * m2
    m2 <- m2 + term1
    out[i] <- sqrt(i) * m3 / m2^1.5
  }
  out
}
It is also straightforward to write up with Rcpp:
library(Rcpp)
cppFunction('
NumericVector cumskewnessCpp(const NumericVector& x) {
  // Single-pass cumulative skewness (streaming central-moment updates).
  const int n = x.size();
  if (n == 0) {
    return(x);
  } else if (n == 1) {
    // NB: "return(0)" would invoke the NumericVector *size* constructor and
    // yield a length-0 vector; build the length-1 NaN result explicitly
    // (skewness of a single point is 0/0).
    return NumericVector::create(R_NaN);
  }
  int n1;
  double m1 = x(0);
  // m2 and m3 must start at 0: they are read (via +=) before any assignment,
  // which was undefined behavior when left uninitialized.
  double m2 = 0, m3 = 0, delta, delta_n, term1;
  NumericVector out(n);
  out(0) = R_NaN;
  for (int i = 1; i < n; i++) {
    n1 = i + 1;                       // count after adding x(i)
    delta = x(i) - m1;
    delta_n = delta/n1;
    m1 += delta_n;                    // updated running mean
    term1 = delta*delta_n*i;
    // m3 uses the old m2, so it is updated first.
    m3 += term1*delta_n*(i - 1) - 3*delta_n*m2;
    m2 += term1;
    out(i) = sqrt(n1)*m3/pow(m2, 1.5);
  }
  return out;
}
')
# Both streaming versions reproduce the reference result; the -2.8e-17 in
# the Cpp output is ordinary floating-point noise around zero.
cumskewness(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
cumskewnessCpp(x)
#> [1] NaN 0.000000e+00 3.818018e-01 -2.808667e-17 4.082483e-01
Including the vectorized solution from @ThomisIsCoding:
cskewness_tic <- function(y) {
# cumulative length of y
k <- seq_along(y)
# cumulative n-th order raw-moment sums: sum(y^3), sum(y^2), sum(y) per prefix
m3 <- cumsum(y^3)
m2 <- cumsum(y^2)
m1 <- cumsum(y)
# running mean of each prefix
u <- m1 / k
# expand cubic terms and refactor skewness in terms of num/den;
# num = 3rd central moment, den = (2nd central moment)^(3/2).
# The k = 1 prefix yields 0/0 = NaN.
num <- (m3 - 3 * u * m2 + 3 * u^2 * m1 - k * u^3) / k
den <- sqrt((m2 + k * u^2 - 2 * u * m1) / k)^3
num / den
}
set.seed(0)
x <- sample(1e3)
# Relative timings of the three candidates; check = "equal" verifies they
# agree numerically on this well-conditioned input.
microbenchmark::microbenchmark(
cskewness_tic = cskewness_tic(x),
cumskewness = cumskewness(x),
cumskewnessCpp = cumskewnessCpp(x),
check = "equal",
unit = "relative"
)
#> Unit: relative
#> expr min lq mean median uq max neval
#> cskewness_tic 2.035272 2.07954 2.006118 2.388216 2.282022 0.7228815 100
#> cumskewness 3.930424 3.96835 4.003956 3.939762 3.711879 1.1339946 100
#> cumskewnessCpp 1.000000 1.00000 1.000000 1.000000 1.000000 1.0000000 100
Catastrophic cancellation can result in precision errors for higher-order moments in cskewness_tic. It occurs when the standard deviation is small relative to the mean. Demonstrating:
set.seed(0)
# Shift by 1e8: the sd is tiny relative to the mean, the worst case for the
# raw-moment (cumsum-based) formula.
x <- sample(1e3) + 1e8
y1 <- cskewness(x)
y2 <- cumskewness(x)
y3 <- cumskewnessCpp(x)
y4 <- cskewness_tic(x); y4[1] <- NaN
# The streaming (central-moment) versions stay accurate...
all.equal(y1, y2)
#> [1] TRUE
all.equal(y1, y3)
#> [1] TRUE
# ...while the vectorized raw-moment version loses all precision here.
all.equal(y1, y4)
#> [1] "Mean relative difference: 270.1872"
skewness2 <- function(.x) {
  # Cumulative skewness via an n x n deviation matrix: column k holds
  # x - mean(x[1:k]); the lower triangle is zeroed so each column sum only
  # covers the first k elements. O(n^2) memory.
  # seq_along() instead of 1:length(.x): safe for zero-length input.
  k <- seq_along(.x)
  d <- outer(.x, cumsum(.x) / k, "-")
  d[lower.tri(d)] <- 0
  sqrt(k) * colSums(d^3) / colSums(d^2)^(3 / 2)
}
# Verify the matrix-based version against the naive reference on the
# example input; outputs match element-for-element.
x <- c(1,2,4,5,8)
cskewness(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
skewness2(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With