I am working on a research project where I want to determine equivalence of two distributions. I am currently using the Mann-Whitney Test for Equivalence and the code I am running (below) was provided with the book Testing Statistical Hypotheses of Equivalence and Noninferiority by Stefan Wellek (2010). Before running my data I am testing this code with random normal distributions which have the same mean and standard deviation. My problem is that there are three nested for loops and when running larger distributions sizes (as in the example below) the code takes forever to run. If I only had to run it once that would not be such a problem, but I am doing a simulation test and creating power curves so I need to run many iterations of this code (around 10,000). At the moment, depending on how I alter the distribution sizes, it takes days to run 10,000 iterations.
Any help in a way to increase the performance of this would be greatly appreciated.
x <- rnorm(n=125, m=3, sd=1)
y <- rnorm(n=500, m=3, sd=1)
alpha <- 0.05
m <- length(x)
n <- length(y)
eps1_ <- 0.2 #0.1382 default
eps2_ <- 0.2 #0.2602 default
eqctr <- 0.5 + (eps2_-eps1_)/2
eqleng <- eps1_ + eps2_
wxy <- 0
pihxxy <- 0
pihxyy <- 0
for (i in 1:m)
for (j in 1:n)
wxy <- wxy + trunc(0.5*(sign(x[i] - y[j]) + 1))
for (i in 1:m)
for (j1 in 1:(n-1))
for (j2 in (j1+1):n)
pihxyy <- pihxyy + trunc(0.5*(sign(x[i] - max(y[j1],y[j2])) + 1))
for (i1 in 1:(m-1))
for (i2 in (i1+1):m)
for (j in 1:n)
pihxxy <- pihxxy + trunc(0.5*(sign(min(x[i1],x[i2]) - y[j]) + 1))
wxy <- wxy / (m*n)
pihxxy <- pihxxy*2 / (m*(m-1)*n)
pihxyy <- pihxyy*2 / (n*(n-1)*m)
sigmah <- sqrt((wxy-(m+n-1)*wxy**2+(m-1)*pihxxy+(n-1)*pihxyy)/(m*n))
crit <- sqrt(qchisq(alpha,1,(eqleng/2/sigmah)**2))
if (abs((wxy-eqctr)/sigmah) >= crit) rej <- 1
if (abs((wxy-eqctr)/sigmah) < crit) rej <- 0
if (is.na(sigmah) || is.na(crit)) rej <- 1
MW_Decision <- rej
cat(" ALPHA =",alpha," M =",m," N =",n," EPS1_ =",eps1_," EPS2_ =",eps2_,
"\n","WXY =",wxy," SIGMAH =",sigmah," CRIT =",crit," REJ=",MW_Decision)
See edit below for an even better suggestion
One simple suggestion to get a bit of a speed boost is to byte compile your code.
For example, I wrapped your code into a function starting from the alpha <- 0.05
line and ran it on my laptop. Simply byte compiling your current code, it runs twice as fast.
set.seed(1234)
x <- rnorm(n=125, m=3, sd=1)
y <- rnorm(n=500, m=3, sd=1)
# f1 <- function(x,y){ ...your code...}
system.time(f1(x, y))
# user system elapsed
# 33.249 0.008 33.278
library(compiler)
f2 <- cmpfun(f1)
system.time(f2(x, y))
# user system elapsed
# 17.162 0.002 17.170
EDIT
I should add, this is the type of things that a different language would do much better than R. Have you looked at the Rcpp
and the inline
packages?
I've been curious to learn how to use them so I figured this was a good chance.
Here's a tweak of your code using the inline
package and Fortran (since I'm more comfortable with that than C). It wasn't hard at all (provided you know Fortran or C); I just followed the examples listed in cfunction
.
First, let's re-write your loops and compile them:
library(inline)
# Fortran code for first loop
loop1code <- "
integer i, j1, j2
real*8 tmp
do i = 1, m
do j1 = 1, n-1
do j2 = j1+1, n
tmp = x(i) - max(y(j1),y(j2))
if (tmp > 0.) pihxyy = pihxyy + 1
end do
end do
end do
"
# Compile the code and turn loop into a function
loop1fun <- cfunction(sig = signature(x="numeric", y="numeric", pihxyy="integer", m="integer", n="integer"), dim=c("(m)", "(n)", "", "", ""), loop1code, language="F95")
# Fortran code for second loop
loop2code <- "
integer i1, i2, j
real*8 tmp
do i1 = 1, m-1
do i2 = i1+1, m
do j = 1, n
tmp = min(x(i1), x(i2)) - y(j)
if (tmp > 0.) pihxxy = pihxxy + 1
end do
end do
end do
"
# Compile the code and turn loop into a function
loop2fun <- cfunction(sig = signature(x="numeric", y="numeric", pihxxy="integer", m="integer", n="integer"), dim=c("(m)", "(n)", "", "", ""), loop2code, language="F95")
Now let's create a new function that uses these. So it's not too long, I'll just sketch the key parts I modified from your code:
f3 <- function(x, y){
# ... code ...
# Remove old loop
## for (i in 1:m)
## for (j1 in 1:(n-1))
## for (j2 in (j1+1):n)
## pihxyy <- pihxyy + trunc(0.5*(sign(x[i] - max(y[j1],y[j2])) + 1))
# Call new function from compiled code instead
pihxyy <- loop1fun(x, y, pihxyy, m, n)$pihxyy
# Remove second loop
## for (i1 in 1:(m-1))
## for (i2 in (i1+1):m)
## for (j in 1:n)
## pihxxy <- pihxxy + trunc(0.5*(sign(min(x[i1],x[i2]) - y[j]) + 1))
# Call new compiled function for second loop
pihxxy <- loop2fun(x, y, pihxxy, m, n)$pihxxy
# ... code ...
}
And now we run it and voila, we get a huge speed boost! :)
system.time(f3(x, y))
# user system elapsed
0.12 0.00 0.12
I did check that it got the same results as your code, but you probably want to run some additional tests just in case.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With