I am simulating data and comparing glm.fit, bigglm, speedglm, glmnet, and LiblineaR for a binary logit model.
testGLMResults_and_speed <- function(N, p, chunk=NULL, Ifsample=FALSE, size=NULL, reps=5){
  library(LiblineaR)
  library(speedglm)
  library(biglm)
  library(glmnet)
  # simulate dataset
  X = scale(matrix(rnorm(N*p), nrow=N, ncol=p))
  X1 = cbind(rep(1, N), X)
  q = as.integer(p/2)
  b = c(rnorm(q+1), rnorm(p-q)*10)
  eta = X1 %*% b
  # simulate Y
  simy <- function(x){
    p = 1/(1 + exp(-eta[x]))
    u = runif(1, 0, 1)
    return(ifelse(u <= p, 1, 0))
  }
  Y = sapply(1:N, simy)
  XYData = as.data.frame(cbind(y=Y, X))
  getSample <- function(X, Y=NULL, size){
    ix = sample(dim(X)[1], size, replace=FALSE)
    return(list(X=X[ix,], Y=Y[ix]))
  }
  # LiblineaR
  fL <- function(X, Y, type, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X, Y, size)
      X = res$X; Y = res$Y
    }
    resL = LiblineaR(data=X, labels=Y, type=type)
    return(resL$W)
  }
  # glmnet
  fNet <- function(X, Y, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X, Y, size)
      X = res$X; Y = res$Y
    }
    resNGLM <- glmnet(x=X, y=Y, family="binomial", standardize=FALSE, type.logistic="modified.Newton")
    # coefficients at the last (smallest) lambda; the appended 0 is only a
    # placeholder for the intercept, which glmnet stores separately in a0
    return(c(resNGLM$beta[, resNGLM$dim[2]], 0))
  }
  # glm.fit
  fglmfit <- function(X1, Y, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X1, Y, size)
      X1 = res$X; Y = res$Y
    }
    resGLM = glm.fit(x=X1, y=Y, family=binomial(link=logit))
    return(c(resGLM$coefficients[2:(p+1)], resGLM$coefficients[1]))
  }
  # speedglm
  fspeedglm <- function(X1, Y, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X1, Y, size)
      X1 = res$X; Y = res$Y
    }
    resSGLM = speedglm.wfit(y=Y, X=X1, family=binomial(link=logit), row.chunk=chunk)
    return(c(resSGLM$coefficients[2:(p+1)], resSGLM$coefficients[1]))
  }
  # bigglm
  fbigglm <- function(form, XYdf, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(XYdf, Y=NULL, size)
      XYdf = res$X
    }
    resBGLM <- bigglm(formula=form, data=XYdf, family=binomial(link=logit), maxit=20)
    if(resBGLM$converged){
      resBGLM = summary(resBGLM)$mat[,1]
    } else {
      resBGLM = rep(-99, p+1)
    }
    return(c(resBGLM[2:(p+1)], resBGLM[1]))
  }
  ## benchmarking function:
  ## calls func reps times and averages parameter values and times over the runs
  bench_mark <- function(func, args, reps){
    oneRun <- function(x, func, args){
      times = system.time(res <- do.call(func, args))[c("user.self", "sys.self", "elapsed")]
      return(list(times=times, res=res))
    }
    out = lapply(1:reps, oneRun, func, args)
    out.times = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$times))))
    out.betas = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$res))))
    return(list(times=out.times, betas=out.betas))
  }
  # benchmark LiblineaR
  func = "fL"
  args = list(X=X, Y=Y, type=0, Ifsample=Ifsample, size=size)
  res_L0 = bench_mark(func, args, reps)
  args = list(X=X, Y=Y, type=6, Ifsample=Ifsample, size=size)
  res_L6 = bench_mark(func, args, reps)
  # benchmark glmnet
  func = "fNet"
  args = list(X=X, Y=Y, Ifsample=Ifsample, size=size)
  res_GLMNet = bench_mark(func, args, reps)
  # benchmark glm.fit
  func = "fglmfit"
  args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
  res_GLM = bench_mark(func, args, reps)
  # benchmark speedglm
  func = "fspeedglm"
  args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
  res_SGLM = bench_mark(func, args, reps)
  # benchmark bigglm
  func = "fbigglm"
  # create formula for bigglm
  xvarname = paste("V", 2:dim(XYData)[2], sep="")
  f = as.formula(paste("y ~ ", paste(xvarname, collapse="+")))
  args = list(form=f, XYdf=XYData, Ifsample=Ifsample, size=size)
  res_BIGGLM = bench_mark(func, args, reps)
  # collect results across methods
  summarize <- function(var){
    return(rbind(L0=res_L0[[var]], L6=res_L6[[var]],
                 GLMNet=res_GLMNet[[var]], GLMfit=res_GLM[[var]],
                 speedGLM=res_SGLM[[var]], bigGLM=res_BIGGLM[[var]]))
  }
  times = summarize("times")
  betas = rbind(summarize("betas"), betaTRUE = c(b[2:(p+1)], b[1]))
  colnames(betas)[dim(betas)[2]] = "Bias"
  # compare betas with true beta
  betacompare = t(sapply(1:dim(betas)[1], function(x) betas[x,]/betas[dim(betas)[1],]))
  print(paste("Run times averaged over", reps, "runs"))
  print(times)
  print(paste("Beta estimates averaged over", reps, "runs"))
  print(betas)
  print(paste("Ratio Beta estimates averaged over", reps, "runs for all methods (reference is true beta)"))
  print(betacompare)
}
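As an aside, the per-observation simy() loop inside the function can be collapsed into a single vectorized draw; a sketch of the equivalent simulation (standalone toy values for N and eta):

# equivalent to the sapply(1:N, simy) loop: P(Y = 1) = 1/(1 + exp(-eta)) = plogis(eta)
N <- 10
eta <- rnorm(N)
Y <- rbinom(N, 1, plogis(eta))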
A sample run looks as follows:
> testGLMResults_and_speed(10000, 5, 500, FALSE, 5000, 5)
[1] "Run times averaged over 5 runs"
user.self sys.self elapsed
L0 0.0152 0.0032 0.0106
L6 0.0108 0.0002 0.0108
GLMNet 0.5660 0.0002 0.5666
GLMfit 0.0836 0.0262 0.0576
speedGLM 0.0438 0.0122 0.0298
bigGLM 0.2414 0.0956 0.1822
[1] "Beta estimates averaged over 5 runs"
V1 V2 V3 V4 V5 Bias
L0 0.4611973 0.7113034 -7.373351 4.890485 2.699251 0.3026018
L6 0.4516369 0.7002171 -7.284820 4.829202 2.659993 0.2972566
GLMNet -0.4873854 -0.7512806 7.839316 -5.200533 -2.868255 0.0000000
GLMfit -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
speedGLM -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
bigGLM -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
betaTRUE -0.4377651 -0.7692994 8.290677 -5.442880 -3.088798 -0.3901147
[1] "Ratio Beta estimates averaged over 5 runs for all methods (reference is true beta)"
V1 V2 V3 V4 V5 Bias
[1,] -1.053527 -0.9246119 -0.8893544 -0.8985106 -0.8738838 -0.7756739
[2,] -1.031688 -0.9102009 -0.8786760 -0.8872513 -0.8611741 -0.7619724
[3,] 1.113349 0.9765776 0.9455579 0.9554745 0.9285990 0.0000000
[4,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[5,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[6,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[7,] 1.000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
Warning messages:
1: glm.fit: fitted probabilities numerically 0 or 1 occurred
2: glm.fit: fitted probabilities numerically 0 or 1 occurred
3: glm.fit: fitted probabilities numerically 0 or 1 occurred
4: glm.fit: fitted probabilities numerically 0 or 1 occurred
5: glm.fit: fitted probabilities numerically 0 or 1 occurred
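A side note on the table above: the GLMNet row shows a Bias of exactly 0 only because fNet() returns 0 as a placeholder for the intercept. glmnet stores the fitted intercepts in the a0 component (one per lambda), so the intercept at the last lambda can be recovered; a minimal sketch on made-up data:

library(glmnet)
set.seed(1)
X <- matrix(rnorm(500 * 4), ncol = 4)
Y <- rbinom(500, 1, plogis(X %*% c(1, -1, 2, 0)))
fit <- glmnet(x = X, y = Y, family = "binomial", standardize = FALSE)
k <- length(fit$lambda)  # index of the last (smallest) lambda
c(as.numeric(fit$beta[, k]), intercept = fit$a0[k])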
I often see that the estimates from LiblineaR are close but have their signs reversed! Does anybody know why this happens? LiblineaR is often the fastest, and I have seen it pull even further ahead on larger datasets.
I understand that the values will not be the same due to regularization; I am more curious about the signs.
If somebody (rep > 1500) could please add the #LiblineaR and #speedglm tags, I would appreciate it.
While I haven't used LiblineaR, this kind of thing happens when a logistic regression estimates the probability of the opposite label to the one you expect. Everything else is estimating the probability that Y = 1, so my guess is that LiblineaR is predicting the probability that Y = 0.
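If that is what is going on, the fix is just to re-orient the weights. LiblineaR orders classes by their first appearance in the label vector and orients W toward the first class; a minimal sketch on made-up data (assuming the model object exposes ClassNames, as recent LiblineaR versions do, and using the same labels= interface as the question):

library(LiblineaR)
set.seed(1)
X <- scale(matrix(rnorm(1000 * 3), ncol = 3))
Y <- rbinom(1000, 1, plogis(X %*% c(1, -2, 0.5)))
resL <- LiblineaR(data = X, labels = Y, type = 0)
W <- resL$W
# if the first class seen is 0, flip signs to match glm()'s P(Y = 1) convention
if (resL$ClassNames[1] == 0) W <- -W
print(W)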