I am simulating data and comparing glm.fit, bigglm, speedglm, glmnet, and LiblineaR for a binary logit model.
testGLMResults_and_speed <- function(N, p, chunk=NULL, Ifsample=FALSE, size=NULL, reps=5){
  library(LiblineaR)
  library(speedglm)
  library(biglm)
  library(glmnet)
  # simulate dataset
  X = scale(matrix(rnorm(N*p), nrow=N, ncol=p))
  X1 = cbind(rep(1, N), X)
  q = as.integer(p/2)
  b = c(rnorm(q+1), rnorm(p-q)*10)
  eta = X1 %*% b
  # simulate Y
  simy <- function(x){
    p = 1/(1 + exp(-eta[x]))
    u = runif(1, 0, 1)
    return(ifelse(u <= p, 1, 0))
  }
  Y = sapply(1:N, simy)
  XYData = as.data.frame(cbind(y=Y, X))
  getSample <- function(X, Y=NULL, size){
    ix = sample(dim(X)[1], size, replace=FALSE)
    return(list(X=X[ix,], Y=Y[ix]))
  }
  # LiblineaR
  fL <- function(X, Y, type, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X, Y, size)
      X = res$X; Y = res$Y
    }
    resL = LiblineaR(data=X, labels=Y, type=type)
    return(resL$W)
  }
  # glmnet
  fNet <- function(X, Y, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X, Y, size)
      X = res$X; Y = res$Y
    }
    resNGLM <- glmnet(x=X, y=Y, family="binomial", standardize=FALSE, type.logistic="modified.Newton")
    # coefficients at the last (smallest) lambda; the appended 0 is only a
    # placeholder for the intercept, which glmnet stores separately in a0
    return(c(resNGLM$beta[, resNGLM$dim[2]], 0))
  }
  # glm.fit
  fglmfit <- function(X1, Y, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X1, Y, size)
      X1 = res$X; Y = res$Y
    }
    resGLM = glm.fit(x=X1, y=Y, family=binomial(link=logit))
    return(c(resGLM$coefficients[2:(p+1)], resGLM$coefficients[1]))
  }
  # speedglm
  fspeedglm <- function(X1, Y, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X1, Y, size)
      X1 = res$X; Y = res$Y
    }
    resSGLM = speedglm.wfit(y=Y, X=X1, family=binomial(link=logit), row.chunk=chunk)
    return(c(resSGLM$coefficients[2:(p+1)], resSGLM$coefficients[1]))
  }
  # bigglm
  fbigglm <- function(form, XYdf, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(XYdf, Y=NULL, size)
      XYdf = res$X
    }
    resBGLM <- bigglm(formula=form, data=XYdf, family=binomial(link=logit), maxit=20)
    if(resBGLM$converged){
      resBGLM = summary(resBGLM)$mat[,1]
    } else {
      resBGLM = rep(-99, p+1)
    }
    return(c(resBGLM[2:(p+1)], resBGLM[1]))
  }
  ## benchmarking function:
  ## calls func reps times and averages parameter values and times over the runs
  bench_mark <- function(func, args, reps){
    oneRun <- function(x, func, args){
      times = system.time(res <- do.call(func, args))[c("user.self", "sys.self", "elapsed")]
      return(list(times=times, res=res))
    }
    out = lapply(1:reps, oneRun, func, args)
    out.times = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$times))))
    out.betas = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$res))))
    return(list(times=out.times, betas=out.betas))
  }
  # benchmark LiblineaR
  func = "fL"
  args = list(X=X, Y=Y, type=0, Ifsample=Ifsample, size=size)
  res_L0 = bench_mark(func, args, reps)
  args = list(X=X, Y=Y, type=6, Ifsample=Ifsample, size=size)
  res_L6 = bench_mark(func, args, reps)
  # benchmark glmnet
  func = "fNet"
  args = list(X=X, Y=Y, Ifsample=Ifsample, size=size)
  res_GLMNet = bench_mark(func, args, reps)
  # benchmark glm.fit
  func = "fglmfit"
  args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
  res_GLM = bench_mark(func, args, reps)
  # benchmark speedglm
  func = "fspeedglm"
  args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
  res_SGLM = bench_mark(func, args, reps)
  # benchmark bigglm
  func = "fbigglm"
  # create formula for bigglm
  xvarname = paste("V", 2:dim(XYData)[2], sep="")
  f = as.formula(paste("y ~ ", paste(xvarname, collapse="+")))
  args = list(form=f, XYdf=XYData, Ifsample=Ifsample, size=size)
  res_BIGGLM = bench_mark(func, args, reps)
  # collect results across methods
  summarize <- function(var){
    return(rbind(L0=res_L0[[var]], L6=res_L6[[var]],
                 GLMNet=res_GLMNet[[var]], GLMfit=res_GLM[[var]],
                 speedGLM=res_SGLM[[var]], bigGLM=res_BIGGLM[[var]]))
  }
  times = summarize("times")
  betas = rbind(summarize("betas"), betaTRUE = c(b[2:(p+1)], b[1]))
  colnames(betas)[dim(betas)[2]] = "Bias"
  # compare betas with true beta
  betacompare = t(sapply(1:dim(betas)[1], function(x) betas[x,]/betas[dim(betas)[1],]))
  print(paste("Run times averaged over", reps, "runs"))
  print(times)
  print(paste("Beta estimates averaged over", reps, "runs"))
  print(betas)
  print(paste("Ratio Beta estimates averaged over", reps, "runs for all methods (reference is true beta)"))
  print(betacompare)
}
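As an aside, the per-observation simy() loop inside the function can be collapsed into a single vectorized draw; a sketch of the equivalent simulation (standalone toy values for N and eta):

# equivalent to the sapply(1:N, simy) loop: P(Y = 1) = 1/(1 + exp(-eta)) = plogis(eta)
N <- 10
eta <- rnorm(N)
Y <- rbinom(N, 1, plogis(eta))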
A sample run looks as follows:
> testGLMResults_and_speed(10000, 5, 500, FALSE, 5000, 5)
[1] "Run times averaged over 5 runs"
user.self sys.self elapsed
L0 0.0152 0.0032 0.0106
L6 0.0108 0.0002 0.0108
GLMNet 0.5660 0.0002 0.5666
GLMfit 0.0836 0.0262 0.0576
speedGLM 0.0438 0.0122 0.0298
bigGLM 0.2414 0.0956 0.1822
[1] "Beta estimates averaged over 5 runs"
V1 V2 V3 V4 V5 Bias
L0 0.4611973 0.7113034 -7.373351 4.890485 2.699251 0.3026018
L6 0.4516369 0.7002171 -7.284820 4.829202 2.659993 0.2972566
GLMNet -0.4873854 -0.7512806 7.839316 -5.200533 -2.868255 0.0000000
GLMfit -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
speedGLM -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
bigGLM -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
betaTRUE -0.4377651 -0.7692994 8.290677 -5.442880 -3.088798 -0.3901147
[1] "Ratio Beta estimates averaged over 5 runs for all methods (reference is true beta)"
V1 V2 V3 V4 V5 Bias
[1,] -1.053527 -0.9246119 -0.8893544 -0.8985106 -0.8738838 -0.7756739
[2,] -1.031688 -0.9102009 -0.8786760 -0.8872513 -0.8611741 -0.7619724
[3,] 1.113349 0.9765776 0.9455579 0.9554745 0.9285990 0.0000000
[4,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[5,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[6,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[7,] 1.000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
Warning messages:
1: glm.fit: fitted probabilities numerically 0 or 1 occurred
2: glm.fit: fitted probabilities numerically 0 or 1 occurred
3: glm.fit: fitted probabilities numerically 0 or 1 occurred
4: glm.fit: fitted probabilities numerically 0 or 1 occurred
5: glm.fit: fitted probabilities numerically 0 or 1 occurred
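A side note on the table above: the GLMNet row shows a Bias of exactly 0 only because fNet() returns 0 as a placeholder for the intercept. glmnet stores the fitted intercepts in the a0 component (one per lambda), so the intercept at the last lambda can be recovered; a minimal sketch on made-up data:

library(glmnet)
set.seed(1)
X <- matrix(rnorm(500 * 4), ncol = 4)
Y <- rbinom(500, 1, plogis(X %*% c(1, -1, 2, 0)))
fit <- glmnet(x = X, y = Y, family = "binomial", standardize = FALSE)
k <- length(fit$lambda)  # index of the last (smallest) lambda
c(as.numeric(fit$beta[, k]), intercept = fit$a0[k])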
I often see that the estimates from LiblineaR are close but have their signs reversed! Does anybody know why this happens? LiblineaR is often the fastest, and I have seen it pull even further ahead on larger datasets.
I understand that the values will not be the same due to regularization; I am more curious about the signs.
If somebody (rep > 1500) could please add the #LiblineaR and #speedglm tags, I would appreciate it.
While I haven't used LiblineaR, this kind of thing happens when a logistic regression estimates the probability of the opposite label to the one you expect. Everything else is estimating the probability that Y = 1, so my guess is that LiblineaR is predicting the probability that Y = 0.
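If that is what is going on, the fix is just to re-orient the weights. LiblineaR orders classes by their first appearance in the label vector and orients W toward the first class; a minimal sketch on made-up data (assuming the model object exposes ClassNames, as recent LiblineaR versions do, and using the same labels= interface as the question):

library(LiblineaR)
set.seed(1)
X <- scale(matrix(rnorm(1000 * 3), ncol = 3))
Y <- rbinom(1000, 1, plogis(X %*% c(1, -2, 0.5)))
resL <- LiblineaR(data = X, labels = Y, type = 0)
W <- resL$W
# if the first class seen is 0, flip signs to match glm()'s P(Y = 1) convention
if (resL$ClassNames[1] == 0) W <- -W
print(W)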