Benchmarking logistic regression using glm.fit, bigglm, speedglm, glmnet, LiblineaR

I am simulating data and comparing glm.fit, bigglm, speedglm, glmnet, and LiblineaR for a binary logit model.

testGLMResults_and_speed <- function(N, p, chunk=NULL, Ifsample=FALSE, size=NULL, reps=5){

  library(LiblineaR)
  library(speedglm)
  library(biglm)
  library(glmnet)

  # simulate a standardized design matrix and add an intercept column
  X = scale(matrix(rnorm(N*p), nrow=N, ncol=p))
  X1 = cbind(rep(1, N), X)
  # true coefficients: intercept plus p slopes, with the second half inflated 10x
  q = as.integer(p/2)
  b = c(rnorm(q+1), rnorm(p-q)*10)
  eta = X1 %*% b

  # simulate Y from the logistic model (prob avoids shadowing the dimension p)
  simy <- function(x){
    prob = 1/(1 + exp(-eta[x]))
    u = runif(1, 0, 1)
    return(ifelse(u<=prob, 1, 0))
  }

  Y = sapply(1:N, simy)
  XYData = as.data.frame(cbind(y=Y, X))

  getSample <- function(X, Y=NULL, size){
    ix = sample(dim(X)[1], size, replace=FALSE)    
    return(list(X=X[ix,], Y=Y[ix]))
  }

  #LiblineaR function
  fL <- function(X, Y, type, Ifsample=Ifsample, size=size){    
    if(Ifsample){
      res = getSample(X, Y, size)
      X = res$X; Y = res$Y;
    }
    resL = LiblineaR(data=X, labels=Y, type=type)
    return(resL$W)    
  }

  #glmnet
  fNet <- function(X, Y, Ifsample=Ifsample, size=size){    
    if(Ifsample){
      res = getSample(X, Y, size)
      X = res$X; Y = res$Y;
    }
    resNGLM <- glmnet(x=X, y=Y, family="binomial", standardize=FALSE, type.logistic="modified.Newton")
    # coefficients at the last (smallest) lambda; glmnet's intercept is in
    # resNGLM$a0, so the appended 0 leaves the "Bias" slot empty here
    return(c(resNGLM$beta[, resNGLM$dim[2]], 0))
  }

  #glm.fit
  fglmfit <- function(X1, Y, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X1, Y, size)      
      X1 = res$X; Y=res$Y;
    }
    resGLM = glm.fit(x=X1, y=Y, family=binomial(link=logit))
    # reorder to LiblineaR's layout: slopes first, intercept ("Bias") last
    return(c(resGLM$coefficients[2:(p+1)], resGLM$coefficients[1]))
  }

  #speedglm
  fspeedglm <- function(X1, Y, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(X1, Y, size)  
      X1 = res$X; Y=res$Y;
    }
    resSGLM = speedglm.wfit(y=Y, X=X1, family=binomial(link=logit), row.chunk=chunk)
    return(c(resSGLM$coefficients[2:(p+1)], resSGLM$coefficients[1]))
  }

  #bigglm
  fbigglm <- function(form, XYdf, Ifsample=Ifsample, size=size){
    if(Ifsample){
      res = getSample(XYdf, Y=NULL, size)  
      XYdf = res$X; 
    }
    resBGLM <- bigglm(formula=form, data=XYdf, family = binomial(link=logit), maxit=20)
    if(resBGLM$converged){
      resBGLM = summary(resBGLM)$mat[,1]
    } else {
      resBGLM = rep(-99, p+1)
    }    
    return(c(resBGLM[2:(p+1)], resBGLM[1]))
  }

  ## benchmarking function
  ## calls reps times and averages parameter values and times over reps runs
  bench_mark <- function(func, args, reps){
    oneRun <- function(x, func, args){
      times = system.time(res <- do.call(func, args))[c("user.self", "sys.self", "elapsed")]
      return(list(times=times, res=res))
    }    
    out = lapply(1:reps, oneRun, func, args)
    # average times and coefficient estimates across the reps runs
    out.times = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$times))))
    out.betas = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$res))))
    return(list(times=out.times, betas=out.betas))
  }

  #benchmark LiblineaR
  func="fL"
  args = list(X=X, Y=Y, type=0, Ifsample=Ifsample, size=size)
  res_L0 = bench_mark(func, args, reps)
  args = list(X=X, Y=Y, type=6, Ifsample=Ifsample, size=size)
  res_L6 = bench_mark(func, args, reps)

  #benchmark glmnet 
  func = "fNet"
  args = list(X=X, Y=Y, Ifsample=Ifsample, size=size)
  res_GLMNet = bench_mark(func, args, reps)

  func="fglmfit"
  args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
  res_GLM = bench_mark(func, args, reps)

  func="fspeedglm"
  args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
  res_SGLM = bench_mark(func, args, reps)  

  func = "fbigglm"
  # create formula for bigglm
  xvarname = paste("V", 2:dim(XYData)[2], sep="")
  f  = as.formula(paste("y ~ ", paste(xvarname, collapse="+")))
  args = list(form=f, XYdf=XYData, Ifsample=Ifsample, size=size)
  res_BIGGLM = bench_mark(func, args, reps)

  summarize <- function(var){
    return(rbind(L0=res_L0[[var]], L6=res_L6[[var]], 
                GLMNet=res_GLMNet[[var]], GLMfit=res_GLM[[var]], 
                speedGLM=res_SGLM[[var]], bigGLM=res_BIGGLM[[var]]))
  }

  times = summarize("times")
  betas = rbind(summarize("betas"), betaTRUE = c(b[2:(p+1)], b[1]))
  colnames(betas)[dim(betas)[2]] = "Bias"  # the intercept, in LiblineaR's terminology
  # ratio of each method's estimates to the true beta (last row)
  betacompare = t(sapply(1:dim(betas)[1], function(x) betas[x,]/betas[dim(betas)[1],]))

  print(paste("Run times averaged over", reps, "runs"))
  print(times)

  print(paste("Beta estimates averaged over", reps, "runs"))
  print(betas)

  print(paste("Ratio Beta estimates averaged over", reps, "runs for all methods (reference is true beta)"))
  print(betacompare)
}

A sample run looks as follows:

> testGLMResults_and_speed(10000, 5, 500, FALSE, 5000, 5)
[1] "Run times averaged over 5 runs"
         user.self sys.self elapsed
L0          0.0152   0.0032  0.0106
L6          0.0108   0.0002  0.0108
GLMNet      0.5660   0.0002  0.5666
GLMfit      0.0836   0.0262  0.0576
speedGLM    0.0438   0.0122  0.0298
bigGLM      0.2414   0.0956  0.1822
[1] "Beta estimates averaged over 5 runs"
                 V1         V2        V3        V4        V5       Bias
L0        0.4611973  0.7113034 -7.373351  4.890485  2.699251  0.3026018
L6        0.4516369  0.7002171 -7.284820  4.829202  2.659993  0.2972566
GLMNet   -0.4873854 -0.7512806  7.839316 -5.200533 -2.868255  0.0000000
GLMfit   -0.5064564 -0.7773168  8.088214 -5.367488 -2.961743 -0.3326801
speedGLM -0.5064564 -0.7773168  8.088214 -5.367488 -2.961743 -0.3326801
bigGLM   -0.5064564 -0.7773168  8.088214 -5.367488 -2.961743 -0.3326801
betaTRUE -0.4377651 -0.7692994  8.290677 -5.442880 -3.088798 -0.3901147
[1] "Ratio Beta estimates averaged over 5 runs for all methods (reference is true beta)"
            V1         V2         V3         V4         V5       Bias
[1,] -1.053527 -0.9246119 -0.8893544 -0.8985106 -0.8738838 -0.7756739
[2,] -1.031688 -0.9102009 -0.8786760 -0.8872513 -0.8611741 -0.7619724
[3,]  1.113349  0.9765776  0.9455579  0.9554745  0.9285990  0.0000000
[4,]  1.156914  1.0104217  0.9755794  0.9861486  0.9588659  0.8527751
[5,]  1.156914  1.0104217  0.9755794  0.9861486  0.9588659  0.8527751
[6,]  1.156914  1.0104217  0.9755794  0.9861486  0.9588659  0.8527751
[7,]  1.000000  1.0000000  1.0000000  1.0000000  1.0000000  1.0000000
Warning messages:
1: glm.fit: fitted probabilities numerically 0 or 1 occurred 
2: glm.fit: fitted probabilities numerically 0 or 1 occurred 
3: glm.fit: fitted probabilities numerically 0 or 1 occurred 
4: glm.fit: fitted probabilities numerically 0 or 1 occurred 
5: glm.fit: fitted probabilities numerically 0 or 1 occurred 

I often see that the estimates from LiblineaR are close to the others but have their signs reversed! Does anybody know why this happens? It is often the fastest method, and its speed advantage grows on the larger datasets I have tried.

I understand that the values will not match exactly because of regularization; I am more curious about the signs.
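
A quick way to check whether this is purely a label-orientation issue is to compare predicted probabilities instead of coefficients. Here is a minimal sketch (assuming LiblineaR's predict method with proba=TRUE returns a probabilities matrix with one column per class label, and reusing X and Y from the simulation above):

library(LiblineaR)

m  <- LiblineaR(data=X, labels=Y, type=0)
# per-class probability columns; column names should match the 0/1 labels
pr <- predict(m, X, proba=TRUE)$probabilities
# glm.fit's fitted values are P(Y = 1)
pg <- glm.fit(x=cbind(1, X), y=Y, family=binomial())$fitted.values

# a correlation near 1 would mean the two fits agree and only the
# sign convention of W differs
cor(pr[, "1"], pg)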

If somebody with enough reputation (rep > 1500) could please add the #LiblineaR and #speedglm tags, I would appreciate it.

asked Nov 12 '22 by user1971988

1 Answer

While I haven't used LiblineaR, this kind of thing happens when a logistic regression is estimating the probability of the opposite label to what you expect. Everything else is estimating the probability that Y = 1, so my guess is that LiblineaR is predicting the probability that Y = 0.
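
If so, the fix is mechanical: check which label LiblineaR treated as positive and flip the weights when needed. A minimal sketch (assuming the fitted object's ClassNames component lists the positive class first; LIBLINEAR orders classes by first appearance in the data):

library(LiblineaR)

fL_signed <- function(X, Y, type){
  resL <- LiblineaR(data=X, labels=Y, type=type)
  w <- resL$W
  # if 0 is the class LiblineaR scored as positive, its weights are
  # oriented toward P(Y = 0); flip them to match glm's P(Y = 1)
  if(resL$ClassNames[1] == 0) w <- -w
  return(w)
}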

answered Nov 15 '22 by Hong Ooi