我正在模拟数据并比较二元 logit 模型的 glm.fit、bigglm、speedglm、glmnet、LiblineaR。
testGLMResults_and_speed <- function(N, p, chunk=NULL, Ifsample=FALSE, size=NULL, reps=5){
library(LiblineaR)
library(speedglm)
library(biglm)
library(glmnet)
# simulate dataset
X = scale(matrix(rnorm(N*p), nrow=N, ncol=p))
X1 = cbind(rep(1, N), X)
q = as.integer(p/2)
b = c(rnorm(q+1), rnorm(p-q)*10)
eta = X1 %*% b
# simulate Y
simy <- function(x){
p = 1/(1 + exp(-eta[x]))
u = runif(1, 0, 1)
return(ifelse(u<=p, 1, 0))
}
Y = sapply(1:N, simy)
XYData = as.data.frame(cbind(y=Y, X))
getSample <- function(X, Y=NULL, size){
ix = sample(dim(X)[1], size, replace=FALSE)
return(list(X=X[ix,], Y=Y[ix]))
}
#LiblineaR function
fL <- function(X, Y, type, Ifsample=Ifsample, size=size){
if(Ifsample){
res = getSample(X, Y, size)
X = res$X; Y = res$Y;
}
resL = LiblineaR(data=X, labels=Y, type=type)
return(resL$W)
}
#glmnet
fNet <- function(X, Y, Ifsample=Ifsample, size=size){
if(Ifsample){
res = getSample(X, Y, size)
X = res$X; Y = res$Y;
}
resNGLM <- glmnet(x=X, y=Y, family="binomial", standardize=FALSE, type.logistic="modified.Newton")
return(c(resNGLM$beta[, resNGLM$dim[2]], 0))
}
#glm.fit
fglmfit <- function(X1, Y, Ifsample=Ifsample, size=size){
if(Ifsample){
res = getSample(X1, Y, size)
X1 = res$X; Y=res$Y;
}
resGLM = glm.fit(x=X1, y=Y, family=binomial(link=logit))
return(c(resGLM$coefficients[2:(p+1)], resGLM$coefficients[1]))
}
#speedglm
fspeedglm <- function(X1, Y, Ifsample=Ifsample, size=size){
if(Ifsample){
res = getSample(X1, Y, size)
X1 = res$X; Y=res$Y;
}
resSGLM = speedglm.wfit(y=Y, X=X1, family=binomial(link=logit), row.chunk=chunk)
return(c(resSGLM$coefficients[2:(p+1)], resSGLM$coefficients[1]))
}
#bigglm
fbigglm <- function(form, XYdf, Ifsample=Ifsample, size=size){
if(Ifsample){
res = getSample(XYdf, Y=NULL, size)
XYdf = res$X;
}
resBGLM <- bigglm(formula=form, data=XYdf, family = binomial(link=logit), maxit=20)
if(resBGLM$converged){
resBGLM = summary(resBGLM)$mat[,1]
} else {
resBGLM = rep(-99, p+1)
}
return(c(resBGLM[2:(p+1)], resBGLM[1]))
}
## benchmarking function
## calls reps times and averages parameter values and times over reps runs
bench_mark <- function(func, args, reps){
oneRun <- function(x, func, args){
times = system.time(res <- do.call(func, args))[c("user.self", "sys.self", "elapsed")]
return(list(times=times, res=res))
}
out = lapply(1:reps, oneRun, func, args)
out.times = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$times))))
out.betas = colMeans(t(sapply(1:reps, function(x) return(out[[x]]$res))))
return(list(times=out.times, betas=out.betas))
}
#benchmark LiblineaR
func="fL"
args = list(X=X, Y=Y, type=0, Ifsample=Ifsample, size=size)
res_L0 = bench_mark(func, args, reps)
args = list(X=X, Y=Y, type=6, Ifsample=Ifsample, size=size)
res_L6 = bench_mark(func, args, reps)
#benchmark glmnet
func = "fNet"
args = list(X=X, Y=Y, Ifsample=Ifsample, size=size)
res_GLMNet = bench_mark(func, args, reps)
func="fglmfit"
args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
res_GLM = bench_mark(func, args, reps)
func="fspeedglm"
args = list(X1=X1, Y=Y, Ifsample=Ifsample, size=size)
res_SGLM = bench_mark(func, args, reps)
func = "fbigglm"
# create formula for bigglm
xvarname = paste("V", 2:dim(XYData)[2], sep="")
f = as.formula(paste("y ~ ", paste(xvarname, collapse="+")))
args = list(form=f, XYdf=XYData, Ifsample=Ifsample, size=size)
res_BIGGLM = bench_mark(func, args, reps)
summarize <- function(var){
return(rbind(L0=res_L0[[var]], L6=res_L6[[var]],
GLMNet=res_GLMNet[[var]], GLMfit=res_GLM[[var]],
speedGLM=res_SGLM[[var]], bigGLM=res_BIGGLM[[var]]))
}
times = summarize("times")
betas = rbind(summarize("betas"), betaTRUE = c(b[2:(p+1)], b[1]))
colnames(betas)[dim(betas)[2]] = "Bias"
# compare betas with true beta
betacompare = t(sapply(1:dim(betas)[1], function(x) betas[x,]/betas[dim(betas)[1],]))
print(paste("Run times averaged over", reps, "runs"))
print(times)
print(paste("Beta estimates averaged over", reps, "runs"))
print(betas)
print(paste("Ratio Beta estimates averaged over", reps, "runs for all methods (reference is true beta)"))
print(betacompare)
}
示例运行如下所示:
> testGLMResults_and_speed(10000, 5, 500, FALSE, 5000, 5)
[1] "Run times averaged over 5 runs"
user.self sys.self elapsed
L0 0.0152 0.0032 0.0106
L6 0.0108 0.0002 0.0108
GLMNet 0.5660 0.0002 0.5666
GLMfit 0.0836 0.0262 0.0576
speedGLM 0.0438 0.0122 0.0298
bigGLM 0.2414 0.0956 0.1822
[1] "Beta estimates averaged over 5 runs"
V1 V2 V3 V4 V5 Bias
L0 0.4611973 0.7113034 -7.373351 4.890485 2.699251 0.3026018
L6 0.4516369 0.7002171 -7.284820 4.829202 2.659993 0.2972566
GLMNet -0.4873854 -0.7512806 7.839316 -5.200533 -2.868255 0.0000000
GLMfit -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
speedGLM -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
bigGLM -0.5064564 -0.7773168 8.088214 -5.367488 -2.961743 -0.3326801
betaTRUE -0.4377651 -0.7692994 8.290677 -5.442880 -3.088798 -0.3901147
[1] "Ratio Beta estimates averaged over 5 runs for all methods (reference is true beta)"
V1 V2 V3 V4 V5 Bias
[1,] -1.053527 -0.9246119 -0.8893544 -0.8985106 -0.8738838 -0.7756739
[2,] -1.031688 -0.9102009 -0.8786760 -0.8872513 -0.8611741 -0.7619724
[3,] 1.113349 0.9765776 0.9455579 0.9554745 0.9285990 0.0000000
[4,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[5,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[6,] 1.156914 1.0104217 0.9755794 0.9861486 0.9588659 0.8527751
[7,] 1.000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
Warning messages:
1: glm.fit: fitted probabilities numerically 0 or 1 occurred
2: glm.fit: fitted probabilities numerically 0 or 1 occurred
3: glm.fit: fitted probabilities numerically 0 or 1 occurred
4: glm.fit: fitted probabilities numerically 0 or 1 occurred
5: glm.fit: fitted probabilities numerically 0 or 1 occurred
我经常看到来自 LiblineaR 的估计值很接近,但迹象却相反!有人知道为什么会这样吗?它通常是最快的,并且在我见过的更大数据集的情况下甚至更快。
我知道由于正则化,这些值不会相同;我对这些迹象更好奇。
如果有人(rep>1500)可以请添加#LiblineaR 和#speedglm 标签,我将不胜感激。