我有独立和依赖的数据集。我想测试因变量和自变量之间所有可能的关系。在我之前的帖子(如何使用带有多个参数的 mapply 复制函数来计算方法的功效?)中,我想使用仿真数据进行功效分析。现在,我想使用相同的功能分析真实数据。问题是test_function需要更多时间,因为我的数据集很大(每个数据集的维度大于 10000 X 40000)。另外,我想使用并行计算来加快计算速度。我发现bigstatsr包(https://privefl.github.io/bigstatsr/index.html)可以处理太大而无法放入内存的矩阵。此外,我想避免expand.grid因为它对于大数据的计算成本也很高。我没有找到任何可以使用 bigstatsr 包同时使用两个数据集并并行估计参数的帖子。数据集示例和代码如下:
# dependent dataset
test_A <- data.frame(matrix(rnorm(100), nr=10, nc=10))
# independent dataset
test_B <- data.frame(matrix(sample(c(0,1,2), 500, replace = TRUE), nr=50, nc=10))
# Find all combination using dependent and independe datasets's variables
A_B_pair <- subset(expand.grid(c1=names(test_A), c2=names(test_B),
stringsAsFactors = FALSE))
# Main function to estimate the parameter and p-values
test_function <- function(test_A, test_B, x,y){
c1 <- test_A [[x]]
c2 <- test_B[[y]]
Data <- data.frame(1, XX=c1, YY=c2)
model_lm <- lm(YY ~ XX, Data)
est_lm <- as.numeric(model_lm$coefficients)[2]
pvalue_lm <- as.numeric(summary(model_lm)$coeffi[,4][2])
return(unlist(data.frame(lm.estimator = est_lm, lm.pvalue =pvalue_lm)))
}
# Final output
output <- mapply(test_function, MoreArgs = list(test_A, test_B),
x = A_B_pair$c1, y = A_B_pair$c2)
编辑: 我想应用我提出的方法来估计参数并将结果与 lm方法进行比较。我提出的方法如下:
library(pracma)
Proposed_method<- function(Data, Beta)
{
n = dim(Data)[1]
Median <- t(apply(Data,2,median))
Dist <- sqrt(rowSums((Data - as.matrix(rep(1,dim(Data)[1]))%*%Median)^2))
Data0 <- as.matrix(Data[which(Dist <= as.numeric(quantile(Dist, p=.45, na.rm = TRUE))),])
Yo <- as.matrix(Data0[,dim(Data0)[2]])
Xo <- as.matrix(Data0[,-dim(Data0)[2]])
Gama0 <- as.numeric(pinv(crossprod(Xo, Xo))%*%crossprod(Xo, Yo))
Sigma2o <- var(Yo)
Y <- as.matrix(Data[,dim(Data)[2]])
X <- as.matrix(Data[,-dim(Data)[2]])
DiffTol = 0.0001;
DiffNorm = +10000;
Iter = 0;
###########While loop################
while (DiffNorm > DiffTol)
{
Const <- sqrt(2*pi*Sigma2o)
devmat <- (Y-X%*%Gama0)
Squaremat <- as.matrix(apply(devmat, c(1,2), function(x) x^2))
Gauss <- exp(-Squaremat/(2*as.numeric(Sigma2o)))/as.numeric(Const)
Wbeta <- exp(-(Beta*((Y-X%*%Gama0)*(Y-X%*%Gama0)))/(2*as.numeric(Sigma2o)))
ONE1 <- rep(1,dim(X)[2]);
Xb <- (X*(Wbeta%*%ONE1))
Gama <- as.numeric(pinv(crossprod(X, Xb))%*%crossprod(Xb, Y))
hedprod <- (Y-X%*%Gama)*(Y-X%*%Gama)
tWbeta <- as.matrix(t(Wbeta))
One_1 <- as.matrix(rep(1,dim(X)[1]))
Sigma2 <- (tWbeta%*%hedprod)*pinv(tWbeta%*%One_1)
LHb<-(sum(Gauss^Beta)/n-1)/Beta
LH<-prod(Gauss)
##########
Norm2 <- ((sum(Gama*Gama))^0.5 + abs(Sigma2))
DiffNorm <-((sum((Gama-Gama0)*(Gama-Gama0)))^0.5 + abs(Sigma2 - Sigma2o))/Norm2
###
Gama0 = Gama
Sigma2o=Sigma2
Iter = Iter + 1
}
return(list(Gama=Gama,Sigma2=Sigma2,Wt=Wbeta,LHb=LHb,LH=LH))
}
# independent variable dataset
test_A <- data.frame(matrix(sample(c(0,1,2), 500, replace = TRUE), nr=10, nc=50))
# dependent variable dataset
test_B <- data.frame(matrix(rnorm(1000), nr=10, nc=100))
# Find all combination using dependent and independe datasets's variables
A_B_pair <- subset(expand.grid(c1=names(test_A), c2=names(test_B),
stringsAsFactors = FALSE))
# Main function to estimate the parameter and p-values by proposed method and lm
test_function <- function(x, y){
c1 <- test_A[[x]]
c2 <- test_B[[y]]
Data <- data.frame(1, XX=c1, YY=c2)
nn <- dim(Data)[1]
Beta = 0.1
Omit = 2
ResL1 <- Proposed_method(Data, Beta)
ResL0 <- Proposed_method(as.matrix(Data[,-Omit]), Beta)
LR0 <- (-nn)*log(ResL1$Sigma2/ResL0$Sigma2)
# Proposed estimator
Proposed_estimator <- (ResL1$Gama)[2]
Proposed_pvalue <- as.numeric(pchisq(q=LR0, df=1, lower.tail = FALSE))
#lm model
model_lm <- lm(YY ~ XX, Data)
est_lm <- as.numeric(model_lm$coefficients)[2]
pvalue_lm <- as.numeric(summary(model_lm)$coeffi[,4][2])
return(unlist(data.frame(lm.estimator = est_lm, lm.pvalue =pvalue_lm, Proposed_estimator,Proposed_pvalue)))
}
# Output:
output <- mapply(test_function, x = A_B_pair$c1, y = A_B_pair$c2)
# transpose the output
output_t <- data.frame(t(output))
# Final output
output_final <- cbind(A_B_pair, output_t)
output_final <- structure(list(c1 = c("X1", "X2", "X3", "X4", "X5"), c2 = c("X1",
"X1", "X1", "X1", "X1"), lm.estimator = c(-0.855708052636761,
0.227250280548332, -0.128955946232531, 0.171650221327542, -0.701027831473379
), lm.pvalue = c(0.0361141129937136, 0.646905371365762, 0.816730073250761,
0.780290676037238, 0.261013977519426), Proposed_estimator = c(-0.879232513006948,
0.242368232504351, -0.110999951753211, 0.174574390311335, -0.76456493319124
), Proposed_pvalue = c(0.0131801103443272, 0.583155149115837,
0.870570103632653, 0.783460676404866, 0.154142429946211)), row.names = c(NA,
5L), class = "data.frame"))
如何应用 bigstatsr 并并行计算此函数以获取输出?非常感谢您的努力和帮助。