我正在 R 中模拟数据,以比较当异常值和多重共线性同时存在时哪些模型表现更好。为此,我将数据按 70:30 随机拆分为训练集和测试集,但我只想在 70% 的训练样本中引入异常值和多重共线性,并保持测试样本干净(不含异常值)。我怎样才能在 R 中做到这一点?
以下是我的 R 代码,其中在整个数据中引入了异常值和多重共线性。
#' Compare OLS and Huber M-estimation under multicollinearity and outliers
#'
#' Generates `p` correlated regressors, shifts a share of the rows of the
#' second regressor and of the response to create outliers, standardises the
#' data, and fits ordinary least squares and a Huber M-estimator on a 70:30
#' random train/test split. The median absolute error (MdAE) on the test rows
#' is recorded for every replication and averaged over all replications.
#'
#' Note that outliers are injected, and the data are standardised, on the
#' full sample before it is split, so test rows can contain outliers as well.
#'
#' @param R Level of multicollinearity, between 0 and 1.
#' @param n Sample size.
#' @param sig Standard deviation of the error term (it is passed to `rnorm()`
#'   as `sd`).
#' @param p Number of explanatory variables. Must be at least 2 because the
#'   x-direction outliers are placed in the second regressor.
#' @param po Proportion of rows turned into outliers in the x direction.
#' @param py Proportion of rows turned into outliers in the y direction.
#' @param fx Magnitude of the x outliers: `fx * max(x[, 2])` is added.
#' @param fy Magnitude of the y outliers: `fy * max(y)` is added.
#' @param RR Number of Monte Carlo replications.
#' @param seed Seed passed to `set.seed()` before any data are generated.
#' @return A data frame with two rows named `"OLS"` and `"M"`, the simulation
#'   settings repeated in both rows, and a column `MAE` holding the mean
#'   test-set MdAE of each estimator across the `RR` replications.
um <- function(R, n, sig, p, po, py, fx, fy, RR = 1000, seed = 123) {
  stopifnot(p >= 2, RR >= 1)
  set.seed(seed)

  # Row count implied by a proportion. sample() truncates its size argument,
  # so the small offset keeps a product such as 28.999999999 from losing a row.
  n_rows <- function(prop, total) floor(prop * total + 1e-8)

  # Explanatory variables: every column mixes its own noise column with the
  # shared last column of W, which introduces the multicollinearity.
  W <- matrix(rnorm(n * (p + 1), mean = 0, sd = 1), n, p + 1)
  x <- sqrt(1 - R^2) * W[, seq_len(p), drop = FALSE] + R * W[, p + 1]

  # Coefficient vector: first eigenvector of X'X, taken before contamination.
  b <- eigen(crossprod(x))$vectors[, 1]

  # Outliers in the x direction (second regressor only).
  x_out <- sample.int(n, size = n_rows(po, n), replace = FALSE)
  x[x_out, 2] <- fx * max(x[, 2]) + x[x_out, 2]

  n_train <- n_rows(0.7, n)
  mdae <- matrix(
    NA_real_,
    nrow = RR, ncol = 2L,
    dimnames = list(NULL, c("OLS", "M"))
  )

  for (r in seq_len(RR)) {
    u <- rnorm(n, 0, sig)
    y <- x %*% b + u

    # Outliers in the y direction.
    y_out <- sample.int(n, size = n_rows(py, n), replace = FALSE)
    y[y_out] <- fy * max(y) + y[y_out]

    # Standardise every column; column 1 is the response.
    dat <- scale(cbind(y, x))

    # 70:30 random split.
    train_idx <- sample.int(n, size = n_train, replace = FALSE)
    test_idx <- setdiff(seq_len(n), train_idx)
    xtr <- dat[train_idx, -1, drop = FALSE]
    ytr <- dat[train_idx, 1]
    xte <- dat[test_idx, -1, drop = FALSE]
    yte <- dat[test_idx, 1]

    # Fit both models on the training rows.
    mest <- MASS::rlm(
      ytr ~ xtr,
      psi = MASS::psi.huber, k2 = 1.345, maxit = 1000
    )$coefficients
    ols <- lm(ytr ~ xtr)$coefficients

    # MdAE on the test rows.
    design_te <- cbind(1, xte)
    mdae[r, "OLS"] <- median(abs(yte - design_te %*% ols))
    mdae[r, "M"] <- median(abs(yte - design_te %*% mest))
  }

  # Average over replications instead of keeping only the last one.
  avg <- colMeans(mdae)
  MAE <- matrix(avg, ncol = 1L, dimnames = list(names(avg), NULL))
  data.frame(R, n, sig, p, po, py, fx, fy, MAE)
}
# Simulation settings. Each vector may hold several values; every combination
# of them is passed to um().
R <- c(0.99)   # level of multicollinearity
n <- c(100)    # sample size
sig <- c(5)    # error term scale
p <- c(5)      # number of explanatory variables
po <- c(0.2)   # proportion of outliers in the x direction
py <- c(0.2)   # proportion of outliers in the y direction
fx <- c(5)     # magnitude of outliers in the x direction
fy <- c(5)     # magnitude of outliers in the y direction

# expand.grid() varies its first argument fastest, so listing the settings
# from fy to R gives the same run order as nested loops with R outermost.
settings <- expand.grid(
  fy = fy, fx = fx, py = py, po = po, p = p, sig = sig, n = n, R = R,
  KEEP.OUT.ATTRS = FALSE
)

# Run every combination, then bind once instead of growing a data frame
# inside the loop.
runs <- lapply(seq_len(nrow(settings)), function(i) {
  s <- settings[i, ]
  um(
    R = s$R, n = s$n, sig = s$sig, p = s$p,
    po = s$po, py = s$py, fx = s$fx, fy = s$fy
  )
})
results <- do.call(rbind, runs)

View(results)