1

我正在模拟 R 中的数据,以检查当异常值和多重共线性同时存在时哪些模型表现更好。为此,我将数据拆分为 70:30 的随机拆分,但我只需要在 70 个训练样本中引入异常值和多重共线性,并保持测试样本的清洁。 我怎么能在 R 中做到这一点?

以下是我的 R 代码,其中在整个数据中引入了异常值和多重共线性。

      um <- function(R,n,sig,p,po,py,fx,fy){
    
      #' where 'R is the level of multicollinearity between 0 and 1'#
      #' "n" is the sample size
      #' "sig" is the error vatiance
      #' "p" is the number of explanaitory variable
      #' 'po' is percentage outlier in x direction
      #'  'py' is percentage outlier in y direction
      #' 'fx' is magnitude of outlier in x direction
      #' 'fy' is magnitude of outlier in y direction'#
     
      RR=1000
      set.seed(123)
      OP1=NULL
    
      #Explanatory vriables
      
      x=matrix(0,nrow=n,ncol=p)
      W <-matrix(rnorm(n*(p+1),mean=0,sd=1), n, p+1)  
      for (i in 1:n){
        for (j in 1:p){
          x[i,j] <- sqrt(1-R^2)*W[i,j]+(R)*W[i,p+1];      #Introducing multicollinearity
        }    
      }
      
      b=eigen(t(x)%*%x)$vec[,1]
      
      #Invoking outlier
      rep1=sample(1:n, size=po*n, replace=FALSE)
      x[rep1,2]=fx*max(x[,2])+x[rep1,2]     # The point of outlier
      for (i in 1:RR){
        u=rnorm(n,0,sig)
        y=x%*%b+u
        rep2=sample(1:n, size=py*n, replace=FALSE)
        y[rep2]=fy*max(y)+y[rep2]
        
        dat=data.frame(y,x)
        dat[] <- lapply(dat, scale)
        dat<-as.data.frame(dat)
        n=nrow(dat)
        
        mols=matrix(0,nrow= n);mM=matrix(0,nrow= n)
        
        # 70:30 random split
        training_idx = sample(1:nrow(dat),nrow(dat)*0.7,replace=FALSE)
        tes_idx = setdiff(1:nrow(dat),training_idx)
        training = dat[training_idx,]
        xtr=as.matrix(training[,-1])
        ytr=training[,1]
        test = dat[tes_idx,]
        xte=as.matrix(test[,-1])
        yte=test[,1]
        
        # building the models on training data
        mest=rlm(ytr~xtr,psi=psi.huber,k2=1.345,maxit=1000)$coefficients
        ols=lm(ytr~xtr)$coefficients
        
        # Calculate MdAE on test data
        OLS=median(abs(yte-cbind(1,xte)%*%ols))
        M=median(abs(yte-cbind(1,xte)%*%mest))
    
        res2=cbind(OLS,M)
    
        OP1=res2
      }
      
        MAE=(t(OP1))
      
      data.frame(R,n,sig,p,po,py,fx,fy,MAE)
      }
       results=NULL
       R=c(0.99)
       n=c(100)
       sig=c(5)
       p=c(5)
       po=c(0.2)
       py=c(0.2)
       fx=c(5)
       fy=c(5)
    
    for(i in 1:length(R)){
      for(j in 1:length(n)){
        for(k in 1:length(sig)){
          for(l in 1:length(p)){
            for(m in 1:length(po)){
              for(nn in 1:length(py)){
                for(o in 1:length(fx)){
                  for(pp in 1:length(fy)){
                    results=rbind(results,um(R=R[i],n=n[j],sig=sig[k],p=p[l],
                                               po=po[m],py=py[nn],fx=fx[o],fy=fy[pp]))
                  }
                }
              }
            }
          }
        }
      }
    }
    
    View(results)
4

0 回答 0