r - R从文件导入构建大数据框

Question

我使用 readMat 命令从 MATLAB .mat 文件导入大量数据。数据具有如下布局：struct$ERP: 200x256x16、struct$TGT: 200x1，以及其他一些我会丢弃的无关内容。TGT 变量是分类目标，第三维度是与每个目标对应的试验索引。

每个受试者（subject）的文件夹中都有七个或更多这样的文件——我想建立一个高效的结构，使我能够快速处理单个试验（沿第三维的切片），同时以正确的 R 风格跟踪目标变量。

我可以使用一个主题的文件以相对笨拙的方式执行这些步骤：

# Unroll the trials of one subject's first .mat file into a data frame with
# the target (TGT) repeated on every row belonging to the same trial.

# library() stops with an error when the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(R.matlab)
subdirs <- list.dirs(".")
filelist <- list.files(path = '.', full.names = FALSE, pattern = "^.*\\.mat$",
                       ignore.case = TRUE, recursive = TRUE,
                       include.dirs = FALSE)

# Read every .mat file found above. The lines below use `filedatas`, so it has
# to exist before them; paths in `filelist` are relative to the working
# directory because the search started at ".".
filedatas <- lapply(filelist, readMat)

# The `eeg` entry of the first file, viewed as a data frame whose rows are
# the struct fields (ERP, TGT, ...).
sub1t1src <- as.data.frame(filedatas[[1]]$eeg)
erp1 <- sub1t1src[1, 1]$ERP
erp1 <- aperm(erp1, c(2, 3, 1)) # data is permuted differently than I would like
erp1r <- apply(erp1, 2, rbind)
erp1rdf <- as.data.frame(erp1r)

# TGT lives in the same source data frame as ERP; `sub1t1df` is only created
# at the end of this script and cannot be read from here.
tgt1 <- sub1t1src[2, 1]$TGT
tgt1 <- as.factor(tgt1)
tgt1r <- rep(tgt1, each = 256)

sub1t1df <- cbind(erp1rdf, tgt1r)

这为我提供了一个数据框，其中每个试验展开成行，TGT 变量在每一行上重复。这是一种聪明的方法，还是有更好的方法来组织类似于原始 MATLAB 结构的数据？

最重要的是，如果这是一种不那么糟糕的方法，我如何以使用 apply 系列函数的“正确”方式遍历 filelist 中的每个文件，并将来自 subdirs 的短主题名称之类的内容作为新列添加到数据中？

样本数据（.mats 是二进制的，所以我只放一个主题的修剪输出）：

# Stand-ins for the directory and file listings: the current directory plus
# two subject folders, each holding two .mat files.
subdirs <- c(".", "./s1", "./s2")
filelist <- c("s1/file1.dat.mat","s1/file2.dat.mat","s2/file1.dat.mat","s2/file2.dat.mat")
# for some reason subdirs are still in the output of filelist

# Sample of the imported structure: a data frame with a single list column
# named `1.1` and one row per field (ERP, TGT, FS).
#   ERP - 2 x 2 x 4 numeric array
#   TGT - 4 x 1 integer matrix of 0/1 values
#   FS  - 1 x 1 matrix holding 256
sub1t1src <- structure(list(`1.1` = structure(list(ERP = structure(c(-10.5069999694824, 
-13.585000038147, -6.21299982070923, -11.6659994125366, -16.5679988861084, 
-17.1949996948242, -26.390998840332, -13.6799993515015, -0.759999990463257, 
-7.58099985122681, 23.5789985656738, -2.07099986076355, -7.3149995803833, 
-2.33699989318848, -18.1070003509521, -21.9639987945557), 
Csingle = TRUE, .Dim = c(2L, 
2L, 4L)), TGT = structure(c(1L, 0L, 0L, 1L), .Dim = c(4L, 1L)), FS = structure(256,
.Dim = c(1L, 1L))), .Names = c("ERP", "TGT", "FS"))), 
.Names = "1.1", row.names = c("ERP", "TGT", "FS"
), class = "data.frame")

score 1 · Accepted Answer

考虑到 Roland 的建议，我想这就是用 for 循环实现的“不太好”的做法。由于对象太大，我把各个受试者分开处理。

# For every subject folder below the working directory: read all of its .mat
# files, stack their ERP arrays and TGT vectors, and save the result as one
# .Rdata file inside that folder.
#
# readMat() comes from R.matlab and abind() from abind; both are called with
# an explicit namespace so this snippet does not depend on either package
# having been attached earlier.

subdirs <- list.dirs(".")

# subdirs[1] is "." itself, so only the remaining entries are processed.
# seq_along(subdirs)[-1] is empty when there are no subfolders, whereas
# 2:length(subdirs) would count backwards (2, 1) in that case.
for (this_subd in seq_along(subdirs)[-1]) {
  subdir <- subdirs[this_subd]

  # All .mat files of this subject, already prefixed with the folder path.
  # The dot is escaped so that only a literal ".mat" extension matches.
  filelist <- list.files(path = subdir, full.names = TRUE,
                         pattern = "\\.mat$",
                         ignore.case = TRUE, recursive = TRUE,
                         include.dirs = FALSE)

  # Nothing to combine or save for a folder without .mat files.
  if (length(filelist) == 0) {
    next
  }

  # Read each file and keep only the two fields that are needed, so the rest
  # of the imported structure can be freed right away.
  trials <- lapply(filelist, function(path) {
    eeg <- R.matlab::readMat(con = path)$eeg[, , 1]
    # NOTE(review): array() only relabels the dimensions, it does not move
    # any values. This assumes ERP is already laid out as
    # trials x channels x samples (200 x 16 x 256) - confirm against the
    # .mat files; a different layout needs aperm() instead.
    list(erp = array(eeg$ERP, dim = c(200, 16, 256)),
         tgt = eeg$TGT)
  })

  # Bind every file (including the first one) along the first dimension in a
  # single call. Starting from the real data avoids an NA-filled placeholder
  # block, which could not be stripped afterwards without array() recycling
  # values to refill the removed slots.
  erpdata <- abind::abind(lapply(trials, `[[`, "erp"), along = 1)
  targets <- do.call(rbind, lapply(trials, `[[`, "tgt"))

  # Permute the data into  samples X channels X trials
  erpdata <- aperm(erpdata, c(3, 2, 1))
  targets <- as.factor(targets) # convert to categorical variable

  # Written to <subdir>/<subdir without the leading "./">unifieddata.Rdata
  save(erpdata, targets,
       file = paste0(subdir, "/",
                     substring(subdir, first = 3),
                     "unifieddata.Rdata")
  )
  # cleanup to save memory
  rm(erpdata, targets, trials)
}

r - R从文件导入构建大数据框

1 回答 1

Related

Reference