1

我想提升一个函数的运行速度:该函数把多个“.txt”文件导入到同一个数据框中(并添加文件名一列)。“.txt”文件的数量超过 10 000 个,所有文件结构相同,位于同一个目录下(该目录包含多个子目录)。这 10 000 个文件的总大小约为 800 MB。把它们全部加载到数据框需要几个小时。我的电脑:Toshiba P50t,8 GB 内存,1 TB 机械硬盘。

请看下面我正在使用的代码。欢迎提出任何能提高加载速度的建议(我不希望使用中间工具,例如先把数据加载到 MS SQL 再导入 R)。我试过用 fread 代替 read_csv,但速度没有显著差异。

#' Import many ".txt" price files into one data frame, tagging rows by file.
#'
#' Recursively lists the ".txt" files below `directory`, keeps those whose
#' relative path matches `What_stocks`, skips empty files, reads each one with
#' readr and overwrites its `Ticker` column with the file name (without the
#' ".txt" extension). The combined table is saved to `Output_file_name` under
#' the object name `GPW_wse_stocks` and returned.
#'
#' @param directory Root directory that is searched recursively.
#' @param Output_file_name File passed to `save()`; a relative path is
#'   resolved against `directory`, as in the previous implementation.
#' @param What_stocks Regular expression (stringr syntax) that a file's
#'   relative path must match to be imported.
#' @return A data.frame with the columns Ticker, Date, Open, High, Low, Close,
#'   Volume and OpenInt; it has no rows or columns when nothing matched.
files_to_df_v01 <- function(directory, Output_file_name, What_stocks) {
  # `pattern` is a regular expression, not a glob: anchor it on the extension.
  paths <- sort(dir(directory, pattern = "\\.txt$", recursive = TRUE))
  paths <- paths[stringr::str_count(paths, pattern = What_stocks) > 0]

  # The listed paths are relative to `directory`, so work from there, but give
  # the caller their working directory back when the function exits.
  old_wd <- setwd(directory)
  on.exit(setwd(old_wd), add = TRUE)

  # Skip empty files with one vectorised size lookup.
  sizes <- file.size(paths)
  paths <- paths[!is.na(sizes) & sizes > 0]

  # The column specification is identical for every file; build it once.
  price_cols <- readr::cols(
    Ticker = readr::col_character(),
    Date = readr::col_date(format = "%Y-%m-%d"),
    Open = readr::col_double(),
    High = readr::col_double(),
    Low = readr::col_double(),
    Close = readr::col_double(),
    Volume = readr::col_integer(),
    OpenInt = readr::col_integer()
  )

  tables <- lapply(paths, function(path) {
    dat <- readr::read_csv(path, col_types = price_cols)
    # basename() also copes with files that sit directly in `directory`,
    # where searching for the last "/" found nothing to cut at.
    dat$Ticker <- sub("\\.txt$", "", basename(path))
    dplyr::select(dat, Ticker, Date, Open, High, Low, Close, Volume, OpenInt)
  })

  # Bind once at the end: calling rbind() inside the loop copies everything
  # accumulated so far on every iteration.
  GPW_wse_stocks <- dplyr::bind_rows(tables)

  save(GPW_wse_stocks, file = Output_file_name)

  data.frame(GPW_wse_stocks)
}
4

2 回答 2

2

使用 data.table,我得到了一个大约快 4 倍的解决方案:

# Creating test data :

# Fail immediately if data.table is missing (require() would only return
# FALSE and the script would break later at data.table()).
library(data.table)

# Directory layout for the benchmark: Test/csvReadingTest2/{v1,v2}.
# recursive = TRUE creates the parent directories as well, and
# showWarnings = FALSE keeps re-runs quiet when they already exist.
dd <- "Test/csvReadingTest2"
dir.create(file.path(dd, "v1"), recursive = TRUE, showWarnings = FALSE)
dir.create(file.path(dd, "v2"), recursive = TRUE, showWarnings = FALSE)

# Number of rows in the sample table.
n <- 3000

# Draw n values from x with replacement.
f <- function(x) sample(x, n, replace = TRUE)

# The columns are generated in the same order as before so that the seed
# yields the same table.
set.seed(123)
d1 <- data.table(Ticker = f(LETTERS),
                 Date = f(seq.Date(as.Date("2016-01-01"), by = "month",
                                   length.out = n / 100)),
                 Open = f(c(1.2, 1.3)), High = f(c(1.2, 1.3)),
                 Low = f(c(1.2, 1.3)), Close = f(c(1.2, 1.3)),
                 Volume = f(1:10), OpenInt = f(1:10))
d1
#       Ticker       Date Open High Low Close Volume OpenInt
#    1:      H 2203-04-01  1.2  1.3 1.2   1.2      6       4
#    2:      N 2121-05-01  1.2  1.3 1.2   1.2      9       6
#    3:      E 2060-04-01  1.3  1.2 1.2   1.3      1       3
#    4:      V 2132-04-01  1.3  1.3 1.3   1.2      7       8
#    5:      F 2253-04-01  1.2  1.3 1.3   1.2      3      10
#  ---                                                     
# 2996:      J 2027-05-01  1.3  1.3 1.2   1.2      7       6
# 2997:      K 2177-05-01  1.2  1.3 1.2   1.2      5       4
# 2998:      S 2200-03-01  1.2  1.2 1.2   1.2      6       2
# 2999:      V 2110-05-01  1.3  1.3 1.3   1.2      4       3
# 3000:      Q 2043-05-01  1.2  1.3 1.2   1.2      3       5

# Write 100 identical copies of d1 into v1 and then 100 more into v2.
invisible(lapply(
  c(
    paste0(dd, "/v1/d", 1:100, ".txt"),
    paste0(dd, "/v2/d", 1:100, ".txt")
  ),
  function(target) fwrite(d1, target)
))

稍微修改了您的函数:

################################################################################

#' Baseline importer: the question's loop with the file listing simplified.
#'
#' Recursively lists the ".txt" files below `directory`, keeps those whose
#' full path matches `What_stocks`, skips empty files, reads each one with
#' readr and overwrites its `Ticker` column with the file name (without the
#' ".txt" extension). The combined table is saved to `Output_file_name` under
#' the object name `GPW_wse_stocks` and returned.
#'
#' @param directory Root directory that is searched recursively.
#' @param Output_file_name File passed to `save()`.
#' @param What_stocks Regular expression that a file's full path must match.
#' @return A data.frame with the columns Ticker, Date, Open, High, Low, Close,
#'   Volume and OpenInt; an empty data.frame when nothing matched.
yourFunction_modified <- function(directory, Output_file_name, What_stocks) {
  # `pattern` is a regular expression, not a glob: anchor it on the extension.
  l <- list.files(directory, recursive = TRUE, full.names = TRUE,
                  pattern = "\\.txt$")
  l <- l[grepl(What_stocks, l)]

  # Skip empty files.
  sizes <- file.size(l)
  l <- l[!is.na(sizes) & sizes > 0]

  # Namespaced calls fail loudly when a package is missing, which require()
  # did not. The column specification is the same for every file.
  price_cols <- readr::cols(
    Ticker = readr::col_character(),
    Date = readr::col_date(format = "%Y-%m-%d"),
    Open = readr::col_double(),
    High = readr::col_double(),
    Low = readr::col_double(),
    Close = readr::col_double(),
    Volume = readr::col_integer(),
    OpenInt = readr::col_integer()
  )

  # Start from NULL so the result is defined even when no file qualifies
  # (previously save() failed on a missing object in that case).
  GPW_wse_stocks <- NULL

  for (i in l) {
    dat <- readr::read_csv(i, col_types = price_cols)
    dat$Ticker <- sub("\\.txt$", "", basename(i))
    datt <- dplyr::select(dat, Ticker, Date, Open, High, Low, Close,
                          Volume, OpenInt)
    # NOTE: the incremental rbind() is kept deliberately. This function is the
    # slow reference implementation that the timing below measures.
    if (is.null(GPW_wse_stocks)) {
      GPW_wse_stocks <- datt
    } else {
      GPW_wse_stocks <- rbind(GPW_wse_stocks, datt)
    }
  }

  save(GPW_wse_stocks, file = Output_file_name)
  data.frame(GPW_wse_stocks)
}


# Time the baseline implementation on the generated test files.
system.time({
  x <- yourFunction_modified(
    directory = dd,
    Output_file_name = file.path(dirname(dd), "csvReadingTest2.Rdat"),
    What_stocks = "/d[0-9]"
  )
})

# Elapsed time: 25 - 18 seconds

我的函数:

#' Fast importer built on data.table.
#'
#' Recursively lists the ".txt" files below `directory`, keeps those whose
#' full path matches `What_stocks`, skips empty files, reads each one with
#' `fread()` and overwrites its `Ticker` column with the file name (without
#' the ".txt" extension). All tables are bound in a single `rbindlist()` call,
#' `Date` is converted to class Date, and the result is saved to
#' `Output_file_name` under the object name `dt` and returned.
#'
#' @param directory Root directory that is searched recursively.
#' @param Output_file_name File passed to `save()`.
#' @param What_stocks Regular expression that a file's full path must match.
#' @return A data.table restricted to the columns Ticker, Date, Open, High,
#'   Low, Close, Volume and OpenInt; an empty data.table when nothing matched.
myFun <- function(directory, Output_file_name, What_stocks) {
  # A plain character vector replaces Hmisc::Cs(), so Hmisc is not needed.
  necessary <- c("Ticker", "Date", "Open", "High", "Low", "Close",
                 "Volume", "OpenInt")

  # `pattern` is a regular expression, not a glob: anchor it on the extension.
  l <- list.files(directory, recursive = TRUE, full.names = TRUE,
                  pattern = "\\.txt$")
  l <- l[grepl(What_stocks, l)]
  sizes <- file.size(l)
  l <- l[!is.na(sizes) & sizes > 0]

  dtList <- lapply(l, function(i) {
    dat <- data.table::fread(i)
    data.table::set(dat, j = "Ticker",
                    value = sub("\\.txt$", "", basename(i)))
    # Delete unnecessary columns by reference.
    extra <- setdiff(colnames(dat), necessary)
    if (length(extra) > 0) {
      data.table::set(dat, j = extra, value = NULL)
    }
    dat
  })

  dt <- data.table::rbindlist(dtList, use.names = TRUE, fill = TRUE)

  if ("Date" %in% names(dt)) {
    date_col <- dt[["Date"]]
    # fastPOSIXct() only parses text. Use it when fread left the column as
    # character and fasttime is installed; as.Date() covers every other case
    # (NOTE(review): recent data.table versions may already return a date
    # class here - confirm against the installed version).
    if (is.character(date_col) &&
        requireNamespace("fasttime", quietly = TRUE)) {
      date_col <- as.Date(fasttime::fastPOSIXct(date_col))
    } else {
      date_col <- as.Date(date_col)
    }
    data.table::set(dt, j = "Date", value = date_col)
  }

  # Same column order as the readr-based implementation returns.
  ordered <- intersect(necessary, names(dt))
  if (length(ordered) > 0) {
    data.table::setcolorder(dt, ordered)
  }

  save(dt, file = Output_file_name)
  dt
}

# Time the data.table implementation on the same test files.
system.time({
  x2 <- myFun(
    directory = dd,
    Output_file_name = file.path(dirname(dd), "csvReadingTest2v2.Rdat"),
    What_stocks = "/d[0-9]"
  )
})

# Elapsed time: 6 - 4 seconds

# Compare the two implementations' results.
all.equal(target = as.data.table(x), current = x2)
# [1] TRUE
于 2018-01-29T09:02:16.040 回答
2

rbindlist(lapply(files, fread)) 非常快;但如果您有大量小文件,并且不需要保留文件名,那么最好直接用操作系统命令来合并文件。

由于提问者(OP)没有提供数据,这里先生成测试数据:10,000 个文件,每个文件 100 行。

# Work inside a scratch folder below the session's temporary directory.
setwd(tempdir())
dir.create("48492154")
setwd("48492154")

# Pool of 500 date strings running from 2012-01-01 up to today.
dates <- as.character(
  seq.Date(from = as.Date("2012-01-01"), to = Sys.Date(), length.out = 500)
)

library(data.table)

# Write 10,000 CSV files of 100 rows each, named 1.csv ... 10000.csv.
for (i in seq_len(1e4)) {
  DT <- data.table(
    Ticker = seq_len(100),
    Date = sample(dates, size = 100),
    Open = round(100 + runif(100), 1),
    Close = round(100 + runif(100), 1),
    Volume = sample(seq_len(100)),
    OpenInt = seq_len(100)
  )
  # Single-line progress indicator, rewritten in place on each pass.
  cat(i, "of 10,000\r")
  flush.console()
  fwrite(DT, paste0(i, ".csv"), showProgress = FALSE)
}

简单的方法(它还能自动处理重复的表头,并且得到的列类型(colClasses)更准确):

# Read every CSV file in the working directory and stack them in one call.
system.time({
  res <- rbindlist(
    lapply(
      list.files(pattern = "\\.csv"),
      function(csv_file) fread(csv_file)
    )
  )
})
#>   user  system elapsed 
#>   5.46    3.17    8.62 

使用 Windows 系统的 copy 命令:

# Concatenate all CSV files with the operating system, read the result once,
# then remove the header lines that were copied along with every file.
system.time({
  # Windows only
  shell("copy /b *.csv out.txt > dump.log")
  new_res <- fread("out.txt")

  if (ncol(new_res) > 0) {
    # A repeated header is a row in which every field equals its column name.
    # Testing all columns together, with NA counted as "not a header", keeps
    # data rows that contain missing values; filtering column by column with
    # `!=` dropped them, because an NA comparison is treated as FALSE.
    is_header <- Reduce(
      `&`,
      Map(
        function(column, col_name) !is.na(column) & column == col_name,
        as.list(new_res),
        names(new_res)
      )
    )
    new_res <- new_res[!is_header]

    # The embedded headers forced the affected columns to character;
    # restore their natural types now that those rows are gone.
    chr_cols <- names(new_res)[vapply(new_res, is.character, logical(1))]
    if (length(chr_cols) > 0) {
      new_res[, (chr_cols) := lapply(.SD, type.convert, as.is = TRUE),
              .SDcols = chr_cols]
    }
  }
})

#>   user  system elapsed 
#>   0.76    0.13    3.31 
于 2018-01-29T12:16:45.747 回答