使用data.table
我设法得到了一个快约 4 倍的解决方案:
# Creating test data:
# Build the directory tree Test/csvReadingTest2/{v1,v2} and fill each leaf
# with 100 copies of the same CSV table, so the file readers further down
# have 200 files to process.
dd <- "Test/csvReadingTest2"
# recursive = TRUE also creates "Test" and `dd` on the way; showWarnings = FALSE
# keeps the script quiet when it is re-run and the directories already exist.
dir.create(file.path(dd, "v1"), recursive = TRUE, showWarnings = FALSE)
dir.create(file.path(dd, "v2"), recursive = TRUE, showWarnings = FALSE)
n <- 3000
# Draw `n` values from `x` with replacement (used for every column of `d1`).
f <- function(x) sample(x, n, replace = TRUE)
library(data.table)
set.seed(123)
d1 <- data.table(
  Ticker = f(LETTERS),
  Date = f(seq.Date(as.Date("2016-01-01"), by = "month",
                    length.out = n / 100)),
  Open = f(c(1.2, 1.3)), High = f(c(1.2, 1.3)),
  Low = f(c(1.2, 1.3)), Close = f(c(1.2, 1.3)),
  Volume = f(1:10), OpenInt = f(1:10)
)
d1
# NOTE(review): the dates printed below (e.g. 2203-04-01) lie outside the
# 30 monthly dates starting at 2016-01-01 that the code above generates; the
# listing looks like it came from a different version of the script --
# regenerate to confirm.
# Ticker Date Open High Low Close Volume OpenInt
# 1: H 2203-04-01 1.2 1.3 1.2 1.2 6 4
# 2: N 2121-05-01 1.2 1.3 1.2 1.2 9 6
# 3: E 2060-04-01 1.3 1.2 1.2 1.3 1 3
# 4: V 2132-04-01 1.3 1.3 1.3 1.2 7 8
# 5: F 2253-04-01 1.2 1.3 1.3 1.2 3 10
# ---
# 2996: J 2027-05-01 1.3 1.3 1.2 1.2 7 6
# 2997: K 2177-05-01 1.2 1.3 1.2 1.2 5 4
# 2998: S 2200-03-01 1.2 1.2 1.2 1.2 6 2
# 2999: V 2110-05-01 1.3 1.3 1.3 1.2 4 3
# 3000: Q 2043-05-01 1.2 1.3 1.2 1.2 3 5
# Write d1 ... d100 into both sub-directories; invisible() hides the list of
# NULLs that lapply() would otherwise print.
invisible(lapply(1:100, function(x) {
  fwrite(d1, file.path(dd, "v1", paste0("d", x, ".txt")))
}))
invisible(lapply(1:100, function(x) {
  fwrite(d1, file.path(dd, "v2", paste0("d", x, ".txt")))
}))
稍微修改了您的函数:
################################################################################
# Read every non-empty ".txt" CSV file below `directory` whose path matches
# `What_stocks`, stack the files into one table, save it and return it.
#
# Args:
#   directory:        root folder; searched recursively.
#   Output_file_name: path handed to save(); the table is stored in that file
#                     under the object name `GPW_wse_stocks`.
#   What_stocks:      regular expression matched against the full file paths.
#
# Returns:
#   A data.frame with the columns Ticker, Date, Open, High, Low, Close,
#   Volume, OpenInt. Ticker is overwritten with the file name minus ".txt".
#
# Stops with an error when no non-empty matching file is found.
yourFunction_modified <- function(directory, Output_file_name, What_stocks) {
  library(dplyr)
  library(readr)

  # The earlier file listing (a data.frame of dir() results, arranged and then
  # filtered with str_count()) is replaced by list.files() + grepl().
  # `pattern` is a regular expression, not a glob: anchor on the extension so
  # names such as "d1.txt.bak" are not picked up.
  paths <- list.files(directory, pattern = "\\.txt$", recursive = TRUE,
                      full.names = TRUE)
  paths <- paths[grepl(What_stocks, paths)]

  # The column specification is the same for every file, so build it once.
  col_spec <- cols(
    Ticker = col_character(),
    Date = col_date(format = "%Y-%m-%d"),
    Open = col_double(), High = col_double(),
    Low = col_double(), Close = col_double(),
    Volume = col_integer(),
    OpenInt = col_integer()
  )

  # NOTE(review): growing the table with rbind() inside the loop copies it on
  # every iteration; it is left as is because this function appears to serve
  # as the timing baseline for the faster version in this file.
  GPW_wse_stocks <- NULL
  for (path in paths) {
    if (file.info(path)$size == 0) {
      next
    }
    dat <- read_csv(path, col_types = col_spec)
    # Ticker comes from the file name, not from the file contents.
    dat$Ticker <- sub("\\.txt$", "", basename(path))
    datt <- dat %>%
      select(Ticker, Date, Open, High, Low, Close, Volume, OpenInt)
    if (is.null(GPW_wse_stocks)) {
      GPW_wse_stocks <- datt
    } else {
      GPW_wse_stocks <- rbind(GPW_wse_stocks, datt)
    }
  }

  if (is.null(GPW_wse_stocks)) {
    stop("No non-empty '.txt' file matching '", What_stocks,
         "' found under '", directory, "'", call. = FALSE)
  }

  save(GPW_wse_stocks, file = Output_file_name)
  data.frame(GPW_wse_stocks)
}
# Time the baseline reader on the generated test files; the result is kept
# in `x` for the comparison at the end of the script.
system.time({
  x <- yourFunction_modified(
    directory = dd,
    Output_file_name = file.path(dirname(dd), "csvReadingTest2.Rdat"),
    What_stocks = "/d[0-9]"
  )
})
# 25 - 18 sec
我的函数:
# data.table version of the reader: load every non-empty ".txt" CSV file
# below `directory` whose path matches `What_stocks`, bind the files into one
# table, save it and return it.
#
# Args:
#   directory:        root folder; searched recursively.
#   Output_file_name: path handed to save(); the table is stored in that file
#                     under the object name `dt`.
#   What_stocks:      regular expression matched against the full file paths.
#
# Returns:
#   A data.table restricted to the columns Ticker, Date, Open, High, Low,
#   Close, Volume, OpenInt (those present in the files). Ticker is overwritten
#   with the file name minus ".txt"; Date is converted to class Date.
#
# Stops with an error when no non-empty matching file is found.
myFun <- function(directory, Output_file_name, What_stocks) {
  library(data.table)
  library(fasttime)

  necessary <- c("Ticker", "Date", "Open", "High", "Low", "Close",
                 "Volume", "OpenInt")

  # `pattern` is a regular expression, not a glob: anchor on the extension so
  # names such as "d1.txt.bak" are not picked up.
  paths <- list.files(directory, pattern = "\\.txt$", recursive = TRUE,
                      full.names = TRUE)
  paths <- paths[grepl(What_stocks, paths)]
  paths <- paths[file.info(paths)$size != 0]
  if (length(paths) == 0L) {
    stop("No non-empty '.txt' file matching '", What_stocks,
         "' found under '", directory, "'", call. = FALSE)
  }

  dtList <- lapply(paths, function(path) {
    dat <- fread(path)
    # Ticker comes from the file name, not from the file contents.
    ticker <- sub("\\.txt$", "", basename(path))
    set(dat, j = "Ticker", value = rep(ticker, nrow(dat)))
    # Delete unnecessary columns by reference.
    extra <- setdiff(colnames(dat), necessary)
    if (length(extra) > 0L) {
      dat[, (extra) := NULL]
    }
    dat
  })

  dt <- rbindlist(dtList, use.names = TRUE, fill = TRUE, idcol = FALSE)
  dt[, Date := as.Date(fastPOSIXct(Date))]
  save(dt, file = Output_file_name)
  # The trailing [] makes the result print normally after the := update above.
  dt[]
}
# Time the data.table reader on the same files; the result is kept in `x2`.
system.time({
  x2 <- myFun(
    directory = dd,
    Output_file_name = file.path(dirname(dd), "csvReadingTest2v2.Rdat"),
    What_stocks = "/d[0-9]"
  )
})
# 6 - 4 sec
# Check that both readers produced the same table.
all.equal(as.data.table(x), x2)
# [1] TRUE