我正在尝试加快 R 中的一些代码。我认为可以替换我的循环方法(可能使用某种形式的 lapply 或使用 sqldf),但我似乎无法弄清楚如何。
基本前提是我有一个包含 ~50 个子目录的父目录,每个子目录包含 ~200 个 CSV 文件(总共 10,000 个 CSV)。这些 CSV 文件中的每一个都包含约 86,400 行(数据每天按秒计算)。
该脚本的目标是计算每个文件的两个时间间隔的平均值和标准差,然后为每个子目录制作一个汇总图,如下所示:
library(timeSeries)
library(ggplot2)
# list subdirectories in parent directory
dir <- list.dirs(path = "/ParentDirectory", full.names = TRUE, recursive = FALSE)
num <- (length(dir))
# iterate through all subdirectories
for (idx in 1:num){
# declare empty vectors to fill for each subdirectory
DayVal <- c()
DayStd <- c()
NightVal <- c()
NightStd <- c()
date <- as.Date(character())
setwd(dir[idx])
filenames <- list.files(path=getwd())
numfiles <- length(filenames)
# for each file in the subdirectory
for (i in c(1:numfiles)){
day <- read.csv(filenames[i], sep = ',')
today <- as.Date(day$time[1], "%Y-%m-%d")
# setting interval for times of day we care about <- SQL seems like it may be useful here but I couldn't get read.csv.sql to recognize hourly intervals
nightThreshold <- as.POSIXct(paste(today, "03:00:00"))
dayThreshold <- as.POSIXct(paste(today, "15:00:00"))
nightInt <- day[(as.POSIXct(day$time) >= nightThreshold & as.POSIXct(day$time) <= (nightThreshold + 3600)) , ]
dayInt <- day[(as.POSIXct(day$time) >= dayThreshold & as.POSIXct(day$time) <= (dayThreshold + 3600)) , ]
#check some thresholds in the data for that time period
if (sum(nightInt$val, na.rm=TRUE) < 5){
NightMean <- mean(nightInt$val, na.rm =TRUE)
NightSD <-sd(nightInt$val, na.rm =TRUE)
} else {
NightMean <- NA
NightSD <- NA
}
if (sum(dayInt$val, na.rm=TRUE) > 5){
DayMean <- mean(dayInt$val, na.rm =TRUE)
DaySD <-sd(dayInt$val, na.rm =TRUE)
} else {
DayMean <- NA
DaySD <- NA
}
NightVal <- c(NightVal, NightMean)
NightStd <- c(NightStd, NightSD)
DayVal <- c(gsrDayVal, DayMean)
DayStd <- c(gsrDayStd, DaySD)
date <-c(date, as.Date(today))
}
df<-data.frame(date,DayVal,DayStd,NightVal, NightStd)
# plot for the subdirectory
p1 <- ggplot() +
geom_point(data = df, aes(x = date, y = gsrDayVal, color = "Day Average")) +
geom_point(data = df, aes(x = date, y = gsrDayStd, color = "Day Standard Dev")) +
geom_point(data = df, aes(x = date, y = gsrNightVal, color = "Night Average")) +
geom_point(data = df, aes(x = date, y = gsrNightStd, color = "Night Standard Dev")) +
scale_colour_manual(values = c("steelblue", " turquoise3", "purple3", "violet"))
}
非常感谢您提供的任何建议!