0

我有一张大桌子,上面有几个晚上的时间戳。列是哪个晚上的 id,那个晚上的时间戳的 id 和那个时间戳的炉膛速率,它看起来像这样:

allData <- data.table(nightNo=c(1,1,1,1,1,1,2,2,2,2), withinNightNo=c(1,2,3,4,5,6,1,2,3,4), HR=c(1:10))

nightNo withinNightNo HR
   1             1     1
   1             2     2
   1             3     3
   1             4     4
   1             5     5
   1             6     6
   2             1     7
   2             2     8
   2             3     9
   2             4    10

我想在表中添加两个新列,即从同一晚的最后 10 行中 HR 的斜率和累积和。我使用线性回归计算斜率并将 cumsum 定义为:CUMSUM n = MAX(CUMSUM n-1 , 0) + (value n - MEAN(value 1-n ))。结果应如下所示:

nightNo withinNightNo  HR HRSlope HRCumsum
    1             1     1     NaN      0.0
    1             2     2       1      0.5
    1             3     3       1      1.5
    1             4     4       1      3.0
    1             5     5       1      5.0
    1             6     6       1      7.5
    2             1     7     NaN      0.0
    2             2     8       1      0.5
    2             3     9       1      1.5
    2             4    10       1      3.0

我已经使用 for 循环为这两个函数创建了代码。它们可以工作,但是我的表太大了,甚至计算单个值的斜率/累积和都需要很长时间。我的代码如下所示:

# Add HRSlope column
allData$HRSlope <- 0

for(i in 1:nrow(allData)){
    # Get points from up to last 10 seconds of the same night
    start <- ifelse(i < 11, 1, (i-10))
    points <- filter(allData[start:i,], nightNo == allData[i,]$nightNo)[, c("withinNightNo", "HR")]

    # Calculate necessary values
    meanX <- mean(points$withinNightNo)
    meanY <- mean(points$HR)
    meanXY <- mean(points$withinNightNo * points$HR)
    meanX2 <- mean(points$withinNightNo^2)

    # Calculate slope and add to table
    allData[i,]$HRSlope <- (meanX * meanY - meanXY) / (meanX^2 - meanX2)

    cat(i, "\n")
}

# Add cumsum column, and add first value to sum
allData$HRCumsum <- 0
Sum <- allData[1,]$HR

for(i in 2:nrow(allData)){
  # Get sum and average of HR in night so far, reset Sum if new night started
  Sum <- allData[i,]$HR + ifelse(allData[i,]$nightNo != allData[i-1,]$nightNo, 0 , Sum )
  Average <- Sum / allData[i,]$withinNightNo

  # Get previous cumsum, if available
  pCumsum <- ifelse(allData[i,]$nightNo != allData[i-1,]$nightNo, 0 , allData[i-1,]$HRCumsum )

  # Calculate current cumsum
  allData[i,]$HRCumsum <- max(pCumsum, 0) + (allData[i,]$HR - Average)

  cat(i, "\n")
}

有没有更有效的方法来做到这一点,大概没有 for 循环?

编辑:

我已经能够在一定程度上提高我的斜率函数的速度。然而,它仍然使用一个 forloop,它实际上在一个字段中输入了一个错误的值 9 次,然后才输入了正确的值。关于如何解决这两个问题的任何想法?

getSlope <- function(x, y) {
    # Calculate necessary values
    meanX <- mean(x)
    meanY <- mean(y)
    meanXY <- mean(x * y)
    meanX2 <- mean(x^2)

    # Calculate slope
    return((meanX * meanY - meanXY) / (meanX^2 - meanX2))
}

# Loop back to 1
for(i in max(allData):1){
    # Prevent i<=0
    low <- ifelse(i < 10, 0, i-10)

    # Grab up to last 10 points and calculate slope
    allData[with(allData, withinNightNo > i-10 & withinNightNo <= i), slope := getSlope(withinNightNo, HR), by= nightNo]
}

编辑2:

我也能够稍微提高我的 cumsum,但它与斜坡有同样的问题。除此之外,它需要更大的表块,因为它需要获取平均值,并且需要对所有数据进行两次循环。任何关于改进这一点的想法也将不胜感激。

# Calculate part of the cumsum
getCumsumPart <- function(x){
    return(x-mean(x))
}

# Calculate valueN - mean(value1:N)
for(i in max(allData$withinNightNo):1){
   allData[with(allData, withinNightNo <= i), cumsumPart:= 
   getCumsumPart(HR), by=nightNo]
}

# Calculate  + max(cumsumN-1, 0)
for(i in max(allData$withinNightNo):1){
    allData[with(allData, withinNightNo <= i & cumsumPart > 0), cumsum:= sum(cumsumPart), by=nightNo]
}

# Remove part table
allData$cumsumPart <- NULL

# Set NA values to 0
allData[with(allData, is.na(cumsum)), cumsum := 0]
4

1 回答 1

1

试试这个方法

library(dplyr)
library(caTools)

allData <- data.frame(nightNo=c(1,1,1,1,1,1,2,2,2,2), 
                      withinNightNo=c(1,2,3,4,5,6,1,2,3,4), 
                      HR=c(1:10))

group_fun <- function(grouped_df, window=10L) {
  # slope
  mean_x <- runmean(grouped_df$withinNightNo, window, align="right")
  mean_y <- runmean(grouped_df$HR, window, align="right")
  mean_xy <- runmean(grouped_df$withinNightNo * grouped_df$HR, window, align="right")
  mean_xx <- runmean(grouped_df$withinNightNo * grouped_df$withinNightNo, window, align="right")
  grouped_df$slope <- (mean_x * mean_y - mean_xy) / (mean_x^2 - mean_xx)

  # cumsum
  partial <- grouped_df$HR - mean_y # from above
  # the "loop" is unavoidable here, I think
  cumsum <- 0
  grouped_df$cumsum <- sapply(partial, function(val) {
    cumsum <<- max(cumsum, 0) + val
    cumsum
  })

  grouped_df
}

out <- allData %>%
  group_by(nightNo) %>%
  do(group_fun(., window=3L)) # change window as desired
于 2018-05-23T22:24:28.623 回答