0

我有一个关于过去 30 年股票回报率的数据集。现在我需要删除公司的所有行(年),直到第一行,这不是 NA。但是我需要为该公司保留所有其他行,这可能会在以后发生。然后代码应该跳转到下一个公司(Id)并重新启动该过程。

我已经尝试了以下代码,但说实话我有点迷路了。

cleaning <- function (DT, colnames){
  for(cols in colnames) 
    if(is.na(cols)){
      DT[, cols := NULL]
    } else {
      break
    }
}
MergedDT[, cleaning(MergedDT, RET), by = "Id"]

我收到了该代码的以下警告:

> 1: In `[.data.table`(DT, , `:=`(cols, NULL)) :   Adding new column
> 'cols' then assigning NULL (deleting it).

此外,我认为有一种更有效的方法来解决这个问题。

4

3 回答 3

0

, 的组合对group_by每个公司(或cyl在本例中的每个)进行分析,并do找到第一个年份(或mpg)不是 NA 的实例应该起作用:

df <- structure(list(model = c("Datsun 710", "Merc 240D", "Merc 230", 
"Fiat 128", "Honda Civic", "Toyota Corolla", "Toyota Corona", 
"Fiat X1-9", "Porsche 914-2", "Lotus Europa", "Volvo 142E", "Mazda RX4", 
"Mazda RX4 Wag", "Hornet 4 Drive", "Valiant", "Merc 280", "Merc 280C", 
"Ferrari Dino", "Hornet Sportabout", "Duster 360", "Merc 450SE", 
"Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental", 
"Chrysler Imperial", "Dodge Challenger", "AMC Javelin", "Camaro Z28", 
"Pontiac Firebird", "Ford Pantera L", "Maserati Bora"), mpg = c(NA, 
NA, NA, NA, NA, 33.9, 21.5, NA, 26, 30.4, 21.4, NA, NA, NA, 18.1, 
19.2, 17.8, 19.7, NA, NA, NA, NA, 15.2, 10.4, 10.4, 14.7, 15.5, 
15.2, 13.3, 19.2, 15.8, 15), cyl = c(4, 4, 4, 4, 4, 4, 4, 4, 
4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8), disp = c(108, 146.7, 140.8, 78.7, 75.7, 71.1, 120.1, 
79, 120.3, 95.1, 121, 160, 160, 258, 225, 167.6, 167.6, 145, 
360, 360, 275.8, 275.8, 275.8, 472, 460, 440, 318, 304, 350, 
400, 351, 301), hp = c(93, 62, 95, 66, 52, 65, 97, 66, 91, 113, 
109, 110, 110, 110, 105, 123, 123, 175, 175, 245, 180, 180, 180, 
205, 215, 230, 150, 150, 245, 175, 264, 335), drat = c(3.85, 
3.69, 3.92, 4.08, 4.93, 4.22, 3.7, 4.08, 4.43, 3.77, 4.11, 3.9, 
3.9, 3.08, 2.76, 3.92, 3.92, 3.62, 3.15, 3.21, 3.07, 3.07, 3.07, 
2.93, 3, 3.23, 2.76, 3.15, 3.73, 3.08, 4.22, 3.54), wt = c(2.32, 
3.19, 3.15, 2.2, 1.615, 1.835, 2.465, 1.935, 2.14, 1.513, 2.78, 
2.62, 2.875, 3.215, 3.46, 3.44, 3.44, 2.77, 3.44, 3.57, 4.07, 
3.73, 3.78, 5.25, 5.424, 5.345, 3.52, 3.435, 3.84, 3.845, 3.17, 
3.57), qsec = c(18.61, 20, 22.9, 19.47, 18.52, 19.9, 20.01, 18.9, 
16.7, 16.9, 18.6, 16.46, 17.02, 19.44, 20.22, 18.3, 18.9, 15.5, 
17.02, 15.84, 17.4, 17.6, 18, 17.98, 17.82, 17.42, 16.87, 17.3, 
15.41, 17.05, 14.5, 14.6), vs = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 
1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0), am = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1), gear = c(4, 
4, 4, 4, 4, 4, 3, 4, 5, 5, 4, 4, 4, 3, 3, 4, 4, 5, 3, 3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 3, 5, 5), carb = c(1, 2, 2, 1, 2, 1, 1, 
1, 2, 2, 2, 4, 4, 1, 1, 4, 4, 6, 2, 4, 3, 3, 3, 4, 4, 4, 2, 2, 
4, 2, 4, 8)), row.names = c(NA, -32L), class = c("tbl_df", "tbl", 
"data.frame"))

df %>%
  group_by(cyl) %>%
  do(
    .[first(which(!is.na(.$mpg))):nrow(.),]
  )
于 2019-06-11T08:44:47.573 回答
0
DT[DT[,  .I[cumsum(!is.na(RET)) > 0], ID]$V1]

   ID  RET
1:  1 0.02
2:  1   NA
3:  2 0.01
4:  2   NA
5:  3 0.01
6:  3 0.05
7:  3 0.02

数据(从 chinsoon12 窃取(原始问题海报未能提供可重复的数据)):

DT <- data.table(ID=c(1,1,1,2,2,2,2,3,3,3), RET=c(NA,0.02,NA, NA,NA,0.01,NA, 0.01,0.05,0.02))
于 2019-06-11T13:53:30.777 回答
0

Iiuc,您希望为每个 ID 修剪开始的 NA 回报,这是一个选项:

DT[-DT[,.I[seq_len(match(TRUE, !is.na(RET)) - 1L)], .(ID)]$V1]

输出:

   ID  RET
1:  1 0.02
2:  1   NA
3:  2 0.01
4:  2   NA
5:  3 0.01
6:  3 0.05
7:  3 0.02

数据:

DT <- data.table(ID=c(1,1,1,2,2,2,2,3,3,3), RET=c(NA,0.02,NA, NA,NA,0.01,NA, 0.01,0.05,0.02))

DT

    ID  RET
 1:  1   NA
 2:  1 0.02
 3:  1   NA
 4:  2   NA
 5:  2   NA
 6:  2 0.01
 7:  2   NA
 8:  3 0.01
 9:  3 0.05
10:  3 0.02
于 2019-06-11T10:07:38.497 回答