0

我有一个纵向数据框,其中包含很多看起来像这样的缺失值。

ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, NA , 2, 0,NA, NA, 3, NA,0, NA, 2, NA, 1,NA,NA)
df = data.frame(ID, date, cond,var)

我想基于两个条件继续最后的观察:

1)什么cond=0时候应该对感兴趣的变量的较高值进行观察。

2)什么cond=1时候应该结转感兴趣变量的较低值。

有谁知道我如何以一种优雅的方式做到这一点?

最终数据集应如下所示

ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, 1 , 2, 0, 0, NA, 3, 3, 0, 0,2,2,2,2,2)
final = data.frame(ID, date, cond,var)

到目前为止,我能够继续最后的观察,但我无法强加条件

library(zoo)
df <- df %>%
  group_by(ID) %>% 
  mutate(var = 
           na.locf(var, na.rm = F))

欢迎任何建议

4

2 回答 2

2

这是accumulate2ie的使用

df%>%
   group_by(ID)%>%
   mutate(d = unlist(accumulate2(var,cond[-1],function(z,x,y) if(y) min(z,x,na.rm=TRUE) else max(z,x,na.rm=TRUE))))
# A tibble: 15 x 5
# Groups:   ID [3]
      ID  date  cond   var     d
   <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1     1     0     1     1
 2     1     2     0    NA     1
 3     1     3     0     2     2
 4     1     4     1     0     0
 5     1     5     0    NA     0
 6     2     1     0    NA    NA
 7     2     2     0     3     3
 8     2     3     0    NA     3
 9     2     4     1     0     0
10     2     5     0    NA     0
11     3     1     0     2     2
12     3     2     0    NA     2
13     3     3     0     1     2
14     3     4     0    NA     2
15     3     5     0    NA     2
于 2019-11-26T18:53:47.963 回答
1

我想,如果我明白你的追求是什么?

ID = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
date = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5)
cond = c(0,0,0,1,0,0,0,0,1,0,0,0,0,0,0)
var = c(1, NA , 2, 0,NA, NA, 3, NA,0, NA, 2, NA, 1,NA,NA)
df = data.frame(ID, date, cond,var)

使用case_when你可以做一些条件检查。我不确定您是否要返回所有“ID”字段的最小值,但这会查看条件,然后滞后或导致找到非缺失值

library(dplyr)
df %>% 
  mutate(var_imput = case_when(
    cond == 0 & is.na(var)~lag(x = var, n = 1, default = NA),
    cond == 1 & is.na(var)~lead(x = var, n = 1, default = NA),
    TRUE~var
  ))

产生:

   ID date cond var var_imput
1   1    1    0   1         1
2   1    2    0  NA         1
3   1    3    0   2         2
4   1    4    1   0         0
5   1    5    0  NA         0
6   2    1    0  NA        NA
7   2    2    0   3         3
8   2    3    0  NA         3
9   2    4    1   0         0
10  2    5    0  NA         0
11  3    1    0   2         2
12  3    2    0  NA         2
13  3    3    0   1         1
14  3    4    0  NA         1
15  3    5    0  NA        NA

如果要按 ID 分组,则可以按 ID 生成一个插补表,然后将其与原始表连接,如下所示:

# enerate input table
input_table <- df %>% 
  group_by(ID) %>% 
  summarise(min = min(var, na.rm = T),
            max = max(var, na.rm = T)) %>% 
  gather(cond, value, -ID) %>% 
  mutate(cond = ifelse(cond == "min", 0, 1))

# Join and impute missing
df %>% 
  left_join(input_table,by = c("ID", "cond")) %>% 
  mutate(var_imput = ifelse(is.na(var), value, var))
于 2019-11-26T18:55:48.470 回答