0

我有一个具有这种结构的数据集:

# Example data: one row per observation, grouped by patient ID.
# L40 and K50 are indicator columns (1 = present, NA = not recorded).
ID <- c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3)
L40 <- c(1, NA, NA, NA, 1, NA, NA, NA, 1, NA, NA)
K50 <- c(NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, 1)
df <- data.frame(ID = ID, L40 = L40, K50 = K50)
# ID L40 K50
# 1   1   1  NA
# 2   1  NA  NA
# 3   1  NA  NA
# 4   1  NA  NA
# 5   2   1  NA
# 6   2  NA   1
# 7   2  NA  NA
# 8   3  NA  NA
# 9   3   1  NA
# 10  3  NA  NA
# 11  3  NA   1

当 L40 和 K50 列中出现缺失值时,我想结转该列中最后一个非缺失值,条件是 ID 与前一个 ID 相同,并且当前行中 L40 和 K50 中的值为空。我应用了以下代码:

    library(tidyr)
    library(dplyr)  # group_by()/ungroup() live in dplyr; tidyr does not export them
    # Forward-fill NAs within each ID. fill() handles every selected column
    # independently, so a value is also carried into rows where the other
    # column already holds a value -- which is why this does not give the
    # result wanted in the question.
    df2 <- df %>%
      group_by(ID) %>%
      fill(L40:K50) %>%
      ungroup()

这并没有达到我想要的。我希望仅当该行中的其他列(ID 除外)为空时,才将先前的非缺失值结转到下一行。这就是我要的:

    # Desired result: a value is carried forward within an ID only into rows
    # where both L40 and K50 are NA.
    ID <- c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3)
    L40 <- c(1, 1, 1, 1, 1, NA, NA, NA, 1, 1, NA)
    K50 <- c(NA, NA, NA, NA, NA, 1, 1, NA, NA, NA, 1)
    df3 <- data.frame(ID = ID, L40 = L40, K50 = K50)
df3
# ID L40 K50
# 1   1   1  NA
# 2   1   1  NA
# 3   1   1  NA
# 4   1   1  NA
# 5   2   1  NA
# 6   2  NA   1
# 7   2  NA   1
# 8   3  NA  NA
# 9   3   1  NA
# 10  3   1  NA
# 11  3  NA   1
4

2 回答 2

0

我们可以使用 zoo 包中的 `na.locf`:

library(data.table)
library(zoo)
# setDT() converts `df` to a data.table by reference.
# Per ID: if any K50 value after the group's first row is NA, forward-fill
# every column of the group; otherwise return the group unchanged.
# na.rm = FALSE keeps leading NAs so each filled column keeps the group's
# length; by default na.locf() drops leading NAs, which yields columns of
# unequal length within one group.
# NOTE(review): the condition is evaluated once per group, not per row --
# confirm it matches the "fill only rows where every other column is NA"
# requirement stated in the question.
setDT(df)[,
  if (any(is.na(K50[-1]))) {
    lapply(.SD, na.locf, na.rm = FALSE)
  } else {
    .SD
  },
  by = ID
]
#   ID L40 K50
#1:  1   1  NA
#2:  1   1  NA
#3:  1   1  NA
#4:  1   1  NA
#5:  2   1  NA
#6:  2  NA   1
#7:  3  NA   1
#8:  3  NA   1
#9:  3  NA   1

使用 `dplyr` 的另一种方案:

library(dplyr)
library(zoo)  # na.locf()
df %>%
  # ind = number of NA cells in the row, counted over all columns
  mutate(ind = rowSums(is.na(.))) %>%
  group_by(ID) %>%
  # If any row of the group has more than one NA cell, forward-fill L40 and
  # K50 for the whole group (keeping leading NAs); otherwise leave them as is.
  # across() replaces the deprecated mutate_each(funs(...)) form.
  # NOTE(review): the test is per group, not per row -- confirm this is the
  # intended rule.
  mutate(across(
    L40:K50,
    ~ if (any(ind > 1)) na.locf(.x, na.rm = FALSE) else .x
  )) %>%
  ungroup() %>%
  select(-ind)
#      ID   L40   K50
#   <dbl> <dbl> <dbl>
#1     1     1    NA
#2     1     1    NA
#3     1     1    NA
#4     1     1    NA 
#5     2     1    NA
#6     2    NA     1
#7     3    NA     1
#8     3    NA     1
#9     3    NA     1
于 2016-07-28T09:28:25.383 回答
0

我在这个问题上研究了一段时间,并凭借我对 R 的有限了解想出了以下变通办法。为了便于说明,我在原始数据框中添加了一个日期列:

# Example data extended with a `date` column that orders the observations
# within each ID.
ID <- c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3)
date <- c(1, 2, 3, 4, 1, 2, 3, 1, 2, 3, 4)
L40 <- c(1, 1, NA, NA, 1, NA, NA, NA, 1, NA, NA)
K50 <- c(NA, 1, 1, NA, NA, 1, NA, NA, NA, NA, 1)
df <- data.frame(ID = ID, date = date, L40 = L40, K50 = K50)

这是我所做的:

# Reshape the diagnosis columns (L40:K50) into long form: the column name goes
# into `diagnos` and its value into `dummy`. Keep only the rows where
# dummy == 1 (the diagnosis is present), then sort by ID and date.
df1 <- df %>% gather(diagnos, dummy, L40:K50) %>% filter(dummy==1) %>% arrange(ID, date)

# Within each (ID, date) group, paste all diagnosis names into one
# space-separated string `diag`, then drop the per-diagnosis columns.
# NOTE(review): mutate() keeps one row per diagnosis, so an (ID, date) with
# several diagnoses yields repeated rows here -- presumably the reason for the
# distinct() at the end; confirm.
df2 <- df1 %>% group_by(ID, date) %>% mutate(diag = paste(diagnos, collapse=" ")) %>% select(-diagnos, -dummy)

# Convert both data frames into data.tables in preparation for the join.
Dt1 <- data.table(df)
Dt2 <- data.table(df2)

# Key both tables on (ID, date); the join below uses these keys.
setkey(Dt1, ID, date)
setkey(Dt2, ID, date)

# Each observation in Dt1 is matched with the observation in Dt2 that has the
# same ID and date or, if that date is not present in Dt2, with the nearest
# previous date (roll = TRUE). distinct() then removes duplicated rows.
final <- Dt2[Dt1, roll=TRUE] %>% distinct()

这样,每个诊断的名称都会向后结转,直到出现下一个观察到的诊断为止。

于 2016-07-29T14:21:03.330 回答