1

与我在这里提出的问题相关:查找日期是否在多对矢量化日期之间重叠

初始数据示例:

  person start_loc start_date end_date.1 end_date.2 end_date.3 end_loc.1 end_loc.2 end_loc.3
1      1         a 2021-02-10 2021-02-17       <NA>       <NA>         g                    
2      2         a 2021-01-30 2020-09-29 2020-12-12 2021-02-04         a         a         g
3      3         g 2020-12-04       <NA>       <NA>       <NA>                              
4      4         r 2020-12-09 2020-12-12 2020-12-14 2021-01-05         c         c         g
5      5         t 2021-03-22 2021-03-25 2021-03-29       <NA>         b         t          
6      6         b 2021-04-04 2021-04-07 2021-04-09       <NA>         b         t          

# Sample data: one row per person with a start location/date and up to three
# candidate (end_date, end_loc) pairs stored in wide form. Missing end dates
# are NA; missing end locations are empty strings.
example <- data.frame(
  person = 1:6,
  start_loc = c("a", "a", "g", "r", "t", "b"),
  start_date = as.Date(c(
    "2021-02-10", "2021-01-30", "2020-12-04",
    "2020-12-09", "2021-03-22", "2021-04-04"
  )),
  end_date.1 = as.Date(c(
    "2021-02-17", "2020-09-29", NA,
    "2020-12-12", "2021-03-25", "2021-04-07"
  )),
  end_date.2 = as.Date(c(
    NA, "2020-12-12", NA,
    "2020-12-14", "2021-03-29", "2021-04-09"
  )),
  end_date.3 = as.Date(c(NA, "2021-02-04", NA, "2021-01-05", NA, NA)),
  end_loc.1 = c("g", "a", "", "c", "b", "b"),
  end_loc.2 = c("", "a", "", "c", "t", "t"),
  end_loc.3 = c("", "g", "", "g", "", ""),
  stringsAsFactors = FALSE
)

我的数据是这样排列的:每个 person 占一行,每行都有一个 start_date 和一个 start_loc。我想知道哪些人有

  • 在 start_date 的 7 天之内的 end_date
  • 并且如果有两对或更多对符合此条件,则优先考虑 end_loc 与 start_loc 匹配的那一对
  • 否则取最早的那一对。

所以所需的输出看起来像:

 person start_loc start_date end_date.1 end_date.2 end_date.3 end_loc.1 end_loc.2 end_loc.3   end_date end_loc
1      1         a 2021-02-10 2021-02-17       <NA>       <NA>         g                     2021-02-17       g
2      2         a 2021-01-30 2020-09-29 2020-12-12 2021-02-04         a         a         g       <NA>        
3      3         g 2020-12-04       <NA>       <NA>       <NA>                                     <NA>        
4      4         r 2020-12-09 2020-12-12 2020-12-14 2021-01-05         c         c         g 2020-12-12       c
5      5         t 2021-03-22 2021-03-25 2021-03-29       <NA>         b         t           2021-03-29       t
6      6         b 2021-04-04 2021-04-07 2021-04-09       <NA>         b         t           2021-04-07       b

我沿用了上一个问题中的一些技术,例如使用 across、c_across 和 rowwise,但我似乎无法让 R 返回单个输出。这可能吗?我是否需要再次把数据整理成长格式?

4

2 回答 2

2

抱歉延迟回复,但你可以做这样的事情

  • person 2 应该有一个结果(也许你的输入中有一个错字)
  • 在数据中用 NA 替换空字符串 ''
# Same sample data as in the question, except that missing end locations are
# encoded as NA instead of empty strings, so that empty (end_date, end_loc)
# pairs can be dropped as missing when the data is reshaped to long form.
example <- data.frame(
  person = 1:6,
  start_loc = c("a", "a", "g", "r", "t", "b"),
  start_date = as.Date(c(
    "2021-02-10", "2021-01-30", "2020-12-04",
    "2020-12-09", "2021-03-22", "2021-04-04"
  )),
  end_date.1 = as.Date(c(
    "2021-02-17", "2020-09-29", NA,
    "2020-12-12", "2021-03-25", "2021-04-07"
  )),
  end_date.2 = as.Date(c(
    NA, "2020-12-12", NA,
    "2020-12-14", "2021-03-29", "2021-04-09"
  )),
  end_date.3 = as.Date(c(NA, "2021-02-04", NA, "2021-01-05", NA, NA)),
  end_loc.1 = c("g", "a", NA, "c", "b", "b"),
  end_loc.2 = c(NA, "a", NA, "c", "t", "t"),
  end_loc.3 = c(NA, "g", NA, "g", NA, NA),
  stringsAsFactors = FALSE
)

library(tidyverse)
# Attach to every person the single best (end_date, end_loc) pair: an end date
# 0 to 7 days after start_date, preferring a pair whose end_loc equals
# start_loc and otherwise taking the earliest one. People without a qualifying
# pair get NA in both new columns via the left join.
example %>%
  left_join(
    example %>%
      # One row per (person, candidate number): the text before the dot in
      # each column name becomes the output column (end_date / end_loc), the
      # text after it becomes `number`. Pairs that are entirely NA are dropped.
      pivot_longer(
        cols = !c(person, start_loc, start_date),
        names_sep = "\\.",
        names_to = c(".value", "number"),
        values_drop_na = TRUE
      ) %>%
      mutate(
        days_after = end_date - start_date,
        within_week = days_after >= 0 & days_after <= 7,
        same_loc = start_loc == end_loc
      ) %>%
      filter(within_week) %>%
      # Matching locations sort first; ties are broken by the earliest date.
      arrange(person, desc(same_loc), days_after) %>%
      group_by(person) %>%
      summarise(
        end_date = first(end_date),
        end_loc = first(end_loc)
      ),
    by = "person"
  )
#>   person start_loc start_date end_date.1 end_date.2 end_date.3 end_loc.1
#> 1      1         a 2021-02-10 2021-02-17       <NA>       <NA>         g
#> 2      2         a 2021-01-30 2020-09-29 2020-12-12 2021-02-04         a
#> 3      3         g 2020-12-04       <NA>       <NA>       <NA>      <NA>
#> 4      4         r 2020-12-09 2020-12-12 2020-12-14 2021-01-05         c
#> 5      5         t 2021-03-22 2021-03-25 2021-03-29       <NA>         b
#> 6      6         b 2021-04-04 2021-04-07 2021-04-09       <NA>         b
#>   end_loc.2 end_loc.3   end_date end_loc
#> 1      <NA>      <NA> 2021-02-17       g
#> 2         a         g 2021-02-04       g
#> 3      <NA>      <NA>       <NA>    <NA>
#> 4         c         g 2020-12-12       c
#> 5         t      <NA> 2021-03-29       t
#> 6         t      <NA> 2021-04-07       b

实际上,真正完成汇总工作的是 left_join 内部的那段代码:

# Per-person summary of the best (end_date, end_loc) pair: an end date 0 to 7
# days after start_date, preferring a pair whose end_loc equals start_loc and
# otherwise taking the earliest one. People without a qualifying pair do not
# appear in the result.
example %>%
  # One row per (person, candidate number): the text before the dot in each
  # column name becomes the output column (end_date / end_loc), the text after
  # it becomes `number`. Pairs that are entirely NA are dropped.
  pivot_longer(
    cols = !c(person, start_loc, start_date),
    names_sep = "\\.",
    names_to = c(".value", "number"),
    values_drop_na = TRUE
  ) %>%
  mutate(
    days_after = end_date - start_date,
    within_week = days_after >= 0 & days_after <= 7,
    same_loc = start_loc == end_loc
  ) %>%
  filter(within_week) %>%
  # Matching locations sort first; ties are broken by the earliest date.
  arrange(person, desc(same_loc), days_after) %>%
  group_by(person) %>%
  summarise(
    end_date = first(end_date),
    end_loc = first(end_loc)
  )

# A tibble: 5 x 3
  person end_date   end_loc
   <int> <date>     <chr>  
1      1 2021-02-17 g      
2      2 2021-02-04 g      
3      4 2020-12-12 c      
4      5 2021-03-29 t      
5      6 2021-04-07 b  
于 2021-05-27T17:24:04.553 回答
1

你可以使用 dplyr 和 tidyr

library(dplyr)
library(tidyr)

# tmp0: every (end_date, end_loc) pair whose end date falls 0 to 7 days after
# the person's start_date, in long form (one row per qualifying pair).
#
# A single pivot keeps each end_date next to the end_loc with the same numeric
# suffix: the text before the dot becomes the output column, the text after it
# becomes `id`. This avoids building every end_date x end_loc combination and
# filtering it back down, and it handles suffixes of any length.
tmp0 <- example %>%
  pivot_longer(
    cols = c(starts_with("end_date."), starts_with("end_loc.")),
    names_to = c(".value", "id"),
    names_sep = "\\.",
    values_drop_na = TRUE
  ) %>%
  filter(
    # Pairs with a missing location or a missing date are not candidates
    # (comparisons against an NA date evaluate to NA and are dropped).
    !is.na(end_loc),
    end_date >= start_date,
    end_date <= start_date + 7
  ) %>%
  select(-id) %>%
  # Left grouped on purpose: the per-person min(end_date) computed from tmp0
  # further down relies on this grouping.
  group_by(person)


# tmp1: for each person, the earliest qualifying pair whose end location
# equals the start location. Rows with a missing end_loc compare as NA and are
# dropped by filter().
tmp1 <- tmp0 %>%
  # Regrouping is a no-op when tmp0 is already grouped by person; it is kept
  # explicit so the per-person selection below does not depend on it silently.
  group_by(person) %>%
  filter(end_loc == start_loc) %>%
  # Keep exactly one row per person even when several matching-location pairs
  # fall inside the window, so the final table has a single result per person.
  slice_min(end_date, n = 1, with_ties = FALSE)

# tmp2: fallback for people without a location match in tmp1, taking their
# earliest qualifying end date. tmp0 is grouped by person, so min() is
# evaluated per person; the anti-join removes whole persons, so it does not
# affect the per-person minimum of those who remain.
tmp2 <- tmp0 %>%
  anti_join(tmp1, by = "person") %>%
  filter(end_date == min(end_date))

# Stack the location-matched picks (tmp1) on top of the fallback picks (tmp2),
# then right-join onto `example` so that people with no qualifying pair are
# kept with NA in end_date / end_loc. Finally sort by the join keys.
bind_rows(tmp1, tmp2) %>%
  right_join(example, by = c("person", "start_loc", "start_date")) %>%
  arrange(person, start_loc, start_date)

返回

# A tibble: 6 x 11
# Groups:   person [6]
  person start_loc start_date end_date   end_loc end_date.1 end_date.2 end_date.3 end_loc.1 end_loc.2
   <int> <chr>     <date>     <date>     <chr>   <date>     <date>     <date>     <chr>     <chr>    
1      1 a         2021-02-10 2021-02-17 g       2021-02-17 NA         NA         "g"       ""       
2      2 a         2021-01-30 2021-02-04 g       2020-09-29 2020-12-12 2021-02-04 "a"       "a"      
3      3 g         2020-12-04 NA         NA      NA         NA         NA         ""        ""       
4      4 r         2020-12-09 2020-12-12 c       2020-12-12 2020-12-14 2021-01-05 "c"       "c"      
5      5 t         2021-03-22 2021-03-29 t       2021-03-25 2021-03-29 NA         "b"       "t"      
6      6 b         2021-04-04 2021-04-07 b       2021-04-07 2021-04-09 NA         "b"       "t"      
# ... with 1 more variable: end_loc.3 <chr>

我建立了三个临时表来获取所需的信息。tmp1 包含 start_loc 和 end_loc 相同的数据,tmp2 包含位置不同的数据。最后,我们合并这两个表并创建所需的输出。

于 2021-05-25T15:36:08.277 回答