1

我有下面这张想要的结果表(代码中名为 `have`),它是用 `rowwise()` 和 `mutate()` 生成的。主要问题是这个解决方案太慢了。

实际数据集包含大约 15,000,000 行,大约需要 6 个小时才能完成。

我希望把 `rowwise()` 和 `mutate()` 操作改写成 tidytable 的写法,以便利用 data.table 的速度。或者,也许我只需要一个 base R 的解决方案?

# Reproducible toy data: 10 IDs, two opened in each of weeks 1-5, each given a
# closing week 1-5 weeks after its opening week.
set.seed(1990)
mydf <- tibble(
  id = as.vector(outer(letters, letters, paste0))[1:10],
  open_week = rep(1:5, 2)
) %>%
  mutate(close_week = open_week + sample(1:5, 10, replace = TRUE)) %>%
  arrange(open_week)
mydf

# Some IDs are not closed yet: each row is picked with probability 0.1 and its
# close_week is set to NA.
still_open <- sample(c(TRUE, FALSE), 10, replace = TRUE, prob = c(0.1, 0.9))
mydf$close_week[still_open] <- NA


mydf

# A tibble: 10 x 3
   id    open_week close_week
   <chr>     <int>      <int>
 1 aa            1          2
 2 fa            1          4
 3 ba            2          4
 4 ga            2         NA
 5 ca            3          7
 6 ha            3          6
 7 da            4          6
 8 ia            4          5
 9 ea            5          7
10 ja            5          9

# Last week in which any ID was closed; NA closing weeks (IDs that are still
# open) are ignored. This is the final week of the grid.
week_last <- max(mydf$close_week, na.rm = TRUE)

# Complete week grid: one row per week from the earliest opening week through
# `week_last`. `by = 1` is kept as in the original so the type of `week` is
# not altered.
df <- tibble(
  week = seq(from = min(mydf$open_week, na.rm = TRUE), to = week_last, by = 1)
)

# Effective closing week for every ID, computed once instead of once per grid
# row: IDs that are still open (NA close_week) are treated as closing after
# `week_last`, so they stay active through the end of the grid.
close_week_filled <- ifelse(
  is.na(mydf$close_week),
  week_last + 1,
  mydf$close_week
)

# For each week of the grid, count the IDs that are active in that week, split
# by age (weeks since opening). rowwise() evaluates every expression once per
# week; list columns created earlier in the same mutate() are then used as
# plain vectors for that week.
have <- df %>%
  rowwise() %>%
  mutate(
    # TRUE for every row of `mydf` whose ID is open during this week: opened on
    # or before it and not yet closed. Built once so the ID list and the age
    # list below are always subset by the same mask.
    active_mask = list(week >= mydf$open_week & week < close_week_filled),
    # which IDs are active in this week, and how many weeks old each one is
    active_id_list = list(mydf$id[active_mask]),
    active_id_age_list = list(week - mydf$open_week[active_mask]),
    # active IDs with age below 1 week vs. age of 1 week or more
    active_id_less_1_week_list = list(active_id_list[active_id_age_list < 1]),
    active_id_above_1_week_list = list(active_id_list[active_id_age_list >= 1]),
    # how many active IDs fall in each age group
    active_id_less_1_week = sum(active_id_age_list < 1, na.rm = TRUE),
    active_id_above_1_week = sum(active_id_age_list >= 1, na.rm = TRUE),
    # how many active IDs in total
    active_id_count = length(active_id_age_list)
  ) %>%
  ungroup() %>%
  # drop the list columns (including the helper mask); remove this step to
  # inspect the actual ID lists
  dplyr::select(!where(is.list))

have

# A tibble: 9 x 4
   week active_id_less_1_week active_id_above_1_week active_id_count
  <dbl>                 <int>                  <int>           <int>
1     1                     2                      0               2
2     2                     2                      1               3
3     3                     2                      3               5
4     4                     2                      3               5
5     5                     2                      4               6
6     6                     0                      4               4
7     7                     0                      2               2
8     8                     0                      2               2
9     9                     0                      1               1

我尝试按照 https://markfairbanks.github.io/tidytable/reference/mutate_rowwise..html 的说明,用 `tidytable::mutate_rowwise.()` 替换 `rowwise()` 和 `mutate()`。

但我不确定该如何解读下面的错误:


# Attempted port of the rowwise() + mutate() pipeline to
# tidytable::mutate_rowwise.(). As written, this call stops with the
# "'list' object cannot be coerced to type 'double'" error shown below.
# The `active_id_age_list < 1` / `>= 1` comparisons are the first expressions
# that use the list columns created earlier in the same call.
# NOTE(review): presumably tidytable passes such a list column to the
# expression as a length-1 list rather than as its element, so `[[1]]` would be
# needed before comparing or subsetting -- confirm against the tidytable docs.
have <- df %>% 
  tidytable::mutate_rowwise.( # which IDs are active - for the rowwise week?
    # an ID is active when it opened on or before this week and has not closed
    # yet; an NA close_week is replaced by week_last + 1
    active_id_list = list(mydf$id[week >= mydf$open_week & 
                                    week < ifelse(is.na(mydf$close_week),
                                                  week_last +1,
                                                  mydf$close_week)]),
    # what are the ages of the IDs - for the rowwise week?
    # (same activity test, except that an NA close_week is replaced by
    # week + 1 here rather than week_last + 1)
    active_id_age_list = list(week - mydf$open_week[week >= mydf$open_week & 
                                                      week < ifelse(is.na(mydf$close_week),
                                                                    week +1,
                                                                    mydf$close_week)]),
    # which IDs have age less than 1 week, more than 1 week - for the rowwise week?
    active_id_less_1_week_list = list(active_id_list[active_id_age_list < 1]),
    active_id_above_1_week_list = list(active_id_list[active_id_age_list >= 1]),
    
    # how many active IDs based on age less than 1 week, age more than 1 week - for the rowwise week?
    active_id_less_1_week = sum(active_id_age_list < 1, na.rm = T),
    active_id_above_1_week = sum(active_id_age_list >= 1, na.rm = T),
    
    # how many active IDs in total?
    active_id_count = length(active_id_age_list)) %>% 
  ungroup() %>% 
  dplyr::select(!where(is.list)) # remove the list object, unless want to inspect the actual ID list

Error in `[.data.table`(list(week = c(1, 2, 3, 4, 5, 6, 7, 8, 9), .rowwise_id = 1:9),  : 
  'list' object cannot be coerced to type 'double'
4

1 回答 1

1

出错的原因是:在对 list 列取子集时,我们没有先把 list 中的元素提取出来。可以用 `[[` 提取该元素来解决:

# Inside tidytable::mutate_rowwise.() a list column created earlier in the same
# call is seen as a list, not as its element, so the element has to be
# extracted with `[[1]]` before it is compared or subset. This applies to both
# the age list (the comparison) and the ID list (the vector being subset).
df %>%
  tidytable::mutate_rowwise.(
    # which IDs are active - for the rowwise week?
    active_id_list = list(
      mydf$id[
        week >= mydf$open_week &
          week < ifelse(is.na(mydf$close_week), week_last + 1, mydf$close_week)
      ]
    ),
    # what are the ages of the IDs - for the rowwise week?
    # (uses the same week_last + 1 stand-in for NA close weeks as above, so
    # both lists are built from the same activity test)
    active_id_age_list = list(
      week - mydf$open_week[
        week >= mydf$open_week &
          week < ifelse(is.na(mydf$close_week), week_last + 1, mydf$close_week)
      ]
    ),
    # which active IDs have age less than 1 week - for the rowwise week?
    active_id_less_1_week_list = list(
      active_id_list[[1]][active_id_age_list[[1]] < 1]
    )
  )
于 2021-05-28T18:02:04.300 回答