我有下want
表,它是用rowwise()
和生成的mutate()
。主要问题是这个解决方案太慢了。
实际数据集包含大约 15,000,000 行,大约需要 6 个小时才能完成。
我希望做的是将rowwise()
andmutate()
操作转换为 tidytable 以便它可以利用data.table
速度。或者也许我只需要一个基本的 R 解决方案?
set.seed(1990)
mydf <- tibble(id = as.vector(outer(letters, letters, paste0))[1:10]
, open_week = rep(1:5,2)) %>%
mutate(close_week = open_week + sample(1:5,10, replace = T)) %>%
arrange(open_week)
mydf
# some are closed, some are not closed # if not closed, set to NA
mydf$close_week[sample(c(TRUE, FALSE),10, replace = T, prob = c(0.1,0.9))] <- NA
mydf
# A tibble: 10 x 3
id open_week close_week
<chr> <int> <int>
1 aa 1 2
2 fa 1 4
3 ba 2 4
4 ga 2 NA
5 ca 3 7
6 ha 3 6
7 da 4 6
8 ia 4 5
9 ea 5 7
10 ja 5 9
# calculate up to the last week
week_last <- max(mydf$close_week, na.rm = T)
# create complete week grid
df <- as_tibble(data.frame(week = seq(from = min(mydf$open_week, na.rm = T)
, to = max(mydf$close_week, na.rm = T), by = 1)))
have <- df %>%
rowwise() %>%
mutate( # which IDs are active - for the rowwise week?
active_id_list = list(mydf$id[week >= mydf$open_week &
week < ifelse(is.na(mydf$close_week),
week_last +1,
mydf$close_week)]),
# what are the ages of the IDs - for the rowwise week?
active_id_age_list = list(week - mydf$open_week[week >= mydf$open_week &
week < ifelse(is.na(mydf$close_week),
week +1,
mydf$close_week)]),
# which IDs have age less than 1 week, more than 1 week - for the rowwise week?
active_id_less_1_week_list = list(active_id_list[active_id_age_list < 1]),
active_id_above_1_week_list = list(active_id_list[active_id_age_list >= 1]),
# how many active IDs based on age less than 1 week, age more than 1 week - for the rowwise week?
active_id_less_1_week = sum(active_id_age_list < 1, na.rm = T),
active_id_above_1_week = sum(active_id_age_list >= 1, na.rm = T),
# how many active IDs in total?
active_id_count = length(active_id_age_list)) %>%
ungroup() %>%
dplyr::select(!where(is.list)) # remove the list object, unless want to inspect the actual ID list
have
# A tibble: 9 x 4
week active_id_less_1_week active_id_above_1_week active_id_count
<dbl> <int> <int> <int>
1 1 2 0 2
2 2 2 1 3
3 3 2 3 5
4 4 2 3 5
5 5 2 4 6
6 6 0 4 4
7 7 0 2 2
8 8 0 2 2
9 9 0 1 1
我尝试按照
https://markfairbanks.github.io/tidytable/reference/mutate_rowwise..html替换rowwise()
andmutate()
tidytable::mutate_rowwise.()
但我不确定如何解释以下错误
have <- df %>%
tidytable::mutate_rowwise.( # which IDs are active - for the rowwise week?
active_id_list = list(mydf$id[week >= mydf$open_week &
week < ifelse(is.na(mydf$close_week),
week_last +1,
mydf$close_week)]),
# what are the ages of the IDs - for the rowwise week?
active_id_age_list = list(week - mydf$open_week[week >= mydf$open_week &
week < ifelse(is.na(mydf$close_week),
week +1,
mydf$close_week)]),
# which IDs have age less than 1 week, more than 1 week - for the rowwise week?
active_id_less_1_week_list = list(active_id_list[active_id_age_list < 1]),
active_id_above_1_week_list = list(active_id_list[active_id_age_list >= 1]),
# how many active IDs based on age less than 1 week, age more than 1 week - for the rowwise week?
active_id_less_1_week = sum(active_id_age_list < 1, na.rm = T),
active_id_above_1_week = sum(active_id_age_list >= 1, na.rm = T),
# how many active IDs in total?
active_id_count = length(active_id_age_list)) %>%
ungroup() %>%
dplyr::select(!where(is.list)) # remove the list object, unless want to inspect the actual ID list
Error in `[.data.table`(list(week = c(1, 2, 3, 4, 5, 6, 7, 8, 9), .rowwise_id = 1:9), :
'list' object cannot be coerced to type 'double'