r - 如何在R中通过ID计算分类法术的数量和持续时间

Question

我有一个纵向数据集，每月记录一个人的就业状况，持续 45 个月。我希望能够创建两个变量以添加到此数据集中：1）每个人“失业”的总持续时间 2）失业法术的数量

理想情况下，它也会跳过 NA 而不会中断咒语

我创建了一个示例数据集以使事情变得简单：


    ID <- c(1:10, 1:10, 1:10)
    date <- c("2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", 
              "2006-09-01", "2006-09-01", "2006-09-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", 
              "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-11-01", 
              "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", 
              "2006-11-01", "2006-11-01")
    act <- c("Unemployed", "Employment", "Education", "Education", "Education", "Education", "Education", 
             "Education", "Education", "Unemployed", "Education", "Unemployed", "Unemployed", "Unemployed", 
             "Education", "Education", "Employment", "Education", "Education", "NA", "Unemployed", 
             "Unemployed", "NA", "Unemployed", "Education", "Employment", "Employment", "NA", "Education", 
             "Unemployed")
    df <- data.frame(ID, date, act)
    df[order(ID),]

       ID       date        act
    1   1 2006-09-01 Unemployed
    11  1 2006-10-01  Education
    21  1 2006-11-01 Unemployed
    2   2 2006-09-01 Employment
    12  2 2006-10-01 Unemployed
    22  2 2006-11-01 Unemployed
    3   3 2006-09-01  Education
    13  3 2006-10-01 Unemployed
    23  3 2006-11-01         NA
    4   4 2006-09-01  Education
    14  4 2006-10-01 Unemployed
    24  4 2006-11-01 Unemployed
    5   5 2006-09-01  Education
    15  5 2006-10-01  Education
    25  5 2006-11-01  Education
    6   6 2006-09-01  Education
    16  6 2006-10-01  Education
    26  6 2006-11-01 Employment
    7   7 2006-09-01  Education
    17  7 2006-10-01 Employment
    27  7 2006-11-01 Employment
    8   8 2006-09-01  Education
    18  8 2006-10-01  Education
    28  8 2006-11-01         NA
    9   9 2006-09-01  Education
    19  9 2006-10-01  Education
    29  9 2006-11-01  Education
    10 10 2006-09-01 Unemployed
    20 10 2006-10-01         NA
    30 10 2006-11-01 Unemployed

我尝试了 Roland 在计算 R 中的持续时间时提出的解决方案，但我不确定如何调整它以通过 ID 为我提供结果并处理 NA。


    library(data.table)
    setDT(df)
    df[, date := as.POSIXct(date, format = "%Y-%m-%d", tz = "GMT")]

    glimpse(df)
    df$act <- ifelse(df$act == "Unemployed",1,-1)
    df[, run := cumsum(c(1, diff(act) != 0))]

    df1 <- df[, list(act = unique(act), 
                               duration = difftime(max(date), min(date), unit = "weeks")), 
                        by = run]
    df1
        run act duration
     1:   1   1  0 weeks
     2:   2  -1  0 weeks
     3:   3   1  0 weeks
     4:   4  -1  0 weeks
     5:   5   1  0 weeks
     6:   6  -1  0 weeks
     7:   7   1  0 weeks
     8:   8  -1  0 weeks
     9:   9   1  0 weeks
    10:  10  -1  0 weeks
    11:  11   1  0 weeks

我所追求的是实现这一目标（这里的持续时间是几个月，但可以是几周或几天）：

    ID spell_count duration
1    1           2        2
2    2           1        2
3    3           1        1
...
10  10           1        2

任何帮助，任何链接/文献/示例将不胜感激。

谢谢你。

score 1 · Accepted Answer

我只使用你的第一个代码块，然后在整个持续时间内，我这样做：


    library(data.table)
    setDT(df)
    df_duration = df[act=="Unemployed",.(duration = .N),by = ID]

失业法术的数量有点棘手：


    df_spell_count = df[order(ID,date)]
    df_spell_count <- df_spell_count[!(is.na(act)|act=="NA")]
    df_spell_count[,previous_act := shift(act,1),by = ID]
    df_spell_count<-df_spell_count[act =="Unemployed" & (previous_act!="Unemployed" | is.na(previous_act))]
    df_spell_count<-df_spell_count[,.(spell_count =.N),by = ID]

如果你想合并这两件事，只需：

df_stats <- merge(df_duration,df_spell_count, by = "ID", all.x = TRUE,all.y = TRUE)

请注意，此 df 不包含那些没有失业期的用户的行。

score 0 · Accepted Answer

使用包tidyverse，您可以按变量（或更多）分组并非常轻松地进行汇总。

在聚合数据之前，我将强制列date进行分类并用实际缺失值Date替换字符串， ."NA"NA

library(tidyverse)

is.na(df$act) <- df$act == "NA"
df$date <- as.Date(df$date)

df %>%
  group_by(ID, act) %>%
  summarise(spell_count = sum(act == "Unemployed", na.rm = TRUE),
            duration = difftime(last(date), first(date), units = "weeks")) %>%
  filter(act == "Unemployed") %>%
  select(-act)
## A tibble: 5 x 3
## Groups:   ID [5]
#     ID spell_count duration      
#  <int>       <int> <time>        
#1     1           2 8.714286 weeks
#2     2           2 4.428571 weeks
#3     3           1 0.000000 weeks
#4     4           2 4.428571 weeks
#5    10           2 8.714286 weeks

上面的代码将只给出至少有一个act == "Unemployed".
如果您想要所有行，则以下基本 R 解决方案将执行此操作。

res <- lapply(split(df, df$ID), function(DF){
  i <- DF$act == "Unemployed"
  if(any(i, na.rm = TRUE))
    duration <- difftime(max(DF$date[i], na.rm = TRUE), min(DF$date[i], na.rm = TRUE), units = "weeks")
  else
    duration <- 0
  spell_count <- sum(i, na.rm = TRUE)
  data.frame(ID = DF$ID[1], spell_count, duration)

})

res <- do.call(rbind, res)
row.names(res) <- NULL
res
#   ID spell_count       duration
#1   1           2 8.714286 weeks
#2   2           2 4.428571 weeks
#3   3           1 0.000000 weeks
#4   4           2 4.428571 weeks
#5   5           0 0.000000 weeks
#6   6           0 0.000000 weeks
#7   7           0 0.000000 weeks
#8   8           0 0.000000 weeks
#9   9           0 0.000000 weeks
#10 10           2 8.714286 weeks

score 0 · Accepted Answer

这是使用 tidyverse 的另一种尝试。“法术”数据是面板数据的常见转换；在 tidyverse 方法中，我认为的技巧是生成一个拼写变量，例如 OP 原始代码中的“运行”变量。

# libraries
library(tidyverse)
library(zoo)
library(lubridate)

# example dataset
ID <- c(1:10, 1:10, 1:10)
date <- c("2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", 
          "2006-09-01", "2006-09-01", "2006-09-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", 
          "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-11-01", 
          "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", 
          "2006-11-01", "2006-11-01")
act <- c("Unemployed", "Employment", "Education", "Education", "Education", "Education", "Education", 
         "Education", "Education", "Unemployed", "Education", "Unemployed", "Unemployed", "Unemployed", 
         "Education", "Education", "Employment", "Education", "Education", "NA", "Unemployed", 
         "Unemployed", "NA", "Unemployed", "Education", "Employment", "Employment", "NA", "Education", 
         "Unemployed")
df <- data.frame(ID, date, act)
df[order(ID),]

# convert types of some variables (in particular use zoo::yearmon instead of date, since these are actually yearmonth combos)
df$act <- as.character(df$act)
df$date <- lubridate::ymd(df$date)
df$yearmon <- zoo::as.yearmon(df$date)
df$act <- ifelse(df$act=='NA',NA,df$act)


# construct "act2", which is act, except when an NA is surrounded by the SAME act before and after, it is replaced with that same act
# e.g. Unemployed NA Unemployed -> Unemployed Unemployed Unemployed
# e.g. Education NA Unemployed -> stays the same
# (see note at the end of this discussion for more details on this)
df <- df %>% arrange(ID,date)

df <- df %>% group_by(ID) %>% mutate(
  act2 = ifelse(is.na(act) & (lag(act)==lead(act)), lead(act), act)
)

# create "spell" variable, which is like the "run" variable in the example code
# within ID this identifies the spell that is currently taken place 
# --- this is the most important part of the code ---
df <- df %>% group_by(ID) %>% mutate(
  spell = cumsum(coalesce(is.na(act2) | act2!=lag(act2),FALSE)) + 1
)

# add yearmonth + 1 month, in order to do duration calculations
# (I'm again exploiting the fact that your data is monthly. if this were not true, this variable could be lead(date), within ID. but then we'd have to figure out how to deal with ends of the panel, where lead(date) is NA)
df$yearmonplusmonth <- df$yearmon + (1/12)

# construct a dataset of ID-spell combinations
spells <- df %>% group_by(ID,spell) %>% summarize(
  spelltype = first(act2),
  duration = (max(yearmonplusmonth) - min(yearmon))*12
)

# construct a dataset at the ID level, with desired summaries of spells
spellsummary <- spells %>% group_by(ID,spelltype) %>% summarize(
  spell_count = n(),
  duration = sum(duration)
) 

# if there are no spells of a given spelltype, it doesn't appear in spellsummary
# we need to fill out spellsummary with zeroes in ID-spelltype cases where there are no spells:
temp <- expand.grid(ID = unique(spellsummary$ID), spelltype = unique(spellsummary$spelltype))
spellsummary <- full_join(spellsummary,temp,by=c('ID','spelltype'))
spellsummary <- spellsummary %>% mutate_at(vars(spell_count,duration),funs(coalesce(as.numeric(.),0)))
spellsummary <- spellsummary %>% mutate_at(vars(spell_count,duration),funs(round(.,0)))
spellsummary <- spellsummary %>% arrange(ID,spelltype)

# finally, we just want Unemployed spelltype summaries by ID:
spellsummary %>% filter(spelltype=='Unemployed')

# A tibble: 10 x 4
# Groups:   ID [10]
# ID spelltype  spell_count duration
# <int> <chr>            <dbl>    <dbl>
# 1     1 Unemployed           2        2
# 2     2 Unemployed           1        2
# 3     3 Unemployed           1        1
# 4     4 Unemployed           1        2
# 5     5 Unemployed           0        0
# 6     6 Unemployed           0        0
# 7     7 Unemployed           0        0
# 8     8 Unemployed           0        0
# 9     9 Unemployed           0        0
# 10    10 Unemployed           1        3

注意：我在最后一行的持续时间内得到 3，而不是在 OP 的所需输出中得到 2。原因是我假设 Unemp NA Unemp 真的是 Unemp Unemp Unemp，无论是出于 spell_count 的目的还是出于持续时间的目的。OP 希望这适用于 spell_count 而不是持续时间。为了实现这一点，一种方法可能是使用“act”变量进行持续时间计算，使用“act2”变量进行 spell_count 计算——我把这个留给读者。

r - 如何在R中通过ID计算分类法术的数量和持续时间

3 回答 3

Related

Reference