r - R中时变协变量Cox比例风险建模的数据格式

Question

我正在尝试在 R 中开发一个时变 Cox 比例风险 (CPH) 模型，并且想知道是否有人生成了任何代码来帮助格式化用于时变/时间相关 CPH 模型的计数结构的数据。

为了使问题可重现并且更简单，我提取了前 100 行数据，其中包含 4 个变量（id、date、y和x）。id是唯一的主题标识符。是一个整数序列，date每个 0 到 n 天的观察id。y是危害分析的状态或结果，x是时变协变量。在此示例中，一旦y发生 = 1，每个主题的数据将被审查，并且理想的输出数据帧中不应包含其他数据。

数据的结构使得每个受试者都有 1 行对应于每天的观察。

head(test)
id date y x
1     0 0 0
1     1 0 1
1     2 0 1
1     3 0 1
1     4 0 1
1     5 0 0

但是，据我了解，cphR 中的函数要求时变协变量的结构方式是，start和end变量需要重新编码为 3 行，间隔为 (0,1] 和 (1,5] 和 (5 ,6] 用于上述head(test)代码块中的数据。

可以使用以下代码重建前 100 行数据：

dput(test)
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 
9, 9, 9), date = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 
3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
0, 1, 2, 3, 4, 5, 6, 7, 8), y = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 1, 0, 0, 0), x = c(0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L)), .Names = c("id", 
"date", "y", "x"), row.names = c(NA, -100L), class = "data.frame")

理想情况下，我正在尝试重新编码这些数据，以便输出为：

head(ideal_output)
id start end y x
1      0   1 0 0
1      1   5 0 1
1      5   6 0 0
1      6   7 0 1
1      7   9 0 0
1      9  11 0 1
1     11  20 0 0
2      0   8 0 0
3      0   1 0 0
3      1   3 0 1
3      3   4 0 0
3      4   6 0 1
3      6   7 1 1
4      0   2 0 0
4      2   4 0 1
4      4   7 0 0
5      0   9 0 0
6      0   7 0 0
7      0   1 0 0
7      1   2 0 1
7      2   3 0 0 
7      3   4 1 0
8      0   3 0 0
8      3   4 1 1
9      0   2 0 0
9      2   5 0 1
9      5   6 1 1

我已手动完成此操作以创建ideal_output上述内容，但这是一个容易出错的过程，并且对于id我需要评估的数百个 's 和几个协变量来说是站不住脚的。因此，在开发一种自动化的方式来解决这个数据格式化挑战方面的任何帮助都将不胜感激。谢谢！

score 1 · Accepted Answer

我认为 Survsplit() 函数是您问题的答案。

看看： http ://www.rdocumentation.org/packages/eha/functions/SurvSplit

或者，尝试谷歌：第 5 章扩展和分层考克斯 - nus.edu.sg

score 1 · Accepted Answer

As @Ham suggest you can use tmerge. Here is an example

> #####
> # `dat` is the data.frame you provided 
> library(survival)
> 
> # make baseline data.frame for tmerge
> baseline <- by(dat, dat$id, function(x){
+     n <- nrow(x)
+     # avoid slow data.frame call
+     structure(list(
+       id = x$id[1], start = x$date[1], x = x$x[1], end = x$date[n], 
+       dummy = 0),
+       row.names = 1L, class = "data.frame")
+   })
> baseline <- do.call(rbind, baseline)
> baseline # show baseline data
  id start x end dummy
1  1     0 0  19     0
2  2     0 0   7     0
3  3     0 0  12     0
4  4     0 0   6     0
5  5     0 0   8     0
6  6     0 0   6     0
7  7     0 0  11     0
8  8     0 0  14     0
9  9     0 0   8     0
> 
> # use tmerge
> final_dat <- tmerge(baseline, baseline, id = id, y = event(end, dummy))
> final_dat <- tmerge(
+   final_dat, dat, id = id, y = cumtdc(date, y), x = tdc(date, x))
> final_dat[final_dat$id == 3, ] # look at one example
   id start x end dummy tstart tstop y
27  3     0 0  12     0      0     1 0
28  3     0 1  12     0      1     2 0
29  3     0 1  12     0      2     3 0
30  3     0 0  12     0      3     4 0
31  3     0 1  12     0      4     5 0
32  3     0 1  12     0      5     6 0
33  3     0 1  12     0      6     7 1
34  3     0 1  12     0      7     8 1
35  3     0 1  12     0      8     9 1
36  3     0 1  12     0      9    10 1
37  3     0 1  12     0     10    11 1
38  3     0 0  12     0     11    12 1
> 
> # remove values where y is not zero or y is not the first non-zero value
> final_dat <- within(final_dat, ycum <- unlist(tapply(y, id, cumsum)))
> final_dat <- final_dat[final_dat$ycum < 2, ]
> final_dat$ycum <- NULL
> final_dat[final_dat$id == 3, ]
   id start x end dummy tstart tstop y
27  3     0 0  12     0      0     1 0
28  3     0 1  12     0      1     2 0
29  3     0 1  12     0      2     3 0
30  3     0 0  12     0      3     4 0
31  3     0 1  12     0      4     5 0
32  3     0 1  12     0      5     6 0
33  3     0 1  12     0      6     7 1
> 
> # remove x row where the previous x value do match. But
> #  * keep those where y = 1
> #  * update tstop for the last row where the last row may be removed
> final_dat <- within(
+   final_dat,
+   max_t <- unlist(tapply(tstop, id, function(z) rep(max(z), length(z))))) 
> final_dat <- within(
+   final_dat, 
+   keep <- unlist(tapply(x, id, function(z)
+     c(TRUE, z[-1] != z[-length(z)]))))
> 
> final_dat <- final_dat[final_dat$keep | final_dat$y, ]
> 
> final_dat <- within(
+   final_dat, is_last <- unlist(tapply(id, id, function(z) 
+     seq_along(z) == length(z))))
> 
> needs_update <- final_dat$is_last & !final_dat$y
> final_dat[needs_update, "tstop"] <- 
+   final_dat[needs_update, "max_t"]  + 1
> 
> # have to update the tstop column 
> final_dat <- within(final_dat, tstop <- unlist(by(
+   cbind(tstart, tstop), id, function(z) {
+     n <- nrow(z)
+     c(z$tstart[-1], z$tstop[n])
+ })))
> 
> # show final data.frame
> final_dat[, c("id", "tstart", "tstop", "y", "x")]
   id tstart tstop y x
1   1      0     1 0 0
2   1      1     5 0 1
6   1      5     6 0 0
7   1      6     7 0 1
8   1      7     9 0 0
10  1      9    11 0 1
12  1     11    20 0 0
20  2      0     8 0 0
27  3      0     1 0 0
28  3      1     3 0 1
30  3      3     4 0 0
31  3      4     6 0 1
33  3      6     7 1 1
39  4      0     2 0 0
41  4      2     4 0 1
43  4      4     7 0 0
45  5      0     9 0 0
53  6      0     7 0 0
59  7      0     1 0 0
60  7      1     2 0 1
61  7      2     3 0 0
62  7      3     4 1 0
70  8      0     3 0 0
73  8      3     4 1 1
84  9      0     2 0 0
86  9      2     5 0 1
89  9      5     6 1 1

The code after tmerge can be done faster with dplyr or data.table. If you have more columns than just one, x, then I suggest that you: 1) store a column index of dat and use that in tmerge in the tdc function instead of x. Then merge the tables afterwards with merge. Further, you need to update the line that makes the keep indicator. Otherwise the code should be identical.

score 0 · Accepted Answer

我认为 tmerge() 函数是您问题的答案。

看：https ://cran.r-project.org/web/packages/survival/vignettes/timedep.pdf

r - R中时变协变量Cox比例风险建模的数据格式

3 回答 3

Related

Reference