3

有哪些方法可以转换会话路径数据,例如:

df
#   Session Link1 Link2 Link3 Link4 Link5
# 1       1     A     B                  
# 2       2     C                        
# 3       3     D     A     B            
# 4       4     C     F     G     H     J
# 5       5     A     B     C            

转换为如下所示的数据集:

desired
#    Session From   To
# 1        1    A    B
# 2        2    C <NA>
# 3        3    D    A
# 4        3    A    B
# 5        4    C    F
# 6        4    F    G
# 7        4    G    H
# 8        4    H    J
# 9        5    A    B
# 10       5    B    C

可重现的示例数据:

# Wide-format input: one row per Session with columns Link1..Link5.
# Unused link slots are stored as the empty-string factor level "".
df <- data.frame(
  Session = 1:5,
  Link1 = factor(c("A", "C", "D", "C", "A"), levels = c("A", "C", "D")),
  Link2 = factor(c("B", "", "A", "F", "B"), levels = c("", "A", "B", "F")),
  Link3 = factor(c("", "", "B", "G", "C"), levels = c("", "B", "C", "G")),
  Link4 = factor(c("", "", "", "H", ""), levels = c("", "H")),
  Link5 = factor(c("", "", "", "J", ""), levels = c("", "J"))
)

# Long-format target: one row per consecutive (From, To) pair within a Session.
# A Session with a single link keeps one row whose To is NA.
desired <- data.frame(
  Session = c(1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L),
  From = factor(
    c("A", "C", "D", "A", "C", "F", "G", "H", "A", "B"),
    levels = c("A", "B", "C", "D", "F", "G", "H")
  ),
  To = factor(
    c("B", NA, "A", "B", "F", "G", "H", "J", "B", "C"),
    levels = c("A", "B", "C", "F", "G", "H", "J")
  )
)
4

3 回答 3

5

我们可以使用 data.table。先用 setDT(df) 将 "data.frame" 转换为 "data.table",再用 melt 并将 "Session" 指定为 id 变量,把数据从"宽"格式重塑为"长"格式。然后删除 "value" 为空的行([value!=''])。接着按 "Session" 分组,对于只有一行的 "Session",用 if...else 在 "value" 之后追加一个 NA。最后再次按 "Session" 分组:去掉 "V1" 的最后一个元素得到 From,去掉第一个元素得到 To。

 library(data.table) # v1.9.5+
 # Step 1: reshape wide -> long keyed by Session and drop empty link slots.
 #         Note that setDT() converts `df` to a data.table by reference.
 # Step 2: per Session, append NA when only one link remains, so that the
 #         session still yields one (From, NA) row in step 3.
 # Step 3: per Session, pair each element with its successor: dropping the
 #         last element gives From, dropping the first gives To. Here .N is
 #         the row count of the padded table, so V1[-.N] removes the last row.
 melt(setDT(df), id.vars = "Session")[value != ""][
   , if (.N == 1L) c(value, NA) else value, by = Session][
   , list(From = V1[-.N], To = V1[-1L]), by = Session]
 #   Session From To
 #1:       1    A  B
 #2:       2    C NA
 #3:       3    D  A
 #4:       3    A  B
 #5:       4    C  F
 #6:       4    F  G
 #7:       4    G  H
 #8:       4    H  J
 #9:       5    A  B
 #10:      5    B  C

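在 melt 这一步之后,上述各步骤可以合并为单个代码块。注意 tmp[-.N] 在这里不能正常工作:对于只有一行的会话,tmp 已被补上一个 NA(长度为 2),而 .N 仍然是 1,因此 tmp[-.N] 删掉的是第一个元素而不是最后一个。所以需要改用不依赖 tmp[-.N] 的写法(例如 tmp[1:(.N-1)])来去掉最后一个元素。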

# Single-pass variant: reshape to long format, drop empty link slots, then
# build the From/To pairs per Session inside one `j` expression.
# Note that setDT() converts `df` to a data.table by reference.
melt(setDT(df), id.vars = "Session")[value != "", {
  # Pad single-link sessions with NA so they still produce one From/To row.
  tmp <- if (.N == 1L) c(value, NA) else value
  # Drop the last element using tmp's own length rather than .N: for a padded
  # single-link session length(tmp) is 2 while .N is 1, so tmp[-.N] would
  # remove the wrong element and 1:(.N - 1) would degenerate to c(1, 0).
  list(From = tmp[-length(tmp)], To = tmp[-1L])
}, by = Session]
#    Session From To
#1:       1    A  B
#2:       2    C NA
#3:       3    D  A
#4:       3    A  B
#5:       4    C  F
#6:       4    F  G
#7:       4    G  H
#8:       4    H  J
#9:       5    A  B
#10:      5    B  C
于 2015-07-10T13:42:21.830 回答
2

受 @akrun 的启发,这是我自己对这个问题的尝试。注意,结果有所调整:它还包含了每个会话末尾的终端 from–to 对(即 to 为空的那一行):

library(dplyr)
library(tidyr)

# Reshape to long format (one row per Session/link slot), then within each
# Session pair every link (`to`) with the link that precedes it (`from`).
# Link1 rows have no predecessor and rows whose predecessor is an empty slot
# carry no path, so both are filtered out. Rows with an empty `to` are kept,
# which is what produces the terminal pairs in the output.
df %>%
  gather(key = "Link_Num", value = "Value", -Session) %>%
  group_by(Session) %>%
  mutate(to = Value, from = lag(to)) %>%
  filter(Link_Num != "Link1", from != "") %>%
  select(Session, from, to, Link_Num) %>%
  arrange(Session)

产生:

   Session from to Link_Num
1        1    A  B    Link2
2        1    B       Link3
3        2    C       Link2
4        3    D  A    Link2
5        3    A  B    Link3
6        3    B       Link4
7        4    C  F    Link2
8        4    F  G    Link3
9        4    G  H    Link4
10       4    H  J    Link5
11       5    A  B    Link2
12       5    B  C    Link3
13       5    C       Link4
于 2015-07-10T14:15:15.323 回答
2

另一种使用 dplyr 的方法:先将数据重塑(melt)为长格式,再使用 lead 函数

library(dplyr)
library(tidyr) # provides gather(); dplyr itself does not export melt()

# Append an always-empty trailing column. lead() below runs over the whole
# table rather than per Session, so without this spacer the last link of a
# session that fills every Link column would be paired with the first link
# of the next session.
df$spacer <- ""
df %>%
  # Wide -> long: one row per Session/link slot (spacer included).
  gather(key = "variable", value = "value", -Session) %>%
  # Bring each session's slots together, still in column order.
  arrange(Session) %>%
  mutate(To = lead(value)) %>%
  # Keep real From -> To pairs, plus the Link1 row of single-link sessions.
  filter((To != "" & value != "") | (To == "" & variable == "Link1")) %>%
  # A single-link session has no destination: report it as NA, not "".
  mutate(To = ifelse(To == "", NA, To)) %>%
  select(-variable)
#    Session value   To
# 1        1     A    B
# 2        2     C <NA>
# 3        3     D    A
# 4        3     A    B
# 5        4     C    F
# 6        4     F    G
# 7        4     G    H
# 8        4     H    J
# 9        5     A    B
# 10       5     B    C
于 2015-07-10T14:38:21.433 回答