0

我有一个数据框,其中两个变量都是列表(list)列。列表中每个观测所含元素的个数不同。例如,变量“accession”的第 4 个观测包含一个元素,而第 7 个观测包含两个元素。(图:当前数据框)

我想制作一个新的数据框,把这两个列表列展开并按行对应地组合在一起,如下所示:(图:我想要的最终数据框)

谢谢你帮助我!

这是我目前拥有的数据框。

library(rentrez)


# Search the "gds" database for records matching "disease" restricted to
# the gse entry type; retMax = 15 is forwarded with the request.
search <- entrez_search(
  db = "gds",
  term = paste0("disease", " AND gse[ETYP]"),
  retMax = 15
)
id <- unlist(search$ids)
# vapply() always yields a character vector, even when no ids come back
# (sapply() would silently return an empty list in that case).
UID <- c(vapply(id, paste0, character(1), collapse = ""))

# Fetch the summary record for every id.
pub.summary <- entrez_summary(
  db = "gds",
  id = UID,
  always_return_list = TRUE
)

# Pull the "samples" element out of every summary record.
summary <- extract_from_esummary(
  esummaries = pub.summary,
  elements = c("samples"),
  simplify = TRUE
)

# Transpose so that each record becomes one row.
# NOTE(review): the columns are presumably the list-columns "accession" and
# "title" described in the question -- confirm against the printed result.
df <- data.frame(summary)
df <- data.frame(t(df))
# The former `df <- df %>% mutate()` step was dropped: it changed nothing and
# failed with "could not find function %>%" because no pipe package is loaded.
df

这是我希望拥有的数据框结果

#  accession                                  title
#1 GSM3955152                                Cancer3
   GSM3955155                              Adjacent3
   GSM3955757 SW480 cells, HES1-binding RNAs/LncRNAs
   GSM3955153                              Adjacent1
   GSM3955150                                Cancer1
   GSM3955151                                Cancer2
#2 GSM33026213                      his4wk_sensitized_uti_1
   GSM3302681                         3his4wk_resolved_pbs_2
   GSM3302624                           c57bl6j_pbs_9
.
.
.
.
#4 GSM3955757                      SW480 cells, HES1-binding RNAs/LncRNAs
.
.
.
.
#15 GSM3934992                    control rep4 [N_0039]
    GSM3935006                    control rep15 [W_010]
    GSM3935012                    control rep17 [W_023]
    GSM3934989                    control rep1 [N_0026]
END
 
    

4

1 回答 1

0

更新

根据 OP 的更新,一个选项是在 `extract_from_esummary` 中指定 `simplify = FALSE`,使其返回一个 `list`;然后从其中每个元素里提取第一个 `list` 元素,再用 `rbind` 合并成单个数据框。

# Ask for the unsimplified result so every record stays a separate list entry.
summary <- extract_from_esummary(
  esummaries = pub.summary,
  elements = "samples",
  simplify = FALSE
)

# Take the first element of each record and stack them row-wise into a
# single data frame, then discard the row names left over from stacking.
out <- do.call(rbind, lapply(summary, function(rec) rec[[1]]))
rownames(out) <- NULL
head(out)
#  accession                                  title
#1 GSM3955152                                Cancer3
#2 GSM3955155                              Adjacent3
#3 GSM3955757 SW480 cells, HES1-binding RNAs/LncRNAs
#4 GSM3955153                              Adjacent1
#5 GSM3955150                                Cancer1
#6 GSM3955151                                Cancer2

另一个选项是用 `NA` 填充 `list` 元素,使两列中同一行的元素 `length` 相同(当二者长度不同时),然后再 `unnest`。

library(dplyr)
library(purrr)
library(tidyr) # unnest() comes from tidyr, so it has to be attached here

df1 %>%
  # Per row, the longer of the two list elements sets the target length.
  mutate(n = pmax(lengths(accession), lengths(title))) %>%
  # `length<-`(x, len) pads x with NA up to len, so both list-columns end up
  # with the same number of elements in every row and can unnest in parallel.
  mutate(across(c(accession, title), ~ map2(.x, n, `length<-`))) %>%
  select(-n) %>%
  unnest(cols = c(accession, title))
# A tibble: 12 x 2
#   accession title
#   <chr>     <chr>
# 1 A         a    
# 2 B         b    
# 3 C         c    
# 4 <NA>      d    
# 5 <NA>      e    
# 6 A         a    
# 7 B         b    
# 8 C         c    
# 9 D         <NA> 
#10 E         <NA> 
#11 A         d    
#12 B         <NA> 

或者,另一个选项是先用 `gather`(新版 tidyr 中为 `pivot_longer`)转换为“长”格式,对“val”列执行 `unnest`,然后用 `spread`(新版 tidyr 中为 `pivot_wider`)转回“宽”格式。

library(dplyr)
library(tidyr)

df1 %>%
  # Remember which original row every value came from.
  mutate(rn = row_number()) %>%
  # Long format: one row per (original row, column) pair; "val" is still a
  # list-column at this point.
  pivot_longer(cols = -rn, names_to = "key", values_to = "val") %>%
  unnest(cols = val) %>%
  # Number the values within each (row, column) pair so that the i-th
  # accession lines up with the i-th title when going back to wide format.
  group_by(rn, key) %>%
  mutate(i1 = row_number()) %>%
  ungroup() %>%
  # Wide format again; positions that exist in only one column become NA.
  pivot_wider(names_from = key, values_from = val) %>%
  select(-rn, -i1)

数据

# Toy input: two list-columns whose elements differ in length within a row
# (3 vs 5, 5 vs 3 and 2 vs 1 elements).
df1 <- tibble(
  accession = list(head(LETTERS, 3), head(LETTERS, 5), head(LETTERS, 2)),
  title = list(head(letters, 5), head(letters, 3), letters[4])
)
于 2019-07-22T19:30:43.430 回答