136

我有一个数据框,如下所示:

data.frame(director = c("Aaron Blaise,Bob Walker", "Akira Kurosawa", 
                        "Alan J. Pakula", "Alan Parker", "Alejandro Amenabar", "Alejandro Gonzalez Inarritu", 
                        "Alejandro Gonzalez Inarritu,Benicio Del Toro", "Alejandro González Iñárritu", 
                        "Alex Proyas", "Alexander Hall", "Alfonso Cuaron", "Alfred Hitchcock", 
                        "Anatole Litvak", "Andrew Adamson,Marilyn Fox", "Andrew Dominik", 
                        "Andrew Stanton", "Andrew Stanton,Lee Unkrich", "Angelina Jolie,John Stevenson", 
                        "Anne Fontaine", "Anthony Harvey"), AB = c('A', 'B', 'A', 'A', 'B', 'B', 'B', 'A', 'B', 'A', 'B', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'A'))

如您所见,该director列中的某些条目是多个名称,以逗号分隔。我想将这些条目分成单独的行,同时保持另一列的值。例如,上面数据框中的第一行应该分成两行,每列有一个名称,director列中有“A” AB

4

6 回答 6

123

几种选择:

的两种方式:

library(data.table)
# method 1 (preferred)
setDT(v)[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed=TRUE))), by = AB
         ][!is.na(director)]
# method 2
setDT(v)[, strsplit(as.character(director), ",", fixed=TRUE), by = .(AB, director)
         ][,.(director = V1, AB)]

2) / 组合:

library(dplyr)
library(tidyr)
v %>% 
  mutate(director = strsplit(as.character(director), ",")) %>%
  unnest(director)

3)仅使用使用tidyr 0.5.0(及更高版本),您也可以只使用separate_rows

separate_rows(v, director, sep = ",")

您可以使用该convert = TRUE参数自动将数字转换为数字列。

4) 以 R 为基数:

# if 'director' is a character-column:
stack(setNames(strsplit(df$director,','), df$AB))

# if 'director' is a factor-column:
stack(setNames(strsplit(as.character(df$director),','), df$AB))
于 2015-07-20T10:56:48.990 回答
90

这个老问题经常被用作欺骗目标(用 标记r-faq)。到今天为止,它已经回答了 3 次,提供了 6 种不同的方法,但缺乏一个基准作为指导,哪种方法最快1

基准解决方案包括

使用该包对 6 种不同大小的数据帧进行了基准测试,总共有 8 种不同的方法microbenchmark(参见下面的代码)。

OP 给出的样本数据仅包含 20 行。要创建更大的数据框,只需将这 20 行重复 1、10、100、1000、10000 和 100000 次,这样问题的大小就可以达到 200 万行。

基准测试结果

在此处输入图像描述

基准测试结果表明,对于足够大的数据帧,所有data.table方法都比任何其他方法都快。对于超过 5000 行的数据帧,Jaap 的data.table方法 2 和变体DT3是最快的,比最慢的方法快很多。

值得注意的是,这两种tidyverse方法的时间安排和splistackshape解决方案非常相似,以至于很难区分图表中的曲线。它们是所有数据帧大小的基准测试方法中最慢的。

对于较小的数据帧,Matt 的基本 R 解决方案和data.table方法 4 的开销似乎比其他方法少。

代码

director <- 
  c("Aaron Blaise,Bob Walker", "Akira Kurosawa", "Alan J. Pakula", 
    "Alan Parker", "Alejandro Amenabar", "Alejandro Gonzalez Inarritu", 
    "Alejandro Gonzalez Inarritu,Benicio Del Toro", "Alejandro González Iñárritu", 
    "Alex Proyas", "Alexander Hall", "Alfonso Cuaron", "Alfred Hitchcock", 
    "Anatole Litvak", "Andrew Adamson,Marilyn Fox", "Andrew Dominik", 
    "Andrew Stanton", "Andrew Stanton,Lee Unkrich", "Angelina Jolie,John Stevenson", 
    "Anne Fontaine", "Anthony Harvey")
AB <- c("A", "B", "A", "A", "B", "B", "B", "A", "B", "A", "B", "A", 
        "A", "B", "B", "B", "B", "B", "B", "A")

library(data.table)
library(magrittr)

为问题大小的基准运行定义函数n

run_mb <- function(n) {
  # compute number of benchmark runs depending on problem size `n`
  mb_times <- scales::squish(10000L / n , c(3L, 100L)) 
  cat(n, " ", mb_times, "\n")
  # create data
  DF <- data.frame(director = rep(director, n), AB = rep(AB, n))
  DT <- as.data.table(DF)
  # start benchmarks
  microbenchmark::microbenchmark(
    matt_mod = {
      s <- strsplit(as.character(DF$director), ',')
      data.frame(director=unlist(s), AB=rep(DF$AB, lengths(s)))},
    jaap_DT1 = {
      DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed=TRUE))), by = AB
         ][!is.na(director)]},
    jaap_DT2 = {
      DT[, strsplit(as.character(director), ",", fixed=TRUE), 
         by = .(AB, director)][,.(director = V1, AB)]},
    jaap_dplyr = {
      DF %>% 
        dplyr::mutate(director = strsplit(as.character(director), ",")) %>%
        tidyr::unnest(director)},
    jaap_tidyr = {
      tidyr::separate_rows(DF, director, sep = ",")},
    cSplit = {
      splitstackshape::cSplit(DF, "director", ",", direction = "long")},
    DT3 = {
      DT[, strsplit(as.character(director), ",", fixed=TRUE),
         by = .(AB, director)][, director := NULL][
           , setnames(.SD, "V1", "director")]},
    DT4 = {
      DT[, .(director = unlist(strsplit(as.character(director), ",", fixed = TRUE))), 
         by = .(AB)]},
    times = mb_times
  )
}

针对不同的问题规模运行基准测试

# define vector of problem sizes
n_rep <- 10L^(0:5)
# run benchmark for different problem sizes
mb <- lapply(n_rep, run_mb)

为绘图准备数据

mbl <- rbindlist(mb, idcol = "N")
mbl[, n_row := NROW(director) * n_rep[N]]
mba <- mbl[, .(median_time = median(time), N = .N), by = .(n_row, expr)]
mba[, expr := forcats::fct_reorder(expr, -median_time)]

创建图表

library(ggplot2)
ggplot(mba, aes(n_row, median_time*1e-6, group = expr, colour = expr)) + 
  geom_point() + geom_smooth(se = FALSE) + 
  scale_x_log10(breaks = NROW(director) * n_rep) + scale_y_log10() + 
  xlab("number of rows") + ylab("median of execution time [ms]") +
  ggtitle("microbenchmark results") + theme_bw()

会话信息和包版本(摘录)

devtools::session_info()
#Session info
# version  R version 3.3.2 (2016-10-31)
# system   x86_64, mingw32
#Packages
# data.table      * 1.10.4  2017-02-01 CRAN (R 3.3.2)
# dplyr             0.5.0   2016-06-24 CRAN (R 3.3.1)
# forcats           0.2.0   2017-01-23 CRAN (R 3.3.2)
# ggplot2         * 2.2.1   2016-12-30 CRAN (R 3.3.2)
# magrittr        * 1.5     2014-11-22 CRAN (R 3.3.0)
# microbenchmark    1.4-2.1 2015-11-25 CRAN (R 3.3.3)
# scales            0.4.1   2016-11-09 CRAN (R 3.3.2)
# splitstackshape   1.4.2   2014-10-23 CRAN (R 3.3.3)
# tidyr             0.6.1   2017-01-10 CRAN (R 3.3.2)

1这个热情洋溢的评论激起了我的好奇心Brilliant!快几个数量级!对作为该问题的副本而关闭的问题的答案。 tidyverse

于 2017-04-15T22:21:42.533 回答
52

命名您的原始 data.frame v,我们有这个:

> s <- strsplit(as.character(v$director), ',')
> data.frame(director=unlist(s), AB=rep(v$AB, sapply(s, FUN=length)))
                      director AB
1                 Aaron Blaise  A
2                   Bob Walker  A
3               Akira Kurosawa  B
4               Alan J. Pakula  A
5                  Alan Parker  A
6           Alejandro Amenabar  B
7  Alejandro Gonzalez Inarritu  B
8  Alejandro Gonzalez Inarritu  B
9             Benicio Del Toro  B
10 Alejandro González Iñárritu  A
11                 Alex Proyas  B
12              Alexander Hall  A
13              Alfonso Cuaron  B
14            Alfred Hitchcock  A
15              Anatole Litvak  A
16              Andrew Adamson  B
17                 Marilyn Fox  B
18              Andrew Dominik  B
19              Andrew Stanton  B
20              Andrew Stanton  B
21                 Lee Unkrich  B
22              Angelina Jolie  B
23              John Stevenson  B
24               Anne Fontaine  B
25              Anthony Harvey  A

注意使用rep来构建新的 AB 列。在这里,sapply返回每个原始行中的名称数。

于 2012-12-08T02:36:20.200 回答
31

派对迟到了,但另一个通用的替代方法是使用cSplit我的“splitstackshape”包中的一个direction参数。将此设置"long"为获得您指定的结果:

library(splitstackshape)
head(cSplit(mydf, "director", ",", direction = "long"))
#              director AB
# 1:       Aaron Blaise  A
# 2:         Bob Walker  A
# 3:     Akira Kurosawa  B
# 4:     Alan J. Pakula  A
# 5:        Alan Parker  A
# 6: Alejandro Amenabar  B
于 2015-02-03T04:34:37.663 回答
2
devtools::install_github("yikeshu0611/onetree")

library(onetree)

dd=spread_byonecolumn(data=mydata,bycolumn="director",joint=",")

head(dd)
            director AB
1       Aaron Blaise  A
2         Bob Walker  A
3     Akira Kurosawa  B
4     Alan J. Pakula  A
5        Alan Parker  A
6 Alejandro Amenabar  B
于 2019-07-26T18:21:37.840 回答
1

目前可以推荐使用strsplitfrom base产生的另一个基准将列中的逗号分隔字符串拆分为单独的行,因为它在各种大小范围内都是最快的:

s <- strsplit(v$director, ",", fixed=TRUE)
s <- data.frame(director=unlist(s), AB=rep(v$AB, lengths(s)))

请注意,使用fixed=TRUE对时间有重大影响。

显示计算时间随行数的曲线

比较方法:

met <- alist(base = {s <- strsplit(v$director, ",") #Matthew Lundberg
   s <- data.frame(director=unlist(s), AB=rep(v$AB, sapply(s, FUN=length)))}
 , baseLength = {s <- strsplit(v$director, ",") #Rich Scriven
   s <- data.frame(director=unlist(s), AB=rep(v$AB, lengths(s)))}
 , baseLeFix = {s <- strsplit(v$director, ",", fixed=TRUE)
   s <- data.frame(director=unlist(s), AB=rep(v$AB, lengths(s)))}
 , cSplit = s <- cSplit(v, "director", ",", direction = "long") #A5C1D2H2I1M1N2O1R2T1
 , dt = s <- setDT(v)[, lapply(.SD, function(x) unlist(tstrsplit(x, "," #Jaap
   , fixed=TRUE))), by = AB][!is.na(director)]
#, dt2 = s <- setDT(v)[, strsplit(director, "," #Jaap #Only Unique
#  , fixed=TRUE), by = .(AB, director)][,.(director = V1, AB)]
 , dplyr = {s <- v %>%  #Jaap
    mutate(director = strsplit(director, ",", fixed=TRUE)) %>%
    unnest(director)}
 , tidyr = s <- separate_rows(v, director, sep = ",") #Jaap
 , stack = s <- stack(setNames(strsplit(v$director, ",", fixed=TRUE), v$AB)) #Jaap
#, dt3 = {s <- setDT(v)[, strsplit(director, ",", fixed=TRUE), #Uwe #Only Unique
#  by = .(AB, director)][, director := NULL][, setnames(.SD, "V1", "director")]}
 , dt4 = {s <- setDT(v)[, .(director = unlist(strsplit(director, "," #Uwe
   , fixed = TRUE))), by = .(AB)]}
 , dt5 = {s <- vT[, .(director = unlist(strsplit(director, "," #Uwe
   , fixed = TRUE))), by = .(AB)]}
   )

图书馆:

library(microbenchmark)
library(splitstackshape) #cSplit
library(data.table) #dt, dt2, dt3, dt4
#setDTthreads(1) #Looks like it has here minor effect
library(dplyr) #dplyr
library(tidyr) #dplyr, tidyr

数据:

v0 <- data.frame(director = c("Aaron Blaise,Bob Walker", "Akira Kurosawa", 
                        "Alan J. Pakula", "Alan Parker", "Alejandro Amenabar", "Alejandro Gonzalez Inarritu", 
                        "Alejandro Gonzalez Inarritu,Benicio Del Toro", "Alejandro González Iñárritu", 
                        "Alex Proyas", "Alexander Hall", "Alfonso Cuaron", "Alfred Hitchcock", 
                        "Anatole Litvak", "Andrew Adamson,Marilyn Fox", "Andrew Dominik", 
                        "Andrew Stanton", "Andrew Stanton,Lee Unkrich", "Angelina Jolie,John Stevenson", 
                        "Anne Fontaine", "Anthony Harvey"), AB = c('A', 'B', 'A', 'A', 'B', 'B', 'B', 'A', 'B', 'A', 'B', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'A'))

计算和计时结果:

n <- 10^(0:5)
x <- lapply(n, function(n) {v <- v0[rep(seq_len(nrow(v0)), n),]
  vT <- setDT(v)
  ti <- min(100, max(3, 1e4/n))
  microbenchmark(list = met, times = ti, control=list(order="block"))})

y <- do.call(cbind, lapply(x, function(y) aggregate(time ~ expr, y, median)))
y <- cbind(y[1], y[-1][c(TRUE, FALSE)])
y[-1] <- y[-1] / 1e6 #ms
names(y)[-1] <- paste("n:", n * nrow(v0))
y #Time in ms
#         expr     n: 20    n: 200    n: 2000   n: 20000   n: 2e+05   n: 2e+06
#1        base 0.2989945 0.6002820  4.8751170  46.270246  455.89578  4508.1646
#2  baseLength 0.2754675 0.5278900  3.8066300  37.131410  442.96475  3066.8275
#3   baseLeFix 0.2160340 0.2424550  0.6674545   4.745179   52.11997   555.8610
#4      cSplit 1.7350820 2.5329525 11.6978975  99.060448 1053.53698 11338.9942
#5          dt 0.7777790 0.8420540  1.6112620   8.724586  114.22840  1037.9405
#6       dplyr 6.2425970 7.9942780 35.1920280 334.924354 4589.99796 38187.5967
#7       tidyr 4.0323765 4.5933730 14.7568235 119.790239 1294.26959 11764.1592
#8       stack 0.2931135 0.4672095  2.2264155  22.426373  289.44488  2145.8174
#9         dt4 0.5822910 0.6414900  1.2214470   6.816942   70.20041   787.9639
#10        dt5 0.5015235 0.5621240  1.1329110   6.625901   82.80803   636.1899

注意,方法如

(v <- rbind(v0[1:2,], v0[1,]))
#                 director AB
#1 Aaron Blaise,Bob Walker  A
#2          Akira Kurosawa  B
#3 Aaron Blaise,Bob Walker  A

setDT(v)[, strsplit(director, "," #Jaap #Only Unique
  , fixed=TRUE), by = .(AB, director)][,.(director = V1, AB)]
#         director AB
#1:   Aaron Blaise  A
#2:     Bob Walker  A
#3: Akira Kurosawa  B

strsplitunique director返回 a并且可能与

tmp <- unique(v)
s <- strsplit(tmp$director, ",", fixed=TRUE)
s <- data.frame(director=unlist(s), AB=rep(tmp$AB, lengths(s)))

但据我了解,这没有被问到。

于 2020-06-22T17:28:40.097 回答