我知道这是一个非常愚蠢的问题,但我无法解决它,所以才来提问……我如何从一个大型数据集中按相同的 ID 提取行,计算这些行的均值,并生成一列以这些 ID 作为行名的结果?例如:
IDs Var2
Ae4 2
Ae4 4
Ae4 6
Bc3 3
Bc3 5
Ad2 8
Ad2 7
OutPut
Var(x)
Ae4 4
Bc3 4
Ad2 7.5
我知道这是一个非常愚蠢的问题,但我无法解决它,所以才来提问……我如何从一个大型数据集中按相同的 ID 提取行,计算这些行的均值,并生成一列以这些 ID 作为行名的结果?例如:
IDs Var2
Ae4 2
Ae4 4
Ae4 6
Bc3 3
Bc3 5
Ad2 8
Ad2 7
OutPut
Var(x)
Ae4 4
Bc3 4
Ad2 7.5
如果你有一个很大的 data.frame,可以使用 data.table。
ddply 的一些替代方案是 aggregate 和 data.table:
# Reproducible sample data: one million uniform draws. The 100-element ID
# vector (20 each of "A" to "E") is recycled by data.frame() to match.
set.seed(1)
dat <- data.frame(
  ID = rep(LETTERS[1:5], each = 20),
  value = runif(1e6)
)

library(data.table)
DT <- data.table(dat)

# data.table approach: mean of value within each ID
DT[, mean(value), by = list(ID)]

# Base R approach: formula interface of aggregate(), averaging per ID
aggregate(. ~ ID, data = dat, FUN = mean)
library(rbenchmark) # comparing performance
library(plyr)       # provides ddply() and summarize() used by the third candidate

# Time each grouped-mean approach once and order the results by relative
# elapsed time. The candidate expressions are kept verbatim because
# benchmark() labels each result row with the text of the expression.
benchmark(DT[, mean(value), by=list(ID)], # data.table approach
          aggregate(.~ID, data=dat, mean), # aggregate approach
          ddply(dat, .(ID), summarize, mn = mean(value)), # ddply approach (Paul Hiemstra's answer)
          columns=c("test", "replications", "elapsed", "relative"),
          order='relative',
          replications=1)
test replications elapsed relative
1 DT[, mean(value), by = list(ID)] 1 0.14 1.000
3 ddply(dat, .(ID), summarize, mn = mean(value)) 1 0.58 4.143
2 aggregate(. ~ ID, data = dat, mean) 1 3.59 25.643
如您所见,最快的是 data.table 方法。
不过,有一个 R 基础方法甚至比 data.table 更快,让我们看看:
# Another base R approach: split value by ID, average each piece, then
# flatten the resulting list into a named numeric vector.
with(dat, unlist(lapply(split(value, ID), mean)))
# Re-run the timing comparison with the split/lapply variant added as a
# fourth candidate; one replication each, ordered by relative elapsed time.
# The expressions are left verbatim: their text is what appears in the
# `test` column of the printed result.
# NOTE(review): ddply() requires plyr to be attached -- confirm it is loaded
# earlier in the session.
benchmark(DT[, mean(value), by=list(ID)], # data.table approach
aggregate(.~ID, data=dat, mean), # aggregate approach
ddply(dat, .(ID), summarize, mn = mean(value)), # ddply approach (Paul Hiemstra's answer)
unlist(lapply(split(dat$value, dat$ID), mean)), # lapply, split approach
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=1)
test replications elapsed relative
4 unlist(lapply(split(dat$value, dat$ID), mean)) 1 0.06 1.000
1 DT[, mean(value), by = list(ID)] 1 0.10 1.667
3 ddply(dat, .(ID), summarize, mn = mean(value)) 1 0.56 9.333
2 aggregate(. ~ ID, data = dat, mean) 1 3.28 54.667
Venables 和 Ripley(2000,第 37 页)指出,组合使用 unlist、lapply 和 split 比仅使用 sapply 更快;在这个特定示例中,它甚至比 data.table 还快。
参考:
Venables, W. N. 和 Ripley, B. D. (2000)。S Programming。Springer,Statistics and Computing 丛书。ISBN 0-387-98966-8(alk. paper)
更多组
# "More groups" scenario: 2000 distinct hexadecimal-string IDs, recycled by
# data.frame() across 1e6 random values.
dat <- data.frame(ID = as.character(as.hexmode(1:2000)), value = runif(1e6))
DT <- as.data.table(dat)
# Compare the four grouped-mean approaches, 3 replications each, ordered by
# relative elapsed time. Expression text is what benchmark() prints in the
# `test` column, so the candidates are left as written.
# NOTE(review): ddply() requires plyr to be attached -- confirm it is loaded
# earlier in the session.
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 0.33 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 0.41 1.242
2 aggregate(. ~ ID, data = dat, mean) 3 7.69 23.303
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 17.08 51.758
更多行
# "More rows" scenario: the same 2000 hexadecimal-string IDs, now recycled
# across 1e7 random values.
dat <- data.frame(ID = as.character(as.hexmode(1:2000)), value = runif(1e7))
DT <- as.data.table(dat)
# Same four candidates, 3 replications each, ordered by relative elapsed
# time. The expression text is what benchmark() prints in the `test` column.
# NOTE(review): ddply() requires plyr to be attached -- confirm it is loaded
# earlier in the session.
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 3.18 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 4.26 1.340
2 aggregate(. ~ ID, data = dat, mean) 3 90.28 28.390
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 268.86 84.547
先设置一个键
# Time how long it takes to key (sort) DT by its ID column
system.time(setkeyv(DT, "ID"))
user system elapsed
0.71 0.03 0.75
# object.size() auto-prints a raw byte count; ask print() to pick a
# human-readable unit so the result reads like the figure shown below.
print(object.size(dat), units = "auto")
152.7 Mb # Quite small. Easy for a 32bit PC with 2GB RAM.
# object.size() auto-prints a raw byte count; ask print() to pick a
# human-readable unit so the result reads like the figure shown below.
print(object.size(DT), units = "auto")
152.7 Mb
# Repeat the four-way comparison (3 replications, ordered by relative
# elapsed time); intended to be run after setkey(DT, ID) so that the
# data.table candidate operates on a keyed table.
# The expression text is what benchmark() prints in the `test` column.
# NOTE(review): ddply() requires plyr to be attached -- confirm it is loaded
# earlier in the session.
benchmark(
DT[, mean(value), by=ID],
aggregate(.~ID, data=dat, mean),
ddply(dat, .(ID), summarize, mn = mean(value)),
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 0.95 1.000
4 unlist(lapply(split(dat$value, dat$ID), mean)) 3 4.08 4.295
2 aggregate(. ~ ID, data = dat, mean) 3 91.76 96.589
3 ddply(dat, .(ID), summarize, mn = mean(value)) 3 265.15 279.105
更多行
# 2000 integer IDs with 50,000 consecutive rows each (1e8 rows in total)
dat <- data.frame(
  ID = rep(seq_len(2000), each = 50000),
  value = runif(1e8)
)
DT <- as.data.table(dat)

# Time how long it takes to key DT by its ID column
system.time(setkeyv(DT, "ID"))
user system elapsed
2.10 0.25 2.34
# object.size() auto-prints a raw byte count; ask print() to pick a
# human-readable unit so the result reads like the figure shown below.
print(object.size(dat), units = "auto")
1.1 Gb # Comfortable for a 64bit PC with 8GB RAM
# object.size() auto-prints a raw byte count; ask print() to pick a
# human-readable unit so the result reads like the figure shown below.
print(object.size(DT), units = "auto")
1.1 Gb
# Compare only the data.table and split/lapply candidates on the current
# dat/DT (aggregate and ddply are not included in this run); 3 replications
# each, ordered by relative elapsed time. The expression text is what
# benchmark() prints in the `test` column.
benchmark(
DT[, mean(value), by=ID],
unlist(lapply(split(dat$value, dat$ID), mean)),
columns=c("test", "replications", "elapsed", "relative"),
order='relative',
replications=3)
test replications elapsed relative
1 DT[, mean(value), by = ID] 3 7.30 1.000
2 unlist(lapply(split(dat$value, dat$ID), mean)) 3 184.83 25.319
使用 plyr 包的 ddply 函数可以轻松完成此类事情:
# Example data: five IDs ("A" to "E") with 20 uniform random values each
dat <- data.frame(ID = rep(LETTERS[1:5], each = 20), value = runif(100))
# Show the first rows (console prompt removed so the line parses as R code)
head(dat)
ID value
1 A 0.45800889
2 A 0.11221072
3 A 0.58833532
4 A 0.70056704
5 A 0.08337996
6 A 0.05195357
library(plyr) # ddply() and summarize() come from plyr
# Split dat by ID and compute the mean of value within each group,
# returning one row per ID with the mean in column `mn`.
ddply(dat, .(ID), summarize, mn = mean(value))
ID mn
1 A 0.4960083
2 B 0.5809681
3 C 0.4512388
4 D 0.5079790
5 E 0.5397708
如果您的数据集很大,和/或唯一 ID 的数量很大,您可以使用 data.table。有关 plyr 的更详细介绍,请参阅相关文档。