相反,这种方法一次将每一列相加,然后i
从列总和中减去每一列。
#here's a pretty quick base R solution:
do.call(rbind
,tapply(seq_len(nrow(matrix))
, matrix[, 'factor']
, FUN = function(i) sweep(-matrix[i, -length(matrix)]
, 2
, colSums(matrix[i, -length(matrix)]), `+`) / (length(i)-1)
)
)
<NA> <NA> <NA> <NA> <NA> factor
x1 2 2.5 3.5 4.5 5.5 1
x2 2 4.0 5.0 6.0 7.0 1
x3 1 2.5 3.5 4.5 5.5 1
x4 6 2.0 3.0 9.0 7.0 2
x5 1 2.0 3.0 5.0 7.0 2
x6 6 4.0 3.0 1.0 8.0 3
x7 5 2.0 4.0 3.0 2.0 3
# similar but MUCH slower
do.call(rbind
, by(matrix[, -6]
, matrix[, 6]
#, function(x) sweep(-x, 2,colSums(x), FUN = '+') / (nrow(x)-1))
, function(x) mapply(`-`, colSums(x), x) / (nrow(x) - 1)) #mapply is faster
)
一些表现。请注意,@Ronak 提供了一个警告,因此我更改为dplyr
建议的代码,mutate_at(vars(-group_cols), ...)
而不是mutate_all(...)
. 此外,虽然data.table
在@akrun 中,但我正要在@akrun 发布之前对其进行编辑,基本上是@Ronakdplyr
的翻译方式。
@G。Fernando 的解决方案大约需要 20 秒,因此我将其从配置文件中排除。@akrun 的基本解决方案是最快的。我个人最喜欢@Ronak 的基础,因为它读起来很好。
#10,000 rows
#1,000 groups
Unit: milliseconds
expr min lq mean median uq max neval
dt_version 191.4823 193.7294 201.39367 200.61610 210.0798 211.0581 10
cole_base2 7688.4689 7948.5534 8159.32689 8224.02570 8358.9145 8560.0802 10
cole_base3 760.9410 761.6176 789.35789 791.22520 812.1285 822.8938 10
Ronak_base 378.2914 381.9018 403.30458 403.65600 418.5159 431.2887 10
Ronak_dplyr 7025.7606 7045.9863 7217.55143 7150.09070 7395.1977 7505.7091 10
akrun_base 26.3189 27.2791 28.90526 28.03645 29.3622 33.5207 10
#10,000 rows
#100 groups
Unit: milliseconds
expr min lq mean median uq max neval
dt_version 32.6928 33.4362 36.27415 37.34835 38.8137 39.9793 10
cole_base2 770.1962 817.3142 847.01249 846.13940 893.4095 900.8028 10
cole_base3 97.5201 101.1023 108.46434 102.01210 105.9185 165.3160 10
Ronak_base 115.7445 117.9968 128.06018 124.27730 129.9934 170.3994 10
Ronak_dplyr 721.4570 734.6108 747.46815 735.65990 756.1121 787.0906 10
akrun_base 23.3171 24.4940 26.79405 26.55190 29.1286 30.2099 10
library(data.table)
library(microbenchmark)
library(dplyr)
n_cols <- 10
n_rows <- 1E5
n_row_per_group <- 100
set.seed(1)
matrix <- matrix(sample(100, n_rows*n_cols, replace = T), ncol = n_cols)
matrix <- cbind(matrix, factor = rep(1:(n_rows / n_row_per_group), each = n_row_per_group))
df <- data.frame(matrix)
dt <- as.data.table(df)
do.call(rbind
, lapply(unique(matrix[,'factor'])
, function(x) {
sub_mat <- matrix[matrix[, 'factor'] == x,]
sweep(-sub_mat, 2, colSums(sub_mat), '+') / (nrow(sub_mat) - 1)
})
)
microbenchmark(
# cole_base = { #too slow for lots of little groups
# do.call(rbind
# ,by(matrix[, -6]
# , matrix[, 6]
# , function(x) mapply(`-`, colSums(x), x) / (nrow(x) - 1)
# )
# )
# },
dt_version = {
dt[, lapply(.SD, function(x) (sum(x) - x) / (.N - 1)) , by = 'factor']
}
,cole_base2 = {
do.call(rbind
, lapply(unique(matrix[,'factor'])
, function(x) {
sub_mat <- matrix[matrix[, 'factor'] == x,]
sweep(-sub_mat, 2, colSums(sub_mat), '+') / (nrow(sub_mat) - 1)
})
)
}
,cole_base3 = {
do.call(rbind
,tapply(seq_len(nrow(matrix))
, matrix[, 'factor']
, FUN = function(i) sweep(-matrix[i, -length(matrix)], 2, colSums(matrix[i, -length(matrix)]), `+`) / (length(i)-1)
, simplify = F)
)
}
,Ronak_base = {
lapply(df[-ncol(df)], function(x)
ave(x, df$factor, FUN = function(x) (sum(x) - x)/(length(x) - 1)))
}
# ,G_fern_base = { #pretty slow, i hardcoded the factor row - it needs fixed slightly
# do.call(rbind,
# lapply(levels(factor(matrix[,11])),function(x) {
# list=as.list(NULL)
# index=which(matrix[,11]==x)
# for(i in 1:length(index)){
# if(length(index)>2){
# list[[i]]=colSums(matrix[index[-i],])
# }else{
# list[[i]]=matrix[index[-i],]
# }
# list[[i]]=list[[i]][-11]/(length(index)-1)
# }
# return(do.call(rbind,list))
# })
# )
# }
, Ronak_dplyr = {
df %>%
group_by(factor) %>%
mutate_at(vars(-group_cols()), ~(sum(.)-.)/ (n() - 1))
}
, akrun_base = {
n1 <- tabulate(matrix[, ncol(matrix)])
m1 <- rowsum(matrix[,-ncol(matrix)], group = matrix[,ncol(matrix)])
(m1[rep(seq_len(nrow(m1)), n1),] - matrix[, -ncol(matrix)])/rep(n1 - 1, n1)
}
, times = 10
)