这个怎么样?
# Within each row, blank out every code that repeats an earlier code.
# apply() returns one column per input row, hence the transpose.
df[, 2:4] <- replace(
  df[, 2:4],
  t(apply(df[, 2:4], 1, duplicated)),
  NA
)
编辑:更快的 base R 解决方案:
# Compare every pair of code columns (column 1 is skipped) and blank out the
# later column wherever it equals the earlier one.
# seq_len(...)[-1L] gives 2..(ncol - 1) and is empty when df has fewer than
# three columns, whereas 2:(ncol(df) - 1) would count backwards and index a
# column that does not exist.
for (i in seq_len(max(ncol(df) - 1L, 0L))[-1L]) {
  # i never exceeds ncol(df) - 1 here, so this range always counts upwards.
  for (j in (i + 1L):ncol(df)) {
    chk <- df[[i]] == df[[j]]
    # Positions where chk is NA (a value already blanked) are left untouched.
    df[[j]][chk] <- NA
  }
}
下面是上述两种方法,以及 Ananda Mahto 的 reshape2 和 data.table 方法,在更大数据集上的基准测试。
使用正确的 i 和 j 索引的 for 循环似乎是最快的。
基准测试结果:
# Time the four implementations, 10 runs each.
# library() stops immediately if the package is missing; require() would only
# warn and return FALSE, deferring the failure to the microbenchmark() call.
library(microbenchmark)
microbenchmark(
  ar.f <- arun.f(df),
  ar.s <- arun.s(df),
  an.f <- ananda.ave(df),
  an.s <- ananda.dt(copy(DT)),
  times = 10
)
# Unit: milliseconds
# expr min lq median uq max neval
# arun.f(df) 4816.3937 5197.0626 6402.454 6955.9380 7534.6912 10
# arun.s(df) 114.8372 118.7971 149.284 202.6081 297.4787 10
# ananda.ave(df) 2877.7936 3288.5935 3650.660 3985.5390 4111.9064 10
# ananda.dt(copy(DT)) 3383.1229 3861.6379 4432.751 4776.6108 5368.6504 10
创建数据:
# Build the benchmark data: 20 IDs x 10,000 rows each (200,000 rows) with three
# code columns drawn uniformly from 1:10.
set.seed(1234)
df <- cbind(
  # stringsAsFactors belongs to data.frame(); the misplaced parenthesis in the
  # earlier version closed data.frame() too soon and made the statement
  # unparseable.
  data.frame(ID = rep(letters[1:20], each = 1e4), stringsAsFactors = FALSE),
  matrix(sample(1:10, 6 * 1e5, replace = TRUE), ncol = 3)
)
names(df)[2:4] <- paste0("code", 1:3)
我的第一个版本:
# Row-wise de-duplication of columns 2:4 using duplicated().
#
# df: a data frame whose columns 2 to 4 hold the codes.
# Returns df with every code that repeats an earlier code in the same row
# replaced by NA.
arun.f <- function(df) {
  code_cols <- 2:4
  codes <- df[, code_cols]
  # apply() returns one column per input row, so transpose the result back to
  # the rows-by-columns layout of `codes`.
  is_repeat <- t(apply(codes, 1, duplicated))
  codes[is_repeat] <- NA
  df[, code_cols] <- codes
  df
}
我的第二个版本:
# Column-wise de-duplication: compare every pair of code columns and blank out
# the later column wherever it equals the earlier one.
#
# df: a data frame whose first column is skipped and whose remaining columns
#     hold the codes.
# Returns df with repeated codes within a row replaced by NA. Inputs with
# fewer than three columns have no column pair to compare and are returned
# unchanged.
arun.s <- function(df) {
  n_cols <- ncol(df)
  # seq_len(...)[-1L] gives 2..(n_cols - 1) and is empty for n_cols < 3;
  # 2:(n_cols - 1) would count backwards and index a non-existent column.
  for (i in seq_len(max(n_cols - 1L, 0L))[-1L]) {
    # i never exceeds n_cols - 1 here, so this range always counts upwards.
    for (j in (i + 1L):n_cols) {
      same <- df[[i]] == df[[j]]
      # Positions where `same` is NA (a value already blanked) are left as is.
      df[[j]][same] <- NA
    }
  }
  df
}
Ananda 的 ave + reshape2 解决方案:
library(reshape2)
# De-duplicate codes within each row by reshaping to long form and back.
#
# df: a data frame with an ID column plus the code columns.
# Returns the result of dcast(): one row per ID/ID2 pair, with repeated codes
# replaced by NA.
ananda.ave <- function(df) {
  # Number the rows within each ID so that ID + ID2 identifies an input row.
  df$ID2 <- ave(df[["ID"]], df[["ID"]], FUN = seq_along)
  long <- melt(df, id.vars = c("ID", "ID2"))
  # Two long rows sharing every column except `variable` hold the same value
  # for the same input row; the later one is the repeat.
  key_cols <- setdiff(names(long), "variable")
  is_repeat <- duplicated(long[key_cols])
  long[is_repeat, "value"] <- NA
  dcast(long, ID + ID2 ~ variable)
}
Ananda 的 data.table 解决方案(稍作修改以进一步优化):
library(data.table)
# Convert the benchmark data frame into a data.table for ananda.dt().
DT <- data.table(df)
# De-duplicate codes within each row using data.table.
#
# dt: a data.table with an ID column followed by three code columns.
# Returns a new data.table with columns ID, ID2 (row number within ID) and
# code1..code3, where a code repeating an earlier code in the same row is NA.
# The result is returned visibly; the previous version ended in an assignment
# and therefore returned it invisibly.
ananda.dt <- function(dt) {
  # Long form: per ID, stack the code columns into Value and recycle the
  # within-ID row number alongside them.
  temp <- dt[, list(ID2 = seq_len(.N),
                    Value = unlist(.SD, use.names = FALSE)),
             by = "ID"]
  # A repeated (ID, ID2, Value) triple is a code seen earlier in the same row.
  temp[duplicated(temp), Value := NA]
  # Back to wide form: one list element per code column for each ID/ID2 pair.
  out <- temp[, as.list(Value), by = list(ID, ID2)]
  setnames(out, 3:5, paste0("code", 1:3))
  out
}