我正在根据一些相当长的字符串对变量进行重新编码,这里以字符串 A、B、C、D、E 和 G 为例。我想知道是否有一种方法可以重新编码,而不必重复引用df$foo
使用基础 R 12 次?也许我可以探索一些更聪明更快的方式?这真的是在 R 中最聪明的方法吗?
df <- data.frame(
foo = 1000:1010,
bar = letters[1:11])
df
foo bar
1 1000 a
2 1001 b
3 1002 c
4 1003 d
5 1004 e
6 1005 f
7 1006 g
8 1007 h
9 1008 i
10 1009 j
11 1010 k
A <- c(1002)
B <- c(1007, 1008)
C <- c(1001, 1003)
D <- c(1004, 1006)
E <- c(1000, 1005)
G <- c(1010, 1009)
df$foo[df$foo %in% A] <- 1
df$foo[df$foo %in% B] <- 2
df$foo[df$foo %in% C] <- 3
df$foo[df$foo %in% D] <- 4
df$foo[df$foo %in% E] <- 5
df$foo[df$foo %in% G] <- 7
df
foo bar
1 5 a
2 3 b
3 1 c
4 3 d
5 4 e
6 5 f
7 4 g
8 2 h
9 2 i
10 7 j
11 7 k
更新于 2013-03-11 05:28:061Z,
我已经重写了函数的五个解决方案,以便能够使用 microbenchmark 包对它们进行比较,结果是 Tyler Rinker 和 flodel 的解决方案是最快的解决方案(见下面的结果),并不是说这个问题仅与速度有关。我也在寻找解决方案的简洁性和智能性。出于好奇,我还使用Recode
car 包中的功能添加了一个解决方案。如果我可以以更优化的方式重写解决方案,或者 microbenchmark 包不是比较这些功能的最佳方式,请随时告诉我。
df <- data.frame(
foo = sample(1000:1010, 1e5+22, replace = TRUE),
bar = rep(letters, 3847))
str(df)
A <- c(1002)
B <- c(1007, 1008)
C <- c(1001, 1003)
D <- c(1004, 1006)
E <- c(1000, 1005)
G <- c(1010, 1009)
# juba's solution
juba <- function(df,foo) within(df, {foo[foo %in% A] <- 1; foo[foo %in% B] <- 2;foo[foo %in% C] <- 3;foo[foo %in% D] <- 4;foo[foo %in% E] <- 5;foo[foo %in% G] <- 7})
# Arun's solution
Arun <- function(df,x) factor(df[,x], levels=c(A,B,C,D,E,G), labels=c(1, rep(c(2:5, 7), each=2)))
# flodel's solution
flodel <- function(df,x) rep(c(1, 2, 3, 4, 5, 7), sapply(list(A, B, C, D, E, G), length))[match(df[,x], unlist(list(A, B, C, D, E, G)))]
# Tyler Rinker's solution
TylerRinker <- function(df,x) data.frame(vals = unlist(list(A = c(1002),B = c(1007, 1008),C = c(1001, 1003),D = c(1004, 1006),E = c(1000, 1005), G = c(1010, 1009))), labs = c(1, rep(c(2:5, 7), each=2)))[match(df[,x], unlist(list(A = c(1002),B = c(1007, 1008),C = c(1001, 1003),D = c(1004, 1006),E = c(1000, 1005), G = c(1010, 1009)))), 2]
# agstudy's solution
agstudy <- function(df,foo) merge(df,data.frame(foo=unlist(list(A, B, C, D, E, G)), val =rep((1:7)[-6],rapply(list(A, B, C, D, E, G), length))))
# Recode from the car package
ReINcar <- function(df,x) Recode(df[,x], "A='A'; B='B'; C='C'; D='D'; E='E'; G='G'")
# install.packages("microbenchmark", dependencies = TRUE)
require(microbenchmark)
# run test
res <- microbenchmark(juba(df, foo), Arun(df, 1), flodel(df, 1), TylerRinker(df,1) ,agstudy(df, foo), ReINcar(df, 1), times = 25)
There were 15 warnings (use warnings() to see them) # warning duo to x's solution
## Print results:
print(res)
号码,
Unit: milliseconds
expr min lq median uq max neval
juba(df, foo) 37.944355 39.521603 41.987174 46.385974 79.559750 25
Arun(df, 1) 23.833334 24.115776 24.648842 26.987431 55.466448 25
flodel(df, 1) 3.586179 3.637024 3.956814 6.468735 28.404166 25
TylerRinker(df, 1) 3.919563 4.115994 4.529926 5.532688 8.508956 25
agstudy(df, foo) 301.487732 324.641734 334.801005 352.753496 415.421212 25
ReINcar(df, 1) 73.655566 77.903088 81.745037 101.038791 125.158208 25
### Plot results:
boxplot(res)