r - 优化基础 R 中的重新编码

Question

我正在根据一些相当长的字符串对变量进行重新编码，这里以字符串 A、B、C、D、E 和 G 为例。我想知道是否有一种方法可以重新编码，而不必重复引用df$foo使用基础 R 12 次？也许我可以探索一些更聪明更快的方式？这真的是在 R 中最聪明的方法吗？

df <- data.frame(
  foo = 1000:1010,
  bar = letters[1:11])
df  
    foo bar
1  1000   a
2  1001   b
3  1002   c
4  1003   d
5  1004   e
6  1005   f
7  1006   g
8  1007   h
9  1008   i
10 1009   j
11 1010   k

A  <- c(1002)
B  <- c(1007, 1008)
C  <- c(1001, 1003)
D  <- c(1004, 1006)
E  <- c(1000, 1005)
G  <- c(1010, 1009)

df$foo[df$foo %in% A] <- 1
df$foo[df$foo %in% B] <- 2
df$foo[df$foo %in% C] <- 3
df$foo[df$foo %in% D] <- 4
df$foo[df$foo %in% E] <- 5
df$foo[df$foo %in% G] <- 7
df
   foo bar
1    5   a
2    3   b
3    1   c
4    3   d
5    4   e
6    5   f
7    4   g
8    2   h
9    2   i
10   7   j
11   7   k

更新于 2013-03-11 05:28:061Z,

我已经重写了函数的五个解决方案，以便能够使用 microbenchmark 包对它们进行比较，结果是 Tyler Rinker 和 flodel 的解决方案是最快的解决方案（见下面的结果），并不是说这个问题仅与速度有关。我也在寻找解决方案的简洁性和智能性。出于好奇，我还使用Recodecar 包中的功能添加了一个解决方案。如果我可以以更优化的方式重写解决方案，或者 microbenchmark 包不是比较这些功能的最佳方式，请随时告诉我。

df <- data.frame(
  foo = sample(1000:1010, 1e5+22, replace = TRUE),
  bar = rep(letters, 3847))
str(df)

A  <- c(1002)
B  <- c(1007, 1008)
C  <- c(1001, 1003)
D  <- c(1004, 1006)
E  <- c(1000, 1005)
G  <- c(1010, 1009)

# juba's solution
juba <- function(df,foo) within(df, {foo[foo %in% A] <- 1; foo[foo %in% B] <- 2;foo[foo %in% C] <- 3;foo[foo %in% D] <- 4;foo[foo %in% E] <- 5;foo[foo %in% G] <- 7})
# Arun's solution
Arun <- function(df,x) factor(df[,x], levels=c(A,B,C,D,E,G), labels=c(1, rep(c(2:5, 7), each=2)))
# flodel's solution
flodel <- function(df,x) rep(c(1, 2, 3, 4, 5, 7), sapply(list(A, B, C, D, E, G), length))[match(df[,x], unlist(list(A, B, C, D, E, G)))]
# Tyler Rinker's solution
TylerRinker <- function(df,x)  data.frame(vals = unlist(list(A  = c(1002),B  = c(1007, 1008),C  = c(1001, 1003),D  = c(1004, 1006),E  = c(1000, 1005), G = c(1010, 1009))), labs = c(1, rep(c(2:5, 7), each=2)))[match(df[,x], unlist(list(A  = c(1002),B  = c(1007, 1008),C  = c(1001, 1003),D  = c(1004, 1006),E  = c(1000, 1005), G = c(1010, 1009)))), 2] 
# agstudy's solution
agstudy <- function(df,foo) merge(df,data.frame(foo=unlist(list(A, B, C, D, E, G)), val =rep((1:7)[-6],rapply(list(A, B, C, D, E, G), length))))
# Recode from the car package
ReINcar <- function(df,x) Recode(df[,x], "A='A'; B='B'; C='C'; D='D'; E='E'; G='G'")

# install.packages("microbenchmark", dependencies = TRUE)
require(microbenchmark)

# run test
res <- microbenchmark(juba(df, foo), Arun(df, 1), flodel(df, 1), TylerRinker(df,1) ,agstudy(df, foo), ReINcar(df, 1), times = 25)
There were 15 warnings (use warnings() to see them) # warning duo to x's solution

## Print results:
print(res)

号码，

   Unit: milliseconds
                   expr        min         lq     median         uq        max neval
          juba(df, foo)  37.944355  39.521603  41.987174  46.385974  79.559750    25
            Arun(df, 1)  23.833334  24.115776  24.648842  26.987431  55.466448    25
          flodel(df, 1)   3.586179   3.637024   3.956814   6.468735  28.404166    25
     TylerRinker(df, 1)   3.919563   4.115994   4.529926   5.532688   8.508956    25
       agstudy(df, foo) 301.487732 324.641734 334.801005 352.753496 415.421212    25
         ReINcar(df, 1)  73.655566  77.903088  81.745037 101.038791 125.158208    25


### Plot results:
boxplot(res)

微基准测试结果的箱线图，

微基准测试结果的箱线图

score 5 · Accepted Answer

这是一种通用（可扩展）方法，也非常快：

sets <- list(A, B, C, D, E, G)
vals <-    c(1, 2, 3, 4, 5, 7)

keys   <- unlist(sets)
values <- rep(vals, sapply(sets, length))
df$foo <- values[match(df$foo, keys)]

score 4 · Accepted Answer

Using within can help you save some keystrokes :

df <- within(df,
       {foo[foo %in% A] <- 1;
        foo[foo %in% B] <- 2;
        foo[foo %in% C] <- 3;
        foo[foo %in% D] <- 4;
        foo[foo %in% E] <- 5;
        foo[foo %in% G] <- 7})

score 3 · Accepted Answer

你也可以这样做：（已编辑）

> df$foo <- factor(df$foo, levels=c(A,B,C,D,E,G), labels=c(1, rep(c(2:5, 7), each=2)))

# Warning message:
# In `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels) else paste0(labels,  :
#   duplicated levels will not be allowed in factors anymore

#    foo bar
# 1    5   a
# 2    3   b
# 3    1   c
# 4    3   d
# 5    4   e
# 6    5   f
# 7    4   g
# 8    2   h
# 9    2   i
# 10   7   j
# 11   7   k

score 3 · Accepted Answer

我的方法（失去 A、B、C ......但我看到弗洛德尔的方法非常相似）。

keyL <- list(
    A  = c(1002),
    B  = c(1007, 1008),
    C  = c(1001, 1003),
    D  = c(1004, 1006),
    E  = c(1000, 1005),
    G  = c(1010, 1009)
)

key <- data.frame(vals = unlist(keyL), labs = c(1, rep(c(2:5, 7), each=2)))

df$foo2 <- key[match(df$foo, key$vals), 2]

我不喜欢写旧专栏，所以创建了一个新专栏。我还将密钥存储为命名列表。

score 2 · Accepted Answer

另一种选择是使用merge，非常类似于@flodel 和@Tyler 方法

sets <- list(A, B, C, D, E, G)
df.code = data.frame(foo=unlist(sets),
                     val =rep((1:7)[-6],rapply(sets, length)))
> merge(df,df.code)
    foo bar val
1  1000   a   5
2  1001   b   3
3  1002   c   1
4  1003   d   3
5  1004   e   4
6  1005   f   5
7  1006   g   4
8  1007   h   2
9  1008   i   2
10 1009   j   7
11 1010   k   7

score 1 · Accepted Answer

我认为这可以满足您的要求，尽管使用的格式略有不同。这可能是最快的方法。

library(data.table)

## Create the sample data:
dt <- data.table(foo=sample(1000:1010, 1e5+22, replace = TRUE), bar=rep(letters, 3847), key="foo")

## Create the table that maps the old value of foo to the new one:
dt.recode<-data.table(foo_old=1000:1010, foo_new=c(5L, 3L, 1L, 3L, 4L, 5L, 4L, 2L, 2L, 7L, 7L), key="foo_old")

## Show the result of the join/merge between the original and recoded table:
## (not necesary if you only want to update the original table)
dt[dt.recode]
##       foo bar foo_new
##  1: 1000   a       5
##  2: 1001   b       3
##  3: 1002   c       1
##  4: 1003   d       3
##  5: 1004   e       4
##  6: 1005   f       5
##  7: 1006   g       4
##  8: 1007   h       2
##  9: 1008   i       2
## 10: 1009   j       7
## 11: 1010   k       7

## Same as above, but updates the value of foo in the original table:
dt[dt.recode,foo:=foo_new][]
##     foo bar
##  1:   5   a
##  2:   3   b
##  3:   1   c
##  4:   3   d
##  5:   4   e
##  6:   5   f
##  7:   4   g
##  8:   2   h
##  9:   2   i
## 10:   7   j
## 11:   7   k

如果您愿意，而不是从头开始创建数据表，以下是如何将您的数据框转换为数据表（并添加稍后连接所需的键）：

dt <- as.data.table(df)
setkey(dt,foo)

我不确定你想如何使用这种方法计算时间，但假设 dt 和 dt.recode 已经存在并且已经被键入，然后运行更新表的单行显示我的系统上的 0 已用时间。

此外，如果您的 A、B、C、D、E、G 组有任何内在含义，我会将它们作为一列添加到您的原始表中。然后你可以加入这个字段，dt.recode 只需要 6 行（假设你有六个组）。

r - 优化基础 R 中的重新编码

更新于 2013-03-11 05:28:061Z,

微基准测试结果的箱线图，

6 回答 6

Related

Reference