0

我有一个包含 1000 万行的数据框 df。我想将“生日”列的字符格式从“xxxxxxxx”转换为“xxxx-xx-xx”,例如从“20051023”转换为“2005-10-23”。我可以用 df$birthday <- lapply(df$birthday, as.Date, "%Y%m%d") 来实现,但这种数据转换会浪费大量内存和计算时间。而且,我只想得到类似日期格式的字符串,而不是日期类型。因此我改用 stringi 包,因为它是用 C 语言编写的。不幸的是,df$birthday <- stri_join(stri_sub(df$birthday, from=c(1,5,7), to=c(4,6,8)), collapse = "-") 不起作用,因为该函数不支持这样的向量输入。有没有办法解决这个问题?非常感谢。

4

2 回答 2

1

使用 sub() 即可。

# Insert dashes into yyyymmdd strings with one vectorised regex substitution.
date <- c("20051023", "20151023")
sub(
  pattern = "^(\\d{4})(\\d{2})(\\d{2})$",
  replacement = "\\1-\\2-\\3",
  x = date
)
# [1] "2005-10-23" "2015-10-23"
于 2015-10-09T16:31:53.370 回答
1

as.Date 本身就是向量化的,可以直接作用于整个向量

 # Parse the yyyymmdd strings as dates in one vectorised call, then format
 # them back into "yyyy-mm-dd" character strings.
 df$birthday <- format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m-%d")

向量化函数比 apply 系列函数(lapply/sapply)快得多

# Benchmark: element-by-element conversion (lapply/sapply) versus a single
# vectorised as.Date() call on a column of n identical yyyymmdd strings.
library(microbenchmark)
n <- 1e3
df <- data.frame(birthday = rep("20051023", n))
# The expressions are unnamed, so microbenchmark labels each result row with
# the expression text itself.
microbenchmark(
  lapply(df$birthday, as.Date, "%Y%m%d"),
  sapply(df$birthday, as.Date, "%Y%m%d"),
  as.Date(df$birthday, "%Y%m%d")
)

 Unit: microseconds
                                   expr       min        lq       mean    median        uq       max neval cld
 lapply(df$birthday, as.Date, "%Y%m%d") 22833.624 25340.118 29064.7188 28406.154 32346.245 58522.360   100   b
 sapply(df$birthday, as.Date, "%Y%m%d") 24048.493 26252.660 29797.9074 28437.156 33119.381 47966.133   100   b
         as.Date(df$birthday, "%Y%m%d")   431.469   447.719   481.5221   461.189   475.086  1984.158   100  a 

正则表达式的速度还要更快。

# Compare three ways of producing "yyyy-mm-dd" strings from yyyymmdd input:
# two that round-trip through Date and one pure regex substitution.
microbenchmark(
  as.character(as.Date(df$birthday, "%Y%m%d")),
  # The output format must be "%Y-%m-%d" so that this expression yields the
  # same yyyy-mm-dd strings as the other two.
  format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m-%d"),
  sub("^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3", df$birthday)
)

Unit: microseconds
                                                                      expr      min       lq     mean
                              as.character(as.Date(df$birthday, "%Y%m%d")) 4923.189 5057.462 5390.313
                        format(as.Date(df$birthday, "%Y%m%d"), "%Y-%m%-d") 3428.657 3553.736 3697.660
 sub("^(\\\\d{4})(\\\\d{2})(\\\\d{2})$", "\\\\1-\\\\2-\\\\3", df$birthday)  713.699  739.997  815.737
    median        uq      max neval cld
 5150.0420 5394.4265 8225.270   100   c
 3594.7875 3665.9865 5753.200   100  b 
  763.0885  783.1865 2433.585   100 a 

sub() 可以作用于矩阵,但不能直接作用于 data.frame,因此需要先用 as.matrix() 进行转换。

# Build a 3 x 3 data frame of yyyymmdd strings plus an integer ID column.
df <- as.data.frame(matrix("20051023", nrow = 3, ncol = 3))
df$ID <- seq_len(nrow(df))
# Convert the date columns to a character matrix, substitute over the whole
# matrix in one call, and write the result back into the same columns.
date_cols <- 1:3
date_matrix <- as.matrix(df[, date_cols])
df[, date_cols] <- sub(
  "^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3",
  date_matrix
)

矩阵解决方案比 for 循环更快。差异随着您需要循环的列数而增加。

# Twenty date columns plus an ID column, used to compare one matrix-wide
# substitution against a column-by-column loop.
df <- as.data.frame(matrix("20051023", nrow = 3, ncol = 20))
df$ID <- seq_len(nrow(df))
library(microbenchmark)
microbenchmark(
  # A single sub() call over every column except the trailing ID column.
  matrix = {
    date_cols <- seq_len(ncol(df) - 1)
    df[, date_cols] <- sub(
      "^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3",
      as.matrix(df[, date_cols])
    )
  },
  # One sub() call per column, again skipping the trailing ID column.
  forloop = {
    for (col_idx in seq_len(ncol(df) - 1)) {
      df[, col_idx] <- sub(
        "^(\\d{4})(\\d{2})(\\d{2})$", "\\1-\\2-\\3",
        df[, col_idx]
      )
    }
  }
)

Unit: microseconds
    expr      min       lq      mean    median       uq      max neval cld
  matrix  460.555  476.805  504.3012  494.1235  507.594 1122.522   100  a 
 forloop 1554.425 1590.774 1677.3038 1625.8390 1670.312 3563.845   100   b
于 2015-10-09T16:47:01.143 回答