5

我有这个字符向量:

dput(t$line)
c("0304", "0305", "0306", "0308", "0311", "0313", "0314", "0316", 
"0318", "0321", "0322", "0323", "0324", "0326", "0327", "0330", 
"0333", "0337", "0338", "0339", "0342", "0341", "0344", "0346", 
"0347", "0348", "0349", "0350", "0352", "0353", "0357", "0359", 
"0360", "0362", "0363", "0364", "0365", "0367", "0371", "0370", 
"0373", "0375", "0378", "0380", "0381", "0385", "0386", "0387", 
"0391", "0395", "0394", "0397", "0398", "0399", "0400", "0402", 
"0404", "0405", "0406", "0408", "0412", "0416", "0419", "0423", 
"0424", "0425", "0426", "0428", "0429", "0432", "0433", "0436", 
"0435", "0439", "0437", "0440", "0441")

它包含的数字并不完全连续。我想把它补成连续的序列,同时在需要的地方保留一个或多个前导零。我想出了这个解决方案:

# Asker's attempt: prefixes a literal "0" to a run of consecutive numbers that
# starts at the first element of t$line.
# NOTE(review): the end point is start + length(t$line), so the run contains
# length(t$line) + 1 values, i.e. one more than the input vector.
paste("0", seq(as.numeric(t$line[1]), as.numeric(t$line[1]) + length(t$line), 1), sep = "")
 [1] "0304" "0305" "0306" "0307" "0308" "0309" "0310" "0311" "0312" "0313" "0314" "0315" "0316" "0317" "0318" "0319" "0320"
[18] "0321" "0322" "0323" "0324" "0325" "0326" "0327" "0328" "0329" "0330" "0331" "0332" "0333" "0334" "0335" "0336" "0337"
[35] "0338" "0339" "0340" "0341" "0342" "0343" "0344" "0345" "0346" "0347" "0348" "0349" "0350" "0351" "0352" "0353" "0354"
[52] "0355" "0356" "0357" "0358" "0359" "0360" "0361" "0362" "0363" "0364" "0365" "0366" "0367" "0368" "0369" "0370" "0371"
[69] "0372" "0373" "0374" "0375" "0376" "0377" "0378" "0379" "0380" "0381"

只要恰好需要添加一个 0,这种方法就可行。然而,实际可能有多个前导零,也可能根本没有。如何在生成连续序列的同时补上数量正确的前导零?

4

3 回答 3

3

使用 stringr 的一种做法是:

# Build length(x) consecutive numbers starting at the numeric minimum of x and
# left-pad them with "0". The target width is taken from the widest element of
# x instead of a fixed 4, so inputs with more (or no) leading zeros also work.
str_pad(seq.int(min(as.numeric(x)), length.out = length(x)), max(nchar(x)), "left", "0")

 [1] "0304" "0305" "0306" "0307" "0308" "0309" "0310" "0311" "0312" "0313" "0314" "0315" "0316"
[14] "0317" "0318" "0319" "0320" "0321" "0322" "0323" "0324" "0325" "0326" "0327" "0328" "0329"
[27] "0330" "0331" "0332" "0333" "0334" "0335" "0336" "0337" "0338" "0339" "0340" "0341" "0342"
[40] "0343" "0344" "0345" "0346" "0347" "0348" "0349" "0350" "0351" "0352" "0353" "0354" "0355"
[53] "0356" "0357" "0358" "0359" "0360" "0361" "0362" "0363" "0364" "0365" "0366" "0367" "0368"
[66] "0369" "0370" "0371" "0372" "0373" "0374" "0375" "0376" "0377" "0378" "0379" "0380"
于 2020-09-26T14:46:31.547 回答
2

您需要一个从 min(x) 开始、长度为 length(x) 的连续序列,并且结果中各元素的 nchar 与 x 中元素的相同。

使用 sprintf 而不是 paste0 来格式化前导零。nchar(x)[1] 给出(必要时)需要用零填充到的长度。如果不能保证所有元素的长度相同,请改用 max(nchar(x)),但这会更慢。

因为 x[1] 不一定是最小值,您可能希望使用 min(as.numeric(x)) 作为起点。使用 seq 时,终点应该是 min(as.numeric(x)) + length(x) - 1(因为最小值已经是第一个元素)。或者使用 length.out=length(x),这看起来更快;再结合 seq.int 还会更快。

# Build a "%0<w>d" format string where <w> is the character width of the first
# element of x, then format length(x) consecutive values starting at the
# numeric minimum of x.
sprintf(paste0("%0", nchar(x)[1], "d"), seq.int(min(as.numeric(x)), length.out=length(x)))
# [1] "0304" "0305" "0306" "0307" "0308" "0309" "0310" "0311" "0312" "0313" "0314" "0315"
# [13] "0316" "0317" "0318" "0319" "0320" "0321" "0322" "0323" "0324" "0325" "0326" "0327"
# [25] "0328" "0329" "0330" "0331" "0332" "0333" "0334" "0335" "0336" "0337" "0338" "0339"
# [37] "0340" "0341" "0342" "0343" "0344" "0345" "0346" "0347" "0348" "0349" "0350" "0351"
# [49] "0352" "0353" "0354" "0355" "0356" "0357" "0358" "0359" "0360" "0361" "0362" "0363"
# [61] "0364" "0365" "0366" "0367" "0368" "0369" "0370" "0371" "0372" "0373" "0374" "0375"
# [73] "0376" "0377" "0378" "0379" "0380"

另一种选择是使用冒号运算符 `:`,但上面的 seq.int 似乎更快(见下面的基准)。

# Colon variant: the offsets 0, 1, ..., length(x) - 1 are added to the numeric
# minimum of x; the zero-padded width is that of the first element of x.
sprintf(paste0("%0", nchar(x)[1], "d"), 0:(length(x) - 1) + min(as.numeric(x)))

注意:如果要通过填补缺失的值来补全原始向量(覆盖从最小值到最大值的整个范围),可以这样做:

# Fill every gap between the smallest and largest value: range() yields
# c(min, max) of the numeric values and do.call(`:`, ...) expands that pair
# into min:max. The result is zero-padded to the widest element of x.
sprintf(paste0("%0", max(nchar(x)), "d"), do.call(`:`, as.list(range(as.numeric(x)))))
# [1] "0304" "0305" "0306" "0307" "0308" "0309" "0310" "0311" "0312" "0313" "0314"
# [12] "0315" "0316" "0317" "0318" "0319" "0320" "0321" "0322" "0323" "0324" "0325"
# [23] "0326" "0327" "0328" "0329" "0330" "0331" "0332" "0333" "0334" "0335" "0336"
# [34] "0337" "0338" "0339" "0340" "0341" "0342" "0343" "0344" "0345" "0346" "0347"
# [45] "0348" "0349" "0350" "0351" "0352" "0353" "0354" "0355" "0356" "0357" "0358"
# [56] "0359" "0360" "0361" "0362" "0363" "0364" "0365" "0366" "0367" "0368" "0369"
# [67] "0370" "0371" "0372" "0373" "0374" "0375" "0376" "0377" "0378" "0379" "0380"
# [78] "0381" "0382" "0383" "0384" "0385" "0386" "0387" "0388" "0389" "0390" "0391"
# [89] "0392" "0393" "0394" "0395" "0396" "0397" "0398" "0399" "0400" "0401" "0402"
# [100] "0403" "0404" "0405" "0406" "0407" "0408" "0409" "0410" "0411" "0412" "0413"
# [111] "0414" "0415" "0416" "0417" "0418" "0419" "0420" "0421" "0422" "0423" "0424"
# [122] "0425" "0426" "0427" "0428" "0429" "0430" "0431" "0432" "0433" "0434" "0435"
# [133] "0436" "0437" "0438" "0439" "0440" "0441"

基准

# Benchmark variant "seq_to": seq() with an explicit end point
# (min + length(x) - 1); pad width is max(nchar(x)).
# Takes no arguments and reads `x` from the enclosing environment.
f1 <- function() sprintf(paste0("%0", max(nchar(x)), "d"),
                         seq(min(as.numeric(x)), min(as.numeric(x)) + length(x) - 1))
# Benchmark variant "seq_len": seq() with length.out = length(x) instead of an
# explicit end point; pad width is max(nchar(x)).
# Takes no arguments and reads `x` from the enclosing environment.
f2 <- function() sprintf(paste0("%0", max(nchar(x)), "d"),
                         seq(min(as.numeric(x)), length.out=length(x)))
# Benchmark variant "seq.int": seq.int() with length.out = length(x); pad width
# is max(nchar(x)), computed over the whole vector.
# Takes no arguments and reads `x` from the enclosing environment.
f3 <- function() sprintf(paste0("%0", max(nchar(x)), "d"),
                         seq.int(min(as.numeric(x)), length.out=length(x)))
# Benchmark variant "seq.int1": seq.int() with length.out = length(x); pad
# width comes from the first element only, nchar(x[1]), not from the maximum.
# Takes no arguments and reads `x` from the enclosing environment.
f31 <- function() sprintf(paste0("%0", nchar(x[1]), "d"),
                         seq.int(min(as.numeric(x)), length.out=length(x)))
# Benchmark variant "colon": offsets 0:(length(x) - 1) added to the numeric
# minimum; pad width comes from the first element, nchar(x[1]).
# Takes no arguments and reads `x` from the enclosing environment.
f4 <- function() sprintf(paste0("%0", nchar(x[1]), "d"),
                         0:(length(x) - 1) + min(as.numeric(x)))
# Benchmark variant "stringr": the same seq.int() run, left-padded with "0" by
# stringr::str_pad() to the width of the first element, nchar(x[1]).
# Takes no arguments and reads `x` from the enclosing environment.
f5 <- function() stringr::str_pad(seq.int(min(as.numeric(x)),
                                          length.out=length(x)),
                                  nchar(x[1]), "left", "0")

# Fix the RNG seed so the shuffled benchmark input is reproducible.
set.seed(5789)
# Benchmark input: the numbers 1..99999 formatted as width-5 zero-padded
# strings, in random order.
x <- sample(sprintf("%05d", 1:99999))

# Time every variant; each label on the left names the function it calls.
microbenchmark::microbenchmark(seq_to=f1(), seq_len=f2(), seq.int=f3(),
                               seq.int1=f31(), colon=f4(), stringr=f5())
# Unit: milliseconds
#     expr       min        lq      mean    median        uq       max neval    cld
#   seq_to 104.22119 106.83928 108.92791 107.81301 109.68406 124.35686   100      f
#  seq_len  87.14385  89.89180  92.34962  90.97192  92.09823 110.59426   100    d  
#  seq.int  85.72324  87.93885  89.91353  89.03327  90.32758 113.41480   100   c   
# seq.int1  59.54312  61.63065  62.86618  62.47707  63.53334  76.33471   100 a     
#    colon  60.94867  63.16109  64.73306  63.88925  64.79997  81.63646   100  b    
#  stringr  99.08452 101.56649 104.01522 102.74420 104.20269 158.30948   100     e 
于 2020-09-26T14:49:52.320 回答
2

一个更通用的解决方案:它以条目的最大长度为准,从而自动确定前导零的数量:

t$line2 <- c("000517","00524")

#' Fill the gaps in a vector of zero-padded number strings
#'
#' Converts `vec` to numbers, generates every integer from the smallest to the
#' largest value, and formats the result with leading zeros so that each
#' element is as wide as the widest input string.
#'
#' @param vec Character vector of digit strings, e.g. `c("000517", "00524")`.
#' @return Character vector covering min..max of `vec`, zero-padded to
#'   `max(nchar(vec))`; `character(0)` when `vec` is empty.
Cont.PadZero <- function(vec) {
  # An empty input has no minimum/maximum: min()/max() would yield Inf/-Inf
  # and seq.int() would fail, so return an empty result instead.
  if (length(vec) == 0L) {
    return(character(0))
  }
  # Convert once and take both bounds from the same numeric vector.
  bounds <- range(as.numeric(vec))
  width <- max(nchar(vec))
  sprintf(paste0("%0", width, "d"), seq.int(bounds[1], bounds[2]))
}

Cont.PadZero(t$line2)
[1] "000517" "000518" "000519" "000520" "000521" "000522" "000523" "000524"

于 2020-09-26T14:50:19.880 回答