optimization - 在 R 中优化 Verhoeff 算法

Question

我编写了以下函数来计算 R 中的校验位。

verhoeffCheck <- function(x)
{
## calculates check digit based on Verhoeff algorithm
## note that due to the way strsplit works, to call for vector x, use sapply(x,verhoeffCheck)

## check for string since leading zeros with numbers will be lost
if (class(x)!="character"){stop("Must enter a string")}

#split and convert to numbers
digs <- strsplit(x,"")[[1]]
digs <- as.numeric(digs)

digs <- rev(digs)   ## right to left algorithm

## tables required for D_5 group

d5_mult <- matrix(c(
                 0:9,
                 c(1:4,0,6:9,5),
                 c(2:4,0:1,7:9,5:6),
                 c(3:4,0:2,8:9,5:7),
                 c(4,0:3,9,5:8),
                 c(5,9:6,0,4:1),
                 c(6:5,9:7,1:0,4:2),
                 c(7:5,9:8,2:0,4:3),
                 c(8:5,9,3:0,4),
                 9:0
                 ),10,10,byrow=T)

d5_perm <- matrix(c(
                 0:9,
                 c(1,5,7,6,2,8,3,0,9,4),
                 c(5,8,0,3,7,9,6,1,4,2),
                 c(8,9,1,6,0,4,3,5,2,7),
                 c(9,4,5,3,1,2,6,8,7,0),
                 c(4,2,8,6,5,7,3,9,0,1),
                 c(2,7,9,3,8,0,6,4,1,5),
                 c(7,0,4,6,9,1,3,2,5,8)
                 ),8,10,byrow=T)

d5_inv <- c(0,4:1,5:9)

## apply algoritm - note 1-based indexing in R
d <- 0

for (i in 1:length(digs)){
    d <- d5_mult[d+1,(d5_perm[(i%%8)+1,digs[i]+1])+1]
    }

d5_inv[d+1]
}

为了在字符串向量上运行，sapply必须使用。这部分是因为使用了strsplit，它返回一个向量列表。即使对于中等大小的输入，这也会影响性能。

这个函数如何向量化？

我也知道必须在每次迭代中创建表会损失一些性能。将这些存储在新环境中会是更好的解决方案吗？

score 4 · Accepted Answer

我们首先定义查找矩阵。我已经以一种应该使它们更容易检查参考的方式布置它们，例如 http://en.wikipedia.org/wiki/Verhoeff_algorithm。

d5_mult <- matrix(as.integer(c(
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
  1, 2, 3, 4, 0, 6, 7, 8, 9, 5,
  2, 3, 4, 0, 1, 7, 8, 9, 5, 6,
  3, 4, 0, 1, 2, 8, 9, 5, 6, 7,
  4, 0, 1, 2, 3, 9, 5, 6, 7, 8,
  5, 9, 8, 7, 6, 0, 4, 3, 2, 1,
  6, 5, 9, 8, 7, 1, 0, 4, 3, 2,
  7, 6, 5, 9, 8, 2, 1, 0, 4, 3,
  8, 7, 6, 5, 9, 3, 2, 1, 0, 4,
  9, 8, 7, 6, 5, 4, 3, 2, 1, 0
)), ncol = 10, byrow = TRUE)

d5_perm <- matrix(as.integer(c(
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
  1, 5, 7, 6, 2, 8, 3, 0, 9, 4,
  5, 8, 0, 3, 7, 9, 6, 1, 4, 2,
  8, 9, 1, 6, 0, 4, 3, 5, 2, 7,
  9, 4, 5, 3, 1, 2, 6, 8, 7, 0,
  4, 2, 8, 6, 5, 7, 3, 9, 0, 1,
  2, 7, 9, 3, 8, 0, 6, 4, 1, 5,
  7, 0, 4, 6, 9, 1, 3, 2, 5, 8
)), ncol = 10, byrow = TRUE)

d5_inv <- as.integer(c(0, 4, 3, 2, 1, 5, 6, 7, 8, 9))

接下来，我们将定义检查函数，并使用测试输入进行尝试。我尽可能地遵循了维基百科的推导。

p <- function(i, n_i) {
  d5_perm[(i %% 8) + 1, n_i + 1] + 1
}
d <- function(c, p) {
  d5_mult[c + 1, p]
}

verhoeff <- function(x) {
  #split and convert to numbers
  digs <- strsplit(as.character(x), "")[[1]]
  digs <- as.numeric(digs)
  digs <- rev(digs)   ## right to left algorithm

  ## apply algoritm - note 1-based indexing in R
  c <- 0
  for (i in 1:length(digs)) {
    c <- d(c, p(i, digs[i]))
  }

  d5_inv[c + 1]
}
verhoeff(142857)

## [1] 0

这个函数基本上是迭代的，因为每次迭代都取决于前一次的值。这意味着我们不太可能在 R 中进行向量化，所以如果我们想要向量化，我们需要使用 Rcpp。

然而，在我们转向之前，我们是否可以更快地进行初始拆分值得探索。首先，我们做一些微基准测试，看看是否值得打扰：

library(microbenchmark)
digits <- function(x) {
  digs <- strsplit(as.character(x), "")[[1]]
  digs <- as.numeric(digs)
  rev(digs)
}

microbenchmark(
  digits(142857),
  verhoeff(142857)
)

## Unit: microseconds
##              expr   min    lq median    uq   max neval
##    digits(142857) 11.30 12.01  12.43 12.85 28.79   100
##  verhoeff(142857) 32.24 33.81  34.66 35.47 95.85   100

它看起来像它！在我的电脑上，verhoeff_prepare()大约占运行时间的 50%。在 stackoverflow 上稍作搜索揭示了另一种将数字转换为数字的方法：

digits2 <- function(x) {
   n <- floor(log10(x))
   x %/% 10^(0:n) %% 10
}
digits2(12345)

## [1] 5 4 3 2 1

microbenchmark(
  digits(142857),
  digits2(142857)
)

## Unit: microseconds
##             expr    min     lq median     uq   max neval
##   digits(142857) 11.495 12.102 12.468 12.834 79.60   100
##  digits2(142857)  2.322  2.784  3.358  3.561 13.69   100

digits2()比它快很多，digits()但它对整个运行时的影响有限。

verhoeff2 <- function(x) {
  digs <- digits2(x)

  c <- 0
  for (i in 1:length(digs)) {
    c <- d(c, p(i, digs[i]))
  }

  d5_inv[c + 1]
}
verhoeff2(142857)

## [1] 0

microbenchmark(
  verhoeff(142857),
  verhoeff2(142857)
)

## Unit: microseconds
##               expr   min    lq median    uq   max neval
##   verhoeff(142857) 33.06 34.49  35.19 35.92 73.38   100
##  verhoeff2(142857) 20.98 22.58  24.05 25.28 48.69   100

为了让它更快，我们可以尝试 C++。

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
int verhoeff3_c(IntegerVector digits, IntegerMatrix mult, IntegerMatrix perm,
                IntegerVector inv) {
  int n = digits.size();
  int c = 0;

  for(int i = 0; i < n; ++i) {
    int p = perm(i % 8, digits[i]);
    c = mult(c, p);
  }

  return inv[c];
}

verhoeff3 <- function(x) {
  verhoeff3_c(digits(x), d5_mult, d5_perm, d5_inv)
}
verhoeff3(142857)

## [1] 3

microbenchmark(
  verhoeff2(142857),
  verhoeff3(142857)
)

## Unit: microseconds
##               expr   min    lq median    uq   max neval
##  verhoeff2(142857) 21.00 22.85  25.53 27.11 63.71   100
##  verhoeff3(142857) 16.75 17.99  18.87 19.64 79.54   100

这并没有带来太大的改善。如果我们将数字传递给 C++ 并在循环中处理数字，也许我们可以做得更好：

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
int verhoeff4_c(int number, IntegerMatrix mult, IntegerMatrix perm,
                IntegerVector inv) {
  int c = 0;
  int i = 0;

  for (int i = 0; number > 0; ++i, number /= 10) {
    int p = perm(i % 8, number % 10);
    c = mult(c, p);
  }

  return inv[c];
}

verhoeff4 <- function(x) {
  verhoeff4_c(x, d5_mult, d5_perm, d5_inv)
}
verhoeff4(142857)

## [1] 3

microbenchmark(
  verhoeff2(142857),
  verhoeff3(142857),
  verhoeff4(142857)
)

## Unit: microseconds
##               expr    min     lq median     uq   max neval
##  verhoeff2(142857) 21.808 24.910 26.838 27.797 64.22   100
##  verhoeff3(142857) 17.699 18.742 19.599 20.764 81.67   100
##  verhoeff4(142857)  3.143  3.797  4.095  4.396 13.21   100

我们得到了回报：verhoeff4()大约比 verhoeff2().

score 1 · Accepted Answer

如果您的输入字符串可以包含不同数量的字符，那么我看不到任何回合lapply调用（或plyr等效项）。诀窍是将它们移动到函数内部，因此verhoeffCheck可以接受向量输入。这样，您只需要创建一次矩阵。

verhoeffCheckNew <- function(x)
{
## calculates check digit based on Verhoeff algorithm

## check for string since leading zeros with numbers will be lost
  if (!is.character(x)) stop("Must enter a string")

  #split and convert to numbers
  digs <- strsplit(x, "")
  digs <- lapply(digs, function(x) rev(as.numeric(x)))

  ## tables required for D_5 group
  d5_mult <- matrix(c(
                   0:9,
                   c(1:4,0,6:9,5),
                   c(2:4,0:1,7:9,5:6),
                   c(3:4,0:2,8:9,5:7),
                   c(4,0:3,9,5:8),
                   c(5,9:6,0,4:1),
                   c(6:5,9:7,1:0,4:2),
                   c(7:5,9:8,2:0,4:3),
                   c(8:5,9,3:0,4),
                   9:0
                   ),10,10,byrow=T)

  d5_perm <- matrix(c(
                   0:9,
                   c(1,5,7,6,2,8,3,0,9,4),
                   c(5,8,0,3,7,9,6,1,4,2),
                   c(8,9,1,6,0,4,3,5,2,7),
                   c(9,4,5,3,1,2,6,8,7,0),
                   c(4,2,8,6,5,7,3,9,0,1),
                   c(2,7,9,3,8,0,6,4,1,5),
                   c(7,0,4,6,9,1,3,2,5,8)
                   ),8,10,byrow=T)

  d5_inv <- c(0,4:1,5:9)

  ## apply algorithm - note 1-based indexing in R      
  sapply(digs, function(x)
  {
    d <- 0  
    for (i in 1:length(x)){
        d <- d5_mult[d + 1, (d5_perm[(i %% 8) + 1, x[i] + 1]) + 1]
        }  
    d5_inv[d+1]
  })
}

由于d取决于以前的情况，因此对for循环进行矢量化并不容易。

我的版本运行 1e5 字符串的时间大约是一半。

rand_string <- function(n = 12) 
{
  paste(sample(as.character(0:9), sample(n), replace = TRUE), collapse = "")
}
big_test <- replicate(1e5, rand_string())

tic()
res1 <- unname(sapply(big_test, verhoeffCheck))
toc()

tic()
res2 <- verhoeffCheckNew(big_test)
toc()

identical(res1, res2) #hopefully TRUE!

看到这个tic问题和toc。

进一步的想法：

您可能需要额外的输入检查以及在转换为数字时 ""返回的其他字符串。NA

由于您只处理整数，因此使用它们而不是双精度数可能会带来轻微的性能优势。（使用as.integer而不是as.numeric附加L到矩阵中的值。）

score 0 · Accepted Answer

Richie C 很好地回答了向量化问题；至于只创建表一次而不会弄乱全局名称空间，一种不需要包的快速解决方案是

verhoeffCheck <- local(function(x)
{
## calculates check digit based on Verhoeff algorithm
## note that due to the way strsplit works, to call for vector x, use sapply(x,verhoeffCheck)
## check for string since leading zeros with numbers will be lost
if (class(x)!="character"){stop("Must enter a string")}
#split and convert to numbers
digs <- strsplit(x,"")[[1]]
digs <- as.numeric(digs)
digs <- rev(digs)   ## right to left algorithm
## apply algoritm - note 1-based indexing in R
d <- 0
for (i in 1:length(digs)){
    d <- d5_mult[d+1,(d5_perm[(i%%8)+1,digs[i]+1])+1]
    }
d5_inv[d+1]
})

assign("d5_mult", matrix(c(
    0:9, c(1:4,0,6:9,5), c(2:4,0:1,7:9,5:6), c(3:4,0:2,8:9,5:7),
    c(4,0:3,9,5:8), c(5,9:6,0,4:1), c(6:5,9:7,1:0,4:2), c(7:5,9:8,2:0,4:3),
    c(8:5,9,3:0,4), 9:0), 10, 10, byrow = TRUE), 
    envir = environment(verhoeffCheck))

assign("d5_perm", matrix(c(
    0:9, c(1,5,7,6,2,8,3,0,9,4), c(5,8,0,3,7,9,6,1,4,2),
    c(8,9,1,6,0,4,3,5,2,7), c(9,4,5,3,1,2,6,8,7,0), c(4,2,8,6,5,7,3,9,0,1),
    c(2,7,9,3,8,0,6,4,1,5), c(7,0,4,6,9,1,3,2,5,8)), 8, 10, byrow = TRUE),
    envir = environment(verhoeffCheck))

assign("d5_inv", c(0,4:1,5:9), envir = environment(verhoeffCheck))
## Now just use the function

它将数据保存在函数的环境中。你可以计时，看看它有多快。

希望这可以帮助。

艾伦

optimization - 在 R 中优化 Verhoeff 算法

3 回答 3

Related

Reference