r - 提高 R 中（Levenshtein distance with weights）脚本的性能

Question

我正在使用 Levenshtein 距离度量进行大量字符串比较，但因为我需要能够考虑字符串潜在结构中的空间邻接性，所以我必须制作自己的脚本，包括权重函数。

我现在的问题是我的脚本效率很低。我必须进行大约 600,000 次比较，并且脚本需要几个小时才能完成。因此，我正在寻找一种方法来提高我的脚本效率，但是作为一个自学成才的小伙伴，我不知道如何解决这个问题。

这是功能：

zeros <- function(lengthA,lengthB){
  m <- matrix(c(rep(0,lengthA*lengthB)),nrow=lengthA,ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights == TRUE){

    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                           "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
                      # the cost_weight matrix
    indexA <- which(cost_weight==A, arr.ind=TRUE)
    indexB <- which(cost_weight==B, arr.ind=TRUE)
    walk <- abs(indexA[1]-indexB[1])+abs(indexA[2]-indexB[2])
    w <- walk/max_walk
  }

  else {w <- 1}

  return(w)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  for (i in seq(to=nchar(A))){ 
    D[i + 1,1] <- D[i,1] + deletion * weight(As[i],Bs[1], weights)
  }
  for (j in seq(to=nchar(B))){ 
    D[1,j + 1] <- D[1,j] + insertion * weight(As[1],Bs[j], weights)
  }
  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
        } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight(As[i],Bs[j], weights),
                              D[i,j + 1] + deletion * weight(As[i],Bs[j], weights),
                              D[i,j]     + substitution * weight(As[i],Bs[j], weights))
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}

将我的 Levenshtein 度量的性能与 stringdist 包中的性能进行比较，性能要差 83 倍。

library (stringdist)
library(rbenchmark)

A <-"abcdefghijklmnopqrstuvwx"
B <-"xwvutsrqponmlkjihgfedcba"

benchmark(levenshtein(A,B), stringdist(A,B,method="lv"),
          columns=c("test", "replications", "elapsed", "relative"),
          order="relative", replications=10) 


                             test replications elapsed relative
2 stringdist(A, B, method = "lv")           10    0.01        1
1               levenshtein(A, B)           10    0.83       83

有没有人有改进我的脚本的想法？

score 1 · Accepted Answer

以下代码已经是一些改进（您的代码；计算与您之前所做的相同，与不同stringdist），但我相信它可以更加简化和加速。

zeros <- function(lengthA,lengthB){
  m <- matrix(0, nrow=lengthA, ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights){
    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                            "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
    # the cost_weight matrix
    amats <- lapply(A, `==`, y=cost_weight)
    bmats <- lapply(B, `==`, y=cost_weight)
    walk <- mapply(function(a, b){
      sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
    }, amats, bmats)
    return(walk/max_walk)
  }
  else return(1)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  #browser()
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  weight.mat <- outer(As, Bs, weight, weights=weights)
  D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
  D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))

  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
      } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
                              D[i,j + 1] + deletion * weight.mat[i, j],
                              D[i,j]     + substitution * weight.mat[i, j])
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}

r - 提高 R 中（Levenshtein distance with weights）脚本的性能

1 回答 1

Related

Reference