0

我正在输入字符串和一些以前输入的字符串之间进行模糊匹配测试。测试是在打字时实时进行的。

我已经有了一个准确得惊人的算法,叫做StrikeAMatch,它已被翻译成多种语言。我发现最快的 Ruby 实现是amatch。但是,它与我的 JRuby 环境不兼容,因为它在需要 C Ruby 解释器 (MRI)的 C 扩展中处理数据。虽然它非常快:

a = "Lorem ipsum dolor"
b = "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nam cursus. Morbi ut mi. Nullam enim leo, egestas id, condimentum at, laoreet mattis, massa. Sed eleifend nonummy diam. Praesent mauris ante,"
puts Benchmark.measure {
  10000.times { a.pair_distance_similar(b) }
}
# => 0.130000   0.000000   0.130000 (  0.146347)

我希望我可以避免设置替代环境。另一种方法是尝试移植JRuby Wiki 中建议的原始 Java 代码。不知道该怎么做。

关于如何解决这个问题的任何想法?

4

2 回答 2

0

有两种方法可以解决这个问题:

  1. 从现有的 Ruby gem 中获取 C ext 代码并将其移植到 Java

...有许多具有 C-Ruby/JRuby 特定扩展的 gem 示例,例如 json/nokogiri

  1. 获取 Java 端口,加载 Java 类并使用 JRuby 的 Java 集成
于 2016-05-10T20:37:56.853 回答
0

该算法易于实现。例如,这是我用 Java 编写的一个快速实现:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class StrikeAMatch {

    protected int numPairs(final String string) {
        return Math.max(0, string.length() - 1);
    }

    protected Map<String, Integer> pairs(final String string) {
        if (string.isEmpty()) {
            return new HashMap<>(0);
        }

        final Map<String, Integer> pairs = new HashMap(2 * numPairs(string));

        for (int i = 1, j = string.length(); i < j; i += 1) {
            final char a = string.charAt(i - 1);
            final char b = string.charAt(i);
            final String pair = String.format("%c%c", a, b);
            if (pairs.containsKey(pair)) {
                pairs.put(pair, 1 + pairs.get(pair));
            }
            else {
                pairs.put(pair, 1);
            }
        }

        return pairs;
    }

    protected int intersectionSize(
            final Map<String, Integer> lhsPairs,
            final Map<String, Integer> rhsPairs) {
        final Set<String> lhsKeys = lhsPairs.keySet();
        final Set<String> rhsKeys = rhsPairs.keySet();
        final Set<String> pairIntersection = new HashSet<>(lhsKeys);
        pairIntersection.retainAll(rhsKeys);
        int sharedPairs = 0;
        for (final String pair : pairIntersection) {
            sharedPairs += Math.min(lhsPairs.get(pair), rhsPairs.get(pair));
        }
        return sharedPairs;
    }

    public double between(final String lhs, final String rhs) {
        if (lhs.isEmpty() && rhs.isEmpty()) {
            return 1.0;
        }
        final Map<String, Integer> lhsPairs = pairs(lhs.toUpperCase());
        final Map<String, Integer> rhsPairs = pairs(rhs.toUpperCase());
        return (2.0 * intersectionSize(lhsPairs, rhsPairs))
             / (numPairs(lhs) + numPairs(rhs));
    }

    public static void main(final String[] args) {
        final StrikeAMatch distance = new StrikeAMatch();
        for (final String lhs : args) {
            for (final String rhs : args) {
                System.out.printf("d(\"%s\", \"%s\") = [%.7f]%n",
                    lhs, rhs, distance.between(lhs, rhs));
            }
        }
    }
}

您甚至可以填充第一个和最后一个字符,以将其扩展到一个字符或零字符的术语:

import java.util.HashMap;
import java.util.Map;

public class PaddedStrikeAMatch extends StrikeAMatch {

    protected int numPairs(final String string) {
        return string.length() + 1;
    }

    protected Map<String, Integer> pairs(final String string) {
        if (string.isEmpty()) {
            final Map<String, Integer> pairs = new HashMap<>(1);
            pairs.put("  ", 1);
            return pairs;
        }

        final Map<String, Integer> pairs = new HashMap(2 * numPairs(string));
        pairs.put(String.format(" %c", string.charAt(0)), 1);
        pairs.put(String.format("%c ", string.charAt(string.length() - 1)), 1);

        for (int i = 1, j = string.length(); i < j; i += 1) {
            final char a = string.charAt(i - 1);
            final char b = string.charAt(i);
            final String pair = String.format("%c%c", a, b);
            if (pairs.containsKey(pair)) {
                pairs.put(pair, 1 + pairs.get(pair));
            }
            else {
                pairs.put(pair, 1);
            }
        }

        return pairs;
    }

    public static void main(final String[] args) {
        final StrikeAMatch distance = new PaddedStrikeAMatch();
        for (final String lhs : args) {
            for (final String rhs : args) {
                System.out.printf("d(\"%s\", \"%s\") = [%.7f]%n",
                    lhs, rhs, distance.between(lhs, rhs));
            }
        }
    }
}

为了验证它,以下是您提供的链接中提出的示例:

% java StrikeAMatch France French
d("France", "France") = [1.0000000]
d("France", "French") = [0.4000000]
d("French", "France") = [0.4000000]
d("French", "French") = [1.0000000]

% java StrikeAMatch Healed Sealed Healthy Heard Herded Help Sold
d("Healed", "Healed") = [1.0000000]
d("Healed", "Sealed") = [0.8000000]
d("Healed", "Healthy") = [0.5454545]
d("Healed", "Heard") = [0.4444444]
d("Healed", "Herded") = [0.4000000]
d("Healed", "Help") = [0.2500000]
d("Healed", "Sold") = [0.0000000]
d("Sealed", "Healed") = [0.8000000]
d("Sealed", "Sealed") = [1.0000000]
d("Sealed", "Healthy") = [0.3636364]
d("Sealed", "Heard") = [0.2222222]
d("Sealed", "Herded") = [0.2000000]
d("Sealed", "Help") = [0.0000000]
d("Sealed", "Sold") = [0.0000000]
d("Healthy", "Healed") = [0.5454545]
d("Healthy", "Sealed") = [0.3636364]
d("Healthy", "Healthy") = [1.0000000]
d("Healthy", "Heard") = [0.4000000]
d("Healthy", "Herded") = [0.1818182]
d("Healthy", "Help") = [0.2222222]
d("Healthy", "Sold") = [0.0000000]
d("Heard", "Healed") = [0.4444444]
d("Heard", "Sealed") = [0.2222222]
d("Heard", "Healthy") = [0.4000000]
d("Heard", "Heard") = [1.0000000]
d("Heard", "Herded") = [0.4444444]
d("Heard", "Help") = [0.2857143]
d("Heard", "Sold") = [0.0000000]
d("Herded", "Healed") = [0.4000000]
d("Herded", "Sealed") = [0.2000000]
d("Herded", "Healthy") = [0.1818182]
d("Herded", "Heard") = [0.4444444]
d("Herded", "Herded") = [1.0000000]
d("Herded", "Help") = [0.2500000]
d("Herded", "Sold") = [0.0000000]
d("Help", "Healed") = [0.2500000]
d("Help", "Sealed") = [0.0000000]
d("Help", "Healthy") = [0.2222222]
d("Help", "Heard") = [0.2857143]
d("Help", "Herded") = [0.2500000]
d("Help", "Help") = [1.0000000]
d("Help", "Sold") = [0.0000000]
d("Sold", "Healed") = [0.0000000]
d("Sold", "Sealed") = [0.0000000]
d("Sold", "Healthy") = [0.0000000]
d("Sold", "Heard") = [0.0000000]
d("Sold", "Herded") = [0.0000000]
d("Sold", "Help") = [0.0000000]
d("Sold", "Sold") = [1.0000000]

...这是填充版本:

% java PaddedStrikeAMatch France French
d("France", "France") = [1.0000000]
d("France", "French") = [0.4285714]
d("French", "France") = [0.4285714]
d("French", "French") = [1.0000000]

% java PaddedStrikeAMatch Healed Sealed Healthy Heard Herded Help Sold
d("Healed", "Healed") = [1.0000000]
d("Healed", "Sealed") = [0.7142857]
d("Healed", "Healthy") = [0.5333333]
d("Healed", "Heard") = [0.6153846]
d("Healed", "Herded") = [0.5714286]
d("Healed", "Help") = [0.3333333]
d("Healed", "Sold") = [0.1666667]
d("Sealed", "Healed") = [0.7142857]
d("Sealed", "Sealed") = [1.0000000]
d("Sealed", "Healthy") = [0.2666667]
d("Sealed", "Heard") = [0.3076923]
d("Sealed", "Herded") = [0.2857143]
d("Sealed", "Help") = [0.0000000]
d("Sealed", "Sold") = [0.3333333]
d("Healthy", "Healed") = [0.5333333]
d("Healthy", "Sealed") = [0.2666667]
d("Healthy", "Healthy") = [1.0000000]
d("Healthy", "Heard") = [0.4285714]
d("Healthy", "Herded") = [0.2666667]
d("Healthy", "Help") = [0.3076923]
d("Healthy", "Sold") = [0.0000000]
d("Heard", "Healed") = [0.6153846]
d("Heard", "Sealed") = [0.3076923]
d("Heard", "Healthy") = [0.4285714]
d("Heard", "Heard") = [1.0000000]
d("Heard", "Herded") = [0.6153846]
d("Heard", "Help") = [0.3636364]
d("Heard", "Sold") = [0.1818182]
d("Herded", "Healed") = [0.5714286]
d("Herded", "Sealed") = [0.2857143]
d("Herded", "Healthy") = [0.2666667]
d("Herded", "Heard") = [0.6153846]
d("Herded", "Herded") = [1.0000000]
d("Herded", "Help") = [0.3333333]
d("Herded", "Sold") = [0.1666667]
d("Help", "Healed") = [0.3333333]
d("Help", "Sealed") = [0.0000000]
d("Help", "Healthy") = [0.3076923]
d("Help", "Heard") = [0.3636364]
d("Help", "Herded") = [0.3333333]
d("Help", "Help") = [1.0000000]
d("Help", "Sold") = [0.0000000]
d("Sold", "Healed") = [0.1666667]
d("Sold", "Sealed") = [0.3333333]
d("Sold", "Healthy") = [0.0000000]
d("Sold", "Heard") = [0.1818182]
d("Sold", "Herded") = [0.1666667]
d("Sold", "Help") = [0.0000000]
d("Sold", "Sold") = [1.0000000]

如果你想直接从 JRuby 中使用它,你只需要添加StrikeAMatch.class到你的$CLASSPATH, 和你的脚本中,require 'java'然后是java_import 'StrikeAMatch

require 'java'

java_import 'StrikeAMatch'

distance = StrikeAMatch.new
ARGV.each do |lhs|
  ARGV.each do |rhs|
    puts "d(\"#{lhs}\", \"#{rhs}\") = [#{distance.between(lhs, rhs)}]"
  end
end

要调用它:

% jruby strike_a_match.rb France French
d("France", "France") = [1.0]
d("France", "French") = [0.4]
d("French", "France") = [0.4]
d("French", "French") = [1.0]
于 2016-05-10T20:45:38.063 回答