2

我编写了一个程序来搜索段落中的给定短语,并在该段落中用花括号将短语括起来。我已经使用 BoyerMoore 的算法进行搜索。同时我还需要提高程序的性能。虽然我得到了所需的输出,但性能是灾难性的。

这是代码:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;

public class BoyerMoore {
    static class Pair {
        public int start, end;

        Pair(int start, int end) {
            this.start = start;
            this.end = end;
        }

        public int weight() {
            return end - start;
        }

        public boolean contains(int point) {
            return start <= point && point <= end;
        }

        public int returnStart() {
            return start;
        }


    }

    static class Group {
        public List<Pair> pairs = new ArrayList<Pair>();
        public Pair maxWeight;

        Group(Pair start) {
            add(start);
        }

        Group(List<Pair> pairs) {
            for (Pair pair : pairs) {
                add(pair);
            }
        }

        public boolean contains(Pair pair) {
            for (Pair my : pairs) {
                if (my.contains(pair.start) || my.contains(pair.end))
                    return true;
            }
            return false;
        }

        public void add(Pair pair) {
            pairs.add(pair);
            if (maxWeight == null || maxWeight.weight() < pair.weight())
                maxWeight = pair;
        }
    }

    public static List<Integer> match(String pattern, String text) {
        List<Integer> matches = new ArrayList<Integer>();
        int m = text.length();
        int n = pattern.length();

        Map<Character, Integer> rightMostIndexes = preprocessForBadCharacterShift(pattern);
        int alignedAt = 0;
        while (alignedAt + (n - 1) < m) {
            for (int indexInPattern = n - 1; indexInPattern >= 0; indexInPattern--) {
                int indexInText = alignedAt + indexInPattern;
                char x = text.charAt(indexInText);
                char y = pattern.charAt(indexInPattern);
                if (indexInText >= m)
                    break;
                if (x != y) {
                    Integer r = rightMostIndexes.get(x);
                    if (r == null) {
                        alignedAt = indexInText + 1;
                    } else {
                        int shift = indexInText - (alignedAt + r);
                        alignedAt += shift > 0 ? shift : 1;
                    }
                    break;
                } else if (indexInPattern == 0) {
                    matches.add(alignedAt);
                    alignedAt++;
                }
            }
        }
        return matches;
    }

    private static Map<Character, Integer> preprocessForBadCharacterShift(
            String pattern) {
        Map<Character, Integer> map = new HashMap<Character, Integer>();
        for (int i = pattern.length() - 1; i >= 0; i--) {
            char c = pattern.charAt(i);
            if (!map.containsKey(c))
                map.put(c, i);
        }
        return map;
    }

    public static void main(String[] args) throws IOException {

        BufferedReader input = new BufferedReader(new InputStreamReader(
                System.in));

        ArrayList<String> ListOfAllPhrase = new ArrayList<String>();

        List<Pair> pairs = new ArrayList<Pair>();

        List<Group> groups = new ArrayList<Group>();
        ListOfAllPhrase.add("protein");
        ListOfAllPhrase.add("protein kinase");
        ListOfAllPhrase.add("protein kinase A anchor protein");
        ListOfAllPhrase.add("protein kinase A anchor proteins");
        ListOfAllPhrase.add("protein kinase A anchor protein activity");

        ListOfAllPhrase.add("IL-6");

        ListOfAllPhrase.add("SOX5");
        ListOfAllPhrase.add("NOX5");    

        System.out.println("Input a sentence: ");
        String line = input.readLine();
        char[] lineInChar = line.toCharArray();
        long startTime = System.currentTimeMillis();

        for (int i = 0; i < ListOfAllPhrase.size(); i++) {

            // offset.add((ListOfAllPhrase.get(i)).length());

            List<Integer> matches = match(ListOfAllPhrase.get(i).toLowerCase(),
                    line.toLowerCase());
            for (Integer integer : matches) {

                pairs.add(new Pair(integer, (ListOfAllPhrase.get(i)).length()
                        + integer));


            }

        }


        System.out.println("Total time taken: "
                + (System.currentTimeMillis() - startTime));

        for (Pair pair : pairs) {
            List<Group> intersects = new ArrayList<Group>();
            for (Group group : groups) {
                if (group.contains(pair)) {
                    intersects.add(group);
                }
            }

            if (intersects.isEmpty()) {
                groups.add(new Group(pair));
            } else {
                List<Pair> intervals = new ArrayList<Pair>();
                intervals.add(pair);
                for (Group intersect : intersects) {
                    intervals.addAll(intersect.pairs);
                }

                groups.removeAll(intersects);
                groups.add(new Group(intervals));
            }
        }
        StringBuilder newBuilder = new StringBuilder();
        int flag = 1;
        System.out.println(lineInChar.length);
        for (int a = 0; a <= lineInChar.length; a++) {

            for (Group group : groups) {

                if (a == group.maxWeight.start) {
                    newBuilder.append("{");
                    flag = 1;

                    break;
                }
                if (a == group.maxWeight.end && a == lineInChar.length) {
                    newBuilder.append("}");
                    flag = 0;
                    break;
                }
                if (a == lineInChar.length && a == group.maxWeight.end + 1) {
                    newBuilder.append("}");
                    flag = 0;
                    break;
                }

                if (a == group.maxWeight.end) {
                    newBuilder.append("}");
                    flag = 1;

                    break;
                }
            }
            if (flag == 0)
                continue;

            newBuilder.append(lineInChar[a]);
            flag = 1;

        }
        System.out.println("Final output: " + newBuilder);


    }
}

我可以实施或做些什么来提高我的程序的性能?我应该切换到另一种字符串搜索算法吗?

如果有人可以帮我解决这个问题?

4

1 回答 1

1

我认为您实现了所描述的 Boyer-Moore 算法。虽然我会建议这个:

  • 避免 for 循环中的“昂贵”操作。例如,您的main方法中的 toLowerCase() 操作。重写循环(在我的测试中速度增益为 33%):

    for (int i = 0; i < ListOfAllPhrase.size(); i++) {
    
        // offset.add((ListOfAllPhrase.get(i)).length());
    
        List<Integer> matches = match(ListOfAllPhrase.get(i).toLowerCase(),
                line.toLowerCase());
        for (Integer integer : matches) {
    
            pairs.add(new Pair(integer, (ListOfAllPhrase.get(i)).length()
                    + integer));
        }
    }
    

    到 :

    ArrayList<String> lowerCaseListOfPhrases = new ArrayList<String>(ListOfAllPhrase.size());
    for (String phrase : ListOfAllPhrase) {
        lowerCaseListOfPhrases.add(phrase.toLowerCase());
    }
    String lowerCaseLine = line.toLowerCase();
    for (String phrase : lowerCaseListOfPhrases) {
      List<Integer> matches = match(phrase, lowerCaseLine);
        for (Integer integer : matches) {
            pairs.add(new Pair(integer, phrase.length() + integer));
        }
    
    }
    
  • 看看这个快速实现(参见http://algs4.cs.princeton.edu/53substring/BoyerMoore.java.html):

    public static List<Integer> match2(String pattern, String text) {
      List<Integer> result = new ArrayList<Integer>();
    
      int[] right = new int[256]; // Assuming a 256 character encoding
      for (int c = 0; c < 256; c++)
            right[c] = -1;
      for (int j = 0; j < pattern.length(); j++)
            right[pattern.charAt(j)] = j;
    
      int M = pattern.length();
      int N = text.length();
      int skip;
      for (int i = 0; i <= N - M; i += skip) {
            skip = 0;
            for (int j = M-1; j >= 0; j--) {
                  if (pattern.charAt(j) != text.charAt(i+j)) {
                        skip = Math.max(1, j - right[text.charAt(i+j)]);
                        break;
                  }
            }
            if (skip == 0) {    // found
              result.add(i);
              skip += pattern.length();
            }
      }
      return result;
    }
    

    执行此测试时,我的性能提高了 +- 50%:

    public static void main(String[] args) throws IOException {
    
      String phrase = "protein kinase A anchor protein activity";
      String txt = "This is a test protein kinase A anchor protein activityThis is a test protein kinase A anchor protein activityThis is ";
    
      List<Integer> result1 = null;
      List<Integer> result2 = null;
    
      long currentTime = System.currentTimeMillis();
      for (int i=0; i<1000000; i++) {
            result1 = match(phrase, txt);
      }
      System.out.println("ExecutionTime match: " + (System.currentTimeMillis() - currentTime));
    
      currentTime = System.currentTimeMillis();
      for (int i=0; i<1000000; i++) {
            result2 = match2(phrase, txt);
      }
      System.out.println("ExecutionTime match2: " + (System.currentTimeMillis() - currentTime));
    
      Assert.assertTrue(result1.equals(result2));
    
    }
    

    输出:

    执行时间匹配:5590
    执行时间匹配2:2663

    • 如果您不介意 Boyer-Moore 算法,请使用 Java 内置功能:

    public static List match3(String pattern, String text) { List result = new ArrayList();

    int index = text.indexOf(pattern);
    while (index >= 0) {
        result.add(index);
        index = text.indexOf(pattern, index + 1);
    }
    return result;
    }
    
于 2012-11-18T11:44:17.683 回答