java - 大文本中的最长公共子串

Question

我有这个学校任务，要求我们编写代码来找到最长的公共子字符串。我已经这样做了，但它只适用于不太大的文本，并且被要求找到 Moby Dick 和 War And Peace 的公共子字符串。如果你能指出我做错了什么的正确方向，我将不胜感激。当我调用它来创建 SuffixArray 时，编译器抱怨错误出现在 MyString 类的子字符串方法中，但我不知道为什么它说它太大，给了我内存不足

package datastructuresone;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.Scanner;


 class SuffixArray
 {

private final MyString[] suffixes;
private final int N;

public SuffixArray(String s)
{
    N = s.length();
    MyString snew = new MyString(s);
    suffixes = new MyString[N];
    for (int i = 0; i < N; i++)
    {
        suffixes[i] = snew.substring(i);
    }
    Arrays.sort(suffixes);
}

public int length()
{
    return N;
}

public int index(int i)
{
    return N - suffixes[i].length();
}

public MyString select(int i)
{
    return suffixes[i];
}

// length of longest common prefix of s and t
private static int lcp(MyString s, MyString t)
{
    int N = Math.min(s.length(), t.length());
    for (int i = 0; i < N; i++)
    {
        if (s.charAt(i) != t.charAt(i))
        {
            return i;
        }
    }
    return N;
}

// longest common prefix of suffixes(i) and suffixes(i-1)
public int lcp(int i)
{
    return lcp(suffixes[i], suffixes[i - 1]);
}

// longest common prefix of suffixes(i) and suffixes(j)
public int lcp(int i, int j)
{
    return lcp(suffixes[i], suffixes[j]);

}
}

public class DataStructuresOne
{

public static void main(String[] args) throws FileNotFoundException
{
    Scanner in1 = new Scanner(new File("./build/classes/WarAndPeace.txt"));
    Scanner in2 = new Scanner(new File("./build/classes/MobyDick.txt"));

    StringBuilder sb = new StringBuilder();
    StringBuilder sb1 = new StringBuilder();

    while (in1.hasNextLine())
    {
        sb.append(in1.nextLine());

    }
    while (in2.hasNextLine())
    {
        sb1.append(in2.nextLine());
    }

    String text1 = sb.toString().replaceAll("\\s+", " ");
    String text2 = sb1.toString().replaceAll("\\s+", " ");

    int N1 = text1.length();
    int N2 = text2.length();

    SuffixArray sa = new SuffixArray(text1 + "#" + text2);
    int N = sa.length();

    String substring = "";
    for (int i = 1; i < N; i++)
    {

        // adjacent suffixes both from second text string
        if (sa.select(i).length() <= N2 && sa.select(i - 1).length() <= N2)
        {
            continue;
        }

        // adjacent suffixes both from first text string
        if (sa.select(i).length() > N2 + 1 && sa.select(i - 1).length() > N2 + 1)
        {
            continue;
        }

        // check if adjacent suffixes longer common substring
        int length = sa.lcp(i);
        if (length > substring.length())
        {
            substring = sa.select(i).toString().substring(0, length);
            System.out.println(substring + " ");
        }
    }

    System.out.println("The length of the substring " + substring.length() + "length on    first N " + N1 + " length of Second N " + N2
            + "The length of the array sa: " + N);
   System.out.println("'" + substring + "'");

   final class MyString implements Comparable<MyString>
{

public MyString(String str)
{
    offset = 0;
    len = str.length();
    arr = str.toCharArray();
}

public int length()
{
    return len;
}

public char charAt(int idx)
{
    return arr[ idx + offset];
}

public int compareTo(MyString other)
{
    int myEnd = offset + len;
    int yourEnd = other.offset + other.len;
    int i = offset, j = other.offset;

    for (; i < myEnd && j < yourEnd; i++, j++)
    {
        if (arr[ i] != arr[ j])
        {
            return arr[ i] - arr[ j];
        }
    }

    // reached end.  Who got there first?
    if (i == myEnd && j == yourEnd)
    {
        return 0;   // identical strings
    }
    if (i == myEnd)
    {
        return -1;
    } else
    {
        return +1;
    }
}


public MyString substring(int beginIndex, int endIndex)
{
    return new MyString(arr, beginIndex + offset, endIndex - beginIndex);
}

public MyString substring(int beginIndex)
{
    return substring(beginIndex, offset + len);
}

public boolean equals(Object other)
{
    return (other instanceof MyString) && compareTo((MyString) other) == 0;
}

public String toString()
{
    return new String(arr, offset, len);
}

private MyString(char[] a, int of, int ln)
{
    arr = a;
    offset = of;
    len = ln;
}
private char[] arr;
private int offset;
private int len;
}

score 3 · Accepted Answer

这里：

for (int i = 0; i < N; i++)
{
    suffixes[i] = snew.substring(i);
}

您不仅要存储整个长字符串，还要存储整个字符串 - 1 个字母和整个字符串 - 2 个字母等。所有这些都单独存储。

如果您的字符串只有 10 个字母，那么您将在 10 个不同的字符串中存储总共 55 个字符。

在 1000 个字符中，您总共存储了 500500 个字符。

更一般地说，您必须处理 length*(length+1)/2 个字符。

只是为了好玩，我不知道战争与和平中有多少字符，但页数约为 1250，典型的单词/页估计为 250，平均单词长度约为 5 个字符，得出：

(1250 * 250 * 5)*(1250 * 250 * 5 + 1)/2 = 1.2207039 * 10^12 个字符。

内存中 char 的大小为 2 个字节，因此您看到的大小约为 2.22 TB（相比之下，仅小说文本为 1.49 MB）。

score 1 · Accepted Answer

在代码的前几行中，我计算了这两个文本的至少 3 个副本。这里有一些想法

在读取每一行时转换空格 - 而不是在它们是巨大的字符串之后。不要忘记行首和行尾空格的情况。
使用 StringBuilder 作为基础而不是 String 来构建您的 MyString 类。如果可以的话，用它的本机方法在 StringBuilder 内部进行所有的查看。
不要再提取字符串了。

查找 -Xmx java 运行时选项并将堆空间设置为大于默认值。你必须谷歌这个，因为我没有记住它。请注意 -Xmx=1024M 最后需要那个 M 。（看文件大小，看看这两本书有多大。）

score 0 · Accepted Answer

我用 Scala 编写了这个程序。也许你可以把它翻译成Java。

class MyString private (private val string: String, startIndex: Int, endIndex: Int) extends Comparable[MyString] {
  def this(string: String) = this(string, 0, string.length)
  def length() = endIndex-startIndex
  def charAt(i: Int) = {
    if(i >= length) throw new IndexOutOfBoundsException
    string.charAt(startIndex + i)
  }
  def substring(start: Int, end: Int): MyString = {
    if(start < 0 || end > length || end < start) throw new IndexOutOfBoundsException
    new MyString(string, startIndex + start, startIndex + end)
  }
  def substring(start: Int): MyString = substring(start, length)
  def longestCommonSubstring(other: MyString): MyString = {
    var index = 0
    val len = math.min(length, other.length)
    while(index < len && charAt(index) == other.charAt(index)) index += 1
    substring(0, index)
  }
  def compareTo(other: MyString): Int = {
    val len = math.min(length, other.length)
    for(i <- 0 until len) {
      if(charAt(i) > other.charAt(i)) return 1
      if(charAt(i) < other.charAt(i)) return -1
    }
    length-other.length
  }
  def >(other: MyString) = compareTo(other) > 0
  def <(other: MyString) = compareTo(other) < 0
  override def equals(other: Any) = other.isInstanceOf[MyString] && compareTo(other.asInstanceOf[MyString]) == 0
  override def toString() = "\"" + string.substring(startIndex, endIndex) + "\""
}

def readFile(name: String) = new MyString(io.Source.fromFile(name).getLines.mkString(" ").replaceAll("\\s+", " "))
def makeList(str: MyString) = (0 until str.length).map(i => str.substring(i)).toIndexedSeq

val string1 = readFile("WarAndPeace.txt")
val string2 = readFile("MobyDick.txt")

val (list1, list2) = (makeList(string1).sorted, makeList(string2).sorted)

var longestMatch = new MyString("")
var (index1, index2) = (0,0)
while(index1 < list1.size && index2 < list2.size) {
  val lcs = list1(index1).longestCommonSubstring(list2(index2)) 
  if(lcs.length > longestMatch.length) longestMatch = lcs
  if(list1(index1) < list2(index2)) index1 += 1
  else index2 += 1
}

println(longestMatch)

score 0 · Accepted Answer

当您构造 MyString 时，您调用arr = str.toCharArray();which 创建字符串字符数据的新副本。但是在 Java 中，字符串是不可变的——那么为什么不存储对字符串的引用而不是其数据的副本呢？

您一次构造每个后缀，但一次只引用一个（嗯，两个）。如果您重新编码您的解决方案以仅引用它当前关心的后缀，并仅在需要它们时构建它们（然后丢失对它们的引用），它们可能会被 Java 垃圾收集。这将减少内存不足的可能性。比较存储 2 个字符串与存储数十万个字符串的内存开销：)

java - 大文本中的最长公共子串

4 回答 4

Related

Reference