如果您需要 k-gram 在原始字符串中的位置，就不能先删除非字母字符，否则位置信息会丢失。您要么直接在原始字符串中查找 k-gram（耗费更多 CPU 时间），要么把每个字母的原始位置与处理后的字符串一并保存（占用更多内存空间）。
这是后者的实现:
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * A k-gram: a run of consecutive letters taken from an input string after
 * every non-letter character has been skipped, together with the positions
 * that its first and last letter occupy in the ORIGINAL (unfiltered) string.
 */
public class KGram {
    /** The letters making up this k-gram (non-letters already removed). */
    public final String str;
    /** Index in the original input of this k-gram's first letter. */
    public final int start;
    /** Index in the original input of this k-gram's last letter (inclusive). */
    public final int end;

    /**
     * Creates a k-gram.
     *
     * @param str   the letters of the k-gram
     * @param start index of the first letter in the original input
     * @param end   index of the last letter in the original input (inclusive)
     */
    public KGram(String str, int start, int end) {
        this.str = str;
        this.start = start;
        this.end = end;
    }

    /** Renders this k-gram as {@code KGram["<str>":<start>,<end>]}. */
    @Override
    public String toString() {
        return "KGram[\"" + str + "\":" + start + "," + end + "]";
    }

    /**
     * Extracts every k-gram of {@code size} letters from {@code input}.
     *
     * <p>Characters for which {@link Character#isLetter(char)} is false are
     * skipped, so a k-gram may span punctuation, digits or whitespace in the
     * original string; {@code start} and {@code end} still refer to positions
     * in the original string.
     *
     * @param input the text to scan
     * @param size  number of letters per k-gram; must be positive
     * @return the k-grams in order of appearance; empty when {@code input}
     *         contains fewer than {@code size} letters
     * @throws IllegalArgumentException if {@code size} is zero or negative
     */
    public static List<KGram> extractFrom(String input, int size) {
        // Without this guard a size of 0 indexes indexes[-1] and a negative
        // size fails deep inside the copy call with an unhelpful message.
        if (size <= 0) {
            throw new IllegalArgumentException("size must be positive, got " + size);
        }

        int n = input.length();
        // Parallel arrays: chars[k] is the k-th letter, indexes[k] is where it
        // sat in the original input. Only the first len slots are used.
        char[] chars = new char[n];
        int[] indexes = new int[n];
        int len = 0;
        for (int i = 0; i < n; i++) {
            char c = input.charAt(i);
            if (!Character.isLetter(c)) {
                continue;
            }
            chars[len] = c;
            indexes[len] = i;
            len++;
        }

        // Number of sliding windows of width size over len letters.
        int count = Math.max(0, len - size + 1);
        List<KGram> kgrams = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
            int j = i + size - 1; // last letter of the window, inclusive
            kgrams.add(new KGram(new String(chars, i, size), indexes[i], indexes[j]));
        }
        return kgrams;
    }
}
例子:
// Sample input: letters interleaved with punctuation, digits and newlines.
String sample = "blahello,,,,/blatestbla7234///§\"§$%\"%$\n\n23344)§()(§$blablayeahbla";
// Collect every 5-letter k-gram together with its span in the original string.
List<KGram> found = KGram.extractFrom(sample, 5);
KGram fifth = found.get(4);
System.out.println(fifth); // prints KGram["ellob":4,13]
KGram twentySeventh = found.get(26);
System.out.println(twentySeventh); // prints KGram["ahbla":60,64]