嗨,我在我的 htmlfiles 文件夹中存储了 3 个 html 文件,这些文件使用 tika 提取文本并将其存储到 htmltextfiles 文件夹中的文本文件中。对于 htmltextfiles 文件夹中的每个文本文件,我删除了停用词并显示了前 10 个经常出现的词。我的 htmltextfiles 包含:
java.txt file contains:This is Java Program written in java
Php.txt file contains:This is Php Program written in Php
.net.txt file contains:This is .net Program written in .net
下面是我的java程序
public class FrequencyCount {
int[][] table = new int[4][1000000];
TreeMap<Integer, List<String>> map = new TreeMap<Integer, List<String>>(
Collections.reverseOrder());
public static void main(String[] args) throws Exception {
FrequencyCount freq = new FrequencyCount();
BufferedReader br = null;
String[] stopwords = { "a", "about", "above", "above", "across",
"after", "afterwards", "again", "against", "all", "almost",
"alone", "along", "already", "also", "although", "always",
"am", "among", "amongst", "amoungst", "amount", "an", "and",
"another", "any", "anyhow", "anyone", "anything", "anyway",
"anywhere", "are", "around", "as", "at", "back", "be",
"became", "because", "become", "becomes", "becoming", "been",
"before", "beforehand", "behind", "being", "below", "beside",
"besides", "between", "beyond", "bill", "both", "bottom",
"but", "by", "call", "can", "cannot", "cant", "co", "con",
"could", "couldnt", "cry", "de", "describe", "detail", "do",
"done", "down", "due", "during", "each", "eg", "eight",
"either", "eleven", "else", "elsewhere", "empty", "enough",
"etc", "even", "ever", "every", "everyone", "everything",
"everywhere", "except", "few", "fifteen", "fify", "fill",
"find", "fire", "first", "five", "for", "former", "formerly",
"forty", "found", "four", "from", "front", "full", "further",
"get", "give", "go", "had", "has", "hasnt", "have", "he",
"hence", "her", "here", "hereafter", "hereby", "herein",
"hereupon", "hers", "herself", "him", "himself", "his", "how",
"however", "hundred", "ie", "if", "in", "inc", "indeed",
"interest", "into", "is", "it", "its", "itself", "keep",
"last", "latter", "latterly", "least", "less", "ltd", "made",
"many", "may", "me", "meanwhile", "might", "mill", "mine",
"more", "moreover", "most", "mostly", "move", "much", "must",
"my", "myself", "name", "namely", "neither", "never",
"nevertheless", "next", "nine", "no", "nobody", "none",
"noone", "nor", "not", "nothing", "now", "nowhere", "of",
"off", "often", "on", "once", "one", "only", "onto", "or",
"other", "others", "otherwise", "our", "ours", "ourselves",
"out", "over", "own", "part", "per", "perhaps", "please",
"put", "rather", "re", "same", "see", "seem", "seemed",
"seeming", "seems", "serious", "several", "she", "should",
"show", "side", "since", "sincere", "six", "sixty", "so",
"some", "somehow", "someone", "something", "sometime",
"sometimes", "somewhere", "still", "such", "system", "take",
"ten", "than", "that", "the", "their", "them", "themselves",
"then", "thence", "there", "thereafter", "thereby",
"therefore", "therein", "thereupon", "these", "they", "thickv",
"thin", "third", "this", "those", "though", "three", "through",
"throughout", "thru", "thus", "to", "together", "too", "top",
"toward", "towards", "twelve", "twenty", "two", "un", "under",
"until", "up", "upon", "us", "very", "via", "was", "we",
"well", "were", "what", "want", "wants", "whatever", "when",
"whence", "whenever", "where", "whereafter", "whereas",
"whereby", "wherein", "whereupon", "wherever", "whether",
"which", "while", "whither", "who", "whoever", "whole", "whom",
"whose", "why", "will", "with", "within", "without", "would",
"yet", "you", "your", "yours", "yourself", "yourselves", "1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "1.", "2.", "3.",
"4.", "5.", "6.", "11", "7.", "8.", "9.", "12", "13", "14",
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L",
"M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X",
"Y", "Z", "terms", "CONDITIONS", "conditions", "values",
"interested.", "care", "sure", ".", "!", "@", "#", "$", "%",
"^", "&", "*", "(", ")", "{", "}", "[", "]", ":", ";", ",",
"<", ".", ">", "/", "?", "_", "-", "+", "=", "a", "b", "c",
"d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
"contact", "grounds", "buyers", "tried", "said,", "plan",
"value", "principle.", "forces", "sent:", "is,", "was", "like",
"discussion", "tmus", "diffrent.", "layout", "area.", "thanks",
"thankyou", "hello", "bye", "rise", "fell", "fall", "psqft.",
"http://", "km", "miles" };
File dir = new File("C://htmlfiles");
File listDir[] = dir.listFiles();
for (int i = 0; i < listDir.length; i++)
{
String file = listDir[i].toString();
File file1 = new File(file);
InputStream input = new FileInputStream(file1);
Metadata metadata = new Metadata();
BodyContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
AutoDetectParser parser = new AutoDetectParser();
parser.parse(input, handler, metadata);
Document doc = new Document();
doc.add(new Field("contents", handler.toString(), Field.Store.NO,Field.Index.ANALYZED));
String path = "C://htmltextfiles".concat("/").concat(listDir[i].getName()).concat(".txt");
File file2 = new File(path);
FileWriter fw = new FileWriter(file2.getAbsoluteFile());
BufferedWriter bw = new BufferedWriter(fw);
bw.write(handler.toString());
bw.close();
}
try
{
File dir1 = new File("C://htmltextfiles");
File listDir1[] = dir1.listFiles();
for (int k = 0; k < listDir1.length; k++)
{
br = new BufferedReader(new FileReader(new File(listDir1[k].getAbsolutePath())));
String fileline = br.readLine();
while (fileline != null)
{
if (fileline.length() > 0)
{
String[] sArr = fileline.split(" ");
for (String s : sArr)
{
int flag = 1;
for (int j = 0; j < stopwords.length; j++)
{
String s1 = s.toLowerCase();
if (s1.equals(stopwords[j]))
{
flag = 0;
}
}
if (flag != 0)
{
if (s.trim().length() > 0)
{
try
{
freq.add(freq.trimStr(s));
} catch (ArrayIndexOutOfBoundsException e) {
}
}
}
}
}
fileline = br.readLine();
}
System.out.println("Keywords for file:" + listDir1[k].getName());
Set<Integer> set = freq.map.keySet();
for (Integer x : set) {
System.out.println(freq.map.get(x) + " found " + x
+ " times");
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
br.close();
}
}
public String trimStr(String s) {
if (s.toUpperCase().equals(s.toLowerCase())) {
return s;
}
s = s.toLowerCase().trim();
if (s.endsWith("'s")) {
s = s.substring(0, s.length() - 2);
}
int i = 0;
int j = s.length() - 1;
char[] cArr = s.toCharArray();
while (!(cArr[i] >= 65 && cArr[i] <= 90)
&& !(cArr[i] >= 97 && cArr[i] <= 122)) {
i++;
}
while (!(cArr[j] >= 65 && cArr[j] <= 90)
&& !(cArr[j] >= 97 && cArr[j] <= 122)) {
j--;
}
return s.substring(i, j + 1);
}
public int[] hash(String s) {
int h1 = hash1(s);
int h2 = hash2(s);
int h3 = hash3(s);
int h4 = hash4(s);
int[] res = new int[] { h1, h2, h3, h4 };
return res;
}
public int hash1(String x) {
char ch[] = x.toCharArray();
int i, sum;
for (sum = 0, i = 0; i < x.length(); i++)
sum += ch[i];
return sum % 1000000;
}
public int hash2(String s) {
int h = 0;
for (int i = 0; i < s.length(); i++) {
h = 31 * h + s.charAt(i);
}
h = h % 1000000;
if (h < 0) {
h = -h;
}
return h;
}
public int hash3(String s) {
int h = 0;
for (int i = 0; i < s.length(); i++) {
h = 17 * h + s.charAt(i);
}
h = h % 1000000;
if (h < 0) {
h = -h;
}
return h;
}
public int hash4(String s) {
int h = 0;
for (int i = 0; i < s.length(); i++) {
h = 11 * h + s.charAt(i);
}
h = h % 1000000;
if (h < 0) {
h = -h;
}
return h;
}
public void add(String s) {
int[] h = hash(s);
table[0][h[0]] = table[0][h[0]] + 1;
table[1][h[1]] = table[1][h[1]] + 1;
table[2][h[2]] = table[2][h[2]] + 1;
table[3][h[3]] = table[3][h[3]] + 1;
int r = Math.min(Math.min(Math.min(table[0][h[0]], table[1][h[1]]),
table[2][h[2]]), table[3][h[3]]);
boolean add = true;
List<String> list = map.get(r);
if (list == null) {
if (map.size() == 10) {
Integer lastKey = map.lastKey();
if (lastKey.intValue() > r) {
add = false;
} else {
map.remove(lastKey);
}
}
list = new ArrayList<String>();
}
if (add) {
list.add(s);
map.put(r, list);
if (r > 1) {
list = map.get(r - 1);
if (list != null) {
if (list.size() == 1) {
map.remove(r - 1);
} else {
list.remove(s);
}
}
}
}
}
public int count(String s) {
int[] h = hash(s);
int a = table[0][h[0]];
int b = table[1][h[1]];
int c = table[2][h[2]];
int d = table[3][h[3]];
int r = Math.min(Math.min(Math.min(a, b), c), d);
return r;
}
}
对于此代码,我得到低于输出
Keywords for file:java.htm.txt
[java] found 2 times
Keywords for file:php.htm.txt
[java] found 2 times
[php] found 2 times
Keywords for file:.net.html.txt
[java] found 2 times
[php] found 2 times
[.net] found 2 times
问题是一个文件的关键字正在添加到下一个文件关键字中,我的目的是获得以下输出
Keywords for file:java.htm.txt
[java] found 2 times
Keywords for file:php.htm.txt
[php] found 2 times
Keywords for file:.net.html.txt
[.net] found 2 times
谁能建议我如何实现这一点,我已经尝试了很多方法,但没有得到我需要的输出,请帮助我..谢谢