I have the following problem: there are several text documents that I need to parse and index, with stop words removed and the terms stemmed. I could do this manually, but I heard from a colleague about Lucene, which can do it automatically. I searched the web and found many examples, which I tried, but every example uses a different version of Lucene and different methods, and none of them is complete. At the end of this process I need to calculate the tf/idf of each term in my collection.
Update: I've created an index with one document in it at the moment. The document has its stop words removed and its terms stemmed. How do I calculate the tf/idf of this document using Lucene? (I will add more documents once I figure out how to do the calculation.)
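To be concrete about what I'm after, as far as I understand the standard definition I want to compute for each term t in a document d is (where N is the number of documents in the collection and df(t) is the number of documents containing t):

tf-idf(t, d) = tf(t, d) * log(N / df(t))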
Any help with Lucene will be appreciated. Thanks.
import java.io.*;
import java.util.HashSet;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.*;
import org.apache.lucene.analysis.snowball.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
public class Stemmer
{
    static HashSet<String> stopWordsList = null;

    public static String Stem(String text, String language) throws IOException
    {
        parse p = new parse(); // my own helper class that reads the stop-word file
        stopWordsList = p.readStopWordsFile();
        StringBuffer result = new StringBuffer();
        if (text != null && text.trim().length() > 0)
        {
            StringReader tReader = new StringReader(text);
            // Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, stopWordsList);
            @SuppressWarnings("deprecation")
            Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_35, "English", stopWordsList);
            // disk index storage
            Directory directory = FSDirectory.open(new File("d:/index"));
            @SuppressWarnings("deprecation")
            IndexWriter writer = new IndexWriter(directory, analyzer, true, new IndexWriter.MaxFieldLength(25000));
            TokenStream tStream = analyzer.tokenStream("contents", tReader);
            @SuppressWarnings("deprecation")
            TermAttribute term = tStream.addAttribute(TermAttribute.class);
            try
            {
                tStream.reset(); // reset the stream before consuming it
                while (tStream.incrementToken())
                {
                    result.append(term.term());
                    result.append(" ");
                }
                Document doc = new Document();
                String title = "DocID";
                // adding title field
                doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
                String content = result.toString();
                // adding content field (note: this already-stemmed string is analyzed
                // again by the same analyzer when the document is written to the index)
                doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
                // writing the new document to the index
                writer.addDocument(doc);
                writer.close();
                System.out.println("Result is: " + result);
            }
            catch (IOException ioe)
            {
                System.out.println("Error: " + ioe.getMessage());
            }
        }

        // If, for some reason, the stemming did not happen, return the original text
        if (result.length() == 0)
            result.append(text);
        return result.toString().trim();
    } // end Stem

    public static void main(String[] args) throws IOException
    {
        Stemmer.Stem("Michele Bachmann amenities pressed her allegations that the former head of her Iowa presidential bid was bribed by the campaign of rival Ron Paul to endorse him, even as one of her own aides denied the charge.", "English");
    }
} // end class
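In case it helps to show where I'm stuck, below is a rough sketch of how I think the counts needed for tf/idf could be read back from the index with the Lucene 3.x API. It assumes the content field was indexed with term vectors enabled (Field.TermVector.YES passed to the Field constructor), which my code above does not do yet, and it uses the classic log(N/df) formula rather than Lucene's internal one. Is this the right direction?

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.store.FSDirectory;

public class TfIdfSketch
{
    public static void main(String[] args) throws Exception
    {
        // open the same index that Stemmer wrote
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("d:/index")));
        int numDocs = reader.numDocs();
        // term vectors are only available if the field was indexed with Field.TermVector.YES
        TermFreqVector tfv = reader.getTermFreqVector(0, "content"); // doc 0 = my only doc
        String[] terms = tfv.getTerms();
        int[] freqs = tfv.getTermFrequencies();
        for (int i = 0; i < terms.length; i++)
        {
            // df(t): number of documents containing the term
            int df = reader.docFreq(new Term("content", terms[i]));
            // classic formula: tf-idf = tf * log(N / df)
            double idf = Math.log((double) numDocs / df);
            System.out.println(terms[i] + "\ttf=" + freqs[i] + "\ttf-idf=" + (freqs[i] * idf));
        }
        reader.close();
    }
}

One thing I noticed while reading the docs: Lucene's own DefaultSimilarity computes idf as 1 + log(numDocs / (docFreq + 1)) and tf as sqrt(freq), so the values from the sketch above won't match Lucene's internal scoring exactly.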