java - 使用 Lucene 计算 TFIDF 分数

Question

这是我用来为文档集合中的每个文档计算 TF-IDF 值的程序。程序运行正常，但在计算 IDF 值（即查找包含某个特定词项的文档数）时非常耗时。

是否有更有效的方法来查找包含特定术语的文档数量？

// Excerpt: computes one TF-IDF score per term of a single document's term vector.
// termsFreq, freq, terms, score, i and noOfDocs are declared outside this excerpt.

// Per-term frequencies taken from the term vector.
freq = termsFreq.getTermFrequencies();

// The terms themselves; read with the same index i as freq below.
terms = termsFreq.getTerms();

int noOfTerms = terms.length;
score = new float[noOfTerms];
DefaultSimilarity simi = new DefaultSimilarity();

        for (i = 0; i < noOfTerms; i++) {

            // One noOfDocsContainTerm(...) call per term: this is the step the question reports as slow.
            int noofDocsContainTerm = noOfDocsContainTerm(terms[i]);
            float tf = simi.tf(freq[i]);
            float idf = simi.idf(noofDocsContainTerm, noOfDocs);  
            score[i] = tf * idf ;

        }

////

/**
 * Counts the documents that match {@code querystr} by running a full search
 * and returning the number of collected hits.
 *
 * <p>The query string is parsed with a {@link StandardAnalyzer}, so it is
 * treated as query syntax rather than as a raw index term. The number of
 * collected hits is capped at {@code docNames.length}.
 *
 * @param querystr query text to parse and search for
 * @return number of hits collected for the parsed query
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if the index cannot be read
 * @throws ParseException if {@code querystr} is not valid query syntax
 */
public int noOfDocsContainTerm(String querystr) throws CorruptIndexException, IOException, ParseException {

    // NOTE(review): "docuemnt" looks like a typo for "document" - confirm against the
    // field name used when the index was built before changing it.
    QueryParser qp = new QueryParser(Version.LUCENE_35, "docuemnt", new StandardAnalyzer(Version.LUCENE_35));

    Query q = qp.parse(querystr);

    int hitsPerPage = docNames.length; // upper bound on the number of collected hits
    IndexSearcher searcher = new IndexSearcher(ramMemDir, true);
    try {
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);

        searcher.search(q, collector);

        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        return hits.length;
    } finally {
        // A searcher is opened on every call; without this close each call leaks one.
        searcher.close();
    }
}

score 8 · Accepted Answer

如果你有一个词项（term），并且想得到它的文档频率，即包含该词项的文档数量：调用 IndexReader.terms(Term) 方法，它会返回一个 TermEnum 对象；然后调用 TermEnum.docFreq() 方法，即可得到该词项在索引中的文档频率。也可以直接调用 IndexReader.docFreq(Term) 得到同样的结果。

score 4 · Accepted Answer

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/



import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;


/*
* Date Author Changes April 14, 2012 Kasun Perera Created
*/

/*
*
* Class contains methods for indexing documents with Lucene, and    calculating
* TFIDF weights
*/
public class DocIndexer {

private String docNames[];
private String docIDS[];
private String pathToIndex;
private String pathToDocumentCollection;
private String fiboTermList[]; //marked up fibo terms
private String taxoTermList[]; // marked up taxonomy terms
private RAMDirectory ramMemDir;
private String fileNames[];
private byte files[][];
private String filesInText[];
int noOfWordsOfDOc[];
int noOfSentencesOfDoc[];
ArrayList<String> ArrLstSentencesOfDoc[];
String removedTermsOfDOc[][];
int freqAfterRemovalOfDoc[][];
//int queryDocIndex ;
private int curDocNo;
private final int maxTerms = 1000000;




/**
 * Constructor used when indexing directory is a RAM memory directory, We
 * need RAM directory because Stratoes Server dosen't allow access local
 * files
 *
 * @param pathToIndex- doc index path 
 * @param pathToDocumentCollection - doccollection path
 */
public DocIndexer(String pathToIndex, String pathToDocumentCollection) {
  //  this.docNames = docNames;

    //this.bufPathToIndex= new RandomAccessBuffer() ;
  //  this.ramMemDir = new RAMDirectory();
    this.pathToIndex = pathToIndex;
    this.pathToDocumentCollection= pathToDocumentCollection;
    // this.files = files;
   // this.filesInText = docContent;

}




/**
 * Count the number of words in a given String
 *
 * @param line- Input String
 * @return - number of words in the input String
 */
private int wordCount(String line) {
    int numWords = 0;
    int index = 0;
    boolean prevWhiteSpace = true;
    while (index < line.length()) {
        char c = line.charAt(index++);
        boolean currWhiteSpace = Character.isWhitespace(c);
        if (prevWhiteSpace && !currWhiteSpace) {
            numWords++;
        }
        prevWhiteSpace = currWhiteSpace;
    }
    return numWords;
}

/*
*given it's URL this methods read the text files
*/
public static String fileReader(String filename) throws IOException {

    String filetext = null;
    BufferedReader reader = null;
    //BufferedReader namesReader; //reader for followers
    //Extractor extractor = new Extractor();
    File inFile = new File(filename);
    //File namesFile = new File(args[1]); //get followers file 
    //File userFile = new File(args[1]);

    //READING FROM USERS FILE
    reader = new BufferedReader(new FileReader(inFile));
    String line = null;

    int numLine = 0;

    while ((line = reader.readLine()) != null) {
        // numLine++;
        filetext = filetext + " " + line;

        // System.out.println(line);
    }

    reader.close();
    return filetext;
}

/**
 * Method to index the documents only using the content of the document
 * "docid" field is used for indexing, since Lucene Dosen't retrieve the
 * documents in the indexed order 
 *
 * @param docNo- document number of the document to be indexed
 * @throws IOException
 */
 public void indexDocs() throws IOException {
    //String pathToDocumentCollection = "F:\\karsha project\\Term Analysis\\keygraph docs\\selected_section_collection\\compelete_collection_2\\msrb_fibo_stopwords_replaced_term_docs\\";
   // String pathToIndex = "F:\\karsha project\\Term Analysis\\keygraph docs\\selected_section_collection\\compelete_collection_2\\INDEX_msrb_fibo_stopwords_replaced_term_docs";
    File folder = new File(pathToDocumentCollection);
    File[] listOfFiles = folder.listFiles();
    int noOfFiles = listOfFiles.length;
    System.out.println("Number of files : " + noOfFiles);

    IndexWriter iW;
    int indexDocCount = 0;
    try {
        NIOFSDirectory dir = new NIOFSDirectory(new File(pathToIndex));
        iW = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36)));

        for (int i = 0; i < noOfFiles; i++) {
            if (listOfFiles[i].isFile()) {
                String docName = listOfFiles[i].getName();
                System.out.println("doc name: " + docName + "length - " + listOfFiles[i].length());
                if (listOfFiles[i].length() > 1) {
                    String filesInText = fileReader(pathToDocumentCollection + docName);

                    //docIds[i] = docNames[i].substring( 0, docName.length() - 4 );
                    System.out.println("Added to index : " + docName);

                    //  StringReader strRdElt = new StringReader(filesInText[i]);
                    //filesInText = filesInText.replaceAll( "[^A-Za-z_]", " " );
                    //System.out.println( "Added to index : " + docName );
                    StringReader strRdElt = new StringReader(filesInText.replaceAll("\\d+(?:[.,]\\d+)*\\s*", ""));
                    StringReader docId = new StringReader(docName.substring(0, docName.length() - 4)); // give a unique doc Id here

                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

                    doc.add(new Field("doccontent", strRdElt, Field.TermVector.YES));
                    doc.add(new Field("docid", docId, Field.TermVector.YES));
                    iW.addDocument(doc);
                    indexDocCount++;
                }
            }
        }

        System.out.println("no of documents added to index : " + indexDocCount);

        iW.close();
        // dir.close() ;
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}



/**
 * This method calculates the TF-IDF score for each terms in the indexed
 * documents
 *
 * @param numberOfDocs
 * @return - Hashmap of TF-IDF score per each term in document wise
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException {

    int noOfDocs = docNames.length;

    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
    //HashMap<Integer, float[]> scoreMap = new HashMap<Integer, float[]>();


    try {

        IndexReader re = IndexReader.open(NIOFSDirectory.open(new File(pathToIndex)), true) ;
       // IndexReader re = IndexReader.open(ramMemDir);

        int i = 0;
        for (int k = 0; k < numberOfDocs; k++) {
            int freq[];
            TermFreqVector termsFreq;
            TermFreqVector termsFreqDocId;
            //TermFreqVector termsFreq3[];
            HashMap<String, Float> wordMap = new HashMap<String, Float>();
            String terms[];
            float score[] = null;

            //termsFreq3=re.getTermFreqVectors(currentDocID);
            termsFreq = re.getTermFreqVector(k, "doccontent");
            termsFreqDocId = re.getTermFreqVector(k, "docid");

            int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
            freq = termsFreq.getTermFrequencies();

            terms = termsFreq.getTerms();

            int noOfTerms = terms.length;
            score = new float[noOfTerms];
            DefaultSimilarity simi = new DefaultSimilarity();
            for (i = 0; i < noOfTerms; i++) {
                int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
                // System.out.println(terms[i]+"\t"+freq[i]);
                //int noofDocsContainTerm = docsContainTerm(terms[i], "docnames");
                float tf = simi.tf(freq[i]);
                float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                wordMap.put(terms[i], (tf * idf));

            }
            scoreMap.put(aInt, wordMap);
        }


    } catch (IOException e) {
        // score = null;
        e.printStackTrace();
    }



    //Map<Integer,Float[]> scoreMap=new Map<Integer, Float[]>(); 


    return scoreMap;
}


public HashMap<Integer, HashMap> getTFIDF() throws IOException, CorruptIndexException, ParseException, ClassNotFoundException {
    int noOfDocs = docNames.length;
    float tfIdfScore[][] = new float[noOfDocs][];
    //HashMap<Integer, float[]> scoreMap = new HashMap<Integer, float[]>();
    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();


    scoreMap = tfIdfScore(noOfDocs);




    return  scoreMap;
}

}

score 0 · Accepted Answer

使用 Lucene API 可以用一种简洁且高效的方式计算 idf。由于文档此前已经建立了索引，您可以直接使用 Lucene API 提供的统计数据。下面的代码同时计算 tf 和 idf：

/**
 * Computes a TF-IDF score for one word of one field from the statistics
 * exposed by the index searcher.
 *
 * @param FIELD  name of the field to take the statistics from
 * @param word   the word to score
 * @param reader reader over the index
 * @return the product of the similarity's tf and idf values
 * @throws IOException if reading the index statistics fails
 */
public double getTFIDFScoreInCollection(String FIELD, String word, IndexReader reader)
        throws IOException {

    final IndexSearcher indexSearcher = new IndexSearcher(reader);
    final ClassicSimilarity classic = new ClassicSimilarity();
    final IndexReaderContext topContext = indexSearcher.getTopReaderContext();

    // Number of documents counted for this field.
    final long docsInField = indexSearcher.collectionStatistics(FIELD).docCount();

    final BytesRef wordBytes = new BytesRef(word);

    // Term-frequency part.
    final float tfWeight = classic.tf(this.getTermFrequencyInCollection(FIELD, word));

    // Inverse-document-frequency part.
    final Term target = new Term(FIELD, wordBytes);
    final TermStatistics stats =
            indexSearcher.termStatistics(target, TermContext.build(topContext, target));
    final float idfWeight = classic.idf(stats.docFreq(), docsInField);

    return tfWeight * idfWeight;
}

并且不要忘记导入适当的依赖项：

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.util.BytesRef;

java - 使用 Lucene 计算 TFIDF 分数

3 回答 3

Related

Reference