2

我正在 TREC 文档集上运行我的代码,目前已经实现了评分方案,用来获取相关文档的数量。接下来我想实现布尔检索,并尝试使用 HitCollector。

下面是我的代码:

公共类 BatchSearch {

private BatchSearch() {}

/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    String usage =
            "Usage:\tjava BatchSearch [-index dir] [-simfn similarity] [-field f] [-queries file]";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.out.println("Supported similarity functions:\ndefault: DefaultSimilary (tfidf)\n");
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    String simstring = "default";

    for(int i = 0;i < args.length;i++) {
        if ("-index".equals(args[i])) {
            index = args[i+1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i+1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i+1];
            i++;
        } else if ("-simfn".equals(args[i])) {
            simstring = args[i+1];
            i++;
        }
    }

    Similarity simfn = null;
    if ("default".equals(simstring)) {
        simfn = new DefaultSimilarity();
    } else if ("bm25".equals(simstring)) {
        simfn = new BM25Similarity();
    } else if ("dfr".equals(simstring)) {
        simfn = new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH2());
    } else if ("lm".equals(simstring)) {
        simfn = new LMDirichletSimilarity();
    }
    if (simfn == null) {
        System.out.println(usage);
        System.out.println("Supported similarity functions:\ndefault: DefaultSimilary (tfidf)");
        System.out.println("bm25: BM25Similarity (standard parameters)");
        System.out.println("dfr: Divergence from Randomness model (PL2 variant)");
        System.out.println("lm: Language model, Dirichlet smoothing");
        System.exit(0);
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(simfn);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41);

    BufferedReader in = null;
    if (queries != null) {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
    } else {
        in = new BufferedReader(new InputStreamReader(new FileInputStream("queries"), "UTF-8"));
    }
    QueryParser parser = new QueryParser(Version.LUCENE_41, field, analyzer);
    while (true) {
        String line = in.readLine();

        if (line == null || line.length() == -1) {
            break;
        }

        line = line.trim();
        if (line.length() == 0) {
            break;
        }

        String[] pair = line.split(" ", 2);
        Query query = parser.parse(pair[1]);

        doBatchSearch(in, searcher, pair[0], query, simstring);

    }
    reader.close();
}

/**
 * This function performs a top-1000 search for the query as a basic TREC run.
 */
public static void doBatchSearch(BufferedReader in, IndexSearcher searcher, String qid, Query query, String runtag)  
        throws IOException {

    // Collect enough docs to show 5 pages
    TopDocs results = searcher.search(query, 1000);
    ScoreDoc[] hits = results.scoreDocs;
    HashMap<String, String> seen = new HashMap<String, String>(1000);
    int numTotalHits = results.totalHits;

    int start = 0;
    int end = Math.min(numTotalHits, 1000);

    for (int i = start; i < end; i++) {
        Document doc = searcher.doc(hits[i].doc);
        String docno = doc.get("docno");
        // There are duplicate document numbers in the FR collection, so only output a given
        // docno once.
        if (seen.containsKey(docno)) {
            continue;
        }
        seen.put(docno, docno);
        System.out.println(qid+" Q0 "+docno+" "+i+" "+hits[i].score+" "+runtag);
    }
}

}

评分是在 doBatchSearch 中完成的,现在我想在这个方法里实现 HitCollector。

4

0 回答 0