我正在 TREC 文档上运行我的代码,现在正在实施评分方案以获取相关文档的数量。但是现在我想实现布尔检索,我正在尝试使用 HitCollector。
下面是我的代码:
公共类 BatchSearch {
private BatchSearch() {}
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
String usage =
"Usage:\tjava BatchSearch [-index dir] [-simfn similarity] [-field f] [-queries file]";
if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
System.out.println(usage);
System.out.println("Supported similarity functions:\ndefault: DefaultSimilary (tfidf)\n");
System.exit(0);
}
String index = "index";
String field = "contents";
String queries = null;
String simstring = "default";
for(int i = 0;i < args.length;i++) {
if ("-index".equals(args[i])) {
index = args[i+1];
i++;
} else if ("-field".equals(args[i])) {
field = args[i+1];
i++;
} else if ("-queries".equals(args[i])) {
queries = args[i+1];
i++;
} else if ("-simfn".equals(args[i])) {
simstring = args[i+1];
i++;
}
}
Similarity simfn = null;
if ("default".equals(simstring)) {
simfn = new DefaultSimilarity();
} else if ("bm25".equals(simstring)) {
simfn = new BM25Similarity();
} else if ("dfr".equals(simstring)) {
simfn = new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH2());
} else if ("lm".equals(simstring)) {
simfn = new LMDirichletSimilarity();
}
if (simfn == null) {
System.out.println(usage);
System.out.println("Supported similarity functions:\ndefault: DefaultSimilary (tfidf)");
System.out.println("bm25: BM25Similarity (standard parameters)");
System.out.println("dfr: Divergence from Randomness model (PL2 variant)");
System.out.println("lm: Language model, Dirichlet smoothing");
System.exit(0);
}
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(simfn);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41);
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
} else {
in = new BufferedReader(new InputStreamReader(new FileInputStream("queries"), "UTF-8"));
}
QueryParser parser = new QueryParser(Version.LUCENE_41, field, analyzer);
while (true) {
String line = in.readLine();
if (line == null || line.length() == -1) {
break;
}
line = line.trim();
if (line.length() == 0) {
break;
}
String[] pair = line.split(" ", 2);
Query query = parser.parse(pair[1]);
doBatchSearch(in, searcher, pair[0], query, simstring);
}
reader.close();
}
/**
* This function performs a top-1000 search for the query as a basic TREC run.
*/
public static void doBatchSearch(BufferedReader in, IndexSearcher searcher, String qid, Query query, String runtag)
throws IOException {
// Collect enough docs to show 5 pages
TopDocs results = searcher.search(query, 1000);
ScoreDoc[] hits = results.scoreDocs;
HashMap<String, String> seen = new HashMap<String, String>(1000);
int numTotalHits = results.totalHits;
int start = 0;
int end = Math.min(numTotalHits, 1000);
for (int i = start; i < end; i++) {
Document doc = searcher.doc(hits[i].doc);
String docno = doc.get("docno");
// There are duplicate document numbers in the FR collection, so only output a given
// docno once.
if (seen.containsKey(docno)) {
continue;
}
seen.put(docno, docno);
System.out.println(qid+" Q0 "+docno+" "+i+" "+hits[i].score+" "+runtag);
}
}
}
评分是在 doBatchSearch 中完成的,现在我想在这里实现 HitCollector。