I am trying to highlight terms in a string. My code walks along the string and looks up equivalent terms in an index, and it returns the found terms fine. However, I want to hand back the user's original input string with the found terms highlighted. I am using Lucene 4 because that is the version covered by the book I am learning Lucene from. I have made a pitiful attempt at getting term vectors and the like, but it iterates over the whole field and I don't know how to get at just the found terms. Here is my code:
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class TokenArrayTest {

    private static final String INDEX_DIR = "C:/ontologies/Lucene/icnpIndex";
    //private static List<Float> levScore = new ArrayList<Float>();
    //add key and value pairs of tokens to a map to send to a servlet. key 10, 11, 12 etc.
    //private static HashMap<Integer, String> hashMap = new HashMap<Integer, String>();
    private static List<String> tokens = new ArrayList<String>();
    private static int totalResults = 0;

    public static void main(String[] pArgs) throws IOException, ParseException, InvalidTokenOffsetsException {
        //counters which detect when the found term changes, to advance the html table to the next cell
        int b = 1;
        int c = 1;
        String searchText = "Mrs. Smith has limited mobility and fell out of bed. She needs a feeding assessment. She complained of abdominal pains during the night. She woke with a headache and she is due for a shower this morning.";
        //Get directory reference
        Directory dir = FSDirectory.open(new File(INDEX_DIR));
        //Index reader - an interface for accessing a point-in-time view of a Lucene index
        IndexReader reader = DirectoryReader.open(dir);
        //Create the Lucene searcher. It searches over a single IndexReader.
        IndexSearcher searcher = new IndexSearcher(reader);
        //Analyzer with the default stop words
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(searchText));
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        //Query parser used to build a query from each analyzed token
        QueryParser qp = new QueryParser(Version.LUCENE_40, "Preferred Term", analyzer);
        /*Add all of the words to a list after they have passed through the analyzer.
         * The words are fed one by one into the query method later on.
         */
        tokenStream.reset(); //in Lucene 4 the stream must be reset before incrementToken()
        while (tokenStream.incrementToken()) {
            tokens.add(termAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
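        //e.g. for the searchText above, StandardAnalyzer lower-cases the words and
        //drops stop words such as "and" and "of", so tokens now holds roughly:
        //[mrs, smith, has, limited, mobility, fell, out, bed, she, needs, ...]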
        //print the top half of the html page
        System.out.print("<html>\r\n" +
                "\r\n" +
                "<head>\r\n" +
                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1252\">\r\n" +
                "\r\n" +
                "<title>ICNP results</title>\r\n" +
                "</head>\r\n" +
                "\r\n" +
                "<body>\r\n" +
                "\r\n" +
                "<p>" +
                searchText + "<br>" +
                "<p>" +
                "<div align=\"center\">\r\n" +
                "  <center>\r\n" +
                "  <table border=\"1\">\r\n" +
                "    <tr>\r\n" +
                "<td>\r\n" +
                "");
        //place each word from the previous list into the query
        for (int n = 0; n < tokens.size(); ++n) {
            //Create the query
            Query query = qp.parse(tokens.get(n));
            //Search the lucene documents for the hits
            TopDocs hits = searcher.search(query, 20);
            //Total found documents
            totalResults = totalResults + hits.totalHits;
            //print out the score for each searched term
            //for (ScoreDoc sd : hits.scoreDocs) {
            //    Document d = searcher.doc(sd.doc);
            //    System.out.println("Score : " + sd.score);
            //}
            /* Highlighter code start */
            //Put html markup in here for each found term if need be
            Formatter formatter = new SimpleHTMLFormatter("", "");
            //Scores text fragments by the number of unique query terms found
            QueryScorer scorer = new QueryScorer(query);
            //Used to mark up highlighted terms found in the best sections of a text
            Highlighter highlighter = new Highlighter(formatter, scorer);
            //Breaks text up into same-size fragments but does not split up spans
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 20);
            //set the fragmenter on the highlighter
            highlighter.setTextFragmenter(fragmenter);
            //Iterate over the found results
            for (int i = 0; i < hits.scoreDocs.length; i++) {
                int docid = hits.scoreDocs[i].doc;
                Document doc = searcher.doc(docid);
                //Get the stored text from the found document
                String text = doc.get("Preferred Term");
                //a pitiful attempt to get term vectors and such like; this can only
                //work if "Preferred Term" was indexed with term vectors, positions and offsets
                Terms termsVector = reader.getTermVector(docid, "Preferred Term");
                if (termsVector != null) {
                    TermsEnum termsEnum = termsVector.iterator(null);
                    DocsAndPositionsEnum docsAndPositionsEnum = null;
                    BytesRef term;
                    while ((term = termsEnum.next()) != null) {
                        String val = term.utf8ToString();
                        System.out.println("DocId: " + docid);
                        System.out.println(" term: " + val);
                        System.out.println(" length: " + term.length);
                        docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
                        if (docsAndPositionsEnum != null
                                && docsAndPositionsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            int freq = docsAndPositionsEnum.freq();
                            System.out.println(" freq: " + freq);
                            for (int j = 0; j < freq; j++) {
                                System.out.println(" [");
                                System.out.println(" position: " + docsAndPositionsEnum.nextPosition());
                                System.out.println(" offset start: " + docsAndPositionsEnum.startOffset());
                                System.out.println(" offset end: " + docsAndPositionsEnum.endOffset());
                                System.out.println(" ]");
                            }
                        }
                    }
                }
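                //A sketch of what I think I actually want (not part of my original code):
                //instead of walking the whole term vector, seek straight to the current
                //query term. Again this assumes "Preferred Term" was indexed with term
                //vectors, positions and offsets.
                Terms vector = reader.getTermVector(docid, "Preferred Term");
                if (vector != null) {
                    TermsEnum seekEnum = vector.iterator(null);
                    if (seekEnum.seekExact(new BytesRef(tokens.get(n)), true)) {
                        DocsAndPositionsEnum positions = seekEnum.docsAndPositions(null, null);
                        if (positions != null && positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int k = 0; k < positions.freq(); k++) {
                                positions.nextPosition();
                                System.out.println("match at offsets " + positions.startOffset()
                                        + "-" + positions.endOffset());
                            }
                        }
                    }
                }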
                //Create a token stream over the stored field text
                TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "Preferred Term", analyzer);
                //Get the highlighted text fragments
                String[] frags = highlighter.getBestFragments(stream, text, 20);
                for (String frag : frags) {
                    //On the first pass print this html out
                    if ((c == 1) && (b != c)) {
                        System.out.println("<select>");
                        c = b;
                    } else if (b != c) { //and every other time move to the next cell when b changes
                        System.out.println("</select>"
                                + "</td><td>"
                                + "<select>");
                        c = b;
                    }
                    System.out.println("<option value='" + frag + "'>" + frag + "</option>");
                }
            }
            b = b + 1;
        }
        reader.close();
        dir.close();
        b = 1;
        c = 1;
        totalResults = 0;
        //print the bottom half of the html page
        System.out.print("</select></td>\r\n" +
                "  </tr>\r\n" +
                "  </table>\r\n" +
                "  </center>\r\n" +
                "</div>\r\n" +
                "\r\n" +
                "</body>\r\n" +
                "\r\n" +
                "</html>\r\n" +
                "");
    }
}
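What I suspect I am missing is that the Highlighter does not have to run over the stored field text at all: it can run over the user's original input string itself. Here is a minimal sketch of what I mean (untested), reusing the analyzer and query from the code above; NullFragmenter and getBestFragment are the standard highlighter classes, and the <b> tags are just placeholder markup:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class SearchTextHighlighter {
    //Return the user's original input with each query match wrapped in <b> tags
    public static String highlight(String searchText, Query query, Analyzer analyzer)
            throws IOException, InvalidTokenOffsetsException {
        //Non-empty pre/post tags this time, so the matches are actually visible
        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("<b>", "</b>"), new QueryScorer(query));
        //NullFragmenter keeps the whole input as a single fragment, so the
        //sentence comes back intact instead of chopped into 20-character pieces
        highlighter.setTextFragmenter(new NullFragmenter());
        TokenStream stream = analyzer.tokenStream(null, new StringReader(searchText));
        String result = highlighter.getBestFragment(stream, searchText);
        //getBestFragment returns null when the query matched nothing
        return result != null ? result : searchText;
    }
}

If that is right, calling SearchTextHighlighter.highlight(searchText, query, analyzer) inside the token loop should give me back the whole original sentence with the current term bolded, which is the output I am after. Is this the correct approach, or is the term vector route the better one?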