要进行实体提取,首先需要把文档内容读取为字符串(String)。将文档文本转换为 String 的方法有很多,可以在 Stack Overflow 上查找(简而言之:纯文本文件可以用 BufferedInputStream 读取,MS Office 和 PDF 文件可以用 Apache Tika 提取文本)。
文档文本读入内存之后,下面的代码可以帮助你完成句子边界检测、分词(tokenization)和命名实体识别(NER)。拿到结果后,你可以按自己需要的方式生成 XML 文档,其中可以包含文档名/文档 ID、必要的文件元数据、实体字符串本身、实体类型以及 Span(命名实体在文本中出现的位置)。
package processors;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
public class OpenNLPNER implements Runnable
static TokenizerModel tm = null;
static TokenNameFinderModel locModel = null;
String doc;
NameFinderME myNameFinder;
TokenizerME wordBreaker;
SentenceDetector sd;
public OpenNLPNER()
public OpenNLPNER(String document, SentenceDetector sd, NameFinderME mf, TokenizerME wordBreaker)
System.out.println("got doc");
this.sd = sd;
this.myNameFinder = mf;
this.wordBreaker = wordBreaker;
doc = document;
private static List<String> getMyDocsFromSomewhere()
//this should return an object that has all the info about the doc you want
return new ArrayList<String>();
public static void main(String[] args)
String modelPath = "c:\\temp\\opennlpmodels\\";
if (tm == null)
//user does normal namefinder instantiations...
InputStream stream = new FileInputStream(new File(modelPath + "en-token.zip"));
// new SentenceDetectorME(new SentenceModel(new FileInputStream(new File(modelPath + "en-sent.zip"))));
tm = new TokenizerModel(stream);
// new TokenizerME(tm);
locModel = new TokenNameFinderModel(new FileInputStream(new File(modelPath + "en-ner-location.bin")));
// new NameFinderME(locModel);
System.out.println("getting data");
List<String> docs = getMyDocsFromSomewhere();
System.out.println("\tdone getting data");
// FileWriter fw = new FileWriter("C:\\apache\\modelbuilder\\sentences.txt");
for (String docu : docs)
//you could also use the runnable here and launch in a diff thread
new OpenNLPNER(docu,
new SentenceDetectorME(new SentenceModel(new FileInputStream(new File(modelPath + "en-sent.zip")))),
new NameFinderME(locModel), new TokenizerME(tm)).run();
} catch (Exception ex)
public void run()
} catch (Exception ex)
public void process(String document) throws Exception
// System.out.println(document);
//user instantiates the non static entitylinkerproperty object and constructs is with a pointer to the prop file they need to use
String modelPath = "C:\\apache\\entitylinker\\";
//input document
//user splits doc to sentences
String[] sentences = sd.sentDetect(document);
//get the sentence spans
Span[] sentenceSpans = sd.sentPosDetect(document);
Span[][] allnamesInDoc = new Span[sentenceSpans.length][];
String[][] allTokensInDoc = new String[sentenceSpans.length][];
for (int sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++)
String[] stringTokens = wordBreaker.tokenize(sentences[sentenceIndex]);
Span[] tokenSpans = wordBreaker.tokenizePos(sentences[sentenceIndex]);
Span[] spans = myNameFinder.find(stringTokens);
allnamesInDoc[sentenceIndex] = spans;
allTokensInDoc[sentenceIndex] = stringTokens;
//now access the data like this...
for (int s = 0; s < sentenceSpans.length; s++)
Span[] namesInSentence = allnamesInDoc[s];
String[] tokensInSentence = allTokensInDoc[s];
String[] entities = Span.spansToStrings(namesInSentence, tokensInSentence);
for (String entity : entities)
//start building up the XML here....
System.out.println(entity + " Was in setnence " + s + " @ " + namesInSentence[s].toString());