我正在尝试下面的代码来获取泰语句子中的所有标记。它抛出异常。谁能指出我在 JAVA 中标记泰语?
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class Tokenizer{
public static void main(String[] args) throws IOException {
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader("การที่ได้ต้องแสดงว่างานดี"));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
TokenStreamComponents tt = new TokenStreamComponents(tokenizer, filter);
TokenStream ts = tt.getTokenStream();
CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while(ts.incrementToken()){
System.out.println(cattr.toString()+"-----");
}
}
}
例外如下
Exception in thread "main" java.lang.ExceptionInInitializerError
at org.apache.lucene.analysis.icu.segmentation.ICUTokenizer.<init>(ICUTokenizer.java:72)
at com.tokenizer.tt.main(tt.java:22)
Caused by: java.lang.RuntimeException: java.io.IOException: ICU data file error: Not an ICU data file
at org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig.readBreakIterator(DefaultICUTokenizerConfig.java:128)
at org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig.<clinit>(DefaultICUTokenizerConfig.java:66)
... 2 more
Caused by: java.io.IOException: ICU data file error: Not an ICU data file
at com.ibm.icu.impl.ICUBinary.readHeader(ICUBinary.java:577)
at com.ibm.icu.text.RBBIDataWrapper.get(RBBIDataWrapper.java:173)
at com.ibm.icu.text.RuleBasedBreakIterator.getInstanceFromCompiledRules(RuleBasedBreakIterator.java:71)
at org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig.readBreakIterator(DefaultICUTokenizerConfig.java:123)
... 3 more