每个人:
我正在 PyLucene 4.9.0 中开发自己的分析器,并在分析器中为 CompoundTokenFilter 创建了一个 TokenFilter,因为 DictionaryCompoundTokenFilter 表现不佳。
DictionaryCompoundTokenFilter 使用蛮力算法,但我只想在复合词中的子词都在字典中时拆分复合词,例如当“breast”和“cancer”都在给定字典中时拆分“breastcancer”。
但是在运行程序时,它显示“'CharTermAttribute'对象的属性'length'不可读”,我找不到它有什么问题。谢谢!
from __future__ import division
import lucene, math, itertools
from java.lang import CharSequence
from java.io import IOException
from java.util import LinkedList
from org.apache.pylucene.analysis import PythonTokenStream
from org.apache.lucene.analysis import TokenFilter
from org.apache.pylucene.analysis import PythonTokenFilter
from org.apache.lucene.analysis import TokenStream
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.analysis.tokenattributes import OffsetAttribute
from org.apache.lucene.analysis.tokenattributes import PositionIncrementAttribute
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.util import AttributeSource
from org.apache.lucene.util import Version
class CompoundTokenFilter(PythonTokenFilter):
def __init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE):
super(CompoundTokenFilter,self).__init__(input)
self.matchVersion=matchVersion
self.dictionary=dictionary
self.tokens=LinkedList()
self.minWordSize=DEFAULT_MIN_WORD_SIZE
self.minSubwordSize=DEFAULT_MIN_SUBWORD_SIZE
self.maxSubwordSize=DEFAULT_MAX_SUBWORD_SIZE
self.current=AttributeSource.State
self.termAtt=input.addAttribute(CharTermAttribute.class_)
self.offsetAtt=input.addAttribute(OffsetAttribute.class_)
self.posIncAtt=input.addAttribute(PositionIncrementAttribute.class_)
self.input=input
def decompose(self):
l=self.termAtt.length()
s=self.termAtt.subSequence(0,l)
if s in self.dictionary:
self.tokens.add(CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,0,l))
else:
d=filter(lambda x:len(x)>=self.minSubwordSize and len(x)<=self.maxSubwordSize in s,this.dictionary)
if len(d)>0:
start=int(math.floor(l/self.maxSubwordSize))
end=int(math.ceil(l/self.minSubwordSize))
subwords_combinations=[]
for i in xrange(start,end+1):
subwords_combinations.extend(itertools.permutations(d,i))
subwords_combinations=filter(lambda x:''.join(x)==s,subwords_combinations)
subwords=sorted(set(reduce(lambda x,y:x+y,subwords_combinations)),key=lambda x:-1*len(x))
for subword in subwords:
tokens.add(CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,s.find(subword),s.find(subword)+len(subword)))
def incrementToken(self):
if (not self.tokens.isEmpty()):
assert self.current!=None
token=self.tokens.removeFirst()
AttributeSource.restoreState(self.current)
self.termAtt.setEmpty().append(token.txt)
self.offsetAttribute.setOffset(token.startOffset, token.endOffset)
self.posIncAtt.setPositionIncrement(0)
return True
self.current=None
if(self.input.incrementToken()):
if self.termAtt.length()>=self.minWordSize:
decompose()
if not tokens.isEmpty():
self.current=AttributeSource.captureState()
return True
else:
return False
def reset(self):
super(CompoundTokenFilter,self).reset()
self.tokens.clear()
self.current=None
class CompoundToken:
def __init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE,offset,length):
compoundTokenFilter=CompoundTokenFilter(matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE)
self.txt=compoundTokenFilter.termAtt.subSequence(offset, offset + length)
startOff = compoundWordTokenFilterBase.this.offsetAtt.startOffset()
endOff = compoundWordTokenFilterBase.this.offsetAtt.endOffset()
if matchVersion.onOrAfter(Version.LUCENE_4_4) or endOff - startOff != compoundTokenFilter.termAtt.length():
self.startOffset = startOff
self.endOffset = endOff
else:
newStart = startOff + offset
self.startOffset = newStart
self.endOffset = newStart + length