我正在尝试在概率 CFG 上使用 CYK 算法构建解析树。这是我的概率 CFG
pcfg = PCFG.fromstring("""S -> NP VP [1.0]
NP -> DT NN [0.5]
NP -> NP PP [0.25]
NP -> 'John' [0.1]
NP -> 'I' [0.15]
PP -> P NP [1.0]
VP -> V [0.4]
VP -> Vt NP [0.4]
VP -> VP PP [0.2]
V -> 'sleeps' [0.5]
V -> 'laughs' [0.5]
Vt -> 'saw' [0.7]
Vt -> 'ate' [0.3]
P -> 'with' [0.7]
P -> 'under' [0.3]
DT -> 'the' [0.7]
DT -> 'a' [0.3]
NN -> 'man' [0.7]
NN -> 'woman' [0.2]
NN -> 'telescope' [0.1]
""")
这是 CYK 构建解析树的代码:
class PCFGParser():
def __init__(self,grammar):
#Initialize Nonterminals and the Unary and Binary rules
self.unary_rules = []
self.binary_rules =[]
self.N =['S','NP','PP','VP','V','Vt','P','DT','NN'] # List of non-terminals from the grammar
for rule in grammar.productions():
if len(rule)==1:
self.unary_rules.append(rule)
elif len(rule)==2:
self.binary_rules.append(rule)
def q(self, X , Y, Z):
#Returns probabilities for Binary rules
print('binary_func_called')
print("X : ",X,"TYPE X :",type(X))
print("Y : ",Y,"TYPE Y :",type(Y))
print("Z : ",Z,"TYPE Z :",type(Z))
for rule in self.binary_rules:
print('RULE LHS : ',rule.lhs(),'RULE LHS TYPE :',type(str(rule.lhs())))
print('RULE RHS 0 : ',rule.rhs()[0],'RULE RHS 0 TYPE : ',type(str(rule.rhs()[0])))
print('RULE RHS 1 : ',rule.rhs()[1],'RULE RHS 0 TYPE : ',type(str(rule.rhs()[1])))
if str(rule.lhs())==X and str(rule.rhs()[0])==Y and str(rule.rhs()[1])==Z:
print('working- returns prob')
return rule.prob()
else:
print(' not satisfied')
return 0
def q_unary(self, X, W):
#Returns probabilities for Unary rules
for rule in self.unary_rules:
if str(rule.lhs())==X and rule.rhs()[0]==W:
return rule.prob()
return 0
def parse(self, sentence):
#Calls the CYK Algorithm and stores the parse tree in a JSON Format
sentence = sentence.strip()
print (json.dumps(self.CKY(sentence.split(' '))))
def CKY(self, x):
#Returns Tree for a grammar in Chomsky-Normal Form
n = len(x) # length of sentence x
pi = defaultdict(float) # DP table pi
bp = {} # back pointers
# Base case
for i in range(n):
w = x[i]
for X in self.N:
pi[i, i, X] = self.q_unary(X, w)
# Recursive case
for l in range(1, n):
for i in range(n-l):
j = i + l
for X in self.N:
max_score = 0
args = None
for R in self.binary_rules: # search only within the rules with non-zero probability
if str(R.lhs()) == X: # consider rules which start from X
Y,Z= R.rhs()
Y = str(Y)
Z = str(Z)
for s in range(i, j):
if pi[i, s, Y] and pi[s + 1, j, Z]: # calculate score if both pi entries have non-zero score
score = self.q(X, Y, Z) * pi[i, s, Y] * pi[s + 1, j, Z]
if max_score < score:
max_score = score
args = Y, Z, s
if max_score: # update table and back pointers
pi[i, j, X] = max_score
bp[i, j, X] = args
# Backtrack to retrieve the tree
if pi[0, n-1, 'S']:
return self.backtrack_tree(x, bp, 0, n-1, 'S')
else: # if start symbol is not 'S'
max_score = 0
args = None
for X in self.N:
print(pi[0, n-1, X])
if max_score < pi[0, n-1, X]:
print('if - working')
max_score = pi[0, n-1, X]
args = 0, n-1, X
return self.backtrack_tree(x, bp, *args)
def backtrack_tree(self, sentence, bp, i, j, X):
#Recurse to get the parse tree
if i == j:
return [X, sentence[i]]
else:
Y, Z, s = bp[i, j, X]
return [X, self.backtrack_tree(sentence, bp, i, s, Y),
self.backtrack_tree(sentence, bp, s+1, j, Z)]
parser = PCFGParser(pcfg)
parser.parse('the man sleeps')
但是,介于两者之间的概率计算不正确,导致以下输出以及错误:
binary_func_called
X : NP TYPE X : <class 'str'>
Y : DT TYPE Y : <class 'str'>
Z : NN TYPE Z : <class 'str'>
RULE LHS : S RULE LHS TYPE : <class 'str'>
RULE RHS 0 : NP RULE RHS 0 TYPE : <class 'str'>
RULE RHS 1 : VP RULE RHS 0 TYPE : <class 'str'>
not satisfied
RULE LHS : NP RULE LHS TYPE : <class 'str'>
RULE RHS 0 : DT RULE RHS 0 TYPE : <class 'str'>
RULE RHS 1 : NN RULE RHS 0 TYPE : <class 'str'>
working- returns prob
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-16-b0f35f4387c1> in <module>
----> 1 parser.parse('the man sleeps')
<ipython-input-11-33bb4cfed9b6> in parse(self, sentence)
57
58 sentence = sentence.strip()
---> 59 print (json.dumps(self.CKY(sentence.split(' '))))
60
61
<ipython-input-11-33bb4cfed9b6> in CKY(self, x)
109 max_score = pi[0, n-1, X]
110 args = 0, n-1, X
--> 111 return self.backtrack_tree(x, bp, *args)
112
113
TypeError: backtrack_tree() argument after * must be an iterable, not NoneType
我遵循的算法:
有人可以帮我完成这项工作吗?我已经正确地遵循了上述算法,为什么它会失败?我已经尽力了,但仍然无法弄清楚如何解决这个问题。递归似乎存在问题,因为它返回 S -> NP VP 的 0 概率。