# Reference (参考)
def fit(self, sentence_pairs):
    """Count, per context, how often each symbol occurs and is missed.

    Parameters:
        sentence_pairs - iterable of (original phrase, abbreviation) pairs.
            In the abbreviation, all missed symbols are replaced with "-".
            The two strings are compared position by position; if their
            lengths differ, the extra tail of the longer one is ignored.

    Populates:
        self.total_counter_ - context -> Counter of symbols seen after it
        self.missed_counter_ - context -> Counter of symbols that were
            replaced with "-" after it

    A context is the self.order symbols of the original phrase that
    immediately precede a symbol (the empty string when self.order is 0).
    The first self.order symbols of every phrase are skipped, since they
    have no full context in front of them.
    """
    # Pass the Counter class itself as the factory rather than a lambda:
    # the counts are the same, but the fitted counters stay picklable.
    self.missed_counter_ = defaultdict(Counter)
    self.total_counter_ = defaultdict(Counter)
    order = self.order
    for original, observed in sentence_pairs:
        aligned = zip(original[order:], observed[order:])
        for i, (original_letter, observed_letter) in enumerate(aligned):
            # Index i in the shifted strings is index i + order in the
            # original, so this slice is the `order` symbols before it.
            context = original[i:i + order]
            self.total_counter_[context][original_letter] += 1
            if observed_letter == '-':
                self.missed_counter_[context][original_letter] += 1
def predict_proba(self, context, last_letter):
    """Estimate of probability of last_letter being missed after context.

    Parameters:
        context - the symbols preceding last_letter; only the final
            self.order symbols are used (none when self.order is 0)
        last_letter - the symbol whose missing probability is estimated

    Returns:
        (missed count + self.smoothing_missed)
        / (total count + self.smoothing_total),
        where both counts are looked up for (local context, last_letter)
        and are 0 for combinations that were never counted.

    The division raises ZeroDivisionError when the count is 0 and
    self.smoothing_total is 0.
    """
    # context[-0:] would return the whole string, hence the explicit branch.
    local = context[-self.order:] if self.order else ''
    # Look up with .get() instead of indexing: indexing a defaultdict with
    # an unseen context would insert an empty entry, so every prediction
    # would silently grow the fitted counters.
    missed_count = self.missed_counter_.get(local, {}).get(last_letter, 0)
    total_count = self.total_counter_.get(local, {}).get(last_letter, 0)
    missed_freq = missed_count + self.smoothing_missed
    total_freq = total_count + self.smoothing_total
    return missed_freq / total_freq