from katz.good_turing import GoodTuring
import itertools
[docs]
class BackOff:
def __init__(self, corpus):
"""Class to create a Katz back-off model from a corpus of text and evaluate
the probability of a given tuple of words based on this corpus
Args:
:corpus (list of tuples): Tuples of words which form the corpus
Returns:
Backoff: Katz back-off model based on this corpus
"""
all_len = [len(s) for s in corpus]
# Create the required GoodTuring Objects
self.all_gt = [None] * (max(all_len)+1)
for i in range(1, max(all_len)+1):
# d = [s[:i] for s in corpus]
# # Deal with edge effect
# for j in range(all_len[0] - i):
# d = d + [corpus[-1][j+1:i+j+1]]
# self.all_gt[i] = GoodTuring(d)
d = [s[-i:] for s in corpus if len(s) >= i] # The final n terms of the tuples
d_start = [dd[:-1] for dd in d] # The first n-1 terms of the n-tuple
# These come in pairs to you can evaluate:
# C(w_{i-n+1) ... w_i) [first item]
# C(w_{i-n+1) ... w_{i-1}) [second item]
# Each pair in list is for a different n
if i == 1:
self.all_gt[i] = [GoodTuring(d), None]
else:
self.all_gt[i] = [GoodTuring(d), GoodTuring(d_start)]
# Find all unique words in the corpus
# self.words = list(sum(corpus, ()))
self.words = list(itertools.chain(*corpus))
self.words = list(sorted(set(self.words), key=self.words.index))
[docs]
def get_d(self, phrase):
"""
Compute the amount of discounting found by Good-Turing estimation
Args:
:phrase (tuple): Collection of words to find discounting for
Returns:
:d (float): Good-Turing estimate for discounting
"""
cstar = self.all_gt[len(phrase)][0].expected_count(phrase) # C^*(w_{i-n+1} ... w_i)
c = self.all_gt[len(phrase)][0].actual_count(phrase) # C(w_{i-n+1} ... w_i)
return cstar / c
[docs]
def sort_endings(self, phrase):
"""
Find all ways of completing a phrase such that the new phrase appears
in the corpus
Args:
:phrase (tuple): Collection of words to find valid completions to
Returns:
:seen (list): List of words which can complete the phrase to produce a phrase found in the corpus
:unseen (list): List of words which, if appended to the phrase, produce a phrase NOT found in the corpus
"""
seen = []
unseen = []
for i, w in enumerate(self.words):
if self.all_gt[len(phrase)+1][0].actual_count(phrase + (w,)) == 0:
unseen.append(w)
else:
seen.append(w)
return seen, unseen
[docs]
def get_alpha(self, old_phrase):
"""
Compute the back-off weight
Args:
:old_phrase (tuple): The (n-1)-length tuple used to find the back-off weight for the n-length tuple
Returns:
:alpha (float): The back-off weight
:beta (float): The left-over probability mass for the (n-1)-gram
"""
seen, unseen = self.sort_endings(old_phrase)
beta = 0.
# Sum over w_i: C(w_{i-n+1} ... w_i) > 0 [i.e. seen]
for w in seen:
new_phrase = old_phrase + (w,)
d = self.get_d(new_phrase) # d_{w_{i-n+1} ... w_i}
cnew = self.all_gt[len(new_phrase)][0].actual_count(new_phrase) # C(w_{i-n+1} ... w_i)
beta += d * cnew
cold = self.all_gt[len(new_phrase)][1].actual_count(old_phrase) # [1] since want to get C(w_{i-n+1} ... w_{i-1})
beta = 1 - beta / cold # beta_{w_{i-n+1} ... w_{i-1}}
# Expect len(seen) < len(unseen)
# Economical to only run len(seen) times and then use normalisation of probs
alpha = 1.
# Sum over w_i: C(w_{i-n+1} ... w_i) > 0 [i.e. seen]
for w in seen:
alpha -= self.get_pbo(w, old_phrase[1:]) # Pbo(w_i | w_{i-n+2} ... w_{i-1})
alpha = beta / alpha # alpha_{w_{i-n+1} ... w_{i-1}}
return alpha, beta
[docs]
def get_pbo(self, wnew, old_phrase):
"""
Compute the probability for word wnew given the preceeding set of words old_phrase
Args:
:wnew (str): The new word we wish to know the probability of obtaining
:old_phrase (tuple): The preceeding phrase
Returns:
:pbo (float): The conditional probability P(wnew|old_phrase)
"""
new_phrase = old_phrase + (wnew,)
cnew = self.all_gt[len(new_phrase)][0].actual_count(new_phrase) # C(w_{i-n+1} ... w_i)
if cnew > 0 and len(old_phrase) > 0:
d = self.get_d(new_phrase) # d_{w_{i-n+1} ... w_i}
cold = self.all_gt[len(new_phrase)][1].actual_count(old_phrase) # C(w_{i-n+1} ... w_{i-1})
pbo = d * cnew / cold
elif len(old_phrase) > 1: # Never saw new phrase, and old phrase has length > 1
if self.all_gt[len(new_phrase)][1].actual_count(old_phrase) > 0:
# old_phrase appears as the start somewhere
alpha, beta = self.get_alpha(old_phrase) # alpha_{w_{i-n+1} ... w_{i-1}}
pbo = alpha * self.get_pbo(wnew, old_phrase[1:]) # alpha_{w_{i-n+1} ... w_{i-1}} * Pbo(w_i | w_{i-n+2} ... w_{i-1})
else:
# If no data for (n-1)-gram, skip n-1 and use n-2
pbo = self.get_pbo(wnew, old_phrase[1:]) # Pbo(w_i | w_{i-n+2} ... w_{i-1})
elif len(old_phrase) > 0 and self.all_gt[len(new_phrase)][1].actual_count(old_phrase) > 0:
# Never saw new phrase, but have an old phrase of length 1 which appears somehwere in corpus
alpha, beta = self.get_alpha(old_phrase) # alpha_{w_{i-1}}
pbo = alpha * self.all_gt[1][0].actual_count((wnew,)) / len(self.all_gt[1][0].corpus) # alpha_{w_{i-1}} * Pbo(w_{i} | . )
else:
pbo = self.all_gt[1][0].actual_count((wnew,)) / len(self.all_gt[1][0].corpus)
return pbo