Source code for katz.back_off

from katz.good_turing import GoodTuring
import itertools

[docs] class BackOff: def __init__(self, corpus): """Class to create a Katz back-off model from a corpus of text and evaluate the probability of a given tuple of words based on this corpus Args: :corpus (list of tuples): Tuples of words which form the corpus Returns: Backoff: Katz back-off model based on this corpus """ all_len = [len(s) for s in corpus] # Create the required GoodTuring Objects self.all_gt = [None] * (max(all_len)+1) for i in range(1, max(all_len)+1): # d = [s[:i] for s in corpus] # # Deal with edge effect # for j in range(all_len[0] - i): # d = d + [corpus[-1][j+1:i+j+1]] # self.all_gt[i] = GoodTuring(d) d = [s[-i:] for s in corpus if len(s) >= i] # The final n terms of the tuples d_start = [dd[:-1] for dd in d] # The first n-1 terms of the n-tuple # These come in pairs to you can evaluate: # C(w_{i-n+1) ... w_i) [first item] # C(w_{i-n+1) ... w_{i-1}) [second item] # Each pair in list is for a different n if i == 1: self.all_gt[i] = [GoodTuring(d), None] else: self.all_gt[i] = [GoodTuring(d), GoodTuring(d_start)] # Find all unique words in the corpus # self.words = list(sum(corpus, ())) self.words = list(itertools.chain(*corpus)) self.words = list(sorted(set(self.words), key=self.words.index))
[docs] def get_d(self, phrase): """ Compute the amount of discounting found by Good-Turing estimation Args: :phrase (tuple): Collection of words to find discounting for Returns: :d (float): Good-Turing estimate for discounting """ cstar = self.all_gt[len(phrase)][0].expected_count(phrase) # C^*(w_{i-n+1} ... w_i) c = self.all_gt[len(phrase)][0].actual_count(phrase) # C(w_{i-n+1} ... w_i) return cstar / c
[docs] def sort_endings(self, phrase): """ Find all ways of completing a phrase such that the new phrase appears in the corpus Args: :phrase (tuple): Collection of words to find valid completions to Returns: :seen (list): List of words which can complete the phrase to produce a phrase found in the corpus :unseen (list): List of words which, if appended to the phrase, produce a phrase NOT found in the corpus """ seen = [] unseen = [] for i, w in enumerate(self.words): if self.all_gt[len(phrase)+1][0].actual_count(phrase + (w,)) == 0: unseen.append(w) else: seen.append(w) return seen, unseen
[docs] def get_alpha(self, old_phrase): """ Compute the back-off weight Args: :old_phrase (tuple): The (n-1)-length tuple used to find the back-off weight for the n-length tuple Returns: :alpha (float): The back-off weight :beta (float): The left-over probability mass for the (n-1)-gram """ seen, unseen = self.sort_endings(old_phrase) beta = 0. # Sum over w_i: C(w_{i-n+1} ... w_i) > 0 [i.e. seen] for w in seen: new_phrase = old_phrase + (w,) d = self.get_d(new_phrase) # d_{w_{i-n+1} ... w_i} cnew = self.all_gt[len(new_phrase)][0].actual_count(new_phrase) # C(w_{i-n+1} ... w_i) beta += d * cnew cold = self.all_gt[len(new_phrase)][1].actual_count(old_phrase) # [1] since want to get C(w_{i-n+1} ... w_{i-1}) beta = 1 - beta / cold # beta_{w_{i-n+1} ... w_{i-1}} # Expect len(seen) < len(unseen) # Economical to only run len(seen) times and then use normalisation of probs alpha = 1. # Sum over w_i: C(w_{i-n+1} ... w_i) > 0 [i.e. seen] for w in seen: alpha -= self.get_pbo(w, old_phrase[1:]) # Pbo(w_i | w_{i-n+2} ... w_{i-1}) alpha = beta / alpha # alpha_{w_{i-n+1} ... w_{i-1}} return alpha, beta
[docs] def get_pbo(self, wnew, old_phrase): """ Compute the probability for word wnew given the preceeding set of words old_phrase Args: :wnew (str): The new word we wish to know the probability of obtaining :old_phrase (tuple): The preceeding phrase Returns: :pbo (float): The conditional probability P(wnew|old_phrase) """ new_phrase = old_phrase + (wnew,) cnew = self.all_gt[len(new_phrase)][0].actual_count(new_phrase) # C(w_{i-n+1} ... w_i) if cnew > 0 and len(old_phrase) > 0: d = self.get_d(new_phrase) # d_{w_{i-n+1} ... w_i} cold = self.all_gt[len(new_phrase)][1].actual_count(old_phrase) # C(w_{i-n+1} ... w_{i-1}) pbo = d * cnew / cold elif len(old_phrase) > 1: # Never saw new phrase, and old phrase has length > 1 if self.all_gt[len(new_phrase)][1].actual_count(old_phrase) > 0: # old_phrase appears as the start somewhere alpha, beta = self.get_alpha(old_phrase) # alpha_{w_{i-n+1} ... w_{i-1}} pbo = alpha * self.get_pbo(wnew, old_phrase[1:]) # alpha_{w_{i-n+1} ... w_{i-1}} * Pbo(w_i | w_{i-n+2} ... w_{i-1}) else: # If no data for (n-1)-gram, skip n-1 and use n-2 pbo = self.get_pbo(wnew, old_phrase[1:]) # Pbo(w_i | w_{i-n+2} ... w_{i-1}) elif len(old_phrase) > 0 and self.all_gt[len(new_phrase)][1].actual_count(old_phrase) > 0: # Never saw new phrase, but have an old phrase of length 1 which appears somehwere in corpus alpha, beta = self.get_alpha(old_phrase) # alpha_{w_{i-1}} pbo = alpha * self.all_gt[1][0].actual_count((wnew,)) / len(self.all_gt[1][0].corpus) # alpha_{w_{i-1}} * Pbo(w_{i} | . ) else: pbo = self.all_gt[1][0].actual_count((wnew,)) / len(self.all_gt[1][0].corpus) return pbo