import collections
import numpy as np
import scipy.stats
[docs]
class GoodTuring:
def __init__(self, corpus):
"""
Class to perform Good-Turing frequency estimation
Args:
:corpus (list): List of objects for which we want to produce Good-Turing frequency estimates.
Each entry is considered a word
Returns:
GoodTuring: Object to perform Good-Turing frequency estimates
"""
# Store the data
self.corpus = corpus
# Dict for frequency of each element
self.R = dict(collections.Counter(corpus))
# Array for frequency of frequencies
Nr = dict(collections.Counter(list(self.R.values())))
Nr = np.array([list(Nr.keys()), list(Nr.values())])
idx = np.argsort(Nr[0,:])
self.Nr = Nr[:,idx]
# Apply smoothing to get Zr
Zr = np.zeros(self.Nr.shape[1], dtype=float)
q = np.concatenate(([0], self.Nr[0,:-2]))
t = self.Nr[0,1:]
Zr[:-1] = self.Nr[1,:-1] / (0.5 * (t - q))
if len(Zr) > 1:
Zr[-1] = self.Nr[1,-1] / (self.Nr[0,-1] - self.Nr[0,-2])
else:
# Single unique frequency level: no averaging possible, use count directly
Zr[-1] = self.Nr[1,-1]
self.Zr = Zr
# Apply linear regression only when there are enough valid (finite) data points
log_r = np.log(self.Nr[0,:])
log_zr = np.log(self.Zr)
valid = np.isfinite(log_r) & np.isfinite(log_zr)
if valid.sum() >= 2:
res = scipy.stats.linregress(log_r[valid], log_zr[valid])
self.slope = res.slope
self.intercept = res.intercept
else:
# Insufficient data for regression: fall back to no discounting (d = 1).
# With slope=-1 and intercept=0: get_S(r) = 1/r, so
# expected_count = (k+1)*S(k+1)/S(k) = (k+1)*(1/(k+1))/(1/k) = k = actual_count.
self.slope = -1.0
self.intercept = 0.0
[docs]
def get_S(self, r):
"""
Compute the smoothed frequency estimate, S
Args:
:r (int): The number of occurences a given species was previous observed
Returns:
:S (float): The smoothed/adjusted estimate for the number of objects which occur r times
"""
return np.exp(self.slope * np.log(r) + self.intercept)
[docs]
def actual_count(self, word):
"""
Return the number of times word appeared in the corpus
Args:
:word: The word whose frequency we wish to find
Returns:
:int: The number of times this word appeared in the corpus
"""
if word in self.R.keys():
return self.R[word]
else:
return 0
[docs]
def expected_count(self, word):
"""
Compute the predicted number of times a word should appear in a text equal to the length of the corpus
Args:
:word: The word whose expected frequency we wish to find
Returns:
float: The estimated freuqency of occurrence of this word in the corpus
"""
k = self.actual_count(word)
return (k+1) * self.get_S(k+1) / self.get_S(k)