Source code for katz.good_turing

import collections
import numpy as np
import scipy.stats


[docs]
class GoodTuring:
    """
    Good-Turing frequency estimation over a corpus of hashable items.

    The frequency-of-frequency counts are averaged over the gaps between
    observed frequencies, a straight line is fitted to them in log-log space,
    and that fit is used to turn observed counts into adjusted counts.

    Attributes:
        :corpus: The corpus exactly as it was passed to the constructor
        :R (dict): Maps each distinct word to its number of occurrences
        :Nr (numpy.ndarray): 2 x m array; row 0 holds the distinct frequencies r in
            increasing order, row 1 the number of distinct words seen r times
        :Zr (numpy.ndarray): Length-m array with the averaged counts for each r
        :slope (float): Slope of the fit of log(Zr) against log(r)
        :intercept (float): Intercept of the fit of log(Zr) against log(r)
    """

    def __init__(self, corpus):
        """
        Class to perform Good-Turing frequency estimation

        Args:
            :corpus (list): List of objects for which we want to produce Good-Turing frequency estimates.
                Each entry is considered a word. An empty corpus is accepted; every count is then 0.

        Returns:
            GoodTuring: Object to perform Good-Turing frequency estimates
        """
        # Store the data
        self.corpus = corpus

        # Dict for frequency of each element
        self.R = dict(collections.Counter(corpus))

        # Frequency of frequencies as (r, N_r) pairs ordered by increasing r.
        # Building the two rows explicitly keeps the (2, 0) shape for an empty corpus.
        levels = sorted(collections.Counter(self.R.values()).items())
        self.Nr = np.array([[r for r, _ in levels], [n for _, n in levels]])

        # Apply smoothing to get Zr
        self.Zr = self._average_counts(self.Nr)

        # Fit log(Zr) against log(r)
        self.slope, self.intercept = self._fit_log_log(self.Nr[0, :], self.Zr)

    @staticmethod
    def _average_counts(Nr):
        """Spread each N_r over the gap between its neighbouring observed frequencies."""
        r = Nr[0, :]
        n = Nr[1, :]
        Zr = np.zeros(r.size, dtype=float)
        if r.size > 1:
            # For every level but the last: q is the previous observed frequency
            # (0 before the first one) and t is the next observed frequency.
            q = np.concatenate(([0], r[:-2]))
            t = r[1:]
            Zr[:-1] = n[:-1] / (0.5 * (t - q))
            # The last level has no successor, so the gap to its predecessor is used.
            Zr[-1] = n[-1] / (r[-1] - r[-2])
        elif r.size == 1:
            # Single unique frequency level: no averaging possible, use count directly
            Zr[0] = n[0]
        # r.size == 0 (empty corpus): nothing to average, Zr stays empty.
        return Zr

    @staticmethod
    def _fit_log_log(r, Zr):
        """Return (slope, intercept) of the regression of log(Zr) on log(r)."""
        log_r = np.log(r)
        log_zr = np.log(Zr)
        valid = np.isfinite(log_r) & np.isfinite(log_zr)
        if valid.sum() >= 2:
            res = scipy.stats.linregress(log_r[valid], log_zr[valid])
            return res.slope, res.intercept
        # Insufficient data for regression: fall back to no discounting (d = 1).
        # With slope=-1 and intercept=0: get_S(r) = 1/r, so
        # expected_count = (k+1)*S(k+1)/S(k) = (k+1)*(1/(k+1))/(1/k) = k = actual_count.
        return -1.0, 0.0

    def get_S(self, r):
        """
        Compute the smoothed frequency estimate, S

        Args:
            :r (int): The number of occurrences a given species was previously observed

        Returns:
            :S (float): The smoothed/adjusted estimate for the number of objects which occur r times,
                i.e. exp(slope * log(r) + intercept)
        """
        return np.exp(self.slope * np.log(r) + self.intercept)

    def actual_count(self, word):
        """
        Return the number of times word appeared in the corpus

        Args:
            :word: The word whose frequency we wish to find

        Returns:
            :int: The number of times this word appeared in the corpus (0 if it never appeared)
        """
        return self.R.get(word, 0)

    def expected_count(self, word):
        """
        Compute the predicted number of times a word should appear in a text equal to the length of the corpus

        Args:
            :word: The word whose expected frequency we wish to find

        Returns:
            float: The estimated frequency of occurrence of this word in the corpus,
                computed as (k+1) * S(k+1) / S(k) where k is the actual count.
                For a word that never occurred (k = 0) the ratio involves S(0);
                with a negative slope S(0) is infinite and the result is 0.0.
        """
        k = self.actual_count(word)
        # k == 0 evaluates log(0) inside get_S; that is expected here, so the
        # divide-by-zero warning is silenced without altering the returned value.
        with np.errstate(divide="ignore"):
            return (k + 1) * self.get_S(k + 1) / self.get_S(k)