"""Smoothing algorithms for language modeling.

According to Chen & Goodman 1995 these should work with both Backoff and
Interpolation.
"""
from operator import methodcaller

from nltk.lm.api import Smoothing
from nltk.probability import ConditionalFreqDist


def _count_values_gt_zero(distribution):
    """Count values that are greater than zero in a distribution.

    Assumes distribution is either a mapping with counts as values or
    an instance of `nltk.ConditionalFreqDist`.
    """
    as_count = (
        methodcaller("N") if isinstance(distribution, ConditionalFreqDist) else lambda count: count
    )
    # Explicitly check that values are > 0 to guard against negative counts.
    return sum(1 for dist_or_count in distribution.values() if as_count(dist_or_count) > 0)


class WittenBell(Smoothing):
    """Witten-Bell smoothing."""

    def __init__(self, vocabulary, counter, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)

    def alpha_gamma(self, word, context):
        alpha = self.counts[context].freq(word)
        gamma = self._gamma(context)
        return (1.0 - gamma) * alpha, gamma

    def _gamma(self, context):
        # Number of distinct word types observed after this context.
        n_plus = _count_values_gt_zero(self.counts[context])
        return n_plus / (n_plus + self.counts[context].N())

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)


class AbsoluteDiscounting(Smoothing):
    """Smoothing with absolute discount."""

    def __init__(self, vocabulary, counter, discount=0.75, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount

    def alpha_gamma(self, word, context):
        alpha = max(self.counts[context][word] - self.discount, 0) / self.counts[context].N()
        gamma = self._gamma(context)
        return alpha, gamma

    def _gamma(self, context):
        n_plus = _count_values_gt_zero(self.counts[context])
        return (self.discount * n_plus) / self.counts[context].N()

    def unigram_score(self, word):
        return self.counts.unigrams.freq(word)


class KneserNey(Smoothing):
    """Kneser-Ney Smoothing.

    This is an extension of smoothing with a discount.

    Resources:
    - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf
    - https://www.youtube.com/watch?v=ody1ysUTD7o
    - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8
    - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf
    - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf
    """

    def __init__(self, vocabulary, counter, order, discount=0.1, **kwargs):
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount
        self._order = order

    def unigram_score(self, word):
        word_continuation_count, total_count = self._continuation_counts(word)
        return word_continuation_count / total_count

    def alpha_gamma(self, word, context):
        prefix_counts = self.counts[context]
        # Highest order uses raw counts; lower orders use continuation counts.
        word_continuation_count, total_count = (
            (prefix_counts[word], prefix_counts.N())
            if len(context) + 1 == self._order
            else self._continuation_counts(word, context)
        )
        alpha = max(word_continuation_count - self.discount, 0.0) / total_count
        gamma = self.discount * _count_values_gt_zero(prefix_counts) / total_count
        return alpha, gamma

    def _continuation_counts(self, word, context=tuple()):
        """Count continuations that end with context and word.

        Continuations track unique ngram "types", regardless of how many
        instances were observed for each "type".
        This is different than raw ngram counts which track number of instances.
        """
        higher_order_ngrams_with_context = (
            counts for prefix_ngram, counts in self.counts[len(context) + 2].items()
            if prefix_ngram[1:] == context
        )
        higher_order_ngrams_with_word_count, total = 0, 0
        for counts in higher_order_ngrams_with_context:
            higher_order_ngrams_with_word_count += int(counts[word] > 0)
            total += counts.N()
        return higher_order_ngrams_with_word_count, total