
    Ng                         d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ed             Z
e
                    e          d             Ze
                    e          d	             Z G d
 d          ZdS )zLanguage Model Vocabulary    N)Counter)Iterable)singledispatch)chainc                 @    t          dt          |                      )Nz/Unsupported type for looking up in vocabulary: )	TypeErrortypewordsvocabs     N/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/lm/vocabulary.py_dispatched_lookupr      s    
Sd5kkSS
T
TT    c                 :    t          fd| D                       S )zcLook up a sequence of words in the vocabulary.

    Returns an iterator over looked up words.

    c              3   8   K   | ]}t          |          V  d S Nr   ).0wr   s     r   	<genexpr>z_.<locals>.<genexpr>   s.      ==!#Au--======r   )tupler
   s    `r   _r      s(     ====u======r   c                     | |v r| n|j         S )z$Looks up one word in the vocabulary.)	unk_label)wordr   s     r   _string_lookupr      s     5==44eo5r   c                   `    e Zd ZdZddZed             Zd Zd Zd	 Z	d
 Z
d Zd Zd Zd ZdS )
Vocabularya
  Stores language model vocabulary.

    Satisfies two common language modeling requirements for a vocabulary:

    - When checking membership and calculating its size, filters items
      by comparing their counts to a cutoff value.
    - Adds a special "unknown" token which unseen words are mapped to.

    >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
    >>> from nltk.lm import Vocabulary
    >>> vocab = Vocabulary(words, unk_cutoff=2)

    Tokens with counts greater than or equal to the cutoff value will
    be considered part of the vocabulary.

    >>> vocab['c']
    3
    >>> 'c' in vocab
    True
    >>> vocab['d']
    2
    >>> 'd' in vocab
    True

    Tokens with frequency counts less than the cutoff value will be considered not
    part of the vocabulary even though their entries in the count dictionary are
    preserved.

    >>> vocab['b']
    1
    >>> 'b' in vocab
    False
    >>> vocab['aliens']
    0
    >>> 'aliens' in vocab
    False

    Keeping the count entries for seen words allows us to change the cutoff value
    without having to recalculate the counts.

    >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
    >>> "b" in vocab2
    True

    The cutoff value influences not only membership checking but also the result of
    getting the size of the vocabulary using the built-in `len`.
    Note that while the number of keys in the vocabulary's counter stays the same,
    the items in the vocabulary differ depending on the cutoff.
    We use `sorted` to demonstrate because it keeps the order consistent.

    >>> sorted(vocab2.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab2)
    ['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab.counts)
    ['-', 'a', 'b', 'c', 'd', 'r']
    >>> sorted(vocab)
    ['<UNK>', 'a', 'c', 'd']

    In addition to items it gets populated with, the vocabulary stores a special
    token that stands in for so-called "unknown" items. By default it's "<UNK>".

    >>> "<UNK>" in vocab
    True

    We can look up words in a vocabulary using its `lookup` method.
    "Unseen" words (with counts less than cutoff) are looked up as the unknown label.
    If given one word (a string) as an input, this method will return a string.

    >>> vocab.lookup("a")
    'a'
    >>> vocab.lookup("aliens")
    '<UNK>'

    If given a sequence, it will return an tuple of the looked up words.

    >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
    ('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

    It's possible to update the counts after the vocabulary has been created.
    In general, the interface is the same as that of `collections.Counter`.

    >>> vocab['b']
    1
    >>> vocab.update(["b", "b", "c"])
    >>> vocab['b']
    3
    N   <UNK>c                     || _         |dk     rt          d|           || _        t                      | _        |                     ||nd           dS )a  Create a new Vocabulary.

        :param counts: Optional iterable or `collections.Counter` instance to
                       pre-seed the Vocabulary. In case it is iterable, counts
                       are calculated.
        :param int unk_cutoff: Words that occur less frequently than this value
                               are not considered part of the vocabulary.
        :param unk_label: Label for marking words not part of vocabulary.

        r   z)Cutoff value cannot be less than 1. Got: N )r   
ValueError_cutoffr   countsupdate)selfr%   
unk_cutoffr   s       r   __init__zVocabulary.__init__   s^     #>>UUUVVV!iif0FFb99999r   c                     | j         S )ziCutoff value.

        Items with count below this value are not considered part of vocabulary.

        )r$   r'   s    r   cutoffzVocabulary.cutoff   s     |r   c                 f     | j         j        |i | t          d | D                       | _        dS )zWUpdate vocabulary counts.

        Wraps `collections.Counter.update` method.

        c              3      K   | ]}d V  dS )r   N )r   r   s     r   r   z$Vocabulary.update.<locals>.<genexpr>   s"      ((a((((((r   N)r%   r&   sum_len)r'   counter_argscounter_kwargss      r   r&   zVocabulary.update   s@     	L;N;;;((4(((((			r   c                 "    t          ||           S )a  Look up one or more words in the vocabulary.

        If passed one word as a string will return that word or `self.unk_label`.
        Otherwise will assume it was passed a sequence of words, will try to look
        each of them up and return an iterator over the looked up words.

        :param words: Word(s) to look up.
        :type words: Iterable(str) or str
        :rtype: generator(str) or str
        :raises: TypeError for types other than strings or iterables

        >>> from nltk.lm import Vocabulary
        >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
        >>> vocab.lookup("a")
        'a'
        >>> vocab.lookup("aliens")
        '<UNK>'
        >>> vocab.lookup(["a", "b", "c", ["x", "b"]])
        ('a', 'b', '<UNK>', ('<UNK>', 'b'))

        r   )r'   r   s     r   lookupzVocabulary.lookup   s    , "%...r   c                 @    || j         k    r| j        n| j        |         S r   )r   r$   r%   r'   items     r   __getitem__zVocabulary.__getitem__   s!    #t~55t||4;t;LLr   c                 $    | |         | j         k    S )zPOnly consider items with counts GE to cutoff as being in the
        vocabulary.)r,   r7   s     r   __contains__zVocabulary.__contains__   s     DzT[((r   c                 d     t           fd j        D              j        r j        gng           S )zKBuilding on membership check define how to iterate over
        vocabulary.c              3   $   K   | ]
}|v |V  d S r   r/   )r   r8   r'   s     r   r   z&Vocabulary.__iter__.<locals>.<genexpr>   s'      ::dTT\\T\\\\::r   )r   r%   r   r+   s   `r   __iter__zVocabulary.__iter__   sD     ::::dk::: $3T^
 
 	
r   c                     | j         S )z1Computing size of vocabulary reflects the cutoff.)r1   r+   s    r   __len__zVocabulary.__len__   s
    yr   c                 b    | j         |j         k    o| j        |j        k    o| j        |j        k    S r   )r   r,   r%   )r'   others     r   __eq__zVocabulary.__eq__   s5    Neo- ,u|+,u|+	
r   c                 t    d                     | j        j        | j        | j        t          |                     S )Nz/<{} with cutoff={} unk_label='{}' and {} items>)format	__class____name__r,   r   lenr+   s    r   __str__zVocabulary.__str__   s2    @GGN#T[$.#d))
 
 	
r   )Nr   r    )rG   
__module____qualname____doc__r)   propertyr,   r&   r5   r9   r;   r>   r@   rC   rI   r/   r   r   r   r   %   s        W Wr: : : :&   X) ) )/ / /0M M M) ) )

 
 
  
 
 

 
 
 
 
r   r   )rL   syscollectionsr   collections.abcr   	functoolsr   	itertoolsr   r   registerr   strr   r   r/   r   r   <module>rU      s      



       $ $ $ $ $ $ $ $ $ $ $ $       U U U X&&> > '&> S!!6 6 "!6
u
 u
 u
 u
 u
 u
 u
 u
 u
 u
r   