
"""
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
"""

import re
import sys
import unicodedata
from collections import Counter, defaultdict, namedtuple
from functools import reduce
from math import log

from nltk.collocations import BigramCollocationFinder
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.metrics import BigramAssocMeasures, f_measure
from nltk.probability import ConditionalFreqDist as CFD
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.util import LazyConcatenation, cut_string, tokenwrap

ConcordanceLine = namedtuple(
    "ConcordanceLine",
    ["left", "query", "right", "offset", "left_print", "right_print", "line"],
)


class ContextIndex:
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = tokens[i - 1].lower() if i != 0 else "*START*"
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*"
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += (
                        self._context_to_words[c][word] * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd
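
# A minimal, self-contained sketch of how ContextIndex can be used on its own.
# The helper function and its tiny token list are illustrative only and are not
# part of the NLTK API; real callers usually pass a full tokenized corpus,
# typically via the Text class defined further below.
def _context_index_example():
    tokens = "the cat sat on the mat the dog sat on the rug".split()
    index = ContextIndex(tokens, key=lambda s: s.lower())
    # 'dog' shares the ('the', 'sat') context with 'cat'.
    print(index.similar_words("cat", n=5))
    # Frequency distribution of the contexts shared by 'cat' and 'dog'.
    print(index.common_contexts(["cat", "dog"]).most_common())
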
class ConcordanceIndex:
    """
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    """

    def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        """
        return self._tokens

    def offsets(self, word):
        """
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then given word's key will be looked up.
        """
        word = self._key(word)
        return self._offsets[word]

    def __repr__(self):
        return "<ConcordanceIndex for %d tokens (%d types)>" % (
            len(self._tokens),
            len(self._offsets),
        )

    def find_concordance(self, word, width=80):
        """
        Find all concordance lines given the query word.

        Provided with a list of words, these will be found as a phrase.
        """
        if isinstance(word, list):
            phrase = word
        else:
            phrase = [word]

        phrase_str = " ".join(phrase)
        phrase_len = sum(1 for char in phrase_str if not unicodedata.combining(char))
        half_width = (width - phrase_len - 2) // 2
        context = width // 4  # approx number of words of context

        # Find the instances of the word to create the ConcordanceLine
        concordance_list = []
        offsets = self.offsets(phrase[0])
        for i, word in enumerate(phrase[1:]):
            word_offsets = {offset - i - 1 for offset in self.offsets(word)}
            offsets = sorted(word_offsets.intersection(offsets))
        if offsets:
            for i in offsets:
                query_word = " ".join(self._tokens[i : i + len(phrase)])
                # Find the context of the query word.
                left_context = self._tokens[max(0, i - context) : i]
                right_context = self._tokens[i + len(phrase) : i + context]
                # Create the pretty lines with the query_word in the middle.
                left_print = cut_string(" ".join(left_context), -half_width).rjust(
                    half_width
                )
                right_print = cut_string(" ".join(right_context), half_width)
                # The WYSIWYG line of the concordance.
                line_print = " ".join([left_print, query_word, right_print])
                # Create the ConcordanceLine
                concordance_line = ConcordanceLine(
                    left_context,
                    query_word,
                    right_context,
                    i,
                    left_print,
                    right_print,
                    line_print,
                )
                concordance_list.append(concordance_line)
        return concordance_list

    def print_concordance(self, word, width=80, lines=25):
        """
        Print concordance lines given the query word.
        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param save: The option to save the concordance.
        :type save: bool
        """
        concordance_list = self.find_concordance(word, width=width)

        if not concordance_list:
            print("no matches")
        else:
            lines = min(lines, len(concordance_list))
            print(f"Displaying {lines} of {len(concordance_list)} matches:")
            for i, concordance_line in enumerate(concordance_list[:lines]):
                print(concordance_line.line)
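
# A minimal, self-contained sketch of ConcordanceIndex used directly.  The
# helper and its sample sentence are illustrative only (not part of the NLTK
# API); with a lower-casing key the lookup is case-insensitive.
def _concordance_index_example():
    tokens = "The quick brown fox jumps over the lazy dog".split()
    index = ConcordanceIndex(tokens, key=lambda s: s.lower())
    print(index.offsets("the"))  # [0, 6]
    index.print_concordance("the", width=40, lines=5)
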
class TokenSearcher:
    """
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    """

    def __init__(self, tokens):
        self._raw = "".join("<" + w + ">" for w in tokens)

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        # preprocess the regular expression
        regexp = re.sub(r"\s", "", regexp)
        regexp = re.sub(r"<", "(?:<(?:", regexp)
        regexp = re.sub(r">", ")>)", regexp)
        regexp = re.sub(r"(?<!\\)\.", "[^>]", regexp)

        # perform the search
        hits = re.findall(regexp, self._raw)

        # sanity check
        for h in hits:
            if not h.startswith("<") and h.endswith(">"):
                raise ValueError("Bad regexp for TokenSearcher.findall")

        # postprocess the output
        hits = [h[1:-1].split("><") for h in hits]
        return hits


class Text:
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    """

    # Copying the tokens defeats lazy corpus loading, but makes the
    # interactive methods run much faster.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif "]" in tokens[:20]:
            end = tokens[:20].index("]")
            self.name = " ".join(str(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(str(tok) for tok in tokens[:8]) + "..."

    # Support item & slice access

    def __getitem__(self, i):
        return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    # Interactive console methods

    def concordance(self, word, width=79, lines=25):
        """
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if "_concordance_index" not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )

        return self._concordance_index.print_concordance(word, width, lines)

    def concordance_list(self, word, width=79, lines=25):
        """
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word or phrase (a list of strings)
        :type word: str or list
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if "_concordance_index" not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )
        return self._concordance_index.find_concordance(word, width)[:lines]

    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocation_list()[:2]
            [('United', 'States'), ('fellow', 'citizens')]

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :rtype: list(tuple(str, str))
        """
        if not (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            ignored_words = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda w: len(w) < 3 or w.lower() in ignored_words
            )
            bigram_measures = BigramAssocMeasures()
            self._collocations = list(
                finder.nbest(bigram_measures.likelihood_ratio, num)
            )
        return self._collocations

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

            >>> from nltk.book import text4
            >>> text4.collocations() # doctest: +NORMALIZE_WHITESPACE
            United States; fellow citizens; years ago; four years; Federal
            Government; General Government; American people; Vice President; God
            bless; Chief Justice; one another; fellow Americans; Old World;
            Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian
            tribes; public debt; foreign nations


        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        collocation_strings = [
            w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
        ]
        print(tokenwrap(collocation_strings, separator="; "))

    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        raise NotImplementedError

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if "_word_context_index" not in self.__dict__:
            self._word_context_index = ContextIndex(
                self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
            )

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(
                w
                for w in wci.conditions()
                for c in wci[w]
                if c in contexts and not w == word
            )
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")

    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if "_word_context_index" not in self.__dict__:
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)

    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot

        dispersion_plot(self, words)

    def _train_default_ngram_lm(self, tokenized_sents, n=3):
        train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents)
        model = MLE(order=n)
        model.fit(train_data, padded_sents)
        return model

    def generate(self, length=100, text_seed=None, random_seed=42):
        """
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int

        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)

        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int
        """
        # Create the model when using it the first time.
        self._tokenized_sents = [
            sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
        ]
        if not hasattr(self, "_trigram_model"):
            print("Building ngram index...", file=sys.stderr)
            self._trigram_model = self._train_default_ngram_lm(
                self._tokenized_sents, n=3
            )

        generated_tokens = []

        assert length > 0, "The `length` must be more than 0."
        while len(generated_tokens) < length:
            for idx, token in enumerate(
                self._trigram_model.generate(
                    length, text_seed=text_seed, random_seed=random_seed
                )
            ):
                if token == "<s>":
                    continue
                if token == "</s>":
                    break
                generated_tokens.append(token)
            random_seed += 1

        prefix = " ".join(text_seed) + " " if text_seed else ""
        output_str = prefix + tokenwrap(generated_tokens[:length])
        print(output_str)
        return output_str

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        """
        return self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.prob.FreqDist
        """
        if "_vocab" not in self.__dict__:
            self._vocab = FreqDist(self)
        return self._vocab

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.book import text1, text5, text9
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [" ".join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    # Helper methods

    _CONTEXT_RE = re.compile(r"\w+|[\.\!\?]")

    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = tokens[j] if j != 0 else "*START*"

        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = tokens[j] if j != len(tokens) else "*END*"

        return (left, right)

    def __str__(self):
        return "<Text: %s>" % self.name

    def __repr__(self):
        return "<Text: %s>" % self.name


class TextCollection(Text):
    """A collection of texts, which can be loaded with list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> from nltk.book import text1, text2, text3
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    """

    def __init__(self, source):
        if hasattr(source, "words"):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}

    def tf(self, term, text):
        """The frequency of the term in text."""
        return text.count(term) / len(text)

    def idf(self, term):
        """The number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned."""
        # idf values are cached for performance.
        idf = self._idf_cache.get(term)
        if idf is None:
            matches = len([True for text in self._texts if term in text])
            if len(self._texts) == 0:
                raise ValueError("IDF undefined for empty document collection")
            idf = log(len(self._texts) / matches) if matches else 0.0
            self._idf_cache[term] = idf
        return idf

    def tf_idf(self, term, text):
        return self.tf(term, text) * self.idf(term)


def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories="news"))
    print(text)
    print()
    print("Concordance:")
    text.concordance("news")
    print()
    print("Distributionally similar words:")
    text.similar("news")
    print()
    print("Collocations:")
    text.collocations()
    print()
    print("Dispersion plot:")
    text.dispersion_plot(["news", "report", "said", "announced"])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()["news"])


if __name__ == "__main__":
    demo()

__all__ = [
    "ContextIndex",
    "ConcordanceIndex",
    "TokenSearcher",
    "Text",
    "TextCollection",
]