
r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings.  For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).
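
For example, assuming the bytes hold UTF-8 encoded text:

    >>> word_tokenize(b"caf\xc3\xa9 muffins".decode("utf8"))
    ['café', 'muffins']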

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers.  (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
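
Since the spans have string-slice semantics, the original substrings can be
recovered directly from them:

    >>> [s[start:end] for (start, end) in WhitespaceTokenizer().span_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy',
    'me', 'two', 'of', 'them.', 'Thanks.']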

There are numerous ways to tokenize text.  If you need more control over
tokenization, see the other methods provided in this package.
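
For instance, a :class:`.RegexpTokenizer` built with a custom pattern (the
pattern below is only illustrative) keeps currency amounts together:

    >>> from nltk.tokenize import RegexpTokenizer
    >>> RegexpTokenizer(r"\w+|\$[\d\.]+|\S+").tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']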

For further information, please see Chapter 3 of the NLTK book.
"""

import functools
import re

from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTokenizer
from nltk.tokenize.regexp import (
    BlanklineTokenizer,
    RegexpTokenizer,
    WhitespaceTokenizer,
    WordPunctTokenizer,
    blankline_tokenize,
    regexp_tokenize,
    wordpunct_tokenize,
)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (
    LineTokenizer,
    SpaceTokenizer,
    TabTokenizer,
    line_tokenize,
)
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


@functools.lru_cache
def _get_punkt_tokenizer(language="english"):
    """
    A constructor for the PunktTokenizer that utilizes
    a lru cache for performance.

    :param language: the model name in the Punkt corpus
    :type language: str
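
    Repeated calls with the same language return the cached tokenizer
    instance:

        >>> _get_punkt_tokenizer("english") is _get_punkt_tokenizer("english")
        True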
    """
    return PunktTokenizer(language)


# Standard sentence tokenizer.
def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
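
    For example (using the default English Punkt model):

        >>> sent_tokenize("Hello there. How are you?")
        ['Hello there.', 'How are you?']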
    """
    tokenizer = _get_punkt_tokenizer(language)
    return tokenizer.tokenize(text)


# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token
        for sent in sentences
        for token in _treebank_word_tokenizer.tokenize(sent)
    ]