
r"""
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> s.split() # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.

"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = " "


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
    """

    _string = None

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        yield from enumerate(range(1, len(s) + 1))


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']
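        >>> # 'discard-eof' drops only a blank final line, e.g. one
        >>> # produced by a trailing newline:
        >>> LineTokenizer(blanklines='discard-eof').tokenize('a\nb\n\n')
        ['a', 'b']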

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
           A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
           a corresponding token ``''`` after that newline.
    """

    def __init__(self, blanklines="discard"):
        valid_blanklines = ("discard", "keep", "discard-eof")
        if blanklines not in valid_blanklines:
            raise ValueError(
                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
            )
        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    def span_tokenize(self, s):
        if self._blanklines == "keep":
            yield from string_span_tokenize(s, r"\n")
        else:
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


def line_tokenize(text, blanklines="discard"):
    return LineTokenizer(blanklines).tokenize(text)
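

# A short usage sketch of the module above: everything exercised here is
# defined in this file, and the printed values follow the behaviour shown
# in the class docstrings.
if __name__ == "__main__":
    s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

    # Equivalent to s.split(' '); note the empty token produced by the
    # double space after "York." and the newlines kept inside tokens.
    print(SpaceTokenizer().tokenize(s))

    # span_tokenize() yields (start, end) character offsets rather than
    # the token substrings themselves.
    print(list(LineTokenizer(blanklines="discard").span_tokenize(s)))

    # line_tokenize() is a thin convenience wrapper around LineTokenizer.
    print(line_tokenize(s))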