import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(more)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)(?=\s)",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
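

# Illustrative sketch (a hypothetical helper, not part of the NLTK API): each
# pattern above captures the two halves of a contraction, and the inline
# ``(?#X)`` is a no-op regex comment that merely marks the split point
# inherited from the original sed scripts. Substituting r" \1 \2 " around the
# captures is what splits a contraction into two tokens, as ``tokenize`` does
# below.
def _demo_split_contractions(text: str) -> str:
    """Apply the two-way contraction splits to ``text``.

    >>> _demo_split_contractions("I cannot go, gimme a minute")
    'I  can not  go,  gim me  a minute'
    """
    for pattern in MacIntyreContractions.CONTRACTIONS2:
        text = re.sub(pattern, r" \1 \2 ", text)
    return text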
   e Zd ZdZ ej        dej                  df ej        d          df ej        d          df ej        d          df ej        d	ej                  d
fgZ ej        dej                  df ej        d          df ej        d          df ej        d          df ej        d          df ej        d          dfgZ ej        dej                  df ej        d          df ej        d          df ej        dej                  df ej        d          df ej        d          df ej        d          df ej        d          d f ej        d!ej                  dfg	Z	 ej        d"          dfZ
 ej        d#          d$f ej        d%          d&f ej        d'          d(f ej        d)          d*f ej        d+          d,f ej        d-          d.fgZ ej        d/          d0fZ e            Z e eej        ej                            Z e eej        ej                            Z	 d9d2ed3ed4ed5ee         fd6Zd2ed5eeeef                  fd7Zd8S ):NLTKWordTokenizeraE  
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" such that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
    revert to the original string.
    u   ([«“‘„]|[`]+)z \1 z^\"``z(``)z([ \(\[{<])(\"|\'{2})z\1 `` z$(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\bz\1 \2u   ([»”’])''z '' "z\s+ z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) u&   ([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$z	\1 \2 \3 z([:,])([^\d])z \1 \2z([:,])$z\.{2,}z \g<0> z[;@#$%&]z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[*]z[\]\[\(\)\{\}\<\>]z\(z-LRB-z\)z-RRB-z\[z-LSB-z\]z-RSB-z\{z-LCB-z\}z-RCB-z--z -- Ftextconvert_parentheses

    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to make things easier.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        return text.split()
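
    # Note the destructive step above: by the time ``span_tokenize`` (below)
    # runs, straight quotes have already been rewritten to PTB-style quotes,
    # e.g. (assumed behavior, consistent with the rules above):
    #     NLTKWordTokenizer().tokenize('"hi"')  ->  ['``', 'hi', "''"]
    # which is why ``span_tokenize`` maps those tokens back to the original
    # characters before aligning offsets.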

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to the original double quotes, but only
        # if the original text contains double quotes or double single-quotes
        # (because '' might otherwise be transformed to `` when treated as
        # starting quotes).
        if ('"' in text) or ("''" in text):
            # Find the double quotes and converted quotes.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace the converted quotes back with the original double quotes.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)