
    Ng?                         d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
 ddlmZ  G d de          Z G d	 d
e          ZdS )a	  

Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
    N)IteratorListTuple)
TokenizerI)MacIntyreContractions)align_tokensc            
          e Zd ZdZ ej        d          df ej        d          df ej        d          dfgZ ej        d          d	f ej        d
          df ej        d          df ej        d          df ej        d          df ej        d          df ej        d          dfgZ ej        d          dfZ ej        d          df ej        d          df ej        d          df ej        d          df ej        d          df ej        d          d fgZ	 ej        d!          d"fZ
 ej        d#          d$f ej        d%          d$f ej        d&          d'f ej        d(          d'fgZ e            Z e eej        ej                            Z e eej        ej                            Z	 d1d*ed+ed,ed-ee         fd.Zd*ed-eeeef                  fd/Zd0S )2TreebankWordTokenizera	  
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
    >>> TreebankWordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
    >>> s = "They'll save and invest more."
    >>> TreebankWordTokenizer().tokenize(s)
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    >>> s = "hi, my name can't hello,"
    >>> TreebankWordTokenizer().tokenize(s)
    ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    z^\"``z(``)z \1 z([ \(\[{<])(\"|\'{2})z\1 `` z([:,])([^\d])z \1 \2z([:,])$z\.\.\.z ... z[;@#$%&]z \g<0> z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[\]\[\(\)\{\}\<\>]z\(-LRB-z\)-RRB-z\[-LSB-z\]-RSB-z\{-LCB-z\}-RCB--- -- ''z '' "z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) Ftextconvert_parentheses
return_strreturnc                    |durt          j        dt          d           | j        D ]\  }}|                    ||          }| j        D ]\  }}|                    ||          }| j        \  }}|                    ||          }|r#| j        D ]\  }}|                    ||          }| j        \  }}|                    ||          }d|z   dz   }| j	        D ]\  }}|                    ||          }| j
        D ]}|                    d|          }| j        D ]}|                    d|          }|                                S )a  Return a tokenized copy of `text`.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        FzHParameter 'return_str' has been deprecated and should no longer be used.   )category
stacklevel z \1 \2 )warningswarnDeprecationWarningSTARTING_QUOTESsubPUNCTUATIONPARENS_BRACKETSCONVERT_PARENTHESESDOUBLE_DASHESENDING_QUOTESCONTRACTIONS2CONTRACTIONS3split)selfr   r   r   regexpsubstitutions         R/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/tokenize/treebank.pytokenizezTreebankWordTokenizer.tokenizee   s   6 U""M"+	    %)$8 	2 	2 FL::lD11DD$($4 	2 	2 FL::lD11DD  $3zz,-- 	6(,(@ 6 6$zz,55  $1zz,-- TzC$($6 	2 	2 FL::lD11DD( 	0 	0F::j$//DD( 	0 	0F::j$//DD zz||    c              #      K   |                      |          }d|v sd|v r.d t          j        d|          D             fd|D             }n|}t          ||          E d{V  dS )a  
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        r   r   c                 6    g | ]}|                                 S  )group).0ms     r/   
<listcomp>z7TreebankWordTokenizer.span_tokenize.<locals>.<listcomp>   s     KKKQqwwyyKKKr1   z
``|'{2}|\"c                 F    g | ]}|d v r                     d          n|S ))r   r   r   r   )pop)r6   tokmatcheds     r/   r8   z7TreebankWordTokenizer.span_tokenize.<locals>.<listcomp>   sB        #&):":":A  r1   N)r0   refinditerr   )r,   r   
raw_tokenstokensr<   s       @r/   span_tokenizez#TreebankWordTokenizer.span_tokenize   s      . ]]4((
 4KKTT\\KK"+mT*J*JKKKG   %  FF
  F-----------r1   N)FF)__name__
__module____qualname____doc__r=   compiler"   r$   r%   r&   r'   r(   r   _contractionslistmapr)   r*   strboolr   r0   r   r   intrA   r4   r1   r/   r
   r
      s        0 
F		U#	G		g&	,	-	-y9O 
$	%	%y1	J		)	I		)	K	 	 *-BJ788	
 
G		j)	K	 	 (+K "rz"788*EO 
E		G$	E		G$	E		G$	E		G$	E		G$	E		G$  RZ&&0M 
E		F#	D		6"	4	5	5yA	@	A	A9M	M *)++MDRZ)DEEFFMDRZ)DEEFFM PUE EE.2EHLE	cE E E EN).# ).(5c?*C ). ). ). ). ). ).r1   r
   c            	          e Zd ZdZ e            Zd ej        D             Zd ej        D             Z ej	        d          df ej	        d          df ej	        d          df ej	        d	          df ej	        d
          dfgZ
 ej	        d          dfZ ej	        d          df ej	        d          df ej	        d          df ej	        d          df ej	        d          df ej	        d          dfgZ ej	        d          df ej	        d          df ej	        d          dfgZ ej	        d          df ej	        d           df ej	        d!          d"f ej	        d#          df ej	        d$          df ej	        d%          d&f ej	        d'          d(fgZ ej	        d)          d*f ej	        d+          d(f ej	        d,          dfgZd4d.ee         d/ed0efd1Zd4d.ee         d/ed0efd2Zd3S )5TreebankWordDetokenizera  
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:

    - There're additional assumption mades when undoing the padding of ``[;@#$%&]``
      punctuation symbols that isn't presupposed in the TreebankTokenizer.
    - There're additional regexes added in reversing the parentheses tokenization,
       such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
       padding added to the closing parentheses precedding ``[:;,.]``.
    - It's not possible to return the original whitespaces as they were because
      there wasn't explicit records of where `'\n'`, `'\t'` or `'\s'` were removed at
      the text.split() operation.

    >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
    >>> d = TreebankWordDetokenizer()
    >>> t = TreebankWordTokenizer()
    >>> toks = t.tokenize(s)
    >>> d.detokenize(toks)
    'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
    parameter:

    >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it's safe to add more spaces but during detokenization,
    simply undoing the padding doesn't really help.

    - During tokenization, left and right pad is added to ``[!?]``, when
      detokenizing, only left shift the ``[!?]`` is needed.
      Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.

    - During tokenization ``[:,]`` are left and right padded but when detokenizing,
      only left shift is necessary and we keep right pad after comma/colon
      if the string after is a non-digit.
      Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    c                 ^    g | ]*}t          j        |                    d d                    +S z(?#X)z\sr=   rF   replacer6   patterns     r/   r8   z"TreebankWordDetokenizer.<listcomp>  @        	
7??7E2233  r1   c                 ^    g | ]*}t          j        |                    d d                    +S rP   rQ   rS   s     r/   r8   z"TreebankWordDetokenizer.<listcomp>  rU   r1   z+([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) z\1\2 z([^' ])\s('[sS]|'[mM]|'[dD]|') z(\S)\s(\'\')\1\2z(\'\')\s([.,:)\]>};%])r   r   r   r   r   (r   )r   [r   ]r   {r   }z([\[\(\{\<])\sz\g<1>z\s([\]\)\}\>])z([\]\)\}\>])\s([:;,.])z([^'])\s'\sz\1' z\s([?!])z([^\.])\s(\.)([\]\)}>"\']*)\s*$z\1\2\3z([#$])\sz\s([;%])z
\s\.\.\.\sz...z\s([:,])z\1z([ (\[{<])\s``z\1``z(``)\sr   Fr@   r   r   c                    d                     |          }d|z   dz   }| j        D ]}|                    d|          }| j        D ]}|                    d|          }| j        D ]\  }}|                    ||          }|                                }| j        \  }}|                    ||          }|r#| j        D ]\  }}|                    ||          }| j        D ]\  }}|                    ||          }| j	        D ]\  }}|                    ||          }| j
        D ]\  }}|                    ||          }|                                S )a  
        Treebank detokenizer, created by undoing the regexes from
        the TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: List[str]
        :param convert_parentheses: if True, replace PTB symbols with parentheses,
            e.g. `-LRB-` to `(`. Defaults to False.
        :type convert_parentheses: bool, optional
        :return: str
        r   rW   )joinr*   r#   r)   r(   stripr'   r&   r%   r$   r"   )r,   r@   r   r   r-   r.   s         r/   r0   z TreebankWordDetokenizer.tokenize[  s    xx TzC ( 	- 	-F::gt,,DD( 	- 	-F::gt,,DD %)$6 	2 	2 FL::lD11DD zz||  $1zz,-- 	6(,(@ 6 6$zz,55 %)$8 	2 	2 FL::lD11DD %)$4 	2 	2 FL::lD11DD %)$8 	2 	2 FL::lD11DDzz||r1   c                 .    |                      ||          S )z&Duck-typing the abstract *tokenize()*.)r0   )r,   r@   r   s      r/   
detokenizez"TreebankWordDetokenizer.detokenize  s    }}V%8999r1   N)F)rB   rC   rD   rE   r   rG   r)   r*   r=   rF   r(   r'   r&   r%   r$   r"   r   rJ   rK   r0   rb   r4   r1   r/   rN   rN      s       : :x *)++M $2  M $2  M 
B	C	CXN	6	7	7B	O	$	$g.BJ011	
 
E		C 	M  RZ((%0M 
G		c"	G		c"	G		c"	G		c"	G		c"	G		c" 
%	&	&1	%	&	&1	-	.	.8O 
N	#	#W-	K	 	 (+	6	7	7C
 
K	 	 (+	K	 	 (+	M	"	"F+ BJ{##	
K, 
%	&	&0	I		&	E		D!O3 3tCy 3t 3PS 3 3 3 3j: :c : :RU : : : : : :r1   rN   )rE   r=   r   typingr   r   r   nltk.tokenize.apir   nltk.tokenize.destructiver   nltk.tokenize.utilr   r
   rN   r4   r1   r/   <module>rg      s     
			  ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ; ; ; ; ; ; + + + + + +x. x. x. x. x.J x. x. x.vz: z: z: z: z:j z: z: z: z: z:r1   