
    Ng                         d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ d dlmZ dZ G d de          ZdS )	    N)PIPE)_java_optionsconfig_javafind_jarjava)CoreNLPParser)
TokenizerIz1https://nlp.stanford.edu/software/tokenizer.shtmlc                   L    e Zd ZdZdZ	 	 	 	 	 ddZed             Zd	 Zdd
Z	dS )StanfordTokenizeraF  
    Interface to the Stanford Tokenizer

    >>> from nltk.tokenize.stanford import StanfordTokenizer
    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
    >>> StanfordTokenizer().tokenize(s) # doctest: +SKIP
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> s = "The colour of the wall is blue."
    >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP
    ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    zstanford-postagger.jarNutf8F-mx1000mc                 4   t          j        t          d          t          d           t	          | j        |ddt          |          | _        || _        || _	        |i n|}d
                    d |                                D                       | _        d S )	Nzz
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.'   )
stacklevel)STANFORD_POSTAGGER )env_vars
searchpathurlverbose,c              3   *   K   | ]\  }}| d | V  dS )=Nr   ).0keyvals      R/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/tokenize/stanford.py	<genexpr>z-StanfordTokenizer.__init__.<locals>.<genexpr>E   s0      $T$TS^^c^^$T$T$T$T$T$T    )warningswarnstrDeprecationWarningr   _JAR_stanford_url_stanford_jar	_encodingjava_optionsjoinitems_options_cmd)selfpath_to_jarencodingoptionsr   r(   s         r   __init__zStanfordTokenizer.__init__%   s     	W 
 	
 	
 	
 	
 &I,
 
 
 "(""WHH$T$TGMMOO$T$T$TTTr   c                 *    |                                  S )N)
splitlines)ss    r   _parse_tokenized_outputz)StanfordTokenizer._parse_tokenized_outputG   s    ||~~r   c                 Z    dg}|                      |                     ||                    S )zW
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
        z%edu.stanford.nlp.process.PTBTokenizer)r4   _execute)r,   r3   cmds      r   tokenizezStanfordTokenizer.tokenizeK   s.     77++DMM#q,A,ABBBr   c                    | j         }|                    d|g           | j        }|r|                    d| j        g           d                    t                    }t          | j        |           t          j        dd          5 }t          |t                    r|r|                    |          }|                    |           |                                 |                    |j                   t!          || j        t$          t$                    \  }}	|                    |          }d d d            n# 1 swxY w Y   t)          j        |j                   t          |d           |S )	Nz-charsetz-options )r/   r   wbF)modedelete)	classpathstdoutstderr)r'   extendr+   r)   r   r   r(   tempfileNamedTemporaryFile
isinstancer"   encodewriteflushappendnamer   r&   r   decodeosunlink)
r,   r7   input_r   r.   r+   default_options
input_filer?   r@   s
             r   r6   zStanfordTokenizer._executeR   s   >

J)***( 	8JJ
D$56777((=11 	D-w???? (d5AAA 	-Z&#&& 18 1x00V$$$JJz''' "t1$t  NFF ]]8,,F	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 	- 		*/""" 	OU;;;;s   
B*E  EE)Nr   NFr   )F)
__name__
__module____qualname____doc__r$   r0   staticmethodr4   r8   r6   r   r   r   r   r      s        
 
 $D  U  U  U  UD   \C C C! ! ! ! ! !r   r   )jsonrK   rB   r    
subprocessr   nltk.internalsr   r   r   r   nltk.parse.corenlpr   nltk.tokenize.apir	   r%   r   r   r   r   <module>rZ      s     				         E E E E E E E E E E E E , , , , , , ( ( ( ( ( (C] ] ] ] ]
 ] ] ] ] ]r   