
    Ng]%                     ~    d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlmZ dZ G d de          ZdS )    N)PIPE)_java_optionsconfig_javafind_dir	find_filefind_jarjava)
TokenizerIz!https://nlp.stanford.edu/softwarec                   f     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 ddZd	 Z fd
Zd Zd Z	d Z
ddZ xZS )StanfordSegmenteru[  Interface to the Stanford Segmenter

    If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
    should be provieded, for example::

        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    >>> seg = StanfordSegmenter() # doctest: +SKIP
    >>> seg.default_config('zh') # doctest: +SKIP
    >>> sent = u'这是斯坦福中文分词器测试'
    >>> print(seg.segment(sent)) # doctest: +SKIP
    这 是 斯坦福 中文 分词器 测试
    <BLANKLINE>
    >>> seg.default_config('ar') # doctest: +SKIP
    >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
    >>> print(seg.segment(sent.split())) # doctest: +SKIP
    هذا هو تصنيف ستانفورد العربي ل الكلمات
    <BLANKLINE>
    zstanford-segmenter.jarNfalseUTF-8F-mx2gc                    t          j        dt                     t          j        t	          d          t          d           t          j        dt                     t          | j        |ddt          |          }|t          d	|d
dt          |          }nd }t          j	        
                    d ||fD                       | _        || _        || _        || _        || _        || _        || _        |	| _        || _        |
i n|
}
d
                    d |
                                D                       | _        d S )Nalwaysz}
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPTokenizer[0m instead.'   )
stacklevelignoreSTANFORD_SEGMENTER )env_vars
searchpathurlverbosezslf4j-api.jar)SLF4Jr   c              3      K   | ]}||V  	d S Nr   ).0_s     \/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/tokenize/stanford_segmenter.py	<genexpr>z-StanfordSegmenter.__init__.<locals>.<genexpr>j   s+       -
 -
amAmmmm-
 -
    ,c              3   N   K   | ] \  }}| d t          j        |           V  !dS )=N)jsondumps)r   keyvals      r!   r"   z-StanfordSegmenter.__init__.<locals>.<genexpr>x   sM       %
 %
+33s&&TZ__&&%
 %
 %
 %
 %
 %
r#   )warningssimplefilterDeprecationWarningwarnstrr   _JAR_stanford_urlospathsepjoin_stanford_jar_java_class_model_sihan_corpora_dict_sihan_post_processing_keep_whitespaces_dict	_encodingjava_optionsitems_options_cmd)selfpath_to_jarpath_to_slf4j
java_classpath_to_modelpath_to_dictpath_to_sihan_corpora_dictsihan_post_processingkeep_whitespacesencodingoptionsr   r=   stanford_segmenterslf4js                  r!   __init__zStanfordSegmenter.__init__8   s     	h(:;;;Z 
 	
 	
 	
 	
 	h(:;;;%I,
 
 
 $8!  EE E  Z__ -
 -
*E2-
 -
 -
 
 
 &##= &;#!1!
!(""WHH %
 %
7>}}%
 %
 %
 
 
r#   c                 @   d}t           j                            d          r>t           j                            t           j                            d          d          h}d| _        d| _        d| _        |dk    r
d| _        d}n|d	k    rd
| _        d}d| _        d}	 t          ||t          dd          | _        n%# t          $ r}t          d|z            |d}~ww xY wd}	 t          |t          dd          }t           j                            ||          | _        n7# t          $ r}t          d|z            |d}~ww xY wt          d|           	 t          ||t          dd          | _        dS # t          $ r}t          d|z            |d}~ww xY w)z
        Attempt to initialize Stanford Word Segmenter for the specified language
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
        r   r   dataNr   arz=edu.stanford.nlp.international.arabic.process.ArabicSegmenterz'arabic-segmenter-atb+bn+arztrain.ser.gzzhz%edu.stanford.nlp.ie.crf.CRFClassifierzpku.gztruezdict-chris6.ser.gzF)STANFORD_MODELS)r   r   r   r   z_Could not find '%s' (tried using env. variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)z./data/r   )r   r   r   zMCould not find '%s' (tried using the STANFORD_SEGMENTER environment variable)zUnsupported language )rS   r   )r2   environgetpathr4   r;   r8   r9   r6   r   r1   LookupErrorr   r7   )r@   langsearch_pathmodelrE   e	sihan_dirpath_to_sihan_dirs           r!   default_configz StanfordSegmenter.default_config|   s0    :>>.// 	W7<<
7K(L(LfUUVK 
#' &-#4<<O  >EET\\FDE*0D'/L& *%!1  

    !P"#  	 "I$,%!4	% % %! ,.7<<8I9+U+U((   !?AJK   <d<<===	#&!B  DKKK  	 	 	LNST  	sH   B= =
CCC%=D# #
E-E  EE; ;
FFFc                 J    t                                          |           d S r   )supertokenize)r@   s	__class__s     r!   ra   zStanfordSegmenter.tokenize   s!    r#   c                     | j         d| j        d| j        d|g}| j        *|                    d| j        d| j        d| j        g           |                     |          }|S ) -loadClassifier-keepAllWhitespaces	-textFileN-serDictionary-sighanCorporaDict-sighanPostProcessing)r6   r7   r:   r8   extendr;   r9   _execute)r@   input_file_pathcmdstdouts       r!   segment_filezStanfordSegmenter.segment_file   s     K!"
 #/JJ$J(,+/	 	 	 s##r#   c                 .    |                      |g          S r   )segment_sents)r@   tokenss     r!   segmentzStanfordSegmenter.segment   s    !!6(+++r#   c                 X   | j         }t          j        d          \  }| _        t	          j        |d          }d                    d |D                       }t          |t                    r|r|	                    |          }|
                    |           |                                 | j        d| j        d| j        d| j        g}| j        *|                    d
| j        d| j        d| j        g           |                     |          }t	          j        | j                   |S )re   T)textwb
c              3   @   K   | ]}d                      |          V  dS )re   N)r4   )r   xs     r!   r"   z2StanfordSegmenter.segment_sents.<locals>.<genexpr>   s,      ::1388A;;::::::r#   rf   rg   rh   Nri   rj   rk   )r<   tempfilemkstemp_input_file_pathr2   fdopenr4   
isinstancer/   encodewritecloser6   r7   r:   r8   rl   r;   r9   rm   unlink)r@   	sentencesrI   	_input_fh_inputro   rp   s          r!   rs   zStanfordSegmenter.segment_sents   s9   >+3+;+F+F+F(	4( Ii..	::	:::::fc"" 	-x 	-]]8,,F K!"!
 #/JJ$J(,+/	 	 	 s## 		$'(((r#   c                    | j         }|                    d|g           | j        }|r|                    d| j        g           d                    t                    }t          | j        |           t          || j        t          t                    \  }}|
                    |          }t          |d           |S )Nz-inputEncodingz-optionsre   )rJ   r   )	classpathrp   stderrF)r<   rl   r?   r4   r   r   r=   r	   r5   r   decode)r@   ro   r   rI   r?   default_optionsrp   _stderrs           r!   rm   zStanfordSegmenter._execute  s    >

$h/000( 	8JJ
D$56777((=11 	D-w????4-d4
 
 
 x(( 	OU;;;;r#   )NNNNNNr   r   r   NFr   )F)__name__
__module____qualname____doc__r0   rM   r^   ra   rq   ru   rs   rm   __classcell__)rc   s   @r!   r   r       s         * $D #'% B
 B
 B
 B
HG G GR      6, , ,( ( (T       r#   r   )r'   r2   r|   r+   
subprocessr   nltk.internalsr   r   r   r   r   r	   nltk.tokenize.apir
   r1   r   r   r#   r!   <module>r      s     				                        ) ( ( ( ( (3D D D D D
 D D D D Dr#   