
    Ngd                        d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZ ddlmZmZ  ej        e          Z G d de          ZdS )	    )annotationsN)Iterable)NLTK_IMPORT_ERRORis_nltk_available   )ENGLISH_STOP_WORDSWordTokenizerc                  \    e Zd ZdZg edddfddZd ZddZddZddZ	e
dd            ZdS )PhraseTokenizera~  Tokenizes the text with respect to existent phrases in the vocab.

    This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example,
    in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... => [New_York, is, the])
    F_   vocabIterable[str]
stop_wordsdo_lower_caseboolngram_separatorstrmax_ngram_lengthintc                    t                      s+t          t          j        | j        j                            t          |          | _        || _        || _	        || _
        |                     |           d S N)r   ImportErrorr   format	__class____name__setr   r   r   r   	set_vocab)selfr   r   r   r   r   s         r/var/www/html/ai-engine/env/lib/python3.11/site-packages/sentence_transformers/models/tokenizer/PhraseTokenizer.py__init__zPhraseTokenizer.__init__   so     !"" 	Q/6t~7NOOPPPj//*. 0u    c                    | j         S r   )r   )r   s    r    	get_vocabzPhraseTokenizer.get_vocab)   s
    zr"   c                   || _         t          j        d t          |          D                       | _        t                      | _        t                      | _        |D ]}| j        v| j        |v rm|	                    | j                  dz   }| j        | j        z   |vr?|| j
        k    r4| j                            |           | j                            |           t          |          dk    rSt                              d| j                    t                              dt          | j                              d S d S )Nc                    g | ]	\  }}||f
S  r'   ).0idxwords      r    
<listcomp>z-PhraseTokenizer.set_vocab.<locals>.<listcomp>.   s     0_0_0_d$0_0_0_r"   r   r   z(PhraseTokenizer - Phrase ngram lengths: zPhraseTokenizer - Num phrases: )r   collectionsOrderedDict	enumerateword2idxr   ngram_lookupngram_lengthsr   countr   addlenloggerinfo)r   r   r*   ngram_counts       r    r   zPhraseTokenizer.set_vocab,   s>   
#/0_0_iX]N^N^0_0_0_``  EE UU 	8 	8D#/D4HD4P4P"jj)=>>B'$*>>dJJ{^b^sOsOs%))$///&**;777u::>>KKW4CUWWXXXKKR#d>O:P:PRRSSSSS >r"   textreturn	list[int]c                z   ddl m}  ||d          }t          | j        d          D ]}d}|t	          |          |z
  k    r| j                            ||||z                      }|| j        v r|g||||z   <   n8|                                | j        v r|                                g||||z   <   |dz  }|t	          |          |z
  k    g }|D ]}	|	| j	        v r|	| j
        v r!|                    | j
        |	                    6|	                                }	|	| j	        v rT|	| j
        v r!|                    | j
        |	                    ~|	                    t          j                  }	|	| j	        v rt	          |	          dk    r*|	| j
        v r!|                    | j
        |	                    |S )Nr   )word_tokenizeT)preserve_line)reverser   )nltkr<   sortedr1   r4   r   joinr0   lowerr   r/   appendstripstringpunctuation)
r   r8   kwargsr<   tokens	ngram_lenr)   ngramtokens_filteredtokens
             r    tokenizezPhraseTokenizer.tokenize?   s    &&&&&&t4888   2DAAA 	 	ICVy000,11&sY9N2OPPD---5:GF3y011[[]]d&7775:[[]]OF3y01q Vy000  	 	E''$-''&&t}U';<<<KKMME''$-''&&t}U';<<<KK 233E''UaET]$:$:&&t}U';<<<r"   output_pathc           	     T   t          t          j                            |d          d          5 }t	          j        t          | j                                                  t          | j	                  | j
        | j        | j        d|           d d d            d S # 1 swxY w Y   d S )Nphrasetokenizer_config.jsonw)r   r   r   r   r   )openospathrA   jsondumplistr/   keysr   r   r   r   )r   rN   fOuts      r    savezPhraseTokenizer.saveh   s    "',,{,IJJCPP 
	TXI!$-"4"4"6"677"&t"7"7%)%7'+';(,(=  	 	 	
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	s   A!BB!$B!
input_pathc                    t          t          j                            | d                    5 }t	          j        |          }d d d            n# 1 swxY w Y   t          di |S )NrP   r'   )rR   rS   rT   rA   rU   loadr   )r[   fInconfigs      r    r]   zPhraseTokenizer.loadu   s    "',,z+HIIJJ 	$cYs^^F	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ (((((s   AAAN)
r   r   r   r   r   r   r   r   r   r   )r   r   )r8   r   r9   r:   )rN   r   )r[   r   )r   
__module____qualname____doc__r   r!   r$   r   rM   rZ   staticmethodr]   r'   r"   r    r   r      s           "$6#" !    "  T T T T&' ' ' 'R    ) ) ) \) ) )r"   r   )
__future__r   r,   rU   loggingrS   rE   collections.abcr   transformers.utils.import_utilsr   r   r	   r   	getLoggerr   r5   r   r'   r"   r    <module>ri      s    " " " " " "       				  $ $ $ $ $ $ P P P P P P P P < < < < < < < <		8	$	$i) i) i) i) i)m i) i) i) i) i)r"   