
    Ng	                    j    d dl mZ d dlZd dlZd dlZd dlZd dlmZ ddlm	Z	mZ  G d de          Z
dS )    )annotationsN)Iterable   )ENGLISH_STOP_WORDSWordTokenizerc                  X    e Zd ZdZg edfddZd	 Zdd
ZddZddZ	e
dd            ZdS )WhitespaceTokenizerz
    Simple and fast white-space tokenizer. Splits sentence based on white spaces.
    Punctuation are stripped from tokens.
    FvocabIterable[str]
stop_wordsdo_lower_caseboolc                f    t          |          | _        || _        |                     |           d S N)setr   r   	set_vocab)selfr
   r   r   s       v/var/www/html/ai-engine/env/lib/python3.11/site-packages/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py__init__zWhitespaceTokenizer.__init__   s1     j//*u    c                    | j         S r   )r
   )r   s    r   	get_vocabzWhitespaceTokenizer.get_vocab   s
    zr   c                t    || _         t          j        d t          |          D                       | _        d S )Nc                    g | ]	\  }}||f
S  r   ).0idxwords      r   
<listcomp>z1WhitespaceTokenizer.set_vocab.<locals>.<listcomp>   s     0_0_0_d$0_0_0_r   )r
   collectionsOrderedDict	enumerateword2idx)r   r
   s     r   r   zWhitespaceTokenizer.set_vocab   s7    
#/0_0_iX]N^N^0_0_0_``r   textstrreturn	list[int]c                6   | j         r|                                }|                                }g }|D ]}|| j        v r|| j        v r!|                    | j        |                    6|                    t          j                  }|| j        v r_t          |          dk    r*|| j        v r!|                    | j        |                    |                                }|| j        v r|| j        v r!|                    | j        |                    |S )Nr   )
r   lowersplitr   r#   appendstripstringpunctuationlen)r   r$   kwargstokenstokens_filteredtokens         r   tokenizezWhitespaceTokenizer.tokenize    s(    	 ::<<D 	 	E''$-''&&t}U';<<<KK 233E''UaET]$:$:&&t}U';<<<KKMME''$-''&&t}U';<<< ( r   output_pathc                <   t          t          j                            |d          d          5 }t	          j        t          | j                                                  t          | j	                  | j
        d|           d d d            d S # 1 swxY w Y   d S )Nwhitespacetokenizer_config.jsonw)r
   r   r   )openospathjoinjsondumplistr#   keysr   r   )r   r5   fOuts      r   savezWhitespaceTokenizer.save>   s    "',,{,MNNPSTT 	X\I!$-"4"4"6"677"&t"7"7%)%7 
   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   ABBB
input_pathc                    t          t          j                            | d                    5 }t	          j        |          }d d d            n# 1 swxY w Y   t          di |S )Nr7   r   )r9   r:   r;   r<   r=   loadr	   )rC   fInconfigs      r   rE   zWhitespaceTokenizer.loadI   s    "',,z+LMMNN 	$RUYs^^F	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ #,,V,,,s   AAAN)r
   r   r   r   r   r   )r
   r   )r$   r%   r&   r'   )r5   r%   )rC   r%   )__name__
__module____qualname____doc__r   r   r   r   r4   rB   staticmethodrE   r   r   r   r	   r	      s          &(EWot      a a a a   <	 	 	 	 - - - \- - -r   r	   )
__future__r   r    r=   r:   r-   collections.abcr   r   r   r	   r   r   r   <module>rO      s    " " " " " "      				  $ $ $ $ $ $ < < < < < < < <B- B- B- B- B-- B- B- B- B- B-r   