
    Ng8                    Z    d dl mZ d dlmZmZmZmZ d dlmZm	Z	m
Z
  G d de          ZdS )    )annotations)AnyListOptionalcast)TextSplitter	Tokenizersplit_text_on_tokensc                  \     e Zd ZU dZ	 	 	 dd fdZddZddZddZdZde	d<   ddZ
 xZS )%SentenceTransformersTokenTextSplitterz8Splitting text to tokens using sentence model tokenizer.2   'sentence-transformers/all-mpnet-base-v2Nchunk_overlapint
model_namestrtokens_per_chunkOptional[int]kwargsr   returnNonec                    t                      j        di |d|i 	 ddlm} n# t          $ r t	          d          w xY w|| _         || j                  | _        | j        j        | _        |                     |           dS )zCreate a new TextSplitter.r   r   )SentenceTransformerzCould not import sentence_transformer python package. This is needed in order to for SentenceTransformersTokenTextSplitter. Please install it with `pip install sentence-transformers`.)r   N )	super__init__sentence_transformersr   ImportErrorr   _model	tokenizer_initialize_chunk_configuration)selfr   r   r   r   r   	__class__s         j/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_text_splitters/sentence_transformers.pyr   z.SentenceTransformersTokenTextSplitter.__init__   s     	??6??????	AAAAAAA 	 	 	N  	 %))$/::.,,>N,OOOOOs	   ' Ac          	         t          t          | j        j                  | _        || j        | _        n|| _        | j        | j        k    r(t          d| j         d| j         d| j         d          d S )NzThe token limit of the models 'z' is: z. Argument tokens_per_chunk=z > maximum token limit.)r   r   r   max_seq_lengthmaximum_tokens_per_chunkr   
ValueErrorr   )r"   r   s     r$   r!   zESentenceTransformersTokenTextSplitter._initialize_chunk_configuration#   s     )-S$+2L(M(M%#$($AD!!$4D! 4#@@@+$/ + +5+ +.2.C+ + +   A@    text	List[str]c                     d fd}t           j         j         j        j        |          }t          ||          S )	Nr*   r   r   	List[int]c                >                         |           dd         S )N   )_encode)r*   r"   s    r$   %encode_strip_start_and_stop_token_idsz_SentenceTransformersTokenTextSplitter.split_text.<locals>.encode_strip_start_and_stop_token_ids6   s    <<%%ad++r)   )r   r   decodeencode)r*   r    r*   r   r   r-   )r	   _chunk_overlapr   r    r3   r
   )r"   r*   r2   r    s   `   r$   
split_textz0SentenceTransformersTokenTextSplitter.split_text5   sc    	, 	, 	, 	, 	, 	, -!2>(8	
 
 
	 $CCCCr)   c               F    t          |                     |                    S )N)lenr1   )r"   r*   s     r$   count_tokensz2SentenceTransformersTokenTextSplitter.count_tokensB   s    4<<%%&&&r)   l         _max_length_equal_32_bit_integerr-   c                J    | j                             || j        d          }|S )Ndo_not_truncate)
max_length
truncation)r    r4   r;   )r"   r*   &token_ids_with_start_and_end_token_idss      r$   r1   z-SentenceTransformersTokenTextSplitter._encodeG   s4    151F1F<( 2G 2
 2
.
 65r)   )r   r   N)
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   )r*   r   r   r+   )r*   r   r   r   r5   )__name__
__module____qualname____doc__r   r!   r7   r:   r;   __annotations__r1   __classcell__)r#   s   @r$   r   r      s         BB  C*.	P P P P P P P0   $D D D D' ' ' ' -2$11116 6 6 6 6 6 6 6r)   r   N)
__future__r   typingr   r   r   r   langchain_text_splitters.baser   r	   r
   r   r   r)   r$   <module>rJ      s    " " " " " " , , , , , , , , , , , , W W W W W W W W W WE6 E6 E6 E6 E6L E6 E6 E6 E6 E6r)   