
    Ng,                    j   d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZ d dlmZmZ  ej        e          Z edd	
          Z G d d	ee          Z G d de          Z  G d de!e
          Z" ed           G d d                      Z#ddZ$dS )    )annotationsN)ABCabstractmethod)	dataclass)Enum)AbstractSetAnyCallable
CollectionIterableListLiteralOptionalSequenceTypeTypeVarUnion)BaseDocumentTransformerDocumentTSTextSplitter)boundc                      e Zd ZdZddedddfd9dZed:d            Z	 d;d<dZd=dZ	d>d#Z
d?d&Zed@d*            Zed+d e            d,fdAd6            ZdBd8ZdS )Cr   z)Interface for splitting text into chunks.i     FT
chunk_sizeintchunk_overlaplength_functionCallable[[str], int]keep_separator$Union[bool, Literal['start', 'end']]add_start_indexboolstrip_whitespacereturnNonec                    ||k    rt          d| d| d          || _        || _        || _        || _        || _        || _        dS )ad  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                            in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)selfr   r   r   r    r"   r$   s          Y/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_text_splitters/base.py__init__zTextSplitter.__init__!   s|    * :%%6} 6 66 6 6   &+ /- /!1    textstr	List[str]c                    dS )z$Split text into multiple components.N )r/   r3   s     r0   
split_textzTextSplitter.split_textB   s      r2   Ntexts	metadatasOptional[List[dict]]List[Document]c           	        |pi gt          |          z  }g }t          |          D ]\  }}d}d}|                     |          D ]}	t          j        ||                   }
| j        rE||z   | j        z
  }|                    |	t          d|                    }||
d<   t          |	          }t          |	|
          }|
                    |           |S )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater8   copydeepcopyr-   r*   findmaxr   append)r/   r9   r:   
_metadatas	documentsir3   indexprevious_chunk_lenchunkr@   offsetnew_docs                r0   create_documentszTextSplitter.create_documentsF   s     32$U"3
	 '' 	* 	*GAtE!".. * *=A77( 4"%77$:MMF IIeSF^^<<E.3H]+),U&"III  ))))* r2   rI   Iterable[Document]c                    g g }}|D ]6}|                     |j                   |                     |j                   7|                     ||          S )zSplit documents.)r:   )rG   r?   r@   rP   )r/   rI   r9   r:   docs        r0   split_documentszTextSplitter.split_documentsZ   sa    ry 	+ 	+CLL)***S\****$$Ui$@@@r2   docs	separatorOptional[str]c                v    |                     |          }| j        r|                                }|dk    rd S |S )N )joinr.   strip)r/   rU   rV   r3   s       r0   
_join_docszTextSplitter._join_docsb   s>    ~~d##! 	 ::<<D2::4Kr2   splitsIterable[str]c                   |                      |          }g }g }d}|D ]}|                      |          }||z   t          |          dk    r|ndz   | j        k    r|| j        k    r%t                              d| d| j                    t          |          dk    r|                     ||          }	|	|                    |	           || j        k    s,||z   t          |          dk    r|ndz   | j        k    r}|dk    rw||                      |d                   t          |          dk    r|ndz   z  }|dd          }|| j        k    K||z   t          |          dk    r|ndz   | j        k    r|dk    w|                    |           ||t          |          dk    r|ndz   z  }|                     ||          }	|	|                    |	           |S )Nr   zCreated a chunk of size z%, which is longer than the specified    )r+   rA   r)   loggerwarningr\   rG   r*   )
r/   r]   rV   separator_lenrU   current_doctotald_lenrS   s
             r0   _merge_splitszTextSplitter._merge_splitsk   sV    --i88!# 	K 	KA((++D[1A1AA1E1E1M"# # 4+++NNQ5 Q Q>B>NQ Q   {##a''//+yAACC(((  $"555[9I9IA9M9MSTU*+ +!AII!6!6{1~!F!F-0-=-=-A-AMMq"  '2!""o  $"555[9I9IA9M9MSTU*+ +!AII q!!!Tc+.>.>.B.B]]JJEEook955?KKr2   	tokenizerr	   kwargsc                    	 ddl m} t          |          st          d          dfd}n# t          $ r t          d	          w xY w | dd
|i|S )z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBaser3   r4   r%   r   c                H    t                              |                     S NrA   encoder3   ri   s    r0   _huggingface_tokenizer_lengthzNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    9++D11222r2   z`Could not import transformers python package. Please install it with `pip install transformers`.r   r3   r4   r%   r   r7   )transformersrl   
isinstancer(   ImportError)clsri   rj   rl   rr   s    `   r0   from_huggingface_tokenizerz'TextSplitter.from_huggingface_tokenizer   s    	<<<<<<i)@AA  W  3 3 3 3 3 3 3  	 	 	E  	
 sKK#@KFKKKs	   +/ A	gpt2allrw   Type[TS]encoding_name
model_nameallowed_special'Union[Literal['all'], AbstractSet[str]]disallowed_special&Union[Literal['all'], Collection[str]]r   c                  	 	 ddl }n# t          $ r t          d          w xY w||                    |          	n|                    |          	d	fd}t	          | t
                    r||d	}i ||} | dd
|i|S )z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.r3   r4   r%   r   c                N    t                              |                     S N)r~   r   ro   )r3   r~   r   encs    r0   _tiktoken_encoderz=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s4    

$3'9     r2   )r|   r}   r~   r   r   rs   r7   )tiktokenrv   encoding_for_modelget_encoding
issubclassTokenTextSplitter)
rw   r|   r}   r~   r   rj   r   r   extra_kwargsr   s
      ``    @r0   from_tiktoken_encoderz"TextSplitter.from_tiktoken_encoder   s    	OOOO 	 	 	A  	 !--j99CC''66C	 	 	 	 	 	 	 	 c,-- 	0!.(#2&8	 L 0/,/Fs??#4????s   
 $Sequence[Document]c                F    |                      t          |                    S )z2Transform sequence of documents by splitting them.)rT   list)r/   rI   rj   s      r0   transform_documentsz TextSplitter.transform_documents   s     ##DOO444r2   )r   r   r   r   r   r   r    r!   r"   r#   r$   r#   r%   r&   r3   r4   r%   r5   rn   )r9   r5   r:   r;   r%   r<   )rI   rQ   r%   r<   )rU   r5   rV   r4   r%   rW   )r]   r^   rV   r4   r%   r5   )ri   r	   rj   r	   r%   r   )rw   r{   r|   r4   r}   rW   r~   r   r   r   rj   r	   r%   r   )rI   r   rj   r	   r%   r   )__name__
__module____qualname____doc__rA   r1   r   r8   rP   rT   r\   rh   classmethodrx   setr   r   r7   r2   r0   r   r      s@       33  03?D %!%2 2 2 2 2B 3 3 3 ^3 CG    (A A A A   ( ( ( (T L L L [L(  $$(CF355EJ)@ )@ )@ )@ [)@V5 5 5 5 5 5r2   c                  F     e Zd ZdZdd e            dfd fdZddZ xZS )r   z/Splitting text to tokens using model tokenizer.ry   Nrz   r|   r4   r}   rW   r~   r   r   r   rj   r	   r%   r&   c                    t                      j        di | 	 ddl}n# t          $ r t          d          w xY w||                    |          }n|                    |          }|| _        || _        || _        dS )zCreate a new TextSplitter.r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r7   )	superr1   r   rv   r   r   
_tokenizer_allowed_special_disallowed_special)	r/   r|   r}   r~   r   rj   r   r   	__class__s	           r0   r1   zTokenTextSplitter.__init__   s     	""6"""	OOOO 	 	 	A  	 !--j99CC''66C /#5   s   ! ;r3   r5   c                     d fd}t           j         j         j        j        |          }t          ||          S )	N_textr4   r%   	List[int]c                R    j                             | j        j                  S r   )r   rp   r   r   )r   r/   s    r0   _encodez-TokenTextSplitter.split_text.<locals>._encode   s1    ?)) $ 5#'#; *   r2   )r   tokens_per_chunkdecoderp   rq   )r   r4   r%   r   )	Tokenizerr*   r)   r   r   split_text_on_tokens)r/   r3   r   ri   s   `   r0   r8   zTokenTextSplitter.split_text   sc    	 	 	 	 	 	 -!-?)	
 
 
	 $CCCCr2   )r|   r4   r}   rW   r~   r   r   r   rj   r	   r%   r&   r   )r   r   r   r   r   r1   r8   __classcell__)r   s   @r0   r   r      sx        99 $$(CF355EJ6 6 6 6 6 6 66D D D D D D D Dr2   r   c                  z    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellN)r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLr7   r2   r0   r   r     s        ,,
C	BDF	B	B
CEF
CDDEEHED
CFEA
CDGFJJJr2   r   T)frozenc                  B    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   d	S )
r   zTokenizer data class.r   r   r   zCallable[[List[int]], str]r   zCallable[[str], List[int]]rp   N)r   r   r   r   __annotations__r7   r2   r0   r   r   ,  sQ         *,&&&&=&&&&==r2   r   r3   r4   ri   r%   r5   c                   g }|                     |           }d}t          ||j        z   t          |                    }|||         }|t          |          k     r|                    |                    |                     |t          |          k    rnT||j        |j        z
  z  }t          ||j        z   t          |                    }|||         }|t          |          k     |S )z6Split incoming text and return chunks using tokenizer.r   )rp   minr   rA   rG   r   r   )r3   ri   r]   	input_ids	start_idxcur_idx	chunk_idss          r0   r   r   :  s    F  &&II)i88#i..IIG)G+,I
c)nn
$
$i&&y11222c)nn$$Y/)2III	i)"<<c)nnMMi/0	 c)nn
$
$ Mr2   )r3   r4   ri   r   r%   r5   )%
__future__r   rC   loggingabcr   r   dataclassesr   enumr   typingr   r	   r
   r   r   r   r   r   r   r   r   r   langchain_core.documentsr   r   	getLoggerr   ra   r   r   r   r4   r   r   r   r7   r2   r0   <module>r      s'   " " " " " "   # # # # # # # # ! ! ! ! ! !                                  G F F F F F F F		8	$	$WT(((|5 |5 |5 |5 |5*C |5 |5 |5~-D -D -D -D -D -D -D -D`    sD   > $
> 
> 
> 
> 
> 
> 
> 
>     r2   