
    Ng(<                        d dl mZ d dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ  G d de          Z G d	 d
          Z G d de          Z G d de          Z G d d          ZdS )    )annotationsN)AnyDictListTuple	TypedDictUnionDocument)Language)RecursiveCharacterTextSplitterc                  $     e Zd ZdZd fdZ xZS )MarkdownTextSplitterz=Attempts to split the text along Markdown-formatted headings.kwargsr   returnNonec                |    |                      t          j                  } t                      j        dd|i| dS )z"Initialize a MarkdownTextSplitter.
separatorsN )get_separators_for_languager   MARKDOWNsuper__init__)selfr   r   	__class__s      ]/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_text_splitters/markdown.pyr   zMarkdownTextSplitter.__init__   sA    55h6GHH
99J9&99999    )r   r   r   r   )__name__
__module____qualname____doc__r   __classcell__)r   s   @r   r   r      sC        GG: : : : : : : : : :r   r   c                  0    e Zd ZdZ	 	 ddd	ZddZddZdS )MarkdownHeaderTextSplitterz4Splitting markdown files based on specified headers.FTheaders_to_split_onList[Tuple[str, str]]return_each_lineboolstrip_headersc                R    || _         t          |d d          | _        || _        dS )a  Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
            strip_headers: Strip split headers from the content of the chunk
        c                ,    t          | d                   S )Nr   )len)splits    r   <lambda>z5MarkdownHeaderTextSplitter.__init__.<locals>.<lambda>*   s    3uQx== r   T)keyreverseN)r'   sortedr%   r)   r   r%   r'   r)   s       r   r   z#MarkdownHeaderTextSplitter.__init__   s@     !1 $*%@%@$$
 $
 $
  +r   linesList[LineType]r   List[Document]c                4   g }|D ]}|r8|d         d         |d         k    r |d         dxx         d|d         z   z  cc<   =|r|d         d         |d         k    rt          |d         d                   t          |d                   k     ri|d         d                             d          d         d         dk    r8| j        s1|d         dxx         d|d         z   z  cc<   |d         |d         d<   |                    |           	d |D             S )	zCombine lines with common metadata into chunks
        Args:
            lines: Line of text / associated header metadata
        metadatacontentz  

r   #c                H    g | ]}t          |d          |d                    S r9   r8   page_contentr8   r
   .0chunks     r   
<listcomp>zHMarkdownHeaderTextSplitter.aggregate_lines_to_chunks.<locals>.<listcomp>T   s?     
 
 
 %	"2U:=NOOO
 
 
r   )r,   r-   r)   append)r   r3   aggregated_chunkslines       r   aggregate_lines_to_chunksz4MarkdownHeaderTextSplitter.aggregate_lines_to_chunks/   sn   
 -/ 	/ 	/D!/%b)*5j9III
 ""%i000FT)_4LL0000!/%b)*5j9III)"-j9::SjAQ=R=RRR%b))4::4@@DQG3NN* O ""%i000FT)_4LL000484D!"%j11 "((....
 
*
 
 
 	
r   textstrc                   |                     d          }g }g }i }g }i }d}d}	|D ]}
|
                                }d                    t          t          j        |                    }|sM|                    d          r|                    d          dk    rd}d}	n3|                    d          rd}d}	n|                    |	          rd}d}	|r|                    |           | j	        D ]\  }}|                    |          rpt          |          t          |          k    s|t          |                   dk    r6||                    d
          }|r_|d         d         |k    rM|                                }|d         |v r|                    |d                    |r|d         d         |k    M|||t          |          d	                                         d}|                    |           |d         ||<   |rQ|                    d                    |          |                                d           |                                 | j        s|                    |            nm|r|                    |           nS|rQ|                    d                    |          |                                d           |                                 |                                }|r+|                    d                    |          |d           | j        s|                     |          S d |D             S )zASplit markdown file
        Args:
            text: Markdown filer:   F z```   Tz~~~ Nr;   r7   levelname)rN   rO   datarP   )r9   r8   c                H    g | ]}t          |d          |d                    S r=   r
   r@   s     r   rC   z9MarkdownHeaderTextSplitter.split_text.<locals>.<listcomp>   s?        eI&6zARSSS  r   )r-   stripjoinfilterrI   isprintable
startswithcountrD   r%   r,   popcopyclearr)   r'   rG   )r   rH   r3   lines_with_metadatacurrent_contentcurrent_metadataheader_stackinitial_metadatain_code_blockopening_fencerF   stripped_lineseprO   current_header_levelpopped_headerheaders                    r   
split_textz%MarkdownHeaderTextSplitter.split_textY   s    

4  .0%'+- *,+- U	7 U	7D JJLLM GGF3?M$J$JKKM  ' ++E22 *}7J7J57Q7QUV7V7V$(M$)MM"--e44 *$(M$)M ++M:: '$)M$&M &&}555 "5 <, <,	T ++C00 / &&#c((22mCHH6MQT6T6T '/2yy~~, )
L ,R 0 9=Q Q Q -9,<,<,>,>M  -V48HHH 0 4 4]65J K K K )
L ,R 0 9=Q Q Q &:$($1#c((**$=$C$C$E$E. .
 %++F33317(. ' 0+22+/99_+E+E,<,A,A,C,C    (--///- >'..}===E  	,#**=9999$ ,'..'+yy'A'A(8(=(=(?(?    $))+++/4466 	&& IIo66DTUU   $ 	112EFFF 0   r   N)FT)r%   r&   r'   r(   r)   r(   )r3   r4   r   r5   rH   rI   r   r5   )r   r   r    r!   r   rG   rg   r   r   r   r$   r$      sj        >>
 "'"	+ + + + +.(
 (
 (
 (
Tx x x x x xr   r$   c                  (    e Zd ZU dZded<   ded<   dS )LineTypezLine type as typed dict.zDict[str, str]r8   rI   r9   Nr   r   r    r!   __annotations__r   r   r   rj   rj      s+         ""LLLLLr   rj   c                  2    e Zd ZU dZded<   ded<   ded<   dS )
HeaderTypezHeader type as typed dict.intrN   rI   rO   rP   Nrk   r   r   r   rn   rn      s1         $$JJJIIIIIIIIr   rn   c                  l    e Zd ZdZdddddddZ	 	 	 d&d'dZd(dZd)dZd*dZd+d Z	d,d#Z
d,d$Zd,d%Zd	S )-&ExperimentalMarkdownSyntaxTextSplittera  
    An experimental text splitter for handling Markdown syntax.

    This splitter aims to retain the exact whitespace of the original text while
    extracting structured metadata, such as headers. It is a re-implementation of the
    MarkdownHeaderTextSplitter with notable changes to the approach and
    additional features.

    Key Features:
    - Retains the original whitespace and formatting of the Markdown text.
    - Extracts headers, code blocks, and horizontal rules as metadata.
    - Splits out code blocks and includes the language in the "Code" metadata key.
    - Splits text on horizontal rules (`---`) as well.
    - Defaults to sensible splitting behavior, which can be overridden using the
      `headers_to_split_on` parameter.

    Parameters:
    ----------
    headers_to_split_on : List[Tuple[str, str]], optional
        Headers to split on, defaulting to common Markdown headers if not specified.
    return_each_line : bool, optional
        When set to True, returns each line as a separate chunk. Default is False.

    Usage example:
    --------------
    >>> headers_to_split_on = [
    >>>     ("#", "Header 1"),
    >>>     ("##", "Header 2"),
    >>> ]
    >>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
    >>>     headers_to_split_on=headers_to_split_on
    >>> )
    >>> chunks = splitter.split(text)
    >>> for chunk in chunks:
    >>>     print(chunk)

    This class is currently experimental and subject to change based on feedback and
    further development.
    zHeader 1zHeader 2zHeader 3zHeader 4zHeader 5zHeader 6)r;   z##z###z####z#####z######NFTr%   "Union[List[Tuple[str, str]], None]r'   r(   r)   c                    g | _         t          d          | _        g | _        || _        |rt          |          | _        n| j        | _        || _        d S )NrK   r?   )	chunksr   current_chunkcurrent_header_stackr)   dictsplittable_headersDEFAULT_HEADER_KEYSr'   r2   s       r   r   z/ExperimentalMarkdownSyntaxTextSplitter.__init__  sc     ')%2666;=!* 	?&*+>&?&?D##&*&>D# 0r   rH   rI   r   r5   c                z   |                     d          }|rp|                    d          }|                     |          }|                     |          }|                     |          }|r~|                                  | j        s| j        xj        |z  c_        t          |
                    d                    }|
                    d          }|                     ||           n|rk|                                  |                     ||          | j        _        |
                    d          | j        j        d<   |                                  n,|r|                                  n| j        xj        |z  c_        |p|                                  | j        rd | j        D             S | j        S )NT)keependsr   rL      Codec                    g | ]J}|j                                         D ].}||                                t          ||j                   /KS )r>   )r?   
splitlinesisspacer   r8   )rA   rB   rF   s      r   rC   zEExperimentalMarkdownSyntaxTextSplitter.split_text.<locals>.<listcomp>I  su       !.99;;  	 !%	dU^DDD   r   )r   rX   _match_header_match_code_match_horz_complete_chunk_docr)   rv   r?   r,   group_resolve_header_stack_resolve_code_chunkr8   r'   ru   )	r   rH   	raw_linesraw_lineheader_match
code_match
horz_matchheader_depthheader_texts	            r   rg   z1ExperimentalMarkdownSyntaxTextSplitter.split_text&  s   OOTO22	 	< }}Q''H--h77L))(33J))(33J <((***) @&33x?33  #<#5#5a#8#899*0033**<EEEE 
<((***262J2Ji3 3"/ 7A6F6Fq6I6I"+F3((**** <((****"//8;//3  	<6 	  """   	 ![    {r   r   ro   r   r   c                    t          | j                  D ]4\  }\  }}||k    r&||f| j        |<   | j        d |dz            | _         d S 5| j                            ||f           d S )NrL   )	enumeraterw   rD   )r   r   r   idepth_s         r   r   z<ExperimentalMarkdownSyntaxTextSplitter._resolve_header_stackQ  s    &t'@AA 	 	MAzq$$0<k/J)!,,0,EgAg,N) % 	!((,)DEEEEEr   current_liner   	List[str]c                t    |}|r3|                     d          }||z  }|                     |          r|S |3dS )Nr   rK   )rX   r   )r   r   r   rB   r   s        r   r   z:ExperimentalMarkdownSyntaxTextSplitter._resolve_code_chunkY  sV     	 }}Q''HXE)) 	  	
 rr   c                $   | j         j        }|rl|                                sX| j        D ]1\  }}| j                            d|z            }|| j         j        |<   2| j                            | j                    t          d          | _         d S )Nr;   rK   rt   )
rv   r?   r   rw   ry   getr8   ru   rD   r   )r   chunk_contentr   value
header_keys        r   r   z:ExperimentalMarkdownSyntaxTextSplitter._complete_chunk_docb  s    *7 	3!6!6!8!8 	3 $ 9 @ @u!488uEE
:?"+J77Kt1222%2666r   rF   Union[re.Match, None]c                p    t          j        d|          }|r|                    d          | j        v r|S d S )Nz^(#{1,6}) (.*)rL   )rematchr   ry   )r   rF   r   s      r   r   z4ExperimentalMarkdownSyntaxTextSplitter._match_headero  s=    *D11 	U[[^^t'>>>Ltr   c                T    fddD             }t          d |D             d           S )Nc                :    g | ]}t          j        |          S r   r   r   rA   rulerF   s     r   rC   zFExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<listcomp>w  s%    OOOD28D$''OOOr   )z^```(.*)z^~~~(.*)c              3     K   | ]}||V  	d S Nr   rA   r   s     r   	<genexpr>zEExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<genexpr>x  '      99u59U999999r   nextr   rF   matchess    ` r   r   z2ExperimentalMarkdownSyntaxTextSplitter._match_codev  s<    OOOO4NOOO999994@@@r   c                T    fddD             }t          d |D             d           S )Nc                :    g | ]}t          j        |          S r   r   r   s     r   rC   zFExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<listcomp>{  s2     
 
 
%)BHT4  
 
 
r   )z
^\*\*\*+\nz^---+\nz^___+\nc              3     K   | ]}||V  	d S r   r   r   s     r   r   zEExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<genexpr>~  r   r   r   r   s    ` r   r   z2ExperimentalMarkdownSyntaxTextSplitter._match_horzz  sL    
 
 
 
-T
 
 
 999994@@@r   )NFT)r%   rr   r'   r(   r)   r(   rh   )r   ro   r   rI   r   r   )r   rI   r   r   r   rI   )r   r   )rF   rI   r   r   )r   r   r    r!   rz   r   rg   r   r   r   r   r   r   r   r   r   rq   rq      s       & &R   CG!&"	1 1 1 1 1") ) ) )VF F F F   
7 
7 
7 
7   A A A AA A A A A Ar   rq   )
__future__r   r   typingr   r   r   r   r   r	   langchain_core.documentsr   langchain_text_splitters.baser   "langchain_text_splitters.characterr   r   r$   rj   rn   rq   r   r   r   <module>r      s|   " " " " " " 				 ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; - - - - - - 2 2 2 2 2 2 M M M M M M: : : : :9 : : :| | | | | | | |~    y          [A [A [A [A [A [A [A [A [A [Ar   