
    Ng^-                        d dl mZ d dlZd dlZd dlmZmZ d dlmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlmZ d dlmZ  G d de          Z G d	 d
          Z G d d          ZdS )    )annotationsN)BytesIOStringIO)AnyDictIterableListOptionalTuple	TypedDictcastDocument)RecursiveCharacterTextSplitterc                  <    e Zd ZU dZded<   ded<   ded<   ded<   dS )	ElementTypezElement type as typed dict.strurlxpathcontentzDict[str, str]metadataN)__name__
__module____qualname____doc____annotations__     Y/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_text_splitters/html.pyr   r      s?         %%HHHJJJLLLr   r   c                  >    e Zd ZdZ	 dddZddZddZddZddZdS )HTMLHeaderTextSplitterzU
    Splitting HTML files based on specified headers.
    Requires lxml package.
    Fheaders_to_split_onList[Tuple[str, str]]return_each_elementboolc                <    || _         t          |          | _        dS )ay  Create a new HTMLHeaderTextSplitter.

        Args:
            headers_to_split_on: list of tuples of headers we want to track mapped to
                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)].
            return_each_element: Return each element w/ associated headers.
        N)r$   sortedr"   )selfr"   r$   s      r   __init__zHTMLHeaderTextSplitter.__init__   s#     $7 #)*=#>#>   r   elementsList[ElementType]returnList[Document]c                    g }|D ]Q}|r8|d         d         |d         k    r |d         dxx         d|d         z   z  cc<   <|                     |           Rd |D             S )zCombine elements with common metadata into chunks

        Args:
            elements: HTML element content with associated identifying info and metadata
        r   r   z  
c                H    g | ]}t          |d          |d                    S r   r   page_contentr   r   .0chunks     r   
<listcomp>zGHTMLHeaderTextSplitter.aggregate_elements_to_chunks.<locals>.<listcomp>E   s?     
 
 
 %	"2U:=NOOO
 
 
r   )append)r(   r*   aggregated_chunkselements       r   aggregate_elements_to_chunksz3HTMLHeaderTextSplitter.aggregate_elements_to_chunks.   s     02 	2 	2G!
2%b)*59LLL
 ""%i000FWY=O4OO0000 "((1111
 
*
 
 
 	
r   r   r   kwargsr   c                t    t          j        |fi |}|                     t          |j                            S )zSplit HTML from web URL

        Args:
            url: web URL
            **kwargs: Arbitrary additional keyword arguments. These are usually passed
                to the fetch url content request.
        )requestsgetsplit_text_from_filer   r   )r(   r   r<   rs       r   split_text_from_urlz*HTMLHeaderTextSplitter.split_text_from_urlJ   s9     L''''((););<<<r   textc                F    |                      t          |                    S zJSplit HTML text string

        Args:
            text: HTML text
        r@   r   r(   rC   s     r   
split_textz!HTMLHeaderTextSplitter.split_textU        (($888r   filec                @   	 ddl m} n"# t          $ r}t          d          |d}~ww xY w|                    d          }|                    ||          }t          j        t                    j        dz  }|                    |          }|	                    |          } ||          }	|
                    t          |	                    }
d | j        D             t          | j                  d	d
i}g }|
                    d|          D ]}|                    d          s|                    d          r|                    t!          |d                    d |                    d|          D                       d                    d |                    d|          D                       fdt%          fd|                    d|                    D                                  | j        s|                     |          S d |D             S )CSplit HTML file

        Args:
            file: HTML file
        r   etree>Unable to import lxml, please install with `pip install lxml`.Nzutf-8)encodingz!xsl/html_chunks_with_headers.xsltc                    g | ]
}|d          S )r   r   )r5   headers     r   r7   z?HTMLHeaderTextSplitter.split_text_from_file.<locals>.<listcomp>x   s    JJJvJJJr   hzhttp://www.w3.org/1999/xhtmlz*//*z*[@class='headers']z*[@class='chunk'] c                     g | ]}|j         pd S rT   rC   r5   nodes     r   r7   z?HTMLHeaderTextSplitter.split_text_from_file.<locals>.<listcomp>   .       $( !%	R  r   z*[@class='xpath']c                     g | ]}|j         pd S rV   rW   rX   s     r   r7   z?HTMLHeaderTextSplitter.split_text_from_file.<locals>.<listcomp>   rZ   r   c                :    i | ]}|j                  |j        pd S rV   )tagrC   )r5   rY   header_mappings     r   
<dictcomp>z?HTMLHeaderTextSplitter.split_text_from_file.<locals>.<dictcomp>   s:     " " " !% +484dio2" " "r   c                    | j         v S N)r]   )xheader_filters    r   <lambda>z=HTMLHeaderTextSplitter.split_text_from_file.<locals>.<lambda>   s    !%=*@ r   z*[@class='headers']/*)r   r   r   r   c                H    g | ]}t          |d          |d                    S r1   r   r4   s     r   r7   z?HTMLHeaderTextSplitter.split_text_from_file.<locals>.<listcomp>   s?        eI&6zARSSS  r   )lxmlrN   ImportError
HTMLParserparsepathlibPath__file__parentXSLT
fromstringr   r"   dictfindallr8   r   joinfilterr$   r;   )r(   rJ   rN   eparsertree	xslt_path	xslt_tree	transformresult
result_domns_mapr*   r:   rc   r^   s                 @@r   r@   z+HTMLHeaderTextSplitter.split_text_from_file]   s   	""""""" 	 	 	P 	 !!7!33{{4(( L**14WW	KK	**	JJy))	4%%c&kk22
 KJ1IJJJd677 56 !))&&99 	 	G455 #: :    gg ,3OO<OQW,X,X    !# ,3OO<OQW,X,X  ! !" " " " )/ @ @ @ @ '0G P P) )	" " "    6 ' 	44X>>> %   s    
*%*N)F)r"   r#   r$   r%   )r*   r+   r,   r-   )r   r   r<   r   r,   r-   rC   r   r,   r-   rJ   r   r,   r-   )	r   r   r   r   r)   r;   rB   rH   r@   r   r   r   r!   r!      s          %*? ? ? ? ?"
 
 
 
8	= 	= 	= 	=9 9 9 9H H H H H Hr   r!   c                  R    e Zd ZdZ	 dd dZd!dZd"dZ	 dd#dZd$dZd%dZ	d&dZ
dS )'HTMLSectionSplitterz`
    Splitting HTML files based on specified tag and font sizes.
    Requires lxml package.
    Nr"   r#   rw   Optional[str]r<   r   r,   Nonec                   t          |          | _        |9t          j        t                    j        dz                                  | _        n+t          j        |                                          | _        || _        dS )a  Create a new HTMLSectionSplitter.

        Args:
            headers_to_split_on: list of tuples of headers we want to track mapped to
                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"].
            xslt_path: path to xslt file for document transformation.
            Uses a default if not passed.
            Needed for html contents that using different format and layouts.
        Nzxsl/converting_to_header.xslt)	rp   r"   rj   rk   rl   rm   absoluterw   r<   )r(   r"   rw   r<   s       r   r)   zHTMLSectionSplitter.__init__   sm      $((;#<#< X&&-0OOhjj NN %\)44==??DNr   	documentsIterable[Document]r-   c                    g g }}|D ]6}|                     |j                   |                     |j                   7|                     ||          }t	          di | j        }|                    |          S )zSplit documents.)	metadatasr   )r8   r3   r   create_documentsr   r<   split_documents)r(   r   textsr   docresultstext_splitters          r   r   z#HTMLSectionSplitter.split_documents   s    ry 	+ 	+CLL)***S\****'''CC6EEEE,,W555r   rC   r   c                F    |                      t          |                    S rE   rF   rG   s     r   rH   zHTMLSectionSplitter.split_text   rI   r   r   	List[str]r   Optional[List[dict]]c                   |pi gt          |          z  }g }t          |          D ]\  }}|                     |          D ]}t          j        ||                   }|j                                        D ]#}	|j        |	         dk    r|d         |j        |	<   $i ||j        }t          |j        |          }
|	                    |
           |S )z&Create documents from a list of texts.#TITLE#Titler2   )
len	enumeraterH   copydeepcopyr   keysr   r3   r8   )r(   r   r   
_metadatasr   irC   r6   r   keynew_docs              r   r   z$HTMLSectionSplitter.create_documents   s     32$U"3
	 '' 		* 		*GAt.. * *=A77 >..00 @ @C~c*i77.6w.?s+9h9%.9"0BXVVV  ))))* r   html_docList[Dict[str, Optional[str]]]c                   	 ddl m}m} n"# t          $ r}t          d          |d }~ww xY w ||d          }t	          | j                                                  }g }|                    dg|z             }t          |          D ]\  }}	|	}
|dk    rd}d}g }n"|
j	        
                                }|
j        }g }|
j        D ]S}|dz   t          |          k     r|||dz            k    r n+t          |t                    r|                    |           Td	                    |          
                                }|d
k    r|                    |||d           |S )Nr   )BeautifulSoupPageElementzzUnable to import BeautifulSoup/PageElement,                     please install with `pip install                     bs4`.zhtml.parserbodyr   h1    rT   )rR   r   tag_name)bs4r   r   rg   listr"   r   find_allr   rC   stripnamenext_elementsr   
isinstancer   r8   rr   )r(   r   r   r   rt   soupheaderssectionsr   rR   header_elementcurrent_headercurrent_header_tagsection_contentr:   r   s                   r   split_html_by_headersz)HTMLSectionSplitter.split_html_by_headers   s   	666666666 	 	 	  		 }X}55t/44667702--7 233"7++ 	 	IAv*0NAvv!*%)"(*!/!4!:!:!<!<%3%8""$)7 4 4q53w<<''Gwq1u~,E,EEgs++ 4#**7333hh//5577G"}}"0#*$6    s    
*%*html_contentc                h   | j         |S 	 ddlm} n"# t          $ r}t          d          |d }~ww xY w|                                }|                    t          |          |          }|                    | j                   }|                    |          } ||          }t          |          S )Nr   rM   rO   )	rw   rf   rN   rg   rh   ri   r   rn   r   )	r(   r   rN   rt   ru   rv   rx   ry   rz   s	            r   convert_possible_tags_to_headerz3HTMLSectionSplitter.convert_possible_tags_to_header  s    >!	""""""" 	 	 	P 	
 !!##{{8L116::KK//	JJy))	46{{s    
1,1rJ   c                     |                                 }                     |          }                     |          } fd|D             S )rL   c           
         g | ]R}t          t          t          |d                    j        t          |d                            |d         i          SS )r   r   rR   )r   )r   r   r   r"   )r5   sectionr(   s     r   r7   z<HTMLSectionSplitter.split_text_from_file.<locals>.<listcomp>7  st     

 

 

  S'),--,S1D-E-EF I  

 

 

r   )getvaluer   r   )r(   rJ   file_contentr   s   `   r   r@   z(HTMLSectionSplitter.split_text_from_file-  se     }};;LII--l;;

 

 

 

 $

 

 

 
	
r   ra   )r"   r#   rw   r   r<   r   r,   r   )r   r   r,   r-   r}   )r   r   r   r   r,   r-   )r   r   r,   r   )r   r   r,   r   r~   )r   r   r   r   r)   r   rH   r   r   r   r@   r   r   r   r   r      s          $(    4
6 
6 
6 
69 9 9 9 CG    $* * * *X   &
 
 
 
 
 
r   r   )
__future__r   r   rj   ior   r   typingr   r   r   r	   r
   r   r   r   r>   langchain_core.documentsr   "langchain_text_splitters.characterr   r   r!   r   r   r   r   <module>r      sA   " " " " " "                   N N N N N N N N N N N N N N N N N N N N  - - - - - - M M M M M M    )   N N N N N N N NbY
 Y
 Y
 Y
 Y
 Y
 Y
 Y
 Y
 Y
r   