
    Ng                        d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZ d dlmZ d dlmZ erd dlmZ d dlmZmZ  G d d	e          ZddZddZddZdS )    )annotations)Path)TYPE_CHECKINGAnyIteratorListOptionalSequenceTupleUnion)Document)
BaseLoaderNavigableStringCommentTagc                  6    e Zd ZdZ	 	 	 	 	 dddZddZddZdS )ReadTheDocsLoaderz+Load `ReadTheDocs` documentation directory.Nz*.htmz*.html      ?pathUnion[str, Path]encodingOptional[str]errorscustom_html_tagOptional[Tuple[str, dict]]patternsSequence[str]exclude_links_ratiofloatkwargsOptional[Any]c                (   	 ddl m} n# t          $ r t          d          w xY w	  |	 	 di |}	n"# t          $ r}
t	          d          |
d}
~
ww xY wt          |          | _        || _        || _        || _	        || _
        || _        || _        dS )	uZ  
        Initialize ReadTheDocsLoader

        The loader loops over all files under `path` and extracts the actual content of
        the files by retrieving main html tags. Default main html tags include
        `<main id="main-content>`, <`div role="main>`, and `<article role="main">`. You
        can also define your own html tags by passing custom_html_tag, e.g.
        `("div", "class=main")`. The loader iterates html tags with the order of
        custom html tags (if exists) and default html tags. If any of the tags is not
        empty, the loop will break and retrieve the content out of that tag.

        Args:
            path: The location of pulled readthedocs folder.
            encoding: The encoding with which to open the documents.
            errors: Specify how encoding and decoding errors are to be handled—this
                cannot be used in binary mode.
            custom_html_tag: Optional custom html tag to retrieve the content from
                files.
            patterns: The file patterns to load, passed to `glob.rglob`.
            exclude_links_ratio: The ratio of links:content to exclude pages from.
                This is to reduce the frequency at which index pages make their
                way into retrieved results. Recommended: 0.5
            kwargs: named arguments passed to `bs4.BeautifulSoup`.
        r   BeautifulSoupzWCould not import python packages. Please install it with `pip install beautifulsoup4`. 6<html><body>Parser builder library test.</body></html>html.parserz"Parsing kwargs do not appear validN)r(   r)   )bs4r'   ImportError	Exception
ValueErrorr   	file_pathr   r   r   r   	bs_kwargsr!   )selfr   r   r   r   r   r!   r#   r'   _es              l/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/readthedocs.py__init__zReadTheDocsLoader.__init__   s    D	))))))) 	 	 	H  		JH   AA
  	J 	J 	JABBI	J d . #6   s   	 #
2 
AAAreturnIterator[Document]c           	   #  |  K   | j         D ]}| j                            |          D ]}|                                rt	          || j        | j                  5 }|                     |                                          }ddd           n# 1 swxY w Y   t          |dt          |          i          V  dS )zA lazy loader for Documents.)r   r   Nsource)page_contentmetadata)r   r.   rglobis_diropenr   r   _clean_datareadr   str)r0   file_patternpftexts        r3   	lazy_loadzReadTheDocsLoader.lazy_loadM   s      M 	O 	OL^)),77 O O88:: !dmDKHHH 6A++AFFHH55D6 6 6 6 6 6 6 6 6 6 6 6 6 6 6DHc!ff;MNNNNNNNO	O 	Os   (BBBdatar@   c                   ddl m}  ||dfi | j        }dddifdddifg}| j        |                    | j                   d }|d d d	         D ]\  }}|                    ||          }| n |(t          |          | j        k    rt          |          }nd
}d	                    d |
                    d          D                       S )Nr   r&   r)   divrolemainidzmain-content 
c                    g | ]}||S  rP   ).0ts     r3   
<listcomp>z1ReadTheDocsLoader._clean_data.<locals>.<listcomp>s   s    ;;;;!;;;    )r*   r'   r/   r   appendfind_get_link_ratior!   _get_clean_textjoinsplit)	r0   rF   r'   soup	html_tagselementtagattrsrD   s	            r3   r>   zReadTheDocsLoader._clean_dataW   s   %%%%%%}T=CCDNCC VV$%dN+,
	
 +T1222 $DDbD/ 	 	JCiiU++G" # ?7#;#;t?W#W#W"7++DDDyy;;TZZ%5%5;;;<<<rT   )NNNr   r   )r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   )r5   r6   )rF   r@   r5   r@   )__name__
__module____qualname____doc__r4   rE   r>   rP   rT   r3   r   r      sq        55
 #' $6:"5%(97 97 97 97 97vO O O O= = = = = =rT   r   r]   r   r5   r@   c                \    g d}g d}t          | ||          }|                                S )zMReturns cleaned text with newlines preserved and irrelevant elements removed.)scriptnoscriptcanvasmetasvgmapareaaudior8   trackvideoembedobjectparampictureiframeframeframesetnoframesappletformbuttonselectbasestyleimg)rB   rH   ulollih1h2h3h4h5h6pretabletr)_process_elementstrip)r]   elements_to_skipnewline_elementsrD   s       r3   rX   rX   v   sI      :  " G%57GHHD::<<rT   sectionr"   c                   |                      d          }d                    d | j        D                       }t          |          dk    rdS d                    d |D                       }t          |          t          |          z  S )NarM   c              3  4   K   | ]}t          |          V  d S N)r@   )rQ   ss     r3   	<genexpr>z"_get_link_ratio.<locals>.<genexpr>   s(      BBAQBBBBBBrT   r   c              3  z   K   | ]6}|j         D ],}|t          |j                                                  V  -7d S r   )stringsr@   stringr   )rQ   linkr   s      r3   r   z"_get_link_ratio.<locals>.<genexpr>   sk        l  	FM!!""      rT   )find_allrY   stripped_stringslen)r   links
total_text	link_texts       r3   rW   rW      s    S!!EBB)ABBBBBJ
:!q      I y>>C
OO++rT   $Union[Tag, NavigableString, Comment]r   	List[str]r   c                `   ddl m ddlmm t          | dd          }t          |           s|v rdS t          |           r| S |dk    rdS |v r-d                    fd	| j        D                       dz   S d                    fd
| j        D                       S )zq
    Traverse through HTML tree recursively to preserve newline and skip
    unwanted (code/binary) elements
    r   r   r   nameNrM   brrN   c              3  `   K   | ](}t          |f          t          |          V  )d S r   
isinstancer   rQ   childr   r   r   r   r   s     r3   r   z#_process_element.<locals>.<genexpr>   s\        ec?G%DEE (8:JKK     rT   c              3  `   K   | ](}t          |f          t          |          V  )d S r   r   r   s     r3   r   z#_process_element.<locals>.<genexpr>   s\       
 
%#!@AA
U$46FGG
 
 
 
 
 
rT   )	r*   r   bs4.elementr   r   getattrr   rY   children)r]   r   r   tag_namer   r   r   s    `` @@@r3   r   r      sK    $#####((((((((w--H'7## 
x3C'C'Cr	G_	-	- 
	T		t	%	%	%GG        $-    
 	
 ww 
 
 
 
 
 
 
 
 )
 
 
 
 
 	
rT   N)r]   r   r5   r@   )r   r   r5   r"   )r]   r   r   r   r   r   r5   r@   )
__future__r   pathlibr   typingr   r   r   r   r	   r
   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   r*   r   r   r   r   r   rX   rW   r   rP   rT   r3   <module>r      s5   " " " " " "       W W W W W W W W W W W W W W W W W W W W - - - - - - @ @ @ @ @ @ )######((((((((d= d= d= d= d=
 d= d= d=N1 1 1 1h, , , ,!
 !
 !
 !
 !
 !
rT   