
    Ng4#                        d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ  ej        e          Z e            ddd	d
dd
dZdededefdZ G d de          ZdS )    N)FutureThreadPoolExecutor)	AnyAsyncIteratorDictIteratorListOptionalTupleUnioncast)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageRefererDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                 2   d|i}|                      d          x}r|                                |d<   |                      dddi          x}r|                    dd          |d<   |                      d	          x}r|                    d
d          |d<   |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r"   s         k/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/async_html.py_build_metadatar*   &   s    #H		'"""u -!NN,,iiv}.EiFFF{ V"-//)=T"U"Uyy   t F#xx0DEEO    c                      e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d*ddddeeee         f         d	ee         d
ee	         dee         de	dee         dede
deeeef                  de	de	de	de	fdZdedefdZededdfd            Z	 d+dede
de
dedef
d Zded!ej        deeef         fd"Zd#ee         de	deeeef                  fd$Zd#ee         dee         fd%Zded&edefd'Zdee         fd(Zdee         fd)ZdS ),AsyncHtmlLoaderzLoad `HTML` asynchronously.NThtml.parser   F)preserve_order	trust_envweb_pathheader_template
verify_sslproxiesautoset_encodingencodingdefault_parserrequests_per_secondrequests_kwargsraise_for_statusignore_load_errorsr0   r1   c                |   t          |t                    r	|g| _        nt          |t                    r|| _        |pt          }|                    d          sD	 ddlm}  |            j        |d<   n*# t          $ r t                              d           Y nw xY wt          j                    | _        t          |          | j        _        || j        _        |r| j        j                            |           || _        || _        |	pi | _        |
| _        || _        || _        || _        || _        || _        dS )zInitialize with a webpage path.r   r   )	UserAgentzxfake_useragent not found, using default user agent.To get a realistic header for requests, `pip install fake_useragent`.N)
isinstancestr	web_pathsr	   default_header_templater'   fake_useragentr>   randomImportErrorloggerinforequestsSessionsessiondictheadersverifyr5   updater9   r8   r:   r;   r6   r7   r<   r0   r1   )selfr2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r0   r1   rL   r>   s                   r)   __init__zAsyncHtmlLoader.__init__5   sb   , h$$ 	&&ZDNN$'' 	&%DN!<%<{{<(( 
		444444(1	(:%%   4      '))#G}}( 	1L ''000#6 ,.4" 0 0 "4,"s   A3 3$BBr   r   c                     | j         rR	  | j        j        |fi | j        S # t          $ r,}t          j        t          |                     Y d }~d S d }~ww xY w | j        j        |fi | j        S N)r<   rJ   r'   r:   	Exceptionwarningswarnr@   )rO   r   es      r)   _fetch_valid_connection_docsz,AsyncHtmlLoader._fetch_valid_connection_docso   s    " 	't|'DDt/CDDD   c!ff%%%ttttt  t|<<t';<<<s   ! 
A!AAparserc                 f    g d}| |vr(t          dd                    |          z   dz             dS )z#Check that parser is valid for bs4.)r.   lxmlxmlzlxml-xmlhtml5libz`parser` must be one of z, .N)
ValueErrorjoin)rX   valid_parserss     r)   _check_parserzAsyncHtmlLoader._check_parsery   sN     ONN&&*TYY}-E-EEK   '&r+            ?retriescooldownbackoffc                 "  K   t          j        | j                  4 d {V }t          |          D ]}	 t	          d| j        j        | j        j                                        d| j	        }| j        j
        sd|d<    |j        |fi |4 d {V 	 }	 |                                 d {V }	n/# t          $ r" t                              d|            d}	Y nw xY w|	cd d d           d {V  c cd d d           d {V  S # 1 d {V swxY w Y   # t           j        t"          f$ r}
||dz
  k    r@| j        r9t                              d| d	| d
           Y d }
~
 d d d           d {V  dS ||dz
  k    r t                              d| d|dz    d| d|
 d	           t)          j        |||z  z             d {V  Y d }
~
d }
~
ww xY w	 d d d           d {V  n# 1 d {V swxY w Y   t-          d          )N)r1   )rL   cookiesFsslzFailed to decode content from     zError fetching z after z	 retries.z with attempt /z: z. Retrying...zretry count exceeded )aiohttpClientSessionr1   rangerK   rJ   rL   rh   get_dictr:   rM   r'   textUnicodeDecodeErrorrF   errorClientConnectionErrorTimeoutErrorr<   warningasynciosleepr^   )rO   r   rd   re   rf   rJ   ikwargsresponserr   rV   s              r)   _fetchzAsyncHtmlLoader._fetch   s      (4>BBB 	C 	C 	C 	C 	C 	C 	Cg7^^ C CC#' $ $ 4 $ 4 = = ? ?$ $ .$ $F
  <. .(-u*w{        	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ "&)1#8#8#8#8#8#8DD1 & & &"LL)O#)O)OPPP#%DDD&  $	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$  5|D C C CGaK''D,C''W'W'WW'W'W'WXXX!rrrr/	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C0 gk))Cc C C 1uC C'.C C23C C C   &mHwz,ABBBBBBBBBBBBBBC'C	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C 	C@ /000s   G1ADDB43D4)C 	DC 	 D$D6G1
DDDDG1G31G$G1<AGG1GG11
G;>G;	semaphorec                    K   |4 d {V  ||                      |           d {V fcd d d           d {V  S # 1 d {V swxY w Y   d S rR   )r}   )rO   r   r~   s      r)   _fetch_with_rate_limitz&AsyncHtmlLoader._fetch_with_rate_limit   s        	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/dkk#.........	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/ 	/s   ;
AAurlsc                   K   t          j         j                   fd|D             }	 ddlm} |r ||ddd          D ]}| d {V W V  d S |                    |ddd          D ]}| d {V W V  d S # t          $ r[ t          j        d           |rt          j	        |  d {V D ]}|W V  Y d S t          j        |          D ]}| d {V W V  Y d S w xY w)	Nc                 `    g | ]*}t          j                            |                    +S rm   )rx   create_taskr   ).0r   rO   r~   s     r)   
<listcomp>z3AsyncHtmlLoader._lazy_fetch_all.<locals>.<listcomp>   sD     
 
 
  ; ;C K KLL
 
 
r+   r   )tqdm_asynciozFetching pagesTrk   )descasciiminintervalz2For better logging of progress, `pip install tqdm`)
rx   	Semaphorer9   tqdm.asyncior   as_completedrE   rT   rU   gather)rO   r   r0   tasksr   taskresultr~   s   `      @r)   _lazy_fetch_allzAsyncHtmlLoader._lazy_fetch_all   s      %d&>??	
 
 
 
 

 
 
	%111111 	%(L 0!   % %D !%******$$$$$% %
 )55 0! 6   % %D !%******$$$$$% %  	% 	% 	%MNOOO %$+NE$::::::: ! !F LLLLL! ! ! $077 % %D $******$$$$$% % %	%s   %A> 'A> ><C#="C#"C#c                 R   K   d |                      |d          2              d{V S )z/Fetch all urls concurrently with rate limiting.c                 (   K   g | 3 d {V \  }}|6 S rR   rm   )r   _docs      r)   r   z-AsyncHtmlLoader.fetch_all.<locals>.<listcomp>   s2      IIIIIIIIfaIIIIs   TN)r   )rO   r   s     r)   	fetch_allzAsyncHtmlLoader.fetch_all   s<      II(<(<T4(H(HIIIIIIIIIIr+   rr   c                     ddl m} |                    d          rd}n| j        }|                     |            |||          }t          ||          }t          ||          S )Nr   )BeautifulSoupz.xmlr[   )page_contentr(   )bs4r   endswithr8   ra   r*   r   )rO   r   rr   r   rX   r   r(   s          r)   _to_documentzAsyncHtmlLoader._to_document   s~    %%%%%%<< 	)FF(F6"""}T6**"4--TH====r+   c              #   @  K   	 t          j                     t          d          5 }|                    t           j        |                     | j                            }|                                }ddd           n# 1 swxY w Y   n<# t          $ r/ t          j        |                     | j                            }Y nw xY wt          t          t          t                   |                    D ](\  }}|                     | j        |         |          V  )dS )+Lazy load text from the url(s) in web_path.rk   )max_workersN)rx   get_running_loopr   submitrunr   rA   r   RuntimeError	enumerater   r	   r@   r   )rO   executorfutureresultsrz   rr   s         r)   	lazy_loadzAsyncHtmlLoader.lazy_load   sV     	B$&&& $222 *h,4OOKNN4>22- - !--//* * * * * * * * * * * * * * *  	B 	B 	Bk$.."@"@AAGGG	B !d3i!9!9:: 	= 	=GAt##DN1$5t<<<<<<	= 	=s5   #B AB 4B  BB BB 6CCc                   K   |                      | j        | j                  2 3 d{V \  }}|                     ||          W V  $6 dS )r   N)r   rA   r0   r   )rO   r   rr   s      r)   
alazy_loadzAsyncHtmlLoader.alazy_load   s      #33ND/ 
  
 	/ 	/ 	/ 	/ 	/ 	/ 	/)#t ##C....... 
  
  
s   A)
NTNTNr.   r/   NFF)rb   r/   rc   )__name__
__module____qualname____doc__r   r@   r	   r
   rK   boolintr   r   rP   rW   staticmethodra   floatr}   rx   r   r   r   r   r   r   r   r   r   r   r   rm   r+   r)   r-   r-   2   s       %%
 +/%)"&!%"&+#$48!&#(8#  $8# 8# 8#T#Y'8# "$8# TN	8#
 $8# 8# 3-8# 8# !8# "$sCx.18# 8# !8# 8# 8# 8# 8# 8#t= = = = = = c d    \ OR#1 #1#1!$#147#1FK#1	#1 #1 #1 #1J//#*#4/	sCx/ / / /%I%/3%	uS#X	'% % % %<JDI J$s) J J J J
> 
>3 
>8 
> 
> 
> 
>=8H- = = = =(/-"9 / / / / / /r+   r-   ) rx   loggingrT   concurrent.futuresr   r   typingr   r   r   r   r	   r
   r   r   r   rn   rH   langchain_core.documentsr   )langchain_community.document_loaders.baser   $langchain_community.utils.user_agentr   	getLoggerr   rF   rB   r@   rK   r*   r-   rm   r+   r)   <module>r      s      9 9 9 9 9 9 9 9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
   - - - - - - @ @ @ @ @ @ ? ? ? ? ? ?		8	$	$ !.""'(!$	 	 	# 	C 	D 	 	 	 	B/ B/ B/ B/ B/j B/ B/ B/ B/ B/r+   