
    Ng2                         d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ  ej        e          Z e            ddd	d
dd
dZdededefdZ G d de          ZdS )zWeb base loader class.    N)AnyDictIteratorListOptionalSequenceUnion)Document)
BaseLoader)get_user_agentzJtext/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8zen-US,en;q=0.5zhttps://www.google.com/1z
keep-alive)
User-AgentAcceptzAccept-LanguageRefererDNT
ConnectionzUpgrade-Insecure-Requestssoupurlreturnc                 2   d|i}|                      d          x}r|                                |d<   |                      dddi          x}r|                    dd          |d<   |                      d	          x}r|                    d
d          |d<   |S )z)Build metadata from BeautifulSoup output.sourcetitlemetanamedescription)attrscontentzNo description found.htmllangzNo language found.language)findget_textget)r   r   metadatar   r   r   s         i/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/web_base.py_build_metadatar&      s    #H		'"""u -!NN,,iiv}.EiFFF{ V"-//)=T"U"Uyy   t F#xx0DEEO    c            $          e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d/dd	d
eeee         f         dee         de	dee         de	de	dee         dee         de
dedeeeef                  de	deeeef                  deeeef                  dede	ddf"dZedefd            Z	 d0ded e
d!e
d"edef
d#Zded$ej        defd%Zd&ee         defd'Zed(eddfd)            Zd1d&ee         d(eedf         dee         fd*Z	 	 d2ded(eedf         dee         defd+Zd1d(eedf         defd,Zdee         fd-Zdee         fd.ZdS )3WebBaseLoadera  
    WebBaseLoader document loader integration

    Setup:
        Install ``langchain_community``.

        .. code-block:: bash

            pip install -U langchain_community

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import WebBaseLoader

            loader = WebBaseLoader(
                web_path = "https://www.espn.com/",
                # header_template = None,
                # verify_ssl = True,
                # proxies = None,
                # continue_on_failure = False,
                # autoset_encoding = True,
                # encoding = None,
                # web_paths = (),
                # requests_per_second = 2,
                # default_parser = "html.parser",
                # requests_kwargs = None,
                # raise_for_status = False,
                # bs_get_text_kwargs = None,
                # bs_kwargs = None,
                # session = None,
                # show_progress = True,
            )
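
    Instantiate with request and parsing options (an illustrative sketch; the
    keyword values shown here are examples, not library defaults):
        .. code-block:: python

            loader = WebBaseLoader(
                web_paths=["https://www.espn.com/"],
                # passed through to requests.Session.get by the sync loading path
                requests_kwargs={"timeout": 10},
                # passed to BeautifulSoup.get_text when extracting page text
                bs_get_text_kwargs={"separator": " ", "strip": True},
            )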

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}


    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            ESPN - Serving Sports Fans. Anytime. Anywhere.

            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
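
    Load multiple pages (a usage sketch; the second URL is only illustrative):
        .. code-block:: python

            loader = WebBaseLoader(
                web_paths=[
                    "https://www.espn.com/",
                    "https://example.com/",
                ],
            )
            docs = loader.load()  # one Document per URL, fetched sequentially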

    """  # noqa: E501

    def __init__(
        self,
        web_path: Union[str, Sequence[str]] = "",
        header_template: Optional[dict] = None,
        verify_ssl: bool = True,
        proxies: Optional[dict] = None,
        continue_on_failure: bool = False,
        autoset_encoding: bool = True,
        encoding: Optional[str] = None,
        web_paths: Sequence[str] = (),
        requests_per_second: int = 2,
        default_parser: str = "html.parser",
        requests_kwargs: Optional[Dict[str, Any]] = None,
        raise_for_status: bool = False,
        bs_get_text_kwargs: Optional[Dict[str, Any]] = None,
        bs_kwargs: Optional[Dict[str, Any]] = None,
        session: Any = None,
        *,
        show_progress: bool = True,
    ) -> None:
        """Initialize loader.

        Args:
            web_paths: Web paths to load from.
            requests_per_second: Max number of concurrent requests to make.
            default_parser: Default parser to use for BeautifulSoup.
            requests_kwargs: kwargs for requests
            raise_for_status: Raise an exception if http status code denotes an error.
            bs_get_text_kwargs: kwargs for beautifulsoup4 get_text
            bs_kwargs: kwargs for beautifulsoup4 web page parsing
            show_progress: Show progress bar when loading pages.
        """
        # `web_path` is kept for backwards compatibility; `web_paths` is preferred.
        if web_path and web_paths:
            raise ValueError(
                "Received web_path and web_paths. Only one can be specified. "
                "web_path is deprecated, web_paths should be used."
            )
        if web_paths:
            self.web_paths = list(web_paths)
        elif isinstance(web_path, str):
            self.web_paths = [web_path]
        elif isinstance(web_path, Sequence):
            self.web_paths = list(web_path)
        else:
            raise TypeError(
                f"web_path must be str or Sequence[str] got ({type(web_path)}) or"
                f" web_paths must be Sequence[str] got ({type(web_paths)})"
            )
        self.requests_per_second = requests_per_second
        self.default_parser = default_parser
        self.requests_kwargs = requests_kwargs or {}
        self.raise_for_status = raise_for_status
        self.show_progress = show_progress
        self.bs_get_text_kwargs = bs_get_text_kwargs or {}
        self.bs_kwargs = bs_kwargs or {}
        if session:
            self.session = session
        else:
            # Build a requests session with browser-like headers by default.
            session = requests.Session()
            header_template = header_template or default_header_template.copy()
            if not header_template.get("User-Agent"):
                try:
                    from fake_useragent import UserAgent

                    header_template["User-Agent"] = UserAgent().random
                except ImportError:
                    logger.info(
                        "fake_useragent not found, using default user agent."
                        "To get a realistic header for requests, "
                        "`pip install fake_useragent`."
                    )
            session.headers = dict(header_template)
            session.verify = verify_ssl
            if proxies:
                session.proxies.update(proxies)
            self.session = session
        self.continue_on_failure = continue_on_failure
        self.autoset_encoding = autoset_encoding
        self.encoding = encoding

    @property
    def web_path(self) -> str:
        if len(self.web_paths) > 1:
            raise ValueError("Multiple webpaths found.")
        return self.web_paths[0]

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        async with aiohttp.ClientSession() as session:
            for i in range(retries):
                try:
                    # Reuse the configured headers/cookies from the requests session.
                    kwargs: Dict = dict(
                        headers=self.session.headers,
                        cookies=self.session.cookies.get_dict(),
                    )
                    if not self.session.verify:
                        kwargs["ssl"] = False
                    async with session.get(url, **kwargs) as response:
                        if self.raise_for_status:
                            response.raise_for_status()
                        return await response.text()
                except aiohttp.ClientConnectionError as e:
                    if i == retries - 1:
                        raise
                    else:
                        logger.warning(
                            f"Error fetching {url} with attempt "
                            f"{i + 1}/{retries}: {e}. Retrying..."
                        )
                        await asyncio.sleep(cooldown * backoff**i)
        raise ValueError("retry count exceeded")

    async def _fetch_with_rate_limit(
        self, url: str, semaphore: asyncio.Semaphore
    ) -> str:
        async with semaphore:
            try:
                return await self._fetch(url)
            except Exception as e:
                if self.continue_on_failure:
                    logger.warning(
                        f"Error fetching {url}, skipping due to"
                        f" continue_on_failure=True"
                    )
                    return ""
                logger.exception(
                    f"Error fetching {url} and aborting, use continue_on_failure=True "
                    "to continue loading urls after encountering an error."
                )
                raise e

    async def fetch_all(self, urls: List[str]) -> Any:
        """Fetch all urls concurrently with rate limiting."""
        semaphore = asyncio.Semaphore(self.requests_per_second)
        tasks = []
        for url in urls:
            task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
            tasks.append(task)
        try:
            if self.show_progress:
                from tqdm.asyncio import tqdm_asyncio

                return await tqdm_asyncio.gather(
                    *tasks, desc="Fetching pages", ascii=True, mininterval=1
                )
            else:
                return await asyncio.gather(*tasks)
        except ImportError:
            warnings.warn("For better logging of progress, `pip install tqdm`")
            return await asyncio.gather(*tasks)

    @staticmethod
    def _check_parser(parser: str) -> None:
        """Check that parser is valid for bs4."""
        valid_parsers = ["html.parser", "lxml", "xml", "lxml-xml", "html5lib"]
        if parser not in valid_parsers:
            raise ValueError(
                "`parser` must be one of " + ", ".join(valid_parsers) + "."
            )

    def scrape_all(
        self, urls: List[str], parser: Union[str, None] = None
    ) -> List[Any]:
        """Fetch all urls, then return soups for all results."""
        from bs4 import BeautifulSoup

        results = asyncio.run(self.fetch_all(urls))
        final_results = []
        for i, result in enumerate(results):
            url = urls[i]
            if parser is None:
                if url.endswith(".xml"):
                    parser = "xml"
                else:
                    parser = self.default_parser
                self._check_parser(parser)
            final_results.append(BeautifulSoup(result, parser, **self.bs_kwargs))

        return final_results

    def _scrape(
        self,
        url: str,
        parser: Union[str, None] = None,
        bs_kwargs: Optional[dict] = None,
    ) -> Any:
        from bs4 import BeautifulSoup

        if parser is None:
            if url.endswith(".xml"):
                parser = "xml"
            else:
                parser = self.default_parser

        self._check_parser(parser)

        html_doc = self.session.get(url, **self.requests_kwargs)
        if self.raise_for_status:
            html_doc.raise_for_status()

        if self.encoding is not None:
            html_doc.encoding = self.encoding
        elif self.autoset_encoding:
            html_doc.encoding = html_doc.apparent_encoding
        return BeautifulSoup(html_doc.text, parser, **(bs_kwargs or {}))

    def scrape(self, parser: Union[str, None] = None) -> Any:
        """Scrape data from webpage and return it in BeautifulSoup format."""
        if parser is None:
            parser = self.default_parser

        return self._scrape(self.web_path, parser=parser, bs_kwargs=self.bs_kwargs)

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path."""
        for path in self.web_paths:
            soup = self._scrape(path, bs_kwargs=self.bs_kwargs)
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = _build_metadata(soup, path)
            yield Document(page_content=text, metadata=metadata)

    def aload(self) -> List[Document]:
        """Load text from the urls in web_path async into Documents."""
        results = self.scrape_all(self.web_paths)
        docs = []
        for path, soup in zip(self.web_paths, results):
            text = soup.get_text(**self.bs_get_text_kwargs)
            metadata = _build_metadata(soup, path)
            docs.append(Document(page_content=text, metadata=metadata))

        return docs