
    Ng~                         d dl Z d dlZd dlmZmZmZmZ d dlmZ d dl	m
Z
 d dlmZ  ej        e          Z G d de
          ZdS )    N)AsyncIteratorIteratorListOptional)Document)
BaseLoader)get_user_agentc                       e Zd ZdZddddee         dedee         fdZd	ed
efdZ	d
e
e         fdZd
ee         fdZdS )AsyncChromiumLoaderzJScrape HTML pages from URLs using a
    headless instance of the Chromium.TN)headless
user_agenturlsr   r   c                    || _         || _        |pt                      | _        	 ddl}dS # t
          $ r t          d          w xY w)a^  Initialize the loader with a list of URL paths.

        Args:
            urls: A list of URLs to scrape content from.
            headless: Whether to run browser in headless mode.
            user_agent: The user agent to use for the browser

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        r   Nz`playwright is required for AsyncChromiumLoader. Please install it with `pip install playwright`.)r   r   r	   r   
playwrightImportError)selfr   r   r   r   s        i/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/chromium.py__init__zAsyncChromiumLoader.__init__   sn    " 	 $8(8(8	 	 	 	C  	s	   + Aurlreturnc                 R  K   ddl m} t                              d           d} |            4 d{V }|j                            | j                   d{V }	 |                    | j                   d{V }|	                    |           d{V  |
                                 d{V }t                              d           n# t          $ r}d	| }Y d}~nd}~ww xY w|                                 d{V  ddd          d{V  n# 1 d{V swxY w Y   |S )
a  
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.

        r   )async_playwrightzStarting scraping... N)r   )r   zContent scrapedzError: )playwright.async_apir   loggerinfochromiumlaunchr   new_pager   gotocontent	Exceptionclose)r   r   r   resultspbrowserpagees           r   ascrape_playwrightz&AsyncChromiumLoader.ascrape_playwright.   s      	:99999*+++##%% 		" 		" 		" 		" 		" 		" 		"J--t}-EEEEEEEEG($---IIIIIIIIiinn$$$$$$$ $......-.... ( ( ('A--(--//!!!!!!!		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" 		" s<   'DA0CD
C'C"D"C''D
D #D c              #      K   | j         D ]@}t          j        |                     |                    }d|i}t	          ||          V  AdS )a-  
        Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting to scrape all URLs before returning.

        Yields:
            Document: The scraped content encapsulated within a Document object.

        sourcepage_contentmetadataN)r   asynciorunr)   r   )r   r   html_contentr.   s       r   	lazy_loadzAsyncChromiumLoader.lazy_loadI   si       9 	I 	IC";t'>'>s'C'CDDL #HxHHHHHHH	I 	I    c                    K    fd j         D             }t          j        |  d{V }t           j         |          D ]\  }}d|i}t	          ||          W V  dS )a  
        Asynchronously load text content from the provided URLs.

        This method leverages asyncio to initiate the scraping of all provided URLs
        simultaneously. It improves performance by utilizing concurrent asynchronous
        requests. Each Document is yielded as soon as its content is available,
        encapsulating the scraped content.

        Yields:
            Document: A Document object containing the scraped content, along with its
            source URL as metadata.
        c                 :    g | ]}                     |          S  )r)   ).0r   r   s     r   
<listcomp>z2AsyncChromiumLoader.alazy_load.<locals>.<listcomp>f   s'    CCC#((--CCCr3   Nr+   r,   )r   r/   gatherzipr   )r   tasksr$   r   r!   r.   s   `     r   
alazy_loadzAsyncChromiumLoader.alazy_loadY   s       DCCCCCC.......	733 	D 	DLC #H(CCCCCCCC	D 	Dr3   )__name__
__module____qualname____doc__r   strboolr   r   r)   r   r   r2   r   r<   r6   r3   r   r   r      s        * * $(  3i 	
 SM   :C C    6I8H- I I I I D-"9 D D D D D Dr3   r   )r/   loggingtypingr   r   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   $langchain_community.utils.user_agentr	   	getLoggerr=   r   r   r6   r3   r   <module>rI      s      : : : : : : : : : : : : - - - - - - @ @ @ @ @ @ ? ? ? ? ? ?		8	$	$]D ]D ]D ]D ]D* ]D ]D ]D ]D ]Dr3   