
    NgL!                     
   d Z ddlZddlmZmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZ ddlmZ erddlmZ ddlmZ dd	lmZ dd
lmZmZmZ  ej        e          Z G d de          Z G d de          Z G d de          ZdS )zQLoader that uses Playwright to load a page, then uses unstructured to parse html.    N)ABCabstractmethod)TYPE_CHECKINGAsyncIteratorDictIteratorListOptional)Document)
BaseLoader)Browser)Page)Response)r   r   r   c            	       b    e Zd ZdZedddddddefd	            Zedd
dddddefd            ZdS )PlaywrightEvaluatorzAbstract base class for all evaluators.

    Each evaluator should take a page, a browser instance, and a response
    object, process the page as necessary, and return the resulting text.
    pager   browserr   responser   returnc                     dS )a  Synchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        N selfr   r   r   s       o/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/url_playwright.pyevaluatezPlaywrightEvaluator.evaluate   s	     	    	AsyncPageAsyncBrowserAsyncResponsec                 
   K   dS )a  Asynchronously process the page and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            text: The text content of the page.
        Nr   r   s       r   evaluate_asyncz"PlaywrightEvaluator.evaluate_async*   s       	r   N)__name__
__module____qualname____doc__r   strr   r!   r   r   r   r   r      s          V i : RU    ^ *8DS	   ^  r   r   c                   h    e Zd ZdZddeee                  fdZddddd	d
defdZddddd	ddefdZ	dS )UnstructuredHtmlEvaluatorz@Evaluate the page HTML content using the `unstructured` library.Nremove_selectorsc                 Z    	 ddl }n# t          $ r t          d          w xY w|| _        dS )z%Initialize UnstructuredHtmlEvaluator.r   NzQunstructured package not found, please install it with `pip install unstructured`)unstructuredImportErrorr)   )r   r)   r+   s      r   __init__z"UnstructuredHtmlEvaluator.__init__>   sX    	 	 	 	-  	 !1    !r   r   r   r   r   r   r   c                 N   ddl m} | j        pg D ]W}|                    |                                          }|D ]+}|                                r|                    d           ,X|                                } ||          }d                    d |D                       S )z3Synchronously process the HTML content of the page.r   partition_htmlelement => element.remove()text

c                 ,    g | ]}t          |          S r   r&   .0els     r   
<listcomp>z6UnstructuredHtmlEvaluator.evaluate.<locals>.<listcomp>V       777CGG777r   	unstructured.partition.htmlr1   r)   locatorall
is_visibler   contentjoin	r   r   r   r   r1   selectorelementselementpage_sources	            r   r   z"UnstructuredHtmlEvaluator.evaluateJ   s    >>>>>>-3 	D 	DH||H--1133H# D D%%'' D$$%BCCCD llnn!>{333{{77h777888r   r   r   r   c                   K   ddl m} | j        pg D ]i}|                    |                                           d{V }|D ]7}|                                 d{V r|                    d           d{V  8j|                                 d{V } ||          }d                    d |D                       S )z4Asynchronously process the HTML content of the page.r   r0   Nr2   r3   r5   c                 ,    g | ]}t          |          S r   r7   r8   s     r   r;   z<UnstructuredHtmlEvaluator.evaluate_async.<locals>.<listcomp>f   r<   r   r=   rD   s	            r   r!   z(UnstructuredHtmlEvaluator.evaluate_asyncX   s      	?>>>>>-3 	J 	JH!\\(337799999999H# J J ++-------- J!**+HIIIIIIIIIJ !LLNN******!>{333{{77h777888r   N)
r"   r#   r$   r%   r
   r	   r&   r-   r   r!   r   r   r   r(   r(   ;   s        JJ
1 
1$s))< 
1 
1 
1 
19V 9i 9: 9RU 9 9 9 999*89DS9	9 9 9 9 9 9r   r(   c                       e Zd ZdZ	 	 	 	 	 ddee         dededeee                  dee         d	ee	eef                  fd
Z
dee         fdZdee         fdZdee         fdZdS )PlaywrightURLLoadera  Load `HTML` pages with `Playwright` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
            through the specified proxy.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import PlaywrightURLLoader

            urls = ["https://api.ipify.org/?format=json",]
            proxy={
                "server": "https://xx.xx.xx:15818", # https://<host>:<port>
                "username": "username",
                "password": "password"
            }
            loader = PlaywrightURLLoader(urls, proxy=proxy)
            data = loader.load()
    TNurlscontinue_on_failureheadlessr)   	evaluatorproxyc                     	 ddl }n# t          $ r t          d          w xY w|| _        || _        || _        || _        |r|rt          d          |pt          |          | _        dS )z%Load a list of URLs using Playwright.r   NzMplaywright package not found, please install it with `pip install playwright`z:`remove_selectors` and `evaluator` cannot be both not None)	
playwrightr,   rN   rO   rP   rR   
ValueErrorr(   rQ   )r   rN   rO   rP   r)   rQ   rR   rT   s           r   r-   zPlaywrightURLLoader.__init__   s    	 	 	 	+  	 	#6  
 		 	L  
 #Q&?@P&Q&Qr.   r   c           	   #   H  K   ddl m}  |            5 }|j                            | j        | j                  }| j        D ]}	 |                                }|                    |          }|t          d|           | j
                            |||          }d|i}t          ||          V  t# t          $ r4}	| j        r!t                              d| d	|	            n|	Y d}	~	d}	~	ww xY w|                                 ddd           dS # 1 swxY w Y   dS )
zLoad the specified URLs using Playwright and create Document instances.

        Returns:
            A list of Document instances with loaded content.
        r   )sync_playwrightrP   rR   N"page.goto() returned None for url sourcepage_contentmetadataError fetching or processing , exception: )playwright.sync_apirW   chromiumlaunchrP   rR   rN   new_pagegotorU   rQ   r   r   	ExceptionrO   loggererrorclose)
r   rW   pr   urlr   r   r4   r]   es
             r   	lazy_loadzPlaywrightURLLoader.lazy_load   s      	877777_ 	!j''dj'QQGy     "++--D#yy~~H'()Sc)S)STTT>224(KKD (#H"xHHHHHHH       /  QCQQaQQ     	      MMOOO'	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s<   0DA0B54D5
C3?*C.)D.C33DDDc                 N   K   d |                                  2              d{V S )Load the specified URLs with Playwright and create Documents asynchronously.
        Use this function when in a jupyter notebook environment.

        Returns:
            A list of Document instances with loaded content.
        c                 "   K   g | 3 d {V }|
6 S rK   r   )r9   docs     r   r;   z-PlaywrightURLLoader.aload.<locals>.<listcomp>   s.      77777777c7777s   N)
alazy_load)r   s    r   aloadzPlaywrightURLLoader.aload   s8       87T__%6%67777777777r   c           	       K   ddl m}  |            4 d{V }|j                            | j        | j                   d{V }| j        D ]}	 |                                 d{V }|                    |           d{V }|t          d|           | j
                            |||           d{V }d|i}t          ||          W V  # t          $ r4}	| j        r!t                              d| d	|	            n|	Y d}	~	d}	~	ww xY w|                                 d{V  ddd          d{V  dS # 1 d{V swxY w Y   dS )
rn   r   )async_playwrightNrX   rY   rZ   r[   r^   r_   )playwright.async_apirt   ra   rb   rP   rR   rN   rc   rd   rU   rQ   r!   r   re   rO   rf   rg   rh   )
r   rt   ri   r   rj   r   r   r4   r]   rk   s
             r   rq   zPlaywrightURLLoader.alazy_load   sx      	:99999##%% 	" 	" 	" 	" 	" 	" 	"J--t}DJ-WWWWWWWWGy     !(!1!1!3!3333333D%)YYs^^333333H'()Sc)S)STTT!%!>!>tWh!W!WWWWWWWD (#H"xHHHHHHHH       /  QCQQaQQ     	      --//!!!!!!!'	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	"s<   6EBCE
D*DEDE
EE)TTNNN)r"   r#   r$   r%   r	   r&   boolr
   r   r   r-   r   r   rl   rr   r   rq   r   r   r   rM   rM   i   s        : %)0437*.R R3iR "R 	R
 #49-R /0R S#X'R R R R>8H-    :8T(^ 8 8 8 8"-"9 " " " " " "r   rM   )r%   loggingabcr   r   typingr   r   r   r   r	   r
   langchain_core.documentsr   )langchain_community.document_loaders.baser   ru   r   r   r   r   r   r   r`   	getLoggerr"   rf   r   r(   rM   r   r   r   <module>r}      s   W W  # # # # # # # # O O O O O O O O O O O O O O O O - - - - - - @ @ @ @ @ @ <<<<<<<666666>>>>>>;;;;;;;;;; 
	8	$	$# # # # ## # # #L+9 +9 +9 +9 +9 3 +9 +9 +9\|" |" |" |" |"* |" |" |" |" |"r   