
    Ng                     v    d Z ddlZddlmZmZ ddlmZ ddlmZ  ej	        e
          Z G d de          ZdS )z1Loader that uses unstructured to load HTML files.    N)AnyList)Document)
BaseLoaderc                       e Zd ZdZ	 	 	 ddee         dededed	ef
d
ZdeddfdZ	defdZ
defdZdefdZdee         fdZdS )UnstructuredURLLoadera  Load files from remote URLs using `Unstructured`.

    Use the unstructured partition function to detect the MIME type
    and route the file to the appropriate partitioner.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and NarrativeText.
    You can pass in additional unstructured kwargs after mode to apply
    different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredURLLoader

    loader = UnstructuredURLLoader(
        urls=["<url-1>", "<url-2>"], mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition
    TsingleFurlscontinue_on_failuremodeshow_progress_barunstructured_kwargsc                    	 ddl }ddlm} || _        n# t          $ r t	          d          w xY w|                     |           || _        |                    di           }t          |	                                          dk    r]d}	| 
                                r|                                  }	n|                                  }	|	rt                              d           || _        || _        || _        || _        || _        dS )zInitialize with file path.r   N)__version__zQunstructured package not found, please install it with `pip install unstructured`headersFzNYou are using an old version of unstructured. The headers parameter is ignored)unstructuredunstructured.__version__r   _UnstructuredURLLoader__versionImportError_validate_moder   poplenkeys-_UnstructuredURLLoader__is_non_html_available9_UnstructuredURLLoader__is_headers_available_for_non_html5_UnstructuredURLLoader__is_headers_available_for_htmlloggerwarningr
   r   r   r   r   )
selfr
   r   r   r   r   r   __unstructured_version__r   warn_about_headerss
             d/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/url.py__init__zUnstructuredURLLoader.__init__(   sF   		XXXXXX5DNN 	 	 	-  	 	D!!!	%)))R88w||~~!##!&++-- P)-)Q)Q)S)S%S"")-)M)M)O)O%O"! 7  
 	#6 #6 !2s    .returnNc                 B    ddh}||vrt          d| d| d          d S )Nr	   elementszGot z# for `mode`, but should be one of ``)
ValueError)r   r   _valid_modess      r"   r   z$UnstructuredURLLoader._validate_modeS   sG     *-|##OtOOOOO   $#    c                     | j                             d          d         }t          d |                    d          D                       }|dk    S )N-r   c                 ,    g | ]}t          |          S  int.0xs     r"   
<listcomp>zIUnstructuredURLLoader.__is_headers_available_for_html.<locals>.<listcomp>\       %W%W%Wc!ff%W%W%Wr*   .)r         r   splittupler   _unstructured_versionunstructured_versions      r"   __is_headers_available_for_htmlz5UnstructuredURLLoader.__is_headers_available_for_htmlZ   sU     $ 4 4S 9 9! <$%W%W6K6Q6QRU6V6V%W%W%WXX#y00r*   c                     | j                             d          d         }t          d |                    d          D                       }|dk    S )Nr,   r   c                 ,    g | ]}t          |          S r.   r/   r1   s     r"   r4   zMUnstructuredURLLoader.__is_headers_available_for_non_html.<locals>.<listcomp>b   r5   r*   r6   )r   r7      r9   r<   s      r"   #__is_headers_available_for_non_htmlz9UnstructuredURLLoader.__is_headers_available_for_non_html`   U     $ 4 4S 9 9! <$%W%W6K6Q6QRU6V6V%W%W%WXX#z11r*   c                     | j                             d          d         }t          d |                    d          D                       }|dk    S )Nr,   r   c                 ,    g | ]}t          |          S r.   r/   r1   s     r"   r4   zAUnstructuredURLLoader.__is_non_html_available.<locals>.<listcomp>h   r5   r*   r6   )r   r7      r9   r<   s      r"   __is_non_html_availablez-UnstructuredURLLoader.__is_non_html_availablef   rD   r*   c           	         ddl m} ddlm} t	                      }| j        r;	 ddlm} n"# t          $ r}t          d          |d}~ww xY w || j                  }n| j        }|D ]}	 | 	                                r:| 
                                r |d|| j        d| j        }nI |dd|i| j        }n9|                                 r |d|| j        d| j        }n |dd|i| j        }n@# t          $ r3}| j        r%t                               d	| d
|            Y d}~|d}~ww xY w| j        dk    rId                    d |D                       }	d|i}
|                    t+          |	|
                      | j        dk    rY|D ]V}|j                                        }
|j        |
d<   |                    t+          t3          |          |
                     W|S )z
Load file.r   )	partition)partition_html)tqdmzPackage tqdm must be installed if show_progress_bar=True. Please install with 'pip install tqdm' or set show_progress_bar=False.N)urlr   rM   zError fetching or processing z, exception: r	   z

c                 ,    g | ]}t          |          S r.   )str)r2   els     r"   r4   z.UnstructuredURLLoader.load.<locals>.<listcomp>   s    #?#?#?CGG#?#?#?r*   source)page_contentmetadatar&   categoryr.   )unstructured.partition.autorJ   unstructured.partition.htmlrK   listr   rL   r   r
   r   r   r   r   r   	Exceptionr   r   errorr   joinappendr   rS   to_dictrT   rO   )r   rJ   rK   docsrL   er
   rM   r&   textrS   elements               r"   loadzUnstructuredURLLoader.loadl   s   999999>>>>>>#vv! 	%%%%%%%   !/  	 4	??DD9D 	X 	XC//11 W??AA R#,9 $ #T\$ $=A=U$ $ $-9#Q#Q#Q8P#Q#Q;;== W#1> $ #T\$ $=A=U$ $ $2>#V#Vc#VT=U#V#V   + LL!V!V!VST!V!VWWWHHHHG yH$${{#?#?h#?#?#?@@$c?H$JJJKKKKj((' X XG&/7799H+2+;HZ(KKc'llX V V VWWWWs3   * 
A	AA	*BC22
D/<'D*(D**D/)Tr	   F)__name__
__module____qualname____doc__r   rO   boolr   r#   r   r   r   r   r   ra   r.   r*   r"   r   r      s	        : %)"')3 )33i)3 ")3 	)3
  )3  #)3 )3 )3 )3V3 4    1 1 1 1 12T 2 2 2 22 2 2 2 25d8n 5 5 5 5 5 5r*   r   )re   loggingtypingr   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   	getLoggerrb   r   r   r.   r*   r"   <module>rl      s    7 7          - - - - - - @ @ @ @ @ @		8	$	$T T T T TJ T T T T Tr*   