
    Ngb
                         d dl Z d dlZd dlmZ d dlmZmZmZ d dlm	Z	 d dl
mZ  ej        e          Z G d de          ZdS )    N)Path)DictIteratorUnion)Document)
BaseLoaderc                   ~    e Zd ZdZ	 	 	 ddeeef         deedf         deedf         deddf
d	Zde	e
         fd
ZdS )MHTMLLoaderz)Parse `MHTML` files with `BeautifulSoup`.N 	file_pathopen_encoding	bs_kwargsget_text_separatorreturnc                     	 ddl }n# t          $ r t          d          w xY w|| _        || _        |ddi}|| _        || _        dS )a  initialize with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: Path to file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
            get_text_separator: The separator to use when getting the text
                from the soup.
        r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`featureslxml)bs4ImportErrorr   r   r   r   )selfr   r   r   r   r   s         f/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/mhtml.py__init__zMHTMLLoader.__init__   sz    "	JJJJ 	 	 	/  	 #*#V,I""4s    !c              #     K   ddl m} t          | j        d| j                  5 }t          j        |                                          }|                                }t          |t                    s|g}|D ]}|                                dk    r|                    d                                          } ||fi | j        }|                    | j                  }|j        rt#          |j        j                  }	nd}	t#          | j                  |	d	}
t'          ||

          V   ddd           dS 	 ddd           dS # 1 swxY w Y   dS )z*Load MHTML document into document objects.r   )BeautifulSoupr)encodingz	text/htmlT)decoder   )sourcetitle)page_contentmetadataN)r   r   openr   r   emailmessage_from_stringreadget_payload
isinstancelistget_content_typer   r   get_textr   r   strstringr   )r   r   fmessagepartsparthtmlsouptextr   r!   s              r   	lazy_loadzMHTMLLoader.lazy_load0   s      	&%%%%%$.#0BCCC 	q/99G''))EeT** " 	  ((**k99++4+88??AAD(=@@@@D==)@AADz # #DJ$5 6 6 " #&dn"5"5!&= =H #xHHHHHH1	 	 	 	 	 	 	 	 :	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   DE?EEE)NNr   )__name__
__module____qualname____doc__r   r+   r   dictr   r   r   r4        r   r
   r
      s        33
 +/'+"$5 5d#5 S$Y'5 t$	5
  5 
5 5 5 5@8H-      r;   r
   )r#   loggingpathlibr   typingr   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   	getLoggerr5   loggerr
   r:   r;   r   <module>rC      s            ( ( ( ( ( ( ( ( ( ( - - - - - - @ @ @ @ @ @		8	$	$@ @ @ @ @* @ @ @ @ @r;   