
    Ng                         d dl Zd dlZd dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ  ej        e          Z G d de          ZdS )    N)Path)DictIteratorUnion)Document)
BaseLoaderc                   ~    e Zd ZdZ	 	 	 ddeeef         deedf         deedf         deddf
d	Zde	e
         fd
ZdS )BSHTMLLoaderaS  
    __ModuleName__ document loader integration

    Setup:
        Install ``langchain-community`` and ``bs4``.

        .. code-block:: bash

            pip install -U langchain-community bs4

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import BSHTMLLoader

            loader = BSHTMLLoader(
                file_path="./example_data/fake-content.html",
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python


            Test Title


            My First Heading
            My first paragraph.



            {'source': './example_data/fake-content.html', 'title': 'Test Title'}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python



            Test Title


            My First Heading
            My first paragraph.



            {'source': './example_data/fake-content.html', 'title': 'Test Title'}

    N 	file_pathopen_encoding	bs_kwargsget_text_separatorreturnc                     	 ddl }n# t          $ r t          d          w xY w|| _        || _        |2t          j                            d          st          d          ddi}|| _        || _        dS )a  initialize with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: The path to the file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
            get_text_separator: The separator to use when calling get_text on the soup.
        r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`lxmlzBy default BSHTMLLoader uses the 'lxml' package. Please either install it with `pip install -U lxml` or pass in init arg `bs_kwargs={'features': '...'}` to overwrite the default BeautifulSoup kwargs.features)	bs4ImportErrorr   r   	importlibutil	find_specr   r   )selfr   r   r   r   r   s         h/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/html_bs.py__init__zBSHTMLLoader.__init__S   s     	JJJJ 	 	 	/  	 #*>++F33 !,   $V,I""4s    !c              #   h  K   ddl m} t          | j        d| j                  5 } ||fi | j        }ddd           n# 1 swxY w Y   |                    | j                  }|j        rt          |j        j
                  }nd}t          | j                  |d}t          ||          V  dS )	z)Load HTML document into document objects.r   )BeautifulSoupr)encodingNr   )sourcetitle)page_contentmetadata)r   r   openr   r   r   get_textr   r!   strstringr   )r   r   fsouptextr!   r#   s          r   	lazy_loadzBSHTMLLoader.lazy_loady   s     %%%%%%$.#0BCCC 	6q =55dn55D	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 }}T455: 	
)**EEE $.))1
 1
 D8<<<<<<<<s   A  AA)NNr   )__name__
__module____qualname____doc__r   r&   r   dictr   r   r   r+        r   r
   r
      s        C CP +/'+"$$5 $5d#$5 S$Y'$5 t$	$5
  $5 
$5 $5 $5 $5L=8H- = = = = = =r2   r
   )importlib.utilr   loggingpathlibr   typingr   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   	getLoggerr,   loggerr
   r1   r2   r   <module>r;      s               ( ( ( ( ( ( ( ( ( ( - - - - - - @ @ @ @ @ @		8	$	$~= ~= ~= ~= ~=: ~= ~= ~= ~= ~=r2   