
    Ng                         d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z	 d dl
mZ  e j        e          Z G d de          ZdS )    N)Path)IteratorOptionalSequenceUnion)Document)
BaseLoaderc                       e Zd ZdZ	 	 	 	 ddeeef         dee         deee	                  d	ee
         d
ee
         f
dZd ZdefdZdee         fdZdS )MWDumpLoadera  Load `MediaWiki` dump from an `XML` file.

    Example:
        .. code-block:: python

            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.document_loaders import MWDumpLoader

            loader = MWDumpLoader(
                file_path="myWiki.xml",
                encoding="utf8"
            )
            docs = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            texts = text_splitter.split_documents(docs)


    :param file_path: XML local file path
    :type file_path: str
    :param encoding: Charset encoding, defaults to "utf8"
    :type encoding: str, optional
    :param namespaces: The namespace of pages you want to parse.
        See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
        for a list of all common namespaces
    :type namespaces: List[int],optional
    :param skip_redirects: TR=rue to skip pages that redirect to other pages,
        False to keep them. False by default
    :type skip_redirects: bool, optional
    :param stop_on_error: False to skip over pages that cause parsing errors,
        True to stop. True by default
    :type stop_on_error: bool, optional
    utf8NFT	file_pathencoding
namespacesskip_redirectsstop_on_errorc                     t          |t                    r|nt          |          | _        || _        || _        || _        || _        d S )N)
isinstancestrr   r   r   r   r   )selfr   r   r   r   r   s         n/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/mediawikidump.py__init__zMWDumpLoader.__init__0   sH     '1C&@&@Tc)nn $,*    c                     	 dd l }n"# t          $ r}t          d          |d }~ww xY w|j                            t	          | j        | j                            S )Nr   zBUnable to import 'mwxml'. Please install with `pip install mwxml`.)r   )mwxmlImportErrorDump	from_fileopenr   r   )r   r   es      r   _load_dump_filezMWDumpLoader._load_dump_file?   sr    	LLLL 	 	 	W 	
 z##D$-$P$P$PQQQ    
&!&returnc                     	 ddl }n"# t          $ r}t          d          |d}~ww xY w|D ]O}|                    |j                  }|                    ddd          }d|j        i}t          ||          c S dS )	zParse a single page.r   NzXUnable to import 'mwparserfromhell'. Please install with `pip install mwparserfromhell`.TF)	normalizecollapsekeep_template_paramssource)page_contentmetadata)mwparserfromhellr   parsetext
strip_codetitler   )r   pager*   r   revisioncoder,   r)   s           r   _load_single_page_from_dumpz(MWDumpLoader._load_single_page_from_dumpI   s    	##### 	 	 	3  	
  	B 	BH#))(-88D??E #  D !$*-HAAAAAA	B 	Br!   c              #   Z  K   |                                  }|j        D ]}| j        r|j        r| j        r|j        | j        vr'	 |                     |          V  @# t          $ r@}t          	                    d
                    |                     | j        r|Y d}~d}~ww xY wdS )zLazy load from a file path.zParsing error: {}N)r    pagesr   redirectr   	namespacer2   	Exceptionloggererrorformatr   )r   dumpr/   r   s       r   	lazy_loadzMWDumpLoader.lazy_loadZ   s      
 ##%%J 	 	D" t}  4>#H#H66t<<<<<<   077::;;;% GHHHH	 	s   A
B((6B##B()r   NFT)__name__
__module____qualname____doc__r   r   r   r   r   intboolr   r    r   r2   r   r<    r   r   r   r      s        ! !L #).2).(,+ +d#+ 3-+ Xc]+	+
 !+  ~+ + + +R R RB8 B B B B"	(	     r   r   )loggingpathlibr   typingr   r   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser	   	getLoggerr=   r8   r   rC   r   r   <module>rJ      s           6 6 6 6 6 6 6 6 6 6 6 6 - - - - - - @ @ @ @ @ @		8	$	$a a a a a: a a a a ar   