
    Ng                     f    d dl mZ d dlmZmZ d dlmZ d dlmZ d dl	m
Z
 dZ G d de
          Zd	S )
    )Path)IteratorUnion)urlparse)Document)
BaseLoaderzShttps://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=allc                       e Zd ZdZdddefdeeef         dedededef
d	Z	e
d
edefd            Ze
	 dd
edededefd            Zdee         fdZdS )LLMSherpaFileLoaderaD  Load Documents using `LLMSherpa`.

    LLMSherpaFileLoader use LayoutPDFReader, which is part of the LLMSherpa library.
    This tool is designed to parse PDFs while preserving their layout information,
    which is often lost when using most PDF to text parsers.

    Examples
    --------
    from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader

    loader = LLMSherpaFileLoader(
        "example.pdf",
        strategy="chunks",
        llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
    )
    docs = loader.load()
    Tchunks	file_pathnew_indent_parser	apply_ocrstrategyllmsherpa_api_urlc                 F   	 ddl }n# t          $ r t          d          w xY wg d}||vrt          d| d| d          |                     |          st          d|           |                     |||	          | _        || _        t          |          | _        dS )
zInitialize with a file path.r   NzKllmsherpa package not found, please install it with `pip install llmsherpa`)sectionsr   htmltextzGot z' for `strategy`, but should be one of ``zInvalid URL: )urlr   r   )		llmsherpaImportError
ValueError_is_valid_url_validate_llmsherpa_urlr   r   strr   )selfr   r   r   r   r   r   _valid_strategiess           j/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/llmsherpa.py__init__zLLMSherpaFileLoader.__init__   s	   	 	 	 	*  	
 CBB,,,>x > >):> > >  
 !!"344 	B@->@@AAA//!/ 0 
 
 !Ys    !r   returnc                 p    t          |           }t          |j                  ot          |j                  S )zCheck if the url is valid.)r   boolnetlocscheme)r   parseds     r   r   z!LLMSherpaFileLoader._is_valid_urlA   s.     #FM"":tFM':'::    c                     t          |           }| }d|j        vrd|j        vrt          d|            d|j        vr|dz   }|rd|j        vr|dz   }|rd|j        vr|d	z   }|S )
z$Check if the llmsherpa url is valid.z/api/parseDocumentz%/api/document/developer/parseDocumentzInvalid LLMSherpa URL: zrenderFormat=allz?renderFormat=allzuseNewIndentParser=truez&useNewIndentParser=truezapplyOcr=yesz&applyOcr=yes)r   pathr   query)r   r   r   r&   	valid_urls        r   r   z+LLMSherpaFileLoader._validate_llmsherpa_urlG   s    
 #	 3336;FF<s<<===V\11!$77I 	?!:&,!N!N!$>>I 	4v|;;!O3Ir'   c              #   X   K   ddl m}  | j                  }|                     j                  } j        dk    r3 fdt          |                                          D             E d{V   j        dk    r3 fdt          |                                          D             E d{V   j        dk    r1t          |
                                d	 j        i
          gE d{V   j        dk    r3t          |                                d	 j        i
          gE d{V  dS dS )z
Load file.r   )LayoutPDFReaderr   c           	      ~    g | ]9\  }}t          |                    d d           j        ||j        d          :S )T)include_childrenrecurse)sourcesection_numbersection_titlepage_contentmetadata)r   to_textr   title).0section_numsectionr   s      r   
<listcomp>z1LLMSherpaFileLoader.lazy_load.<locals>.<listcomp>f   si     
 
 
 )K !($PT!U!U"&.*5)0   
 
 
r'   Nr   c           	      x    g | ]6\  }}t          |                                j        ||j        d           7S ))r1   chunk_number
chunk_typer4   )r   to_context_textr   tag)r9   	chunk_numchunkr   s      r   r<   z1LLMSherpaFileLoader.lazy_load.<locals>.<listcomp>r   sd     
 
 
 %Iu !&!6!6!8!8"&.(1&+i   
 
 
r'   r   r1   r4   r   )llmsherpa.readersr-   r   read_pdfr   r   	enumerater   r   r   to_htmlr7   )r   r-   docs_readerdocs   `   r   	lazy_loadzLLMSherpaFileLoader.lazy_load\   s      	655555%odh//""4>22=J&&
 
 
 
 -6cllnn,E,E
 
 
 
 
 
 
 
 
 
 =H$$
 
 
 
 )2#**,,(?(?
 
 
 
 
 
 
 
 
 
 =F""!$ $.          =F""!$ $.            #"r'   N)TT)__name__
__module____qualname____doc__DEFAULT_APIr   r   r   r#   r    staticmethodr   r   r   r   rJ    r'   r   r
   r
      s        * #' !, (  (d# (   ( 	 (
  (  (  (  (  (D ;3 ;4 ; ; ; \;
 DH %)=A	   \(2	(	2 2 2 2 2 2r'   r
   N)pathlibr   typingr   r   urllib.parser   langchain_core.documentsr   (langchain_community.document_loaders.pdfr   rO   r
   rQ   r'   r   <module>rW      s          " " " " " " " " ! ! ! ! ! ! - - - - - - ? ? ? ? ? ?cB B B B B* B B B B Br'   