
    Ng                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z, erd dl-m.Z.  ej/        e0          Z1 G d de,          Z2 G d dee          Z3 G d de3          Z4 G d de3          Z5 G d de3          Z6 G d de          Z7 G d de3          Z8 G d de3          Z9 G d d e3          Z: G d! d"e3          Z; G d# d$e3          Z< G d% d&e3          Z= G d' d(e"          Z> G d) d*e3          Z? G d+ d,e3          Z@e5ZAdS )-    N)ABC)StringIO)Path)	TYPE_CHECKINGAnyDictIteratorListMappingOptionalSequenceUnion)urlparse)Document)get_from_dict_or_env)
BaseLoader)Blob)DedocBaseLoader)AmazonTextractPDFParserDocumentIntelligenceParserPDFMinerParserPDFPlumberParserPyMuPDFParserPyPDFium2ParserPyPDFParser)UnstructuredFileLoader)TextLinearizationConfigc                       e Zd ZdZdefdZdS )UnstructuredPDFLoadera  Load `PDF` files using `Unstructured`.

    You can run the loader in one of two modes: "single" and "elements".
    If you use "single" mode, the document will be returned as a single
    langchain Document object. If you use "elements" mode, the unstructured
    library will split the document into elements such as Title and NarrativeText.
    You can pass in additional unstructured kwargs after mode to apply
    different unstructured settings.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredPDFLoader

    loader = UnstructuredPDFLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()

    References
    ----------
    https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
    returnc                 6    ddl m}  |dd| j        i| j        S )Nr   )partition_pdffilename )unstructured.partition.pdfr"   	file_pathunstructured_kwargs)selfr"   s     d/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/pdf.py_get_elementsz#UnstructuredPDFLoader._get_elementsG   s4    <<<<<<}QQdnQ8PQQQ    N)__name__
__module____qualname____doc__r
   r*   r$   r+   r)   r   r   /   s@         .Rt R R R R R Rr+   r   c                       e Zd ZdZdddeeef         dee         fdZ	ddZ
ed	edefd
            Zed	edefd            Zed	edefd            Zedefd            ZdS )BasePDFLoaderzBase Loader class for `PDF` files.

    If the file is a web path, it will download it to a temporary file, use it, then
        clean up the temporary file after completion.
    Nheadersr&   r3   c                   t          |          | _        d| _        || _        d| j        v r)t          j                            | j                  | _        t          j                            | j                  s|                     | j                  r{t          j
                    | _        t          j                            | j                  \  }}|                     | j                  r2t          | j                  j                            d          d         }t          j                            | j        j        d|           }| j        | _        |                     | j                  st'          j        | j        | j                  }|j        dk    rt-          d|j        z            t/          |d	
          5 }|                    |j                   ddd           n# 1 swxY w Y   t          |          | _        dS dS t          j                            | j                  st-          d| j        z            dS )zInitialize with a file path.

        Args:
            file_path: Either a local, S3 or web path to a PDF file.
            headers: Headers to use for GET request to download a file from a web path.
        N~/tmpr2      z3Check the url of your file; returned status code %swb)modez'File path %s is not a valid file or url)strr&   web_pathr3   ospath
expanduserisfile_is_valid_urltempfileTemporaryDirectorytemp_dirsplitext_is_s3_presigned_urlr   splitjoinname
_is_s3_urlrequestsgetstatus_code
ValueErroropenwritecontent)r(   r&   r3   _suffixtemp_pdfrfs           r)   __init__zBasePDFLoader.__init__T   s7    Y$.  W//??DN w~~dn-- 	Y$2D2DT^2T2T 	Y$799DM((88IAv((88 F!$.116<<SAA"Ew||DM$6fGGH NDM??4>22 
/LFFF=C''$M-(  
 (... '!GGAI&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' '!$X
/ 
/ // 	YFWXXX	Y 	Ys   G--G14G1r    c                 \    t          | d          r| j                                         d S d S )NrE   )hasattrrE   cleanupr(   s    r)   __del__zBasePDFLoader.__del__w   s8    4$$ 	$M!!#####	$ 	$r+   urlc                 p    t          |           }t          |j                  ot          |j                  S )zCheck if the url is valid.)r   boolnetlocscheme)r^   parseds     r)   rB   zBasePDFLoader._is_valid_url{   s.     #FM"":tFM':'::r+   c                 p    	 t          |           }|j        dk    r	|j        rdS dS # t          $ r Y dS w xY w)zcheck if the url is S3s3TF)r   rb   ra   rO   r^   results     r)   rK   zBasePDFLoader._is_s3_url   sR    	c]]F}$$$t5 	 	 	55	s   !' 
55c                     	 t          |           }t          t          j        d|j                            S # t
          $ r Y dS w xY w)z'Check if the url is a presigned S3 url.z\.s3\.amazonaws\.com$F)r   r`   researchra   rO   rf   s     r)   rG   z"BasePDFLoader._is_s3_presigned_url   sQ    	c]]F	":FMJJKKK 	 	 	55	s   58 
AAc                 ,    | j         | j         n| j        S N)r=   r&   r\   s    r)   sourcezBasePDFLoader.source   s     $ 9t}}t~Mr+   )r    N)r,   r-   r.   r/   r   r<   r   r   r   rX   r]   staticmethodr`   rB   rK   rG   propertyrm   r$   r+   r)   r1   r1   M   s3         RV !Y !Y !Y%T	"2 !Y !Y !Y !Y !YF$ $ $ $ ;3 ;4 ; ; ; \;
      \ # $    \ N N N N XN N Nr+   r1   c                   *    e Zd ZdZdee         fdZdS )OnlinePDFLoaderzLoad online `PDF`.r    c                 l    t          t          | j                            }|                                S )zLoad documents.)r   r<   r&   load)r(   loaders     r)   rs   zOnlinePDFLoader.load   s'    &s4>':':;;{{}}r+   N)r,   r-   r.   r/   r
   r   rs   r$   r+   r)   rq   rq      s9        d8n      r+   rq   c                        e Zd ZdZ	 	 	 dddddedeeeef                  dee         d	e	d
edee         ddf fdZ
dee         fdZ xZS )PyPDFLoaderuc  
    PyPDFLoader document loader integration

    Setup:
        Install ``langchain-community``.

        .. code-block:: bash

            pip install -U langchain-community

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import PyPDFLoader

            loader = PyPDFLoader(
                file_path = "./example_data/layout-parser-paper.pdf",
                password = "my-password",
                extract_images = True,
                # headers = None
                # extraction_mode = "plain",
                # extraction_kwargs = None,
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            LayoutParser : A Uniﬁed Toolkit for Deep
            Learning Based Document Image Analysis
            Zejiang Shen1( ), R
            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            LayoutParser : A Uniﬁed Toolkit for Deep
            Learning Based Document Image Analysis
            Zejiang Shen1( ), R
            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}
    NFplain)extraction_modeextraction_kwargsr&   passwordr3   extract_imagesrx   ry   r    c                    	 ddl }n# t          $ r t          d          w xY wt                                          ||           t	          ||||          | _        dS )Initialize with a file path.r   NzCpypdf package not found, please install it with `pip install pypdf`r2   )rz   r{   rx   ry   )pypdfImportErrorsuperrX   r   parser)	r(   r&   rz   r3   r{   rx   ry   r~   	__class__s	           r)   rX   zPyPDFLoader.__init__   s    	LLLL 	 	 	U  	 	G444!)+/	
 
 
    "c              #     K   | j         rAt          j        t          | j        d                                          | j                   }nt          j        | j                  }| j                            |          E d{V  dS Lazy load given path as pages.rbr?   N	r=   r   	from_datarP   r&   read	from_pathr   parser(   blobs     r)   	lazy_loadzPyPDFLoader.lazy_load          = 	2>$t~t"<"<"A"A"C"C$-XXXDD>$.11D;$$T***********r+   )NNF)r,   r-   r.   r/   r<   r   r   bytesr   r`   rX   r	   r   r   __classcell__r   s   @r)   rv   rv      s        : :~ 15"&$
  ',0
 
 

 5e,-
 $	

 
 
 $D>
 

 
 
 
 
 
2+	(	+ + + + + + + +r+   rv   c                   \     e Zd ZdZddddedee         def fdZd	e	e
         fd
Z xZS )PyPDFium2Loaderz;Load `PDF` using `pypdfium2` and chunks at character level.NFr3   r{   r&   r3   r{   c                x    t                                          ||           t          |          | _        dS )r}   r2   r{   N)r   rX   r   r   )r(   r&   r3   r{   r   s       r)   rX   zPyPDFium2Loader.__init__  s8     	G444%^DDDr+   r    c              #     K   | j         rAt          j        t          | j        d                                          | j                   }nt          j        | j                  }| j                            |          E d{V  dS r   r   r   s     r)   r   zPyPDFium2Loader.lazy_load  r   r+   r,   r-   r.   r/   r<   r   r   r`   rX   r	   r   r   r   r   s   @r)   r   r     s        EE #'$	E 	E 	E	E $		E
 	E 	E 	E 	E 	E 	E+	(	+ + + + + + + +r+   r   c                       e Zd ZdZ	 	 	 	 	 ddeeef         dedededed	efd
Ze	dedefd            Z
dee         fdZdS )PyPDFDirectoryLoaderzLoad a directory with `PDF` files using `pypdf` and chunks at character level.

    Loader also stores page numbers in metadata.
    **/[!.]*.pdfFr?   globsilent_errorsload_hidden	recursiver{   c                 Z    || _         || _        || _        || _        || _        || _        d S rl   )r?   r   r   r   r   r{   )r(   r?   r   r   r   r   r{   s          r)   rX   zPyPDFDirectoryLoader.__init__#  s7     		&"*,r+   r    c                 @    t          d | j        D                        S )Nc              3   @   K   | ]}|                     d           V  dS ).N
startswith).0parts     r)   	<genexpr>z3PyPDFDirectoryLoader._is_visible.<locals>.<genexpr>5  s.      CCts++CCCCCCr+   )anypartsr   s    r)   _is_visiblez PyPDFDirectoryLoader._is_visible3  s$    CC
CCCCCCCr+   c                    t          | j                  }g }| j        r|                    | j                  n|                    | j                  }|D ]}|                                r|                     |                    |                    s| j        r	 t          t          |          | j                  }|                                }|D ]}t          |          |j        d<   |                    |           # t          $ r.}| j        rt"                              |           n|Y d }~d }~ww xY w|S )Nr   rm   )r   r?   r   rglobr   is_filer   relative_tor   rv   r<   r{   rs   metadataextend	Exceptionr   loggerwarning)	r(   pdocsitemsirt   sub_docsdoces	            r)   rs   zPyPDFDirectoryLoader.load7  sO   OO&*nK	"""!&&:K:K 	$ 	$Ayy{{ $##AMM!$4$455 $9I $
$!,SVVDDW!X!X!X#);;==#+ < <C58VVCL22H----$ $ $ $- $"NN1----"#G .----$
 s   A(D
D;$D66D;N)r   FFFF)r,   r-   r.   r/   r   r<   r   r`   rX   rn   r   r
   r   rs   r$   r+   r)   r   r     s          ##!$- -CI- - 	-
 - - - - - -  D$ D4 D D D \Dd8n      r+   r   c                   f     e Zd ZdZdddddedee         ded	ed
df
 fdZd
e	e
         fdZ xZS )PDFMinerLoaderz"Load `PDF` files using `PDFMiner`.NFT)r3   r{   concatenate_pagesr&   r3   r{   r   r    c                    	 ddl m} n# t          $ r t          d          w xY wt                                          ||           t          ||          | _        dS )a  Initialize with file path.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into one a single
                               document. Otherwise, return one document per page.
        r   )extract_textO`pdfminer` package not found, please install it with `pip install pdfminer.six`r2   )r{   r   N)pdfminer.high_levelr   r   r   rX   r   r   )r(   r&   r3   r{   r   r   r   s         r)   rX   zPDFMinerLoader.__init__O  s    	8888888 	 	 	-  	 	G444$)=N
 
 
   
 $c              #     K   | j         rAt          j        t          | j        d                                          | j                   }nt          j        | j                  }| j                            |          E d{V  dS )zLazily load documents.r   r   Nr   r   s     r)   r   zPDFMinerLoader.lazy_loadk  r   r+   r   r   s   @r)   r   r   L  s        ,, #'$"&
 
 

 $	

 
  
 

 
 
 
 
 
8+	(	+ + + + + + + +r+   r   c                   V     e Zd ZdZdddedee         f fdZdee	         fdZ
 xZS )	PDFMinerPDFasHTMLLoaderz2Load `PDF` files as HTML content using `PDFMiner`.Nr2   r&   r3   c                    	 ddl m} n# t          $ r t          d          w xY wt                                          ||           dS )r}   r   extract_text_to_fpr   r2   N)r   r   r   r   rX   )r(   r&   r3   r   r   s       r)   rX   z PDFMinerPDFasHTMLLoader.__init__y  st    	>>>>>>> 	 	 	-  	 	G44444r   r    c              #   D  K   ddl m} ddlm} ddlm} t                      } || j        d          5 } |||d |            d           d	d	d	           n# 1 swxY w Y   d
| j        | j        n| j        i}t          |
                                |          V  d	S )
Load file.r   r   )LAParams)open_filenamer    html)codeclaparamsoutput_typeNrm   page_contentr   )r   r   pdfminer.layoutr   pdfminer.utilsr   r   r&   r=   r   getvalue)r(   r   r   r   output_stringfpr   s          r)   r   z!PDFMinerPDFasHTMLLoader.lazy_load  s$     ::::::,,,,,,000000 

]4>400 	B!"   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 (=dnn4=
 M$:$:$<$<xPPPPPPPPs   AA A)r,   r-   r.   r/   r<   r   r   rX   r	   r   r   r   r   s   @r)   r   r   v  s        <<DH 
5 
5 
5# 
58D> 
5 
5 
5 
5 
5 
5Q8H- Q Q Q Q Q Q Q Qr+   r   c                        e Zd ZdZddddedee         deded	df
 fd
Z	ded	e
e         fdZded	ee         fdZd	e
e         fdZ xZS )PyMuPDFLoaderz!Load `PDF` files using `PyMuPDF`.NFr   r&   r3   r{   kwargsr    c                    	 ddl }n# t          $ r t          d          w xY wt                                          ||           || _        || _        dS )r}   r   NzI`PyMuPDF` package not found, please install it with `pip install pymupdf`r2   )fitzr   r   rX   r{   text_kwargs)r(   r&   r3   r{   r   r   r   s         r)   rX   zPyMuPDFLoader.__init__  sz    	KKKK 	 	 	(  	
 	G444,!r   c              +     K   |rt                               d| d           i | j        |}t          || j                  }| j        rAt          j        t          | j	        d          
                                | j                  }nt          j        | j	                  }|                    |          E d {V  d S )NzReceived runtime arguments zd. Passing runtime args to `load` is deprecated. Please pass arguments during initialization instead.)r   r{   r   r   )r   r   r   r   r{   r=   r   r   rP   r&   r   r   
lazy_parse)r(   r   r   r   r   s        r)   
_lazy_loadzPyMuPDFLoader._lazy_load  s       	NNXf X X X  
 5)4V4#D4G
 
 
 = 	2>$t~t"<"<"A"A"C"C$-XXXDD>$.11D$$T***********r+   c                 6    t           | j        di |          S )Nr$   )listr   )r(   r   s     r)   rs   zPyMuPDFLoader.load  s"    ODO--f--...r+   c              #   >   K   |                                  E d {V  d S rl   )r   r\   s    r)   r   zPyMuPDFLoader.lazy_load  s.      ??$$$$$$$$$$$r+   )r,   r-   r.   r/   r<   r   r   r`   r   rX   r	   r   r   r
   rs   r   r   r   s   @r)   r   r     s        ++ #'$" " "" $	"
 " " 
" " " " " "(+3 +8H+= + + + +"/S /T(^ / / / /%8H- % % % % % % % %r+   r   c                   .    e Zd ZdZ	 	 	 	 ddededed	ed
eeee	f                  de	ddf fdZ
edeeef         fd            Zedefd            Zedefd            ZdefdZdeddfdZdedefdZededefd            Zdee         fdZ xZS )MathpixPDFLoaderz)Load `PDF` files using `Mathpix` service.md  FNr&   processed_file_formatmax_wait_time_secondsshould_clean_pdfextra_request_datar   r    c                 ,   t          |dd          | _        t          |dd          | _        |                    dd           |                    dd            t	                      j        |fi | || _        ||ni | _        || _        || _	        dS )a  Initialize with a file path.

        Args:
            file_path: a file for loading.
            processed_file_format: a format of the processed file. Default is "md".
            max_wait_time_seconds: a maximum time to wait for the response from
             the server. Default is 500.
            should_clean_pdf: a flag to clean the PDF file. Default is False.
            extra_request_data: Additional request data.
            **kwargs: additional keyword arguments.
        mathpix_api_keyMATHPIX_API_KEYmathpix_api_idMATHPIX_API_IDN)
r   r   r   popr   rX   r   r   r   r   )r(   r&   r   r   r   r   r   r   s          r)   rX   zMathpixPDFLoader.__init__  s    (  4%'8 
  
 3$&6
 

 	

$d+++

#T***--f---%:""4"@b 	 &;" 0r+   c                      | j         | j        dS )N)app_idapp_key)r   r   r\   s    r)   _mathpix_headersz!MathpixPDFLoader._mathpix_headers  s    -$:NOOOr+   c                     dS )Nzhttps://api.mathpix.com/v3/pdfr$   r\   s    r)   r^   zMathpixPDFLoader.url  s    //r+   c                 R    d| j         dii| j        }dt          j        |          iS )Nconversion_formatsToptions_json)r   r   jsondumps)r(   optionss     r)   datazMathpixPDFLoader.data  s:     !4#=t"D
%
 
7 3 344r+   c                 L   t          | j        d          5 }d|i}t          j        | j        | j        || j                  }d d d            n# 1 swxY w Y   |                                }d|v rt          d|d                    d|v r
|d         }|S t          d          )Nr   file)r3   filesr  errorzMathpix request failed: pdf_idzUnable to send PDF to Mathpix.)	rP   r&   rL   postr^   r   r  r   rO   )r(   rW   r  responseresponse_datar  s         r)   send_pdfzMathpixPDFLoader.send_pdf  s    $.$'' 	1QKE}$"7u49  H	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 !m##Pg8NPPQQQ}$$"8,FM=>>>s   ,AAAr  c                    | j         dz   |z   }t          d| j        d          D ]}t          j        || j                  }|                                }|                    dd          }|                    dd          }|%d| }||d	|d
          dz  }t          |          |                    dd          }	|	dk    r dS |	dk    rt          d          t          d|	 d           t          j
        d           t          )zmWait for processing to complete.

        Args:
            pdf_id: a PDF id.

        Returns: None
        r6   r      r2   r  N
error_infoz%Unable to retrieve PDF from Mathpix: z (id)status	completedz#Unable to retrieve PDF from MathpixzStatus: z$, waiting for processing to complete)r^   ranger   rL   rM   r   r   rO   printtimesleepTimeoutError)
r(   r  r^   rS   r	  r
  r  r  	error_msgr  s
             r)   wait_for_processingz$MathpixPDFLoader.wait_for_processing  s4    hnv%q$4a88 	 	A|C1FGGGH$MMOOM "%%gt44E&**<>>J KEKK	)!9j&6!9!9!99I +++"&&x66F$$7"" !FGGGMMMMNNN
1r+   c                     |                      |           | j         d| d| j         }t          j        || j                  }|j                            d          S )Nr6   r   r2   zutf-8)r  r^   r   rL   rM   r   rR   decode)r(   r  r^   r	  s       r)   get_processed_pdfz"MathpixPDFLoader.get_processed_pdf9  sd      (((AAFAAT%?AA<T-BCCC&&w///r+   contentsc                 b   d                     d |                     d          D                       } |                     dd                              dd          } |                     dd                              d	d
                              dd                              dd          } | S )ziClean the PDF file.

        Args:
            contents: a PDF file contents.

        Returns:

        
c                 <    g | ]}|                     d           |S )z![]r   )r   lines     r)   
<listcomp>z.MathpixPDFLoader.clean_pdf.<locals>.<listcomp>J  s)    QQQd$//%:P:PQTQQQr+   z	\section{z# }r   z\$$z\%%z\((z\)r  )rI   rH   replace)r  s    r)   	clean_pdfzMathpixPDFLoader.clean_pdf?  s     99QQhnnT22QQQ
 
 ##L$77??RHH UC((WUC  WUC  WUC  	 	 r+   c                     |                                  }|                     |          }| j        r|                     |          }| j        | j        |d}t          ||          gS )N)rm   r&   r  r   )r  r  r   r(  rm   r   )r(   r  r  r   s       r)   rs   zMathpixPDFLoader.loadW  sg    ))&11  	0~~h//H"kvVVhBBBCCr+   )r   r   FN)r,   r-   r.   r/   r<   intr`   r   r   r   rX   ro   r   r^   dictr  r  r  r  rn   r(  r
   r   rs   r   r   s   @r)   r   r     s       33
 &*%(!&7;%1 %1%1  #%1  #	%1
 %1 %T#s(^4%1 %1 
%1 %1 %1 %1 %1 %1N P$sCx. P P P XP 0S 0 0 0 X0 5d 5 5 5 X5?# ? ? ? ?## #$ # # # #J0 0 0 0 0 0 C C    \.Dd8n D D D D D D D Dr+   r   c                        e Zd ZdZ	 	 	 	 ddedeeeef                  dedee	         ded	df fd
Z
d	ee         fdZ xZS )PDFPlumberLoaderz$Load `PDF` files using `pdfplumber`.NFr&   r   deduper3   r{   r    c                     	 ddl }n# t          $ r t          d          w xY wt                                          ||           |pi | _        || _        || _        dS )r}   r   NzMpdfplumber package not found, please install it with `pip install pdfplumber`r2   )
pdfplumberr   r   rX   r   r.  r{   )r(   r&   r   r.  r3   r{   r0  r   s          r)   rX   zPDFPlumberLoader.__init__c  s    	 	 	 	+  	 	G444&,",r   c                 0   t          | j        | j        | j                  }| j        rAt          j        t          | j        d          	                                | j                  }nt          j
        | j                  }|                    |          S )r   )r   r.  r{   r   r   )r   r   r.  r{   r=   r   r   rP   r&   r   r   r   )r(   r   r   s      r)   rs   zPDFPlumberLoader.loady  s     "(;.
 
 

 = 	2>$t~t"<"<"A"A"C"C$-XXXDD>$.11D||D!!!r+   )NFNF)r,   r-   r.   r/   r<   r   r   r   r`   r   rX   r
   r   rs   r   r   s   @r)   r-  r-  `  s        ..
 48"&$- -- gc3h/0- 	-
 $- - 
- - - - - -,"d8n " " " " " " " "r+   r-  c                       e Zd ZdZ	 	 	 	 	 	 ddddedeee                  dee         dee         dee         d	ee         d
ee         ded         ddf fdZ	de
e         fdZdee         fdZededefd            Z xZS )AmazonTextractPDFLoadera  Load `PDF` files from a local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials / roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python
            from langchain_community.document_loaders import AmazonTextractPDFLoader
            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    N)linearization_configr&   textract_featuresclientcredentials_profile_nameregion_nameendpoint_urlr3   r4  r   r    c                   t                                          ||           	 ddln# t          $ r t          d          w xY w|rfd|D             }	ng }	|s|s|r	 ddl}
||
                    |          }n|
                                }i }|r||d<   |r||d<    |j        di |}n=# t          $ r t          d
          t          $ r}t          d|           |d}~ww xY wt          |	||          | _
        dS )aU  Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file
            textract_features: Features to be used for extraction, each feature
                               should be passed as a str that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client (Optional)
            credentials_profile_name: AWS profile name, if not default (Optional)
            region_name: AWS region, eg us-east-1 (Optional)
            endpoint_url: endpoint url for the textract service (Optional)
            linearization_config: Config to be used for linearization of the output
                                  should be an instance of TextLinearizationConfig from
                                  the `textractor` pkg
        r2   r   NztCould not import amazon-textract-caller python package. Please install it with `pip install amazon-textract-caller`.c                 *    g | ]}j         |         S r$   )Textract_Features)r   xtcs     r)   r"  z4AmazonTextractPDFLoader.__init__.<locals>.<listcomp>  s!    KKKA,Q/KKKr+   )profile_namer8  r9  textractzRCould not import boto3 python package. Please install it with `pip install boto3`.zCould not load credentials to authenticate with AWS client. Please check that credentials in the specified profile name are valid. )r5  r6  r4  )r@  )r   rX   textractcallerr   boto3Sessionr6  r   rO   r   r   )r(   r&   r5  r6  r7  r8  r9  r3   r4  featuresrB  sessionclient_paramsr   r>  r   s                 @r)   rX   z AmazonTextractPDFLoader.__init__  s   6 	G444	''''' 	 	 	O  	
  	KKKK9JKKKHHH# 	{ 	l 	+7#mm9QmRRGG $mmooG " ?3>M-0 A4@M.1'DDmDD   !B       3/03 3  	 .&!5
 
 
s"   , A#AB2 2"C,C''C,c                 D    t          |                                           S zLoad given path as pages.r   r   r\   s    r)   rs   zAmazonTextractPDFLoader.load      DNN$$%%%r+   c              #   V  K   | j         r0|                     | j                   rt          | j                   }nOt          j        | j                  }t
                              |          dk    rt          d|j         d          | j	        
                    |          E d{V  dS )zLazy load documentsr      z	the file z is a multi-page document,                     but not stored on S3.                     Textract requires multi-page documents to be on S3.N)r=   rK   r   r   r&   r3  _get_number_of_pagesrO   r?   r   r   r   s     r)   r   z!AmazonTextractPDFLoader.lazy_load  s       = 		T__T];; 		T]+++DD>$.11D&;;DAAAEE I	 I I I   ;$$T***********r+   r   c                 &   	 dd l }ddlm}m} n# t          $ r t	          d          w xY w| j        dk    rW|                                 5 }|                    |          }t          |j	                  cd d d            S # 1 swxY w Y   d S | j        dk    rXd}|
                    |                                           }t          |                    |                    D ]
\  }}|dz  }|S | j        dv rdS t          d| j                   )	Nr   )ImageImageSequencezcCould not import pypdf or Pilloe python package. Please install it with `pip install pypdf Pillow`.zapplication/pdfz
image/tiffrL  )z	image/pngz
image/jpegzunsupported mime type: )r~   PILrO  rP  r   mimetypeas_bytes_io	PdfReaderlenpagesrP   as_bytes	enumerater	   rO   )	r   r~   rO  rP  input_pdf_file
pdf_reader	num_pagesimgrS   s	            r)   rM  z,AmazonTextractPDFLoader._get_number_of_pages  s   	LLL000000000 	 	 	E  	
 =---!!## -~"__^<<
:+,,- - - - - - - - - - - - - - - - - - ]l**I**T]]__--C!-"8"8"="=>>  1Q		]9991Ft}FFGGGs    ))BBB)NNNNNN)r,   r-   r.   r/   r<   r   r   r   r   rX   r
   r   rs   r	   r   rn   r   r*  rM  r   r   s   @r)   r3  r3    sw        0 6: $26%)&*"&J
 EIJ
 J
 J
J
 $HSM2J
 	J

 #+3-J
 c]J
 smJ
 $J
 ''@AJ
 
J
 J
 J
 J
 J
 J
X&d8n & & & &+	(	+ + + +* H4 HC H H H \H H H H Hr+   r3  c                       e Zd ZdZdefdZdS )DedocPDFLoadera  
    DedocPDFLoader document loader integration to load PDF files using `dedoc`.
    The file loader can automatically detect the correctness of a textual layer in the
        PDF document.
    Note that `__init__` method supports parameters that differ from ones of
        DedocBaseLoader.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocPDFLoader

            loader = DedocPDFLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Parameters used for document parsing via `dedoc`
        (https://dedoc.readthedocs.io/en/latest/parameters/pdf_handling.html):

        with_attachments: enable attached files extraction
        recursion_deep_attachments: recursion level for attached files extraction,
            works only when with_attachments==True
        pdf_with_text_layer: type of handler for parsing, available options
            ["true", "false", "tabby", "auto", "auto_tabby" (default)]
        language: language of the document for PDF without a textual layer,
            available options ["eng", "rus", "rus+eng" (default)], the list of
            languages can be extended, please see
            https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
        pages: page slice to define the reading range for parsing
        is_one_column_document: detect number of columns for PDF without a textual
            layer, available options ["true", "false", "auto" (default)]
        document_orientation: fix document orientation (90, 180, 270 degrees) for PDF
            without a textual layer, available options ["auto" (default), "no_change"]
        need_header_footer_analysis: remove headers and footers from the output result
        need_binarization: clean pages background (binarize) for PDF without a textual
            layer
        need_pdf_table_analysis: parse tables for PDF without a textual layer
    r    c                 H    ddl m}  || j        | j        | j                  S )Nr   )make_manager_pdf_config)r&   parsing_paramsrH   )dedoc.utils.langchainr`  r&   parsing_parametersrH   )r(   r`  s     r)   _make_configzDedocPDFLoader._make_configv  s>    AAAAAA&&n2*
 
 
 	
r+   N)r,   r-   r.   r/   r+  rd  r$   r+   r)   r^  r^    s<        U Un
d 
 
 
 
 
 
r+   r^  c                   z     e Zd ZdZ	 	 ddedededee         ddf
 fd	Zde	e
         fd
Zdee
         fdZ xZS )DocumentIntelligenceLoaderz+Load a PDF with Azure Document Intelligenceprebuilt-documentNr&   r6  modelr3   r    c                 z    t          ||          | _        t                                          ||           dS )a  
        Initialize the object for file processing with Azure Document Intelligence
        (formerly Form Recognizer).

        This constructor initializes a DocumentIntelligenceParser object to be used
        for parsing files using the Azure Document Intelligence API. The load method
        generates a Document node including metadata (source blob and page number)
        for each page.

        Parameters:
        -----------
        file_path : str
            The path to the file that needs to be parsed.
        client: Any
            A DocumentAnalysisClient to perform the analysis of the blob
        model : str
            The model name or ID to be used for form recognition in Azure.

        Examples:
        ---------
        >>> obj = DocumentIntelligenceLoader(
        ...     file_path="path/to/file",
        ...     client=client,
        ...     model="prebuilt-document"
        ... )
        )r6  rh  r2   N)r   r   r   rX   )r(   r&   r6  rh  r3   r   s        r)   rX   z#DocumentIntelligenceLoader.__init__  s=    D 1eLLLG44444r+   c                 D    t          |                                           S rH  rI  r\   s    r)   rs   zDocumentIntelligenceLoader.load  rJ  r+   c              #   |   K   t          j        | j                  }| j                            |          E d{V  dS )r   N)r   r   r&   r   r   r   s     r)   r   z$DocumentIntelligenceLoader.lazy_load  sG       ~dn--;$$T***********r+   )rg  N)r,   r-   r.   r/   r<   r   r   r   rX   r
   r   rs   r	   r   r   r   s   @r)   rf  rf    s        55 )"&#5 #5#5 #5 	#5
 $#5 
#5 #5 #5 #5 #5 #5J&d8n & & & &+	(	+ + + + + + + +r+   rf  c            	       `     e Zd ZdZ	 d
deeef         dededdf fdZde	e
         fd	Z xZS )ZeroxPDFLoadera  
    Document loader utilizing Zerox library:
    https://github.com/getomni-ai/zerox

    Zerox converts PDF document to serties of images (page-wise) and
    uses vision-capable LLM model to generate Markdown representation.

    Zerox utilizes anyc operations. Therefore when using this loader
    inside Jupyter Notebook (or any environment running async)
    you will need to:
    ```python
        import nest_asyncio
        nest_asyncio.apply()
    ```
    gpt-4o-minir&   rh  zerox_kwargsr    Nc                 j    t                                          |           	 || _        || _        d S )N)r&   )r   rX   ro  rh  )r(   r&   rh  ro  r   s       r)   rX   zZeroxPDFLoader.__init__  s:     	9---	& )


r+   c              #   0  K   ddl }ddlm} |                     |d| j        | j        d| j                  }t          |j                  dk    rB|j        d         j	        }|j        D ]*}t          |j        | j        |j	        |d          V  )dS dS )	z
        Loads documnts from pdf utilizing zerox library:
        https://github.com/getomni-ai/zerox

        Returns:
            Iterator[Document]: An iterator over parsed Document instances.
        r   N)zerox)r&   rh  r7   )rm   pager[  r   r$   )asynciopyzeroxrr  runr&   rh  ro  rU  rV  rs  r   rR   rm   )r(   rt  rr  zerox_outputr[  rs  s         r)   r   zZeroxPDFLoader.lazy_load  s       	!!!!!! {{ERDN$*RR@QRR
 

 |!""Q&&$*2.3I$*  !%"&+ $	%.        '& r+   )rn  )r,   r-   r.   r/   r   r<   r   r   rX   r	   r   r   r   r   s   @r)   rm  rm    s         & # d#  	
 
     :8H-        r+   rm  )Br   loggingr>   ri   rC   r  abcr   ior   pathlibr   typingr   r   r   r	   r
   r   r   r   r   urllib.parser   rL   langchain_core.documentsr   langchain_core.utilsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   *langchain_community.document_loaders.dedocr   0langchain_community.document_loaders.parsers.pdfr   r   r   r   r   r   r   1langchain_community.document_loaders.unstructuredr   )textractor.data.text_linearization_configr   	getLogger__file__r   r   r1   rq   rv   r   r   r   r   r   r   r-  r3  r^  rf  rm  PagedPDFSplitterr$   r+   r)   <module>r     s     				 				                    
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 " ! ! ! ! !  - - - - - - 5 5 5 5 5 5 @ @ @ @ @ @ B B B B B B F F F F F F                  U T T T T T RQQQQQQ		8	$	$R R R R R2 R R R<JN JN JN JN JNJ JN JN JNZ    m   ^+ ^+ ^+ ^+ ^+- ^+ ^+ ^+B+ + + + +m + + +2, , , , ,: , , ,^'+ '+ '+ '+ '+] '+ '+ '+T!Q !Q !Q !Q !Qm !Q !Q !QH,% ,% ,% ,% ,%M ,% ,% ,%bRD RD RD RD RD} RD RD RDj%" %" %" %" %"} %" %" %"PSH SH SH SH SHm SH SH SHl_
 _
 _
 _
 _
_ _
 _
 _
D1+ 1+ 1+ 1+ 1+ 1+ 1+ 1+hJ J J J J] J J J\    r+   