
    Ng_                       d Z ddlmZ ddlZddlmZmZmZmZm	Z	m
Z
mZmZmZ ddlmZ ddlZddlmZ ddlmZ ddlmZ er ddlZddlZddlZddlZddl Z!dd	lm"Z" dd
l#m$Z$ g dZ%g dZ&d dZ' G d de          Z( G d de          Z) G d de          Z* G d de          Z+ G d de          Z, G d de          Z- G d de          Z.dS )!z(Module contains common parsers for PDFs.    )annotationsN)	TYPE_CHECKINGAnyDictIterableIteratorMappingOptionalSequenceUnion)urlparse)Document)BaseBlobParser)Blob)
PageObject)TextLinearizationConfig)	DCTDecodeDCT	JPXDecode)	LZWDecodeLZWFlateDecodeFlASCII85DecodeA85ASCIIHexDecodeAHxRunLengthDecodeRLCCITTFaxDecodeCCFJBIG2Decodeimages,Sequence[Union[Iterable[np.ndarray], bytes]]returnstrc                    	 ddl m} n# t          $ r t          d          w xY w |            }d}| D ]6} ||          \  }}|r$d |D             }|d                    |          z  }7|S )zExtract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    r   )RapidOCRzc`rapidocr-onnxruntime` package not found, please install it with `pip install rapidocr-onnxruntime` c                    g | ]
}|d          S )    ).0texts     l/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/parsers/pdf.py
<listcomp>z5extract_from_images_with_rapidocr.<locals>.<listcomp>O   s    111$d1g111    
)rapidocr_onnxruntimer(   ImportErrorjoin)r#   r(   ocrr.   imgresult_s          r/   !extract_from_images_with_rapidocrr:   5   s    
1111111 
 
 
1
 
 	


 (**CD & &CHH	 	&11&111FDIIf%%%DKs   	 #c                  8    e Zd ZdZ	 	 ddddddZddZddZdS )PyPDFParserzLoad `PDF` using `pypdf`NFplain)extraction_modeextraction_kwargspasswordOptional[Union[str, bytes]]extract_imagesboolr>   r&   r?   Optional[Dict[str, Any]]c               B    || _         || _        || _        |pi | _        d S N)r@   rB   r>   r?   )selfr@   rB   r>   r?   s        r/   __init__zPyPDFParser.__init__W   s.     !,.!2!8br1   blobr   r%   Iterator[Document]c              #  >   K   	 ddl n# t          $ r t          d          w xY wd fd                                5 } j        | j        	          } fd
t          |j                  D             E d{V  ddd           dS # 1 swxY w Y   dS )Lazily parse the blob.r   NzE`pypdf` package not found, please install it with `pip install pypdf`page'PageObject'r%   r&   c                    j                             d          r|                                 S  | j        ddj        ij        S )zM
            Extract text from image given the version of pypdf.
            3r>   Nr,   )__version__
startswithextract_textr>   r?   )rM   pypdfrG   s    r/   _extract_text_from_pagez7PyPDFParser.lazy_parse.<locals>._extract_text_from_pagen   sb      ++C00 ((***(t(  $($8,  r1   )r@   c                    g | ]>\  }}t           |                               |          z   j        |d          ?S ))rM   sourcerM   page_contentmetadata)r   _extract_images_from_pagerX   )r-   page_numberrM   rU   rI   rG   s      r/   r0   z*PyPDFParser.lazy_parse.<locals>.<listcomp>}   sr        &K !8!8d!C!C!C44T::";(,[II    r1   )rM   rN   r%   r&   )rT   r4   as_bytes_io	PdfReaderr@   	enumeratepages)rG   rI   pdf_file_obj
pdf_readerrU   rT   s   ``  @@r/   
lazy_parsezPyPDFParser.lazy_parsed   s\     	LLLL 	 	 	&  	
	 
	 
	 
	 
	 
	 
	  
	<(NNNJ      *3:3C)D)D         
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	s    'A BBBrM   pypdf._page.PageObjectc                   | j         rd|d                                         vrdS |d         d                                         }g }|D ]}||         d         dk    r||         d         dd         t          v r}||         d	         ||         d
         }}|                    t          j        ||                                         t
          j                  	                    ||d                     ||         d         dd         t          v r.|                    ||                                                    t          j        d           t          |          S )8Extract images from page and get the text with RapidOCR.z/XObjectz
/Resourcesr)   z/Subtypez/Imagez/Filterr+   Nz/Heightz/WidthdtypeUnknown PDF Filter!)rB   keys
get_object_PDF_FILTER_WITHOUT_LOSSappendnp
frombufferget_datauint8reshape_PDF_FILTER_WITH_LOSSwarningswarnr:   )rG   rM   xObjectr#   objheightwidths          r/   r\   z%PyPDFParser._extract_images_from_page   sd   " 	j\8J8O8O8Q8Q&Q&Q2|$Z0;;== 	9 	9Cs|J'8333<	*122.2JJJ$+CL$;WS\(=SEFMMgcl&;&;&=&=RXNNNVV"E2    
 S\),QRR04IIIMM'#,"7"7"9"9::::M"78880888r1   NF)r@   rA   rB   rC   r>   r&   r?   rD   rI   r   r%   rJ   )rM   re   r%   r&   __name__
__module____qualname____doc__rH   rd   r\   r,   r1   r/   r<   r<   T   sv        "" 15$9
  '6:9 9 9 9 9 9       D9 9 9 9 9 9r1   r<   c                  2    e Zd ZdZdddddZddZddZdS )PDFMinerParserzParse `PDF` using `PDFMiner`.FT)concatenate_pagesrB   rC   r   c               "    || _         || _        dS )a$  Initialize a parser based on PDFMiner.

        Args:
            extract_images: Whether to extract images from PDF.
            concatenate_pages: If True, concatenate all PDF pages into one a single
                               document. Otherwise, return one document per page.
        N)rB   r   )rG   rB   r   s      r/   rH   zPDFMinerParser.__init__   s     -!2r1   rI   r   r%   rJ   c              #    K   | j         s	 ddlm} n# t          $ r t          d          w xY w|                                5 }| j        r( ||          }d|j        i}t          ||          V  nhddlm	} |
                    |          }t          |          D ]=\  }}	 |||g          }|j        t          |          d}t          ||          V  >d	d	d	           d	S # 1 swxY w Y   d	S dd	l}
dd
lm}m} ddlm} ddlm}m} ddlm	} |
                                }|                                5 }|
                    |          } |            } ||| |                      } || |                      } |||          } |||          }t          |          D ]\  }}|                    |           |                    |           |                                |                     |                                          z   }|                    d           |                    d           |j        t          |          d}t          ||          V  	 d	d	d	           d	S # 1 swxY w Y   d	S )rL   r   )rS   zO`pdfminer` package not found, please install it with `pip install pdfminer.six`rX   rY   )PDFPage)page_numbersrW   N)PDFPageAggregatorTextConverter)LAParams)PDFPageInterpreterPDFResourceManager)laparams)rB   pdfminer.high_levelrS   r4   r^   r   rX   r   pdfminer.pdfpager   	get_pagesr`   r&   iopdfminer.converterr   r   pdfminer.layoutr   pdfminer.pdfinterpr   r   StringIOprocess_pagegetvaluer\   
get_resulttruncateseek)rG   rI   rS   rb   r.   r[   r   ra   ir9   r   r   r   r   r   r   text_iorsrcmgrdevice_for_textdevice_for_imageinterpreter_for_textinterpreter_for_imagerM   contents                           r/   rd   zPDFMinerParser.lazy_parse   s      " /	L<<<<<<<   !1   !!## M|) M'<55D ($+6H"xHHHHHHH888888#--l;;E )% 0 0 M M1+|LsKKK.2k3q66#J#J&D8LLLLLLLM M M M M M M M M M M M M M M M M M IIIKKKKKKKK000000QQQQQQQQ000000kkmmG!!## L|)),77,,.."/-88::"V"V"V#4#4Wxxzz#R#R#R '9'9'?'S'S$(:(:7DT(U(U%(// 	L 	LGAt(55d;;;)66t<<<%..0043Q3Q(33554 4 G $$Q'''LLOOO*.+s1vvFFH"(KKKKKKK	LL L L L L L L L L L L L L L L L L Ls.    ,BC((C,/C,=D2I==JJrM   pdfminer.layout.LTPager&   c           	     v   ddl dfdg }t          t          t          t	          |                              D ]}|j        d         j        t          v rv|                    t          j
        |j                                        t          j                                      |j        d	         |j        d
         d                     |j        d         j        t          v r-|                    |j                                                   t          j        d           t#          |          S )rg   r   Nlayout_objectr   r%   c                    t          | j        j                  r| S t          | j        j                  r| D ]} |          c S d S d S rF   )
isinstancelayoutLTImageLTContainer)r   child	get_imagepdfminers     r/   r   z;PDFMinerParser._extract_images_from_page.<locals>.get_image   sm    -)@AA %$$-)DEE * , ,E$9U+++++, , tr1   Filterrh   HeightWidthrj   rk   )r   r   r%   r   )r   listfilterrC   mapstreamnamern   ro   rp   rq   rr   rs   rt   ru   rv   rw   r:   )rG   rM   r#   r7   r   r   s       @@r/   r\   z(PDFMinerParser._extract_images_from_page   s7   	 	 	 	 	 	 	 tSD%9%9::;; 
	5 
	5Cz(#(,DDDM#*"5"5"7"7rxHHHPP
8,cj.A2    
 H%*.CCCcj11334444344440888r1   NF)rB   rC   r   rC   r}   )rM   r   r%   r&   r~   r,   r1   r/   r   r      sm        ''	3RV 	3 	3 	3 	3 	3 	32L 2L 2L 2Lh9 9 9 9 9 9r1   r   c                  @    e Zd ZdZ	 	 ddd
ZddZddZddZddZdS )PyMuPDFParserzParse `PDF` using `PyMuPDF`.NFtext_kwargsOptional[Mapping[str, Any]]rB   rC   r%   Nonec                &    |pi | _         || _        dS )z~Initialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
        N)r   rB   )rG   r   rB   s      r/   rH   zPyMuPDFParser.__init__   s     ',",r1   rI   r   rJ   c              #      K   ddl }                                5 }j         |j        |          n |j        |d           fdD             E d{V  ddd           dS # 1 swxY w Y   dS )rL   r   Npdf)r   filetypec           
         g | ]=}t                              |                              |                     >S )rY   )r   _get_page_content_extract_metadatar-   rM   rI   docrG   s     r/   r0   z,PyMuPDFParser.lazy_parse.<locals>.<listcomp>  sd       
 	 !%!7!7T4!H!H!33CtDD    r1   )fitzr^   dataopen)rG   rI   r   	file_pathr   s   ``  @r/   rd   zPyMuPDFParser.lazy_parse  s      	 	9y di	**diy5AAA     
           	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA,,A03A0r   fitz.fitz.DocumentrM   fitz.fitz.Pager&   c                     |j         di | j        |                     ||          z   }|s$t          j        d|j         d|j                    |S )zq
        Get the text of the page using PyMuPDF and RapidOCR and issue a warning
        if it is empty.
        zWarning: Empty content on page z of document r,   )get_textr   r\   rv   rw   numberrX   )rG   r   rM   rI   r   s        r/   r   zPyMuPDFParser._get_page_content   s      $-33$"233d6T6T7
 7
 
  	M;;; ;-1[; ;  
 r1   dictc                    t          |j        |j        |j        t                    dfi fdj        D             S )z,Extract metadata from the document and page.rX   r   rM   total_pagesc                z    i | ]7}t          j        |         t          t          f          )|j        |         8S r,   )r   r[   r&   intr-   kr   s     r/   
<dictcomp>z3PyMuPDFParser._extract_metadata.<locals>.<dictcomp>>  sK       cl1oSz::3<?  r1   )r   rX   r   lenr[   )rG   r   rM   rI   s    `  r/   r   zPyMuPDFParser._extract_metadata3  sk     +!["3xx	 
 
     
 
 	
r1   c                P   | j         sdS ddl}|                                }g }|D ]r}|d         } |j        ||          }|                    t          j        |j        t
          j                  	                    |j
        |j        d                     st          |          S )rg   r)   r   Nrh   rj   )rB   r   
get_imagesPixmapro   rp   rq   samplesrs   rt   rz   r{   r:   )	rG   r   rM   r   img_listimgsr7   xrefpixs	            r/   r\   z'PyMuPDFParser._extract_images_from_pageE  s     " 	2??$$ 	 	Cq6D$+c4((CKKck:::BBJ	2    
 1666r1   r|   )r   r   rB   rC   r%   r   r}   )r   r   rM   r   rI   r   r%   r&   )r   r   rM   r   rI   r   r%   r   )r   r   rM   r   r%   r&   )	r   r   r   r   rH   rd   r   r   r\   r,   r1   r/   r   r      s        && 48$- - - - -   &   &
 
 
 
$7 7 7 7 7 7r1   r   c                  ,    e Zd ZdZdddZddZddZdS )PyPDFium2ParserzParse `PDF` with `PyPDFium2`.FrB   rC   r%   r   c                Z    	 ddl }n# t          $ r t          d          w xY w|| _        dS )zInitialize the parser.r   NzKpypdfium2 package not found, please install it with `pip install pypdfium2`)	pypdfium2r4   rB   )rG   rB   r   s      r/   rH   zPyPDFium2Parser.__init__]  sX    	 	 	 	+  	
 -s    !rI   r   rJ   c              #  ,  K   ddl }|                                5 } |j        |d          }	 t          |          D ]\  }}|                                }|                                }|                                 |d|                     |          z   z  }|                                 |j        |d}	t          ||	          V  	 |                                 n# |                                 w xY w	 ddd           dS # 1 swxY w Y   dS )rL   r   NT)	autocloser2   rW   rY   )
r   r^   PdfDocumentr`   get_textpageget_text_rangecloser\   rX   r   )
rG   rI   r   r   rc   r]   rM   	text_pager   r[   s
             r/   rd   zPyPDFium2Parser.lazy_parseh  s       	#9..yDIIIJ
#)2:)>)> L L%K $ 1 1 3 3I'6688GOO%%%td&D&DT&J&JJJGJJLLL*.+{KKH"(KKKKKKKL   """"
  """""	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	# 	#s)   D	BC"D	"C88D		DDrM   pypdfium2._helpers.page.PdfPager&   c                    | j         sdS ddlm} t          |                    |j        f                    }t          t          d |                    }t          |          S )rg   r)   r   N)r   c                N    |                                                                  S rF   )
get_bitmapto_numpy)xs    r/   <lambda>z;PyPDFium2Parser._extract_images_from_page.<locals>.<lambda>  s    ALLNN$;$;$=$= r1   )rB   pypdfium2.rawrawr   get_objectsFPDF_PAGEOBJ_IMAGEr   r:   )rG   rM   pdfium_cr#   s       r/   r\   z)PyPDFium2Parser._extract_images_from_page|  sq    " 	2((((((d&&x/J.L&MMNNc==vFFGG0888r1   Nr   )rB   rC   r%   r   r}   )rM   r   r%   r&   r~   r,   r1   r/   r   r   Z  s[        ''	- 	- 	- 	- 	-# # # #(
9 
9 
9 
9 
9 
9r1   r   c                  :    e Zd ZdZ	 	 	 dddZddZddZddZdS )PDFPlumberParserzParse `PDF` with `PDFPlumber`.NFr   r   deduperC   rB   r%   r   c                4    |pi | _         || _        || _        dS )zInitialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
        N)r   r   rB   )rG   r   r   rB   s       r/   rH   zPDFPlumberParser.__init__  s&     ',",r1   rI   r   rJ   c              #      K   ddl }                                5 } |j        |           fdj        D             E d{V  ddd           dS # 1 swxY w Y   dS )rL   r   Nc                   g | ]}t                              |          d z                       |          z   t          j        j        |j        dz
  t          j                  dfi fdj        D                       S )r2   r+   r   c                |    i | ]8}t          j        |                   t          t          fv *|j        |         9S r,   )typer[   r&   r   r   s     r/   r   z:PDFPlumberParser.lazy_parse.<locals>.<listcomp>.<dictcomp>  sH        !#CLO44c
BB s|ABBBr1   rY   )	r   _process_page_contentr\   r   rX   r]   r   ra   r[   r   s     r/   r0   z/PDFPlumberParser.lazy_parse.<locals>.<listcomp>  s       & % !%!;!;D!A!A"44T::"; "&*k)-$($4q$8+.sy>>	     %(\   	    r1   )
pdfplumberr^   r   ra   )rG   rI   r  r   r   s   ``  @r/   rd   zPDFPlumberParser.lazy_parse  s       	9!*/),,C     &  I'         	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   ,AAArM   pdfplumber.page.Pager&   c                |    | j         r$ |                                j        di | j        S  |j        di | j        S )z)Process the page content based on dedupe.r,   )r   dedupe_charsrS   r   )rG   rM   s     r/   r  z&PDFPlumberParser._process_page_content  sQ    ; 	H34$$&&3GGd6FGGG t 444#3444r1   c                (   | j         sdS g }|j        D ]}|d         d         j        t          v ry|                    t          j        |d                                         t
          j                  	                    |d         d         |d         d         d                     |d         d         j        t          v r.|                    |d                                                    t          j        d           t          |          S )	rg   r)   r   r   rh   r   r   rj   rk   )rB   r#   r   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   r:   )rG   rM   r#   r7   s       r/   r\   z*PDFPlumberParser._extract_images_from_page  s	   " 	2; 
	5 
	5C8}X&+/GGGM#h-"8"8":":"(KKKSSHh/Xw1G    
 Xx(-1FFFc(m44667777344440888r1   )NFF)r   r   r   rC   rB   rC   r%   r   r}   )rM   r  r%   r&   )r   r   r   r   rH   rd   r  r\   r,   r1   r/   r   r     sz        (( 48$	- - - - -    :5 5 5 59 9 9 9 9 9r1   r   c                  .    e Zd ZdZ	 	 dddddZddZdS )AmazonTextractPDFParsera{  Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    try to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    N)linearization_configtextract_featuresOptional[Sequence[int]]clientOptional[Any]r	  #Optional['TextLinearizationConfig']r%   r   c                  	 ddl ddlmc m} | _        || _        |fd|D             | _        ng | _        ||| _        n#| j                            dddd          | _        n# t          $ r t          d	          w xY w|s>	 ddl
}|                    d
          | _        dS # t          $ r t          d          w xY w|| _        dS )a5  Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                               should be passed as an int that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
            linearization_config: Config to be used for linearization of the output
                                  should be an instance of TextLinearizationConfig from
                                  the `textractor` pkg
        r   Nc                :    g | ]}                     |          S r,   )Textract_Features)r-   ftcs     r/   r0   z4AmazonTextractPDFParser.__init__.<locals>.<listcomp>  s4     * * *01B((++* * *r1   Tz# z## *)hide_figure_layouttitle_prefixsection_header_prefixlist_element_prefixzCould not import amazon-textract-caller or amazon-textract-textractor python package. Please install it with `pip install amazon-textract-caller` & `pip install amazon-textract-textractor`.textractzRCould not import boto3 python package. Please install it with `pip install boto3`.)textractcallertextractor.entities.documententitiesdocumentr  
textractorr
  r	  r   r4   boto3r  boto3_textract_client)rG   r
  r  r	  r  r  r  s         @r/   rH   z AmazonTextractPDFParser.__init__  sb   &	''''=========DG(DO ,* * * *5F* * *&& *,&#/,@)),0O,S,S'+!%*/(+	 -T - -)  	 	 	<  	  	0-2\\*-E-E***   !B   *0D&&&s   A%A) )B	B) )CrI   r   rJ   c              #  l  K   |j         r!t          t          |j                             nd}|rL|j        dk    rA|j        r:| j                            t          |j                   | j        | j                  }nI| j                            |	                                | j        | j        j
        j        | j                  }| j        j                            |          }t          |j                  D ]<\  }}t          |                    | j                  |j        |dz   d          V  =dS )	zIterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs
        the blob.data is taken
        Ns3)input_documentfeaturesr   )r#  r$  	call_moder   )configr+   rW   rY   )pathr   r&   schemenetlocr  call_textractr
  r   as_bytesTextract_Call_Mode
FORCE_SYNCr  r   r   r`   ra   r   r	  rX   )rG   rI   url_parse_resulttextract_response_jsonr  idxrM   s          r/   rd   z"AmazonTextractPDFParser.lazy_parseA  sT      8<yJ8C	NN333d 	 '4// ' 0 &*W%:%:"49~~/&*&@ &; & &"" &*W%:%:#}}/'4?&*&@	 &; & &" ?+001GHH"8>22 	 	IC!]]$2K]LL$(KqAA      	 	r1   )NN)r
  r  r  r  r	  r  r%   r   r}   )r   r   r   r   rH   rd   r,   r1   r/   r  r    sf        + +^ 6: $=0
 EI=0 =0 =0 =0 =0 =0~! ! ! ! ! !r1   r  c                  *    e Zd ZdZddZddZddZdS )DocumentIntelligenceParserzjLoads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level.r  r   modelr&   c                J    t          j        d           || _        || _        d S )Na<  langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParserand langchain_community.document_loaders.pdf.DocumentIntelligenceLoader are deprecated. Please upgrade to langchain_community.document_loaders.DocumentIntelligenceLoader for any file parsing purpose using Azure Document Intelligence service.)rv   rw   r  r3  )rG   r  r3  s      r/   rH   z#DocumentIntelligenceParser.__init__i  s/    	
 	
 	
 


r1   rI   r   r8   r%   rJ   c              #     K   |j         D ]H}d                    d |j        D                       }t          ||j        |j        d          }|V  Id S )N c                    g | ]	}|j         
S r,   )r   )r-   lines     r/   r0   z=DocumentIntelligenceParser._generate_docs.<locals>.<listcomp>w  s    AAAAAAr1   rW   rY   )ra   r5   linesr   rX   r]   )rG   rI   r8   pr   ds         r/   _generate_docsz)DocumentIntelligenceParser._generate_docsu  s{       
	 
	AhhAAAAABBG$"kM   A GGGG
	 
	r1   c              #    K   |                                 5 }| j                            | j        |          }|                                }|                     ||          }|E d{V  ddd           dS # 1 swxY w Y   dS )rL   N)r^   r  begin_analyze_documentr3  r8   r<  )rG   rI   file_objpollerr8   docss         r/   rd   z%DocumentIntelligenceParser.lazy_parse  s        	8[77
HMMF]]__F&&tV44DOOOOOOO	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA77A;>A;N)r  r   r3  r&   )rI   r   r8   r   r%   rJ   r}   )r   r   r   r   rH   r<  rd   r,   r1   r/   r2  r2  e  s\        A A
 
 
 
   	 	 	 	 	 	r1   r2  )r#   r$   r%   r&   )/r   
__future__r   rv   typingr   r   r   r   r   r	   r
   r   r   urllib.parser   numpyrp   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   	fitz.fitzr   r   r   pdfplumber.pager  pypdf._pagerT   pypdfium2._helpers.pager   r   )textractor.data.text_linearization_configr   ru   rn   r:   r<   r   r   r   r   r  r2  r,   r1   r/   <module>rN     s   . . " " " " " " 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 " ! ! ! ! !     - - - - - - D D D D D D B B B B B B R""""      QQQQQQ :99    "   >G9 G9 G9 G9 G9. G9 G9 G9T\9 \9 \9 \9 \9^ \9 \9 \9~Z7 Z7 Z7 Z7 Z7N Z7 Z7 Z7z,9 ,9 ,9 ,9 ,9n ,9 ,9 ,9^H9 H9 H9 H9 H9~ H9 H9 H9VN N N N Nn N N Nb& & & & & & & & & &r1   