
    Ng`<                     <   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ erddlmZ ddlmZ  ej        e          Ze G d d                      Z eddd           G d de                      Z dS )zModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
    N)	dataclass)TYPE_CHECKINGIteratorListOptionalSequence)
deprecated)Document)batch_iterate)BaseBlobParser)Blob)get_client_info)	OperationDocumentProcessorServiceClientc                   (    e Zd ZU dZeed<   eed<   dS )DocAIParsingResultsz/Dataclass to store Document AI parsing results.source_pathparsed_pathN)__name__
__module____qualname____doc__str__annotations__     n/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/parsers/docai.pyr   r      s0         99r   r   z0.0.32z1.0z&langchain_google_community.DocAIParser)sinceremovalalternative_importc                       e Zd ZdZdddddded         dee         dee         dee         fd	Zd
edee	         fdZ
	 	 	 d$d
ededee         deee                  dee	         f
dZ	 	 	 d%dee         dee         dededee	         f
dZdee         dee	         fdZdee         ded         fdZded         defdZdddddd dee         dee         dee         d!ededee         ded         fd"Zded         dee         fd#ZdS )&DocAIParserz`Google Cloud Document AI` parser.

    For a detailed explanation of Document AI, refer to the product documentation.
    https://cloud.google.com/document-ai/docs/overview
    N)clientlocationgcs_output_pathprocessor_namer$   r   r%   r&   r'   c                   t          |          t          |          k    rt          d          d}|r(t          j        ||          st          d| d          || _        || _        |r	|| _        d	S 	 ddlm} ddl	m
} n"# t          $ r}t          d          |d	}~ww xY w || d
          }	 ||	t          d                    | _        d	S )a  Initializes the parser.

        Args:
            client: a DocumentProcessorServiceClient to use
            location: a Google Cloud location where a Document AI processor is located
            gcs_output_path: a path on Google Cloud Storage to store parsing results
            processor_name: full resource name of a Document AI processor or processor
                version

        You should provide either a client or location (and then a client
            would be instantiated).
        zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name z has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ClientOptionsr   Zdocumentai package not found, please install it with `pip install google-cloud-documentai`Nz-documentai.googleapis.com)api_endpointzdocument-ai)module)client_optionsclient_info)bool
ValueErrorre	fullmatch_gcs_output_path_processor_name_clientgoogle.api_core.client_optionsr)   google.cloud.documentair   ImportErrorr   )
selfr$   r%   r&   r'   patternr)   r   excoptionss
             r   __init__zDocAIParser.__init__2   s]   * <<4>>))  
 U 	",w"G"G 	.      !0- 	!DLLLHHHHHHRRRRRRR   !=  
 $m (DDD  G :9&+=AAA  DLLLs   6B 
B"BB"blobreturnc              #   P   K   |                      |g| j                  E d{V  dS )zParses a blob lazily.

        Args:
            blobs: a Blob to parse

        This is a long-running operation. A recommended way is to batch
            documents together and use the `batch_parse()` method.
        r&   N)batch_parser3   )r9   r>   s     r   
lazy_parsezDocAIParser.lazy_parsel   s>       ##TFD<Q#RRRRRRRRRRRr   Tenable_native_pdf_parsing
field_mask
page_rangec           
   #     K   	 ddl m} ddlm}m}m} n"# t          $ r}	t          d          |	d}	~	ww xY w	 ddlm n"# t          $ r}	t          d          |	d}	~	ww xY w|r ||          nd}
|r ||	          nd}| j	        
                    |                    | j        |                    j        j        pd
           ||
|          d|                    fdj        j        D             E d{V  dS )a  Parses a blob lazily using online processing.

        Args:
            blob: a blob to parse.
            enable_native_pdf_parsing: enable pdf embedded text extraction
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"
            page_range: list of page numbers to parse. If `None`,
                entire document will be parsed.
        r   
documentai)IndividualPageSelector	OcrConfigProcessOptionsr*   N_text_from_layoutjdocumentai_toolbox package not found, please install it with `pip install google-cloud-documentai-toolbox`rD   )pagesapplication/pdfgcs_uri	mime_type)
ocr_configindividual_page_selectorT)namegcs_documentprocess_optionsskip_human_reviewrE   c              3      K   | ];}t           |j        j        j                  |j        j        d           V  <dS )pagesource)page_contentmetadataN)r
   layoutdocumenttextpage_numberpath).0r^   rN   r>   responses     r   	<genexpr>z-DocAIParser.online_process.<locals>.<genexpr>   su       	
 	
  ..t{H<M<RSS ,"i   	
 	
 	
 	
 	
 	
r   )google.cloudrI    google.cloud.documentai_v1.typesrJ   rK   rL   r8   -google.cloud.documentai_toolbox.wrappers.pagerN   r5   process_documentProcessRequestr4   GcsDocumentrf   mimetyperc   rQ   )r9   r>   rD   rE   rF   rI   rJ   rK   rL   r;   rV   rW   rN   rh   s    `          @@r   online_processzDocAIParser.online_processw   s     $	//////          
  	 	 	9  	
	WWWWWWW 	 	 	A  	 )II0IJJJJ 	 9CL""4444 	! <00%%)'33 I"m@/@ 4   !/)-E! ! ! #'% &  
 
	
 	
 	
 	
 	
 	
 !)/	
 	
 	
 		
 		
 		
 		
 		
 		
 		
 		
 		
s'    
727A 
A!AA!  <   blobstimeout_seccheck_in_interval_secc              #     K   |p| j         }|st          d          |                     ||          }d |D             }t                              d|           d}|                     |          rat          j        |           ||z  }||k    rt          d| d          t                              d           |                     |          a| 	                    |	          }	| 
                    |	          E d
{V  d
S )a  Parses a list of blobs lazily.

        Args:
            blobs: a list of blobs to parse.
            gcs_output_path: a path on Google Cloud Storage to store parsing results.
            timeout_sec: a timeout to wait for Document AI to complete, in seconds.
            check_in_interval_sec: an interval to wait until next check
                whether parsing operations have been completed, in seconds
        This is a long-running operation. A recommended way is to decouple
            parsing from creating LangChain Documents:
            >>> operations = parser.docai_parse(blobs, gcs_path)
            >>> parser.is_running(operations)
            You can get operations names and save them:
            >>> names = [op.operation.name for op in operations]
            And when all operations are finished, you can use their results:
            >>> operations = parser.operations_from_names(operation_names)
            >>> results = parser.get_results(operations)
            >>> docs = parser.parse_from_results(results)
        :An output path on Google Cloud Storage should be provided.rA   c                 &    g | ]}|j         j        S r   )	operationrX   rg   ops     r   
<listcomp>z+DocAIParser.batch_parse.<locals>.<listcomp>   s    BBB2<,BBBr   z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!.)
operationsN)r3   r0   docai_parseloggerdebug
is_runningtimesleepTimeoutErrorget_resultsparse_from_results)
r9   rt   r&   ru   rv   output_pathr   operation_namestime_elapsedresultss
             r   rB   zDocAIParser.batch_parse   sJ     4 &>)> 	L   %%e[%II
BBzBBBG	
 	
 	
 ooj)) 	J,---11Lk))"U_UUU   LL ooj)) 	 ""j"99**733333333333r   r   c              #      	K   	 ddl m} ddlm} ddlm n"# t          $ r}t          d          |d }~ww xY w|D ]6	 |	j                  \  }} |||          }	fd|D             E d {V  7d S )Nr   )split_gcs_uri)_get_shardsrM   rO   c              3      K   | ]@}|j         D ]6}t           |j        |j                  |j        j        d           V  7AdS r]   )rQ   r
   rb   rd   re   r   )rg   shardr^   rN   results      r   ri   z1DocAIParser.parse_from_results.<locals>.<genexpr>  s        
 !K   !2!24;
!K!K&*&6&BTUU        r   )7google.cloud.documentai_toolbox.utilities.gcs_utilitiesr   1google.cloud.documentai_toolbox.wrappers.documentr   rl   rN   r8   r   )
r9   r   r   r   r;   gcs_bucket_name
gcs_prefixshardsrN   r   s
           @@r   r   zDocAIParser.parse_from_results   s)     
	      VUUUUUWWWWWWW 	 	 	A  	
  
	 
	F*7-8J*K*K'OZ [*==F    
 $          
	 
	s    
838r   r   c                 x     	 ddl m n"# t          $ r}t          d          |d}~ww xY w fd|D             S )z5Initializes Long-Running Operations from their names.r   )GetOperationRequestzhlong running operations package not found, please install it with `pip install gapic-google-longrunning`Nc                 Z    g | ]'}j                              |                     (S ))rX   )request)r5   get_operation)rg   rX   r   r9   s     r   r}   z5DocAIParser.operations_from_names.<locals>.<listcomp>  sK     
 
 
 L&&/B/B/M/M/M&NN
 
 
r   )!google.longrunning.operations_pb2r   r8   )r9   r   r;   r   s   `  @r   operations_from_namesz!DocAIParser.operations_from_names
  s    	        	 	 	:  	
 
 
 
 
'
 
 
 	
s    
*%*r   c                 4    t          d |D                       S )Nc              3   @   K   | ]}|                                  V  d S )N)doner{   s     r   ri   z)DocAIParser.is_running.<locals>.<genexpr>  s,      66Rrwwyy=666666r   )any)r9   r   s     r   r   zDocAIParser.is_running  s    66:666666r   i  )r&   r'   
batch_sizerD   rE   r   c                   	 ddl m ddlm}m} n"# t
          $ r}	t          d          |	d}	~	ww xY w|p| j        }
|
t          d          |p| j        }|t          d          g }t          ||          D ]}
                                        fd	|D             
                    }                    j                            |
|                    }|r | ||                    nd}|                    | j                                                ||||d                               |S )a3  Runs Google Document AI PDF Batch Processing on a list of blobs.

        Args:
            blobs: a list of blobs to be parsed
            gcs_output_path: a path (folder) on GCS to store results
            processor_name: name of a Document AI processor.
            batch_size: amount of documents per batch
            enable_native_pdf_parsing: a config option for the parser
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"

        Document AI has a 1000 file limit per batch, so batches larger than that need
        to be split into multiple requests.
        Batch processing is an async long-running operation
        and results are stored in a output GCS bucket.
        r   rH   )rK   rL   r*   Nrx   z0A Document AI processor name should be provided.)sizeiterablec                 V    g | ]%}                     |j        |j        pd           &S )rR   rS   )ro   rf   rp   )rg   r>   rI   s     r   r}   z+DocAIParser.docai_parse.<locals>.<listcomp>O  sP       
 !	 #..$(I&*m&H7H /    r   )	documents)gcs_documents)rT   rE   )gcs_output_configrP   )rV   T)rX   input_documentsdocument_output_configrZ   r[   )rj   rI   rk   rK   rL   r8   r3   r0   r4   r   BatchDocumentsInputConfigGcsDocumentsDocumentOutputConfigGcsOutputConfigappendr5   batch_process_documentsBatchProcessRequest)r9   rt   r&   r'   r   rD   rE   rK   rL   r;   r   r   batchinput_configoutput_configrZ   rI   s                   @r   r   zDocAIParser.docai_parse  s   6	//////RRRRRRRRR 	 	 	9  	 &>)>L   (?4+?!OPPP
"
UCCC &	 &	E%??(55   
 %*   6   @ 
 
L ';;","A"Q"Q'J #R # # <  M -(y2K           4422+(4/<(7*. 3   
 
 
 
 s    
1,1c                 t    	 ddl m n"# t          $ r}t          d          |d }~ww xY wfd|D             S )Nr   )BatchProcessMetadatar*   c                     g | ]e}t          |j                  r|j        j        n#                    |j        j                  j        D ]}t          |j        |j                   fS ))r   r   )
isinstancera   individual_process_statusesdeserializevaluer   input_gcs_sourceoutput_gcs_destination)rg   r|   statusr   s      r   r}   z+DocAIParser.get_results.<locals>.<listcomp>~  s     
 
 

  bk+?@@.77)55K% -
 
   "3"9  
 
 
 
r   )google.cloud.documentai_v1r   r8   )r9   r   r;   r   s      @r   r   zDocAIParser.get_resultsu  s    	GGGGGGG 	 	 	9  	
 
 
 

 !
 
 
 	
s   
 
)$))TNN)Nrr   rs   )r   r   r   r   r   r   r=   r   r   r
   rC   r/   r   intrq   r   rB   r   r   r   r   r   r   r   r   r   r#   r#   &   s         >B"&)-(,8 8 8 9:8 3-	8
 "#8 !8 8 8 8t	St 	S(: 	S 	S 	S 	S +/$(*.F
 F
F
 $(F
 SM	F

 T#Y'F
 
(	F
 F
 F
 F
V *.%'/4 /4~/4 "#/4 	/4
  #/4 
(	/4 /4 /4 /4b/0	(	   4
T#Y 
4CT 
 
 
 
"7T+%6 74 7 7 7 7 *.(,*.$(U U U~U "#	U
 !U U $(U SMU 
k	U U U Un
d;&7 
DAT<U 
 
 
 
 
 
r   r#   )!r   loggingr1   r   dataclassesr   typingr   r   r   r   r   langchain_core._api.deprecationr	   langchain_core.documentsr
   langchain_core.utils.iterr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr   &langchain_community.utilities.vertexair   google.api_core.operationr   r7   r   	getLoggerr   r   r   r#   r   r   r   <module>r      s     				  ! ! ! ! ! ! D D D D D D D D D D D D D D 6 6 6 6 6 6 - - - - - - 3 3 3 3 3 3 D D D D D D B B B B B B B B B B B B G333333FFFFFF 
	8	$	$         
?  
`
 `
 `
 `
 `
. `
 `
 
`
 `
 `
r   