
    Ngw<                       d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dl m!Z!  G d d          Z" G d d          Z#d)dZ$	 	 	 d*d+d"Z%	 	 	 d,d-d(Z&dS ).    )annotationsN)PurePath)AnyBinaryIO
CollectionListOptionalUnioncast)ImageImageSequence)
TextRegion)LayoutElementLayoutElements)logger)	get_model)"UnstructuredElementExtractionModel UnstructuredObjectDetectionModel)	draw_bboxc                      e Zd ZdZddZddZedd            Zedd
            Z	e	 	 ddd            Z
e	 	 	 dd d            ZdS )!DocumentLayoutzClass for handling documents that are saved as .pdf files. For .pdf files, a
    document image analysis (DIA) model detects the layout of the page prior to extracting
    element.Nc                    || _         d S N_pages)selfpagess     c/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured_inference/inference/layout.py__init__zDocumentLayout.__init__   s        returnstrc                J    d                     d | j        D                       S )N

c                ,    g | ]}t          |          S  r"   ).0pages     r   
<listcomp>z*DocumentLayout.__str__.<locals>.<listcomp>"   s    ===$CII===r    )joinr   r   s    r   __str__zDocumentLayout.__str__!   s%    {{==$*===>>>r    List[PageLayout]c                    | j         S )z1Gets all elements from pages in sequential order.r   r,   s    r   r   zDocumentLayout.pages$   s     {r    r   c                (     |             }||_         |S )zCGenerates a new instance of the class from a list of `PageLayouts`sr   )clsr   
doc_layouts      r   
from_pageszDocumentLayout.from_pages)   s     SUU
!
r       filenamefixed_layouts*Optional[List[Optional[List[TextRegion]]]]pdf_image_dpiintc           
     v   t          j        d| d           t          j                    5 }t	          |||d          }t          t          t                   |          }t          |          }g }	|d t          d|          D             }t          t          ||                    D ]b\  }
\  }}t          j        |          5 }t          j        |f|
dz   ||d	|}|	                    |           ddd           n# 1 swxY w Y   c|                     |	          cddd           S # 1 swxY w Y   dS )
z)Creates a DocumentLayout from a pdf file.zReading PDF for file:  ...T)output_folder	path_onlyNc                    g | ]}d S r   r&   r(   _s     r   r*   z,DocumentLayout.from_file.<locals>.<listcomp>F   s     I I I! I I Ir    r      )numberdocument_filenamefixed_layout)r   infotempfileTemporaryDirectoryconvert_pdf_to_imager   r   r"   lenrange	enumeratezipr   open
PageLayout
from_imageappendr3   )r1   r5   r6   r8   kwargstemp_dir_image_pathsimage_pathsnumber_of_pagesr   i
image_pathrD   imager)   s                  r   	from_filezDocumentLayout.from_file0   s    	;X;;;<<<(** 	)h/&	  L tCy,77K!+..O&(E$ I IuQ/H/H I I I1:3{M;Z;Z1[1[ ' '--J Z
++ 'u%0 1u*2%1	 
 ! D LL&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' >>%((1	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	) 	)s6   BD./D5D.DD.D	D..D25D2detection_model*Optional[UnstructuredObjectDetectionModel]element_extraction_model,Optional[UnstructuredElementExtractionModel]rD   Optional[List[TextRegion]]c           
        t          j        d| d           	 t          j        |          }|j        }g }t          t          j        |                    D ]6\  }	}
|
                    d          }
||
_        |	                    |
           7nf# t          $ rY}t          j                            |          st          j                            |          r|t          d| d          |d}~ww xY wg }t          |          D ]2\  }	}t!          j        |f||	|||d|}|	                    |           3|                     |          S )z,Creates a DocumentLayout from an image file.zReading image file: r;   RGBzFile "z" not found!N)rW   rB   rZ   r\   rD   )r   rE   r   rM   formatrK   r   IteratorconvertrP   	ExceptionospathisdirisfileFileNotFoundErrorrN   rO   r3   )r1   r5   rZ   r\   rD   rQ   rX   ra   imagesrV   imer   r)   s                 r   from_image_filezDocumentLayout.from_image_fileU   s    	98999:::	PJx((E\F(*F"=#9%#@#@AA " "2ZZ&&"	b!!!!"  	P 	P 	Pw}}X&& P"'..*B*B P'(G(G(G(GHHaO		P
 !&)) 
	 
	HAu(# /)A)   D LL~~e$$$s   A5B 
C3AC..C3r   r!   r"   )r!   r.   )r   r.   r!   r   )Nr4   )r5   r"   r6   r7   r8   r9   r!   r   )NNN)
r5   r"   rZ   r[   r\   r]   rD   r^   r!   r   )__name__
__module____qualname____doc__r   r-   propertyr   classmethodr3   rY   rm   r&   r    r   r   r      s            ? ? ? ?    X    [  EI 	") ") ") ") [")H  GKQU37#% #% #% #% [#% #% #%r    r   c                      e Zd ZdZ	 	 	 	 	 d.d/dZd0dZ	 d1d2dZ	 	 d3d4dZd5dZ	 	 	 	 	 d6d7d'Z	d8d9d)Z
e	 	 	 	 	 	 d:d;d-            ZdS )<rN   z!Class for an individual PDF page.NrB   r9   rX   Image.Imageimage_metadataOptional[dict]rW   Optional[Union[str, PurePath]]rC   rZ   r[   r\   r]   c                    ||t          d          || _        |i }|| _        || _        d | _        || _        || _        || _        || _        g | _	        d | _
        d S )NzBOnly one of detection_model and extraction_model should be passed.)
ValueErrorrX   rw   rW   image_arrayrC   rB   rZ   r\   elementselements_array)r   rB   rX   rw   rW   rC   rZ   r\   s           r   r   zPageLayout.__init__   s|     &+C+Oabbb,1
!N,$>B!2.(@%3559r    r!   r"   c                J    d                     d | j        D                       S )Nr$   c                ,    g | ]}t          |          S r&   r'   )r(   elements     r   r*   z&PageLayout.__str__.<locals>.<listcomp>   s    FFFWCLLFFFr    )r+   r}   r,   s    r   r-   zPageLayout.__str__   s%    {{FFFFFGGGr    TOptional[List[LayoutElement]]c                    | j         t          d          | j        J |                      | j                  }|r	|| _        dS |S )zRUses end-to-end text element extraction model to extract the elements on the page.NzMCannot get elements using image extraction, no image extraction model defined)r\   r{   rX   r}   )r   inplacer}   s      r   #get_elements_using_image_extractionz.PageLayout.get_elements_using_image_extraction   s`    
 (0_   z%%%00<< 	$DM4r    Fr   bool
array_onlyc                r   | j         :t                      }t          |t                    r|| _         nt	          d          | j        J |                      | j                  }| j                             |          }|r$|| _        |s|                                | _	        dS |                                S )z8Uses specified model to detect the elements on the page.Nz)Default model should be a detection model)
rZ   r   
isinstancer   NotImplementedErrorrX   deduplicate_detected_elementsr~   as_listr}   )r   r   r   modelinferred_layouts        r   !get_elements_with_detection_modelz,PageLayout.get_elements_with_detection_model   s     'KKE%!ABB W',$$)*UVVV z%%%*.*>*>tz*J*J.LL
 
  	"1D : / 7 7 9 94&&(((r    !Union[np.ndarray[Any, Any], None]c                    | j         X| j        rt          j        | j                  | _         n2t	          j        | j                  }t          j        |          | _         | j         S )z*Converts the raw image into a numpy array.)r|   rX   nparrayr   rM   rW   )r   rX   s     r   _get_image_arrayzPageLayout._get_image_array   sV    #z 3#%8DJ#7#7  
4?33#%8E?? r    r4   colorsOptional[Union[List[str], str]]	image_dpiannotation_dataOptional[dict[str, dict]]add_detailssourcesOptional[List[str]]c           	     L   |d | j         D             }t          |t                    r|g}t          |          t          | j                   k     r,t          | j                   t          |          z  dz   }||z  }| j        r| j                                        }nB| j        rt          j        | j                  }n!| 	                    | j
        | j        |          }|:t          | j         |          D ]#\  }}	|	|j        |v rt          |||	|          }$n|                                D ]s\  }
}t!          | |
          r^t#          | |
          rN|d         }	|d         }t#          | |
          D ]-}t#          |dd          }|||v rt          |||	||          }.t|S )	a{  Annotates the elements on the page image.
        if add_details is True, and the elements contain type and source attributes, then
        the type and source will be added to the image.
        sources is a list of sources to annotate. If sources is ["all"], then all sources will be
        annotated. Current sources allowed are "yolox","detectron2_onnx" and "detectron2_lp" Nc                    g | ]}d S )redr&   r?   s     r   r*   z'PageLayout.annotate.<locals>.<listcomp>   s    333e333r    rA   )colordetailsr   widthsource)r   r   r   )r}   r   r"   rI   rX   copyrW   r   rM   
_get_imagerC   rB   rL   r   r   itemshasattrgetattr)r   r   r   r   r   r   n_copiesimgelr   	attributestyler   regionrequired_sources                  r   annotatezPageLayout.annotate   s    >33T]333Ffc"" 	XFv;;T]++++DM**c&kk9Q>Hh&F : 	R*//##CC_ 	R*T_--CC//$"8$+yQQC" 77 O O	E?bi7&:&:#C5+NNNCO %4$9$9$;$;   	54++ i0H0H !'NE!'NE")$	":": 	 	*1&(D*I*I#OG1K1K"+ # &&+&+(3# # #C 
r    r8   c                   t          j                    5 }t          j        |||d          }t	          t
          t                   |          }|t          |          k    rt          d| d          t          j
        ||dz
                     5 }|                                cddd           cddd           S # 1 swxY w Y   	 ddd           dS # 1 swxY w Y   dS )z&Hotloads a page image from a pdf file.Tdpir<   
paths_onlyzPage number z0 is greater than the number of pages in the PDF.rA   N)rF   rG   	pdf2imageconvert_from_pathr   r   r"   rI   r{   r   rM   r   )r   r5   page_numberr8   rR   rS   rT   rX   s           r   r   zPageLayout._get_image	  s    (** 	$h$6!&	  L tCy,77KS---- `;```   Ka899 $Uzz||$ $ $ $ $ $ $	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$$ $ $ $ $ $ $ $ $	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$s6   A<CB=$C=C	CC	CCCrA   rD   r^   c                    | ||||          }|j         |                                 n||                                 ng |_        |j        r|j        j        nd|j        r|j        j        nd|j        r|j        j        ndd|_        |rt          j
                            |          nd|_        |rt          j
                            |          nd|_        d|_        |S )z6Creates a PageLayout from an already-loaded PIL Image.)rB   rX   rZ   r\   N)ra   r   height)r\   r   r   r}   rX   ra   r   r   rw   re   rf   abspathrW   rC   )	r1   rX   rW   rC   rB   rZ   r\   rD   r)   s	            r   rO   zPageLayout.from_image  s    s+%=	
 
 
 (4446666!224444DM ,0:?dj''4)-=TZ%%+/:?dj''4
 

 :DM"'//*555GX!b1B!C!C!C^b 
r    )NNNNN)rB   r9   rX   rv   rw   rx   rW   ry   rC   ry   rZ   r[   r\   r]   rn   )T)r!   r   )TF)r   r   r   r   r!   r   )r!   r   )Nr4   NFN)r   r   r   r9   r   r   r   r   r   r   r!   rv   )r4   )r8   r9   r!   rv   )NNrA   NNN)rX   rv   rW   ry   rC   ry   rB   r9   rZ   r[   r\   r]   rD   r^   )ro   rp   rq   rr   r   r-   r   r   r   r   r   rt   rO   r&   r    r   rN   rN   |   s,       ++ *.59<@FJQU: : : : :8H H H H
     $  ) ) ) ) ):        3759!'+2 2 2 2 2h$ $ $ $ $&  6:<@FJQU37% % % % [% % %r    rN   datar   
model_nameOptional[str]rQ   r   r!   c                ~   t          j                    5 }t          j                            |d          }t          |d          5 }|                    |                                            |                                 ddd           n# 1 swxY w Y   t          ||fi |}ddd           n# 1 swxY w Y   |S )ztProcess PDF as file-like object `data` into a `DocumentLayout`.

    Uses the model identified by `model_name`.
    zdocument.pdfwbN)
rF   rG   re   rf   r+   rM   writereadflushprocess_file_with_model)r   r   rQ   tmp_dir_path	file_pathflayouts          r   process_data_with_modelr   E  s0    
	$	&	& 	
,GLL~>>	)T"" 	aGGDIIKK   GGIII	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 )
 
 
 
	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 Ms5   1B2<BB2B	B2B	B22B69B6Fr4   r5   r"   is_imager   r6   r7   r8   r9   c                "   t          |fi |}t          |t                    r|}d}n9t          |t                    rd}|}nt	          dt          |                     |rt          j        | f||d|nt          j        | f||||d|}	|	S )zjProcesses pdf file with name filename into a DocumentLayout by using a model identified by
    model_name.NzUnsupported model type: )rZ   r\   )rZ   r\   r6   r8   )	r   r   r   r   r{   typer   rm   rY   )
r5   r   r   r6   r8   rQ   r   rZ   r\   r   s
             r   r   r   \  s     j++F++E%9:: C#'  	E=	>	> C#(  ADKKAABBB 	
&	
+%=	
 	
 		
 	
 	
 %
+%=''
 
 
 
 " Mr    r   r<   ry   r=   #Union[List[Image.Image], List[str]]c                    |r|st          d          |t          j        | |||          }nt          j        | ||          }|S )z9Get the image renderings of the pdf pages using pdf2imagez4output_folder must be specified if path_only is trueNr   )r   r   )r{   r   r   )r5   r   r<   r=   rj   s        r   rH   rH     sx      Q QOPPP ,' 	
 
 
 , 
 
 
 Mr    )r   r   r   r   rQ   r   r!   r   )FNr4   )r5   r"   r   r   r   r   r6   r7   r8   r9   rQ   r   r!   r   )r4   NF)
r5   r"   r   r9   r<   ry   r=   r   r!   r   )'
__future__r   re   rF   pathlibr   typingr   r   r   r   r	   r
   r   numpyr   r   PILr   r   )unstructured_inference.inference.elementsr   .unstructured_inference.inference.layoutelementr   r   unstructured_inference.loggerr   "unstructured_inference.models.baser   /unstructured_inference.models.unstructuredmodelr   r    unstructured_inference.visualizer   r   rN   r   r   rH   r&   r    r   <module>r      s   " " " " " " 				        I I I I I I I I I I I I I I I I I I         $ $ $ $ $ $ $ $      Y X X X X X X X 0 0 0 0 0 0 8 8 8 8 8 8        7 6 6 6 6 6`% `% `% `% `% `% `% `%FF F F F F F F FR   4 @D% % % % %T 48	      r    