
    Ng|F                    T   d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlm Z  erd dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z) dddej*        j+        ddfdJd&Z, e d'          dddej*        j+        ddfdKd)            Z- e d'          ddej*        j+        ddfdLd0            Z. e d'          	 dMdNd5            Z/	 	 dOdPd<Z0	 dQdRdAZ1ej2        fdSdGZ3 e d'          ej2        fdTdI            Z4dS )U    )annotationsN)IOTYPE_CHECKINGAnyListOptionalcast)Image)ImageSequence)ElementType)SimpleTableCell)OCRLayoutDumper)pad_element_bboxes
valid_text)
env_config)OCRMode)OCRAgent)requires_dependencies)
TextRegion)DocumentLayout
PageLayout)LayoutElement)!UnstructuredTableTransformerModelFeng   databytes | IO[bytes]
out_layout'DocumentLayout'extracted_layoutList[List['TextRegion']]is_imageboolinfer_table_structureocr_languagesstrocr_modepdf_image_dpiintocr_layout_dumperOptional[OCRLayoutDumper]returnc	                   t          | t                    r| n|                                 }	t          j                    5 }
t
          j                            |
d          }t          |d          5 }|	                    |	           ddd           n# 1 swxY w Y   t          |||||||||	  	        }ddd           n# 1 swxY w Y   |S )a  
    Process OCR data from a given data and supplement the output DocumentLayout
    from unstructured_inference with ocr.

    Parameters:
    - data (Union[bytes, BinaryIO]): The input file data,
        which can be either bytes or a BinaryIO object.

    - out_layout (DocumentLayout): The output layout from unstructured-inference.

    - is_image (bool, optional): Indicates if the input data is an image (True) or not (False).
        Defaults to False.

    - infer_table_structure (bool, optional):  If true, extract the table content.

    - ocr_languages (str, optional): The languages for OCR processing. Defaults to "eng" (English).

    - ocr_mode (str, optional): The OCR processing mode, e.g., "entire_page" or "individual_blocks".
        Defaults to "entire_page". If choose "entire_page" OCR, OCR processes the entire image
        page and will be merged with the output layout. If choose "individual_blocks" OCR,
        OCR is performed on individual elements by cropping the image.

    - pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200.

    - ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout.

    Returns:
        DocumentLayout: The merged layout information obtained after OCR processing.
    tmp_filewbN)	filenamer   r    r"   r$   r%   r'   r(   r*   )
isinstancebytesreadtempfileTemporaryDirectoryospathjoinopenwriteprocess_file_with_ocr)r   r   r    r"   r$   r%   r'   r(   r*   
data_bytestmp_dir_pathtmp_file_pathr.   merged_layoutss                 `/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/pdf_image/ocr.pyprocess_data_with_ocrrA      sH   P $D%00AdiikkJ		$	&	& 
,\:>>-&& 	'(NN:&&&	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' /"!-"7''/

 

 


 
 
 
 
 
 
 
 
 
 
 
 
 
 
" s5   1C0BCB	CB	CCCunstructured_inferencer0   c	                   ddl m}	 g }
	 |rt          j        |           5 }|j        }t          t          j        |                    D ]t\  }}|                    d          }||_        |t          |          k     r||         nd}t          |j        |         ||||||          }|
                    |           u |	j        |
          cddd           S # 1 swxY w Y   dS t          j                    5 }t!          j        | ||d          }t%          t&          t(                   |          }t          |          D ]\  }}|t          |          k     r||         nd}t          j        |          5 }t          |j        |         ||||||          }|
                    |           ddd           n# 1 swxY w Y    |	j        |
          cddd           S # 1 swxY w Y   dS # t*          $ rY}t,          j                            |           st,          j                            |           r|t5          d|  d	          |d}~ww xY w)
a  
    Process OCR data from a given file and supplement the output DocumentLayout
    from unstructured-inference with ocr.

    Parameters:
    - filename (str): The path to the input file, which can be an image or a PDF.

    - out_layout (DocumentLayout): The output layout from unstructured-inference.

    - is_image (bool, optional): Indicates if the input data is an image (True) or not (False).
        Defaults to False.

    - infer_table_structure (bool, optional):  If true, extract the table content.

    - ocr_languages (str, optional): The languages for OCR processing. Defaults to "eng" (English).

    - ocr_mode (str, optional): The OCR processing mode, e.g., "entire_page" or "individual_blocks".
        Defaults to "entire_page". If choose "entire_page" OCR, OCR processes the entire image
        page and will be merged with the output layout. If choose "individual_blocks" OCR,
        OCR is performed on individual elements by cropping the image.

    - pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200.

    Returns:
        DocumentLayout: The merged layout information obtained after OCR processing.
    r   )r   RGBN)page_layoutimager$   r%   r'   extracted_regionsr*   T)dpioutput_folder
paths_onlyzFile "z" not found!)'unstructured_inference.inference.layoutr   PILImager9   format	enumerater   Iteratorconvertlensupplement_page_layout_with_ocrpagesappend
from_pagesr4   r5   	pdf2imageconvert_from_pathr	   r   r&   	Exceptionr6   r7   isdirisfileFileNotFoundError)r0   r   r    r"   r$   r%   r'   r(   r*   r   merged_page_layoutsimagesimage_formatirF   rG   merged_page_layouttemp_dir_image_pathsimage_paths
image_pathes                         r@   r;   r;   \   s   N GFFFFF,..L (	Fx(( FF%} )-*@*H*H I I C CHAu!MM%00E#/EL?@3GWCXCX?X?X(8(;(;^b%)H$.$4Q$7#.C&3!)*;*;* * *& (../ABBBB0~01DEE!F F F F F F F F F F F F F F F F F F$ ,.. F((:%"*#	      #49l;;%.{%;%; G GMAz?@3GWCXCX?X?X(8(;(;^b%!z22 
Ge-L(2(8(;"'2G*7%-.?.?. . .* ,223EFFF
G 
G 
G 
G 
G 
G 
G 
G 
G 
G 
G 
G 
G 
G 
G 1~01DEE+F F F F F F F F F F F F F F F F F F,  L L L7=="" 	LbgnnX&>&> 	LG#$CX$C$C$CDD!K	Ls   G0 B-CG0 CG0 !C"G0 'G0 :A>G#87F;/G#;F??G#F?G#G0 #G''G0 *G'+G0 0
I:AIIrE   'PageLayout'rF   PILImage.ImagerG   Optional[List['TextRegion']]c                l   t          j        |          }|t          j        j        k    rf|                    |          }|r|                    |           t          t          t          d         | j
                  |          | j
        dd<   n|t          j        j        k    r| j
        D ]}	|	j        szt          j        }
t          |	|
          }|                    |j        j        |j        j        |j        j        |j        j        f          }|                    |          }||	_        nt/          d          |rqddlm} |                                 |j        t9          d	          t;          t          t          d         | j
                  ||j        ||
          | j
        dd<   | S )a6  
    Supplement an PageLayout with OCR results depending on OCR mode.
    If mode is "entire_page", we get the OCR layout for the entire image and
    merge it with PageLayout.
    If mode is "individual_blocks", we find the elements from PageLayout
    with no text and add text from OCR to each element.
    )languager   )r   
ocr_layoutNpaddingz[Invalid OCR mode. Parameter `ocr_mode` must be set to `entire_page` or `individual_blocks`.r   )tablesz&Unable to load table extraction agent.)elementsrF   tables_agent	ocr_agentrG   )r   	get_agentr   	FULL_PAGEvalueget_layout_from_imageadd_ocred_page merge_out_layout_with_ocr_layoutr	   r   ro   INDIVIDUAL_BLOCKStextr   IMAGE_CROP_PADr   cropbboxx1y1x2y2get_text_from_image
ValueErrorunstructured_inference.modelsrn   
load_agentrp   RuntimeError(supplement_element_with_table_extraction)rE   rF   r$   r%   r'   rG   r*   rq   rk   elementrm   padded_elementcropped_imagetext_from_ocrrn   s                  r@   rR   rR      s   $ "M:::I7$***44U;;
 	9,,Z888"BD1;3GHH!#
 #
 #
QQQ 
W.4	4	4"+ 	- 	-G< -$3!3GW!M!M!M %

&+.&+.&+.&+.	! ! !* = =m L L,	-" C
 
 	
  
888888&GHHH"J$/1EFF,/#
 #
 #
QQQ     ro   List['LayoutElement']rp   #'UnstructuredTableTransformerModel'c                   ddl m} d | D             }|D ]}t          j        }t	          ||          }	|                    |	j        j        |	j        j        |	j        j	        |	j        j
        f          }
t          |
|||	          }|                    |
|d          }|dk    rdn
 ||          }||_        t          j        rd	 |D             }||_        | S )
aH  Supplement the existing layout with table extraction. Any Table elements
    that are extracted will have a metadata fields "text_as_html" where
    the table's text content is rendered into a html string and "table_as_cells"
    with the raw table cells output from table agent if env_config.EXTRACT_TABLE_AS_CELLS is True
    r   )cells_to_htmlc                <    g | ]}|j         t          j        k    |S  )typer   TABLE).0els     r@   
<listcomp>z<supplement_element_with_table_extraction.<locals>.<listcomp>  s'    LLLRrw+:K/K/Kb/K/K/Kr   rl   )table_element_imagerq   rG   table_elementcells)
ocr_tokensresult_format c                Z    g | ](}t          j        |                                          )S r   )r   from_table_transformer_cellto_dict)r   cells     r@   r   z<supplement_element_with_table_extraction.<locals>.<listcomp>&  s<     " " "PT;DAAIIKK" " "r   )$unstructured_inference.models.tablesr   r   TABLE_IMAGE_CROP_PADr   r{   r|   r}   r~   r   r   get_table_tokenspredicttext_as_htmlEXTRACT_TABLE_AS_CELLStable_as_cells)ro   rF   rp   rq   rG   r   table_elementsr   rm   r   r   table_tokens
tatr_cellsr   simple_table_cellss                  r@   r   r      s5    CBBBBBLL8LLLN! 8 81+GWEEE

#&#&#&#&	
 
 ( -/(	
 
 
 "))l' * 
 


 (2--rr==3L3L+, 	8" "Xb" " " &8G"Or   r   rq   r   r   Optional['LayoutElement']List[dict[str, Any]]c                2   |                     |           }g }|D ]K}|                    |j        j        |j        j        |j        j        |j        j        g|j        d           Lt          |          D ] \  }}d|vr||d<   d|vrd|d<   d|vrd|d<   !|S )z1Get OCR tokens from either paddleocr or tesseract)rF   )r|   ry   span_numline_numr   	block_num)	ru   rT   r|   r}   r~   r   r   ry   rN   )	r   rq   rG   r   rk   r   
ocr_regionidxtokens	            r@   r   r   .  s     007J0KKJL  
 

 O&O&O&O&	 # 
	
 
	
 
	
 
	
  -- # #
UU"" #E*U"" !E*e##!"E+r   Trk   List['TextRegion']supplement_with_ocr_elementsc                z    d | D             }|D ]}t          ||          |_        |rt          | |          n| }|S )a  
    Merge the out layout with the OCR-detected text regions on page level.

    This function iterates over each out layout element and aggregates the associated text from
    the OCR layout using the specified threshold. The out layout's text attribute is then updated
    with this aggregated text. If `supplement_with_ocr_elements` is `True`, the out layout will be
    supplemented with the OCR layout.
    c                :    g | ]}t          |j                  |S r   )r   ry   )r   regions     r@   r   z4merge_out_layout_with_ocr_layout.<locals>.<listcomp>`  s)    ___6zRXR]G^G^____r   )aggregate_ocr_text_by_blockry   #supplement_layout_with_ocr_elements)r   rk   r   out_regions_without_text
out_regionfinal_layouts         r@   rw   rw   R  sk      `_Z___. 
 

5
 

 (	+J
CCC  r   r   'TextRegion'subregion_thresholdfloatOptional[str]c                    g }| D ]E}|j                             |j         |          }|r!|j        r|                    |j                   F|rd                    |          ndS )zdExtracts the text aggregated from the regions of the ocr layout that lie within the given
    block. r   )r|   is_almost_subregion_ofry   rT   r8   )rk   r   r   extracted_textsr   'ocr_region_is_subregion_of_given_regions         r@   r   r   q  s~     O  4 4
2</2X2XK3
 3
/ 3 	4z 	4"":?333(7?388O$$$R?r   layoutc                   
 ddl m} g 
|D ]@}| D ];}|j                            |j        |          }|r
                    |            n<A
fd|D             }|r ||          }| |z   }	n| }	|	S )a5  
    Supplement the existing layout with additional OCR-derived elements.

    This function takes two lists: one list of pre-existing layout elements (`layout`)
    and another list of OCR-detected text regions (`ocr_layout`). It identifies OCR regions
    that are subregions of the elements in the existing layout and removes them from the
    OCR-derived list. Then, it appends the remaining OCR-derived regions to the existing layout.

    Parameters:
    - layout (List[LayoutElement]): A list of existing layout elements, each of which is
                                    an instance of `LayoutElement`.
    - ocr_layout (List[TextRegion]): A list of OCR-derived text regions, each of which is
                                     an instance of `TextRegion`.

    Returns:
    - List[LayoutElement]: The final combined layout consisting of both the original layout
                           elements and the new OCR-derived elements.

    Note:
    - The function relies on `is_almost_subregion_of()` method to determine if an OCR region
      is a subregion of an existing layout element.
    - It also relies on `build_layout_elements_from_ocr_regions()` to convert OCR regions to
     layout elements.
    - The env_config `OCR_LAYOUT_SUBREGION_THRESHOLD` is used to specify the subregion matching
     threshold.
    r   )&build_layout_elements_from_ocr_regionsc                    g | ]}|v|	S r   r   )r   r   ocr_regions_to_removes     r@   r   z7supplement_layout_with_ocr_elements.<locals>.<listcomp>  s$    aaaVVK`=`=`&=`=`=`r   )0unstructured.partition.pdf_image.inference_utilsr   r|   r   rT   )r   rk   r   r   r   r   !ocr_region_is_subregion_of_out_elocr_regions_to_addocr_elements_to_addr   r   s             @r@   r   r     s    B      /1   
 	 	B0:0V0V#1 1- 1 %,,Z888 baaazaaa DDEWXX 33r   )r   r   r   r   r    r!   r"   r#   r$   r#   r%   r&   r'   r&   r(   r)   r*   r+   r,   r   )r0   r&   r   r   r    r!   r"   r#   r$   r#   r%   r&   r'   r&   r(   r)   r*   r+   r,   r   )rE   rf   rF   rg   r$   r#   r%   r&   r'   r&   rG   rh   r*   r+   r,   rf   )N)
ro   r   rF   rg   rp   r   rG   rh   r,   r   )NN)
r   rg   rq   r   rG   rh   r   r   r,   r   )T)r   r   rk   r   r   r#   r,   r   )rk   r   r   r   r   r   r,   r   )r   r   rk   r   r   r   r,   r   )5
__future__r   r6   r4   typingr   r   r   r   r   r	   rV   PILr
   rL   r   unstructured.documents.elementsr   (unstructured.metrics.table.table_formatsr   5unstructured.partition.pdf_image.analysis.layout_dumpr   0unstructured.partition.pdf_image.pdf_image_utilsr   r   #unstructured.partition.utils.configr   &unstructured.partition.utils.constantsr   5unstructured.partition.utils.ocr_models.ocr_interfacer   unstructured.utilsr   )unstructured_inference.inference.elementsr   rK   r   r   .unstructured_inference.inference.layoutelementr   r   r   rs   rt   rA   r;   rR   r   r   rw   OCR_LAYOUT_SUBREGION_THRESHOLDr   r   r   r   r@   <module>r      s5   " " " " " " 				  ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?     " ! ! ! ! !       7 7 7 7 7 7 D D D D D D Q Q Q Q Q Q [ [ [ [ [ [ [ [ : : : : : : : : : : : : J J J J J J 4 4 4 4 4 4 WDDDDDDRRRRRRRRLLLLLLVVVVVV "'%+37; ; ; ; ;| /00
 "'%+37WL WL WL WL 10WLt /00 #(%+6:37A A A A 10AH /00 7;. . . . 10.h 7;/3	! ! ! ! !N *.    D ",!J@ @ @ @ @* /00 ",!J6 6 6 6 106 6 6r   