
    Ng                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZmZmZ d dlmZ d dlmZ d d	lmZ  d d
l!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3mZm4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZAmBZB d dlCmDZD d dlEmFZFmGZGmHZHmIZImJZJmKZK d dlLmMZMmNZNmOZO d dlPmQZQ d dlRmSZSmTZTmUZUmVZV d dlWmXZX d dlYmZZZ d dl[m\Z\m]Z]m^Z^ d dl_m`Z`maZambZbmcZcmdZdmeZemfZf d dlgmhZhmiZi d dljmkZkmlZl d d lmmnZn d d!lompZp d d"lqmrZrmsZsmtZtmuZumvZvmwZw d d#lxmyZymzZz d d$l{m|Z| d d%l}m~Z~mZ er	 e|ej        _         ej        d&ej        '          Z ed(          dd+            Z e9             e=e?j                  e(ddd,ewj        d,ddddddd,ddd,d-d,d.fddI                                    ZdJdd,d,ewj        d,dddd,ddd,d-d,d.fddMZ	 	 	 	 	 dddOZ	 dddQZ edR          epj        d-fddW            Z	 	 dddXZ	 	 	 ddd[Z ed(          dJdd,d,d,ddevj        j        ddddd,d,ddd,d,dd-d,d.dfddc            Zd,eufddfZ	 	 	 	 	 	 	 	 dddgZddd-d,deufddkZddmZddpZ ej        dqdr          ds             ZddvZdd|ZddZ	 	 dddZd,d,dd.ddeudd-df
ddZdS )    )annotationsN)Path)IOTYPE_CHECKINGAnyOptionalcast)psparser)LTContainerLTImageLTItem	LTTextBox)open_filename)register_heif_opener)Image)	PdfReader)DocumentLayout)LayoutElement)add_chunking_strategy)%clean_extra_whitespace_with_index_run-index_adjustment_after_clean_extra_whitespace)
PixelSpace
PointSpace)CoordinatesMetadataElementElementMetadataElementTyper   LinkListItem	PageBreakTextTitleprocess_metadata)PageCountExceededError)add_metadata_with_filetype)FileType)loggertrace_logger)PARAGRAPH_PATTERN)add_element_metadataexactly_oneget_page_image_metadatanormalize_layout_elementocr_data_to_elementsspooled_to_bytes_io_if_needed)check_language_argsprepare_languages_for_tesseracttesseract_to_paddle_language)get_last_modified_date)ExtractedLayoutDumperFinalLayoutDumperObjectDetectionLayoutDumperOCRLayoutDumper)save_analysis_artifiacts)run_form_extraction)check_element_types_to_extractconvert_pdf_to_imagessave_elements) check_annotations_within_elementclean_pdfminer_inner_elementsget_links_in_elementget_urisget_words_from_objmap_bbox_and_index$merge_inferred_with_extracted_layout)open_pdfminer_pages_generatorrect_to_bbox)determine_pdf_or_image_strategyvalidate_strategy)element_from_text)
env_config)OCR_AGENT_PADDLESORT_MODE_BASICSORT_MODE_DONTSORT_MODE_XY_CUTOCRModePartitionStrategy)coord_has_valid_pointssort_page_elements)parse_keyword)firstrequires_dependenciesz\s+)patternflagsunstructured_inferencereturnstrc                 N    ddl m}  t          j                            d|           S )Nr   DEFAULT_MODELUNSTRUCTURED_HI_RES_MODEL_NAME)"unstructured_inference.models.baser\   osenvirongetr[   s    V/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/pdf.pydefault_hi_res_modelrc   p   s.     A@@@@@:>>:MJJJ    F   TfilenameOptional[str]fileOptional[IO[bytes]]include_page_breaksboolstrategyinfer_table_structureocr_languages	languagesOptional[list[str]]metadata_filenamemetadata_last_modifiedchunking_strategyhi_res_model_nameextract_images_in_pdfextract_image_block_typesextract_image_block_output_dirextract_image_block_to_payloadstarting_page_numberintextract_formsform_extraction_skip_tableskwargsr   list[Element]c                    t          | |           t          |pg |          }t          d| |||||||
|||||||d|S )a  Parses a pdf document into a list of interpreted elements.
    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object as bytes --> open(filename, "rb").
    strategy
        The strategy to use for partitioning the PDF. Valid strategies are "hi_res",
        "ocr_only", and "fast". When using the "hi_res" strategy, the function uses
        a layout detection model to identify document elements. When using the
        "ocr_only" strategy, partition_pdf simply extracts the text from the
        document using OCR and processes it. If the "fast" strategy is used, the text
        is extracted directly from the PDF. The default strategy `auto` will determine
        when a page can be extracted using `fast` mode, otherwise it will fall back to `hi_res`.
    infer_table_structure
        Only applicable if `strategy=hi_res`.
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    languages
        The languages present in the document, for use in partitioning and/or OCR. To use a language
        with Tesseract, you'll first need to install the appropriate Tesseract language pack.
    metadata_last_modified
        The last modified date for the document.
    hi_res_model_name
        The layout detection model used when partitioning strategy is set to `hi_res`.
    extract_images_in_pdf
        Only applicable if `strategy=hi_res`.
        If True, any detected images will be saved in the path specified by
        'extract_image_block_output_dir' or stored as base64 encoded data within metadata fields.
        Deprecation Note: This parameter is marked for deprecation. Future versions will use
        'extract_image_block_types' for broader extraction capabilities.
    extract_image_block_types
        Only applicable if `strategy=hi_res`.
        Images of the element type(s) specified in this list (e.g., ["Image", "Table"]) will be
        saved in the path specified by 'extract_image_block_output_dir' or stored as base64
        encoded data within metadata fields.
    extract_image_block_to_payload
        Only applicable if `strategy=hi_res`.
        If True, images of the element type(s) defined in 'extract_image_block_types' will be
        encoded as base64 data and stored in two metadata fields: 'image_base64' and
        'image_mime_type'.
        This parameter facilitates the inclusion of element data directly within the payload,
        especially for web-based applications or APIs.
    extract_image_block_output_dir
        Only applicable if `strategy=hi_res` and `extract_image_block_to_payload=False`.
        The filesystem path for saving images of the element type(s)
        specified in 'extract_image_block_types'.
    extract_forms
        Whether the form extraction logic should be run
        (results in adding FormKeysValues elements to output).
    form_extraction_skip_tables
        Whether the form extraction logic should ignore regions designated as Tables.
    rf   rh   )rf   rh   rj   rl   rm   ro   rr   rt   ru   rv   rw   rx   ry   r{   r|    )r+   r0   partition_pdf_or_image)rf   rh   rj   rl   rm   rn   ro   rq   rr   rs   rt   ru   rv   rw   rx   ry   r{   r|   r}   s                      rb   partition_pdfr   {   s~    d ----#IO]CCI! /35+3";'E'E1#$?   !  rd    Optional[bytes | IO[bytes]]is_imagec                   |dg}t                       t          ||           | rt          |           nd}g }d}|s{	 t          d| t	          |          ||p||d|}t          d |D                       }n?# t          $ r2}t          j        |           t          j	        d           Y d}~nd}~ww xY wt          |||||	|
          }||                    d           t          |          }t          j        t          k    rt!          |          }|t"          j        k    rt'          j                    5  t'          j        d	           t-          di d
| dt	          |          d|d|d|d|d|d|p|d|d|d|	d|
d|d|d|d|d||}t/          |          }ddd           n# 1 swxY w Y   n|t"          j        k    rt3          d||d|}|S |t"          j        k    rQt'          j                    5  t7          d| ||||||p||d|}t/          |          }ddd           n# 1 swxY w Y   |S )zCParses a pdf or image document into a list of interpreted elements.NengFrf   rh   ro   rr   ry   c              3  |   K   | ]7}|D ]2}t          |t                    o|j                                        V  38d S N)
isinstancer!   textstrip).0page_elementsels      rb   	<genexpr>z)partition_pdf_or_image.<locals>.<genexpr>  sf       ' '!'' '  2t$$8' ' ' ' ' ' 'rd   z3PDF text extraction failed, skip text extraction...)r   pdf_text_extractablerm   ru   rv   r   ignorerf   rh   r   rm   rj   ro   rn   rr   rt   r   ru   rv   rw   rx   ry   r{   r|   )extracted_elementsrj   )rf   rh   rj   ro   rn   r   rr   ry   r   )r   rG   r3   extractable_elementsr/   any	Exceptionr'   debuginforF   seekr1   rI   	OCR_AGENTrJ   r2   rO   HI_RESwarningscatch_warningssimplefilter_partition_pdf_or_image_local$_process_uncategorized_text_elementsFAST_partition_pdf_with_pdfparserOCR_ONLY _partition_pdf_or_image_with_ocr)rf   rh   r   rj   rl   rm   ro   rr   rt   ru   rv   rw   rx   ry   r{   r|   r}   last_modifiedr   r   ern   elementsout_elementss                           rb   r   r      sP   2 G	 h)))8@J*8444dM  O	O!5 "!2488#'='N%9" " " " $' ' '%7' ' ' $ $  
  	O 	O 	OLOOOKMNNNNNNNN	O /133";  H 		!3I>>M///4]CC$+++$&& 	J 	J!(+++4   !24888 " '<&;	
 %8$7 $) ,m (>'N #4"3 &:%9 '<&; +D*C 0N/M 0N/M &:%9  ,m!" -H,G% H( @IIL-	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J0 
&+	+	+4 
1 3
 
 
 
 	&/	/	/$&& 	J 	J7 
!$7#+!'='N%9
 
 
 
H @IIL	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J s<   :A9 9
B5(B00B5:A2F88F<?F<'H;;H?H?list[list[Element]]c           	     x    t          |t                    rt          j        |          }t	          d| ||||d|S )Nr   r   )r   bytesioBytesIO_partition_pdf_with_pdfminer)rf   rh   ro   rr   ry   r}   s         rb   r   r   c  sZ     $  z$' 51    rd   	list[str]c           
        |dg}t          | |           | r[t          | d          5 }t          t          t                   |          }t          d|| |||d|}ddd           n# 1 swxY w Y   n|rt          d|| |||d|}|S )a  Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
    processing or detectron2 is not available.

    Implementation is based on the `extract_text` implemenation in pdfminer.six, but
    modified to support tracking page numbers and working with file-like objects.

    ref: https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py
    Nr   r   rb)fprf   ro   rr   ry   r   )r+   r   r	   r   r   _process_pdfminer_pages)rf   rh   ro   rr   ry   r}   r   r   s           rb   r   r   w  s     G	---- 
8T** 		bbi$$B. !#'=%9   H		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 
 
* 
#9!5
 
 
 
 Os   3A((A,/A,pdfminerr   	IO[bytes]annotation_thresholdOptional[float]c                P   g }t          t          |           |          D ]\  }\  }	}
|
j        |
j        }}g }g }t	          ||          }|	j        rt          |	j        |||          }|
D ]}t          |j        |          \  }}}}||||f}g }t          |          dk    rbt          |t                    rMt          ||||          }t          ||          \  }}|D ]%}|                    t          ||                     &t!          |d          r|                                g}n)t%          |          }t'          j        t*          |          }|D ]}t-          |          \  }}|                                r|||f||f||f||ff}t1          |||          }t3          ||          } t5          ||          }!t7          ||| ||!|          |_        d|j        _        |                    |           t=          ||          }|                    |           |S )	z>Uses PDFMiner to split a document into pages and process them.startwidthheightr   get_text)coordinatescoordinate_systempointssystem)rf   page_numberr   r   linksro   r   )	enumeraterD   r   r   r   annotsr@   rE   bboxlenr   r   r=   rA   appendrB   hasattrr   _extract_textresplitr)   r   r   rH   r   _get_links_from_urls_metadatar   metadatadetection_origin_combine_list_elements)"r   rf   ro   rr   r   ry   r}   r   r   pagepage_layoutr   r   r   annotation_listr   objx1y1x2y2r   urls_metadataannotations_within_element_wordsannot_text_snippets_textmoved_indicesr   elementcoordinates_metadatar   s"                                     rb   r   r     s    H,5%b))1E- - - A' A'((dK $);+=v')&
 
 
 ; 	\&t{F<M{[[O /	2 /	2C)#(F;;NBBB#D24M?##a''JsI,F,F'-M#(	. .* .c6::57 K KE!(();E5)I)IJJJJsJ'' D-0\\^^,<%c**!#*;U!C!C' 2 2'LU'S'S$};;== 2!2hR2r(RHEF/$**;  G
 ,?%0, , ,( :-WWE'6!)$/$8&<#"+( ( (G$ 9CG$5!((111124 /}>OPP&&&&Ord   c                    |r7t          |                                          }|                    d           n3| r"t          |                                           }nt          d          |S )Nr   z-Either 'file' or 'filename' must be provided.)r   get_num_pagesr   
ValueError)rf   rh   number_of_pagess      rb   _get_pdf_page_numberr     sn      J#D//7799		!	 J#H--;;==HIIIrd   pdf_hi_res_max_pagesNonec                ^    |r(t          | |          }||k    rt          ||          dS dS )z6Checks whether PDF exceeds pdf_hi_res_max_pages limit.r   )document_pagesr   N)r   r$   )rf   rh   r   r   s       rb   #check_pdf_hi_res_max_pages_exceededr     sW      -xdKKK000(-DX    00rd   ocr_mode
model_namepdf_image_dpiOptional[int]r   analysisanalyzed_image_output_dir_pathc                	   ddl m}m} ddlm}m} ddlm}m} |st          | ||           |	p|pt                      }	|
d}
d}d}d} d}!t          j        }"| || ||	|
          }#|r || |
	          ng g f\  }$}%|r|sYt          j        r*t          t          t          j                  d
z            }n#t          t          j                    d
z            }t%          j        |d           |"s/t)          |#|	          }t+          |$          }t-                      } t/          |#|$|	          }& || |&|$|||||
| 	  	        }'n# ||||	|
          }#t1          |d          r|                    d           |r |||
          ng g f\  }$}%|r|sYt          j        r*t          t          t          j                  d
z            }n#t          t          j                    d
z            }|"s/t)          |#|	          }t+          |$          }t-                      } t/          |#|$|	          }&t1          |d          r|                    d            |||&|$|||||
| 	  	        }'t5          |'          }'|'j        D ]}(|(j        D ]})|)j        pd|)_        t=          |'fd||d|||%d|}*t?          |          }|r"tA          |*|tB          j"        | |||
||	  	         |D ]-}+|r|+tB          j"        k    rtA          |*||+| |||
||	  	         .g },|*D ]})tG          |)tH                    r|stG          |)tJ                    r)|,&                    tO          tP          |)                     XtG          |)tR                    r}tU          j+        tX          d|)j        pd          -                                |)_        |)j        stG          |)tH                    r(|,&                    tO          tP          |)                     |r)t]          || |	|,|          }-|,/                    |-           |r|"sta          |,          }!g }.|r|.&                    |           |r|.&                    |           | r|.&                    |            |!r|.&                    |!           tc          |.| |||t          j2        t          j        t          j3        t          j4        t          j5        t          j6        d
 |,S )z)Partition using package installed locallyr   )process_data_with_modelprocess_file_with_model)process_data_with_ocrprocess_file_with_ocr)process_data_with_pdfminerprocess_file_with_pdfminer)rf   rh   r   N   )r   r   r   )rf   dpi	annotatedT)exist_ok)layoutr   )r  )inferred_document_layoutextracted_layoutrt   )r  r   rm   rn   r   r   ocr_layout_dumperr   )rh   r   r   F)sortablerj   last_modification_dateinfer_list_itemsro   ry   layouts_links)	r   ry   element_category_to_saverf   rh   r   r   rx   output_dir_path )rh   rf   r   r   skip_table_regions)
rf   rh   r   r   skip_bboxesskip_dump_od	draw_griddraw_captionresizeformat)7'unstructured_inference.inference.layoutr   r   $unstructured.partition.pdf_image.ocrr   r   4unstructured.partition.pdf_image.pdfminer_processingr   r   r   rc   rI   ANALYSIS_DUMP_OD_SKIPGLOBAL_WORKING_DIR_ENABLEDrY   r   GLOBAL_WORKING_PROCESS_DIRcwdr_   makedirsr6   r4   r7   rC   r   r   r>   pagesr   r   document_to_element_listr:   r<   r   IMAGEr   r    r   r   r	   r   r!   r   sub RE_MULTISPACE_INCLUDING_NEWLINESr   r9   extendr5   r8   ANALYSIS_BBOX_SKIPANALYSIS_BBOX_DRAW_GRIDANALYSIS_BBOX_DRAW_CAPTIONANALYSIS_BBOX_RESIZEANALYSIS_BBOX_FORMAT)/rf   rh   r   rm   rj   ro   rn   r   r   rt   r   rr   r   ru   rv   rw   rx   r   r   ry   r{   r|   r   r}   r   r   r   r   r   r   od_model_layout_dumperextracted_layout_dumperr  final_layout_dumperskip_analysis_dumpr  r  r	  merged_document_layoutfinal_document_layoutr   r   r   el_typer   formslayout_dumperss/                                                  rb   r   r     s;   8       
 baaaaaaa       
  
+D?S	
 	
 	
 	
 *QZQ;O;Q;QDH?C377;#9|#:#:('	$
 $
 $
  $&&mLLLLb 	(-  	61 S8 S58ZBCCkQ6 622 69k9Q5R5R2K6FFFF% 6)D30* * *& +@++ + +' %4$5$5!!E%=-/"
 "
 "
 !6 5"-"7''/
!
 
!
 
!
 $;#:('	$
 $
 $
  4   	IIaLLL $&&DmDDDDb 	(-  	61 S8 S58ZBCCkQ6 622 69k9Q5R5R2% 6)D30* * *& +@++ + +' %4$5$5! "F%=-/"
 "
 "
 4   	IIaLLL 5 5"-"7''/
!
 
!
 
!
 ::OPP%+ $ $- 	$ 	$BgmBGG	$ (/5 1#   H !??X Y Y  
!5%0%6'+I:
	
 
	
 
	
 
	
 - 
 
  	W0A%A%A!5%,'+I:
	
 
	
 
	
 
	
 
	
 L 7 7b)$$ 	-@ 	b%   
	7Wb 1 12222D!! 	7f02  egg	 G
 w 7*R33 7##D"$5$5666 ##(!:
 
 
 	E""" 
! 	"3## # # ! 	:!!"8999" 	;!!"9::: 	5!!"3444 	7!!"5666 +I"5#9 8#>22	
 	
 	
 	
 rd   r   	sort_modec                    g }| D ]\}t          |t                    }|t          k    rt          ||          }||z  }|r#|                    t          d                     ]|S )z!Partitions a PDF using pdfparser.r   r   )rQ   rK   r   r    )r   rj   r0  r}   r   r   sorted_page_elementss          rb   r   r   $  s|     H+ 
0 
0  2-QQ''#56JI#V#V (( 	0OOI2...///Ord   c                   g }	|rg }
|t          j        |          nt          j        |           }|
                    |           t          |
|          D ]-\  }}t	          d||||||d|}|	                    |           .nMt          t          | |          |          D ]-\  }}t	          d||||||d|}|	                    |           .|	S )zkPartitions an image or PDF using OCR. For PDFs, each page is converted
    to an image prior to processing.Nr   )imagero   rn   r   rj   rr   r   )PILImageopenr   r   +_partition_pdf_or_image_with_ocr_from_imager!  r;   )rf   rh   rj   ro   rn   r   rr   ry   r}   r   imagesr5  r   r   s                 rb   r   r   <  sG    H +'+'7d###X]8=T=Te"+F:N"O"O"O 
	+ 
	+KG #+'$7'=   M OOM****
	+ #,!(D119M#
 #
 #
 	+ 	+K H #+'$7'=   M OOM****Ord   r5  PILImage.Imager   c                   ddl m} |                    |          }	|	                                rt          }|	                    |           }
t          || j        ||          }t          |
| j	        |          }|}|t          k    rt          ||          }|r#|                    t          d                     |S )	zQExtract `unstructured` elements from an image using OCR and perform partitioning.r   )OCRAgent)language)r5  )r   filetyper   ro   )
image_sizecommon_metadatar   r2  )5unstructured.partition.utils.ocr_models.ocr_interfacer<  	get_agentis_text_sortedrL   get_layout_elements_from_imager   r  r.   sizerQ   r   r    )r5  ro   rn   r   rj   rr   r0  r}   r<  	ocr_agentocr_datar   r   r3  s                 rb   r8  r8  m  s     ONNNNN""M"::I !! #"	77e7DDH,	  H ):   M )N""1-KK 8##I2$6$6$6777rd   r   c                    g }| D ]r}t          |d          rI|j        t          j        k    r4t	          t          t          |          j                  }|j        |_        n|}|	                    |           s|S )zProcesses a list of elements, creating a new list where elements with the
    category `UncategorizedText` are replaced with corresponding
    elements created from their text content.category)
r   rI  r   UNCATEGORIZED_TEXTrH   r	   r!   r   r   r   )r   r   r   new_els       rb   r   r     s~    
 L $ $2z"" 	r{k6T'T'T&tD"~~':;;F kFOOFF####rd   itemr   c                    t          | d          r|                                 S t          | t                    rd}| D ]}|t	          |          pdz  }|S t          | t
          t          f          rdS dS )zrRecursively extracts text from PDFMiner objects to account
    for scenarios where the text is in a sub-container.r   r   
)r   r   r   r   r   r   r   )rL  r   childs      rb   r   r     s     tZ   }}	D+	&	& 	 	/ 	/EM%((.B.DD	D9g.	/	/  t4rd   zpdfminer.pdfinterpz!PDFPageInterpreter.init_resourcesc                6    |d         }d|v r|d=  | |          S )Nr   
ColorSpacer   )wrappedinstanceargsr}   	resourcess        rb   #pdfminer_interpreter_init_resourcesrV    s.    QIy  l#79rd   r   PixelSpace | PointSpacec                @   d}g }| D ]}t          |t                    r|}|j        }|j        j        }nT|rRt          |j        j        |          r7| d|j         |_        t          |||          }|                                 |                    |           |S )zECombine elements that should be considered a single ListItem element.N)r   boundaryr  )element1element2r   )	r   r   r   r   r   check_coords_within_boundary"_combine_coordinates_into_element1popr   )r   r   tmp_elementupdated_elementsr   tmp_text
tmp_coordss          rb   r   r     s     K&( ) )gx(( 	#!K|H )5JJ 	#9(4
 
 
 	# #+;;W\;;K8$ "3  G   """((((rd   r   list[dict[str, Any]]r   
np.ndarray
list[Link]c           
         g }| D ]l}t          j        t                    5  |                    |d         |d         t	          |d         |          d           ddd           n# 1 swxY w Y   m|S )z+Extracts links from a list of URL metadata.r   uristart_index)r   urlrh  N)
contextlibsuppress
IndexErrorr   r   )r   r   r   ri  s       rb   r   r     s     E   ,, 
	 
	LLKu:#PM*%$ $ 	 	 	
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 Ls   :A&&A*	-A*	rZ  r   r[  c                   t          | j        j        j        d         d         |j        j        j        d         d                   }t	          | j        j        j        d         d         |j        j        j        d         d                   }t          | j        j        j        d         d         |j        j        j        d         d                   }t	          | j        j        j        d         d         |j        j        j        d         d                   }||f||f||f||ff}t          ||          | j        _        t          j        |           S )zXCombine the coordiantes of two elements and apply the updated coordiantes to `elements1`r      re   r   )minr   r   r   maxr   copydeepcopy)rZ  r[  r   r   r   r   r   r   s           rb   r]  r]    sH    
%,Q/2%,Q/2
 
B 
%,Q/2%,Q/2
 
B 
%,Q/2%,Q/2
 
B 
%,Q/2%,Q/2
 
B 2hR2r(RH5F$7 % % %H! ="""rd   皙?333333?r   r   rY  horizontal_thresholdfloatvertical_thresholdc                B   t          |           s,t          |          st          j        d|  d| d           dS |j        d         d         }|j        d         d         }|j        d         d         }|j        d         d         }||z
  }||z
  }	| j        d         d         |||z  z
  k    o3| j        d         d         |||z  z   k     o| j        d         d         |k    }
| j        d         d         |||	z  z   k     o| j        d         d         |||	z  z
  k    }|
o|S )a  Checks if the coordinates are within boundary thresholds.
    Parameters
    ----------
    coordinates
        a CoordinatesMetadata input
    boundary
        a CoordinatesMetadata to compare against
    vertical_threshold
        a float ranges from [0,1] to scale the vertical (y-axis) boundary
    horizontal_threshold
        a float ranges from [0,1] to scale the horizontal (x-axis) boundary
    zcoordinates z and boundary z did not pass validationFr   rn  re   )rP   r(   detailr   )r   rY  ru  rw  boundary_x_minboundary_x_maxboundary_y_minboundary_y_max
line_widthline_heightx_within_boundaryy_within_boundarys               rb   r\  r\    sp   $ "+.. 7Mh7W7W X;XXhXXX	
 	
 	
 u_Q'*N_Q'*N_Q'*N_Q'*N.0J >1K 
	A	q	!N6JZ6W$X	X 	9"1%:NQ[:[(\\	9"1%7  	1a >5G+5U#VV[a #n8J[8X&YY  2!22rd   documentr   r  r  r  source_formatr   r	  Optional[list[list]]c                ~   g }t          | j                  }t          | j        |	          D ]\  }}g }t          |          }|                    d          }|                    d          }|                    d          }g }|
r|
d         r|
||	z
           nd}|j        D ]|r)|r't          j        d          rt          ||          }nd}t          |||r|nd	
          }t          |t                    rV|D ]}|r||j        _        ||j        _        |                    |           |                    fd|D                        |rt!          |j                  ng |j        _        |r||j        _        t%          dd          |j        _        t%          dd          |j        _        t          |t*                    r6|j        j        *t/          d |j        D                       rd|j        _        |                    |           |                    |f           |j        j        r|j        j        j        nd}t          d          rj        nd}t9          |f|||||j        j        |||d| |D ]H\  }t          d          r3j        ,t=          fd|D                       }|j        |j        _         I|}|r|tB          k    rtE          ||          }|r,|||	z   k     r#|                    tG          d                     |                    |           |S )zDConverts a DocumentLayout object to a list of unstructured elements.r   r  r   r   r   Nr   r   html)r   r  r  c                    g | ]}|fS r   r   )r   r   layout_elements     rb   
<listcomp>z,document_to_element_list.<locals>.<listcomp>w  s    +S+S+SR^R,@+S+S+Srd   text_as_htmltable_as_cellsc              3  <   K   | ]}t          |d d          dv V  dS )typer   )HeadlineSubheadlineN)getattr)r   r   s     rb   r   z+document_to_element_list.<locals>.<genexpr>  sM       d dOQGB++/JJd d d d d drd   
image_path)r   r>  r   r   category_depthr  r   ro   parentc              3  4   K   | ]\  }}|j         u |V  d S r   )r  )r   l_elr   r  s      rb   r   z+document_to_element_list.<locals>.<genexpr>  s4      ]]HD"t~G\?\?\R?\?\?\?\]]rd   r   r2  )$r   r  r   r,   ra   r   r   r   r   r-   r   listr   r   r   r!  r?   r   r  r  r  r"   r  r   r   r   r   r  r*   r  rS   id	parent_idrL   rQ   r    )r  r  rj   r  r  r  r   r0  ro   ry   r	  r}   r   	num_pagesr   r   r   page_image_metadataimage_formatimage_widthimage_heighttranslation_mappingr   r   r   r   r   el_image_pathelement_parentr3  r  s                                 @rb   r  r  C  s3    !HHN##I&x~=QRRR W. W.T')5d;;*..x88)--g66*..x88EG !.q!1M+(<<== 	 #m 9	 9	N )| )8K]0[0[ )$.[$V$V$V!!$(!."3!1/<Hmm&	  G '4(( F! : :B- K4J1.9BK++$$W---#**+S+S+S+S7+S+S+STTT INU(0CDDDSU  & * L5KG$207X\0]0] -29.JZ\`2a2a /w.. 873C3R3Z`c d dUYUbd d d a a3Z 78G$3$$W---#**NG+DEEE7>7G7S] ,33Y] 
 .5^\-R-R\))X\  !'%'"3&/>(!1#      (; 	? 	?#NG~x00 ?^5J5V!&]]]](;]]]" " .<-> *, 	P	^33#5mY#O#O  	<;=Q1Q#Q#Q ''	r(:(:(:;;;,----Ord   )rX   rY   )(rf   rg   rh   ri   rj   rk   rl   rY   rm   rk   rn   rg   ro   rp   rq   rg   rr   rg   rs   rg   rt   rg   ru   rk   rv   rp   rw   rg   rx   rk   ry   rz   r{   rk   r|   rk   r}   r   rX   r~   )$rf   rY   rh   r   r   rk   rj   rk   rl   rY   rm   rk   ro   rp   rr   rg   rt   rg   ru   rk   rv   rp   rw   rg   rx   rk   ry   rz   r{   rk   r|   rk   r}   r   rX   r~   )r   NNNre   )rf   rY   rh   r   ro   rp   rr   rg   ry   rz   r}   r   rX   r   )re   )rf   rY   rh   ri   ro   r   rr   rg   ry   rz   r}   r   rX   r   )r   r   rf   rY   ro   r   rr   rg   r   r   ry   rz   rX   r   )r   N)rf   rY   rh   r   rX   rz   )r   NN)rf   rY   rh   r   r   rz   rX   r   )2rf   rY   rh   r   r   rk   rm   rk   rj   rk   ro   rp   rn   rg   r   rY   r   rg   rt   rg   r   r   rr   rg   r   rk   ru   rk   rv   rp   rw   rg   rx   rk   r   rk   r   rg   ry   rz   r{   rk   r|   rk   r   r   r}   r   rX   r~   )r   r   rj   rk   r0  rY   )r   NFNNFNre   )rf   rY   rh   r   rj   rk   ro   rp   rn   rg   r   rk   rr   rg   ry   rz   r}   r   )r5  r:  ro   rp   rn   rg   r   rz   rj   rk   rr   rg   r0  rY   r}   r   rX   r~   )r   r~   )rL  r   rX   rY   )r   r~   r   rW  rX   r~   )r   rc  r   rd  rX   re  )rZ  r   r[  r   r   rW  rX   r   )rs  rt  )
r   r   rY  r   ru  rv  rw  rv  rX   rk   )r  r   r  rk   rj   rk   r  rg   r  rk   r  rg   r   rg   r0  rY   ro   rp   ry   rz   r	  r  r}   r   rX   r~   )
__future__r   rj  rq  r   r_   r   r   pathlibr   typingr   r   r   r   r	   numpynpwraptr   r
   pdfminer.layoutr   r   r   r   pdfminer.utilsr   pi_heifr   PILr   r6  pypdfr   r  r   .unstructured_inference.inference.layoutelementr   unstructured.chunkingr   unstructured.cleaners.corer   r   "unstructured.documents.coordinatesr   r   unstructured.documents.elementsr   r   r   r   r   r   r    r!   r"   r#   unstructured.errorsr$    unstructured.file_utils.filetyper%   unstructured.file_utils.modelr&   unstructured.loggerr'   r(   unstructured.nlp.patternsr)   $unstructured.partition.common.commonr*   r+   r,   r-   r.   r/   "unstructured.partition.common.langr0   r1   r2   &unstructured.partition.common.metadatar3   5unstructured.partition.pdf_image.analysis.layout_dumpr4   r5   r6   r7   /unstructured.partition.pdf_image.analysis.toolsr8   0unstructured.partition.pdf_image.form_extractionr9   0unstructured.partition.pdf_image.pdf_image_utilsr:   r;   r<   r  r=   r>   r?   r@   rA   rB   rC   /unstructured.partition.pdf_image.pdfminer_utilsrD   rE   !unstructured.partition.strategiesrF   rG   unstructured.partition.textrH   #unstructured.partition.utils.configrI   &unstructured.partition.utils.constantsrJ   rK   rL   rM   rN   rO   $unstructured.partition.utils.sortingrP   rQ   unstructured.patches.pdfminerrR   unstructured.utilsrS   rT   PSBaseParser_parse_keywordcompileDOTALLr   rc   PDFAUTOr   r   r   r   PDF_ANNOTATION_THRESHOLDr   r   r   	FULL_PAGEvaluer   r   r   r8  r   r   patch_function_wrapperrV  r   r   r]  r\  r  r   rd   rb   <module>r     s	   " " " " " "      				 				 				        9 9 9 9 9 9 9 9 9 9 9 9 9 9            C C C C C C C C C C C C ( ( ( ( ( ( ( ( ( ( ( ( ! ! ! ! ! !       B B B B B B H H H H H H 7 7 7 7 7 7        F E E E E E E E                          7 6 6 6 6 6 G G G G G G 2 2 2 2 2 2 4 4 4 4 4 4 4 4 7 7 7 7 7 7                        
 J I I I I I            U T T T T T P P P P P P         
                         a ` ` ` ` ` ` ` 9 9 9 9 9 9 : : : : : :                \ [ [ [ [ [ [ [ 7 7 7 7 7 7 ; ; ; ; ; ; ; ; 	 (5  $#-2:fBI#N#N#N   /00K K K 10K HL))" $ %%*"'#'%)'+,0'+'+"'5948+0 !(,%d d d d  *) dP (, %%*"'%),0'+"'5948+0 !(,!{ { { { {~ (,%),0 !    2 !"* * * * *Z z"" -7,O !P P P P #"Ph (,     (, $     /00(,"' %%)#'%+ $'+#',0!&"'5948+048 !(,*./L L L L 10Lb !&%    2 (, %%)#',0 !. . . . .f &*#' %,0%* * * * *Z   "   . 24WXX  YX   8   (# # # #> #& #	)3 )3 )3 )3 )3\  %,0!#'&*%%) !*.k k k k k k krd   