
    Ngm              
       >   d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	m
Z
mZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
 Zdedee         fdZdedee         fdZdeeeeef         dedeeeeef         fdZ eddg          defd            ZdS )    N)BinaryIOListTuple)PDFPageAggregator)LAParamsLTContainerLTImageLTItem
LTTextLine)PDFPageInterpreterPDFResourceManager)PDFPage)PSSyntaxError)logger)requires_dependenciesc                      t                      } t                      }t          | |          }t          | |          }||fS )N)laparams)r   r   r   r   )rsrcmgrr   deviceinterpreters       k/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/pdf_image/pdfminer_utils.pyinit_pdfminerr      sB     ""GzzHw:::F$Wf55K;    parent_objectreturnc                     g }t          | t                    r|                    |            n<t          | t                    r'| D ]$}|                    t          |                     %|S )zPRecursively extracts image objects from a given parent object in a PDF document.)
isinstancer	   appendr   extendextract_image_objectsr   objectschilds      r   r    r       sr    G-)) 9}%%%%	M;	/	/ 9" 	9 	9ENN0778888Nr   c                     g }t          | t                    r|                    |            n<t          | t                    r'| D ]$}|                    t          |                     %|S )zORecursively extracts text objects from a given parent object in a PDF document.)r   r   r   r   r   extract_text_objectsr!   s      r   r%   r%   %   sr    G-,, 8}%%%%	M;	/	/ 8" 	8 	8ENN/667777Nr   rectheightc                 0    | \  }}}}||z
  }||z
  }||||fS )ab  
    Converts a PDF rectangle coordinates (x1, y1, x2, y2) to a bounding box in the specified
    coordinate system where the vertical axis is measured from the top of the page.

    Args:
        rect (Tuple[float, float, float, float]): A tuple representing a PDF rectangle
            coordinates (x1, y1, x2, y2).
        height (float): The height of the page in the specified coordinate system.

    Returns:
        Tuple[float, float, float, float]: A tuple representing the bounding box coordinates
        (x1, y1, x2, y2) with the y-coordinates adjusted to be measured from the top of the page.
     )r&   r'   x1y2x2y1s         r   rect_to_bboxr.   2   s2    " NBB	"B	"BBr   pikepdfpypdffpc              #     K   ddl }ddlm} t                      \  }}t	          j                    5 }t          j                            |d          }	 t          j
        |           }t          |          D ]"\  }}		 |                    |	           |                                }
n# t          $ r t          j        d           t          j        d|dz    d            || |	          }|j                            |          5 }|                    |           ddd           n# 1 swxY w Y   t)          t          j
        t%          |d
                              }	|                    |	           |                                }
Y nw xY w|	|
fV  $n# t          $ r t          j        d           t          j        d           |j                            |           5 }|                    |           ddd           n# 1 swxY w Y   t          j
        t%          |d
                    }|D ]1}	|                    |	           |                                }
|	|
fV  2Y nw xY wddd           dS # 1 swxY w Y   dS )zTOpen PDF pages using PDFMiner, handling and repairing invalid dictionary constructs.r   N)get_page_datatmp_filez2Detected invalid dictionary construct for PDFminerzRepairing the PDF page    z ...)page_numberrbzRepairing the PDF document ...)r/   ,unstructured.partition.pdf_image.pypdf_utilsr3   r   tempfileTemporaryDirectoryospathjoinr   	get_pages	enumerateprocess_page
get_resultr   r   infoPdfopensavenext)r1   r/   r3   r   r   tmp_dir_pathtmp_file_pathpagesipagepage_layouterror_page_datapdfs                r   open_pdfminer_pages_generatorrO   I   sw      NNNJJJJJJ'//FK		$	&	&  (,\:>>	(%b))E$U++ ( (46,,T222"("3"3"5"5KK$ 
6 
6 
6K TUUUK C!A# C C CDDD&3mBA&F&F&FO ))/:: 0c///0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1$}d2K2K L LMMD,,T222"("3"3"5"5KKK
6 K'''''!("  
	( 
	( 
	(KLMMMK8999!!"%% ('''( ( ( ( ( ( ( ( ( ( ( ( ( ( (%d=$&?&?@@E ( (((...$//11K'''''( (
	(- (  (  (  (  (  (  (  (  (  (  (  (  (  (  (  (  (  (s   !I=)F=)B'&F'A FD)	F)D--F0D-1AFFFFI=AI-'H	=I-	HI-HAI-*I=,I--I==JJ)r;   r9   typingr   r   r   pdfminer.converterr   pdfminer.layoutr   r   r	   r
   r   pdfminer.pdfinterpr   r   pdfminer.pdfpager   pdfminer.pdfparserr   unstructured.loggerr   unstructured.utilsr   r   r    r%   floatr.   rO   r)   r   r   <module>rY      s   				  ( ( ( ( ( ( ( ( ( ( 0 0 0 0 0 0 N N N N N N N N N N N N N N E E E E E E E E $ $ $ $ $ $ , , , , , , & & & & & & 4 4 4 4 4 4  
 
DM 
 
 
 

 
4
3C 
 
 
 

ueU*
+ 5%%&   . 	7+,,*(*( *( *( -,*( *( *(r   