
    Ng#                     &   d dl mZmZmZmZmZ d dlZd dlZddl	m
Z
 ddlmZ dZ	 	 	 	 	 	 	 ddej        j        de
fdZd	ddd	d
d
d
def	dededededededededeee                  dedeee
         eee
         ed         f         f         fdZdS )    )ListUnionOptionalDictTupleN   )Layout   )load_dataframeH         ?FTpagereturnc                 T   |ddg}|                      |||||||          }t          j        |          }	t          |	          dk    rt	                      S |	ddg                             dt          | j                                                d	          |	ddg<   |	d
dg                             dt          | j	                                                d	          |	d
dg<   t          |	                                                    ddddddd          d          }
|
S )zThe helper function used for extracting words from a pdfplumber page
    object. 

    Returns:
        Layout: a layout object representing all extracted pdf tokens on this page. 
    Nfontnamesizex_tolerancey_tolerancekeep_blank_charsuse_text_flowhorizontal_ltrvertical_ttbextra_attrsr   x0x1)lowerupperfloattopbottomx_1x_2y_1y_2idtype)r   r   r    r!   indexr   )columns	rectangle)
block_type)extract_wordspd	DataFramelenr	   clipintwidthastypeheightr   reset_indexrename)r   r   r   r   r   r   r   r   tokensdfpage_tokenss              O/var/www/html/ai-engine/env/lib/python3.11/site-packages/layoutparser/io/pdf.pyextract_words_for_pager;      s[     !6*)#%!    F 
f		B
2ww!||xx 	D$<AS__==DDWMM d| 	E8""#dk2B2B"CCJJ7SS x !
"  	  		
 		
   K     filenameload_imagesr   r   r   r   r   r   r   dpizImage.Imagec
                    t          j        |           }
g }t          t          |
j                            D ]}}|
j        |         }t          ||||||||          }t          |j                  |j        d<   t          |j	                  |j        d<   ||j        d<   |
                    |           ~|s|S ddl}|                    | |	          }t          |          D ]w\  }}|j        \  }}||         }|j        d         }|j        d         }||k    s||k    r:||z  }||z  }|                    ||f          }||j        d<   ||j        d<   |||<   x||fS )a  Load all tokens for each page from a PDF file, and save them
    in a list of Layout objects with the original page order.

    Args:
        filename (str): The path to the PDF file.
        load_images (bool, optional):
            Whether load screenshot for each page of the PDF file.
            When set to true, the function will return both the layout and
            screenshot image for each page.
            Defaults to False.
        x_tolerance (int, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the x_2 of one character and the x_1 of the next is less than
            or equal to x_tolerance. See details in `pdf2plumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 1.5.
        y_tolerance (int, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the y_2 of one character and the y_1 of the next is less than
            or equal to y_tolerance. See details in `pdf2plumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 2.
        keep_blank_chars (bool, optional):
            When keep_blank_chars is set to True, it will treat blank characters
            are treated as part of a word, not as a space between words. See
            details in `pdf2plumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to False.
        use_text_flow (bool, optional):
            When use_text_flow is set to True, it will use the PDF's underlying
            flow of characters as a guide for ordering and segmenting the words,
            rather than presorting the characters by x/y position. (This mimics
            how dragging a cursor highlights text in a PDF; as with that, the
            order does not always appear to be logical.) See details in
            `pdf2plumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to True.
        horizontal_ltr (bool, optional):
            When horizontal_ltr is set to True, it means the doc should read
            text from left to right, vice versa.
            Defaults to True.
        vertical_ttb (bool, optional):
            When vertical_ttb is set to True, it means the doc should read
            text from top to bottom, vice versa.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional):
            Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
            restrict each words to characters that share exactly the same
            value for each of those `attributes extracted by pdfplumber
            <https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
            and the resulting word dicts will indicate those attributes.
            See details in `pdf2plumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to `["fontname", "size"]`.
        dpi (int, optional):
            When loading images of the pdf, you can also specify the resolution
            (or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
            for rendering the images. Higher DPI values mean clearer images (also
            larger file sizes).
            Setting dpi will also automatically resizes the extracted pdf_layout
            to match the sizes of the images. Therefore, when visualizing the
            pdf_layouts, it can be rendered appropriately.
            Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
            from the pdfplumber PDF parser.

    Returns:
        List[Layout]:
            When `load_images=False`, it will only load the pdf_tokens from
            the PDF file. Each element of the list denotes all the tokens appeared
            on a single page, and the list is ordered the same as the original PDF
            page order.
        Tuple[List[Layout], List["Image.Image"]]:
            When `load_images=True`, besides the `all_page_layout`, it will also
            return a list of page images.

    Examples::
        >>> import layoutparser as lp
        >>> pdf_layout = lp.load_pdf("path/to/pdf")
        >>> pdf_layout[0] # the layout for page 0
        >>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
        >>> lp.draw_box(pdf_images[0], pdf_layout[0])
    r   r2   r4   r(   r   N)r?   )
pdfplumberopenranger/   pagesr;   r   r2   	page_datar4   append	pdf2imageconvert_from_path	enumerater   scale)r=   r>   r   r   r   r   r   r   r   r?   plumber_pdf_objectall_page_layoutpage_idcur_pager9   rG   
pdf_images
page_imageimage_widthimage_heightpage_layoutlayout_widthlayout_heightscale_xscale_ys                            r:   load_pdfrX   T   s   B $22O/56677 , ,%+G4,##-')%#	
 	
 	
 */x~)>)>g&*/*@*@h')0g&{++++ +00s0CC
#,Z#8#8 	7 	7GZ(2%K)'2K&09L'1(;Ml**lm.K.K%4&6)//'0BCC1<%g.2>%h/+6(
**r<   )r   r   FTTTN)typingr   r   r   r   r   rA   pandasr-   elementsr	   basicr   DEFAULT_PDF_DPIr   Pager;   strboolr1   rX    r<   r:   <module>rb      s   6 5 5 5 5 5 5 5 5 5 5 5 5 5               ! ! ! ! ! !
 7 7
/
7 7 7 7 7x "'+M+ M+M+M+ M+ 	M+
 M+ M+ M+ M+ $s)$M+ 
M+ 4<tF|T--@@AABM+ M+ M+ M+ M+ M+r<   