
    Ng 1                         d Z ddlZddlZddlZddlmZ ddlmZ	 ddl
mZ ddlmZ ej        Z ej        e          Z G d de	j                  Z G d de	j                  ZdS )	)PdfTextPagePdfTextSearcher    N)PdfiumError)PDFIUM_INFOc                   z     e Zd ZdZ fdZed             ZddZdd	ZddZ	d Z
ddZd ZddZd ZddZ xZS )r   z
    Text page helper class.
    
    Attributes:
        raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
        page (PdfPage): Reference to the page this textpage belongs to.
    c                 z    || _         || _        t                                          t          j                   d S N)rawpagesuper__init__pdfium_cFPDFText_ClosePage)selfr
   r   	__class__s      W/var/www/html/ai-engine/env/lib/python3.11/site-packages/pypdfium2/_helpers/textpage.pyr   zPdfTextPage.__init__   s2    	455555    c                     | j         S r	   )r   r   s    r   parentzPdfTextPage.parent!   s
    yr   r   c                    ||k    rdS t          j        | |          }|dk    r|                     |dz   ||dz   |          S t          j        | |          }|dk    r|                     ||dz
  ||dz             S ||||fS )Nr      )r   "FPDFText_GetTextIndexFromCharIndex_get_active_text_range)r   c_startc_end	l_passive	r_passivet_startt_ends          r   r   z"PdfTextPage._get_active_text_range&   s    U??1=dGLLb==..wqy%1iXXX;D%HHB;;..waIVWKXXXy)33r   r   ignoreFc                    ||fdk    r,|s*t          j        d           |                     |          S |dk    r|                                 |z
  }|                     |||z   dz
            }|dk    rdS |\  }}}}	||z  }|||	z   z  }|dz   |z
  }
dt
          j        cxk     rd	k     rn n|
d
z  }
|
dz  }
t          j        |
d
z            }t          j	        |t          j
        t          j                            }t          j        | |||          }|
|k    sJ d|
 d|             |j        d|dz
  d
z                               d|          S )a  
        Warning:
            .. versionchanged:: 4.28
               For various reasons, calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).
        
        Extract text from a given range.
        
        Parameters:
            index (int): Index of the first char to include.
            count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
            errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text in the range in question, or an empty string if no text was found.
        
        Note:
            * The returned text's length does not have to match *count*, even if it will for most PDFs.
              This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
              This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
              Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
            * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
        r   r   z]get_text_range() call with default params will be implicitly redirected to get_text_bounded()errorsr   r   r    i  i     zBuffer too small: z vs N	utf-16-le)warningswarnget_text_boundedcount_charsr   r   buildctypescreate_string_buffercastPOINTERc_ushortr   FPDFText_GetTextr
   decode)r   indexcountr&   
force_thisactive_ranger    r!   r   r   in_countbuffer
buffer_ptr	out_counts                 r   get_text_rangezPdfTextPage.get_text_range6   s   2 5>W$$Z$Myzzz(((777B;;$$&&.E 225%+a-HH12 0<,	9Y&&7W$
 +#****d*****MHA,X\::[)H)HII
-dE5*MM	9$$$&T8&T&T&T&T$$$z*IaK?*+22;v2NNNr   Nc                    | j                                         }||d         }||d         }||d         }||d         }| ||||f}t          j        g |ddR  }|dk    rdS t	          j        |dz            }	t	          j        |	t	          j        t          j                            }
t          j        g ||
|R   |	j	        
                    d|          S )	a  
        Extract text from given boundaries in PDF coordinates.
        If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
        
        Parameters:
            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text on the page area in question, or an empty string if no text was found.
        Nr   r   r(      r'   r)   r%   )r   get_bboxr   FPDFText_GetBoundedTextr/   r0   r1   r2   r3   r
   r5   )r   leftbottomrighttopr&   bboxargsn_charsr;   r<   s              r   r,   zPdfTextPage.get_text_boundedq   s     y!!##<7D>!WF=GE;q'CdC/2BDB$BBBBa<<2,Wq[99[)H)HII
(D$D
DGDDDDz  V <<<r   c                 X    t          j        |           }|dk    rt          d          |S )zV
        Returns:
            int: The number of characters on the text page.
        r   zFailed to get character count.)r   FPDFText_CountCharsr   )r   rI   s     r   r-   zPdfTextPage.count_chars   s1    
 .t44b==>???r   c                 \    t          j        | ||          }|dk    rt          d          |S )a  
        Parameters:
            index (int): Start character index.
            count (int): Character count to consider (defaults to -1 for all remaining).
        Returns:
            int: The number of text rectangles in the given character range.
        r   zFailed to count rectangles.)r   FPDFText_CountRectsr   )r   r6   r7   n_rectss       r   count_rectszPdfTextPage.count_rects   s5     .tUEBBb==;<<<r   c                 F    t          j        | ||||          }|dk     rdS |S )a  
        Get the index of a character by position.
        
        Parameters:
            x (float): Horizontal position (in PDF canvas units).
            y (float): Vertical position.
            x_tol (float): Horizontal tolerance.
            y_tol (float): Vertical tolerance.
        Returns:
            int | None: The index of the character at or nearby the point (x, y).
            May be None if there is no character or an error occurred.
        r   N)r   FPDFText_GetCharIndexAtPos)r   xyx_toly_tolr6   s         r   	get_indexzPdfTextPage.get_index   s/     3D!QuMM1994r   c                    |rIt          j                    }t          j        | ||          }|j        |j        |j        |j        f\  }}}}nst                      t                      t                      t                      f\  }}}}t          j        | |||||          }|j	        |j	        |j	        |j	        f\  }}}}|st          d          ||||fS )a  
        Get the bounding box of a single character.
        
        Parameters:
            index (int):
                Index of the character to work with, in the page's character array.
            loose (bool):
                Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zFailed to get charbox.)r   FS_RECTFFPDFText_GetLooseCharBoxrC   rD   rE   rF   c_doubleFPDFText_GetCharBoxvaluer   )	r   r6   looserectoklbrts	            r   get_charboxzPdfTextPage.get_charbox   s      	<$&&D24EEBDKTXEJAq!QQ!XZZXZZGJAq!Q-dE1aAFFB!'17AG;JAq!Q 	86777!Qzr   c                     t                      t                      t                      t                      f\  }}}}t          j        | |||||          }|st          d          |j        |j        |j        |j        fS )al  
        Get the bounding box of a text rectangle at the given index.
        Note that :meth:`.count_rects` must be called once with default parameters
        before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
        
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zzFailed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.))rZ   r   FPDFText_GetRectr   r\   )r   r6   r`   ra   rb   rc   r_   s          r   get_rectzPdfTextPage.get_rect   s     ZZXZZC
1a&tUAq!Q?? 	\  [  \  \  \!'1733r   c                    t          |          dk    rt          d          d}|r|t          j        z  }|r|t          j        z  }|r|t          j        z  }|dz                       d          }t          j        |t          j	        t          j
                            }t          j        | |||          }	t          |	|           }
|                     |
           |
S )au  
        Locate text on the page.
        
        Parameters:
            text (str):
                The string to search for.
            index (int):
                Character index at which to start searching.
            match_case (bool):
                If True, the search will be case-specific (upper and lower letters treated as different characters).
            match_whole_word (bool):
                If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
            consecutive (bool):
                If False (the default), :meth:`.search` will skip past the current match to look for the next match.
                If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
        Returns:
            PdfTextSearcher: A helper object to search text.
        r   z#Text length must be greater than 0. r)   )len
ValueErrorr   FPDF_MATCHCASEFPDF_MATCHWHOLEWORDFPDF_CONSECUTIVEencoder/   r1   r2   r3   FPDFText_FindStartr   _add_kid)r   textr6   
match_casematch_whole_wordconsecutiveflagsenc_textenc_text_ptrraw_searchersearchers              r   searchzPdfTextPage.search   s    ( t99>>BCCC 	-X,,E 	2X11E 	/X..E6M))+66{8V^FO-L-LMM24ueTT"<66hr   )r   r   )r   r   r"   F)NNNNr"   r$   )F)r   FFF)__name__
__module____qualname____doc__r   propertyr   r   r>   r,   r-   rO   rV   rd   rg   r{   __classcell__r   s   @r   r   r      s        6 6 6 6 6
   X4 4 4 4 8O 8O 8O 8Ov= = = =@       &   84 4 4 $ $ $ $ $ $ $ $r   r   c                   J     e Zd ZdZ fdZed             Zd Zd Zd Z	 xZ
S )r   z
    Text searcher helper class.
    
    Attributes:
        raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
        textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
    c                 z    || _         || _        t                                          t          j                   d S r	   )r
   textpager   r   r   FPDFText_FindClose)r   r
   r   r   s      r   r   zPdfTextSearcher.__init__  s2     455555r   c                     | j         S r	   )r   r   s    r   r   zPdfTextSearcher.parent  s
    }r   c                 x     ||           }|sd S t          j        |           }t          j        |           }||fS r	   )r   FPDFText_GetSchResultIndexFPDFText_GetSchCount)r   	find_funcr_   r6   r7   s        r   _get_occurrencezPdfTextSearcher._get_occurrence#  sF    Yt__ 	43D99-d33e|r   c                 @    |                      t          j                  S )z
        Returns:
            (int, int): Start character index and count of the next occurrence,
            or None if the last occurrence was passed.
        )r   r   FPDFText_FindNextr   s    r   get_nextzPdfTextSearcher.get_next+       ##H$>???r   c                 @    |                      t          j                  S )z
        Returns:
            (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
            or None if the last occurrence was passed.
        )r   r   FPDFText_FindPrevr   s    r   get_prevzPdfTextSearcher.get_prev3  r   r   )r|   r}   r~   r   r   r   r   r   r   r   r   r   s   @r   r   r     s         6 6 6 6 6
   X  @ @ @@ @ @ @ @ @ @r   r   )__all__r/   loggingr*   pypdfium2.rawr
   r   pypdfium2.internalinternalpdfium_ipypdfium2._helpers.miscr   pypdfium2.versionr   rZ   	getLoggerr|   loggerAutoCloseabler   r    r   r   <module>r      s    -                % % % % % % / / / / / / ) ) ) ) ) )?		8	$	$z z z z z8) z z zz)@ )@ )@ )@ )@x- )@ )@ )@ )@ )@r   