
    NgM                        d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZm Z  d	dl!m"Z"m#Z#  ej$        e%          Z&e
rd	dl'm(Z( d	dl)m*Z* edge+f         Z,ded         dee-ee-         e,f         ded         fdZ. G d d          Z/e G d de/                      Z0 G d de1          Z2 G d de/          Z3dS )    N)deque)asdict	dataclassfield)TYPE_CHECKINGAnyCallableDictIterableIteratorListOptionalPatternTupleUnion)
NumberTree)	PDFParser)	PDFObjRefresolve1)	PSLiteral   )T_bboxT_obj)decode_textgeometry)Page)PDFPDFStructElementelementsmatcherreturnc              #   t  K   dddt           ffd}dddt           ffd}t          t                    r|}nt          t          j                  r|}n}t          |           }|rN|                                } ||          r|V  |                    t          |j	                             |LdS dS )z=
    Common code for `find_all()` in trees and elements.
    xr   r!   c                     | j         k    S )zMatch an element name.)typer#   r    s    P/var/www/html/ai-engine/env/lib/python3.11/site-packages/pdfplumber/structure.py	match_tagz_find_all.<locals>.match_tag/   s    v      c                 8                         | j                  S )z,Match an element name by regular expression.)matchr%   r&   s    r'   match_regexz_find_all.<locals>.match_regex3   s    }}QV$$$r)   N)
bool
isinstancestrrer   r   popleft
extendleftreversedchildren)r   r    r(   r,   
match_funcdels    `     r'   	_find_allr8   '   s     !' !D ! ! ! ! ! !%) %d % % % % % % '3 

	GRZ	(	(  


hA
 ,YY[[:b>> 	HHH	Xbk**+++	  , , , , ,r)   c                       e Zd ZU dZed         ed<   deeee         e	f         de
d         fdZdeeee         e	f         ded         fdZdS )	FindablezRfind() and find_all() methods that can be inherited to avoid
    repeating oneselfr   r4   r    r!   c                 ,    t          | j        |          S )zIterate depth-first over matching elements in subtree.

        The `matcher` argument is either an element name, a regular
        expression, or a function taking a `PDFStructElement` and
        returning `True` if the element matches.
        )r8   r4   selfr    s     r'   find_allzFindable.find_allK   s     000r)   c                 j    	 t          t          | j        |                    S # t          $ r Y dS w xY w)zFind the first matching element in subtree.

        The `matcher` argument is either an element name, a regular
        expression, or a function taking a `PDFStructElement` and
        returning `True` if the element matches.
        N)nextr8   r4   StopIterationr<   s     r'   findzFindable.findV   sC    		$-99::: 	 	 	44	s   !$ 
22N)__name__
__module____qualname____doc__r   __annotations__r   r/   r   	MatchFuncr   r>   r   rB    r)   r'   r:   r:   E   s           %&&&&	1S'#,	9:	1	$	%	1 	1 	1 	1S'#,	9:	$	%     r)   r:   c                      e Zd ZU eed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<   ee         ed<    ee	          Z	e
eef         ed
<    ee	          Zee         ed<    ee	          Zed          ed<   ded          fdZdeeee         ef                  fdZde
eef         fdZdS )r   r%   revisionidlangalt_textactual_texttitlepage_number)default_factory
attributesmcidsr4   r!   c                 *    t          | j                  S Niterr4   r=   s    r'   __iter__zPDFStructElement.__iter__s       DM"""r)   c              #     K   | j         D ]}| j        |fV  t          | j                  }|rT|                                }|j         D ]}|j        |fV  |                    t          |j                             |RdS dS )zCollect all MCIDs (with their page numbers, if there are
        multiple pages in the tree) inside a structure element.
        N)rT   rQ   r   r4   r1   r2   r3   )r=   mcidr6   r7   s       r'   	all_mcidszPDFStructElement.all_mcidsv   s      
 J 	) 	)D"D((((($-   	0B + +nd*****LL"+..///	  	0 	0 	0 	0 	0r)   c                 @   t          |           }t          |g          }|r||                                }t          |                                          D ]%}||         ||         g k    s||         i k    r||= &d|v r|                    |d                    |||S )z'Return a compacted dict representation.Nr4   )r   r   r1   listkeysextend)r=   rr6   r7   ks        r'   to_dictzPDFStructElement.to_dict   s    4LL1#JJ 	)B"''))__  a5=BqERKK2a5B;;1RJ(((  	) r)   N)rC   rD   rE   r/   rG   r   intr   dictrS   r
   r   r`   rT   r   r4   r   rZ   r   r^   re   rI   r)   r'   r   r   e   sj        
IIIsm
3-sm#C=#!&t!<!<!<JS#X<<<uT222E49222).t)D)D)DHd%&DDD#(#56 # # # #08E(3-*<$=> 0 0 0 0c3h      r)   c                       e Zd ZdS )StructTreeMissingN)rC   rD   rE   rI   r)   r'   ri   ri      s        Dr)   ri   c                   L   e Zd ZU dZed         ed<   dddded         fdZdeee	f         d	ee
         d
eee	f         fdZde	d
eee         ee	         f         fdZdee	         d
dfdZdeee	f         d
efdZddZdeee	f         d
dfdZd
ee         fdZded
efdZdS )PDFStructTreeaz  Parse the structure tree of a PDF.

    The constructor takes a `pdfplumber.PDF` and optionally a
    `pdfplumber.Page`.  To avoid creating the entire tree for a large
    document it is recommended to provide a page.

    This class creates a representation of the portion of the
    structure tree that reaches marked content sections, either for a
    single page, or for the whole document.  Note that this is slightly
    different from the behaviour of other PDF libraries which will
    also include structure elements with no content.

    If the PDF has no structure, the constructor will raise
    `StructTreeMissing`.

    r   pageNdocr   c                    |j         | _         d| j         j        vrt          d          t          | j         j        d                   | _        t          | j                            di                     | _        t          | j                            di                     | _        g | _        ||| _	        |j
        |i| _        d | _        | j                            d          }||                                  d S t          |          }d| j	        j        j        vrd S | j	        j        j        d         t          t#          fd|j        D                                 }|                     |           d S d | _	        d |j        D             | _        d	 | j                                        D             | _        |                                  d S )
NStructTreeRootzPDF has no structureRoleMapClassMap
ParentTreeStructParentsc              3   .   K   | ]\  }}|k    |V  d S rV   rI   ).0numarray	parent_ids      r'   	<genexpr>z)PDFStructTree.__init__.<locals>.<genexpr>   s1      XX:3siGWGWGWGWGWGWXXr)   c                     i | ]
}|j         |S rI   )rQ   ru   rl   s     r'   
<dictcomp>z*PDFStructTree.__init__.<locals>.<dictcomp>   s    GGGT$*DGGGr)   c                 2    i | ]}|j         j        |j        S rI   )page_objpageidrQ   r{   s     r'   r|   z*PDFStructTree.__init__.<locals>.<dictcomp>   s/       ;?$d&6  r)   )rm   catalogri   r   rootgetrole_map	class_mapr4   rl   rQ   pages	page_dict_parse_struct_treer   r~   attrsr@   values_parse_parent_tree)r=   rm   rl   parent_tree_objparent_treeparent_arrayrx   s         @r'   __init__zPDFStructTree.__init__   s   748#333#$:;;;TX-.>?@@	 y"!=!=>>!$)--
B"?"?@@02 DI*D1DJ!DN #immL99O&'')))))(99 #$)*<*BBBF I.4_E	'XXXX1CXXXXX    ''55555DIGGSYGGGDJ CG:CTCTCVCV  DN ##%%%%%r)   objrK   r!   c                 &   g }dD ]\}||vrt          ||                   }t          |t                    r|                    |           G|                    |           ]g }d }|D ]]}t          |t
                    r ||k    r||                    |           d }7||                    |           t          |          }^||                    |           i }	|D ]}t          |t                    rFt          |j                  }|| j	        vrt                              d|           P| j	        |         }|                                D ]=\  }
}t          |t                    rt          |j                  |	|
<   2||
         |	|
<   >|	S )N)CAzUnknown attribute class %s)r   r.   r`   rb   appendrf   r   r   namer   loggerwarningitems)r=   r   rK   attr_obj_listkeyattr_obj	attr_objsprev_objarefattrrd   vs               r'   _make_attributeszPDFStructTree._make_attributes   s     	/ 	/C#~~C))H(D)) /$$X....$$X....	! 	* 	*D $$$ *8##(<$$X...'$$X...#D>>X&&&  	% 	%C#y)) *!#(++dn,,NN#?EEEnS)		 % %1a++ %)!&11DGG!!fDGG	%
 r)   c                    d|vsJ d|z              d|vsJ d|z              d }| j         4d|v r0|d         j        }|| j         v sJ d|z              | j         |         }d}d|v rBt          |d         j                  }|| j        v rt          | j        |         j                  }d	|v rt          |d	                   ng }t          |t                    r|g}nt          |t                    r	|d	         g}|	                    d
          }| 
                    ||          }d|v r"t          t          |d                             nd }d|v r"t          t          |d                             nd }	d|v r"t          t          |d                             nd }
d|v r"t          t          |d                             nd }d|v r"t          t          |d                             nd }t          |||||
|	|||	  	        }||fS )NMCIDzUncaught MCR: %sObjzUncaught OBJR: %sPgzObject on unparsed page: %s SKRIDTLangAlt
ActualText)	r%   rL   rQ   rK   rM   rP   rN   rO   rS   )r   objidr   r   r   r   r.   rf   rg   r   r   r   )r=   r   rQ   
page_objidobj_tagr4   rK   rS   
element_idrP   rM   rN   rO   elements                 r'   _make_elementzPDFStructTree._make_element
  sP   S   "4s":   C!4s!:>%$#++TJ///1NQT1T///.4K#::!#c(-00G$-''%dmG&<&ABB),8CH%%%h$$ 	" zHH$'' 	"CzH773<<**399
9=[#d)!4!4555$
36#::HSX..///45;s]]{8CK001118=;xE
33444$8D8K8KK\!233444QU 	 ###!

 

 

   r)   r   c                    t          |          }i }d}|r|                                }|t          j        k    r't	          |          |v r9t          |          }d|v r!t          |d         j                  dk    rd}nK|                     |          \  }}|J ||f|t	          |          <   |	                    |d                    ||sJ | 
                    |           dS )zYPopulate the structure tree using the leaves of the parent tree for
        a given page.FTypero   TNP)r   r1   r   KEYWORD_NULLreprr   r   r   r   r   _resolve_children)	r=   r   r6   s
found_rootrefr   r   r4   s	            r'   r   z PDFStructTree._parse_parent_tree4  s    ,
 	#))++C i,,,CyyA~~3--C}}S[-=!>!>BR!R!R!

 %)$6$6s$;$;!***&0$s))S"""'  	#* q!!!!!r)   c                     d|vrdS |d         j         }| j        	|| j        v S | j        || j        j        j        k    rdS dS )Nr   TF)r   r   rl   r~   r   )r=   r   r   s      r'   on_parsed_pagezPDFStructTree.on_parsed_pageS  sU    s??4Y_
>%//9 TY/666utr)   c                    	 t           j        d                   }t          |t                    r j        d         g}t	          |          }i 	|r%|                                }t          |          	v r)t          |          }t          |t                    r1d|v r-                     |          sg|d         }t          |          }                     |          \  }}||f	t          |          <   |D ]x}t          |          }t          |t                    r(                     |          s<d|v r	|d         }nd|v rNt          |t                    r|
                    |           y|%dt          t                   dt          t                   f	 fd |                                	           dS )zgPopulate the structure tree starting from the root, skipping
        unparsed pages and empty elements.r   r   r   r   r!   c                    g }| D ]}t          |          }t          |t                    r|                    |           <t          |t                    rB                    |          sgd|v r|                    |d                    d|v r|d         }t          |                   \  }} |          }||st          |          = ||ft          |          <   |                    |           |S )Nr   r   )r   r.   rf   r   rg   r   r   )	r   next_elementsr   r   r   r4   pruner   r=   s	         r'   r   z/PDFStructTree._parse_struct_tree.<locals>.prune  s   M . .smmc3'' 
)!((---T** )..s33 ! }}%,,S[999 #!%j$%d3iiL! 5???(?$s))#*H#4Ad3iiL!((----  r)   N)r   r   r.   rg   r   r1   r   r   r   r   r   r   r   r   )
r=   r   r6   r   r   r   r4   childr   r   s
   `       @@r'   r   z PDFStructTree._parse_struct_tree_  s    	#'' dD!! 	$IcN#D$KK 	$))++CCyyA~~3--C#t$$ $#**3// %jsmm $ 2 23 7 7GX #H,Ad3iiL! 
$ 
$uooc4(( !..s33 ! || #E
3 eY// $HHUOOO5  	$<	!DI 	!$s) 	! 	! 	! 	! 	! 	! 	! 	!2 	dq!!!!!r)   seenc                 R   t          | j        d                   }t          |t                    r| j        d         g}g | _        g }|D ]n}t          |          }t          |t                    r"d|v r|                     |          s@|d         }t          |          v r|                    |           ot          |          }|rI|	                                }t          |                   \  }}|
J d            |D ]
}	t          |	          }t          |t                    r|j                            |           n\t          |t                    rG|                     |          smd|v r!|j                            |d                    nd|v r|d         }	t          |	t                    rW                    t          |	          d          \  }
}|
/|j                            |
           |                    |	           |Ifd|D             | _        dS )z|Resolve children starting from the tree root based on references we
        saw when traversing the structure tree.
        r   r   NzUnparsed elementr   )NNc                 F    g | ]}t          |                   d          S )r   )r   )ru   r   r   s     r'   
<listcomp>z3PDFStructTree._resolve_children.<locals>.<listcomp>  s(    CCCd3ii+CCCr)   )r   r   r.   rg   r4   r   r   r   r   r1   rf   rT   r   r   )r=   r   r   parsed_rootr   r   r6   r   r4   r   child_element_s    `          r'   r   zPDFStructTree._resolve_children  s<    	#''dD!! 	$IcN#D 	( 	(C3--C#t$$ !#**3// %jCyyD  ""3'''+ 	())++C $T#YYGX&&(:&&&! ( (uooc3'' 	+M((----T** +..s33 ! }},,S[9999# #E
eY// ('+xxU\'J'J$M1$0(//>>>+  	(, DCCC{CCCr)   c                 *    t          | j                  S rV   rW   rY   s    r'   rZ   zPDFStructTree.__iter__  r[   r)   r7   c                    d}| j         | j         }n|j        | j        |j                 }|j                            dd          }||ddl m}m}m}  | ||          |j        d         |j        d         z
            }t          ||          rUt          j        |          }|                    |g          }|st          d          t          j        |d                   S |S g }	|                                D ]\  }
}|
<|7t           j                            |j                                                  }nDg }nAt           j                            | j        |
         j                                                  }|D ]#}|d         |k    r|	                    |           $|	st          d	          t          j        |	          S )
z9Get the bounding box for an element for visual debugging.NBBoxr   )CroppedPage_invert_box_normalize_box   zElement no longer on pager   r]   zNo objects found)rl   rQ   r   rS   r   r   r   r   mediaboxr.   r   bbox_to_rect_crop_fn
IndexErrorobj_to_bboxr^   	itertoolschainfrom_iterableobjectsr   r   objects_to_bbox)r=   r7   rl   bboxr   r   r   rectrects	mcid_objsrQ   r]   r   cs                 r'   element_bboxzPDFStructTree.element_bbox  s   9 9DD^':bn-D}  .. 0FFFFFFFFFF
 ;t$$dmA&6q9I&I D $,, ,T22tf-- B$%@AAA+E!H555 I%'\\^^ , ,!T&'"+/"?"?@S@S@U@U"V"V"$'o;;
;/7>>@@ G ! , ,AyD((!((+++,  5 !3444+I666r)   rV   )r!   N)rC   rD   rE   rF   r   rG   r   r
   r/   r   rf   r   r   r   r   r   r   r-   r   r   r   r   rZ   r   r   rI   r)   r'   rk   rk      s         " 6
,& ,&E ,&&)9 ,& ,& ,& ,&\/S>/-5c]/	c3h/ / / /b(! (!x8H/I4PS9/T)U (! (! (! (!T"tCy "T " " " ">
$sCx. 
T 
 
 
 
B" B" B" B"H*Dd38n *D *D *D *D *DX#(#34 # # # #-7/ -7F -7 -7 -7 -7 -7 -7r)   rk   )4r   loggingr0   collectionsr   dataclassesr   r   r   typingr   r   r	   r
   r   r   r   r   r   r   r   pdfminer.data_structuresr   pdfminer.pdfparserr   pdfminer.pdftypesr   r   pdfminer.psparserr   _typingr   r   utilsr   r   	getLoggerrC   r   rl   r   pdfr   r-   rH   r/   r8   r:   r   
ValueErrorri   rk   rI   r)   r'   <module>r      s        				       0 0 0 0 0 0 0 0 0 0                          0 / / / / / ( ( ( ( ( ( 1 1 1 1 1 1 1 1 ' ' ' ' ' ' " " " " " " " " ( ( ( ( ( ( ( (		8	$	$   ()4/0	,)*,3i/0,  !, , , ,<       @ * * * * *x * * *Z	 	 	 	 	
 	 	 	h7 h7 h7 h7 h7H h7 h7 h7 h7 h7r)   