
    Ng/m                        U d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZmZmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ  ej        e           Z!d
Z"d
Z#dZ$dZ%dZ&ee'd<   dZ(ee'd<   ddddddddZ)dede	egef         fdZ*dede	egeeef         f         fdZ+ ed           ed
           ed            ed          dZ, ed           ed           ed            ed!          dZ-d>deded#e.ddfd$Z/ G d% d&          Z0 G d' d(          Z1 G d) d*          Z2	 d?d,ed-e3d.edefd/Z4 e j5        e1j6                  j7        8                                Z9 e j5        e2          j7        8                                Z:d,ed.ede0fd0Z;	 	 d@d,ed1ee         d2ee         d.ede.f
d3Z<e"fd4ed5ede.fd6Z=e"e#fd,ed7ed8ede.fd9Z>	 	 dAd,ed5ed;eee.d<f                  defd=Z?dS )B    N)
itemgetter)
AnyCallableDict	GeneratorListMatchOptionalPatternTupleUnion   )T_bboxT_dirT_numT_obj
T_obj_iter
T_obj_list   )cluster_objects)to_list)objects_to_bbox   g      @   ttbDEFAULT_LINE_DIRltrDEFAULT_CHAR_DIRffffifflfiflst)u   ﬀu   ﬃu   ﬄu   ﬁu   ﬂu   ﬆu   ﬅline_dirreturnc                 $    d d d d d|          S )Nc                     | d         S )Ntop xs    Q/var/www/html/ai-engine/env/lib/python3.11/site-packages/pdfplumber/utils/text.py<lambda>z&get_line_cluster_key.<locals>.<lambda>/   s
    5     c                     | d          S )Nbottomr*   r+   s    r-   r.   z&get_line_cluster_key.<locals>.<lambda>0   s    !H+ r/   c                     | d         S Nx0r*   r+   s    r-   r.   z&get_line_cluster_key.<locals>.<lambda>1   s
    4 r/   c                     | d          S )Nx1r*   r+   s    r-   r.   z&get_line_cluster_key.<locals>.<lambda>2   s    !D' r/   r   bttr   rtlr*   )r%   s    r-   get_line_cluster_keyr:   -   s3    !!%%  !!	 
  r/   char_dirc                 $    d d d d d|          S )Nc                 "    | d         | d         fS )Nr)   r1   r*   r+   s    r-   r.   z#get_char_sort_key.<locals>.<lambda>8   s    !E(AhK0 r/   c                 8    | d         | d         z    | d          fS )Nr)   heightr*   r+   s    r-   r.   z#get_char_sort_key.<locals>.<lambda>9   s!    AeHq{23ahY? r/   c                 "    | d         | d         fS r3   r*   r+   s    r-   r.   z#get_char_sort_key.<locals>.<lambda>:   s    !D'1T7+ r/   c                 &    | d          | d          fS )Nr6   r4   r*   r+   s    r-   r.   z#get_char_sort_key.<locals>.<lambda>;   s    1T7(QtWH- r/   r7   r*   )r;   s    r-   get_char_sort_keyrB   6   s3    00??++--	 
  r/   r7   r)   r1   r4   r6    suffixc           
      8   t          t                                                    }| |vrt          d| d| d|            ||vrt          d| d| d|           t          |           t          |          k    rt          d| d|  d| d|           d S )Nr%   z must be one of z, not r;   =z is incompatible with char_dir)setPOSITION_KEYSkeys
ValueError)r%   r;   rD   
valid_dirss       r-   validate_directionsrL   N   s    ]''))**Jz!!KvKKzKKKK
 
 	
 z!!KvKKzKKKK
 
 	
 8}}H%%0v 0 0 0 0"0 0%-0 0
 
 	
 &%r/   c                   :   e Zd ZdZdeeeee         f                  de	de	ddfdZ
defdZ	 	 	 ddee         dedededeeef         f
dZ	 	 	 	 	 ddeeee         f         dedededededeeeef                  fdZ	 ddededeeeef                  fdZdS )TextMapz
    A TextMap maps each unicode character in the text to an individual `char`
    object (or, in the case of layout-implied whitespace, `None`).
    tuplesline_dir_renderchar_dir_renderr&   Nc                     t          ||d           || _        || _        || _        |                                 | _        d S )N_render)rL   rO   rP   rQ   	to_string	as_string)selfrO   rP   rQ   s       r-   __init__zTextMap.__init__e   sB     	O_iHHH..))r/   c                 N   | j         }| j        }d                    t          t	          d          | j                            }|dk    r|dk    r|S |                    d          |dv rt          t                              |dk    rd D             |d	v rst          t          t                              |d
k    rfdD             nfdD             d                    fdt                    D                       S d                              S )NrC   r   r   r   
r8   r9   r9   c                 R    g | ]$}d                      t          |                    %S rC   )joinreversed.0lines     r-   
<listcomp>z%TextMap.to_string.<locals>.<listcomp>   s*    CCCT$00CCCr/   )r9   r   r8   c                 @    g | ]}d t          |          z
  z  |z   S  lenr`   ra   max_line_lengths     r-   rb   z%TextMap.to_string.<locals>.<listcomp>   s;       IM#d)) ;<D  r/   c                 @    g | ]}|d t          |          z
  z  z   S rd   rf   rh   s     r-   rb   z%TextMap.to_string.<locals>.<listcomp>   s;       IMT'B CD  r/   c              3   \   K   | ]%d                      fdD                       V  &dS )rC   c              3   (   K   | ]}|         V  d S Nr*   )r`   ra   is     r-   	<genexpr>z.TextMap.to_string.<locals>.<genexpr>.<genexpr>   s'      66DG666666r/   Nr]   )r`   rn   liness    @r-   ro   z$TextMap.to_string.<locals>.<genexpr>   sV       ! !;<BGG666666666! ! ! ! ! !r/   )rQ   rP   r]   mapr   rO   splitlistr^   maxrg   range)rV   cdldbaserq   ri   s       @@r-   rT   zTextMap.to_stringq   sm   !!wws:a==$+6677;;2;;KJJt$$E^##Xe__--U{{CCUCCC^##"%c#uoo"6"6;;   QV  EE   QV  E yy ! ! ! !@Eo@V@V! ! !    yy'''r/   r   Tm
main_groupreturn_groupsreturn_charsc                 &   | j         |                    |          |                    |                   }d |D             }t          |          \  }}}	}
|                    |          |||	|
d}|r|                                |d<   |r||d<   |S )Nc                     g | ]	\  }}||
S rm   r*   )r`   textcs      r-   rb   z)TextMap.match_to_dict.<locals>.<listcomp>   s    <<<yaammmmr/   )r   r4   r)   r6   r1   groupschars)rO   startendr   groupr   )rV   rz   r{   r|   r}   subsetr   r4   r)   r6   r1   results               r-   match_to_dictzTextMap.match_to_dict   s     QWWZ001553D3DDE<<F<<<-e44CV GGJ''
 
  	* xxzzF8 	$#F7Or/   patternregexcasec                     t          |t                    r)|du rt          d          |du rt          d          |}n?|du rt          j        |          }|du rt          j        nd}t          j        ||          }t          j        | j                  }	t          fd|	          }
 fd|
D             S )NFzACannot pass a compiled search pattern *and* regex=False together.z@Cannot pass a compiled search pattern *and* case=False together.r   c                 l    t          |                                                                         S rm   )boolr   strip)rz   r{   s    r-   r.   z TextMap.search.<locals>.<lambda>   s'    D)<)<)B)B)D)D$E$E r/   c                 B    g | ]}                     |           S ))r|   r}   r{   )r   )r`   rz   r{   r}   r|   rV   s     r-   rb   z"TextMap.search.<locals>.<listcomp>   sL     
 
 
  +)%	   
 
 
r/   )

isinstancer   rJ   reescapeIcompilefinditerrU   filter)rV   r   r   r   r|   r}   r{   compiledflagsgenfiltereds   `   ```    r-   searchzTextMap.search   s    gw'' 	2~~ W   u}} V   HH~~)G,, EMMBDDqEz'511Hk(DN33 EEEEsKK
 
 
 
 
 
 
 
 
 
 	
r/   r   c                 B    |rd}nd}|                      |d|d          S )af  
        `strip` is analogous to Python's `str.strip()` method, and returns
        `text` attributes without their surrounding whitespace. Only
        relevant when the relevant TextMap is created with `layout` = True

        Setting `return_chars` to False will exclude the individual
        character objects from the returned text-line dicts.
        z *([^\n]+?) *(\n|$)z([^\n]+)r   F)r{   r}   r|   )r   )rV   r   r}   pats       r-   extract_text_lineszTextMap.extract_text_lines   s<      	(CCC{{AL  
 
 	
r/   )r   TT)TTTTr   )TT)__name__
__module____qualname____doc__r   r   strr
   r   r   rW   rT   r	   intr   r   r   r   r   r   r   r   r*   r/   r-   rN   rN   _   s        

*U3/01
* 
* 	
*
 

* 
* 
* 
*(3 ( ( ( (F "! :  	
  
c3h   < "!&
 &
sGCL()&
 &
 	&

 &
 &
 &
 
d38n	&
 &
 &
 &
R 8<
 

04
	d38n	
 
 
 
 
 
r/   rN   c            +           e Zd ZdZdeeeef                  ddfdZdddddde	e
ddeeeddddddd	fd
edededededededededededededee         dee         dee         dee         dedededef*dZdS )WordMapz&
    A WordMap maps words->chars.
    rO   r&   Nc                     || _         d S rm   )rO   )rV   rO   s     r-   rW   zWordMap.__init__   s    r/   Fr   )r   r   r   r   Tlayoutlayout_widthlayout_heightlayout_width_charslayout_height_charslayout_bbox	x_density	y_densityx_shifty_shifty_tolerancer%   r;   line_dir_rotatedchar_dir_rotatedrQ   rP   use_text_flow	presortedexpand_ligaturesc                   23 g }t          | j                  st          ||p||p|          S |rt          ni }|rm|r|rt	          d          nt          t          ||z                      }|r|rt	          d          nt          t          ||z                      }dg|z  }ng }d}t          |          3t          |          2t          |         }t          |         }t          |         |          }t          |         |          }|s|r| j        nt          | j        3fd          }t          |3fd||p|	          }t          |          D ]\  }} |r0 || d         d                   }!|!||
z   z
  }"|d
v rdnd}#|"|#z  |z  }$nd}$t          t          |dk              t          |$          |z
            }%t          |%          D ]=}t          |          r|d         d         dk    r||z  }|                    d           >||%z  }d}&|s|r| nt          | 2fd          }'|'D ]\  }(})|r$ ||(          }*|*||	z   z
  }+|d
v rdnd}#|+|#z  |z  },nd},t          t#          d|&          t          |,          |&z
            }-|dg|-z  z  }|&|-z  }&|)D ]E}.|                    |.d         |.d                   }/|/D ]}0|                    |0|.f           |&dz  }&F|r|dg||&z
  z  z  }|rP||dz   z
  }1t          |1          D ]"}|dk    r||z  }|                    d           #|d         dk    r
|dd         }t          ||p||p|          S )a  
        Given a list of (word, chars) tuples (i.e., a WordMap), return a list of
        (char-text, char) tuples (i.e., a TextMap) that can be used to mimic
        the structural layout of the text on the page(s), using the following
        approach for top-to-bottom, left-to-right text:

        - Sort the words by (top, x0) if not already sorted.

        - Cluster the words by top (taking `y_tolerance` into account), and
          iterate through them.

        - For each cluster, divide (top - y_shift) by `y_density` to calculate
          the minimum number of newlines that should come before this cluster.
          Append that number of newlines *minus* the number of newlines already
          appended, with a minimum of one.

        - Then for each cluster, iterate through each word in it. Divide each
          word's x0, minus `x_shift`, by `x_density` to calculate the minimum
          number of characters that should come before this cluster.  Append that
          number of spaces *minus* the number of characters and spaces already
          appended, with a minimum of one. Then append the word's text.

        - At the termination of each line, add more spaces if necessary to
          mimic `layout_width`.

        - Finally, add newlines to the end if necessary to mimic to
          `layout_height`.

        For other line/character directions (e.g., bottom-to-top,
        right-to-left), these steps are adjusted.
        rP   rQ   z;`layout_width` and `layout_width_chars` cannot both be set.z=`layout_height` and `layout_height_chars` cannot both be set.)re   Nr   c                 &     | d                   S Nr   r*   r,   line_cluster_keys    r-   r.   z$WordMap.to_textmap.<locals>.<lambda>U  s    3C3CAaD3I3I r/   keyc                 &     | d                   S r   r*   r   s    r-   r.   z$WordMap.to_textmap.<locals>.<lambda>Z  s    &&qt,, r/   )preserve_orderrZ   r   rY   )rY   Nc                 &     | d                   S r   r*   )r,   char_sort_keys    r-   r.   z$WordMap.to_textmap.<locals>.<lambda>z  s    }}QqT7J7J r/   r   N)rg   rO   rN   	LIGATURESrJ   r   roundr:   rB   rH   BBOX_ORIGIN_KEYSsortedr   	enumerateru   rv   appendminget)4rV   r   r   r   r   r   r   r   r   r   r   r   r%   r;   r   r   rQ   rP   r   r   r   _textmap
expansions
blank_linenum_newlinesline_position_keychar_position_keyy_originx_originwords_sorted_line_dirtuples_by_linern   line_tuplesline_position
y_dist_rawadjy_distnum_newlines_prependline_lenline_tuples_sortedwordr   char_position
x_dist_rawx_distnum_spaces_prependr   lettersletternum_newlines_appendr   r   s4                                                     @@r-   
to_textmapzWordMap.to_textmap   s   l 794; 	 / ;8 / ;8    #3:YY
 	! J $U  
 &)|i/G)H)H%I%I"" L  $W  
 '*%	0I*J*J&K&K#%);;JJJ/99)(33)(3)(3#H-k::#H-k:: K)KDKK)I)I)I)IJJJ 	 )!,,,,$5	
 
 
 (77 3	L 3	LNA{  1 1+a.2C D D*h.@A
$66bbA#c)I5#&AE

f,	$ $  /00 . .8}} +Q4(?(?
*H----00LH L -LK-J-J-J-JKKK   2 & &e $5$5d$;$;M!.(W2D!EJ (N : :""C'#-	9FFF%(Q)9)95==8;S%T%T"[M,>>>.. & &A(nnQvY&	BBG") & & 444 A&&  L[M-?(-JKK  		)"59I"J.// . .q55
*H---- ||++#CRC=+7x+7x
 
 
 	
r/   )r   r   r   r   r   r   r   r   rW   DEFAULT_X_DENSITYDEFAULT_Y_DENSITYDEFAULT_Y_TOLERANCEr   r   r   r   r   r   r   r
   rN   r   r*   r/   r-   r   r      s        tE%*;$<= $    
  "##$*,,0**,0,0+/+/#!%+s
 s
s
 s
 	s

  s
 !s
 s
 s
 s
 s
 s
 s
 s
 s
 #5/s
  #5/!s
" "%#s
$ "%%s
& 's
( )s
* +s
, 
-s
 s
 s
 s
 s
 s
r/   r   c                      e Zd Zeeddddddeedddddfdededee	e
df         dee	e
df         ded	ed
ededededee         dee         deee                  deeef         defdZde	defdZdedefdZdedededededefdZdededeeddf         fdZdedeeeef         ddf         fdZdedeeeef         ddf         fd Zdedefd!Z	 d$ded"edefd#ZdS )%WordExtractorNFTx_tolerancer   x_tolerance_ratioy_tolerance_ratiokeep_blank_charsr   vertical_ttbhorizontal_ltrr%   r;   r   r   extra_attrssplit_at_punctuationr   c                     || _         || _        || _        || _        || _        || _        || _        || _        |du rt          	                    d           |du rt          	                    d           |	| _
        |
| _        |p|
| _        |p|	| _        t          | j
        | j                   t          | j        | j        d           |g n|| _        |du rt           j        n|pd| _        |rt&          ni | _        d S )NFzNvertical_ttb is deprecated and will be removed; use line_dir/char_dir instead.zPhorizontal_ltr is deprecated and will be removed; use line_dir/char_dir instead._rotatedTrC   )r   r   r   r   r   r   r   r   loggerwarningr%   r;   r   r   rL   r   stringpunctuationr   r   r   )rV   r   r   r   r   r   r   r   r   r%   r;   r   r   r   r   r   s                   r-   rW   zWordExtractor.__init__  s7   $ '&!2!2 0*,(5  NN2   U""NN2   !  0 <H 0 <HDM4=999D143H*UUU!,!422+
 $t++ &," 	! (8?))Rr/   uprightr&   c                 N    |s	| j         sdS |r	| j        sdS |r| j        n| j        S )Nr8   r9   )r   r   r;   r   )rV   r   s     r-   get_char_dirzWordExtractor.get_char_dir  sF      	t0 	5 	T0 	5 'Bt}}T-BBr/   ordered_charsc                 P    t          |          \  }}}}|d         d         |d         d         z
  }|d         d         }                     |          }d                     fd|D                       |||||z   ||||z
  ||z
  |d
}	 j        D ]}
|d         |
         |	|
<   |	S )Nr   doctopr)   r   rC   c              3   f   K   | ]+}j                             |d          |d                    V  ,dS r   N)r   r   )r`   r   rV   s     r-   ro   z,WordExtractor.merge_chars.<locals>.<genexpr>  sN        >?##AfIqy99     r/   )
r   r4   r6   r)   r  r1   r   r?   width	direction)r   r   r]   r   )rV   r   r4   r)   r6   r1   
doctop_adjr   r;   r   r   s   `          r-   merge_charszWordExtractor.merge_chars  s    -m<<CV"1%h/-2B52II
"9-$$W-- GG    CP     J&sl"W!
 
 # 	. 	.C%a(-DIIr/   	prev_char	curr_charr  c                    |dv rO|}|}|d         }|d         }	|dk    r|d         }
|d         }|d         }nj|d          }
|d          }|d          }nN|}|}|d         }|d         }	|dk    r|d         }
|d         }|d         }n|d          }
|d          }|d          }t          ||
k     p|||z   k    p|	||z   k              S )a  This method takes several factors into account to determine if
        `curr_char` represents the beginning of a new word:

        - Whether the text is "upright" (i.e., non-rotated)
        - Whether the user has specified that horizontal text runs
          left-to-right (default) or right-to-left, as represented by
          self.horizontal_ltr
        - Whether the user has specified that vertical text the text runs
          top-to-bottom (default) or bottom-to-top, as represented by
          self.vertical_ttb
        - The x0, top, x1, and bottom attributes of prev_char and
          curr_char
        - The self.x_tolerance and self.y_tolerance settings. Note: In
          this case, x/y refer to those directions for non-rotated text.
          For vertical text, they are flipped. A more accurate terminology
          might be "*intra*line character distance tolerance" and
          "*inter*line character distance tolerance"

        An important note: The *intra*line distance is measured from the
        *end* of the previous character to the *beginning* of the current
        character, while the *inter*line distance is measured from the
        *top* of the previous character to the *top* of the next
        character. The reasons for this are partly repository-historical,
        and partly logical, as successive text lines' bounding boxes often
        overlap slightly (and we don't want that overlap to be interpreted
        as the two lines being the same line).

        The upright-ness of the character determines the attributes to
        compare, while horizontal_ltr/vertical_ttb determine the direction
        of the comparison.
        )r   r9   r)   r   r4   r6   r   r1   )r   )rV   r	  r
  r  r   r   r,   yaycyaxbxcxs                r-   char_begins_new_wordz"WordExtractor.char_begins_new_word  s'   R &&AA5!B5!BE!!t_t_t_o%o%o% AA4B4BE!!u%x(u%))&&))"W R!V R!V
 
 	
r/   c           
   #   \  
K   g 
dt           t                   dt          t          d d f         f
fd}| j        }| j        }| j        }| j        }|D ]}|d         }	| j        s&|		                                r |d           E d {V  7|	| j
        v r# ||          E d {V   |d           E d {V  c
rX|                     
d         ||||n|
d         d         z  ||n|
d         d         z            r ||          E d {V  
                    |           Ӊ
r
V  d S d S )Nnew_charr&   c              3   &   K   rV  | g n| gd S rm   r*   )r  current_words    r-   start_next_wordz:WordExtractor.iter_chars_to_words.<locals>.start_next_wordX  s1      
  #""""!)!122zLLLr/   r   r   size)r   r   )r
   r   r   r   r   r   r   r   r   isspacer   r  r   )rV   r   r  r  xtxtrytytrcharr   r  s             @r-   iter_chars_to_wordsz!WordExtractor.iter_chars_to_wordsQ  s     
 $&	Buo	Bz4-.	B 	B 	B 	B 	B 	B $$! 	* 	*D<D( *T\\^^ **?40000000000222*?4000000000*?40000000000 
*$";";R #&;RRC,r:J6:R4R#&;RRC,r:J6:R4R #< # # 
* +?40000000000 ##D))))  		 	r/   r   c              #   D  K   t          |          }|d         d         }|r| j        n| j        }|                     |          }t	          |          }t          |          }t          |||dv r| j        n| j                  }|D ]}t          ||          }	|	|fV  d S )Nr   r   r   r8   r   )
rt   r%   r   r   r:   rB   r   r   r   r   )
rV   r   r   r%   r;   r   r   subclustersscchars_sorteds
             r-   iter_chars_to_linesz!WordExtractor.iter_chars_to_lines  s       U(9%$+F4==1F$$W--/99)(33 &!)^!;!;TAQ
 
  	+ 	+B!"-888L*****	+ 	+r/   c              #   (  K   t          dg| j        R  }t          j        ||          }|D ]e\  }}| j        r
|| j        fgn|                     |          }|D ]7\  }}|                     ||          D ]}	|                     |	          |	fV  8fd S )Nr   )	r   r   	itertoolsgroupbyr   r;   r%  r  r  )
rV   r   grouping_keygrouped_charskeyvals
char_groupline_groups
line_charsr  
word_charss
             r-   iter_extract_tuplesz!WordExtractor.iter_extract_tuples  s       ")?d.>???!)%>>#0 	E 	EGZ %:*dm,----j99 
 *5 E E%
I"&":"::y"Q"Q E EJ++J77DDDDDEE	E 	Er/   c                 `    t          t          |                     |                              S rm   )r   rt   r0  )rV   r   s     r-   extract_wordmapzWordExtractor.extract_wordmap  s&    tD44U;;<<===r/   r}   c                     |r,t          d |                     |          D                       S t          d |                     |          D                       S )Nc              3   ,   K   | ]\  }}i |d |iV  dS )r   Nr*   r`   r   r/  s      r-   ro   z.WordExtractor.extract_words.<locals>.<genexpr>  sI        $D* .4-*--     r/   c              3       K   | ]	\  }}|V  
d S rm   r*   r5  s      r-   ro   z.WordExtractor.extract_words.<locals>.<genexpr>  s'      UU!1zUUUUUUr/   )rt   r0  )rV   r   r}   s      r-   extract_wordszWordExtractor.extract_words  sv      	V  (,(@(@(G(G     
 UUT5M5Me5T5TUUUUUUr/   F)r   r   r   DEFAULT_X_TOLERANCEr   r   r   r   r   r   floatr   r   r
   r   r   rW   r   r   r   r  r  r   r   r  r   r%  r0  r   r2  r7  r*   r/   r-   r   r     s        105959!&#!#**,0,0+/16!%!4@ 4@4@ 4@ !eT!12	4@
 !eT!124@ 4@ 4@ 4@ 4@ 4@ 4@ #5/4@ #5/4@ d3i(4@ $D#I.4@  !4@ 4@ 4@ 4@l
CC 
CE 
C 
C 
C 
C     4K
K
 K
 	K

 K
 K
 
K
 K
 K
 K
Z.!. . 
:tT)	*	. . . .`++	5U*+T47	8+ + + +.EE	5
*+T47	8E E E E >Z >G > > > > 7<	V 	V	V/3	V		V 	V 	V 	V 	V 	Vr/   r   Fr   r}   kwargsc                 B    t          di |                    | |          S )Nr*   )r   r7  )r   r}   r;  s      r-   r7  r7    s(     ""6""00EEEr/   c                                         d                    d          pt          |           d           t          di fdt          D             }|                    |           } |j        di fdt          D             }|S )NTr   )r   r   c                 *    i | ]}|v ||         S r*   r*   r`   kr;  s     r-   
<dictcomp>z$chars_to_textmap.<locals>.<dictcomp>  s$    
F
F
FA!v++1fQi+++r/   c                 *    i | ]}|v ||         S r*   r*   r?  s     r-   rA  z$chars_to_textmap.<locals>.<dictcomp>  s$    
?
?
?A1;;1fQi;;;r/   r*   )updater   r   r   WORD_EXTRACTOR_KWARGSr2  r   TEXTMAP_KWARGS)r   r;  	extractorwordmaptextmaps    `   r-   chars_to_textmaprI    s    
MM!::m44N8N8N	
 	
     
F
F
F
F!6
F
F
F I ''..G g   
?
?
?
?
?
?
? G Nr/   rP   rQ   c                 r   t          |           } t          |           dk    rdS                     d          ri ||d}t          | fi |j        S t          di fdt          D             }|                    |           }|p|j        }|p|j	        }t          |j                  }                    dt                    }                    dt                    }	t          |||dv r|	n|          }
t          d	 d
                    d |
D                       D             ||          j        S )Nr   rC   r   r   c                 *    i | ]}|v ||         S r*   r*   r?  s     r-   rA  z extract_text.<locals>.<dictcomp>  s$    JJJa6kkq&)kkkr/   r   r   r!  c                     g | ]}|d fS rm   r*   )r`   r  s     r-   rb   z extract_text.<locals>.<listcomp>  s,        t  r/   rY   c              3   T   K   | ]#}d                      d |D                       V  $dS )re   c              3   &   K   | ]}|d          V  dS r  r*   )r`   r   s     r-   ro   z)extract_text.<locals>.<genexpr>.<genexpr>  s&      &E&EtF|&E&E&E&E&E&Er/   Nrp   r_   s     r-   ro   zextract_text.<locals>.<genexpr>  s=      XX$chh&E&E&E&E&EEEXXXXXXr/   r*   )r   rg   r   rI  rU   r   rD  r7  r%   r;   r:   r9  r   r   rN   r]   )r   rP   rQ   r;  textmap_kwargsrF  wordsr   r   r   rq   s      `       r-   extract_textrQ    s    ENNE
5zzQrzz( #

"1oVV
  8888BB! 
 
JJJJ%:JJJ
 
	 ''..)?Y-?)?Y-?/	0BCCjj0CDDjj0CDD*n<<KK+
 
   IIXXRWXXXXX   ,+	
 	
 	
 		r/   r.  	tolerancec                     d}d }t          | t          d                    D ]+}||d         ||z   k    r|dz  }|d         }||d         z  },|S )NrC   r4   r   re   r6   r   )r   r   )r.  rR  colllast_x1r  s        r-   collate_linerV    sr     DGzz$'7'7888  d4jGi4G&H&HCKDt*VKr/   r   r   c                     t          | t          d          |          }d                    fd|D                       S )Nr  rY   c              3   8   K   | ]}t          |          V  d S rm   )rV  )r`   r   r   s     r-   ro   z&extract_text_simple.<locals>.<genexpr>  s-      EEa\![11EEEEEEr/   )r   r   r]   )r   r   r   	clustereds    `  r-   extract_text_simplerZ    sE    
  z(';';[III99EEEE9EEEEEEr/   fontnamer  r   .c                     t          g d|pt                      R  t          dd          dt          dt          t          ddf         ffd} ||           }t          || j                  S )	u   
    Removes duplicate chars — those sharing the same text and positioning
    (within `tolerance`) as other characters in the set. Use extra_args to
    be more restrictive with the properties shared by the matching chars.
    )r   r   r  r4   r   r&   Nc              3   6  K   t          |           }t          j        |          D ]m\  }}t          t	          |          t          d                    D ]<}t          |t          d                    D ]}t          |          d         V  =nd S )Nr   r  r4   r   )r   r'  r(  r   rt   r   )	r   sorted_charsgrp	grp_chars	y_cluster	x_clusterr   pos_keyrR  s	         r-   yield_unique_charsz(dedupe_chars.<locals>.yield_unique_chars'  s      e---'/#FFF 	< 	<NC,YH!5!5y  < <	 "1z$//" " < <I !888;;;;;<<	< 	<r/   r   )r   tupler   r   r   r   index)r   rR  r   re  dedupedr   rd  s    `   @@r-   dedupe_charsri    s     
E)
E[-CEGG
E
E
EC4((G	<* 	<5$;L1M 	< 	< 	< 	< 	< 	< 	< 	< ! ''G'u{++++r/   r\   r8  )NN)r   r[  )@inspectr'  loggingr   r   operatorr   typingr   r   r   r   r   r	   r
   r   r   r   _typingr   r   r   r   r   r   
clusteringr   genericr   geometryr   	getLoggerr   r   r9  r   r   r   r   __annotations__r   r   r:   rB   r   rH   r   rL   rN   r   r   r   r7  	signaturer   
parametersrI   rE  rD  rI  rQ  rV  rZ  ri  r*   r/   r-   <module>rv     s<         				                               J I I I I I I I I I I I I I I I ' ' ' ' ' '       % % % % % %		8	$	$     %    %     	5 Xugun-E     (E7E%,<O3O*P     :a==:a==:a==:a==	   :e:h:d:d	 
 
% 
5 
# 
t 
 
 
 
"G
 G
 G
 G
 G
 G
 G
 G
T{
 {
 {
 {
 {
 {
 {
 {
|MV MV MV MV MV MV MV MVb -2F FF%)F=@FF F F F #"7#566AFFHH))-88CHHJJ J # '    ( (,'+- --e_- e_- 	-
 	- - - -d +  	     -,F FFF F 		F F F F -A, ,,, %S/*, 	, , , , , ,r/   