
    Ng\                         d Z ddlmZ  G d d          Zd Zd Zd Zdd	Z	 	 d dZd!dZ	d Z
d ZddZddZd"dZd Zd Zd Zd Zd#dZd Zd Zd Zd ZdS )$z*
Copyright (C) 2021 Microsoft Corporation
    )defaultdictc                   .    e Zd ZddZd Zd Zd Zd ZdS )RectNc                     |d| _         d| _        d| _        d| _        d S |d         | _         |d         | _        |d         | _        |d         | _        d S )Nr            x_miny_minx_maxy_max)selfbboxs     k/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured_inference/models/table_postprocess.py__init__zRect.__init__	   sW    <DJDJDJDJJJaDJaDJaDJaDJJJ    c                 T    | j         | j        z
  | j        | j        z
  z  }|dk    r|ndS )z$Calculates the area of the rectangler   g        )r   r   r   r   )r   areas     r   get_areazRect.get_area   s0    
TZ'DJ,CDaxxttS(r   c                 8   |                                  dk    r1|j        | _        |j        | _        |j        | _        |j        | _        nt          | j        |j                  | _        t          | j        |j                  | _        t          | j        |j                  | _        t          | j        |j                  | _        | j        | j        k    s(| j        | j        k    s|                                  dk    rd| _        d| _        d| _        d| _        | S )z2Calculates the intersection with another rectangler   )r   r   r   r   r   maxmin)r   others     r   	intersectzRect.intersect   s    ==??aDJDJDJDJJTZ55DJTZ55DJTZ55DJTZ55DJzDJ&&$*tz*A*AT]]__XYEYEY



r   c                    t          |          }|                                 dk    r2|j        | _        |j        | _        |j        | _        |j        | _        | S t          | j        |j                  | _        t          | j        |j                  | _        t          | j        |j                  | _        t          | j        |j                  | _        | S )z4Calculates a rectangle that includes both rectanglesr   )r   r   r   r   r   r   r   r   )r   r   r   s      r   include_rectzRect.include_rect/   s    T

==??aDJDJDJDJKU[11
U[11
U[11
U[11
 r   c                 6    | j         | j        | j        | j        gS )z1Returns the coordinates that define the rectangler
   )r   s    r   get_bboxzRect.get_bboxG   s    
DJ
DJ??r   )N)__name__
__module____qualname__r   r   r   r   r    r   r   r   r      si        
! 
! 
! 
!) ) )
  *  0@ @ @ @ @r   r   c                      fd| D             S )z3
    Filter out objects below a certain score.
    c                 ,    g | ]}|d          k    |S )scorer#   ).0obj	thresholds     r   
<listcomp>z#apply_threshold.<locals>.<listcomp>P   s'    @@@Cc'li&?&?C&?&?&?r   r#   )objectsr)   s    `r   apply_thresholdr,   L   s      A@@@7@@@@r   c                     t          |          dk    r#t          | |d          } t          ||            nt          | ddd          } t          |           dk    rt	          |           } | S )z^
    Apply operations to the detected rows, such as
    thresholding, NMS, and alignment.
    r         ?overlap_thresholdobject2_overlapTmatch_criteriamatch_thresholdkeep_higherr   )lennms_by_containmentremove_objects_without_contentnmssort_objects_top_to_bottom)rowstokensscore_thresholds      r   refine_rowsr>   S   ss     6{{Q!$#FFF&vt44444(93\`aaa
4yy1}})$//Kr   c                     t          |          dk    r#t          | |d          } t          ||            nt          | ddd          } t          |           dk    rt	          |           } | S )	za
    Apply operations to the detected columns, such as
    thresholding, NMS, and alignment.
    r   r.   r/   r1   g      ?Tr2   r   )r6   r7   r8   r9   sort_objects_left_to_right)columnsr<   r=   s      r   refine_columnsrB   d   s     6{{Q$WfLLL&vw7777, 	
 
 
 7||a,W55Nr   r.   c                    t          |           } t          |           }d | D             t          | ||d          \  }}}t          d|          D ]}t	          ||                   }t          |          dk    rd|<   t          |          D ]J}|         s@t	          ||                   }	t          |                    |	                    dk    rd|<   Kfdt          |           D             }
|
S )z_
    Non-maxima suppression (NMS) of objects based on shared containment of other objects.
    c                     g | ]}d S Fr#   r'   r(   s     r   r*   z&nms_by_containment.<locals>.<listcomp>   s    666S5666r   F)r0   forced_assignmentr   r   Tc                 *    g | ]\  }}|         |S r#   r#   r'   idxr(   suppressions      r   r*   z&nms_by_containment.<locals>.<listcomp>   s(    ^^^XS#[Y\M]^S^^^r   )sort_objects_by_scorer6   slot_into_containersrangesetintersection	enumerate)container_objectspackage_objectsr0   num_objectspackages_by_container_object2_numobject2_packagesobject1_numobject1_packagesfinal_objectsrK   s              @r   r7   r7   z   sA    ..?@@'((K66$5666K"6+	# # #1a Q,, 4 44[ABB  A%%'+K$ -- 	4 	4K{+ 4#&'<['I#J#J '445EFFGG!KK/3K,		4 _^^^3D)E)E^^^Mr   Fc                 |   g }d | D             }d |D             }t          |           dk    st          |          dk    r|||fS t          t                    }t          |          D ]N\  }}	g }t	          |	d                   }
|
                                }t          |           D ]x\  }}t	          |d                   }|                    t	          |	d                                                             }|dk    r||z  }|                    |||d           yt          |          dk    rt          |          }|d         }|                    |d                    |s|d         |k    rB||d                                      |           ||                             |d                    P|||fS )z
    Slot a collection of objects into the container they occupy most (the container which holds the
    largest fraction of the object).
    c                     g | ]}g S r#   r#   )r'   	containers     r   r*   z(slot_into_containers.<locals>.<listcomp>   s    CCCIRCCCr   c                     g | ]}g S r#   r#   )r'   packages     r   r*   z(slot_into_containers.<locals>.<listcomp>   s    ==='2===r   r   r   )r^   container_numr&   r&   ra   )	r6   r   dictrQ   r   r   r   appendrL   )rR   rS   r0   rG   best_match_scorescontainer_assignmentspackage_assignmentsmatch_scorespackage_numr`   package_rectpackage_areara   r^   container_rectintersect_areaoverlap_fractionsorted_match_scoresbest_match_scores                      r   rM   rM      s    CC1BCCC==_===
""c/&:&:a&?&?$&9;LLLt$$L )/ : : [ [WGFO,,#,,..(12C(D(D 	 	$M9!)F"344N+55d76?6K6KLLUUWWNa#1L#@ ##%.)6!1    |q  "7"E"E215$$%5g%>???  [$4W$=AR$R$R%&6&GHOOP[\\\#K0778H8YZZZ "57HHHr   Tc                 (    t          | d |          S )zG
    Put any set of objects in order from high score to low score.
    c                     | d         S )Nr&   r#   ks    r   <lambda>z'sort_objects_by_score.<locals>.<lambda>   s
    7 r   )keyreversesorted)r+   rv   s     r   rL   rL      s     '33WEEEEr   c                     |dd         D ]U}t          | |d                   \  }}t          |                                          dk    r|                    |           VdS )z~
    Remove any objects (these can be rows, columns, supercells, etc.) that don't
    have any text associated with them.
    Nr   r   )extract_text_inside_bboxr6   stripremove)
page_spansr+   r(   object_textrV   s        r   r8   r8      sn    
 qqqz    1*c&kJJQ{  ""##q((NN3   r   c                 L    t          | |          }t          |d          }||fS )z1
    Extract the text inside a bounding box.
    T)remove_integer_superscripts)get_bbox_span_subsetextract_text_from_spans)spansr   
bbox_spans	bbox_texts       r   rz   rz      s1     &eT22J'
PTUUUIj  r   c                 l    g }| D ].}t          |d         ||          r|                    |           /|S )z
    Reduce the set of spans to those that fall within a bounding box.

    threshold: the fraction of the span that must overlap with the bbox.
    r   )overlapsrc   )r   r   r)   span_subsetspans        r   r   r      sK     K % %DL$	22 	%t$$$r   c                    t          t          |                     }|                                }|dk    rdS |                    t          t          |                                                              |z  |k    S )zN
    Test if more than "threshold" fraction of bbox1 overlaps with bbox2.
    r   F)r   listr   r   )bbox1bbox2r)   rect1area1s        r   r   r      sj     eENNEzzu??4U,,--66885@IMMr   c                 R   |rdnd}| dd         }|r^| D ][}d|vr|d         }|dz  rG|d                                                                          r|                    |           Vd|d<   \t          |          d	k    rdS |                    d
            |                    d            |                    d            g }|d	         d         g}t          |dd         |dd                   D ]\  }	}
|	d         |
d         k    s|	d         |
d         k    r|                    |                                           }t          |          d	k    r>|d         dk    r2t          |          dk    r|d         dk    r|d         dk    s|s|dz  }|                    |           |
d         g}|                    |
d                    |                    |          }|                    |           |                    |                                           S )zT
    Convert a collection of page tokens/words/spans into a single text string.
      Nflagsr   textTsuperscriptr   c                     | d         S )Nspan_numr#   r   s    r   rt   z)extract_text_from_spans.<locals>.<lambda>      T*%5 r   ru   c                     | d         S )Nline_numr#   r   s    r   rt   z)extract_text_from_spans.<locals>.<lambda>  r   r   c                     | d         S )N	block_numr#   r   s    r   rt   z)extract_text_from_spans.<locals>.<lambda>  s    T+%6 r   r   r   -)r{   isdigitr|   r6   sortzipjoinrc   )r   join_with_spacer   	join_char
spans_copyr   r   
line_textsline_span_textsspan1span2	line_texts               r   r   r      sm   
 '.BIqqqJ" 	/ 	/ 	/Dd""MEt| /<%%''//11 /%%d++++*.D'
:!rOO55O666OO55O666OO66O777 J!!}V,-OJssOZ^<< 2 2u{!333uZ7HER\L]7]7]!77==??II""bMS((Y!++	"0D0DSUZ]I]I]' J^ S 	i((($V}oOO""5=1111//Ii   >>*%%++---r   c                 &    t          | d           S )z6
    Put the objects in order from left to right.
    c                 <    | d         d         | d         d         z   S )Nr   r   r   r#   rr   s    r   rt   z,sort_objects_left_to_right.<locals>.<lambda>3      ailQvYq\&A r   r   rw   objss    r   r@   r@   /       $AABBBBr   c                 &    t          | d           S )z6
    Put the objects in order from top to bottom.
    c                 <    | d         d         | d         d         z   S )Nr   r   r	   r#   rr   s    r   rt   z,sort_objects_top_to_bottom.<locals>.<lambda>:  r   r   r   rw   r   s    r   r:   r:   6  r   r   c                     	 | D ]$}|d         |d         d<   |d         |d         d<   %n)# t           $ r}t          d|            Y d}~nd}~ww xY w| S )zd
    For every column, align the top and bottom boundaries to the final
    table bounding box.
    r   r   r	   zCould not align columns: N	Exceptionprint)rA   r   columnerrs       r   align_columnsr   =  s    
 	( 	(F $QF6N1 $QF6N1	(    /#//000 N   '* 
AAAc                     	 | D ]$}|d         |d         d<   |d         |d         d<   %n)# t           $ r}t          d|            Y d}~nd}~ww xY w| S )za
    For every row, align the left and right boundaries to the final
    table bounding box.
    r   r   r   zCould not align rows: Nr   )r;   r   rowr   s       r   
align_rowsr   M  s    
 	% 	%C!!WCKN!!WCKNN	%    ,s,,--- Kr   r1   皙?c                    t          |           dk    rg S t          | |          } t          |           }d | D             t          d|          D ]}t          | |         d                   }|                                }t          |          D ]}|         st          | |         d                   }	|	                                }
|	                    |                                          }	 |dk    r||
z  }n|dk    r||z  }n|dk    r||
|z   |z
  z  }||k    rd	|<    n# t          $ r Y w xY wfd
t          |           D             S )a2  
    A customizable version of non-maxima suppression (NMS).

    Default behavior: If a lower-confidence object overlaps more than 5% of its area
    with a higher-confidence object, remove the lower-confidence object.

    objects: set of dicts; each object dict must have a 'bbox' and a 'score' field
    match_criteria: how to measure how much two objects "overlap"
    match_threshold: the cutoff for determining that overlap requires suppression of one object
    keep_higher: if True, keep the object with the higher metric; otherwise, keep the lower
    r   )rv   c                     g | ]}d S rE   r#   rF   s     r   r*   znms.<locals>.<listcomp>o  s    ,,,S5,,,r   r   r   object1_overlapr1   iouTc                 *    g | ]\  }}|         |S r#   r#   rI   s      r   r*   znms.<locals>.<listcomp>  s'    KKKHC+c:JKCKKKr   )r6   rL   rN   r   r   r   ZeroDivisionErrorrQ   )r+   r3   r4   r5   rT   rW   object2_rectobject2_arearY   object1_rectobject1_arearl   metricrK   s                @r   r9   r9   ]  s    7||q	#G[AAAGg,,K,,G,,,KQ,,  GK0899#,,.. -- 	 	K{+ #GK$8$@AA+4466!-!7!7!E!E!N!N!P!P%):::!/,!>'+<<<!/,!>'500!/<,3NQ_3_!`0037K0 1 )   D$ LKKK	' 2 2KKKKs   44D++
D87D8c           	      	   g }| D ]}d|d<   d}d}t                      }t                      }t                    D ]\  }	}
|
d         d         |
d         d         z
  }|d         d         |d         d         z
  }t          |
d         d         |d         d                   }t          |
d         d         |d         d                   }||z
  }d|v rt          ||z  ||z            }n||z  }|dk    r7d|
v r|
d         r|                    |	           |                    |	           d|d<   t          |          d	k    rPt          |          d	k    r=t          |          t          |          k    rt                      }nt                      }t          |          d	k    rd
|d<   nd|v r|                    |          }|D ]A}	|t          |	         d                   } |                    |	         d                   }B|g }t                    D ]\  }}|d         d         |d         d	         z
  }|d         d         |d         d	         z
  }t          |d         d	         |d         d	                   }t          |d         d         |d         d                   }||z
  }d|v r$t          ||z  ||z            }|d         r|dz  }n||z  }|dk    rH|	                    |           |t          |d                   }|                    |d                   }|4|
                    |                                          }||d<   t          |          d	k    rjt          |          d	k    rVt          |          dk    st          |          dk    r/t          |          |d<   ||d<   |	                    |           d|v r|d         rt          |d                   dk    rt          d	t          |d                             D ]}	|	g|d         |d         d
d}fd|d         D             }fd|d         D             }t          d |D                       t          d |D                       t          d |D                       t          d |D                       g}||d<   |	                    |           |S )z
    For each supercell, align it to the rows it intersects 50% of the height of,
    and the columns it intersects 50% of the width of.
    Eliminate supercells for which there are no rows and columns it intersects 50% with.
    FheaderNr   r	   r   r   r.   r   Tr   row_numberscolumn_numbersr&   )r   r   r&   
propagatedc                      g | ]
}|         S r#   r#   )r'   rJ   rA   s     r   r*   z$align_supercells.<locals>.<listcomp>  s    ,a,a,acWS\,a,a,ar   c                      g | ]
}|         S r#   r#   )r'   rJ   r;   s     r   r*   z$align_supercells.<locals>.<listcomp>  s    )X)X)X$s))X)X)Xr   c                 *    g | ]}|d          d         S )r   r   r#   r'   r   s     r   r*   z$align_supercells.<locals>.<listcomp>  !    SSS6VF^A.SSSr   c                 *    g | ]}|d          d         S )r   r   r#   r'   r   s     r   r*   z$align_supercells.<locals>.<listcomp>       JJJS[^JJJr   c                 *    g | ]}|d          d         S )r   r   r#   r   s     r   r*   z$align_supercells.<locals>.<listcomp>  r   r   c                 *    g | ]}|d          d         S )r   r	   r#   r   s     r   r*   z$align_supercells.<locals>.<listcomp>  r   r   )rO   rQ   r   r   addr6   unionr   r   rc   r   r   r   rN   )
supercellsr;   rA   aligned_supercells	supercellrow_bbox_rectcol_bbox_rectintersecting_header_rowsintersecting_data_rowsrow_numr   
row_heightsupercell_heightmin_row_overlapmax_row_overlapoverlap_heightrm   intersecting_rowsintersecting_colscol_numcol	col_widthsupercell_widthmin_col_overlapmax_col_overlapoverlap_widthsupercell_bboxnew_supercellnew_supercell_columnsnew_supercell_rowsr   s    ``                            r   align_supercellsr     s     e= e=	#	(#&55 !$%dOO 	8 	8LGSVQ#f+a.8J(03i6G6JJ!#f+a.)F2CA2FGGO!#f+a.)F2CA2FGGO,>N""#&"Z/"%55$ $  
 $2J#> 3&&s??s8}?,009999*..w777 $	(%&&**s3K/L/Lq/P/P)**S1I-J-JJJ+.55((),&'((1,,"&Ihy  2889QRR( 	R 	RG$ $T']6%: ; ; - : :4=;P Q Q %g.. 	L 	LLGSFAVQ7I'/2Yv5Fq5IIO!#f+a.)F2CA2FGGO!#f+a.)F2CA2FGGO+o=M""#&}y'@-RaBa#b#b X& <'7!';$#09#< 3&&!((111 ($(V$5$5MM$1$>$>s6{$K$KM &00??HHJJ*	& !""Q&&%&&**&''!++s3D/E/E/I/I'+,='>'>Im$*;I&'%%i000 ""y':"s9M]C^?_?_bc?c?c$QIm,D(E(EFF = =G(/y*34D*E!*7!3&*	% %M -b,a,a,aYO_E`,a,a,a))X)X)X)Xy?W)X)X)X&SS=RSSSTTJJ7IJJJKKSS=RSSSTTJJ7IJJJKK	D -1M&)&--m<<<<r   c                    t          |           } t          |           }d | D             t          d|          D ]}| |         }t          |          D ]}| |         }t          ||           t          |d                   dk     rt          |d                   dk     s2t          |d                   dk    st          |d                   dk    rd|<   fdt	          |           D             S )	a  
    A NMS scheme for supercells that first attempts to shrink supercells to
    resolve overlap.
    If two supercells overlap the same (sub)cell, shrink the lower confidence
    supercell to resolve the overlap. If shrunk supercell is empty, remove it.
    c                     g | ]}d S rE   r#   r'   r   s     r   r*   z"nms_supercells.<locals>.<listcomp>  s    555Y5555r   r   r   r   r   r   Tc                 *    g | ]\  }}|         |S r#   r#   rI   s      r   r*   z"nms_supercells.<locals>.<listcomp>  s'    NNNHC[=MNCNNNr   )rL   r6   rN   remove_supercell_overlaprQ   )r   num_supercellssupercell2_num
supercell2supercell1_num
supercell1rK   s         @r   nms_supercellsr    s    'z22J__N55*555K>22 
/ 
//
#N33 	= 	=N#N3J$Z<<<<M*++a//C
CS8T4U4UXY4Y4Y:m,--22:./00A55*.K'NNNN	* 5 5NNNNr   c                     d | D             }t          |          }|dd         D ]}t          t                    }t          |d                   }|D ]s}t	          |d                   }||k     rVt          |d                                       t          |d                             r|d         D ]}||xx         dz  cc<   tt          d|          D ]%}||         dk    r|                     |            n&dS )aW  
    Make sure no supercell in the header is below more than one supercell in any row above it.
    The cells in the header form a tree, but a supercell with more than one supercell in a row
    above it means that some cell has more than one parent, which is not allowed. Eliminate
    any supercell that would cause this to be violated.
    c                 *    g | ]}d |v |d          |S )r   r#   r   s     r   r*   z)header_supercell_tree.<locals>.<listcomp>  s4       Y1F1F9U]K^1F	1F1F1Fr   Nr   r   r   r   )	rL   r   intr   r   rO   issubsetrN   r|   )	r   header_supercellsheader_supercellancestors_by_rowmin_rowheader_supercell2max_row2row2r   s	            r   header_supercell_treer    sY    #-   ..?@@-aaa0  &s++&}566!2 	0 	0,];<<H'!!c*:;K*L&M&M&V&V%&6788' '! .m< 0 0D$T***a/****G$$ 	 	C$))!!"2333 * r   c                    t          | d                                       t          |d                             }t          | d                                       t          |d                             }t          |          dk    rt          |          dk    rt          |d                   t          |d                   k     rt          |d                   }t	          |d                   }||v r1|                    |           |d                             |           n||v r1|                    |           |d                             |           ng |d<   t                      }nt          |d                   }t	          |d                   }||v r1|                    |           |d                             |           nH||v r1|                    |           |d                             |           ng |d<   t                      }t          |          dk    rt          |          dk    dS dS dS dS )a  
    This function resolves overlap between supercells (supercells must be
    disjoint) by iteratively shrinking supercells by the fewest grid cells
    necessary to resolve the overlap.
    Example:
    If two supercells overlap at grid cell (R, C), and supercell #1 is less
    confident than supercell #2, we eliminate either row R from supercell #1
    or column C from supercell #1 by comparing the number of columns in row R
    versus the number of rows in column C. If the number of columns in row R
    is less than the number of rows in column C, we eliminate row R from
    supercell #1. This resolves the overlap by removing fewer grid cells from
    supercell #1 than if we eliminated column C from it.
    r   r   r   N)rO   rP   r6   r   r   r|   )r  r   common_rowscommon_columns
min_column
max_columnr  max_rows           r   r   r   3  sT    j/00==c*]B[>\>\]]K$4566CCJ'()) N k

Q

3~#6#6#:#:
 z-())C
;K0L,M,MMMZ(89::JZ(89::J^++%%j111+,33J????~--%%j111+,33J????/1
+,!$*]344G*]344G+%%""7+++=)009999K''""7+++=)009999,.
=)!ee9 k

Q

3~#6#6#:#:#:#:

#:#:

r   N)r.   )r.   F)T)TT)r1   r   T)__doc__collectionsr   r   r,   r>   rB   r7   rM   rL   r8   rz   r   r   r   r@   r:   r   r   r9   r   r  r  r   r#   r   r   <module>r     s    $ # # # # #A@ A@ A@ A@ A@ A@ A@ A@HA A A  "  ,   @ 	/I /I /I /IdF F F F     ! ! !
 
 
 
N N N N/. /. /. /.dC C CC C C      *L *L *L *LZo o odO O O6  81$ 1$ 1$ 1$ 1$r   