
    Ngs              	          U d dl mc mZ d dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) ddl*m+Z,  G d de'          Z- e-            a.e-e/d<   d Z0de1fdZ2ej3        ej4        ej5        ej6        ej7        ej8        ddZ9de:dej        de;fdZ<dedee=e=f         dee=e1f         fdZ>deee1e	f                  d ee1e?f         d!eee1e	f                  fd"Z@d# ZAd$ ZBd% ZCd& ZDd' ZEd( ZFd) ZGd* ZHd+ee:         d!ee:         fd,ZId+ee:         d!e1fd-ZJd.ej        d/e?d!ej        fd0ZKdS )1    N)defaultdict)Path)AnyDictListMappingOptionalSequenceTupleUnion)Image)DetrImageProcessor"TableTransformerForObjectDetectionlogging)%TableTransformerObjectDetectionOutput)inference_config)table_cells_to_dataframe)logger)Rect)UnstructuredModel)pad_image_with_background_color   )table_postprocessc            
       L    e Zd ZdZd Z	 	 ddej        deee	                  de
f fdZdej                                        rd	nd
fdee
eef         dee
         fdZej        fdej        dedefdZej        ddfdej        dedeee	                  dee
         fdZ xZS )!UnstructuredTableTransformerModelz1Unstructured model wrapper for table-transformer.c                     d S )N )selfs    `/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured_inference/models/tables.py__init__z*UnstructuredTableTransformerModel.__init__   s        Nhtmlx
ocr_tokensresult_formatc                 v    t                                          |           |                     |||          S )a  Predict table structure deferring to run_prediction with ocr tokens

        Note:
        `ocr_tokens` is a list of dictionaries representing OCR tokens,
        where each dictionary has the following format:
        {
            "bbox": [int, int, int, int],  # Bounding box coordinates of the token
            "block_num": int,  # Block number
            "line_num": int,   # Line number
            "span_num": int,   # Span number
            "text": str,  # Text content of the token
        }
        The bounding box coordinates should match the table structure.
        FIXME: refactor token data into a dataclass so we have clear expectations of the fields
        )r$   r%   )superpredictrun_prediction)r   r#   r$   r%   	__class__s       r   r(   z)UnstructuredTableTransformerModel.predict!   s6    * 	""1="YYYr!   cudacpumodeldevicec                    || _         t                      | _        	 t          j        d           t          j                    }t          j                     t          j	        |          | _
        t          j        |           | j
                                         nE# t          $ r8 t          j        d           t          j        d           t          d          w xY w| j
                            |           dS )z4Loads the donut model using the specified parametersz%Loading the table structure model ...zFailed to initialize the model.z Ensure that the model is correctzKReview the parameters to initialize a UnstructuredTableTransformerModel objN)r.   r   feature_extractorr   infor   get_verbosityset_verbosity_errorr   from_pretrainedr-   set_verbosityevalEnvironmentErrorcriticalImportErrorto)r   r-   r.   cached_current_verbositys       r   
initializez,UnstructuredTableTransformerModel.initialize9   s     !3!5!5	K?@@@'.'<'>'>$')));KERRDJ!":;;;JOO 	 	 	O=>>>O>???]  	 	
fs   B B ACpad_for_structure_detectionreturnc                     t          j                    5  |                     t          ||          d                              | j                  } | j        di |}||d<   |cddd           S # 1 swxY w Y   dS )zget the table structure as a dictionary contaning different types of elements as
        key-value pairs; check table-transformer documentation for more informationpt)return_tensorsr=   Nr   )torchno_gradr0   r   r:   r.   r-   )r   r#   r=   encodingoutputs_structures        r   get_structurez/UnstructuredTableTransformerModel.get_structureR   s     ]__ 	% 	%--/3NOO# .   boo  !+
 6 6X 6 6?Z;<$	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	% 	%s   AA22A69A6c                 H   |                      ||          }|t          d          t          |||          }t          |          dk    r	|d         }ndS |dk    rt	          |          pd}n2|dk    rt          |          }n|dk    r|}nt          d	| d
          |S )zPredict table structureNz1Cannot predict table structure with no OCR tokens)tokensr    r"   	dataframecellszresult_format zG is not a valid format. Valid formats are: "html", "dataframe", "cells")rF   
ValueError	recognizelencells_to_htmlr   )r   r#   r=   r$   r%   rE   recognized_table
predictions           r   r)   z0UnstructuredTableTransformerModel.run_predictionb   s     !..q2MNNPQQQ$%6*MMM  1$$)!,JJ 2F""&z228bJJk))1*==JJg%%#JJC C C C  
 r!   )Nr"   )__name__
__module____qualname____doc__r    PILImager   r	   r   r   strr(   rB   r+   is_availabler   r   r   r<   r   TABLE_IMAGE_BACKGROUND_PADintdictrF   r)   __classcell__)r*   s   @r   r   r      s       ;;   ,0#	Z Z>Z T$Z(Z 	Z Z Z Z Z Z4 GK*/**A*A*C*C N S$ BBC    8 ,<+V% %>% &)% 
	% % % %& ,<+V+/'-   >  &)  T$Z(	 
  }               r!   r   tables_agentc                      t          t          d          s.t          j        d           t                              d           dS )zOLoads the Table agent as a global variable to ensure that we only load it once.r-   zLoading the Table agent ...z1microsoft/table-transformer-structure-recognitionN)hasattrr]   r   r1   r<   r   r!   r   
load_agentr`      sB     <)) U1222 STTT
Fr!   	data_typec                 @    | dk    rdddddddd	}n| d
k    rdddd}|S )zDefines class map dictionaries	structurer   r                  tabletable column	table rowtable column headertable projected row headertable spanning cell	no object	detection)rj   ztable rotatedrp   r   )ra   	class_maps     r   get_class_maprs      sR    K#$*+#$
 
		 
k	!	!!!DD	r!   
   ri   outputsimgrH   c                     t          d          }d |                                D             }t          }t          | |j        |          }t          ||          }t          ||          }fd|D             S )zRecognize table elements.rc   c                     i | ]\  }}||	S r   r   ).0kvs      r   
<dictcomp>zrecognize.<locals>.<dictcomp>   s    FFF41a!QFFFr!   c                 <    g | ]}t          |          d          S )r   )structure_to_cells)ry   rc   rH   s     r   
<listcomp>zrecognize.<locals>.<listcomp>   s)    WWWy&11!4WWWr!   )rs   itemsstructure_class_thresholdsoutputs_to_objectssizeapply_thresholds_on_objectsobjects_to_structures)	ru   rv   rH   str_class_name2idxstr_class_idx2nameclass_thresholdsobjectshigh_confidence_objectstables_structures	     `      r   rM   rM      s    &{33FF+=+C+C+E+EFFF1 !#(4FGGG9'CSTT,-DfN^__WWWWFVWWWWr!   img_sizeclass_idx2namec                 r   | d                              d                              d          }t          |j                                                                                                                  d         }t          |j                                                                                                                  d         }| d                                                                         d         }|                     dd          }|d         |dz  z   |d         |dz  z   f}d t          ||          D             }|g }	t          |||          D ]S\  }
}}|t          |
                   }|d	k    r2|	                    |t          |          fd
|D             d           T|	S )zOutput table element types.logitsr   
pred_boxesr=   rd   r   c                 6    g | ]}|                                 S r   )tolistry   elems     r   r   z&outputs_to_objects.<locals>.<listcomp>   s     UUUT4;;==UUUr!   rp   c                 4    g | ]}t          |          z
  S r   )float)ry   r   
shift_sizes     r   r   z&outputs_to_objects.<locals>.<listcomp>   s$    GGG$U4[[:5GGGr!   )labelscorebbox)softmaxmaxlistindicesdetachr,   numpyvaluesgetrescale_bboxesziprZ   appendr   )ru   r   r   mpred_labelspred_scorespred_bboxespad
scale_sizer   r   r   r   class_labelr   s                 @r   r   r      s    	!!"%%))"--Aqy''))--//557788;Kqx((,,..446677:K,'..004466q9K
++3Q
7
7C1+a'!sQw)>?JUU^K-T-TUUUK JG!+{KHH 	 	ud$SZZ0+%%NN("5\\GGGG$GGG    Nr!   r   
thresholdsr>   c                 $    fd| D             } | S )a  
    Filters predicted objects which the confidence scores below the thresholds

    Args:
        objects: Sequence of mappings for example:
        [
            {
                "label": "table row",
                "score": 0.55,
                "bbox": [...],
            },
            ...,
        ]
        thresholds: Mapping from labels to thresholds

    Returns:
        Filtered list of objects

    c                 D    g | ]}|d          |d                  k    |S )r   r   r   )ry   objr   s     r   r   z/apply_thresholds_on_objects.<locals>.<listcomp>   s0    RRRsWCL9Q)Q)Qs)Q)Q)Qr!   r   )r   r   s    `r   r   r      s$    . SRRRgRRRGNr!   c                     |                      d          \  }}}}|d|z  z
  |d|z  z
  |d|z  z   |d|z  z   g}t          j        |d          S )zbConvert rectangle format from center-x, center-y, width, height to
    x-min, y-min, x-max, y-max.r         ?r   )dim)unbindrB   stack)r#   x_cy_cwhbs         r   box_cxcywh_to_xyxyr      s^     XXb\\NCa
a-3q=C#'MS37]LA;qa    r!   c                 |    |\  }}t          |           }|t          j        ||||gt          j                  z  }|S )z;Rescale relative bounding box to box of size given by size.dtype)r   rB   tensorfloat32)out_bboxr   img_wimg_hr   s        r   r   r     sA    LE58$$A	EL%u5U]KKKKAHr!   c                     t          |                               t          |                    }t          |                                           }|dk    r|                                |z  S dS )zA
    Compute the intersection area over box area, for bbox1.
    r   )r   	intersectget_area)bbox1bbox2intersection
bbox1_areas       r   iobr     s_     ;;((e55Le%%''JA~~$$&&331r!   c                 6   d | D             }g }|D ]fd| D             }fd|D             }i }d |D             }d |D             }	d |D             }
d |D             }|D ]}d|d	<   d
 |D             }|D ]}d|d	<   ||z  }|	D ];}d|d<   |
D ]1}t          |d         |d                   t          j        k    rd|d<   2<t          j        |	||d                   }	t          j        |||d                   }t                      }|	D ]}|                    |d                    t                      }|D ]}|                    |d                    |j        |j	        |j
        |j        gd<   d         d<   t          j        |d                   }t          j        |	d                   }	|	|d<   ||d<   |
|d<   ||d<   t          |	          dk    r#t          |          dk    rt          ||          }|                    |           |S )aV  
    Process the bounding boxes produced by the table structure recognition model into
    a *consistent* set of table structures (rows, columns, spanning cells, headers).
    This entails resolving conflicts/overlaps, and ensuring the boxes meet certain alignment
    conditions (for example: rows should all have the same width, etc.).
    c                 *    g | ]}|d          dk    |S )r   rj   r   ry   r   s     r   r   z)objects_to_structures.<locals>.<listcomp>!  s&    @@@cG(?(?c(?(?(?r!   c                 h    g | ].}t          |d          d                    t          j        k    ,|/S r   r   r   TABLE_IOB_THRESHOLD)ry   r   rj   s     r   r   z)objects_to_structures.<locals>.<listcomp>%  sD     
 
 
3v;f..2B2VVV VVVr!   c                 h    g | ].}t          |d          d                    t          j        k    ,|/S r   r   )ry   tokenrj   s     r   r   z)objects_to_structures.<locals>.<listcomp>*  sD     
 
 
5=%-004D4XXX XXXr!   c                 *    g | ]}|d          dk    |S )r   rk   r   r   s     r   r   z)objects_to_structures.<locals>.<listcomp>2  s&    RRR33w<>3Q3Q33Q3Q3Qr!   c                 *    g | ]}|d          dk    |S )r   rl   r   r   s     r   r   z)objects_to_structures.<locals>.<listcomp>3  s&    LLLG0K0K0K0K0Kr!   c                 *    g | ]}|d          dk    |S )r   rm   r   r   s     r   r   z)objects_to_structures.<locals>.<listcomp>4  '    ```##g,J_:_:_#:_:_:_r!   c                 *    g | ]}|d          dk    |S )r   ro   r   r   s     r   r   z)objects_to_structures.<locals>.<listcomp>5  r   r!   Fprojected row headerc                 *    g | ]}|d          dk    |S )r   rn   r   r   s     r   r   z)objects_to_structures.<locals>.<listcomp>8  s.     !
 !
 !
CL<X,X,XC,X,X,Xr!   Tcolumn headerr   rl   rk   row_column_bboxrowscolumnscolumn headersspanning cellsr   r   )r   r   r   postprocessrefine_rowsrefine_columnsr   include_rectx_miny_minx_maxy_maxalign_columns
align_rowsrN   refine_table_structurer   )r   rH   r   tablestable_structurestable_objectstable_tokensrc   r   r   column_headersspanning_cellsr   projected_row_headers
header_objrow_rectcolumn_rectrj   s                    @r   r   r     s    A@W@@@F D+ D+
 
 
 

 
 


 
 
 

 
 
 	RR-RRRLL}LLL``````````! 	0 	0C*/C&''!
 !
(!
 !
 !
 ) 	/ 	/C*.C&''// 	0 	0C#(C , 0 0
s6{Jv$677;K;___+/C(0
 &t\;KK;XYY,^,
 
 66 	/ 	/C!!#f+....ff 	2 	2C$$S[1111NN	$
  /0f +GU;L5MNN%dE2C,DEE 	&&	)&4	"#&4	"#t99q==S\\A--.y:JKKI	****r!   c                    | d         }| d         }| d         }t          j        ||d                   }t          j        |          }t          ||          }d | d         D             }d | d         D             }t          j        ||d                   }t          j        ||d	                   }||z  }t          j        |||          }t          j        |          }t          j        |           || d<   || d<   || d<   || d<   | S )
zp
    Apply operations to the detected table structure objects such as
    thresholding, NMS, and alignment.
    r   r   r   rm   c                 "    g | ]}|d          
|S r   r   r   s     r   r   z*refine_table_structure.<locals>.<listcomp>  s3       $G]B^  r!   r   c                 "    g | ]}|d          
|S r   r   r   s     r   r   z*refine_table_structure.<locals>.<listcomp>  s3       dCY>Z  r!   ro   rn   )r   apply_thresholdnmsalign_headersalign_supercellsnms_supercellsheader_supercell_tree)table_structurer   r   r   r   r   r   s          r   r   r   m  sa   
 6"Di(G %%56N 0./ N !_^44N">488N ()9:  N ()9:   !0./ N (756  ++N !1.$PPN /??N%n555!(OI"OF(6O$%(6O$%r!   c                    g }|D ]}d|d<   g }| D ]}t          |          D ]\  }}|d         d         |d         d         z
  }t          |d         d         |d         d                   }t          |d         d         |d         d                   }	|	|z
  }
|
|z  dk    r|                    |           t	          |          dk    r|S t                      }|d         dk    r(t          t          |d         dz                       |z   }d}|D ]6}||dz   k    r+||         }d	|d<   |                    |d                   }|}6 d|	                                i}|                    |           |S )
z
    Adjust the header boundary to be the convex hull of the rows it intersects
    at least 50% of the height of.

    For now, we are not supporting tables with multiple headers, so we need to
    eliminate anything besides the top-most header.
    Fr   r   re   r   r   r   r   T)
	enumerater   minr   rN   r   r   ranger   get_bbox)headersr   aligned_headersrowheader_row_numsheaderrow_num
row_heightmin_row_overlapmax_row_overlapoverlap_heightheader_rectlast_row_nums                r   r   r     s    O % %$OO 0 0%dOO 	0 	0LGSVQ#f+a.8J!#f+a.&.2CDDO!#f+a.&.2CDDO,>N
*c11&&w///	0 ?q  &&KqAu_Q%7!%;<<==OL" 
 
lQ&&&w-C#'C %223v;??K"LL
 k**,,-F6"""r!   c                     	 t          |           t          |           z  }t          |           }||z   dz  }n# t          $ r d}Y nw xY w|S )zt
    Compute a confidence score based on how well the page tokens
    slot into the cells reported by the model
    rd   r   )sumrN   r  ZeroDivisionError)cell_match_scoresmean_match_scoremin_match_scoreconfidence_scores       r   compute_confidence_scorer    sr    
011C8I4J4JJ/00,>!C   s   69 AAc           	         | d         }| d         }| d         }t          |dd           }g }g }t          |          D ]X\  }}t          |          D ]A\  }	}
t          t          |d                             }t          t          |
d                             }|                    |          }d|
v o|
d         }|                                |g|	g|d	}d
|d<   |D ]z}t          t          |d                             }|                    |                                          |                                z  t          j        k    rd|d<   d
|d<    n{|d         r|	                    |           'd
|d<   |	                    |           CZ|D ]}t          t          |d                             }t                      }t                      }d}d}|D ]+}t          t          |d                             }|                                }|                    |                                          |z  t          j        k    r|d         d
u r|#t          t          |d                             }n(|                    t          |d                              |                    t          |d                             }|                    t          |d                             }|od|v o|d         }d|d<   -t          |          dk    rbt          |          dk    rO|                                t          |          t          |          ||d         d}|	                    |           t          j        |          \  }}}t!          |          }|}|}|D ]}t                      }|d         D ]0}|                    t          ||         d                              1t                      }|d         D ]0}	|                    t          ||	         d                              1|                    |          }|                                |d<   t          j        |dd
          \  }}}t#          ||          D ]1\  }}fd|D             }t          j        |d
          |d<   ||d<   2t          |          }t          j        |          }t          |          } t          j        |          }t+          t                    }!t+          t                    }"t+          t                    }#t+          t                    }$|D ]}t-          |d                   }%t/          |d                   }&t-          |d                   }'t/          |d                   }(|d         D ]})|#|'         	                    |)d         d                    |!|%         	                    |)d         d                    |$|(         	                    |)d         d                    |"|&         	                    |)d         d                    t          |          D ]\  }	}
t          |#d                   dk    rt-          |#d                   |
d         d<   t          |!|	                   dk    rt-          |!|	                   |
d         d<   t          |$| dz
                     dk    r!t/          |$| dz
                     |
d         d<   t          |"|	                   dk    rt/          |"|	                   |
d         d<   t          |          D ]\  }}t          |#|                   dk    rt-          |#|                   |d         d<   t          |!d                   dk    rt-          |!d                   |d         d<   t          |$|                   dk    rt/          |$|                   |d         d<   t          |"|dz
                     dk    r!t/          |"|dz
                     |d         d<   |D ]}d}d}|d         D ][}	|)t          t          ||	         d                             }-|                    t          ||	         d                              \|d         D ][}|)t          t          ||         d                             }-|                    t          ||         d                              \|                    |          }|                                dk    r|                                |d<   	 ||fS )a  
    Assuming the row, column, spanning cell, and header bounding boxes have
    been refined into a set of consistent table structures, process these
    table structures into table cells. This is a universal representation
    format for the table, which can later be exported to Pandas or CSV formats.
    Classify the cells as header/access cells or data cells
    based on if they intersect with the header bounding box.
    r   r   r   Tc                     | d         S )Nr   r   )cells    r   <lambda>z$structure_to_cells.<locals>.<lambda>  s    4PW= r!   )reversekeyr   r   )r   column_numsrow_numsr   Fsubcell	is_mergedr   Nr#  r"  r   )r   r"  r#  r   r   gMbP?)overlap_thresholdforced_assignmentc                      g | ]
}|         S r   r   )ry   numrH   s     r   r   z&structure_to_cells.<locals>.<listcomp>J  s    <<<cfSk<<<r!   )remove_integer_superscripts	cell textspansr   rd   re   )sortedr  r   r   r   r  r   r   r   r   setr   unionrN   r   slot_into_containersr  r   extract_text_from_spanssort_objects_top_to_bottomsort_objects_left_to_rightr   r  r   )*r  rH   r   r   r   rK   subcells
column_numcolumnr  r
  r   r   	cell_rectr  r  spanning_cellspanning_cell_rectcell_columns	cell_rowsr$  subcell_rectsubcell_rect_area_r  r  dilated_columnsdilated_rowsspan_nums_by_cellcell_span_nums
cell_spansnum_rowsnum_columnsmin_y_values_by_rowmax_y_values_by_rowmin_x_values_by_columnmax_x_values_by_columnmin_rowmax_row
min_column
max_columnspans*    `                                        r   r~   r~     s	    i(G6"D$%56NND>X>XYYYNEH'00 # #
F%dOO 	# 	#LGStF6N3344KDV--..H **;77I$+DO0DF!**,, *|$I!'	 D $DO!/  %)$}V/D*E*E%F%F"&00;;DDFFI[I[I]I]]$89 9 '+DO(-D%E9 I #%%%% 05+,T""""9	#< (    !$}V'<"="=>>uuEE		 	, 	,GWV_ 5 566L , 5 5 7 7&&'9::CCEEHYY 45 59@9MQV9V9V$ $T'&/%:%: ; ;II**4+@+@AAA%OOC
0C,D,DEE	+11#gm6L2M2MNN  [Ow$>[7?C['+$y>>A#l"3"3a"7"7!**,,#L11 OO!'(56L(M D LL)>ufMMAq
/0ABB OL , ,ff}- 	P 	PJ$$T/**Ef*M%N%NOOOO66J' 	G 	GG!!$|G'<V'D"E"EFFFF))(33	 ))++V)>	  q! !$E+< = = # #n<<<<^<<<
 (?(-
 
 
[ #W 4yyH1$77Dg,,K4W==G%d++%d++(..(.. 	A 	Ad:&''d:&''m,--
m,--
M 	A 	AD":.55d6l1oFFF(//VQ@@@":.55d6l1oFFF(//VQ@@@@		A
 "$ ? ?%a())A-- !7!:;;CKN"7+,,q00 !4W!=>>CKN%kAo677!;; !7a!HIICKN"7+,,q00 !4W!=>>CKN'00 G G
F%j122Q66 #$::$F G GF6N1"1%&&** #$7$: ; ;F6N1%j122Q66 #$::$F G GF6N1"8a<011A55 #$71$E F FF6N1  J' 	C 	CGT']6%: ; ;<<%%d4=+@&A&ABBBB}- 	L 	LJ""4
(;F(C#D#DEE((gj.A&.I)J)JKKKK&&{33	!##$--//DL"""r!   rK   c                    | sg S t          d | D                       }t          d | D                       }t          j        |dz   |dz   ft                    }| D ]}|d         D ]}|d         D ]	}d|||f<   
 d | D             }|                                 }t          j        |d	k              }	t          |	d
         |	d                   D ]%\  }}|g|gd||v d}
|                    |
           &|S )ag  fills the missing cells in the table by adding a cells with empty text
    where there are no cells detected by the model.

    A cell contains the following keys relevent to the html conversion:
    row_nums: List[int]
        the row numbers this cell belongs to; for cells spanning multiple rows there are more than
        one numbers
    column_nums: List[int]
        the columns numbers this cell belongs to; for cells spanning multiple columns there are more
        than one numbers
    cell text: str
        the text in this cell
    column header: bool
        whether this cell is a column header

    c                 (    h | ]}|d          D ]}|S )r#  r   ry   r  r
  s      r   	<setcomp>zfill_cells.<locals>.<setcomp>  s*    KKK$z:JKK3KKKKr!   c                 (    h | ]}|d          D ]}|S )r"  r   )ry   r  cols      r   rR  zfill_cells.<locals>.<setcomp>  s*    NNN$}:MNN3NNNNr!   r   r   r#  r"  Tc                 8    h | ]}|d          
|d         D ]}|S )r   r#  r   rQ  s      r   rR  zfill_cells.<locals>.<setcomp>  s5    ^^^4D,A^TR\M]^^c3^^^^r!   Fr   rI   )r#  r"  r+  r   )r   npzerosboolcopywherer   r   )rK   table_rows_notable_cols_nofilledr  r
  rT  header_rows	new_cellsnot_filled_idxnew_cells              r   
fill_cellsrb    s[   "  	KKKKKLLMNNNNNOOMX}q(-!*;<DIIIF ( (
# 	( 	(CM* ( (#'sCx  (	( _^5^^^K

IXfo..Nq)>!+<== # #S5 K/	
 
 	""""r!   c                    t          t          |           d           } t          j        d          }d}d}t	          d | D                       }|rt          j        |d          }t          j        |d          }| D ]}t          |d	                   }i }t          |d
                   }	|	dk    rt          |	          |d<   t          |d	                   }
|
dk    rt          |
          |d<   ||k    r(|}|d         r|}d}n|}d}t          j        |d          }t          j        |||          }|d         |_	        t          t          j
        |dd                    S )a  Convert table structure to html format.

    Args:
        cells: List of dictionaries representing table cells, where each dictionary has the
            following format:
            {
                "row_nums": List[int],
                "column_nums": List[int],
                "cell text": str,
                "column header": bool,
            }
    Returns:
        str: HTML table string
    c                 V    t          | d                   t          | d                   fS )Nr#  r"  )r  )rz   s    r   r  zcells_to_html.<locals>.<lambda>  s$    S:5G5GQ}M]I^I^4_ r!   )r!  rj   r   Nc              3   &   K   | ]}|d          V  dS )r   Nr   )ry   r  s     r   	<genexpr>z cells_to_html.<locals>.<genexpr>  s'      CCT40CCCCCCr!   theadtbodyr#  r"  r   colspanrowspanr   thtdtr)attribr+  unicodeF)rD   short_empty_elements)r-  rb  ETElementany
SubElementr  rN   rW   texttostring)rK   rj   current_rowtable_headertable_has_header
table_bodyr  this_rowrn  ri  rj  table_subelementcell_tagr
  tcells                  r   rO   rO     s    :e$$*_*_```EJwEKLCCUCCCCC 5}UG44ug..J ' 'tJ'((d=)**Q;; #GF9d:&''Q;; #GF9k!!"KO$  #/ #- - 0$77Cc8F;;;+&

r{595QQQRRRr!   imagezoomc                 x   |dk    rd}t          j        t          j        t          j        |           t           j                  d||t           j                  }t          j        dt          j                  }t          j	        ||d          }t          j
        ||d          }t          j        |          S )zscale an image based on the zoom factor using cv2; the scaled image is post processed by
    dilation then erosion to improve edge sharpness for OCR tasksr   r   N)fxfyinterpolation)r   r   )
iterations)cv2resizecvtColorrV  arrayCOLOR_RGB2BGRINTER_CUBIConesuint8dilateeroderV   	fromarray)r  r  	new_imagekernels       r   
zoom_imager    s     qyy
RXe__c&788o  I WVRX&&F
9f;;;I	)V:::Ii(((r!   )Lxml.etree.ElementTreeetreeElementTreerq  collectionsr   pathlibr   typingr   r   r   r   r	   r
   r   r   r  r   rV  rB   PILr   rV   transformersr   r   r   @transformers.models.table_transformer.modeling_table_transformerr   unstructured_inference.configr   .unstructured_inference.inference.layoutelementr   unstructured_inference.loggerr   /unstructured_inference.models.table_postprocessr   /unstructured_inference.models.unstructuredmodelr   unstructured_inference.utilsr   rI   r   r   r   r]   __annotations__r`   rW   rs   TT_TABLE_CONFTABLE_COLUMN_CONFTABLE_ROW_CONFTABLE_COLUMN_HEADER_CONFTABLE_PROJECTED_ROW_HEADER_CONFTABLE_SPANNING_CELL_CONFr   r[   r   rM   rZ   r   r   r   r   r   r   r   r   r   r  r~   rb  rO   r  r   r!   r   <module>r     s   # " " " " " " " " " # # # # # #       M M M M M M M M M M M M M M M M M M M M 



      ! ! ! ! ! ! X X X X X X X X X X      ; : : : : : S S S S S S 0 0 0 0 0 0 @ @ @ @ @ @ M M M M M M H H H H H H . . . . . .g g g g g(9 g g gT 3T2S2U2U/ U U U  S    $ +$6!0+D"2"R+D	 	 Xt X(. X$ X X X X2CHo CH%   Bgc3h'(U
# gc3h    8! ! !  
 
 
Q Q Qh. . .b/ / /d  l# l# l#^'d4j 'T$Z ' ' ' 'T/Sd /S /S /S /S /Sd)hn )E )hn ) ) ) ) ) )r!   