
    Ng                    j   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d dlZd dlZd dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,  ej-        d          Z. ej/                    Z0de0_1         ej2        d          Z3e04                    e3           dd e.j5        D             vre.6                    e0           e.7                    ej8                   g dZ9ddddddZ:ddgZ;e G d de	                      Z<e G d de<                      Z=e G d de<                      Z>e G d  d!e<                      Z?	 	 d>d?d-Z@	 	 	 	 d@dAd7ZAe G d8 d9e<e	                      ZB G d: d;eB          ZC G d< d=eB          ZDdS )B    )annotationsN)ABCabstractmethod)	dataclassPath)ListOptionalUnion)tqdm)$calculate_element_type_percent_matchget_element_type_frequency)ObjectDetectionEvalProcessor)TableEvalProcessor)calculate_accuracycalculate_percent_missing_text)
_count_display_format_grouping_output_mean_prepare_output_cct_pstdev_read_text_file_rename_aggregated_columns_stdev_write_to_filezunstructured.evaleval_log_handlerz:%(asctime)s %(processName)-10s %(levelname)-8s %(message)sc                    g | ]	}|j         
S  )name).0hs     Y/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/metrics/evaluate.py
<listcomp>r$   0   s    >>>af>>>    )metricaverage	sample_sdpopulation_sdcountr&   r'   r(   r)   r*   )indexr   r   r   r   jsontxtc                     e Zd ZU dZded<   ded<   d Zeed                         Zeed                         Z	ed&d            Z
	 	 d'd(dZ	 	 	 	 d)d*dZed             Zed+d            Zd,d Zd-d$Zed-d%            ZdS ).BaseMetricsCalculatorzFoundation class for specialized metrics calculators.

    It provides a common interface for calculating metrics based on outputs and ground truths.
    Those can be provided as either directories or lists of files.
    z
str | Pathdocuments_dirground_truths_dirc                `    t           j                                                   _        t           j                                                   _         fd j                            d          D              _         fd j                            d          D              _        dS )z/Discover all files in the provided directories.c                l    g | ]0}|                                 |                    j                  1S r   is_filerelative_tor0   r!   pathselfs     r#   r$   z7BaseMetricsCalculator.__post_init__.<locals>.<listcomp>Q   H      
  
  
||~~ 
T/00 
  
  
r%   *c                l    g | ]0}|                                 |                    j                  1S r   )r5   r6   r1   r7   s     r#   r$   z7BaseMetricsCalculator.__post_init__.<locals>.<listcomp>V   sH     $
 $
 $
||~~$
T344$
 $
 $
r%   N)r   r0   resolver1   glob_document_paths_ground_truth_pathsr9   s   `r#   __post_init__z#BaseMetricsCalculator.__post_init__K   s    !$"455==??!%d&<!=!=!E!E!G!G 
  
  
  
*//44 
  
  

$
 $
 $
 $
.33C88$
 $
 $
   r%   c                    dS )z3Default name for the per-document metrics TSV file.Nr   rA   s    r#   default_tsv_namez&BaseMetricsCalculator.default_tsv_name\         r%   c                    dS )z1Default name for the aggregated metrics TSV file.Nr   rA   s    r#   default_agg_tsv_namez*BaseMetricsCalculator.default_agg_tsv_namea   rE   r%   rowslistreturn!tuple[pd.DataFrame, pd.DataFrame]c                    dS )zGenerates pandas DataFrames from the list of rows.

        The first DF (index 0) is a dataframe containing metrics per file.
        The second DF (index 1) is a dataframe containing the aggregated
            metrics.
        Nr   )r9   rH   s     r#   _generate_dataframesz*BaseMetricsCalculator._generate_dataframesf   rE   r%   Ndocument_pathsOptional[list[str | Path]]ground_truth_pathsc                R    |rd |D             | _         |rd |D             | _        | S )z/Overrides the default list of files to process.c                ,    g | ]}t          |          S r   r   r!   ps     r#   r$   z2BaseMetricsCalculator.on_files.<locals>.<listcomp>v   s    #D#D#DDGG#D#D#Dr%   c                ,    g | ]}t          |          S r   r   rS   s     r#   r$   z2BaseMetricsCalculator.on_files.<locals>.<listcomp>y   s    'L'L'LAQ'L'L'Lr%   )r?   r@   )r9   rN   rP   s      r#   on_fileszBaseMetricsCalculator.on_fileso   sH      	E#D#D^#D#D#DD  	M'L'L9K'L'L'LD$r%   Texecutor%Optional[concurrent.futures.Executor]
export_dirOptional[str | Path]visualize_progressbooldisplay_agg_dfpd.DataFramec                   ||                                  }|                     ||          }|                     |          \  }}|,t          || j        |           t          || j        |           |du rt          |           |S )a  Calculates metrics for each document using the provided executor.

        * Optionally, the results can be exported and displayed.
        * It loops through the list of structured output from all of `documents_dir` or
        selected files from `document_paths`, and compares them with gold-standard
        of the same file name under `ground_truths_dir` or selected files from `ground_truth_paths`.

        Args:
            executor: concurrent.futures.Executor instance
            export_dir: directory to export the results
            visualize_progress: whether to display progress bar
            display_agg_df: whether to display the aggregated results

        Returns:
            Metrics for each document as a pandas DataFrame
        NT)_default_executor_process_all_documentsrM   r   rD   rG   r   )r9   rW   rY   r[   r]   rH   dfagg_dfs           r#   	calculatezBaseMetricsCalculator.calculate}   s    . --//H**85GHH..t44
F!:t'<bAAA:t'@&IIIT!!V	r%   c                    t          t          j                            dt          j                                        }t
                              d| d            |                                 |          S )NMAX_PROCESSESzConfiguring a pool of z$ processors for parallel processing.)max_workers)intosenvironget	cpu_countloggerinfo_get_executor_class)clsmax_processorss     r#   r`   z'BaseMetricsCalculator._default_executor   sb    RZ^^OR\^^LLMMa^aaabbb(s&&((^DDDDr%   Ztype[concurrent.futures.ThreadPoolExecutor] | type[concurrent.futures.ProcessPoolExecutor]c                $    t           j        j        S N)
concurrentfuturesProcessPoolExecutor)rp   s    r#   ro   z)BaseMetricsCalculator._get_executor_class   s     !55r%   concurrent.futures.Executorc                    |5  d t          |                    | j        | j                  t	          | j                  d|           D             cddd           S # 1 swxY w Y   dS )zTriggers processing of all documents using the provided executor.

        Failures are omitted from the returned result.
        c                    g | ]}||S rt   r   )r!   rows     r#   r$   z@BaseMetricsCalculator._process_all_documents.<locals>.<listcomp>   s)     	 	 	 ?  #??r%   F)totalleavedisableN)r   map_try_process_documentr?   len)r9   rW   r[   s      r#   ra   z,BaseMetricsCalculator._process_all_documents   s      
	 
		 	LL!;T=QRRd233 22	  	 	 	
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	s   AAA"%A"docr   Optional[list]c                    t                               d|            	 |                     |          S # t          $ r+}t                               d| d|            Y d}~dS d}~ww xY w)z3Safe wrapper around the document processing method.zProcessing Failed to process document : N)rm   rn   _process_document	Exceptionerror)r9   r   es      r#   r   z+BaseMetricsCalculator._try_process_document   s    '#''(((	))#... 	 	 	LLAsAAaAABBB44444	s   4 
A) A$$A)c                    dS )z=Should return all metadata and metrics for a single document.Nr   )r9   r   s     r#   r   z'BaseMetricsCalculator._process_document   rE   r%   )rH   rI   rJ   rK   NN)rN   rO   rP   rO   rJ   r/   NNTT
rW   rX   rY   rZ   r[   r\   r]   r\   rJ   r^   )rJ   rr   )rW   rx   r[   r\   rJ   rI   r   r   rJ   r   )__name__
__module____qualname____doc____annotations__rB   propertyr   rD   rG   rM   rV   rd   classmethodr`   ro   ra   r   r   r   r%   r#   r/   r/   @   s          !!!!
 
 
" B B ^ XB @ @ ^ X@    ^ 6:9=      ;?+/#'#" " " " "H E E [E
 6 6 6 [6
   &    L L L ^L L Lr%   r/   c                       e Zd ZU dZdZded<   dZded<   dZded<    fd	Ze	d
             Z
e	d             Ze	d             ZddZd Z xZS )TableStructureMetricsCalculatoraH  Calculates the following metrics for tables:
        - tables found accuracy
        - table-level accuracy
        - element in column index accuracy
        - element in row index accuracy
        - element's column content accuracy
        - element's row content accuracy
    It also calculates the aggregated accuracy.
    NzOptional[float]cutoffTr\   weighted_averageinclude_false_positivesc                H    t                                                       d S rt   )superrB   r9   	__class__s    r#   rB   z-TableStructureMetricsCalculator.__post_init__   s    r%   c                
    g dS )N)
total_tablestable_level_acctable_detection_recalltable_detection_precisiontable_detection_f1composite_structure_accelement_col_level_index_accelement_row_level_index_accelement_col_level_content_accelement_row_level_content_accr   rA   s    r#   supported_metric_namesz6TableStructureMetricsCalculator.supported_metric_names   s    
 
 
 	
r%   c                    dS )Nz%all-docs-table-structure-accuracy.tsvr   rA   s    r#   rD   z0TableStructureMetricsCalculator.default_tsv_name       66r%   c                    dS )Nz&aggregate-table-structure-accuracy.tsvr   rA   s    r#   rG   z4TableStructureMetricsCalculator.default_agg_tsv_name       77r%   r   r   rJ   r   c                n  
 t          |          }|j        }t          |          j        dd          }|dz   }t          |j                  dk    r|j        d         nd }|| j        v rd S | j        |z  }|                                s t          	                    d| d           d S | j
        |z  }|                                s t          	                    d| d           d S t          j        ||| j        d          }	|	                                
|||
j        g
fd	| j        D             z   S )
N   .jsonPrediction file  does not exist, skippingGround truth file html)prediction_fileground_truth_filer   source_typec                0    g | ]}t          |          S r   getattr)r!   r&   report_from_htmls     r#   r$   zETableStructureMetricsCalculator._process_document.<locals>.<listcomp>  s$    YYY6W%v..YYYr%   )r   stemsuffixr   partsr@   r0   existsrm   warningr1   r   from_json_filesr   process_filetotal_predicted_tablesr   )r9   r   doc_pathout_filenamedoctypesrc_gt_filename	connectorr   r   processor_from_text_as_htmlr   s             @r#   r   z1TableStructureMetricsCalculator._process_document   sq   99}|$$+ABB/&0*-hn*=*=*A*AHN2&&t	d6664,s2%%'' 	NNXoXXXYYY4 2_D '')) 	NN\0A\\\]]]4&8&H+/;	'
 '
 '
# 7CCEE3	

 ZYYYT=XYYYZ 	Zr%   c                "   g d| j         z   }t          j        ||          }|d         |d<   | j        r>d|d         |j                            d          |j                            d          z  <   ||d         dk             }| j        s|d         	                    d           |d<   |j
        r7t          j        d | j         D                                                       }ni }| j         D ]F}|||                                                  }||                             t          t          t           g                                          }	|                    d          r||                                         |	d	<   n|                    d
          r9t)          j        t)          j        ||         |d                   d          |	d	<   n8t)          j        t)          j        ||         |d                   d          |	d	<   |	j
        rt          j        g dg d          ||<   A|	||<   Ht          j        |                                                                          }|                    t2                    }||fS )N)filenamer   r   r   columnsr   _table_weightsr   r   c                    | dk    rdndS )Nr   r   r   )table_weights    r#   <lambda>zFTableStructureMetricsCalculator._generate_dataframes.<locals>.<lambda>/  s    ,!*;*;QQ r%   c                    g | ]	}|d d d dg
S )Nr   r   )r!   r&   s     r#   r$   zHTableStructureMetricsCalculator._generate_dataframes.<locals>.<listcomp>4  s#    YYY6&$dA.YYYr%   r   r   )weights   NNNr   r   r   r   r   datar+   )r   pd	DataFramer   r   eqr   gtr   applyemptyreset_indexnotnullaggr   r   r   	transpose
startswithmeannproundr'   SeriesrenameAGG_HEADERS_MAPPING)
r9   rH   headersrb   has_tables_dfrc   element_metrics_resultsr&   	metric_df
agg_metrics
             r#   rM   z4TableStructureMetricsCalculator._generate_dataframes  s   
 
 

 '( \$000!.1' 	^\]B !3!3A!6!69R9U9UVW9X9X!XY 2./!34$ 	#%&6#7#=#=BB$ $B   	U\YYT=XYYY kmm F ')#5 A A)-*?*G*G*I*IJ	&v.22FGV3LMMWWYY
$$^44 *3F*;*@*@*B*BJw''&&'899 *,(
9V#4iHX>YZZZ+ +Jw'' +-(
9V#4i>WXXX+ +Jw' # A68i222:b:b:b7 7 7+F33 7A+F33\"9::DDFFRRTTF':;;6zr%   r   )r   r   r   r   r   r   r   r   rB   r   r   rD   rG   r   rM   __classcell__r   s   @r#   r   r      s           #F""""!!!!!$(((((          
 
 X
 7 7 X7 8 8 X8 Z  Z  Z  ZD7 7 7 7 7 7 7r%   r   c                       e Zd ZU dZdZded<   dZded<   dZd	ed
<    fdZe	d!d            Z
e	d!d            Z	 	 	 	 d"d# fdZd Zd$dZd%dZd  Z xZS )&TextExtractionMetricsCalculatorzCalculates text accuracy and percent missing between document and ground truth texts.

    It also calculates the aggregated accuracy and percent missing.
    NOptional[str]group_by)r   r   r   ztuple[int, int, int]r   r,   strdocument_typec                p    t                                                       |                                  d S rt   )r   rB   _validate_inputsr   s    r#   rB   z-TextExtractionMetricsCalculator.__post_init___  s1    r%   rJ   c                    dS )Nzall-docs-cct.tsvr   rA   s    r#   rD   z0TextExtractionMetricsCalculator.default_tsv_namec  s    !!r%   c                    dS )Nzaggregate-scores-cct.tsvr   rA   s    r#   rG   z4TextExtractionMetricsCalculator.default_agg_tsv_nameg  s    ))r%   TrW   rX   rY   rZ   r[   r\   r]   r^   c                    t                                          ||||          }|| j        rt          | j        ||d           |S )0See the parent class for the method's docstring.rW   rY   r[   r]   Ntext_extractionr   rd   r   get_mean_groupingr9   rW   rY   r[   r]   rb   r   s         r#   rd   z)TextExtractionMetricsCalculator.calculatek  sZ     WW!1)	  
 
 !dm!dmR=NOOO	r%   c                *     j         s.t                              d           t          j        d            j        t          vrt          d j         d           j         D ]r}	 |j        d          n.# t          $ r! t          
                    d| d           Y ;w xY w|j        d         d j         k    rt                              d	           st           fd
 j         D                       st                              d	           d S d S )Nz;No output files to calculate to edit distances for, exitingr   ztSpecified file type under `documents_dir` or `output_list` should be one of `json` or `txt`. The given file type is z
, exiting.File z! does not have a suffix, skipping.zpThe directory contains file type inconsistent with the given input. Please note that some files will be skipped.c              3  J   K   | ]}|j         d          dj         k    V  dS )r  r  N)suffixesr   r7   s     r#   	<genexpr>zCTextExtractionMetricsCalculator._validate_inputs.<locals>.<genexpr>  s;      bbT4=$(@D,>(@(@@bbbbbbr%   )r?   rm   rn   sysexitr   OUTPUT_TYPE_OPTIONS
ValueErrorr
  
IndexErrorr   r   all)r9   r8   s   ` r#   r   z0TextExtractionMetricsCalculator._validate_inputs~  sj   # 	KKUVVVHQKKK%888Z;?;MZ Z Z   ( 
	 
	Db!!!   LTLLLMMM }R $<(:$<$<<<C   bbbbTMabbbbb 	NN?    	 	s   'A55(B B r   r   r   c                   |j         }|j        d         }t          |j                  dk    r|j        d         nd }|                     |          \  }}dt          |                                          t          |                                          z  cxk     rdk     r(n n%t          t          ||| j                  d          }nd}t          t          ||          d          }|||||gS )Nr   r   g      ?g       @r   g{Gz?)
r   r
  r   r   	_get_cctsencoder   r   r   r   )	r9   r   r   r   r   
output_cct
source_cctaccuracypercent_missings	            r#   r   z1TextExtractionMetricsCalculator._process_document  s    8,q/$'	NNQ$6$6CIaLLD	!%!4!4
J Z&&(())C
0A0A0C0C,D,DDJJJJsJJJJJ/
JUUWXYYHH H >z: V VXYZZ'9hHHr%   tuple[str, str]c                    t          | j        |z  | j                  }t          | j        |                    d          z            }||fS )N)docpathoutput_type.txt)r   r0   r   r   r1   with_suffix)r9   r   r  r  s       r#   r  z)TextExtractionMetricsCalculator._get_ccts  sT    (&,$:L
 
 

 %T%;coof>U>U%UVV
:%%r%   c                8   g d}t          j        ||          }|dg                             t          t          t
          t          g                                          }|dg                             t          t          t
          t          g                                          }|j        d         dk    r,|j        d         dk    rt          j        t                    }n4t          j
        ||f                                          }t          |_        ||fS )N)r   r   r   cct-accuracycct-%missingr   r   r!  r   r   )r   r   r   r   r   r   r   r   shapeAGG_HEADERSconcatr   r   )r9   rH   r   rb   accmissrc   s          r#   rM   z4TextExtractionMetricsCalculator._generate_dataframes  s    VVV\$000.!"&&vw'GHHRRTT>"#''(HIISSUU9Q<1A!!3!3\+666FFYT{++7799F(FN6zr%   rJ   r   r   r   r   )r   r   rJ   r  )r   r   r   r   r   r   r   r   rB   r   rD   rG   rd   r   r   r  rM   r   r   s   @r#   r   r   T  s4         
 #H""""$-G----M          " " " X" * * * X*
 ;?+/#'#      &  4I I I I & & & &      r%   r   c                  |     e Zd ZU dZdZded<   	 	 	 	 dd fdZedd            Zedd            Z	ddZ
d Z xZS )ElementTypeMetricsCalculatorz
    Calculates element type frequency accuracy, percent missing and
    aggregated accuracy between document and ground truth.
    Nr   r   TFrW   rX   rY   rZ   r[   r\   r]   rJ   r^   c                    t                                          ||||          }|| j        rt          | j        ||d           |S )r   r   Nelement_typer  r  s         r#   rd   z&ElementTypeMetricsCalculator.calculate  sY     WW!1)	  
 
 !dm!dmR^LLL	r%   r   c                    dS )Nz#all-docs-element-type-frequency.tsvr   rA   s    r#   rD   z-ElementTypeMetricsCalculator.default_tsv_name  s    44r%   c                    dS )Nz!aggregate-scores-element-type.tsvr   rA   s    r#   rG   z1ElementTypeMetricsCalculator.default_agg_tsv_name  s    22r%   r   r   r   c                v   |j         }|j        d         }t          |j                  dk    r|j        d         nd }t	          t          | j        |z                      }t	          t          | j        |                    d          z                      }t          t          ||          d          }||||gS )Nr   r   r   r   )r   r
  r   r   r   r   r0   r1   r  r   r   )r9   r   r   r   r   outputsourcer  s           r#   r   z.ElementTypeMetricsCalculator._process_document  s    8,q/$'	NNQ$6$6CIaLLD	+OD<NQT<T,U,UVV+D2S__W5M5MMNN
 
 =ffMMqQQ'9h77r%   c                `   g d}t          j        ||          }|j        r)t          j        g d                                          }nU|                    dt
          t          t          t          gi                                          }|	                                }t          |_        ||fS )N)r   r   r   element-type-accuracyr   )r2  NNNr   r2  )r   r   r   r   r   r   r   r   r   r   r#  r   )r9   rH   r   rb   rc   s        r#   rM   z1ElementTypeMetricsCalculator._generate_dataframes  s    OOO\$0008 	*\"P"P"PQQ[[]]FFVV4ufgv6VWXXbbddF''))F$6zr%   )NNTFr   r'  r   )r   r   r   r   r   r   rd   r   rD   rG   r   rM   r   r   s   @r#   r)  r)    s          
 #H"""" ;?+/#'$      & 5 5 5 X5 3 3 3 X3
8 
8 
8 
8      r%   r)  r   r   
data_inputUnion[pd.DataFrame, str]rY   	eval_nameagg_namer   export_filenamerJ   Nonec                   | dvr| dk    rt          d          |dk    rddg}d}n,|dk    rd	g}d
}n |dk    rddg}d}nt          d| d          t          |t                    rt          j                            |          st          d| d          |                    d          rt          j	        |d          }nj|                    d          rt          j	        |d          }n>|                    d          rt          j	        |dd          }nt          d          |}|j
        rt          d          | dk    rH| |j        vs,||                                                                          rt          d|  d          g }| rm| dk    rg|D ]d}	|                    t!          |                    |                               |	t&          t(          t*          t,          gi                               e| dk    rld|d <   |D ]d}	|                    t!          |                    d                               |	t&          t(          t*          t,          gi                               et/          | }d |j                            d          v r|                    d d!d"          }|r-|                    d          s|dz   }t5          |||           dS t5          |d#|  d$| d|           dS )%a  Aggregates accuracy and missing metrics by column name 'doctype' or 'connector',
    or 'all' for all rows. Export to TSV.
    If `all`, passing export_name is recommended.

    Args:
        group_by (str): Grouping category ('doctype' or 'connector' or 'all').
        data_input (Union[pd.DataFrame, str]): DataFrame or path to a CSV/TSV file.
        export_dir (str): Directory for the exported TSV file.
        eval_name (str): Evaluated metric ('text_extraction' or 'element_type').
        agg_name (str, optional): String to use with export filename. Default is `cct` for
            group_by `text_extraction` and `element-type` for `element_type`
        export_name (str, optional): Export filename.
    )r   r   r  z<Invalid grouping category. Returning a non-group evaluation.r  r   r!  cctr+  r2  zelement-typeobject_detectionf1_scorem_apzobject-detectionzUnknown metric for eval zE. Expected `text_extraction` or `element_type` or `table_extraction`.r   not found..csvNheader.tsv	sepr  rE  rA  #Please provide a .csv or .tsv file.zData is empty. Exiting.zData cannot be aggregated by `z6`. Check if it's empty or the column is missing/empty.r   grouping_keyr   )axislevelzall-z-agg-)r  
isinstancer   ri   r8   r   FileNotFoundErrorendswithr   read_csvr   
SystemExitr   isnullr  appendr   groupbyr   r   r   r   r   r   get_level_valuesdropr   )
r   r3  rY   r5  r6  r7  
agg_fieldsrb   
grouped_dffields
             r#   r  r    s   * ///H4E4EWXXX%%%$n5
	n	$	$-.
!	(	(	( &)
%Sy S S S
 
 	

 *c"" w~~j)) 	E#$CJ$C$C$CDDDv&& 	DZ555BB  (( 	DZT222BB  (( 	DZT$???BBBCCC	x 
2333	U		
 : :bl>Q>Q>S>S>W>W>Y>Y :DX D D D
 
 	

 J H%% 	 	E*JJx((,,eeVWf5U-VWW    
 5> 	 	E*JJ~..22EE67TZ;[3\]]    
 )*5J+<<Q????__^!1_EE
 U''// 	7-6Oz?J?????z#G(#G#G#G#G#GTTTTTr%   r   metricsfileUnion[str, pd.DataFrame]filter_listUnion[str, List[str]]	filter_byreturn_typeOptional[pd.DataFrame]c                n   t          | t                    rt          j                            |           st          d|  d          |                     d          rt          j        | d          }nj|                     d          rt          j        | d          }n>|                     d	          rt          j        | dd
          }nt          d          | }t          |t                    rt          j                            |          st          d| d          |                    d          rt          j        |d          }nh|                    d          rt          j        |d          }n<|                    d	          rt          j        |dd
          }nt          d          |j
        dddf                             t                    j                                        }n$t          |t                    st          d          ||j        vrt          d          |||                             |                   }|j        rt%          d          |dk    r|S |dk    r|rt'          |||           dS |dk    r|st          d          t          d          )a2  Reads the data_input file and filter only selected row available in filter_list.

    Args:
        data_input (str, dataframe): the source data, path to file or dataframe
        filter_list (str, list): the filter, path to file or list of string
        filter_by (str): data_input's column to filter the filter_list to
        export_filename (str, optional): export filename. required when return_type is "file"
        export_dir (str, optional): export directory. default to <current directory>/metrics
        return_type (str): "file" or "dataframe"
    r  r>  r?  Nr@  rB  rC  rD  r  rF  rG  r   z1Please provide a List of strings or path to file.z5`filter_by` key does not exists in the data provided.zANo common file names between data_input and filter_list. Exiting.	dataframerY  z!Please provide `export_filename`.z1Return type must be either `dataframe` or `file`.)rK  r   ri   r8   r   rL  rM  r   rN  r  ilocastypevaluestolistrI   r   isinr   rO  r   )	r3  r[  r]  r7  rY   r^  rb   	filter_dfress	            r#   filter_metricsri  V  s   $ *c"" w~~j)) 	E#$CJ$C$C$CDDDv&& 	DZ555BB  (( 	DZT222BB  (( 	DZT$???BBBCCC+s## Nw~~k** 	F#$DK$D$D$DEEE'' 	DK===II!!&)) 	DKT:::II!!&)) 	DKT$GGGIIBCCCnQQQT*11#66=DDFFT** NLMMM
""PQQQ
R	],,
-C
y ^\]]]k!!
			?	z?C88888				<===LMMMr%   c                  |     e Zd ZdZ fdZed             Zed             Zed             Zdd
Z	ddZ
ddZ xZS )$ObjectDetectionMetricsCalculatorBasez
    Calculates object detection metrics for each document:
    - f1 score
    - precision
    - recall
    - average precision (mAP)
    It also calculates aggregated metrics.
    c                     t                                                        fd j                            d          D              _        d S )Nc                l    g | ]0}|                                 |                    j                  1S r   r4   r7   s     r#   r$   zFObjectDetectionMetricsCalculatorBase.__post_init__.<locals>.<listcomp>  r:   r%   z,analysis/*/layout_dump/object_detection.json)r   rB   r0   rglobr?   r   s   `r#   rB   z2ObjectDetectionMetricsCalculatorBase.__post_init__  s]     
  
  
  
*001_`` 
  
  
r%   c                
    g dS Nr<  	precisionrecallr=  r   rA   s    r#   r   z;ObjectDetectionMetricsCalculatorBase.supported_metric_names      ::::r%   c                    dS Nz%all-docs-object-detection-metrics.tsvr   rA   s    r#   rD   z5ObjectDetectionMetricsCalculatorBase.default_tsv_name  r   r%   c                    dS Nz&aggregate-object-detection-metrics.tsvr   rA   s    r#   rG   z9ObjectDetectionMetricsCalculatorBase.default_agg_tsv_name  r   r%   	file_stemr   rJ   Optional[Path]c                \    | j         D ]#}t          |j                  j        |k    r|c S $dS )az  Find the file corresponding to OD model dump file among the set of ground truth files

        The files in ground truth paths keep the original extension and have .json suffix added,
        e.g.:
        some_document.pdf.json
        poster.jpg.json

        To compare to `file_stem` we need to take the prefix part of the file, thus double-stem
        is applied.
        N)r@   r   r   )r9   ry  r8   s      r#   _find_file_in_ground_truthz?ObjectDetectionMetricsCalculatorBase._find_file_in_ground_truth  s?     , 	 	DDI#y00 1tr%   r   r   tuple(str, Path, Path)c                $   t          |          }|j        d         }|                     |          }|| j        vrt	          d| d          t          |j                  j        dd         }| j        |z  }|                                s1t          
                    d| d           t	          d| d          | j        |z  }|                                s1t          
                    d| d           t	          d| d          |||fS )	a  Resolves ground doctype, prediction file path and ground truth path.

        As OD dump directory structure differes from other simple outputs, it needs
        a specific processing to match the output OD dump file with corresponding
        OD GT file.

        The outputs are placed in a dicrectory structure:

        analysis
        |- document_name
            |- layout_dump
                |- object_detection.json
            |- bboxes # not used in this evaluation

        and the GT file is pleced in od_gt directory for given dataset

        dataset_name
        |- od_gt
            |- document_name.pdf.json

        Args:
            doc (Path): path to the OD dump file

        Returns:
            tuple: doctype, prediction file path, ground truth path
        r   z not found in list of GT filesr   Nr   r   z does not exist)r   r   r|  r@   r  r   r   r0   r   rm   r   r1   )r9   r   od_dump_pathry  r   r   r   r   s           r#   
_get_pathsz/ObjectDetectionMetricsCalculatorBase._get_paths  s8   6 Cyy &r*	99)DD$":::a/aaabbb+,,3ABB7,s2%%'' 	RNNXoXXXYYYPPPPQQQ 2_D '')) 	VNN\0A\\\]]]T2CTTTUUU):::r%   rK   c                P   g d| j         z   }t          j        ||          }|j        rt          j        t                    }ni }| j         D ]}|||                                                  }||                             t          t          t          t          g                                          }|j        rt          j        g dg d          ||<   |||<   t          j        |                                                                          }t          |_        ||fS )N)r   r   r   r   r   r   r   )r   r   r   r   r#  r   r   r   r   r   r   r   r   r   r   )	r9   rH   r   rb   rc   r   r&   r   r   s	            r#   rM   z9ObjectDetectionMetricsCalculatorBase._generate_dataframes  s0   6669TT\$0008 	U\+666FF&(#5 A Ar&z11334	&v.22E67F3STT^^``
# A68i222:b:b:b7 7 7+F33 7A+F33\"9::DDFFRRTTF$6zr%   )ry  r   rJ   rz  )r   r   rJ   r}  )rJ   rK   )r   r   r   r   rB   r   r   rD   rG   r|  r  rM   r   r   s   @r#   rk  rk    s         
 
 
 
 
 ; ; X; 7 7 X7 8 8 X8    /; /; /; /;b       r%   rk  c                  n     e Zd Z fdZed             Zed             Zed             Zdd	Zd
 Z	 xZ
S )(ObjectDetectionPerClassMetricsCalculatorc                ~    t                                                       d | _        |                                  d S rt   )r   rB   per_class_metric_names_set_supported_metricsr   s    r#   rB   z6ObjectDetectionPerClassMetricsCalculator.__post_init__  s9    8<###%%%%%r%   c                <    | j         r| j         S t          d          )Nz:per_class_metrics not initialized - cannot get class names)r  r  rA   s    r#   r   z?ObjectDetectionPerClassMetricsCalculator.supported_metric_names  s&    & 	[..YZZZr%   c                    dS )Nz/all-docs-object-detection-metrics-per-class.tsvr   rA   s    r#   rD   z9ObjectDetectionPerClassMetricsCalculator.default_tsv_name  s    @@r%   c                    dS )Nz0aggregate-object-detection-metrics-per-class.tsvr   rA   s    r#   rG   z=ObjectDetectionPerClassMetricsCalculator.default_agg_tsv_name   s    AAr%   r   r   rJ   r   c                   	 |                      |          \  }}}n8# t          $ r+}t                              d| d|            Y d}~dS d}~ww xY wt	          j        ||          }|                                \  }}|j        |dg}	| j        D ]x}
d	                    |

                    d          dd                   }|

                    d          d         }t          ||          }|	                    ||                    y|	S )zCalculate both class-aggregated and per-class metrics for a single document.

        Args:
            doc (Path): path to the OD dump file

        Returns:
            tuple: a tuple of aggregated and per-class metrics for a single document
        r   r   Nprediction_file_pathground_truth_file_path_r  )r  r  rm   r   r   r   get_metricsr   r   joinsplitr   rQ  )r9   r   r   r   r   r   	processorr  per_class_metricsper_class_metrics_rowcombined_metric_namer&   
class_nameclass_metricss                 r#   r   z:ObjectDetectionPerClassMetricsCalculator._process_document$  sJ   	:>//#:N:N7G_&7&7 	 	 	LLAsAAaAABBB44444	 1@!0#4
 
 
	  )4466 "!
 %)$? 	D 	D XX288==crcBCCF-33C88<J#$5v>>M!((z)BCCCC$$s    
A AAc                v   g d}t                      }| j        D ]d}| j        |z  }t          |          5 }t	          j        |          }|d         }|                    |           ddd           n# 1 swxY w Y   eg }|D ]!}	|D ]}
|                    |	 d|
            "t          |          | _	        dS )a  Sets the supported metrics based on the classes found in the ground truth files.
        The difference between per class and aggregated calculator is that the list of classes
        (so the metrics) bases on the contents of the GT / prediction files.
        rq  object_detection_classesNr  )
setr@   r1   openr,   loadupdaterQ  sortedr  )r9   rX  classesgt_filegt_file_pathfr   
gt_classesr  r&   r  s              r#   r  z?ObjectDetectionPerClassMetricsCalculator._set_supported_metricsF  s.   
 >==%%/ 	+ 	+G1G;Ll## +qYq\\ :;
z***+ + + + + + + + + + + + + + + "$ 	H 	HF% H H
&--.F.F*.F.FGGGGH&,-C&D&D###s   2A33A7	:A7	r   )r   r   r   rB   r   r   rD   rG   r   r  r   r   s   @r#   r  r    s        & & & & &
 [ [ X[ A A XA B B XB %  %  %  %DE E E E E E Er%   r  c                  \    e Zd ZdZed             Zed             Zed             Zdd	Zd
S )*ObjectDetectionAggregatedMetricsCalculatorzSCalculates object detection metrics for each document and aggregates by all classesc                
    g dS rp  r   rA   s    r#   r   zAObjectDetectionAggregatedMetricsCalculator.supported_metric_names]  rt  r%   c                    dS rv  r   rA   s    r#   rD   z;ObjectDetectionAggregatedMetricsCalculator.default_tsv_namea  r   r%   c                    dS rx  r   rA   s    r#   rG   z?ObjectDetectionAggregatedMetricsCalculator.default_agg_tsv_namee  r   r%   r   r   rJ   r   c                @   	 |                      |          \  }}}n8# t          $ r+}t                              d| d|            Y d}~dS d}~ww xY wt	          j        ||          }|                                \  }|j        |dgfd| j        D             z   S )zCalculate both class-aggregated and per-class metrics for a single document.

        Args:
            doc (Path): path to the OD dump file

        Returns:
            list: a list of aggregated metrics for a single document
        r   r   Nr  c                0    g | ]}t          |          S r   r   )r!   r&   rX  s     r#   r$   zPObjectDetectionAggregatedMetricsCalculator._process_document.<locals>.<listcomp>  s#    PPP&WWf%%PPPr%   )	r  r  rm   r   r   r   r  r   r   )	r9   r   r   r   r   r   r  r  rX  s	           @r#   r   z<ObjectDetectionAggregatedMetricsCalculator._process_documenti  s    	:>//#:N:N7G_&7&7 	 	 	LLAsAAaAABBB44444	 1@!0#4
 
 
	 **,,
 "
 QPPPD4OPPP	Q 	Qs    
A AANr   )	r   r   r   r   r   r   rD   rG   r   r   r%   r#   r  r  Z  s        ]]; ; X; 7 7 X7 8 8 X8Q Q Q Q Q Qr%   r  r   )r   r   r3  r4  rY   r   r5  r   r6  r   r7  r   rJ   r8  )r   NrX  rY  )r3  rZ  r[  r\  r]  r   r7  r   rY   r   r^  r   rJ   r_  )E
__future__r   concurrent.futuresru   r,   loggingri   r  abcr   r   dataclassesr   pathlibr   typingr	   r
   r   numpyr   pandasr   r   !unstructured.metrics.element_typer   r   %unstructured.metrics.object_detectionr   %unstructured.metrics.table.table_evalr   $unstructured.metrics.text_extractionr   r   unstructured.metrics.utilsr   r   r   r   r   r   r   r   r   r   	getLoggerrm   StreamHandlerhandlerr    	Formatter	formattersetFormatterhandlers
addHandlersetLevelDEBUGr#  r   r  r/   r   r   r)  r  ri  rk  r  r  r   r%   r#   <module>r     s   # " " " " "       				 



 # # # # # # # # ! ! ! ! ! !       ( ( ( ( ( ( ( ( ( (                           E D D D D D c c c c c c c c                        
	.	/	/
'

!
!!GZ[[	   Y    >>fo>>>>>
g    JJJ   uo  JL JL JL JL JLC JL JL JLZ B B B B B&; B B BJ g g g g g&; g g gT : : : : :#8 : : :D #%)VU VU VU VU VUx  %)>N >N >N >N >NB s s s s s+@# s s slIE IE IE IE IE/S IE IE IEX(Q (Q (Q (Q (Q1U (Q (Q (Q (Q (Qr%   