
    Ng0                     R   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZmZ e G d d	                      Zd
 Zde
e         defdZdee         dedeeeef         fdZ G d d          Z ej                     ej        dd ej        d                     ej        dd ej        d                     ej        deddd          dededee         fd                                                 Z e!d!k    r e              dS dS )"ak  
The purpose of this script is to create a comprehensive metric for table evaluation
1. Verify table identification.
    a. Concatenate all text in the table and ground truth.
    b. Calculate the difference to find the closest matches.
    c. If contents are too different, mark as a failure.

2. For each identified table:
    a. Align elements at the level of individual elements.
    b. Match elements by text.
    c. Determine indexes for both predicted and actual data.
    d. Compare index tuples at column and row levels to assess content shifts.
    e. Compare the token orders by flattened along column and row levels
    f. Note: Imperfect HTML is acceptable unless it impedes parsing,
       in which case the table is considered failed.

Example
python table_eval.py      --prediction_file "model_output.pdf.json"     --ground_truth_file "ground_truth.pdf.json"
    N)	dataclass)Path)AnyDictListOptional)TableAlignment),extract_and_convert_tables_from_ground_truth*extract_and_convert_tables_from_predictionc                       e Zd ZU dZeed<   eed<   eed<   eed<   eed<   eed<   eed<   eed	<   eed
<   eed<   edefd            ZdS )TableEvaluationz,Class representing a gathered table metrics.total_tablestotal_predicted_tablestable_level_acctable_detection_recalltable_detection_precisiontable_detection_f1element_col_level_index_accelement_row_level_index_accelement_col_level_content_accelement_row_level_content_accreturnc                 L    | j         | j        z   | j        | j        z   dz  z   dz  S )N      )r   r   r   r   )selfs    a/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/metrics/table/table_eval.pycomposite_structure_accz'TableEvaluation.composite_structure_acc6   s>     ,./1D4VVZ[[\ 	 	    N)	__name__
__module____qualname____doc__int__annotations__floatpropertyr    r   r   r   r   '   s         66!!!!$$$$!&&&&!&&&&#((((#((((    X  r   r   c                 N   t          j        t          |          f          }t          j        |          }t          |           D ]^\  }}||         }|dk    rt          j        dt          j        |g          d         ||                                                   ||<   _|S )zcomputes for each predicted table its accurary compared to ground truth.

    The accuracy is defined as the SequenceMatcher.ratio() between those two strings. If a
    prediction does not have a matched ground truth its accuracy is 0
    Nr   )	npzeroslenr	   get_content_in_tables	enumeratedifflibSequenceMatcherratio)predicted_table_dataground_truth_table_datamatched_indicesscoreground_truth_textidx	predictedmatched_idxs           r   r   r   ?   s     Hc/**,--E&<=TUU#$899 	 	Y%c*",0)==a@k*
 
 %''	 	c


 Lr   r5   r   c                 4    t          d | D                       S )a  Counts the number of predicted tables that have a corresponding match in the ground truth.

    Args:
      matched_indices: List of indices indicating matches between predicted
        and ground truth tables.

    Returns:
      The count of matched predicted tables.

    c              3   &   K   | ]}|d k    dV  dS )r      Nr(   ).0r8   s     r   	<genexpr>z*_count_predicted_tables.<locals>.<genexpr>_   s&      88Ssaxxqxxxx88r   )sum)r5   s    r   _count_predicted_tablesrA   T   s!     88O888888r   ground_truth_tables_numberc                    t          |           }t          |           }d|v r|                    d           t          |          }||z
  }|}|dk    r||z  nd}||z   dk    r|||z   z  nd}||z   dk    rd|z  |z  ||z   z  nd}	|||	fS )a  
    Calculate the table detection metrics: recall, precision, and f1 score.
    Args:
        matched_indices:
            List of indices indicating matches between predicted and ground truth tables
            For example: matched_indices[i] = j means that the
            i-th predicted table is matched with the j-th ground truth table.
        ground_truth_tables_number: the number of ground truth tables.

    Returns:
        Tuple of recall, precision, and f1 scores
    r*   r   r   )r-   setremove)
r5   rB   predicted_tables_numbermatched_settrue_positivefalse_positivepositiverecall	precisionf1s
             r   !calculate_table_detection_metricsrN   b   s     "/22o&&K	[2$$M,}<N)H)1A]X%%1F >)A-- 	788 
 ;Df:Lq:P:PY	9v#5	6	6VWB9b  r   c                       e Zd Z	 	 ddeeeef                  deeeef                  dedefdZe		 	 dd	e
d
e
dee         dedd f
d            ZdefdZdS )TableEvalProcessor皙?html
predictionground_truthcutoffsource_typec                 >    || _         || _        || _        || _        dS )ai  
        Initializes the TableEvalProcessor prediction and ground truth.

        Args:
            ground_truth: Ground truth table data. The tables text should be in the deckerd format.
            prediction: Predicted table data.
            cutoff: The cutoff value for the element level alignment. Default is 0.8.

        Examples:
            ground_truth: [
                {
                    "type": "Table",
                    "text": [
                        {
                            "id": "f4c35dae-105b-46f5-a77a-7fbc199d6aca",
                            "x": 0,
                            "y": 0,
                            "w": 1,
                            "h": 1,
                            "content": "Cell text"
                        },
                        ...
                }
            ]
            prediction: [
                {
                    "element_id": <id_string>,
                    ...
                    "metadata": {
                        ...
                        "text_as_html": "<table><thead><tr><th rowspan="2">June....
                                                                </tr></td></table>",
                        "table_as_cells":
                        [
                            {
                                "x": 0,
                                "y": 0,
                                "w": 1,
                                "h": 2,
                                "content": "June"
                            },
                            ...
                        ]
                    }
                },
            ]

        NrS   rT   rU   rV   )r   rS   rT   rU   rV   s        r   __init__zTableEvalProcessor.__init__   s)    n %(&r   Nprediction_fileground_truth_filer   c                 ,   t          |          5 }t          j        |          }ddd           n# 1 swxY w Y   t          |          5 }t          j        |          }ddd           n# 1 swxY w Y   | | ||||          S  | |||          S )a  Factory classmethod to initialize the object with path to json files instead of dicts

        Args:
          prediction_file: Path to the json file containing the predicted table data.
          ground_truth_file: Path to the json file containing the ground truth table data.
          source_type: 'cells' or 'html'. 'cells' refers to reading 'table_as_cells' field while
            'html' is extracted from 'text_as_html'
          cutoff: The cutoff value for the element level alignment.
            If not set, class default value is used (=0.8).

        Returns:
          TableEvalProcessor: An instance of the class initialized with the provided data.
        NrX   )rS   rT   rV   )openjsonload)clsrZ   r[   rU   rV   frS   rT   s           r   from_json_filesz"TableEvalProcessor.from_json_files   s$   * /"" 	&a1J	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	&#$$ 	(9Q<<L	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(3%)'	    3*<U`aaaas   155A,,A03A0c                 d   t          | j                  }t          | j        | j                  }t          |          }t          |          }|s<|rdnt          j        }|sdnd}t          dt          |          ||||||||
  
        S |r(|s&t          t          |          ddddddddd
  
        S t          j        ||          }t          j        t          |||                    }t          j        |||| j                  }	t!          |t          |                    \  }
}}t          t          |          t          |          ||
|||	                    dd          |	                    dd          |	                    d	d          |	                    d
d          
  
        }|S )zProcesses the files and computes table-level and element-level accuracy.

        Returns:
            TableEvaluation: A dataclass object containing the computed metrics.
        )file_elementsrV   r   r=   )
r   r   r   r   r   r   r   r   r   r   rU   )r5   rB   col_index_accrow_index_acccol_content_accrow_content_acc)r
   rT   r   rS   rV   boolr+   nanr   r-   r	   get_table_level_alignmentmeanr   get_element_level_alignmentrU   rN   get)r   r4   r3   is_table_in_gtis_table_predictedr6   	table_accr5   predicted_table_accmetricsr   r   r   
evaluations                 r   process_filezTableEvalProcessor.process_file   s0    #O#
 #
  J/t7G 
  
  
 566!"677 	+7AAE!3:I"'*+?'@'@ )',*/#(,1,1.3.3     1	"4 1	" !899'( !'(*+#$,-,-././    -F$' O #%' 46M__# # %@$'{	  G 2$3/23J/K/K   R"$=?Q ) !899'*+?'@'@ 3'=*C#5,3KK,K,K,3KK,K,K.5kk:KQ.O.O.5kk:KQ.O.O  J r   )rQ   rR   )NrR   )r    r!   r"   r   r   strr   r&   rY   classmethodr   r   rb   r   rv   r(   r   r   rP   rP      s       
 !:' :'c3h(:' 4S>*:' 	:'
 :' :' :' :'x 
 #'! b  b b   b 	 b
  b 
 b  b  b [ bDPo P P P P P Pr   rP   z--prediction_filez&Path to the model prediction JSON fileT)exists)helptypez--ground_truth_filez"Path to the ground truth JSON filez--cutoffrQ   z]The cutoff value for the element level alignment.         If not set, a default value is used)r{   show_defaultdefaultrz   rZ   r[   rU   c                     t                               t          |           t          |          |          }|                                }t	          |           dS )zBRuns the table evaluation process and prints the computed metrics.re   N)rP   rb   r   rv   print)rZ   r[   rU   	processorreports        r   runr   9  sZ    " #22_ 3  I
 ##%%F	&MMMMMr   __main__)"r#   r0   r^   dataclassesr   pathlibr   typingr   r   r   r   clicknumpyr+   *unstructured.metrics.table.table_alignmentr	   +unstructured.metrics.table.table_extractionr
   r   r   r   r$   rA   listtupler&   rN   rP   commandoptionrw   r   r    r(   r   r   <module>r      s   ,   ! ! ! ! ! !       , , , , , , , , , , , ,      E E E E E E               .  *9T#Y 93 9 9 9 9!!#Y!!<?!!
5%!! !! !! !!Hp p p p p p p pf FZUZ_cMdMdMd    D:5:]aKbKbKb   	
-    huo        zCEEEEE r   