
    Ng
                         d dl mZ d dlmZmZmZmZ d dlZd dl	m
Z
 e
j        e
j        e
j        e
j        dZddefd	Zd
edee         fdZ	 	 	 	 ddej        dej        dedee         dededeeef         fdZdS )    )partial)CallableDictListOptionalN)fuzz)token_ratioratiopartial_token_ratiopartial_ratio	
returnc                 P    |                     fd| j        D                       S )z4joining dataframe's table content as one long stringc                 :    g | ]}                     |          S  )join).0row	tab_tokens     ^/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured_inference/models/eval.py
<listcomp>z$_join_df_content.<locals>.<listcomp>   s%     J J J!4!4 J J J    )r   values)dfr   row_break_tokens    ` r   _join_df_contentr      s.     J J J J	 J J JKKKr   textc                 *    |                                  S )z2a simple tokenizer that splits text by white space)split)r   s    r   default_tokenizerr!      s    ::<<r   r	   	actual_dfpred_df	eval_func	processorr   r   c           	      0   t                               |          }|t          d|           t          t          ||          }d|  | ||            ||          |          d|  | || j                   ||j                  |          iS )a  ravel the table as string then use text distance to compare the prediction against true
    table

    Parameters
    ----------
    actual_df: pd.DataFrame
        actual table as pandas dataframe

    pred_df: pd.DataFrame
        predicted table as pandas dataframe

    eval_func: str, default tp "token_ratio"
        the eval_func should be one of "token_ratio", "ratio", "partial_token_ratio",
        "partial_ratio". Those are functions provided by rapidfuzz to evaluate text distances
        using either tokens or characters. In general token is better than characters for evaluating
        tables.

    processor: Callable, default to None
        processor to tokenize the text; by default None means no processing (using characters). For
        tokens eval functions we recommend using the `default_tokenizer` or some other functions to
        break down the text into words

    tab_token: str, default to "\t"
        the string to join cells together

    row_break_token: str, default to "\n"
        the string to join rows together

    Returns
    -------
    Dict[str, int]
        mapping of by column and by row scores to the scores as float numbers
    Nz`eval_func must be one of "token_ratio", "ratio", "partial_token_ratio", "partial_ratio" but got )r   r   by_col_)r%   by_row_)EVAL_FUNCTIONSget
ValueErrorr   r   T)r"   r#   r$   r%   r   r   func	join_funcs           r   compare_contents_as_dfr/      s    R i((D|3'03 3
 
 	
 (I___I)ttIi  Ig 
  
  

 	)ttIik""Igi   
  
  
 r   )r   r   )r	   Nr   r   )	functoolsr   typingr   r   r   r   pandaspd	rapidfuzzr   r	   r
   r   r   r)   strr   r!   	DataFramefloatr/   r   r   r   <module>r8      sO         1 1 1 1 1 1 1 1 1 1 1 1           #Z3'	 L L# L L L L
C DI     #$(; ;|;\; ; !	;
 ; ; 
#u*; ; ; ; ; ;r   