
    Ng                     <   d dl mZmZmZ d dlmZ d dlmZmZ 	 ddee	         dee	         dee
e
e
f         defd	Z	 	 	 ddee	         dee	         dee
e
e
f         de	dedefdZde	dee	e
f         fdZdee	         dee	         defdZddee	         dede	fdZdS )    )DictOptionalTuple)Levenshtein)clean_bulletsremove_sentence_punctuation      r   outputsourceweightsreturnc                 (    t          | ||d          S )z
    Calculates accuracy by calling calculate_edit_distance function using `return_as=score`.
    The function will return complement of the edit distance instead.
    score)	return_as)calculate_edit_distance)r   r   r   s      `/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/metrics/text_extraction.pycalculate_accuracyr      s     #667gNNNN    distanceTr   standardize_whitespacesc                 J   ddg}||vrt          d|z            t          | |          } t          ||          }t          j        | ||          }t	          t          |          d          }t          t	          ||z  d          d          }|dk    rd|z
  S |dk    r|S dS )a%  
    Calculates edit distance using Levenshtein distance between two strings.

    Args:
        output (str): The target string to be compared.
        source (str): The reference string against which 'output' is compared.
        weights (Tuple[int, int, int], optional): A tuple containing weights
            for insertion, deletion, and substitution operations in the edit
            distance calculation. Default is (2, 1, 1).
        return_as (str, optional): The type of result to return, one of
            ["score", "distance"].
            Default is "distance".

    Returns:
        float: The calculated edit distance or similarity score between
            the 'output' and 'source' strings.

    Raises:
        ValueError: If 'return_as' is not one of the valid return types
        ["score", "distance"].

    Note:
        This function calculates the edit distance (or similarity score) between
        two strings using the Levenshtein distance algorithm. The 'weights' parameter
        allows customizing the cost of insertion, deletion, and substitution
        operations. The 'return_as' parameter determines the type of result to return:
        - "score": Returns the similarity score, where 1.0 indicates a perfect match.
        - "distance": Returns the raw edit distance value.

    r   r   z.Invalid return value type. Expected one of: %s)r   g      ?g        r   )
ValueErrorprepare_strr   r   maxlenmin)	r   r   r   r   r   return_typesr   source_char_lenbounded_percentage_distances	            r   r   r      s    J Z(L$$ILXYYY!899F!899F#FFGDDDH #f++s++O"%c(_*Dc&J&JC"P"PG...	j	 	 3r   textc                    i }d}t          t          |                                 ddg                                                    }d}|t	          |          k     rt	          ||                   dk    r2||         |v r|||         xx         dz  cc<   nd|||         <   |dz  }n|}d}|t	          |          k     rUt	          ||                   dk    r<|||         z  }|dz  }|t	          |          k     rt	          ||                   dk    <t	          |          dk    r4||                                         r||v r||xx         dz  cc<   nd||<   |}|t	          |          k     |S )z
    Outputs the bag of words (BOW) found in the input text and their frequencies.

    Takes "clean, concatenated text" (CCT) from a document as input.

    Removes sentence punctuation, but not punctuation within a word (ex. apostrophes).
     -'r   r   )r   r   lowersplitr   isalnum)r"   bowincorrect_wordwordsijs         r   bag_of_wordsr/   J   s    CN5djjllS#JOOPPVVXXE	A
c%jj..uQx==1Qx3E!H" !E!HFAAANc%jj..Sq]]a%7%7%(*Q c%jj..Sq]]a%7%7 >""a''E!H,<,<,>,>'!S(('''1,''''*+C'A) c%jj..* Jr   c                 f   t          |           } t          |          }t          |           }t          |          }d}d}|                                D ]2\  }}||z  }||vr||z  }||         }|t          ||z
  d          z  }3|dk    rdS t	          ||z  d          }	t          |	d          S )a  
    Creates the bag of words (BOW) found in each input text and their frequencies, then compares the
    output BOW against the source BOW to calculate the % of text from the source text missing from
    the output text.

    Takes "clean, concatenated text" (CCT) from a document output and the ground truth source text
    as inputs.

    If the output text contains all words from the source text and then some extra, result will be
    0% missing text - this calculation does not penalize duplication.

    A spaced-out word (ex. h e l l o) is considered missing; individual characters of a word
    will not be counted as separate words.

    Returns the percentage of missing text represented as a decimal between 0 and 1.
    r      r   )r   r/   itemsr   roundr   )
r   r   
output_bow
source_bowtotal_source_word_counttotal_missing_word_countsource_wordsource_countoutput_countfraction_missings
             r   calculate_percent_missing_textr<   o   s    (   F  Ff%%Jf%%J   %/%5%5%7%7 L L!\</j(($4$$%k2L$L<,G(K(KK$$ !##q58OOQRSS###r   Fstringc                 z    | sdS |r'd                     |                                           S t          |           S )Nr$    )joinr(   str)r=   r   s     r   r   r      s=     r (xx'''v;;r   N)r	   )r	   r   T)F)typingr   r   r   rapidfuzz.distancer   unstructured.cleaners.corer   r   rA   intfloatr   boolr   r/   r<   r    r   r   <module>rI      s   ( ( ( ( ( ( ( ( ( ( * * * * * * Q Q Q Q Q Q Q Q %.	O 	OSM	OSM	O 3S=!	O 		O 	O 	O 	O %.$(3 3SM3SM3 3S=!3 	3
 "3 3 3 3 3l"s "tCH~ " " " "J+$SM+$SM+$ +$ +$ +$ +$\   QT      r   