
    NgE                        d dl Z d dlZd dlZd dlmZ d dlZd dlZd dlZd dl	m
Z
mZmZmZmZ e j                            d          Ze j                             ej        e          j                                        ddd          Ze j                            ed          ZdZd	Zd
ZdZdZdZ ej        g dg dg dg dg dd          Z ej        g dg dg dg dd          Z  ej!                    d             Z"ej#        $                    ed          ej#        %                    d          d                         Z&ej#        '                    deee ed          dddifeee ed          d i fe
ee ed!          d"i fg          d#             Z(ej#        $                    ed          ej#        %                    d          d$                         Z)ej#        $                    ed          ej#        %                    d          d%                         Z*ej#        $                    ed          ej#        %                    d          d&                         Z+ej#        $                    ed          ej#        %                    d          d'                         Z,ej#        $                    ed          ej#        %                    d          d(                         Z-ej#        $                    ed          ej#        %                    d          d)                         Z.ej#        $                    ed          ej#        %                    d          ej#        '                    d*d+d,g          d-e/d.e0fd/                                    Z1ej#        $                    ed          ej#        %                    d          d0                         Z2ej#        $                    ed          ej#        %                    d          d1                         Z3ej#        $                    ed          ej#        %                    d          d2                         Z4ej#        $                    ed          ej#        %                    d          d3                         Z5ej#        $                    ed          ej#        %                    d          d4                         Z6ej#        $                    ed          ej#        %                    d          d5                         Z7ej#        $                    ed          ej#        %                    d          ej#        '                    d6d+d,g          d7e/d.e0fd8                                    Z8ej#        $                    ed          ej#        %                    d          d9                         Z9ej#        $                    ed          ej#        %                    d          d:                         Z:ej#        $                    ed          ej#        %                    d          d;                         Z;dS )<    N)Path)ElementTypeMetricsCalculatorTableStructureMetricsCalculatorTextExtractionMetricsCalculatorfilter_metricsget_mean_groupingz/.dockerenvz..zexample-docstest_evaluate_filesunstructured_outputgold_standard_cctgold_standard_element_typegold_standard_table_structureunstructured_output_cct#unstructured_output_table_structure)Bank Good Credit Loan.pptx Performance-Audit-Discussion.pdfzcurrency.csv)pptxpdfcsv)
connector1r   
connector2)gCl?g+?gMb?)gMbP?gMb`?gˡE?)filenamedoctype	connectorzcct-accuracyzcct-%missing)r   r   r   zelement-type-accuracyc               #   ,   K   d } dV   |              dS )z@Fixture for removing side-effects of running tests in this file.c                      g d} d t          j        t                    D             }|D ]$}|j        | v rt	          j        |j                   %dS )z.Remove directories created from running tests.)test_evaluate_results_ccttest_evaluate_results_cct_txt"test_evaluate_results_element_type$test_evaluate_result_table_structurec              3   B   K   | ]}|                                 |V  d S )N)is_dir).0ds     c/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/metrics/test_evaluate.py	<genexpr>zL_cleanup_after_test.<locals>.remove_generated_directories.<locals>.<genexpr>J   s/      IIahhjjI1IIIIII    N)osscandirTESTING_FILE_DIRnameshutilrmtreepath)target_dir_namessubdirsr#   s      r$   remove_generated_directoriesz9_cleanup_after_test.<locals>.remove_generated_directories@   so    
 
 
 JIbj)9::III 	& 	&Av)))af%%%	& 	&r&   N )r0   s    r$   _cleanup_after_testr2   <   s5      & & &  
EEE  """""r&   z&Skipping this test in Docker container)reasonr2   c                     t           j                            t          t                    } t           j                            t          t
                    }t           j                            t          d          }t          | |                              |dd           t           j                            t           j                            |d                    sJ t          j
        t           j                            |d          d          }t          |          dk    sJ t          |j                  d	k    sJ |j        d
         j        dk    sJ d S )Nr   documents_dirground_truths_dirF)
export_dirvisualize_progressdisplay_agg_dfall-docs-cct.tsv	sep      r   r   )r'   r-   joinr)   UNSTRUCTURED_OUTPUT_DIRNAMEGOLD_CCT_DIRNAMEr   	calculateisfilepdread_csvlencolumnsilocr   
output_dir
source_dirr8   dfs       r$   test_text_extraction_evaluationrO   T   s    .0KLLJ.0@AAJ.0KLLJ# J  i:%PUiVVV7>>"',,z3EFFGGGGG	RW\\*.@AAt	L	L	LBr77a<<<<rz??a71:">>>>>>>r&   )calculator_classoutput_dirnamesource_dirnamer-   expected_lengthkwargszBank Good Credit Loan.pptx.txtr@   document_typetxtzIRS-2023-Form-1095-A.pdf.json   zIRS-form-1987.pdf.json   c                     t          t                    |z  }t          t                    |z  } | d||d|}|                    |          }	t          |	          |k    sJ d S )Nr5   r1   )r   r)   _process_documentrH   )
rP   rQ   rR   r-   rS   rT   rL   rM   
calculatoroutput_lists
             r$   :test_process_document_returns_the_correct_amount_of_valuesr]   f   s{    @ &''.8J&''.8J!!c
jcc\bccJ..t44K{......r&   c                  :   t           j                            t          t                    } t           j                            t          t
                    }t           j                            t          d          }t          | |d                              |           t          j	        t           j                            |d          d          }t          |          dk    sJ t          |j                  d	k    sJ |j        d
         j        dk    sJ d S )Nr   rV   r6   r7   rU   r8   r;   r<   r=   r?   r@   r   r   )r'   r-   rA   r)   UNSTRUCTURED_CCT_DIRNAMErC   r   rD   rF   rG   rH   rI   rJ   r   rK   s       r$   (test_text_extraction_evaluation_type_txtrb      s     .0HIIJ.0@AAJ.0KLLJ# Je  i:i&&&	RW\\*.@AAt	L	L	LBr77a<<<<rz??a71:">>>>>>>r&   c                     t           j                            t          t                    } t           j                            t          t
                    }t           j                            t          d          }t          | |                              |d           t           j                            t           j                            |d                    sJ t          j
        t           j                            |d          d          }t          |          dk    sJ t          |j                  d	k    sJ |j        d
         j        dk    sJ d S )Nr   r5   Fr8   r9   z#all-docs-element-type-frequency.tsvr<   r=      rX   r   zIRS-form-1987.pdf)r'   r-   rA   r)   rB   GOLD_ELEMENT_TYPE_DIRNAMEr   rD   rE   rF   rG   rH   rI   rJ   r   rK   s       r$   test_element_type_evaluationrg      s    .0KLLJ.0IJJJ.0KLLJ  $   i:%i@@@7>>"',,z3XYYZZZZZ	RW\\*.STTZ^	_	_	_Br77a<<<<rz??a71:"5555555r&   c                  P   t           j                            t          t                    } t           j                            t          t
                    }t           j                            t          d          }t          | |                              |d           t           j                            t           j                            |d                    sJ t           j                            t           j                            |d                    sJ t          j
        t           j                            |d          d          }t          j
        t           j                            |d          d                              d	          }t          |          d
k    sJ t          |j                  dk    sJ |j        d         j        dk    sJ t!          j        t!          j        |d         |d                   d          |j        d         k    sJ d S )Nr   r5   Frd   z%all-docs-table-structure-accuracy.tsvz&aggregate-table-structure-accuracy.tsvr<   r=   metric      re   zIRS-2023-Form-1095-A.pdftable_level_acctotal_tables)weightsr?   )rl   average)r'   r-   rA   r)   $UNSTRUCTURED_TABLE_STRUCTURE_DIRNAMEGOLD_TABLE_STRUCTURE_DIRNAMEr   rD   rE   rF   rG   	set_indexrH   rI   rJ   r   nproundro   loc)rL   rM   r8   rN   agg_dfs        r$   test_table_structure_evaluationrw      s    .0TUUJ.0LMMJ.0VWWJ# $   i:%i@@@7>>"',,z3Z[[\\\\\7>>"',,z3[\\]]]]]	RW\\*.UVV\`	a	a	aB[
Z!IJJPT  i  r77a<<<<rz??b    71:"<<<<<
B012n;MNNNPQRR:23	4 	4 	4 	4 	4 	4r&   c                     t           j                            t          t                    } dg}t           j                            t          t
                    }t           j                            t          d          }t          | |                              |                              |           t           j        	                    t           j                            |d                    sJ t          j        t           j                            |d          d          }t          |          t          |          k    sJ d S )	Nzcurrency.csv.jsonr   r5   )document_pathsr`   r;   r<   r=   )r'   r-   rA   r)   rB   rC   r   on_filesrD   rE   rF   rG   rH   )rL   r\   rM   r8   rN   s        r$   test_text_extraction_takes_listr{      s    .0KLLJ&'K.0@AAJ.0KLLJ# $   hkh**99
9+K+K+K 7>>"',,z3EFFGGGGG	RW\\*.@AAt	L	L	LBr77c+&&&&&&&&r&   c                     t           j                            t          t                    } t           j                            t          t
                    }t           j                            t          d          }t          | |d                              |           t          j	        t           j                            |d          d          }t          |          dk    sJ d S )	Nr   r   )r6   r7   group_byr`   all-doctype-agg-cct.tsvr<   r=   rX   )r'   r-   rA   r)   rB   rC   r   rD   rF   rG   rH   rK   s       r$   "test_text_extraction_with_groupingr      s     .0KLLJ.0@AAJ.0KLLJ# $   i:i&&&	RW\\*.GHHd	S	S	SBr77a<<<<<<r&   c                     t           j                            t          t                    } t           j                            t          t
                    }t           j                            t          d          }t          j        t                    5  t          | |d          
                    |           d d d            d S # 1 swxY w Y   d S )Nr   zinvalid typer_   r`   )r'   r-   rA   r)   rB   rC   pytestraises
ValueErrorr   rD   )rL   rM   r8   s      r$   test_text_extraction_wrong_typer      s     .0KLLJ.0@AAJ.0KLLJ	z	"	" + +'$
R`	
 	
 	

)z)
*
*
*+ + + + + + + + + + + + + + + + + +s   'CCC)grouping	count_row)r   r?   )r   rj   r   r   c                 T   t           j                            t          d          }t	          | t
          |d           t          j        t           j                            |d|  d          d          }||                                          	                                |k    sJ d S )Nr   text_extractionr}   
data_inputr8   	eval_nameall-z-agg-cct.tsvr<   r=   )
r'   r-   rA   r)   r   DUMMY_DF_CCTrF   rG   dropnanunique)r   r   r8   
grouped_dfs       r$   test_get_mean_grouping_df_inputr      s     .0KLLJ#	    RW\\*6SX6S6S6STTZ^___Jh&&((0022i??????r&   c                  t   t           j                            t          t                    } t           j                            t          t
                    }t           j                            t          d          }t          | |                              |           t           j                            |d          }t          d||d           t          j
        t           j                            |d          d	
          }|d                                                                         dk    sJ d S )Nr   r5   r`   r;   r   r   r   r~   r<   r=   r?   )r'   r-   rA   r)   rB   rC   r   rD   r   rF   rG   r   r   )rL   rM   r8   r   r   s        r$    test_get_mean_grouping_tsv_inputr     s
    .0KLLJ.0@AAJ.0KLLJ# $   i:i&&&w||J(:;;H#	    RW\\*6OPPVZ[[[Ji ''))1133q888888r&   c                  4   t           j                            t          t                    } t           j                            t          t
                    }t           j                            t          d          }t          | |                              |           t          j	        t           j                            |d          d          }t          j        t                    5  t          d||d	           d d d            d S # 1 swxY w Y   d S )
Nr   r5   r`   r;   r<   r=   invalidr   r   )r'   r-   rA   r)   rB   rC   r   rD   rF   rG   r   r   r   r   rK   s       r$   $test_get_mean_grouping_invalid_groupr   #  s1    .0KLLJ.0@AAJ.0KLLJ# $   i:i&&&	RW\\*.@AAt	L	L	LB	z	"	" 
 
!'		
 	
 	
 	

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s   ,DDDc                      t          j                    } t          j        t                    5  t          d| dd           d d d            d S # 1 swxY w Y   d S )Nr   some_dirr   r   rF   	DataFramer   r   
SystemExitr   )empty_dfs    r$   &test_text_extraction_grouping_empty_dfr   9  s     |~~H	z	"	" X X)XzEVWWWWX X X X X X X X X X X X X X X X X Xs   AAAc                      t          j        dg di          } t          j        t                    5  t          d| dd           d d d            d S # 1 swxY w Y   d S )Nsome_column)re   rj   r?   r   r   r   r   )df_with_no_groupings    r$   .test_get_mean_grouping_missing_grouping_columnr   A  s     ,yyy'ABB	z	"	" Y Y)%8*FWXXXY Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Y Ys   AAAc                      t          j        dg di          } t          j        t                    5  t          d| dd           d d d            d S # 1 swxY w Y   d S )Nr   )NNNr   r   r   r   )df_with_null_groupings    r$   /test_get_mean_grouping_all_null_grouping_columnr   I  s     L)5G5G5G)HII	z	"	" e e)%:JRcdddde e e e e e e e e e e e e e e e e es   AAAc                      t          j        t                    5  t          dt          dd           d d d            d S # 1 swxY w Y   d S )Nr   r   r   r   )r   r   r   r   DUMMY_DF_ELEMENT_TYPEr1   r&   r$   (test_get_mean_grouping_invalid_eval_namer   Q  s     
z	"	" ] ])%:JR[\\\\] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ] ]s   A  AA)r}   r   r}   c                 T   t           j                            t          d          }t	          | t
          |d           t          j        t           j                            |d|  d          d          }||                                          	                                |k    sJ d S )Nr   element_typer   r   z-agg-element-type.tsvr<   r=   )
r'   r-   rA   r)   r   r   rF   rG   r   r   )r}   r   r8   r   s       r$   #test_get_mean_grouping_element_typer   X  s     .0TUUJ( 	    
Z!G!G!G!GHHd  J h&&((0022i??????r&   c                  x   t          t          j                            t          d          d          5 } |                     d           |                     d           d d d            n# 1 swxY w Y   t          j                            t          d          }t          t          t          j                            t          d          dd|d	           t          j	        t          j                            |d          d
          }t          |          dk    sJ |d         j        d         dk    sJ d S )Nfilter_list.txtwBank Good Credit Loan.pptx
!Performance-Audit-Discussion.pdf
r   r   filtered_metrics.tsvfiler   filter_list	filter_byexport_filenamer8   return_typer<   r=   rj   r   r   )openr'   r-   rA   r)   writer   r   rF   rG   rH   rJ   )r   r8   filtered_dfs      r$   test_filter_metricsr   i  sX    
bgll+->??	E	E 9

1222

78889 9 9 9 9 9 9 9 9 9 9 9 9 9 9 .0KLLJGLL!13DEE.    +bgll:7MNNTXYYYK{q    z"'*.JJJJJJJ   +A++A/2A/c                     t          t          j                            t          d          d          5 } |                     d           |                     d           d d d            n# 1 swxY w Y   t          j                            t          d          }t          t          ddgdd	|d
           t          j	        t          j                            |d	          d          }t          d||dd           t          j	        t          j                            |d          d          }t          j        t          |j        d                   d          sJ t          j        t          |j        d                   d          sJ t          j        t          |j        d                   d          sJ d S )Nr   r   r   r   r   r   r   r   r   r   r   r<   r=   allr   two-filename-agg-cct.tsvr}   r   r8   r   r   re   r   L7A`?re   re   Pn?re   rj   粝K?r   r'   r-   rA   r)   r   r   r   rF   rG   r   rs   isclosefloatrJ   r   r8   r   r   s       r$   test_get_mean_grouping_all_filer   ~  s    
bgll+->??	E	E 9

1222

78889 9 9 9 9 9 9 9 9 9 9 9 9 9 9 .0KLLJ13UV.    +bgll:7MNNTXYYYK#2    RW\\*6PQQW[\\\J:eJOD122E::::::eJOD122E::::::eJOD122E:::::::r   c                     t          t          j                            t          d          d          5 } |                     d           |                     d           d d d            n# 1 swxY w Y   t          j                            t          d          }t          t          t          j                            t          d          dd|d	           t          j	        t          j                            |d          d
          }t          d||dd           t          j	        t          j                            |d          d
          }t          j        t          |j        d                   d          sJ t          j        t          |j        d                   d          sJ t          j        t          |j        d                   d          sJ d S )Nr   r   r   r   r   r   r   r   r   r<   r=   r   r   r   r   r   r   r   r   r   r   r   r   s       r$   #test_get_mean_grouping_all_file_txtr     s    
bgll+->??	E	E 9

1222

78889 9 9 9 9 9 9 9 9 9 9 9 9 9 9 .0KLLJGLL!13DEE.    +bgll:7MNNTXYYYK#2    RW\\*6PQQW[\\\J:eJOD122E::::::eJOD122E::::::eJOD122E:::::::r   )<r'   pathlibr+   r   numpyrs   pandasrF   r   unstructured.metrics.evaluater   r   r   r   r   r-   existsis_in_dockerrA   __file__parentresolveEXAMPLE_DOCS_DIRECTORYr)   rB   rC   rf   rq   ra   rp   r   r   r   fixturer2   markskipifusefixturesrO   parametrizer]   rb   rg   rw   r{   r   r   strintr   r   r   r   r   r   r   r   r   r   r   r1   r&   r$   <module>r      s   				                               w~~m,,GL!))++T4   7<< 68MNN 3 & 8 > 4 'L $r|
 
 

 *))???------
 
  %
 
 

 *))???!6!6!6	 	   # # #. L)QRR.//? ? 0/ SR?  a ,$D122e$	
 ,0(D011	
 )'%D)**	
# :/ /; :/ L)QRR.//? ? 0/ SR? L)QRR.//6 6 0/ SR6" L)QRR.//  0/ SR2 L)QRR.//' ' 0/ SR'" L)QRR.//  0/ SR L)QRR.//+ + 0/ SR+ L)QRR.//2^EU4VWW	@c 	@c 	@ 	@ 	@ XW 0/ SR	@ L)QRR.//9 9 0/ SR9* L)QRR.//
 
 0/ SR
( L)QRR.//X X 0/ SRX L)QRR.//Y Y 0/ SRY L)QRR.//e e 0/ SRe L)QRR.//] ] 0/ SR]
 L)QRR.//2^EU4VWW@# @# @ @ @ XW 0/ SR@ L)QRR.//K K 0/ SRK& L)QRR.//; ; 0/ SR;< L)QRR.//; ; 0/ SR; ; ;r&   