
    Ng]                         d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	m
Z
 d Zej                            dddg          d	             Zej                            d
g d          d             Zd Zej                            dddddddddfddddddddfddddddddddfdddddddddddddfdi ddddddddddd dd!dd"dd#dd$dd%dd&dd'dd(dd)dd*dfg          d+             Zej                            dg d,          d-             Zej                            d.g d/          d0             Zej                            d1 ej        d d ddd2d3d dddd4d3gd d d2d5dd d4d5gd67           ej        d d ddd2d3dd ddd8d3dd ddd9d3d dddd:d3ddddd;d3ddddd<d3d dddd=d3ddddd4d3ddddd>d3g	d d d2d5d dd8d5d dd9d5dd d:d5ddd;d5ddd<d5dd d=d5ddd4d5ddd>d5g	d?7           ej        d d ddd@dAd ddddBdAd dCdddDdAdddddEdAdddddFdAdd dddGdAdddddHdAdddddIdAdCd dddJdAg	d d d@d5d ddBd5d dCdDd5dddEd5dddFd5dd dGd5dddHd5dddId5dCd dJd5g	dK7          g          dL             Zej                            dM ej        dNd d d2d5dd d4d5gd67           ej        dOd d d2d5d dd8d5d dd9d5dd d:d5ddd;d5ddd<d5dd d=d5ddd4d5ddd>d5g	d?7           ej        dPd d d@d5d ddBd5d dCdDd5dddEd5dddFd5dd dGd5dddHd5dddId5dCd dJd5g	dK7          g          dQ             ZdR ZdSedTefdUZej                            dVg dW          dX             ZdS )Y    N)text_extraction)deckerd_table_to_html!extract_cells_from_table_as_cellsextract_cells_from_text_as_htmlhtml_table_to_deckerd)	partitionc                  L   d} d}t          j        ddd                    |                     }|                     dd          }d}d}d}d	}t	          t          j        | | d
          d          dk    sJ t	          t          j        || d
          d          dk    sJ t	          t          j        || d
          d          dk    sJ t	          t          j        || d
          d          dk    sJ t	          t          j        || d
          d          dk    sJ t	          t          j        || d
          d          dk    sJ t	          t          j        || d
          d          dk    sJ t	          t          j        || d
          d          dk    sJ d S )NzI like pizza. I like bagels.z!I like p i z z a . I like bagles.z\s+  zI like pizza.zI like pizza. I like .zI like pizza. I like beagles.z"I like pizza pizza. I like bagels.score)	return_as         ?g      ?g(\?g{Gz?g        g=
ףp=?g{Gz?gHzG?)resubjoinreplaceroundr   calculate_edit_distance)
source_cctsource_cct_word_spacesource_cct_spacessource_cct_no_spacesource_cct_one_sentencesource_cct_missing_wordsource_cct_addn_charsource_cct_dup_words           j/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/metrics/test_text_extraction.pytest_calculate_edit_distancer      s   /J?vsCHHZ,@,@AA$,,S"55-6:> 	o5j*X_```bcdd	 	 	 	 	3%!  
 	
 	
 	 	 	 	 	3!!  
 	
 	
 	 	 	 	 	3#!  
 	
 	
 	 	 	 	 	3'!  
 	
 	
 	 	 	 	 	3'!  
 	
 	
 	 	 	 	 	3$!  
 	
 	
 	 	 	 	 	3#!  
 	
 	
 	 	 	 	 	 	    )filenamestandardize_whitespacesexpected_scoreexpected_distance)fake-text.txtFg(\?&   )r%   Tgq=
ףp?   c                    t          d          5 }|                                }d d d            n# 1 swxY w Y   t          d|            }d                    d |D                       }t	          j        ||d|          }t	          j        ||d|          }	|d	k    sJ |d
k    sJ |	d	k    sJ t          |d          |k    sJ |	|k    sJ d S )Nzexample-docs/fake-text.txtzexample-docs/)r!   
c                 ,    g | ]}t          |          S  )str).0els     r   
<listcomp>z>test_calculate_edit_distance_with_filename.<locals>.<listcomp>z   s    777CGG777r    r   r   r"   distancer   r   r   )openreadr   r   r   r   r   )
r!   r"   r#   r$   fr   elements
output_cctr   r1   s
             r   *test_calculate_edit_distance_with_filenamer7   l   sM    
*	+	+ qVVXX
               "<("<"<===H77h77788J3J'Kb  E 6 7	  H A::::C<<<<q====??n,,,,(((((((s   155text1text2))z7The  dogloved the cat, but	
    the cat	loved the
 cow0The dog loved the cat, but the cat loved the cow)z3Hello    my	name	is H a r p e r, 
what's yourname?/Hello my name is H a r p e r, what's your name?z+I have a	
	dog and a	cat,I love my



dog.&I have a dog and a cat, I love my dog.)z
            Name    Age City           Occupation
            Alice   30  New York       Engineer
            Bob     25  Los Angeles    Designer
            Charlie 35  Chicago        Teacher
            David   40  San Francisco  Developer
            
            Name	Age	City	Occupation
            Alice	30	New York	Engineer
            Bob	25	Los Angeles	Designer
            Charlie	35	Chicago	Teacher
            David	40	San Francisco	Developer
            )r?   zName	Age	City	Occupation

 
Alice	30	New York	Engineer
Bob	25	Los Angeles	Designer
Charlie	35	Chicago	Teacher
David	40	San Francisco	Developerc                     t          j        | |dd          dk    sJ t          j        | |dd          dk    sJ t          j        | |dd          dk     sJ t          j        | |dd          dk    sJ d S )Nr   Tr0   r   r1   r   Fr   r   r8   s     r   6test_calculate_edit_distance_with_various_whitespace_1rB      s    Z 	/5GT	
 	
 	
 	 	 	 	 	/5J	
 	
 	
 	 	 	 	 	/5GU	
 	
 	
 	 	 	 	 	/5J	
 	
 	
 	 	 	 	 	 	r    c                      d} d}t          j        | |dd          t          j        | |dd          k    sJ t          j        | |dd          t          j        | |dd          k     sJ d S )Nr?   a  

            | Name    | Age | City         | Occupation     |
            |---------|-----|--------------|----------------|
            | Alice   | 30  | New York     | Engineer       |
            | Bob     | 25  | Los Angeles  | Designer       |
            | Charlie | 35  | Chicago      | Teacher        |
            | David   | 40  | San Francisco| Developer      |

            r   Tr0   Fr1   rA   )source_cct_tabssource_cct_with_borderss     r   6test_calculate_edit_distance_with_various_whitespace_2rF      s    O	 20G]a  /0G]b	 	 	   
 20J`d  / %		 	 	     r    textexpectedr;      r      )thecatloveddogbutcowr<   )hellomynameiszwhat'syourr>   )ihavearO   andrM   loverS   z4My dog's hair is red, but the dogs' houses are blue.)rS   zdog'shairrU   redrP   rL   zdogs'housesarebluezwSometimes sentences have a dash - like this one!
                    A hyphen connects 2 words with no gap: easy-peasy.	sometimes	sentencesrX   rY   dashlikethisonehyphenconnects2wordswithnogapz
easy-peasyc                 :    t          j        |           |k    sJ d S N)r   bag_of_wordsrG   s     r   test_bag_of_wordsrq      s'    p '--999999r    ))z9The  dogloved the cat, but	
    the cat	loved the
 cow

r;   )z5

Hello    my	name	is H a r p e r, 
what's yourname?r<   r=   )zL     is for the way you look at me
            O    is for the only one I see
            V    is very, very extraordinary
            E    is even more than anyone that you adore canzL is for the way you look at me O is for the only one I see V is very, very extraordinary E is even more than anyone that you adore can)a  
            | Name    | Age | City         | Occupation     |
            |---------|-----|--------------|----------------|
            | Alice   | 30  | New York     | Engineer       |
            | Bob     | 25  | Los Angeles  | Designer       |
            | Charlie | 35  | Chicago      | Teacher        |
            | David   | 40  | San Francisco| Developer      |
            z| Name | Age | City | Occupation | |---------|-----|--------------|----------------| | Alice | 30 | New York | Engineer | | Bob | 25 | Los Angeles | Designer | | Charlie | 35 | Chicago | Teacher | | David | 40 | San Francisco| Developer |c                 r    t          j        | d          |k    sJ t          j        |           | k    sJ d S )NT)r"   )r   prepare_strrG   s     r   test_prepare_stringrt   /  sH    H &tTJJJhVVVV&t,,444444r    output_textsource_textexpected_percentage))extrar   r   )r   zSource text has a sentence.rK   )z'The original s e n t e n c e is normal.z"The original sentence is normal...g?)z'We saw 23% improvement in this quarter.z-We saw 23% improvement in sales this quarter.g      ?)rl   z4Is it possible to have more than everything missing?rK   c                 <    t          j        | |          |k    sJ d S ro   )r   calculate_percent_missing_textru   s      r   #test_calculate_percent_missing_textr|   W  s6    @ 	6{KPP	 	 	 	 	 	r    )table_as_cellsexpected_extractionzMonth A.)xywhcontent22)	row_index	col_indexr   z0Simple table, 1 head cell, 1 body cell, no spans)idzMonth B.zMonth C.1112132123z0Simple table, 3 head cell, 5 body cell, no spansh12col1)r   r   r   r   r   h1col23   h1col4h2col2h2col34r3col1r3col2r34col34r4col12z various spans, with 2 row headerc                 >    dd| id}t          |          |k    sJ d S )NTabler}   typemetadata)r   )r}   r~   example_elements      r   +test_cells_table_extraction_from_predictionr   |  s>    N %~6 O -_==ATTTTTTTr    )text_as_htmlr~   z
<table>
    <thead>
        <tr>
            <th>Month A.</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>22</td>
        </tr>
    </tbody>
</table>"
            a  
<table>
    <thead>
        <tr>
            <th>Month A.</th>
            <th>Month B.</th>
            <th>Month C.</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>11</td>
            <td>12</td>
            <td>13</td>
        </tr>
        <tr>
            <td>21</td>
            <td>22</td>
            <td>23</td>
        </tr>
    </tbody>
</table>"
  
<table>
    <thead>
        <tr>
            <th rowspan="2">h12col1</th>
            <th colspan="2">h1col23</th>
            <th>h1col4</th>
        </tr>
        <tr>
            <th>h2col2</th>
            <th colspan="2">h2col34</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>r3col1</td>
            <td>r3col2</td>
            <td colspan="2" rowspan="2">r34col34</td>
        </tr>
        <tr>
            <td colspan="2">r4col12</td>
        </tr>
    </tbody>
</table>
c                 >    dd| id}t          |          |k    sJ d S )Nr   r   r   )r   )r   r~   r   s      r   *test_html_table_extraction_from_predictionr   )  s@    j L
 O +?;;?RRRRRRRr    c                  Z    ddg dd} t          |           J t          |           J d S )Nr   r   )r   r}   r   )r   r   )r   s    r   =test_cells_extraction_from_prediction_when_missing_predictionr     sF    &R[]4^4^__O*?;;CCC,_==EEEEEr    htmlreturnc                 j    d |                      d          D             }d                    |          S )Nc                 :    g | ]}||                                 S r+   )strip)r-   lines     r   r/   z_trim_html.<locals>.<listcomp>  s%    DDD4tD$**,,DDDr    r)   r   )splitr   )r   
html_liness     r   
_trim_htmlr     s3    DD4::d+;+;DDDJ77:r    html_to_test)z
<table>
    <thead>
        <tr>
            <th>Month A.</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>22</td>
        </tr>
    </tbody>
</table>
a  
<table>
    <thead>
        <tr>
            <th>Month A.</th>
            <th>Month B.</th>
            <th>Month C.</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>11</td>
            <td>12</td>
            <td>13</td>
        </tr>
        <tr>
            <td>21</td>
            <td>22</td>
            <td>23</td>
        </tr>
    </tbody>
</table>
r   c                 l    t          |           }t          |          }t          |           |k    sJ d S ro   )r   r   r   )r   deckerd_table
html_tables      r   test_deckerd_html_converterr     s>    H *,77M&}55Jl##z111111r    )r   pytestunstructured.metricsr   +unstructured.metrics.table.table_extractionr   r   r   r   unstructured.partition.autor   r   markparametrizer7   rB   rF   rq   rt   r|   paramr   r   r   r,   r   r   r+   r    r   <module>r      s	   				  0 0 0 0 0 0            2 1 1 1 1 1Z Z Zz R*) ) ) )4 ' ' '* *V W* *V6  B  ?a!AaJJ	

 >q!11MM	

 5QQq1aWXYY	

 C 	
"JQQ  Q	
    q ! A Q   a q  a!	
;36 6n: :o6 6n:   " "F5 5G" "F5
 9   < = < -aaaJGGaaaDAA
  aJGGaDAA B
	
 
	
 
	
 	aaaJGGaaaJGGaaaJGGaaaDAAaaaDAAaaaDAAaaaDAAaaaDAAaaaDAA
  aJGGaJGGaJGGaDAAaDAAaDAAaDAAaDAAaDAA
 B/	
 	
 	
D 	 (  (  '  '  (  '  '  )  ( s@F "#!"(  "#!"(  "#!"'  "#!"'  "#!"(  "#!"'  "#!"'  "#!")  "#!"( S.^ 2cr	
 r	
 r	
]ad dJU UKd dJU +  aJGGaDAA B'	
 	
 	
* 	0  aJGGaJGGaJGGaDAAaDAAaDAAaDAAaDAAaDAA
 BG$	
 $	
 $	
\ 	6 "#!"(  "#!"(  "#!"'  "#!"'  "#!"(  "#!"'  "#!"'  "#!")  "#!"( S.^ 2SJ	
 J	
 J	
IOR RfS SgR RfSF F FS S    
 ? ? ?B BF2 2GB BF2 2 2r    