
    Ng0                    j   d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ dZej         !                    dde
efdee	fdeefg          d+d            Z"ej         !                    dddg          d,d            Z#d Z$d Z%ej         !                    dde
efdee	fg          d+d            Z&d Z'd-d Z(d-d!Z)d" Z*d# Z+ej         !                    dddg          d.d$            Z,d% Z-d& Z.d' Z/d( Z0 G d) d*          Z1dS )/    )annotationsN)MockFixture)EXPECTED_TABLE"EXPECTED_TABLE_SEMICOLON_DELIMITEREXPECTED_TABLE_WITH_EMOJIEXPECTED_TEXT!EXPECTED_TEXT_SEMICOLON_DELIMITEREXPECTED_TEXT_WITH_EMOJIEXPECTED_TEXT_XLSX)FixtureRequestMockassert_round_trips_through_JSONexample_doc_pathfunction_mock)chunk_by_title)clean_extra_whitespace)Table)_CsvPartitioningContextpartition_csv)#UNSTRUCTURED_INCLUDE_DEBUG_METADATAztext/csv)filenameexpected_textexpected_tablestanley-cups.csvzstanley-cups-with-emoji.csvztable-semicolon-delimiter.csvr   strr   r   c                
   d|  }t          |          }t          |d         j                  |k    sJ |d         j        j        |k    sJ |d         j        j        t          k    sJ |d         j        j        | k    sJ d S )Nexample-docs/r   r   )r   r   textmetadatatext_as_htmlfiletypeEXPECTED_FILETYPEr   )r   r   r   f_pathelementss        `/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/partition/test_csv.py test_partition_csv_from_filenamer'   #   s     (X''Ff---H!(1+"233}DDDDA;,>>>>A;(,=====A;(H444444    infer_table_structureTFboolc                    d}t          ||           }t          |d         j        d          o|d         j        j        d u}|| k    sJ d S )Nexample-docs/stanley-cups.csv)r   r)   r   r!   )r   hasattrr    r!   )r)   r$   r%   $table_element_has_text_as_html_fields       r&   6test_partition_csv_from_filename_infer_table_structurer/   9   sf    ,FfDYZZZH 	$n55 	:QK -T9 ) 03HHHHHHHr(   c                     t          t          d          d          } t          | d         j                  t          k    sJ | d         j        j        dk    sJ d S )Nr   test)metadata_filenamer   )r   r   r   r   r   r    r   r%   s    r&   7test_partition_csv_from_filename_with_metadata_filenamer4   E   s`    -.@AAU[\\\H!(1+"233}DDDDA;(F222222r(   c                     t          t          d          d          } t          | d         j                  t          k    sJ d S )Nzstanley-cups-utf-16.csvzutf-16)encodingr   )r   r   r   r   r   r3   s    r&    test_partition_csv_with_encodingr7   L   sE    -.GHHS[\\\H!(1+"233}DDDDDDr(   c                   d|  }t          |d          5 }t          |          }d d d            n# 1 swxY w Y   t          |d         j                  |k    sJ t	          |d         t
                    sJ |d         j        j        |k    sJ |d         j        j        t          k    sJ |d         j        j
        J t          rd |D             dhk    sJ d S d S )Nr   rbfiler   c                &    h | ]}|j         j        S  )r    detection_origin).0elements     r&   	<setcomp>z/test_partition_csv_from_file.<locals>.<setcomp>c   s    JJJg 1JJJr(   csv)openr   r   r   
isinstancer   r    r!   r"   r#   r   r   )r   r   r   r$   fr%   s         r&   test_partition_csv_from_filerF   R   sF    (X''F	fd		 )q a((() ) ) ) ) ) ) ) ) ) ) ) ) ) )!(1+"233}DDDDhqk5)))))A;,>>>>A;(,=====A;(000* VJJJJJugUUUUV VUUs   377c                    t          t          d          d          5 } t          | d          }d d d            n# 1 swxY w Y   t          |d         j                  t
          k    sJ |d         j        j        dk    sJ d S )Nr   r9   r1   )r;   r2   r   )rC   r   r   r   r   r   r    r   rE   r%   s     r&   3test_partition_csv_from_file_with_metadata_filenamerI   f   s    	122D	9	9 CQ a6BBBC C C C C C C C C C C C C C C "(1+"233}DDDDA;(F222222s   <A A mockerr   c                    d}|                      d|           t          t          d                    }|d         j        j        |k    sJ d S )N2029-07-05T09:24:281unstructured.partition.csv.get_last_modified_datereturn_valuer   r   patchr   r   r    last_modified)rJ   filesystem_last_modifiedr%   s      r&   Dtest_partition_csv_from_file_path_gets_last_modified_from_filesystemrT   q   sd    4
LL;-    
 -.@AABBHA;-1IIIIIIIr(   c                    d}d}|                      d|           t          t          d          |          }|d         j        j        |k    sJ d S )NrL   2020-07-05T09:24:28rM   rN   r   )metadata_last_modifiedr   rP   )rJ   rS   rW   r%   s       r&   @test_partition_csv_from_file_path_prefers_metadata_last_modifiedrX   }   sx    42
LL;Jb     +,,E[  H A;-1GGGGGGGr(   c                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   |d         j        j        J d S )Nr   r9   r:   r   rC   r   r   r    rR   rH   s     r&   4test_partition_csv_from_file_gets_last_modified_Noner[      s    	122D	9	9 )Q a((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) A;-55555s   ;??c                     d} t          t          d          d          5 }t          ||           }d d d            n# 1 swxY w Y   |d         j        j        | k    sJ d S )NrV   r   r9   )r;   rW   r   rZ   )rW   rE   r%   s      r&   ;test_partition_csv_from_file_prefers_metadata_last_modifiedr]      s    2	122D	9	9 XQ a@VWWWX X X X X X X X X X X X X X X A;-1GGGGGGGs   >AAc                ^    t          t          |                     }t          |           d S )Nr   )r   r   r   r   r%   s     r&   test_partition_csv_with_jsonr`      s/    &6x&@&@AAAH#H-----r(   c                     d} t          |           }t          | dddd          }t          |dd          }||k    sJ ||k    sJ d S )	Nr,   r   by_title	   r   F)chunking_strategymax_characterscombine_text_under_n_charsinclude_header)re   rf   )r   r   )r   r%   chunk_elementschunkss       r&   7test_add_chunking_strategy_to_partition_csv_non_defaultrj      sy    .Hh///H"$#$  N HQSTUUUFX%%%%V######r(   c                 `    d} t          | dd          }|d         j        j        dgk    sJ d S )Nr,   fastF)r   strategyrg   r   engr   r    	languagesr_   s     r&   1test_partition_csv_element_metadata_has_languagesrq      s?    .HhPUVVVHA;)eW444444r(   c                 d    d} t          | ddgd          }|d         j        j        dgk    sJ d S )Nr,   rl   deuF)r   rm   rp   rg   r   ro   r_   s     r&   )test_partition_csv_respects_languages_argrt      sK    .HFuge  H A;)eW444444r(   c                     t          t          d          dd          } | d         }|j        dt          z   k    sJ |j        j        J d S )Nr   rl   T)rm   rg   r   z#Stanley Cups Unnamed: 1 Unnamed: 2 )r   r   r   r   r    r!   )r%   tables     r&   test_partition_csv_headerrw      sb    +,,vd  H QKE:>ASSSSSS>&22222r(   c                     e Zd ZdZd Zd Zej                            dddg          dd            Z	d	 Z
d
 Zej                            dddg          d d            Zd!dZd Zd Zd Zd Z ej                    d"d            ZdS )#Describe_CsvPartitioningContextzIUnit-test suite for `unstructured.partition.csv._CsvPartitioningContext`.c                    t          j        t          d          d d dd          }t          |t                     sJ d S )Nr   T	file_pathr;   r6   rg   r)   )r   loadr   rD   selfctxs     r&   .it_provides_a_validating_alternate_constructorzNDescribe_CsvPartitioningContext.it_provides_a_validating_alternate_constructor   sP    %*&'9::"&
 
 
 #67777777r(   c                    t          j        t          d          5  t          j        d d d dd           d d d            d S # 1 swxY w Y   d S )N1either file-path or file-like object must be provmatchTr{   )pytestraises
ValueErrorr   r}   r   s    r&   ;and_the_validating_constructor_raises_on_an_invalid_contextz[Describe_CsvPartitioningContext.and_the_validating_constructor_raises_on_an_invalid_context   s    ]:-`aaa 	 	#(#&*   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA
A	file_namer   zcsv-with-long-lines.csvr   c                X    t          t          |                    }|j        dk    sJ d S )N,r   r   	delimiter)r   r   r   s      r&   <it_auto_detects_the_delimiter_for_a_comma_delimited_CSV_filez\Describe_CsvPartitioningContext.it_auto_detects_the_delimiter_for_a_comma_delimited_CSV_file   s3     &&6y&A&ABB}######r(   c                X    t          t          d                    }|j        dk    sJ d S )Nzsemicolon-delimited.csv;r   r~   s     r&   Dand_it_auto_detects_the_delimiter_for_a_semicolon_delimited_CSV_filezdDescribe_CsvPartitioningContext.and_it_auto_detects_the_delimiter_for_a_semicolon_delimited_CSV_file   s2    %&67P&Q&QRR}######r(   c                P    t          t          d                    }|j        J d S )Nzsingle-column.csvr   r~   s     r&   Abut_it_returns_None_as_the_delimiter_for_a_single_column_CSV_filezaDescribe_CsvPartitioningContext.but_it_returns_None_as_the_delimiter_for_a_single_column_CSV_file   s-    %&67J&K&KLL}$$$$$r(   )rg   expected_value)FN)Tr   rg   r*   r   
int | Nonec                <    t          |          j        |k    sJ d S )N)rg   )r   header)r   rg   r   s      r&   8it_identifies_the_header_row_based_on_include_header_argzXDescribe_CsvPartitioningContext.it_identifies_the_header_row_based_on_include_header_arg  s*     'nEEELP^^^^^^^r(   get_last_modified_date_r   c                    d}||_         t          d          }|j        }|                    d           ||k    sJ d S )Nz2024-08-04T02:23:53za/b/document.csv)r|   )rO   r   rR   assert_called_once_with)r   r   rS   r   rR   s        r&   Ait_gets_last_modified_from_the_filesystem_when_a_path_is_providedzaDescribe_CsvPartitioningContext.it_gets_last_modified_from_the_filesystem_when_a_path_is_provided  sY     $9 /G,%0BCCC)778JKKK 8888888r(   c                d    t          j        d          }t          |          }|j        }|J d S )Ns   abcdefgr:   )ioBytesIOr   rR   )r   r;   r   rR   s       r&   Sand_it_falls_back_to_None_for_the_last_modified_date_when_file_path_is_not_providedzsDescribe_CsvPartitioningContext.and_it_falls_back_to_None_for_the_last_modified_date_when_file_path_is_not_provided  s;    z*%%%4000)$$$$$r(   c                   t          t          d          d          5 }|                                 t          |          }|                                 5 }||u sJ |                                dk    sJ |                    d          dk    sJ |                                dk    sJ 	 d d d            n# 1 swxY w Y   |                                dk    sJ 	 d d d            d S # 1 swxY w Y   d S )Nr   r9   r:   r         Stanley Cups,,)rC   r   readr   tell)r   rE   r   r;   s       r&   Oit_provides_transparent_access_to_the_source_file_when_it_is_a_file_like_objectzoDescribe_CsvPartitioningContext.it_provides_transparent_access_to_the_source_file_when_it_is_a_file_like_object$  sh   "#566== 	!FFHHH)q111C &tqyyyyvvxx1}}}}yy}}(99999vvxx2~~~~~& & & & & & & & & & & & & & & 6688q=====	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	! 	!s6   9C-AB:.C-:B>	>C-B>	C--C14C1c                    t          t          d                    }|                                5 }|                    d          dk    sJ 	 d d d            d S # 1 swxY w Y   d S )Nr   r   r   )r   r   rC   r   )r   r   r;   s      r&   Hit_provides_transparent_access_to_the_source_file_when_it_is_a_file_pathzhDescribe_CsvPartitioningContext.it_provides_transparent_access_to_the_source_file_when_it_is_a_file_path3  s    %&67I&J&JKKXXZZ 	6499R==$555555	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6 	6s   AA"Ac                    t          j        t          d          5  t                                                       d d d            d S # 1 swxY w Y   d S )Nr   r   )r   r   r   r   	_validater   s    r&   5it_raises_when_neither_file_path_nor_file_is_providedzUDescribe_CsvPartitioningContext.it_raises_when_neither_file_path_nor_file_is_provided:  s    ]:-`aaa 	2 	2#%%//111	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2s   !A

AArequestr   returnc                "    t          |d          S )NrM   )r   )r   r   s     r&   r   z7Describe_CsvPartitioningContext.get_last_modified_date_@  s    W&YZZZr(   N)r   r   )rg   r*   r   r   )r   r   )r   r   r   r   )__name__
__module____qualname____doc__r   r   r   markparametrizer   r   r   r   r   r   r   r   r   fixturer   r=   r(   r&   ry   ry      s[       SS8 8 8   [ &		
 $ $ $ $$ $ $% % % [AMS\C]^^_ _ _ _^_
9 
9 
9 
9% % %! ! !6 6 62 2 2 V^[ [ [ [ [ [r(   ry   )r   r   r   r   r   r   )r)   r*   )rJ   r   )r   r   )2
__future__r   r   r   pytest_mockr   *test_unstructured.partition.test_constantsr   r   r   r   r	   r
   r   test_unstructured.unit_utilsr   r   r   r   r   unstructured.chunking.titler   unstructured.cleaners.corer   unstructured.documents.elementsr   unstructured.partition.csvr   r   &unstructured.partition.utils.constantsr   r#   r   r   r'   r/   r4   r7   rF   rI   rT   rX   r[   r]   r`   rj   rq   rt   rw   ry   r=   r(   r&   <module>r      sx   # " " " " " 				  # # # # # #                               7 6 6 6 6 6 = = = = = = 1 1 1 1 1 1 M M M M M M M M V V V V V V  3	]N;	&(@B[\+-.	
 5 5 5 5 04-@@I I I A@I3 3 3E E E 3	]N;	&(@B[\ 
V 
V 
V 
V3 3 3	J 	J 	J 	JH H H H6 6 6H H H &8:W%XYY. . . ZY.
$ $ $$5 5 55 5 53 3 3n[ n[ n[ n[ n[ n[ n[ n[ n[ n[r(   