
    Ng9                       d dl mZ d dlZd dlZd dlmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZ d dlmZ  ed           ed           ed           ed           ed           ed          gZd                    dd          ZdZej         !                    dg d          dPd            Z"d Z#ej         !                    dg d          dQd             Z$ej         !                    d!d"d#e%fd$d#e&fg          dRd'            Z'd( Z(d) Z)ej         !                    dg d          dQd*            Z*d+ Z+ej         !                    dg d          dQd,            Z,d- Z-d. Z.d/ Z/d0 Z0d1 Z1d2 Z2d3 Z3d4 Z4d5 Z5d6 Z6d7 Z7d8 Z8d9 Z9d: Z:d; Z;dSd>Z<d? Z=d@ Z>dSdAZ?dB Z@dC ZAdD ZBdE ZCej         !                    dFg d          dTdI            ZDdJ ZEdK ZFdL ZGdM ZHdN ZIdO ZJdS )U    )annotationsN)OptionalType)MockerFixture)assert_round_trips_through_JSONexample_doc_path)chunk_by_title)group_broken_paragraphs)AddressListItemNarrativeTextTitle)FileTypepartition_text)#UNSTRUCTURED_INCLUDE_DEBUG_METADATAz.This is a test document to use for unit tests.textzDoylestown, PA 18901zImportant points:zHamburgers are deliciouszDogs are the bestzI love fuzzy blanketszThis is a story. This is a story that doesn't matter
 because it is just being used as an example. Hi. Hello. Howdy. Hola.
 The example is simple and repetitive and long and somewhat boring,
 but it serves a purpose. End.
 zThis is a story.

This is a story that doesn't matter because it is just being used as an example.

Hi.

Hello.

Howdy.

Hola.

The example is simple and repetitive and long and somewhat boring, but it serves a purpose.

End.
filenameencoding))fake-text.txtutf-8r   N)fake-text-utf-16-be.txtz	utf-16-ber   strr   Optional[str]c                    t          t          |           |          }t          |          dk    sJ |t          k    sJ |D ]}|j        j        | k    sJ t          rd |D             dhk    sJ d S d S )Nr   r   c                &    h | ]}|j         j        S  )metadatadetection_origin.0elements     a/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/partition/test_text.py	<setcomp>z4test_partition_text_from_filename.<locals>.<setcomp>G   s    JJJg 1JJJ    r   )r   r   lenEXPECTED_OUTPUTr$   r   r   )r   r   elementsr(   s       r)   !test_partition_text_from_filenamer/   7   s     .x888LLLHx==1&&&& 5 5(H44444* WJJJJJvhVVVVW WVVr+   c                     t          t          d          dd          } | t          k    sJ | D ]}|j        j        dk    sJ d S )Nr   r   test)r   metadata_filename)r   r   r-   r$   r   r.   r(   s     r)   8test_partition_text_from_filename_with_metadata_filenamer4   J   sk    ))Gv  H &&&& 3 3(F222223 3r+   )zfake-text-utf-16.txtzfake-text-utf-16-le.txtzfake-text-utf-32.txtc                    t          t          |                     }t          |          dk    sJ |t          k    sJ |D ]}|j        j        | k    sJ d S )Nr   )r   r   r,   r-   r$   r   )r   r.   r(   s      r)   2test_partition_text_from_filename_default_encodingr6   T   su    
 .x8899Hx==1&&&& 5 5(H444445 5r+   r   r   errorr   zutf-16r   r8   Type[BaseException]c                    t          j        |          5  t          |           } t          | |           d d d            d S # 1 swxY w Y   d S )Nr   )pytestraisesr   r   r7   s      r)   7test_partition_text_from_filename_raises_econding_errorr=   a   s     
u		 = =#H--8<<<<= = = = = = = = = = = = = = = = = =s   !AA
Ac                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          |          dk    sJ |t          k    sJ |D ]}|j        j        J d S Nr   rbfiler   openr   r   r,   r-   r$   r   fr.   r(   s      r)   test_partition_text_from_filerG   r       	//	6	6 *!!q)))* * * * * * * * * * * * * * * x==1&&&& 1 1(00001 1   ;??c                    t          d          } t          | d          5 }t          |d          }d d d            n# 1 swxY w Y   t          |          dk    sJ |t          k    sJ |D ]}|j        j        dk    sJ d S )Nr   r@   r1   rB   r2   r   )r   rD   r   r,   r-   r$   r   r   rF   r.   r(   s       r)   4test_partition_text_from_file_with_metadata_filenamerM   |   s    00H	h		 D!qFCCCD D D D D D D D D D D D D D Dx==1&&&& 3 3(F222223 3s   >AAc                    t          t          |           d          5 }t          |          }d d d            n# 1 swxY w Y   t          |          dk    sJ |t          k    sJ |D ]}|j        j        J d S Nr@   rA   r   rC   rL   s       r)   .test_partition_text_from_file_default_encodingrP      s    
 
x(($	/	/ *1!q)))* * * * * * * * * * * * * * *x==1&&&& 1 1(00001 1rI   c                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          |          dk    sJ |t          k    sJ |D ]}|j        j        J d S r?   rC   rE   s      r)   #test_partition_text_from_bytes_filerR      rH   rI   c                    t          t          |           d          5 }t          |          }d d d            n# 1 swxY w Y   t          |          dk    sJ |t          k    sJ |D ]}|j        j        J d S rO   rC   rL   s       r)   4test_partition_text_from_bytes_file_default_encodingrT      s    
 
x(($	/	/ *1!q)))* * * * * * * * * * * * * * * x==1&&&& 1 1(00001 1rI   c                 x    t          t          d          ddg          } | d         j        j        dgk    sJ d S )Nbook-war-and-peace-1p.txtfasten)strategy	languagesr   engr   r   r$   rZ   r.   s    r)   <test_text_partition_element_metadata_user_provided_languagesr^      sO    455SWRX  H A;)eW444444r+   c                     t          t          d                    5 } |                                 }d d d            n# 1 swxY w Y   t          |          }t	          |          dk    sJ |t
          k    sJ |D ]}|j        j        J d S )Nr   r   r   )rD   r   readr   r,   r-   r$   r   )rF   r   r.   r(   s       r)   test_partition_text_from_textra      s    	//	0	0 Avvxx               4(((Hx==1&&&& 1 1(00001 1   >AAc                 2    t          d          g k    sJ d S )Nr   r   r   r#   r+   r)   5test_partition_text_from_text_works_with_empty_stringrd      s$    r"""b((((((r+   c                     t          j        t                    5  t                       d d d            d S # 1 swxY w Y   d S N)r;   r<   
ValueErrorr   r#   r+   r)   .test_partition_text_raises_with_none_specifiedrh      s    	z	"	"                   s   6::c                     t          d          } t          |           5 }|                                }d d d            n# 1 swxY w Y   t          j        t
                    5  t          | |           d d d            d S # 1 swxY w Y   d S )Nr   )r   r   )r   rD   r`   r;   r<   rg   r   )r   rF   r   s      r)   2test_partition_text_raises_with_too_many_specifiedrj      s   00H	h 1vvxx               
z	"	" 5 5t44445 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5s#   A  AA$BB
Bc                     d} t          |           }|t          d          t          d          gk    sJ |D ]}|j        j        J d S )Nz6
    VERY IMPORTANT MEMO
    DOYLESTOWN, PA 18901
    r   zVERY IMPORTANT MEMOzDOYLESTOWN, PA 18901)r   r   r   r$   r   r   r.   r(   s      r)   <test_partition_text_captures_everything_even_with_linebreaksrm      s    D 4(((H()))+,,,      1 1(00001 1r+   c                     d} t          | t                    }|t          d          t          d          gk    sJ |D ]}|j        j        J d S )NzYThe big brown fox
was walking down the lane.

At the end of the lane,
the fox met a bear.)r   paragraph_grouperz,The big brown fox was walking down the lane.r   z+At the end of the lane, the fox met a bear.)r   r
   r   r$   r   rl   s      r)   ,test_partition_text_groups_broken_paragraphsrp      s    	 	 4;RSSSHIJJJHIII      1 1(00001 1r+   c                     t          t          d                    } t          |           dk    sJ | d         j                            d          sJ | d         j                            d          sJ d S )Nnorwich-city.txtr   zIwan RobertszExternal links)r   r   r,   r   
startswithendswithr]   s    r)   $test_partition_text_splits_long_textrv      sy    ./ABBCCHx==1A;&&~66666B<%%&67777777r+   c                     d} t          |           }t          |          dk    sJ |d         j        | k    sJ t          |d         t                    rJ d S )Nz--------------------r      r   )r   r,   r   
isinstancer   )r   r.   s     r)   *test_partition_text_doesnt_get_page_breaksrz      sf    !D4(((Hx==AA;t####(1+x0000000r+   c                     t          t          d                    } t          d | D                       sJ t          d | D                       sJ d S )Nr   c              3  6   K   | ]}|j         j        d k    V  dS r   r$   r   r'   es     r)   	<genexpr>zZtest_partition_text_from_filename_gets_filename_metadata_from_file_path.<locals>.<genexpr>  ,      HH!qz"o5HHHHHHr+   c              3  P   K   | ]!}|j         j        t          d           k    V  "dS )r   N)r$   file_directoryr   r~   s     r)   r   zZtest_partition_text_from_filename_gets_filename_metadata_from_file_path.<locals>.<genexpr>  s5      SSQqz(,<R,@,@@SSSSSSr+   r   r   allr]   s    r)   Gtest_partition_text_from_filename_gets_filename_metadata_from_file_pathr     sd    .??@@HHHxHHHHHHHHSS(SSSSSSSSSSr+   c                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          d |D                       sJ t          d |D                       sJ d S )Nr   r@   rA   c              3  2   K   | ]}|j         j        d u V  d S rf   r}   r~   s     r)   r   zLtest_partition_text_from_file_gets_filename_metadata_None.<locals>.<genexpr>  s,      ==qqz"d*======r+   c              3  2   K   | ]}|j         j        d u V  d S rf   r$   r   r~   s     r)   r   zLtest_partition_text_from_file_gets_filename_metadata_None.<locals>.<genexpr>  s,      CCQqz(D0CCCCCCr+   rD   r   r   r   rF   r.   s     r)   9test_partition_text_from_file_gets_filename_metadata_Noner   
  s    	//	6	6 *!!q)))* * * * * * * * * * * * * * * ==H========CC(CCCCCCCCCCrI   c                     t          t          d          d          } t          d | D                       sJ t          d | D                       sJ d S )Nr   z	a/b/c.txt)r2   c              3  6   K   | ]}|j         j        d k    V  dS )zc.txtNr}   r~   s     r)   r   zNtest_partition_text_from_filename_prefers_metadata_filename.<locals>.<genexpr>  ,      @@!qz"g-@@@@@@r+   c              3  6   K   | ]}|j         j        d k    V  dS )za/bNr   r~   s     r)   r   zNtest_partition_text_from_filename_prefers_metadata_filename.<locals>.<genexpr>  ,      DDaqz(E1DDDDDDr+   r   r]   s    r)   ;test_partition_text_from_filename_prefers_metadata_filenamer     sj    .??S^___H@@x@@@@@@@@DD8DDDDDDDDDDr+   c                     t          t          d          d          5 } t          | d          }d d d            n# 1 swxY w Y   t          d |D                       sJ t          d |D                       sJ d S )Nr   r@   z	d/e/f.txtrK   c              3  6   K   | ]}|j         j        d k    V  dS )zf.txtNr}   r~   s     r)   r   zJtest_partition_text_from_file_prefers_metadata_filename.<locals>.<genexpr>  r   r+   c              3  6   K   | ]}|j         j        d k    V  dS )zd/eNr   r~   s     r)   r   zJtest_partition_text_from_file_prefers_metadata_filename.<locals>.<genexpr>  r   r+   r   r   s     r)   7test_partition_text_from_file_prefers_metadata_filenamer     s    	//	6	6 I!!qKHHHI I I I I I I I I I I I I I I @@x@@@@@@@@DD8DDDDDDDDDDs   <A A c                     dt          t          d                    } t          fd| D                       s-J d dt          | d         j        j                               d S )Nz
text/plainr   c              3  8   K   | ]}|j         j        k    V  d S rf   r$   filetype)r'   r   TXT_MIME_TYPEs     r)   r   zRtest_partition_text_gets_the_TXT_MIME_type_in_metadata_filetype.<locals>.<genexpr>'  s-      FFqz"m3FFFFFFr+   zExpected all elements to have 'z' as their filetype, but got: r   )r   r   r   reprr$   r   )r.   r   s    @r)   ?test_partition_text_gets_the_TXT_MIME_type_in_metadata_filetyper   $  s     M.??@@HFFFFXFFFFF  	2- 	2 	2!%.//	2 	2    r+   c                     t          t          d          t          j                  } t	          d | D                       s*J dt          | d         j        j                               d S )Nz	README.md)metadata_file_typec              3  6   K   | ]}|j         j        d k    V  dS )ztext/markdownNr   r~   s     r)   r   zAtest_partition_text_prefers_metadata_file_type.<locals>.<genexpr>/  r   r+   zJExpected all elements to have 'text/markdown' as their filetype, but got: r   )r   r   r   MDr   r   r$   r   r]   s    r)   .test_partition_text_prefers_metadata_file_typer   -  s    .{;;PXP[\\\HHHxHHHHH  	2!%.//	2 	2    r+   mockerr   c                    d|                      d           t          t          d                    }t          fd|D                       sJ d S )N2029-07-05T09:24:282unstructured.partition.text.get_last_modified_datereturn_valuer   c              3  8   K   | ]}|j         j        k    V  d S rf   r$   last_modified)r'   r   filesystem_last_modifieds     r)   r   zXtest_partition_text_from_file_path_gets_last_modified_from_filesystem.<locals>.<genexpr>@  s.      VVqz'+CCVVVVVVr+   patchr   r   r   )r   r.   r   s     @r)   Etest_partition_text_from_file_path_gets_last_modified_from_filesystemr   8  sq    4
LL<Kc     .??@@HVVVVXVVVVVVVVVVr+   c                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          d |D                       sJ d S )Nr   r@   rA   c              3  2   K   | ]}|j         j        d u V  d S rf   r   r~   s     r)   r   zHtest_partition_text_from_file_gets_last_modified_None.<locals>.<genexpr>G  ,      BBAqz'4/BBBBBBr+   r   r   s     r)   5test_partition_text_from_file_gets_last_modified_Noner   C  s    	//	6	6 *!!q)))* * * * * * * * * * * * * * * BBBBBBBBBBBBrI   c                     t          t          d                    5 } |                                 }d d d            n# 1 swxY w Y   t          |          }t	          d |D                       sJ d S )Nr   r   c              3  2   K   | ]}|j         j        d u V  d S rf   r   r~   s     r)   r   zHtest_partition_text_from_text_gets_last_modified_None.<locals>.<genexpr>P  r   r+   rD   r   r`   r   r   )rF   r   r.   s      r)   5test_partition_text_from_text_gets_last_modified_Noner   J  s    	//	0	0 Avvxx               4(((HBBBBBBBBBBBBrb   c                    d}d|                      d|           t          t          d                    }t          fd|D                       sJ d S )Nr   2020-07-05T09:24:28r   r   r   )metadata_last_modifiedc              3  8   K   | ]}|j         j        k    V  d S rf   r   r'   r   r   s     r)   r   zTtest_partition_text_from_file_path_prefers_metadata_last_modified.<locals>.<genexpr>^  .      TTaqz'+AATTTTTTr+   r   )r   r   r.   r   s      @r)   Atest_partition_text_from_file_path_prefers_metadata_last_modifiedr   S  s    42
LL<Kc     ))BX  H TTTT8TTTTTTTTTTr+   c                     dt          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          fd|D                       sJ d S )Nr   r   r@   )rB   r   c              3  8   K   | ]}|j         j        k    V  d S rf   r   r   s     r)   r   zOtest_partition_text_from_file_prefers_metadata_last_modified.<locals>.<genexpr>f  r   r+   r   )rF   r.   r   s     @r)   <test_partition_text_from_file_prefers_metadata_last_modifiedr   a  s    2	//	6	6 Y!!qAWXXXY Y Y Y Y Y Y Y Y Y Y Y Y Y Y TTTT8TTTTTTTTTTs   ?AAc                     dt          t          d                    5 } |                                 }d d d            n# 1 swxY w Y   t          |          }t	          fd|D                       sJ d S )Nr   r   )r   r   c              3  8   K   | ]}|j         j        k    V  d S rf   r   r   s     r)   r   zOtest_partition_text_from_text_prefers_metadata_last_modified.<locals>.<genexpr>p  r   r+   r   )rF   r   r.   r   s      @r)   <test_partition_text_from_text_prefers_metadata_last_modifiedr   i  s    2	//	0	0 Avvxx               4@VWWWHTTTT8TTTTTTTTTTs   AAAc                 N    d t          d          D             } | g dk    sJ d S )Nc                    g | ]	}|j         
S r#   )idr&   s     r)   
<listcomp>zYtest_Text_element_assigns_id_hashes_that_are_unique_and_deterministic.<locals>.<listcomp>w  s    
P
P
P'7:
P
P
Pr+   hello
hello
hellor   ) 8657c0ec31a4cfc822f6cd4a5684cafd 72aefb4a12be063ad160931fdb380163 ba8c1a216ca585aecdd365a72e6124f1r   )idss    r)   Etest_Text_element_assigns_id_hashes_that_are_unique_and_deterministicr   v  sQ    
P
P^9N%O%O%O
P
P
PC         r+   c                     t          dd          } | D ]E}t          j        |j        d          sJ t	          j        |                                           Fd S )Nr   T)r   unique_element_ids   )version)r   uuidUUIDr   jsondumpsto_dictr3   s     r)   >test_Text_element_assings_UUID_when_unique_element_ids_is_Truer     sj    #8TRRRH & &yQ////// 	
7??$$%%%%	& &r+   )	file_namer   r   
str | Nonec                `    t          t          |           |          }t          |           d S )Nr!   )r   r   r   )r   r   r.   s      r)   test_partition_text_with_jsonr     s3     .y99HMMMH#H-----r+   c                     t          d          } t          |           }t          | d          }t          |          }||k    sJ ||k    sJ d S )NrV   )r   by_title)chunking_strategy)r   r   r	   )r   r.   chunk_elementschunkss       r)   ,test_add_chunking_strategy_on_partition_textr     sg     ;<<Hx000H#H
KKKNH%%FX%%%%V######r+   c                 p    t          t          d                    } | d         j        j        dgk    sJ d S )Nrr   r   r[   r\   r]   s    r)   2test_partition_text_element_metadata_has_languagesr     s<    ./ABBCCHA;)eW444444r+   c                     t          t          d          d          } d | D             }|dgddgdgdgdggk    sJ d S )Nzlanguage-docs/eng_spa_mult.txtTdetect_language_per_elementc                &    g | ]}|j         j        S r#   r$   rZ   r&   s     r)   r   zLtest_partition_text_respects_detect_language_per_element.<locals>.<listcomp>  s    @@@GW'@@@r+   r[   spa)r   r   r.   langss     r)   8test_partition_text_respects_detect_language_per_elementr     si    9::X\  H A@x@@@EeWuenugwHHHHHHHr+   c                 v    t          t          d          dg          } | d         j        j        dgk    sJ d S )Nrr   deurZ   r   r\   r]   s    r)   *test_partition_text_respects_languages_argr     sC    ./ABBugVVVHA;)eW444444r+   c                     t          j        t                    5  t          t	          d          d           d d d            d S # 1 swxY w Y   d S )Nrr   r[   r   )r;   r<   	TypeErrorr   r   r#   r+   r)   5test_partition_text_element_metadata_raises_TypeErrorr     s    	y	!	! N N'(:;;uMMMMN N N N N N N N N N N N N N N N N Ns   AA
A
c                     t          t          d          d          } d | D             }t          |          dk    sJ d S )Nz(language-docs/UDHR_first_article_all.txtTr   c                J    g | ] }|j         j        |j         j        d          !S )r   r   r~   s     r)   r   zEtest_partition_text_detects_more_than_3_languages.<locals>.<listcomp>  s,    OOO!*:NOQZ!!$OOOr+   
   )r   r   r,   r   s     r)   1test_partition_text_detects_more_than_3_languagesr     sR    CDD$(  H POhOOOEu::??????r+   )r   r   r   r   )r   r   )r   r   r   r   r8   r9   )r   r   )r   r   r   r   )K
__future__r   r   r   typingr   r   r;   pytest_mockr   test_unstructured.unit_utilsr   r   unstructured.chunking.titler	   unstructured.cleaners.corer
   unstructured.documents.elementsr   r   r   r   unstructured.file_utils.modelr   unstructured.partition.textr   &unstructured.partition.utils.constantsr   r-   replaceMIN_MAX_TEXTSHORT_PARAGRAPHSmarkparametrizer/   r4   r6   UnicodeDecodeErrorUnicodeErrorr=   rG   rM   rP   rR   rT   r^   ra   rd   rh   rj   rm   rp   rv   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r+   r)   <module>r     s\   # " " " " "   ! ! ! ! ! ! ! !  % % % % % % Z Z Z Z Z Z Z Z 6 6 6 6 6 6 > > > > > > S S S S S S S S S S S S 2 2 2 2 2 2 6 6 6 6 6 6 V V V V V V MGHHHG'(((	E"###H,---H%&&&H)***" #*'# #  $    W W W W3 3 3 OOO 5 5 5	 5 %	($67	"Hl; = = = =1 1 13 3 3 OOO 1 1 1	 11 1 1 OOO 1 1 1	 15 5 5	1 	1 	1) ) )  
5 5 51 1 11 1 1&8 8 81 1 1T T TD D DE E EE E E    W W W WC C CC C CU U U UU U UU U U  & & &    . . . .
$ $ $5 5 5
I I I5 5 5
N N N
    r+   