
    Ng4!                    8   d dl mZ d dlmZ d dlmZ d dlZd dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d Zd Zd Z G d d          Zd Zd Zd Zd Zd Zd Zd Z d Z!d Z"d Z#d'dZ$d Z%d Z&d'dZ'd  Z(d! Z)d" Z*d# Z+d$ Z,d% Z-d& Z.dS )(    )annotations)Any)patchN)MockFixture)assert_round_trips_through_JSONexample_doc_path)chunk_by_title)ElementTypeTitlepartition_md)#UNSTRUCTURED_INCLUDE_DEBUG_METADATAc                     t          d          } t          |           }t          |          dk    sJ dd |D             vsJ t          |d         t                    sJ t
          rd |D             dhk    sJ d S d S )N	README.mdfilenamer   	PageBreakc                    g | ]	}|j         
S  )category).0elems     _/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/partition/test_md.py
<listcomp>z3test_partition_md_from_filename.<locals>.<listcomp>   s    BBBt}BBB    c                &    h | ]}|j         j        S r   )metadatadetection_originr   elements     r   	<setcomp>z2test_partition_md_from_filename.<locals>.<setcomp>   s    JJJg 1JJJr   md)r   r   len
isinstancer   r   r   elementss     r   test_partition_md_from_filenamer'      s    ,,HX...Hx==1BBBBBBBBBhqk5)))))* UJJJJJtfTTTTU UTTr   c                     t          d          } t          | d          5 }t          |          }d d d            n# 1 swxY w Y   t          |          dk    sJ d S )Nr   rbfiler   )r   openr   r#   )r   fr&   s      r   test_partition_md_from_filer.      s    ,,H	h		 (Q'''( ( ( ( ( ( ( ( ( ( ( ( ( ( (x==1s   =AAc                    t          t          d                    5 } |                                 }d d d            n# 1 swxY w Y   t          |          }t	          |          dk    sJ t          d |D                       sJ d S )Nr   textr   c              3  2   K   | ]}|j         j        d u V  d S Nr   r   r   es     r   	<genexpr>z.test_partition_md_from_text.<locals>.<genexpr>*   ,      ==qqz"d*======r   )r,   r   readr   r#   allr-   r1   r&   s      r   test_partition_md_from_textr<   #   s    	{++	,	, vvxx               &&&Hx==1==H==========   >AAc                      e Zd Zi fd	dZdS )
MockResponser1   strstatus_codeintheadersdict[str, Any]c                F    || _         || _        |dk     | _        || _        d S )Ni,  )r1   rA   okrC   )selfr1   rA   rC   s       r   __init__zMockResponse.__init__.   s(    	&#r   N)r1   r@   rA   rB   rC   rD   )__name__
__module____qualname__rH   r   r   r   r?   r?   -   s0        NP       r   r?   c                    t          d          } t          |           5 }|                                }d d d            n# 1 swxY w Y   t          |dddi          }t	          j        t          d|          5 }t          d	          }d d d            n# 1 swxY w Y   t          |          d
k    sJ t          d |D                       sJ d S )Nr      Content-Typetext/markdownr1   rA   rC   getreturn_valuehttps://fake.urlurlr   c              3  2   K   | ]}|j         j        d u V  d S r3   r4   r5   s     r   r7   z-test_partition_md_from_url.<locals>.<genexpr>C   r8   r   )
r   r,   r9   r?   r   objectrequestsr   r#   r:   )r   r-   r1   response_r&   s         r   test_partition_md_from_urlr\   5   s\   ,,H	h 1vvxx               1  H
 
hH	=	=	= 8$67778 8 8 8 8 8 8 8 8 8 8 8 8 8 8 x==1==H==========s#   A  AA;BBBc                    t          d          } t          |           5 }|                                }d d d            n# 1 swxY w Y   t          |dddi          }t	          j        t          d|          5 }t          j        t                    5  t          d	           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )
Nr   i  rN   z	text/htmlrP   rQ   rR   rT   rU   r   r,   r9   r?   r   rX   rY   pytestraises
ValueErrorr   r   r-   r1   rZ   r[   s        r   6test_partition_md_from_url_raises_with_bad_status_coderc   F   s   ,,H	h 1vvxx               -  H
 
hH	=	=	= -FMR\D]D] - -+,,,,- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -G   A  AA;C
B2&C
2B6	6C
9B6	:C

CCc                    t          d          } t          |           5 }|                                }d d d            n# 1 swxY w Y   t          |dddi          }t	          j        t          d|          5 }t          j        t                    5  t          d	           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )
Nr   rM   rN   zapplication/jsonrP   rQ   rR   rT   rU   r^   rb   s        r   7test_partition_md_from_url_raises_with_bad_content_typerf   T   s   ,,H	h 1vvxx               !34  H
 
hH	=	=	= -FMR\D]D] - -+,,,,- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -rd   c                     t          j        t                    5  t                       d d d            d S # 1 swxY w Y   d S r3   )r_   r`   ra   r   r   r   r   ,test_partition_md_raises_with_none_specifiedrh   b   s    	z	"	"                   s   6::c                     t          d          } t          |           5 }|                                }d d d            n# 1 swxY w Y   t          j        t
                    5  t          | |           d d d            d S # 1 swxY w Y   d S )Nr   )r   r1   )r   r,   r9   r_   r`   ra   r   )r   r-   r1   s      r   0test_partition_md_raises_with_too_many_specifiedrj   g   s   ,,H	h 1vvxx               
z	"	" 3 3hT22223 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3s#   A  AA$BB
Bc                     t          t          d                    } t          |           dk    sJ t          d | D                       sJ d S )Nr   r   c              3  6   K   | ]}|j         j        d k    V  dS )r   Nr4   r5   s     r   r7   zRtest_partition_md_from_filename_gets_filename_from_filename_arg.<locals>.<genexpr>w   s,      DDaqz"k1DDDDDDr   r   r   r#   r:   r&   s    r   ?test_partition_md_from_filename_gets_filename_from_filename_argro   s   sX    ,[99::Hx==1DD8DDDDDDDDDDr   c                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          |          dk    sJ t	          d |D                       sJ d S )Nr   r)   r*   r   c              3  2   K   | ]}|j         j        d u V  d S r3   r4   r5   s     r   r7   zAtest_partition_md_from_file_gets_filename_None.<locals>.<genexpr>   r8   r   )r,   r   r   r#   r:   r-   r&   s     r   .test_partition_md_from_file_gets_filename_Noners   z   s    	{++T	2	2 (aQ'''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( x==1==H==========   ;??c                     t          t          d          d          } t          |           dk    sJ t          d | D                       sJ d S )Nr   orig-name.md)metadata_filenamer   c              3  6   K   | ]}|j         j        d k    V  dS rv   Nr4   r   s     r   r7   zLtest_partition_md_from_filename_prefers_metadata_filename.<locals>.<genexpr>   s-      SSww(N:SSSSSSr   rm   rn   s    r   9test_partition_md_from_filename_prefers_metadata_filenamerz      s]    ,[99^\\\Hx==1SS(SSSSSSSSSSr   c                     t          t          d          d          5 } t          | d          }d d d            n# 1 swxY w Y   t          d |D                       sJ d S )Nr   r)   rv   )r+   rw   c              3  6   K   | ]}|j         j        d k    V  dS ry   r4   r5   s     r   r7   zHtest_partition_md_from_file_prefers_metadata_filename.<locals>.<genexpr>   s,      GGqz"n4GGGGGGr   r,   r   r   r:   rr   s     r   5test_partition_md_from_file_prefers_metadata_filenamer~      s    	{++T	2	2 JaQ.IIIJ J J J J J J J J J J J J J J GGhGGGGGGGGGGs   <A A c                     dt          t          d                    } t          fd| D                       s-J d dt          | d         j        j                               d S )NrO   r   c              3  8   K   | ]}|j         j        k    V  d S r3   )r   filetype)r   r6   MD_MIME_TYPEs     r   r7   zOtest_partition_md_gets_the_MD_MIME_type_in_metadata_filetype.<locals>.<genexpr>   s-      EEqqz"l2EEEEEEr   zExpected all elements to have 'z' as their filetype, but got: r   )r   r   r:   reprr   r   )r&   r   s    @r   <test_partition_md_gets_the_MD_MIME_type_in_metadata_filetyper      s    "L,[99::HEEEEHEEEEE  	2, 	2 	2!%.//	2 	2    r   mockerr   c                    d|                      d           t          t          d                    }t          fd|D                       sJ d S )N2029-07-05T09:24:280unstructured.partition.md.get_last_modified_daterR   r   c              3  8   K   | ]}|j         j        k    V  d S r3   r   last_modified)r   r6   filesystem_last_modifieds     r   r7   zVtest_partition_md_from_file_path_gets_last_modified_from_filesystem.<locals>.<genexpr>   s.      VVqz'+CCVVVVVVr   r   r   r   r:   )r   r&   r   s     @r   Ctest_partition_md_from_file_path_gets_last_modified_from_filesystemr      sq    4
LL:Ia     ,[99::HVVVVXVVVVVVVVVVr   c                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          d |D                       sJ d S )Nr   r)   r*   c              3  2   K   | ]}|j         j        d u V  d S r3   r   r5   s     r   r7   zFtest_partition_md_from_file_gets_last_modified_None.<locals>.<genexpr>   ,      BBAqz'4/BBBBBBr   r}   rr   s     r   3test_partition_md_from_file_gets_last_modified_Noner      s    	{++T	2	2 (aQ'''( ( ( ( ( ( ( ( ( ( ( ( ( ( ( BBBBBBBBBBBBrt   c                     t          t          d                    5 } |                                 }d d d            n# 1 swxY w Y   t          |          }t	          d |D                       sJ d S )Nr   r0   c              3  2   K   | ]}|j         j        d u V  d S r3   r   r5   s     r   r7   zFtest_partition_md_from_text_gets_last_modified_None.<locals>.<genexpr>   r   r   r,   r   r9   r   r:   r;   s      r   3test_partition_md_from_text_gets_last_modified_Noner      s    	{++	,	, vvxx               &&&HBBBBBBBBBBBBr=   c                    d}d|                      d|           t          t          d                    }t          fd|D                       sJ d S )Nr   2020-07-05T09:24:28r   rR   r   )metadata_last_modifiedc              3  8   K   | ]}|j         j        k    V  d S r3   r   r   r6   r   s     r   r7   zRtest_partition_md_from_file_path_prefers_metadata_last_modified.<locals>.<genexpr>   .      TTaqz'+AATTTTTTr   r   )r   r   r&   r   s      @r   ?test_partition_md_from_file_path_prefers_metadata_last_modifiedr      s    42
LL:Ia     %%>T  H TTTT8TTTTTTTTTTr   c                     dt          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   t          fd|D                       sJ d S )Nr   r   r)   )r+   r   c              3  8   K   | ]}|j         j        k    V  d S r3   r   r   s     r   r7   zMtest_partition_md_from_file_prefers_metadata_last_modified.<locals>.<genexpr>   r   r   r}   )r-   r&   r   s     @r   :test_partition_md_from_file_prefers_metadata_last_modifiedr      s    2	{++T	2	2 WaQ?UVVVW W W W W W W W W W W W W W W TTTT8TTTTTTTTTTs   ?AAc                     dt          t          d                    5 } |                                 }d d d            n# 1 swxY w Y   t          |          }t	          fd|D                       sJ d S )Nr   r   )r1   r   c              3  8   K   | ]}|j         j        k    V  d S r3   r   r   s     r   r7   zMtest_partition_md_from_text_prefers_metadata_last_modified.<locals>.<genexpr>   r   r   r   )r-   r1   r&   r   s      @r   :test_partition_md_from_text_prefers_metadata_last_modifiedr      s    2	{++	,	, vvxx               >TUUUHTTTT8TTTTTTTTTTs   AAAc                     t          t          d                    5 } |                                 }d d d            n# 1 swxY w Y   t          |          }t	          |           d S )Nr   r0   )r,   r   r9   r   r   r;   s      r   test_partition_md_with_jsonr      s    	{++	,	, vvxx              &&&H#H-----r=   c                     t          d          } t          |           }t          | d          }t          |          }||k    sJ ||k    sJ d S )Nr   by_title)chunking_strategy)r   r   r	   )r   r&   chunk_elementschunkss       r   3test_add_chunking_strategy_by_title_on_partition_mdr      sc    ,,HH%%H!(jIIINH%%FX%%%%V######r   c                 \    d} t          |           }|d         j        j        dgk    sJ d S )Nzexample-docs/README.mdr   r   eng)r   r   	languagesr%   s     r   0test_partition_md_element_metadata_has_languagesr      s:    'HX...HA;)eW444444r   c                 j    d} t          | d          }d |D             }|dgddgdgdgdggk    sJ d S )Nz*example-docs/language-docs/eng_spa_mult.mdT)r   detect_language_per_elementc                &    g | ]}|j         j        S r   )r   r   r   s     r   r   zJtest_partition_md_respects_detect_language_per_element.<locals>.<listcomp>   s    @@@GW'@@@r   r   spar   )r   r&   langss      r   6test_partition_md_respects_detect_language_per_elementr      sZ    ;HX4PPPH@@x@@@EeWuenugwHHHHHHHr   c                     t          d          } t          |           }t          |          dk    sJ |d         j        t          j        k    sJ d S )Nzsimple-table.mdr   r   )r   r   r#   r   r
   TABLEr%   s     r   test_partition_md_parse_tabler      sX     122HX...Hx==1A;;#4444444r   )r   r   )/
__future__r   typingr   unittest.mockr   r_   rY   pytest_mockr   test_unstructured.unit_utilsr   r   unstructured.chunking.titler	   unstructured.documents.elementsr
   r   unstructured.partition.mdr   &unstructured.partition.utils.constantsr   r'   r.   r<   r?   r\   rc   rf   rh   rj   ro   rs   rz   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>r      s   " " " " " "               # # # # # # Z Z Z Z Z Z Z Z 6 6 6 6 6 6 > > > > > > > > 2 2 2 2 2 2 V V V V V VU U U  > > >       > > >"- - -- - -  
3 3 3E E E> > >T T TH H H  W W W WC C CC C CU U U UU U UU U U. . .$ $ $5 5 5I I I5 5 5 5 5r   