
    Ng)"                       d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ej                            dddg          d'd            Zd Zej                            dddg          d'd            Zd Zej                            dddg          d'd            Zej                            dddg          d'd            Zd Zd Zej                            dddg          d'd            Zej                            dddg          d'd            Z d Z!d Z"d(dZ#d Z$d(d Z%d! Z&ej                            dddg          d'd"            Z'd# Z(d$ Z)d% Z*d& Z+dS ))z3Test-suite for `unstructured.partition.xml` module.    )annotationsN)MockerFixture)example_doc_path)chunk_by_title)NarrativeTextTitle)partition_json)#UNSTRUCTURED_INCLUDE_DEBUG_METADATA)partition_xml)elements_to_jsonfilenamefactbook.xmlfactbook-utf-16.xmlstrc                    t          |           }t          |d          }|d         j        dk    sJ |d         j        j        | k    sJ t
          rd |D             dhk    sJ d S d S )NFr   xml_keep_tagsr   United Statesc                &    h | ]}|j         j        S  )metadatadetection_origin.0elements     `/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/partition/test_xml.py	<setcomp>z3test_partition_xml_from_filename.<locals>.<setcomp>   s    JJJg 1JJJ    xml)r   r   textr   r   r
   r   	file_pathelementss      r    test_partition_xml_from_filenamer$      s     **IiuEEEHA;....A;(H4444* VJJJJJugUUUUV VUUr   c                     t          t          d          dd          } | d         j        dk    sJ | d         j        j        dk    sJ d S )Nr   Ftest)r   metadata_filenamer   r   )r   r   r    r   r   )r#   s    r   7test_partition_xml_from_filename_with_metadata_filenamer(      sa    ((QW  H A;....A;(F222222r   c                    t          |           }t          |d          5 }t          |d|          }d d d            n# 1 swxY w Y   |d         j        dk    sJ |d         j        j        | k    sJ d S NrbFfiler   r'   r   r   r   openr   r    r   r   r   r"   fr#   s       r   test_partition_xml_from_filer2   %        **I	i		 [! auPYZZZ[ [ [ [ [ [ [ [ [ [ [ [ [ [ [ A;....A;(H444444   ?AAc                     t          t          d          d          5 } t          | dd          }d d d            n# 1 swxY w Y   |d         j        dk    sJ |d         j        j        dk    sJ d S )Nr   r+   Fr&   r,   r   r   )r/   r   r   r    r   r   r1   r#   s     r   3test_partition_xml_from_file_with_metadata_filenamer7   /   s    	~..	5	5 X auPVWWWX X X X X X X X X X X X X X X A;....A;(F222222s   =AAc                    t          |           }t          |d          5 }t          |d|          }d d d            n# 1 swxY w Y   |d         j        dk    sJ |d         j        j        | k    sJ d S r*   r.   r0   s       r   test_partition_xml_from_file_rbr9   7   r3   r4   c                    t          |           }t          |d          }d|d         j        v sJ |d         j        j        | k    sJ d S )NTr   <leader>Joe Biden</leader>r   )r   r   r    r   r   r!   s      r   ;test_partition_xml_from_filename_with_tags_default_encodingr<   A   sZ     **IitDDDH'8A;+;;;;;A;(H444444r   c                     t          t          d                    5 } |                                 }d d d            n# 1 swxY w Y   t          |d          }d|d         j        v sJ d S )Nr   T)r    r   r;   r   )r/   r   readr   r    )r1   r    r#   s      r   &test_partition_xml_from_text_with_tagsr?   J   s    	~..	/	/ 1vvxx              $d;;;H'8A;+;;;;;;;s   >AAc                     t          j        t                    5  t          t	          d          dd           d d d            d S # 1 swxY w Y   d S )Nr   Tutf-8)r   encoding)pytestraisesUnicodeDecodeErrorr   r   r   r   r   @test_partition_xml_from_filename_with_tags_raises_encoding_errorrF   R   s    	)	*	* e e&'<==T\cdddde e e e e e e e e e e e e e e e e es    AAAc                    t          |           }t          |d          5 }t          |d|          }d d d            n# 1 swxY w Y   d|d         j        v sJ |d         j        j        | k    sJ d S Nr+   Tr,   r;   r   r.   r0   s       r   7test_partition_xml_from_file_with_tags_default_encodingrI   W        **I	i		 Z! atyYYYZ Z Z Z Z Z Z Z Z Z Z Z Z Z Z (8A;+;;;;;A;(H444444r4   c                    t          |           }t          |d          5 }t          |d|          }d d d            n# 1 swxY w Y   d|d         j        v sJ |d         j        j        | k    sJ d S rH   r.   r0   s       r   :test_partition_xml_from_file_rb_with_tags_default_encodingrL   a   rJ   r4   c                     t          j        t                    5  t          t	          d          d          5 } t          | dd           d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nr   r+   TrA   )r-   r   rB   )rC   rD   rE   r/   r   r   )r1   s    r   ?test_partition_xml_from_file_rb_with_tags_raises_encoding_errorrN   k   s   	)	*	*  "#8994@@ 	A"    	 	 	 	 	 	 	 	 	 	 	 	 	 	 	                 s4   A/AA/A	A/A	A//A36A3c                     dt          t          d                    } t          fd| D                       s-J d dt          | d         j        j                               d S )Nzapplication/xmlr   c              3  8   K   | ]}|j         j        k    V  d S )N)r   filetype)r   eXML_MIME_TYPEs     r   	<genexpr>zQtest_partition_xml_gets_the_XML_mime_type_in_metadata_filetype.<locals>.<genexpr>{   s-      FFqz"m3FFFFFFr   zExpected all elements to have 'z' as their filetype, but got: r   )r   r   allreprr   rQ   )r#   rS   s    @r   >test_partition_xml_gets_the_XML_mime_type_in_metadata_filetyperW   x   s    %M-n==>>HFFFFXFFFFF  	2- 	2 	2!%.//	2 	2    r   mockerr   c                    d}|                      d|           t          d          }|d         j        j        |k    sJ d S )N2029-07-05T09:24:281unstructured.partition.xml.get_last_modified_datereturn_valueexample-docs/factbook.xml)r   r   patchr   r   last_modified)rX   mocked_last_modification_dater#   s      r   Dtest_partition_xml_from_file_path_gets_last_modified_from_filesystemrc      s^    $9!
LL;2    
 &ABBBHA;-1NNNNNNNr   c                     t          dd          5 } t          |           }d d d            n# 1 swxY w Y   |d         j        j        J d S )Nr^   r+   )r-   r   r/   r   r   ra   r6   s     r   4test_partition_xml_from_file_gets_last_modified_Nonerf      s    	)4	0	0 )A a((() ) ) ) ) ) ) ) ) ) ) ) ) ) ) A;-55555s   .22c                    d}d}|                      d|           t          d|          }|d         j        j        |k    sJ d S )NrZ   z2020-07-05T09:24:28r[   r\   r^   )r   metadata_last_modifiedr   r_   )rX   filesystem_last_modifiedrh   r#   s       r   @test_partition_xml_from_file_path_prefers_metadata_last_modifiedrj      so    42
LL;Jb     ,5  H
 A;-1GGGGGGGr   c                     t          dd          5 } t          | d          }d d d            n# 1 swxY w Y   |d         j        j        dk    sJ d S )Nr^   r+   rZ   )r-   rh   r   re   r6   s     r   ;test_partition_xml_from_file_prefers_metadata_last_modifiedrl      s    	)4	0	0 WA a@UVVVW W W W W W W W W W W W W W W A;-1FFFFFFFs   /33c                   t          |           }t          |d          }t          t          |                    }t	          |          t	          |          k    sJ |d         j        j        |d         j        j        k    sJ |d         j        j        |d         j        j        k    sJ t          t	          |                    D ]}||         ||         k    sJ d S )NFr   r    r   )	r   r   r	   r   lenr   page_numberr   range)r   r"   r#   test_elementsis        r   test_partition_xml_with_jsonrt      s     **IiuEEEH"(8(B(BCCCMx==C......A;+}Q/?/H/TTTTTA;(M!,<,E,NNNNN3x==!! / /{mA....../ /r   c                 f   d} t          |           }|d         t          d          k    sJ t          |d         t                    sJ t	          |d                                       d          sJ t	          |d                                                                       d          sJ d S )Nz<xml>
        <parrot>
            <name>Conure</name>
            <description>A conure is a very friendly bird.
            Conures are feathery and like to dance.
            </description>
        </parrot>
    </xml>rn   r   Conure   z!A conure is a very friendly bird.z'Conures are feathery and like to dance.)r   r   
isinstancer   r   
startswithstripendswith)xml_textr#   s     r   -test_partition_xml_with_narrative_line_breaksr}      s    H (+++HA;%//))))hqk=11111x{&&'JKKKKKx{!!##,,-VWWWWWWWr   c                     t          d          } t          |           }t          | d          }t          |          }||k    sJ ||k    sJ d S )Nr   by_title)chunking_strategy)r   r   r   )r"   r#   chunk_elementschunkss       r   +test_add_chunking_strategy_on_partition_xmlr      sc     00IY''H"9
KKKNH%%FX%%%%V######r   c                 t    t          d          } t          |           }|d         j        j        dgk    sJ d S )Nr   r   eng)r   r   r   	languages)r"   r#   s     r   1test_partition_xml_element_metadata_has_languagesr      s?     00IY''HA;)eW444444r   c                     t          t          d          d          } d | D             }|dgddgdgdgdggk    sJ d S )Nzlanguage-docs/eng_spa_mult.xmlT)detect_language_per_elementc                &    g | ]}|j         j        S r   )r   r   r   s     r   
<listcomp>zKtest_partition_xml_respects_detect_language_per_element.<locals>.<listcomp>   s    @@@GW'@@@r   r   spa)r   r   )r#   langss     r   7test_partition_xml_respects_detect_language_per_elementr      si    9::X\  H A@x@@@EeWuenugwHHHHHHHr   )r   r   )rX   r   ),__doc__
__future__r   rC   pytest_mockr   test_unstructured.unit_utilsr   unstructured.chunking.titler   unstructured.documents.elementsr   r   unstructured.partition.jsonr	   &unstructured.partition.utils.constantsr
   unstructured.partition.xmlr   unstructured.staging.baser   markparametrizer$   r(   r2   r7   r9   r<   r?   rF   rI   rL   rN   rW   rc   rf   rj   rl   rt   r}   r   r   r   r   r   r   <module>r      sR   9 9 " " " " " "  % % % % % % 9 9 9 9 9 9 6 6 6 6 6 6 @ @ @ @ @ @ @ @ 6 6 6 6 6 6 V V V V V V 4 4 4 4 4 4 6 6 6 6 6 6 n6K%LMMV V V NMV3 3 3 n6K%LMM5 5 5 NM53 3 3 n6K%LMM5 5 5 NM5 n6K%LMM5 5 5 NM5< < <e e e
 n6K%LMM5 5 5 NM5 n6K%LMM5 5 5 NM5    
O 
O 
O 
O6 6 6H H H H G G G n6K%LMM
/ 
/ 
/ NM
/X X X"$ $ $5 5 5I I I I Ir   