
    Ng                       d Z ddlmZ ddlZddlZddlZddlZddlZddlm	Z	m
Z
 ddlZddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2 dUdZ3dVdZ4dVdZ5dVdZ6d Z7d Z8ej9        :                    dddg          dWd            Z;d Z<d Z=d Z>d  Z?d! Z@d" ZAd# ZBd$ ZCdXd'ZDd( ZEdXd)ZFd* ZGdYd/ZHdYd0ZIdZd4ZJd[d5ZKd\d6ZLd]d7ZMd]d8ZNd]d9ZOd: ZPd; ZQd< ZRd= ZSd> ZTd? ZUd@ ZVdA ZWdB ZXdC ZYdD ZZ ej[                    d^dF            Z\ ej[                    d_dG            Z] ej[                    d_dH            Z^ ej[                    dI             Z_ ej[                    dJ             Z` ej[                    d`dO            Za ej[                    dadP            Zb G dQ dR          Zc G dS dT          ZddS )bz4Test suite for `unstructured.partition.docx` module.    )annotationsN)AnyIterator)Document)	Paragraph)MockFixture)FixtureRequestMockassert_round_trips_through_JSONexample_doc_pathfunction_mockinstance_mockproperty_mock)chunk_by_title)AddressCompositeElementElementFooterHeaderImageListItemNarrativeText	PageBreakTable
TableChunkTextTitle)DocxPartitionerOptions_DocxPartitionerpartition_docxregister_picture_partitioner)#UNSTRUCTURED_INCLUDE_DEBUG_METADATAPartitionStrategymock_document_file_pathstrexpected_elementslist[Element]c                    t          |           }||k    sJ |d         j        j        J |D ]}|j        j        dk    sJ t          rd |D             dhk    sJ d S d S )Nr   mock_document.docxc                &    h | ]}|j         j        S  )metadatadetection_origin.0elements     a/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/partition/test_docx.py	<setcomp>z4test_partition_docx_from_filename.<locals>.<setcomp>E   s    JJJg 1JJJ    docx)r    r,   page_numberfilenamer"   )r$   r&   elementsr0   s       r1   !test_partition_docx_from_filenamer8   ;   s     566H(((((A;+333 A A(,@@@@@@* WJJJJJvhVVVVW WVVr3   
list[Text]c                N   t          | d          5 }t          j                    }|                    |                                           |                    d           t          |          }||k    sJ |D ]}|j        j        J 	 ddd           dS # 1 swxY w Y   dS )z`partition_docx()` accepts a SpooledTemporaryFile as its `file` argument.

    `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
    to ensure the source file is appropriately converted in this case.
    rbr   fileN)	opentempfileSpooledTemporaryFilewritereadseekr    r,   r6   )r$   r&   	test_filespooled_temp_filer7   r0   s         r1   %test_partition_docx_with_spooled_filerF   H   s    
%t	,	, 5	$9;;	 0 0111q!!!!'8999,,,,, 	5 	5G#,4444	55 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5s   A;BB!Bc                    t          | d          5 }t          |          }d d d            n# 1 swxY w Y   ||k    sJ |D ]}|j        j        J d S )Nr;   r<   )r>   r    r,   r6   )r$   r&   fr7   r0   s        r1   test_partition_docx_from_filerI   Z   s    	%t	,	, *!q)))* * * * * * * * * * * * * * *((((( 1 1(00001 1s   .22c                `    t          j        d          }t          | |          }||k    sJ d S )Ns   abcde)r6   r=   )ioBytesIOr    )r$   r&   rH   r7   s       r1   :test_partition_docx_uses_file_path_when_both_are_specifiedrM   b   s>     	
8A'>QGGGH(((((((r3   c                     t          j        t          d          5  t                       d d d            d S # 1 swxY w Y   d S )Nz5either `filename` or `file` argument must be providedmatch)pytestraises
ValueErrorr    r+   r3   r1   'test_partition_docx_raises_with_neitherrT   j   s    	z)`	a	a	a                   s   8<<c                     t          t          d                    } d | D             g dk    sJ d | D             t          t          t          gk    sJ dS )zHDocx with no sections partitions recognizing both paragraphs and tables.teams_chat.docxc                    g | ]	}|j         
S r+   textr/   es     r1   
<listcomp>z5test_parition_docx_from_team_chat.<locals>.<listcomp>u   s    %%%qAF%%%r3   )z)0:0:0.0 --> 0:0:1.510
Some Body
OK. Yeah.z'0:0:3.270 --> 0:0:4.250
James Bond
Umm.zsaved-by Dennis Forsythec                ,    g | ]}t          |          S r+   )typerZ   s     r1   r\   z5test_parition_docx_from_team_chat.<locals>.<listcomp>z   s    &&&DGG&&&r3   N)r    r   r   r   r7   s    r1   !test_parition_docx_from_team_chatr`   r   s{    ./@AABBH%%H%%% * * *    
 '&X&&&4u*=======r3   infer_table_structureTFboolc                    t          t          d          |           }t          |d         j        d          o|d         j        j        d u}|| k    sJ d S )Nfake_table.docx)ra   r   text_as_html)r    r   hasattrr,   re   )ra   r7   $table_element_has_text_as_html_fields      r1   )test_partition_docx_infer_table_structurerh   }   sq    *++CX  H 	$n55 	:QK -T9 ) 03HHHHHHHr3   c                     t          t          d                    } t          | d         t                    sJ | d         j        dk    sJ | d         j        j        dk    sJ | d         j        j        dk    sJ d S )Nrd   r   z4Header Col 1 Header Col 2 Lorem ipsum A Link examplezv<table><tr><td>Header Col 1</td><td>Header Col 2</td></tr><tr><td>Lorem ipsum</td><td>A Link example</td></tr></table>)r    r   
isinstancer   rY   r,   re   r6   r_   s    r1   #test_partition_docx_processes_tablerk      s    ./@AABBHhqk5)))))A; VWWWWA;,	    A;(,=======r3   c                     t          t          d                    } | d         t          d          k    sJ | d         t          d          k    sJ | D ]}|j        j        dk    sJ d S )Nhandbook-1p.docxr   zUS Trustee Handbook	Copyright)r    r   r   r   r,   r6   )r7   r0   s     r1   +test_partition_docx_grabs_header_and_footerrp      s    ./ABBCCHA;&!6777777B<6+...... ? ?(,>>>>>>? ?r3   c                     t          t          d          d          } dd | D             vsJ t          d | D                       sJ dS )a  Hard page-breaks by themselves are not enough to locate page-breaks in a document.

    In particular, they are redundant when rendered page-breaks are present, which they usually are
    in a native Word document, so lead to double-counting those page-breaks. When rendered page
    breaks are *not* present, only a small fraction will be represented by hard page-breaks so hard
    breaks are a false-positive and will generally produce incorrect page numbers.
    z(handbook-1p-no-rendered-page-breaks.docxTinclude_page_breaksr   c                6    g | ]}t          |          j        S r+   r^   __name__rZ   s     r1   r\   zqtest_partition_docx_includes_neither_page_breaks_nor_numbers_when_rendered_breaks_not_present.<locals>.<listcomp>   !    BBBAtAww/BBBr3   c              3  2   K   | ]}|j         j        d u V  d S N)r,   r5   rZ   s     r1   	<genexpr>zptest_partition_docx_includes_neither_page_breaks_nor_numbers_when_rendered_breaks_not_present.<locals>.<genexpr>   s,      @@!qz%-@@@@@@r3   Nr    r   allr_   s    r1   ]test_partition_docx_includes_neither_page_breaks_nor_numbers_when_rendered_breaks_not_presentr}      sr     CDDZ^  H BBBBBBBBB@@x@@@@@@@@@@r3   c                     t          t          d          d          } dd | D             vsJ | d         j        j        dk    sJ | d         j        j        dk    sJ d	S )
zPage-number metadata is not supressed when `include_page_breaks` arga is False.

    Only inclusion of PageBreak elements is affected by that option.
    rm   Frr   r   c                6    g | ]}t          |          j        S r+   ru   rZ   s     r1   r\   zetest_partition_docx_includes_page_numbers_when_page_break_elements_are_suppressed.<locals>.<listcomp>   rw   r3         Nr    r   r,   r5   r_   s    r1   Qtest_partition_docx_includes_page_numbers_when_page_break_elements_are_suppressedr      s}    
 ./ABBX]^^^HBBBBBBBBBA;+q0000B< ,111111r3   c                     t          t          d          dd          } dd | D             v sJ | d         j        j        dk    sJ | d         j        j        d	k    sJ d S )
Nrm   T   )rs   starting_page_numberr   c                6    g | ]}t          |          j        S r+   ru   rZ   s     r1   r\   zWtest_partition_docx_includes_page_break_elements_when_so_instructed.<locals>.<listcomp>   s!    >>>477+>>>r3   r   r      r   r_   s    r1   Ctest_partition_docx_includes_page_break_elements_when_so_instructedr      s    +,,$]^  H >>X>>>>>>>A;+q0000B< ,111111r3   c                     t          t          d                    } | d         t          d          k    sJ t          d | D                       dk    sJ d S )Nz example-list-items-multiple.docxrn   zCThis is simply dummy text of the printing and typesetting industry.c              3  D   K   | ]}t          |t                    d V  dS )r   N)rj   r   rZ   s     r1   rz   z4test_partition_docx_detects_lists.<locals>.<genexpr>   s1      >>QjH&=&=>q>>>>>>r3   
   )r    r   r   sumr_   s    r1   !test_partition_docx_detects_listsr      ss    ./QRRSSHB<8M      >>(>>>>>"DDDDDDr3   c                 x    t          t          d          d          } t          d | D                       sJ d S )Nsimple.docxtest)metadata_filenamec              3  6   K   | ]}|j         j        d k    V  dS r   Nr,   r6   r.   s     r1   rz   z\test_partition_docx_from_filename_prefers_metadata_filename_when_provided.<locals>.<genexpr>   -      KKww(F2KKKKKKr3   r{   r_   s    r1   Itest_partition_docx_from_filename_prefers_metadata_filename_when_providedr      sG    .}==QWXXXHKK(KKKKKKKKKKr3   c                     t          t          d          d          5 } t          | d          }d d d            n# 1 swxY w Y   t          d |D                       sJ d S )Nr   r;   r   )r=   r   c              3  6   K   | ]}|j         j        d k    V  dS r   r   r.   s     r1   rz   zXtest_partition_docx_from_file_prefers_metadata_filename_when_provided.<locals>.<genexpr>   r   r3   )r>   r   r    r|   rH   r7   s     r1   Etest_partition_docx_from_file_prefers_metadata_filename_when_providedr      s    	}--t	4	4 D!qFCCCD D D D D D D D D D D D D D DKK(KKKKKKKKKKs   <A A mockerr   c                    d}|                      d|           t          t          d                    }|d         j        j        |k    sJ d S )Nz2029-07-05T09:24:282unstructured.partition.docx.get_last_modified_datereturn_value	fake.docxr   patchr    r   r,   last_modified)r   filesystem_last_modifiedr7   s      r1   Etest_partition_docx_from_file_path_gets_last_modified_from_filesystemr      sc    4
LL<Kc     .{;;<<HA;-1IIIIIIIr3   c                     t          t          d          d          5 } t          |           }d d d            n# 1 swxY w Y   |d         j        j        J d S )Nr   r;   r<   r   r>   r   r    r,   r   r   s     r1   5test_partition_docx_from_file_gets_last_modified_Noner      s    	}--t	4	4 *!q)))* * * * * * * * * * * * * * * A;-55555s   ;??c                    d}d}|                      d|           t          t          d          |          }|d         j        j        |k    sJ d S )Nz2023-11-01T14:13:072020-07-05T09:24:28r   r   r   )metadata_last_modifiedr   r   )r   r   r   r7   s       r1   Atest_partition_docx_from_file_path_prefers_metadata_last_modifiedr      sw    42
LL<Kc     %%>T  H A;-1GGGGGGGr3   c                     d} t          t          d          d          5 }t          ||           }d d d            n# 1 swxY w Y   |d         j        j        | k    sJ d S )Nr   r   r;   )r=   r   r   r   )r   rH   r7   s      r1   <test_partition_docx_from_file_prefers_metadata_last_modifiedr     s    2	}--t	4	4 Y!qAWXXXY Y Y Y Y Y Y Y Y Y Y Y Y Y Y A;-1GGGGGGGs   >AA	opts_argsdict[str, Any]expected_emphasized_textslist[dict[str, str]]c                   t          d          | d<   t          d	i | }t          |          }|j        j        d         }t          |                    |                    }|j        dk    sJ ||k    sJ |j        j        d         }t          |                    |                    }|j        dk    sJ |g k    sJ |j        j        d         }t          |                    |                    }|j        dk    sJ |g k    sJ d S )
Nfake-doc-emphasized-text.docx	file_pathr   $I am a bold italic bold-italic text.r    r   I am a normal text.r+   )r   r   r   	_document
paragraphslist_iter_paragraph_emphasisrY   )r   r   optspartitioner	paragraphemphasized_textss         r1   (test_get_emphasized_texts_from_paragraphr     s0    ..MNNIk!..I..D"4((K%03IK@@KKLL>CCCCC88888%03IK@@KKLL>Rr!!!!%03IK@@KKLL>22222r!!!!!!r3   c                    t          d          | d<   t          di | }t          |          }|j        j        d         }t          |                    |                    }||k    sJ d S Nr   r   r   r+   )r   r   r   r   tablesr   _iter_table_emphasis)r   r   r   r   tabler   s         r1   test_iter_table_emphasisr   (  s{     ..MNNIk!..I..D"4((K!(+EK<<UCCDD8888888r3   !expected_emphasized_text_contents	list[str]expected_emphasized_text_tagsc                    t          d          | d<   t          di | }t          |          }|j        j        d         }|                    |          \  }}||k    sJ ||k    sJ d S r   )r   r   r   r   r   _table_emphasis)r   r   r   r   r   r   emphasized_text_contentsemphasized_text_tagss           r1   test_table_emphasisr   5  s    
 ..MNNIk!..I..D"4((K!(+E5@5P5PQV5W5W22#'HHHHH#@@@@@@@r3   c                   t          t          d                    }t          |d         t                    sJ |d         j        j        | k    sJ |d         j        j        |k    sJ |d         t          d          k    sJ |d         j        j        | k    sJ |d         j        j        |k    sJ |d         t          d          k    sJ |d         j        j        J |d         j        j        J d S )Nr   r   r   r   r   r   )r    r   rj   r   r,   r   r   r   )r   r   r7   s      r1   *test_partition_docx_grabs_emphasized_textsr   E  s    ./NOOPPHhqk5)))))A;8<]]]]]A;48UUUUUA;-(NOOOOOOA;8<]]]]]A;48UUUUUA;-(=>>>>>>A;8@@@A;4<<<<<r3   c                B    t          |           }t          |           d S ry   )r    r   )r$   r7   s     r1   test_partition_docx_with_jsonr   X  s$    566H#H-----r3   c                P   t          d          | d<   t          d	i | }t          |          }g d}|j        j        }t          |          D ]X\  }\  }}||         }|                    |          }	||j        v sJ d|g d|             |	|k    sJ d| d| d|	             Yd S )
Nzcategory-level.docxr   ))r   zCall me Ishmael.)r   zA Heading 1)r   z#Whenever I find myself growing grim)r   zA top level list item)r   z
Next level)r   Same)r   zSecond top-level list item)r   z$whenever I find myself involuntarily)r   r   )r   zA Heading 2)r   z)This is my substitute for pistol and ball)r   zAnother Heading 1)r   zThere now is your insular cityz
paragraph[z].text does not contain zexpected paragraph[z] to have depth==z, got r+   )r   r   r   r   r   	enumerate_parse_category_depth_by_stylerY   )
r   r   r   
test_casesr   idxdepthrY   r   actual_depths
             r1   "test_parse_category_depth_by_styler   ]  s   -.CDDIk!..I..D"4((K  J  &1J'
33 S S]eTsO	"AA)LLy~%%%'YSE'Y'YSW'Y'Y%%%E!!!RRRuRRLRR "!!!S Sr3   c                    t          di | }t          |          }g d}t          |          D ]5\  }\  }}|                    |          |k    sJ d||          d            6d S )N))r   	Heading 1)r   z	Heading 2)r   z	Heading 3)r   Subtitle)r   List)r   zList 2)r   zList 3)r   List Bullet)r   zList Bullet 2)r   zList Bullet 3)r   zList Number)r   zList Number 2)r   zList Number 3z
test case z failedr+   )r   r   r   #_parse_category_depth_by_style_name)r   r   r   r   r   r   rY   s          r1   'test_parse_category_depth_by_style_namer   ~  s    !..I..D"4((K  J  (
33 1 1]eT;;DAAUJJJ0
3000 KJJJ1 1r3   c                p    t          di | }t          |          }|                                dk    sJ d S )Nr   r+   )r   r   #_parse_category_depth_by_style_ilvl)r   r   r   s      r1   'test_parse_category_depth_by_style_ilvlr     sE    !..I..D"4((K::<<AAAAAAr3   c                     t          t          d          d          } t          t          d                    }t          |          }| |k    sJ | |k    sJ d S )Nrm   by_title)chunking_strategy)r    r   r   )chunk_elementsr7   chunkss      r1   9test_add_chunking_strategy_on_partition_docx_default_argsr     sr    #+,,
  N ./ABBCCHH%%FX%%%%V######r3   c                 &   t          d          } t          | ddd          }t          |           }t          |dd          }||k    sJ ||k    sJ |D ]:}t          |t          t
          f          sJ t          |j                  dk    sJ ;d S )Nr   r   	      )r   max_characterscombine_text_under_n_chars)r   r   )r   r    r   rj   r   r   lenrY   )	docx_pathr   r7   r   chunks        r1   ,test_add_chunking_strategy_on_partition_docxr     s     !@AAI#Z^_  N i((HHQSTUUUFV####~%%%% $ $%"2J!?@@@@@5:!#####$ $r3   c                 v    t          d          } t          |           }|d         j        j        dgk    sJ d S )Nrm   r6   r   engr   r    r,   	languagesr6   r7   s     r1   2test_partition_docx_element_metadata_has_languagesr     sC     233Hx000HA;)eW444444r3   c                     t          d          } t          | d          }d |D             }|dgddgdgdgdggk    sJ d S )Nzlanguage-docs/eng_spa_mult.docxT)r6   detect_language_per_elementc                &    g | ]}|j         j        S r+   )r,   r   r.   s     r1   r\   zLtest_partition_docx_respects_detect_language_per_element.<locals>.<listcomp>  s    @@@GW'@@@r3   r   spa)r   r    )r6   r7   langss      r1   8test_partition_docx_respects_detect_language_per_elementr    sc     ABBHxTRRRH@@x@@@EeWuenugwHHHHHHHr3   c                 z    t          d          } t          | dg          }|d         j        j        dgk    sJ d S )Nrm   deur6   r   r   r   r   s     r1   *test_partition_docx_respects_languages_argr    sG     233HxE7CCCHA;)eW444444r3   c                     t          j        t                    5  t          d          } t	          | d           d d d            d S # 1 swxY w Y   d S )Nrm   r   r  )rQ   rR   	TypeErrorr   r    r   s    r1   :test_partition_docx_raises_TypeError_for_invalid_languagesr    s    	y	!	! ; ;#$677E::::; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ;s   !AAAc                    t          t          d                    } | d         }|j        dk    sJ |j        }|j        J |j        J |j        J | d         }|j        dk    sJ |j        }|j        J |j        J |j        J | d         }|j        dk    sJ |j        }|j        dd	d
dgk    sJ |j        d	gk    sJ |j        d
gk    sJ | d         }|j        dk    sJ |j        }|j        ddddgk    sJ |j        dgk    sJ |j        dgk    sJ | d         }|j        dk    sJ |j        }|j        ddddgk    sJ |j        dgk    sJ |j        dgk    sJ | d         }|j        dk    sJ |j        }|j        ddddgk    sJ |j        dgk    sJ |j        dgk    sJ | d         }|j        dk    sJ |j        }|j        J |j        J |j        J d S )Nzhlink-meta.docxr   Oner   zTwo with link to bookmark.r   zThree with link to foo.com.   zlink to foo.comzhttps://foo.com)start_indexrY   urlr   z,Four with link to foo.com searching for bar.r   z!link to foo.com searching for barzhttps://foo.com?q=barr   z/Five with link to foo.com introduction section.z$link to foo.com introduction sectionzhttp://foo.com/#intro   zEight with link to file.zlink to filezcourt-exif.jpg   zNine.)r    r   rY   r,   links
link_texts	link_urls)r7   r0   r,   s      r1   /test_partition_docx_includes_hyperlink_metadatar    s=   ./@AABBH qkG<5    H>!!!&&&%%% qkG<77777H>!!!&&&%%% qkG<88888H>%$	
 	
     #4"55555"3!44444 qkG<IIIIIH>7*	
 	
     #F"GGGGG"9!::::: qkG<LLLLLH>:*	
 	
     #I"JJJJJ"9!::::: qkG<55555H>"#	
 	
     >"22222"2!33333 qkG<7""""H>!!!&&&%%%%%r3   c                     t          d          } d t          |           D             }d t          |           D             }||k    sJ t          |          t          t          |                    k    sJ d S )Nzduplicate-paragraphs.docxc                    g | ]	}|j         
S r+   idr.   s     r1   r\   zTtest_partition_docx_assigns_deterministic_and_unique_element_ids.<locals>.<listcomp>.  s    
C
C
C'7:
C
C
Cr3   c                    g | ]	}|j         
S r+   r  r.   s     r1   r\   zTtest_partition_docx_assigns_deterministic_and_unique_element_ids.<locals>.<listcomp>/  s    EEEGWZEEEr3   )r   r    r   set)document_pathidsids_2s      r1   @test_partition_docx_assigns_deterministic_and_unique_element_idsr   +  s~    $%@AAM
C
C^M%B%B
C
C
CCEE~m'D'DEEEE %<<<<s88s3s88}}$$$$$$r3   c                 f    t          t          d                    } d | D             g dk    sJ d S )Nzdocx-shapes.docxc                    g | ]	}|j         
S r+   rX   r.   s     r1   r\   z8test_it_considers_text_inside_shapes.<locals>.<listcomp>=  s    888WGL888r3   )z,Paragraph with single <inline-image> within.z:Paragraph with <inline-image1> and <inline-image2> within.z'Paragraph with floating shape attached.)r    r   )partitioned_docs    r1   $test_it_considers_text_inside_shapesr$  :  sW    $%56H%I%IJJO88888 = = =      r3   c                 p    t          d t          t          d                    D                       rJ d S )Nc              3  @   K   | ]}t          |t                    V  d S ry   rj   r   rZ   s     r1   rz   zMtest_partition_docx_generates_no_Image_elements_by_default.<locals>.<genexpr>I  s=        !"
1e     r3   contains-pictures.docx)anyr    r   r+   r3   r1   :test_partition_docx_generates_no_Image_elements_by_defaultr*  H  sV      &45EF^5_5_&`&`         r3   c                 "    G d d          } t          |            t          t          d                    }d t          _        t          |          dk    sJ d |D             }t          |          dk    sJ d |D             g dk    sJ d S )	Nc                  &    e Zd Zed	d            ZdS )
`test_partition_docx_uses_registered_picture_partitioner.<locals>.FakeParagraphPicturePartitionerr   r   r   r   returnIterator[Image]c              3     K   t          j        |j         |j                                                                                   }t          d| d|j                   V  d S )NzImage with hash z, strategy: )hashlibsha1rY   strategyencode	hexdigestr   )clsr   r   	call_hashs       r1   iter_elementszntest_partition_docx_uses_registered_picture_partitioner.<locals>.FakeParagraphPicturePartitioner.iter_elementsP  sk        	%G%G%G%N%N%P%PQQ[[]]IQ9QQ$-QQRRRRRRRr3   N)r   r   r   r   r.  r/  )rv   
__module____qualname__classmethodr8  r+   r3   r1   FakeParagraphPicturePartitionerr-  O  s8        		S 	S 	S 
	S 	S 	Sr3   r<  r(  r  c                <    g | ]}t          |t                    |S r+   r'  rZ   s     r1   r\   zKtest_partition_docx_uses_registered_picture_partitioner.<locals>.<listcomp>`  s'    BBBAZ5-A-ABaBBBr3      c                    g | ]	}|j         
S r+   rX   rZ   s     r1   r\   zKtest_partition_docx_uses_registered_picture_partitioner.<locals>.<listcomp>b  s    +++qAF+++r3   )JImage with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_reszJImage with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_resr@  zJImage with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_reszJImage with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_reszJImage with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res)r!   r    r   r   _PicturePartitionerClsr   )r<  r7   image_elementss      r1   7test_partition_docx_uses_registered_picture_partitionerrC  N  s    S S S S S S S S !!@AAA./GHHIIH 591x==BBBBBBN~!####++N+++ 0 0 0      r3   r.  c            
         t          d          t          d          t          d          t          d          t          d          t          d          t          d          t	          d          gS )	N&These are a few of my favorite things:ParrotsHockeyAnalysis4This is my first thought. This is my second thought.This is my third thought.2023DOYLESTOWN, PA 18901)r   r   r   r   r   r+   r3   r1   r&   r&   o  sh     	677jLMM122V&''	 	r3   c                 
    g dS )N)bolditalicbold-italicrP  r+   r+   r3   r1   r   r   }  s    ;;;;r3   c                 
    g dS )N)birR  rS  r+   r+   r3   r1   r   r     s    r3   c                 &    ddddddddddddgS )NrN  rR  )rY   tagrO  rS  rP  r+   r+   r3   r1   r   r     s:     $$#&&s++s++	 r3   c                    t          j                    } |                     dd           |                     dd           |                     dd           |                     dd           |                     d	d           |                     d
d           |                     d	d           |                     dd           |                     dd           |                     d           |                     d           | S )NrE  r   )styleu   • ParrotsNormalu   • rG  r   r   rH  rI  rJ  z	Body TextrK  rL  )r4   r   add_paragraph)documents    r1   mock_documentr[    s    }HC;WWW=99962228=9992]333:X6662X...QYabbb6kJJJ6"""1222Or3   r[  r   tmp_pathpathlib.Pathc                T    t          |dz            }|                     |           |S )Nr)   )r%   save)r[  r\  r6   s      r1   r$   r$     s.    82233Hx   Or3   c                     ddddddS )zAll default arguments for `DocxPartitionerOptions`.

    Individual argument values can be changed to suit each test. Makes construction of opts more
    compact for testing purposes.
    NT)r=   r   rs   ra   r3  r+   r+   r3   r1   r   r     s"     #!%  r3   c                     e Zd ZdZd9dZd9dZd:dZej        	                    d	d
dg          d;d            Z
ej        	                    d	d
dg          d;d            Zd9dZd9dZd<dZd9dZej        	                    dddg          d=d            Zej        	                    dddg          d>d"            Zd9d#Zd9d$Zej        	                    d%d&ej        d'fej        d(fg          d?d*            Zej        	                    d+d,d-g          d@d/            Zd9d0Zd9d1Zd9d2Zd9d3Zd9d4Zd9d5Zd9d6Z ej                     dAd8            Z!dS )BDescribeDocxPartitionerOptionszQUnit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects.r   r   c                z    t          d          |d<   t          j        di |}t          |t                    sJ d S )Nr   r   r+   )r   r   loadrj   selfr   r   s      r1   $it_provides_a_validating_constructorzCDescribeDocxPartitionerOptions.it_provides_a_validating_constructor  sG    !1-!@!@	+%*77Y77$ 67777777r3   c                    t          j        t          d          5  t          j        di | d d d            d S # 1 swxY w Y   d S )Nzno DOCX document specified, rO   r+   rQ   rR   rS   r   rd  rf  r   s     r1   (and_it_raises_when_options_are_not_validzGDescribeDocxPartitionerOptions.and_it_raises_when_options_are_not_valid  s    ]:-KLLL 	5 	5"'44)444	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5   ;??requestr	   c                   t          |t                    }t          |d|          }t          |t          dd          }t	          di |}|j        }|                                 |                    d           ||u sJ d S )Nz)unstructured.partition.docx.docx.Documentr   
_docx_filez
abcde.docxr+   )r   r   r   r   r   rZ  assert_called_once_with)rf  rm  r   	document_docx_Document__docx_file_prop_r   rZ  s           r1   it_loads_the_docx_documentz9DescribeDocxPartitionerOptions.it_loads_the_docx_document  s    
 "'844	&@y
 
 
 )+\
 
 
 &22	22=00222..|<<<9$$$$$$r3   	arg_valueTFrb   c                >    ||d<   t          di |}|j        |u sJ d S )Nrs   r+   )r   rs   rf  ru  r   r   s       r1   Pit_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_streamzoDescribeDocxPartitionerOptions.it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream  s<     ,5	'(%22	22'9444444r3   c                >    ||d<   t          di |}|j        |u sJ d S )Nra   r+   )r   ra   rw  s       r1   :it_knows_whether_to_include_text_as_html_in_Table_metadatazYDescribeDocxPartitionerOptions.it_knows_whether_to_include_text_as_html_in_Table_metadata  s<     .7	)*%22	22)Y666666r3   c                ,   t          di |}|                                }t          t          |d           t                    sJ |j        dk    sJ t          j        t                    5  t          |           d d d            d S # 1 swxY w Y   d S )Nr   r+   )	r   increment_page_numberrj   nextr   r5   rQ   rR   StopIterationrf  r   r   page_break_iters       r1   Dit_generates_a_PageBreak_element_when_the_page_number_is_incrementedzcDescribeDocxPartitionerOptions.it_generates_a_PageBreak_element_when_the_page_number_is_incremented	  s     &22	224466$55yAAAAA1$$$$]=)) 	" 	"!!!	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	"s   ,B		BBc                    d|d<   t          di |}|                                }t          j        t                    5  t          |           d d d            n# 1 swxY w Y   |j        dk    sJ d S )NFrs   r   r+   )r   r|  rQ   rR   r~  r}  r5   r  s       r1   Sbut_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_offzrDescribeDocxPartitionerOptions.but_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off  s     ,1	'(%22	224466]=)) 	" 	"!!!	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	"1$$$$$$s   AA"Aget_last_modified_date_r
   c                ~    d|d<   d|_         t          di |}|j        }|                    d           |dk    sJ d S )Nza/b/document.docxr   z2024-04-02T20:32:35r+   )r   r   r   rp  )rf  r   r  r   r   s        r1   Dit_gets_last_modified_from_the_filesystem_when_file_path_is_providedzcDescribeDocxPartitionerOptions.it_gets_last_modified_from_the_filesystem_when_file_path_is_provided#  s_     "5	+/D,%22	22*778KLLL 5555555r3   c                b    t          j        d          }||d<   t          di |}|j        J d S Ns   abcdefgr=   r+   )rK   rL   r   r   )rf  r   r=   r   s       r1   Rbut_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_providedzqDescribeDocxPartitionerOptions.but_it_falls_back_to_None_for_the_last_modified_date_when_no_file_path_is_provided/  sE     z*%% 	&%22	22!)))))r3   r   z
u/v/w.docxN
str | Nonec                B    ||d<   t          di |}|j        |k    sJ d S Nr   r+   )r   metadata_file_path)rf  r   r   r   s       r1   ,it_uses_the_file_path_argument_when_providedzKDescribeDocxPartitionerOptions.it_uses_the_file_path_argument_when_provided:  s;     "+	+%22	22&)333333r3   )
page_countdocument_contains_pagebreaksexpected_value)r  Tr  )r   FNr  intr  r  
int | Nonec                    t          |t          d|          }t          di |}||_        |j        }|                                 ||u sJ d S )N_document_contains_pagebreaksr   r+   )r   r   _page_countermetadata_page_numberrp  )	rf  rm  r   r  r  r  #_document_contains_pagebreaks_prop_r   r  s	            r1   Bit_reports_None_when_no_rendered_page_breaks_are_found_in_documentzaDescribeDocxPartitionerOptions.it_reports_None_when_no_rendered_page_breaks_are_found_in_documentE  st     /<"+5	/
 /
 /
+ &22	22'#8+CCEEE#~555555r3   c                    t          di |}|j        dk    sJ t          |                                           |j        dk    sJ t          |                                           |j        dk    sJ dS )z)In DOCX, page-number is the slide number.r   r   r   Nr+   r   r5   r   r|  re  s      r1   !it_keeps_track_of_the_page_numberz@DescribeDocxPartitionerOptions.it_keeps_track_of_the_page_numbera  s    %22	221$$$$T''))***1$$$$T''))***1$$$$$$r3   c                    t          di |ddi}|j        dk    sJ t          |                                           |j        dk    sJ d S )Nr   r   r   r+   r  re  s      r1   Eit_assigns_the_correct_page_number_when_starting_page_number_is_givenzdDescribeDocxPartitionerOptions.it_assigns_the_correct_page_number_when_starting_page_number_is_givenk  sj     &JJ	JJJJJ1$$$$T''))***1$$$$$$r3   )ru  r  )Nhi_resfastr  r%   c                B    ||d<   t          di |}|j        |k    sJ d S )Nr3  r+   )r   r3  )rf  r   ru  r  r   s        r1   +it_knows_which_partitioning_strategy_to_usezJDescribeDocxPartitionerOptions.it_knows_which_partitioning_strategy_to_usev  s:     !*	*%22	22}......r3   )	file_namer  )page-breaks.docxT)rV   Fr  c                X    t          |          |d<   t          di |}|j        |u sJ d S r  )r   r   r  )rf  r   r  r  r   s        r1   2it_knows_whether_the_document_contains_page_breakszQDescribeDocxPartitionerOptions.it_knows_whether_the_document_contains_page_breaks  sC     "2)!<!<	+%22	221^CCCCCCr3   c                B    d|d<   t          di |}|j        dk    sJ d S )N
l/m/n.docxr   r+   )r   ro  re  s      r1   Dit_uses_the_path_to_open_the_presentation_when_file_path_is_providedzcDescribeDocxPartitionerOptions.it_uses_the_path_to_open_the_presentation_when_file_path_is_provided  s:     ".	+%22	22,......r3   c                    t          j                    }|                    d           ||d<   t          di |}|j        }||usJ t          |t          j                  sJ |                                dk    sJ d S r  )	r?   r@   rA   r   ro  rj   rK   rL   getvalue)rf  r   rE   r   	docx_files        r1   Fand_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_providedzeDescribeDocxPartitionerOptions.and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided  s     %9;;
+++-	&%22	22O	 11111)RZ00000!!##z111111r3   c                    t          j        d          }||d<   t          di |}|j        }||u sJ t	          |t           j                  sJ |                                dk    sJ d S r  )rK   rL   r   ro  rj   r  )rf  r   r=   r   r  s        r1   Fand_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFilezeDescribeDocxPartitionerOptions.and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile  s     z*%% 	&%22	22O	D    )RZ00000!!##z111111r3   c                    d|d<   t          j        t          d          5  t          j        di | d d d            d S # 1 swxY w Y   d S )Nr  r   z'no such file or directory: 'l/m/n.docx'rO   r+   )rQ   rR   FileNotFoundErrorr   rd  rj  s     r1   *it_raises_when_no_file_exists_at_file_pathzIDescribeDocxPartitionerOptions.it_raises_when_no_file_exists_at_file_path  s    !-	+],4]^^^ 	5 	5"'44)444	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5s   A  AAc                    t          d          |d<   t          j        t          d          5  t	          j        di | d d d            d S # 1 swxY w Y   d S )N
simple.docr   *not a ZIP archive \(so not a DOCX file\): rO   r+   )r   rQ   rR   rS   r   rd  rj  s     r1   =and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archivez\DescribeDocxPartitionerOptions.and_it_raises_when_the_file_at_file_path_is_not_a_ZIP_archive  s     "2,!?!?	+]:-Z[[[ 	5 	5"'44)444	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5s   AAAc                   t          t          d          d          5 }||d<   t          j        t          d          5  t          j        di | d d d            n# 1 swxY w Y   d d d            d S # 1 swxY w Y   d S )Nr  r;   r=   r  rO   r+   )r>   r   rQ   rR   rS   r   rd  )rf  r   rH   s      r1   <and_it_raises_when_the_file_like_object_is_not_a_ZIP_archivez[DescribeDocxPartitionerOptions.and_it_raises_when_the_file_like_object_is_not_a_ZIP_archive  s
    "<00$77 	91 !Ifz1^___ 9 9&+88i8889 9 9 9 9 9 9 9 9 9 9 9 9 9 9	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9s4   !A5AA5A!	!A5$A!	%A55A9<A9c                    t          j        t          d          5  t          j        di | d d d            d S # 1 swxY w Y   d S )Nz1no DOCX document specified, either `filename` or rO   r+   ri  rj  s     r1   :and_it_raises_when_neither_a_file_path_or_file_is_providedzYDescribeDocxPartitionerOptions.and_it_raises_when_neither_a_file_path_or_file_is_provided  s    ]:-`aaa 	5 	5"'44)444	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5 	5rl  r.  c                "    t          |d          S )Nr   )r   )rf  rm  s     r1   r  z6DescribeDocxPartitionerOptions.get_last_modified_date_  s    W&Z[[[r3   r   r   )rm  r	   r   r   )ru  rb   r   r   )r   r   r  r
   )r   r  r   r   )
rm  r	   r   r   r  r  r  rb   r  r  )r   r   ru  r%   r  r%   )r   r   r  r%   r  rb   )rm  r	   r.  r
   )"rv   r9  r:  __doc__rg  rk  rt  rQ   markparametrizerx  rz  r  r  r  r  r  r  r  r  r#   FASTHI_RESr  r  r  r  r  r  r  r  r  fixturer  r+   r3   r1   rb  rb    s       [[8 8 8 85 5 5 5% % % %, [[4-885 5 5 985 [[4-887 7 7 987
" 
" 
" 
"
% 
% 
% 
%
6 
6 
6 
6* * * * [[<*>??4 4 4 @?4 [H	'( 6 6 6	 60% % % %% % % % ['	-2F;>O>VX`=ab / / /	 / ['*DF`)a D D D D/ / / /2 2 2 22 2 2 25 5 5 5
5 5 5 59 9 9 95 5 5 5 V^\ \ \ \ \ \r3   rb  c                  X    e Zd ZdZddZddZddZddZddZd	 Z	dd
Z
ddZddZdS )Describe_DocxPartitionerzCUnit-test suite for `unstructured.partition.docx._DocxPartitioner`.r   r   c                    t          di |}t          j        t          d                    j        d         }t          |                              |          dk    sJ d S )Ndocx-tables.docxr   zv<table><tr><td>Header Col 1</td><td>Header Col 2</td></tr><tr><td>Lorem ipsum</td><td>A link example</td></tr></table>r+   )r   r4   r   r   r   r   _convert_table_to_htmlrf  r   r   r   s       r1   it_can_convert_a_table_to_htmlz7Describe_DocxPartitioner.it_can_convert_a_table_to_html  sr    %22	22./ABBCCJ1M%%<<UCC
 
 
 
 
 
r3   c                    t          di |}t          j        t          d                    j        d         }t          j        ddt          |                              |                    }|dk    sJ dS )  
        Fixture table is:

            +---+-------------+---+
            | a |     >b<     | c |
            +---+-------------+---+
            |   | +-----+---+ |   |
            |   | |  e  | f | |   |
            | d | +-----+---+ | i |
            |   | | g&t | h | |   |
            |   | +-----+---+ |   |
            +---+-------------+---+
            | j |      k      | l |
            +---+-------------+---+
        r  r   z +<<z<table><tr><td>a</td><td>&gt;b&lt;</td><td>c</td></tr><tr><td>d</td><td>e f g&amp;t h</td><td>i</td></tr><tr><td>j</td><td>k</td><td>l</td></tr></table>Nr+   )	r   r4   r   r   r   resubr   r  )rf  r   r   r   htmls        r1   )and_it_can_convert_a_nested_table_to_htmlzBDescribe_DocxPartitioner.and_it_can_convert_a_nested_table_to_html  s      &22	22./ABBCCJ1M vfc#3D#9#9#P#PQV#W#WXX
 
 
 
 
 
r3   c                    t          di |}t          j        t          d                    j        d         }d                    t          |                              |                    dk    sJ d S )Nr  r    z4Header Col 1 Header Col 2 Lorem ipsum A link exampler+   r   r4   r   r   r   joinr   _iter_table_textsr  s       r1   $it_can_convert_a_table_to_plain_textz=Describe_DocxPartitioner.it_can_convert_a_table_to_plain_text  s}    %22	22./ABBCCJ1Mxx(..@@GGHHB
 
 
 
 
 
r3   c                    t          di |}t          j        t          d                    j        d         }d                    t          |                              |                    dk    sJ dS )r  r  r   r  za >b< c d e f g&t h i j k lNr+   r  r  s       r1   /and_it_can_convert_a_nested_table_to_plain_textzHDescribe_DocxPartitioner.and_it_can_convert_a_nested_table_to_plain_text  s      &22	22./ABBCCJ1Mxx(..@@GGHH)
 
 
 
 
 
r3   c                    t          di |}t          j        t          d                    j        d         }d                    t          |                              |                    dk    sJ dS )z
        Fixture table is:

            +---+-------+
            | a | b     |
            |   +---+---+
            |   | c | d |
            +---+---+   |
            | e     |   |
            +-------+---+
        r  r   r  z	a b c d eNr+   r  r  s       r1   /but_the_text_of_a_merged_cell_appears_only_oncezHDescribe_DocxPartitioner.but_the_text_of_a_merged_cell_appears_only_once"  sr     &22	22./ABBCCJ1Mxx(..@@GGHHKWWWWWWr3   c                   t          t          t          d                              }t          |          }|j                            d          sJ t          |          }t          |          j        dk    sJ |j        dk    sJ |j        j	        dk    sJ t          |          }t          |          j        dk    sJ |j        dk    sJ d|j                    |j        j	        dk    sJ d	|j        j	                    t          |          }t          |          j        dk    sJ |j        dk    sJ d|j                    |j        j	        d
k    sJ d	|j        j	                    t          |          }t          |          j        dk    sJ |j        dk    sJ d|j                    |j        j	        dk    sJ d	|j        j	                    t          |          }t          |          j        dk    sJ |j        dk    sJ d|j                    |j        j	        dk    sJ d	|j        j	                    t          |          }t          |          j        dk    sJ |j        dk    sJ d|j                    |j        j	        dk    sJ d	|j        j	                    dS )ak  DOCX permits table rows to start late and end early.

        It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that
        they allow rows to start late, like in column 3, and end early, like the last cell is in
        column 5 of a 7 column table.

        A practical example might look like this:

                       +------+------+
                       | East | West |
            +----------+------+------+
            | Started  |  25  |  32  |
            +----------+------+------+
            | Finished |  17  |  21  |
            +----------+------+------+
        z tables-with-incomplete-rows.docxzExample of DOCX table r   za b c dzI<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>za b czactual e.text=zD<table><tr><td>a</td><td/></tr><tr><td>b</td><td>c</td></tr></table>zactual e.metadata.text_as_html=zX<table><tr><td>a</td><td>a</td><td/></tr><tr><td>b</td><td>c</td><td>d</td></tr></table>zX<table><tr><td>a</td><td>b</td><td/></tr><tr><td>a</td><td>c</td><td>d</td></tr></table>za b c d e fz<table><tr><td>a</td><td>a</td><td>b</td><td>c</td></tr><tr><td/><td>d</td><td>d</td><td/></tr><tr><td>e</td><td>d</td><td>d</td><td>f</td></tr><tr><td/><td>d</td><td>d</td><td/></tr></table>z"Data More Dato WTF? Strange FormataD  <table><tr><td>Data</td><td>Data</td><td/></tr><tr><td>Data</td><td>Data</td><td/></tr><tr><td>Data</td><td>Data</td><td/></tr><tr><td/><td>More</td><td/></tr><tr><td>Dato</td><td/></tr><tr><td>WTF?</td><td>WTF?</td><td/></tr><tr><td>Strange</td><td>Strange</td><td/></tr><tr><td/><td>Format</td><td>Format</td></tr></table>N)
iterr    r   r}  rY   
startswithr^   rv   r,   re   )rf  r7   r[   s      r1   ,it_can_partition_tables_with_incomplete_rowszEDescribe_DocxPartitioner.it_can_partition_tables_with_incomplete_rows2  s]   " '78Z'['[\\]]NNv  !9::::: NNAww7****v""""z&W
 
 
 
 NNAww7****v   "5AF"5"5   z&R
 
 
/QZ,//
 
 
 NNAww7****v"""$7af$7$7"""z&
 
 

 0QZ,//
 
 
 NNAww7****v"""$7af$7$7"""z&
 
 

 0QZ,//
 
 
  NNAww7****v&&&(;!&(;(;&&&z&
 
 
 0QZ,//
 
 
 NNAww7****v====?R?R?R===z&	
 
 
 0QZ,//
 
 
 
 
r3   c                v   dd}t          d          |d<   t          di |}t          d          t          d	          t          d
          t          d	          t          d          t          d          t          d          t          d	          t          d	          t          d          t          d          t          d	          t	          d          g}t          j        |          }t          |          D ]:\  }}|||         k    s)J d |||                    d ||           d            ;dS )a  Page-break behavior has some subtleties.

        * A hard page-break does not generate a PageBreak element (because that would double-count
          it). Word inserts a rendered page-break for the hard break at the effective location.
        * A (rendered) page-break mid-paragraph produces two elements, like `Text, PageBreak, Text`,
          so each Text (subclass) element gets the right page-number.
        * A rendered page-break mid-hyperlink produces two text elements, but the hyperlink itself
          is not split; the entire hyperlink goes on the page where the hyperlink starts, even
          though some of its text appears on the following page. The rest of the paragraph, after
          the hyperlink, appears on the following page.
        * Odd and even-page section starts can lead to two page-breaks, like an odd-page section
          start could go from page 3 to page 5 because 5 is the next odd page.
        r[   r   r.  r%   c                &    | j         j         d|  dS )z?A more detailed `repr()` to aid debugging when assertion fails.z('z'))	__class__rv   )r[   s    r1   str_reprz[Describe_DocxPartitioner.it_places_page_breaks_precisely_where_they_occur.<locals>.str_repr  s    k*33a3333r3   r  r   zsFirst page, tab here:	followed by line-break here:
here:
and here:
no-break hyphen here:-and hard page-break here>>r   z<<Text on second page. The font is big so it breaks onto third page--------------------here-->> <<but break falls inside link so text stays together.zContinuous section break here>>z<<followed by text on same pagezOdd-page section break here>>z9<<producing two page-breaks to get from page-3 to page-5.zQThen text gets big again so a "natural" rendered page break happens again here>> z<<and then more text proceeds.z

Expected: z

Got:      
N)r[   r   r.  r%   r+   )r   r   r   r   r   r   iter_document_elementsr   )rf  r   r  r   expectedr7   r   r[   s           r1   0it_places_page_breaks_precisely_where_they_occurzIDescribe_DocxPartitioner.it_places_page_breaks_precisely_where_they_occur  s|   	4 	4 	4 	4 "22D!E!E	+%22	22 -  bMM 
 bMM;<<;<<9::bMMbMMUVVc  bMM233A!
F $:4@@)) 	 	FC%%%1(3-!8!8 1 1!)!1 1 1 &%%%	 	r3   c                    t          d          |d<   t          di |}t          |          }|j        j        d         }|                    |          }t          |          }|j        dk    sJ d S )Ndocx-hdrftr.docxr   r   z:First header para
Table cell1 Table cell2
Last header parar+   )r   r   r   r   sections_iter_section_headersr}  rY   )rf  r   r   r   sectionheader_iterr0   s          r1   *it_includes_table_cell_text_in_Header_textzCDescribe_DocxPartitioner.it_includes_table_cell_text_in_Header_text  s    !12D!E!E	+%22	22&t,,'03!77@@{##|]]]]]]]r3   c                    t          d          |d<   t          di |}t          |          }|j        j        d         }|                    |          }t          |          }|j        dk    sJ dS )z?This case also verifies nested-table and merged-cell behaviors.r  r   r   zpara1
cell1 a b c d e f
para2Nr+   )r   r   r   r   r  _iter_section_footersr}  rY   )rf  r   r   r   r  footer_iterr0   s          r1   *it_includes_table_cell_text_in_Footer_textzCDescribe_DocxPartitioner.it_includes_table_cell_text_in_Footer_text  s    !12D!E!E	+%22	22&t,,'03!77@@{##|@@@@@@@r3   Nr  )rv   r9  r:  r  r  r  r  r  r  r  r  r  r  r+   r3   r1   r  r    s        MM	
 	
 	
 	

 
 
 
<
 
 
 

 
 
 
.X X X X p0 p0 p0h? ? ? ?F	^ 	^ 	^ 	^
A 
A 
A 
A 
A 
Ar3   r  )r$   r%   r&   r'   )r$   r%   r&   r9   )ra   rb   )r   r   )r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   )r$   r%   r  )r.  r9   )r.  r   )r[  r   r\  r]  r.  r%   )r.  r   )er  
__future__r   r1  rK   pathlibr  r?   typingr   r   r4   rQ   docx.documentr   docx.text.paragraphr   pytest_mockr   test_unstructured.unit_utilsr	   r
   r   r   r   r   r   unstructured.chunking.titler   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   r   r   r   r   unstructured.partition.docxr   r   r    r!   &unstructured.partition.utils.constantsr"   r#   r8   rF   rI   rM   rT   r`   r  r  rh   rk   rp   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r   r$  r*  rC  r  r&   r   r   r   r[  r$   r   rb  r  r+   r3   r1   <module>r     s   ; : " " " " " "  				  				                    " " " " " " ) ) ) ) ) ) # # # # # #                  7 6 6 6 6 6                                               
W 
W 
W 
W5 5 5 5$1 1 1 1) ) ) )  > > > 04-@@I I I A@I> > >? ? ?A A A 	2 	2 	22 2 2E E EL L L
L L LJ J J J6 6 6H H H HH H H" " " ".
9 
9 
9 
9A A A A = = = =&. . . .
S S S SB1 1 1 12B B B B$ $ $$ $ $&5 5 5I I I5 5 5; ; ;Q& Q& Q&h	% 	% 	%      B 
 
 
 
 < < < <              4        .G\ G\ G\ G\ G\ G\ G\ G\TiA iA iA iA iA iA iA iA iA iAr3   