
    Ng4              	         U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:mZm;Z;m<Z< d dl=m>Z> d dl?m@Z@mAZA d dlBmCZCmDZDmEZEmFZFmGZG d dlHmIZI d dlJmKZKmLZL dZMdeNd<   dZOdeNd<   dZPdeNd <   d>d%ZQ G d& d"e          ZR e@e>jS                  e*	 d?dd'd'd(dd)d@d7                        ZT G d8 d9          ZU G d: d;          ZV G d< d=          ZWdS )A    )annotationsN)IOAnyIteratorProtocolType)Document)WD_SECTION_START)CT_Tbl)CT_P)Section_Footer_Header)Table)_Cell_Row)	Hyperlink)RenderedPageBreak)	Paragraph)Run)	TypeAlias)add_chunking_strategy)clean_bullets)htmlify_matrix_of_cell_texts)AddressElementElementMetadataEmailAddressFooterHeaderImageLinkListItemNarrativeText	PageBreakr   TextTitle)FileType)apply_metadataget_last_modified_date)is_bulleted_textis_email_addressis_possible_narrative_textis_possible_titleis_us_city_state_zip)PartitionStrategy)is_temp_file_pathlazypropertydocxstrDETECTION_ORIGINzCT_P | CT_Tblr   BlockElementzParagraph | DocxTable	BlockItempicture_partitionerPicturePartitionerTreturnNonec                :    t                               |            dS )zLSpecify a pluggable sub-partitioner to be used for partitioning DOCX images.N)DocxPartitionerOptionsregister_picture_partitioner)r8   s    W/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/docx.pyr>   r>   ?   s    778KLLLLL    c                  *    e Zd ZdZed
d            Zd	S )r9   a/  Defines the interface for a pluggable sub-partitioner for DOCX Picture objects.

    In Microsoft Word parlance, an image is a "picture". We use that term here for an image in a
    DOCX file both for domain consistency and because it conveniently avoids confusion with an
    `unstructured` `Image` element.

    A picture can be either *inline* or *floating*. An inline picture is treated like a big
    character in the text of a paragraph, moving with the text. A floating picture can be moved
    freely and text flows around it.

    Both inline and floating pictures are defined inside a paragraph in the DOCX file. A paragraph
    can have zero or more pictures. A DOCX picture partitioner takes a `docx` `Paragraph` object
    and generates an `Image` element for each picture found in that paragraph.
    	paragraphr   optsr=   r:   Iterator[Image]c                    dS )z<Generate an `Image` element for each picture in `paragraph`.N clsrB   rC   s      r?   iter_elementsz!PicturePartitionerT.iter_elementsY   s	     	r@   NrB   r   rC   r=   r:   rD   __name__
__module____qualname____doc__classmethodrI   rF   r@   r?   r9   r9   I   s>             [  r@   T   )fileinclude_page_breaksinfer_table_structurestarting_page_numberstrategyfilename
str | NonerR   IO[bytes] | NonerS   boolrT   rU   intrV   kwargsr   list[Element]c                   t                               || ||||          }t                              |          }t	          |          S )a  Partitions Microsoft Word Documents in .docx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        When True, add a `PageBreak` element to the element-stream when a page-break is detected in
        the document. Note that not all DOCX files include page-break information.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    metadata_filename
        The filename to use for the metadata. Relevant because partition_doc converts the document
        to .docx before partition. We want the original source filename in the metadata.
    metadata_last_modified
        The last modified date for the document.
    starting_page_number
        Assign this number to the first page of this document and increment the page number from
        there.
    )rR   	file_pathrS   rT   rU   rV   )r=   load_DocxPartitioneriter_document_elementslist)	rW   rR   rS   rT   rU   rV   r\   rC   elementss	            r?   partition_docxre   d   sR    L "&&/31 '  D  66t<<H>>r@   c                  p   e Zd ZdZdZ	 dddd)dZed*d            Zed+d            Ze	d,d            Z
e	d-d            Zd.dZe	d-d            Ze	d/d            Ze	d/d            Zed0d             Zed1d!            Ze	d2d"            Ze	d3d$            Ze	d-d%            Ze	d4d'            Zd5d(ZdS )6r=   zVEncapsulates partitioning option validation, computation, and application of defaults.NrQ   )rU   rV   rR   rY   r_   rX   rS   rZ   rT   rU   r[   rV   c               Z    || _         || _        || _        || _        || _        || _        d S N)_file
_file_path_include_page_breaks_infer_table_structure	_strategy_page_counter)selfrR   r_   rS   rT   rU   rV   s          r?   __init__zDocxPartitionerOptions.__init__   s7     
#$7!&;#!1r@   r\   r   r:   c                6     | di |                                 S )z#Construct and validate an instance.rF   )	_validate)rH   r\   s     r?   r`   zDocxPartitionerOptions.load   s"     s}}V}}&&(((r@   r8   r9   c                    || _         dS )zKSpecify a pluggable sub-partitioner to extract images from DOCX paragraphs.N)_PicturePartitionerCls)rH   r8   s     r?   r>   z3DocxPartitionerOptions.register_picture_partitioner   s     &9"""r@   r	   c                4    t          j        | j                  S z?The python-docx `Document` object loaded from file or filename.)r3   r	   
_docx_filero   s    r?   documentzDocxPartitionerOptions.document   s     }T_---r@   c                    | j         S )aG  When True, include `PageBreak` elements in element-stream.

        Note that regardless of this setting, page-breaks are detected, and page-number is tracked
        and included in element metadata. Only the presence of distinct `PageBreak` elements (which
        contain no text) in the element stream is affected.
        )rk   rx   s    r?   rS   z*DocxPartitionerOptions.include_page_breaks   s     ((r@   Iterator[PageBreak]c              #  l   K   | xj         dz  c_         | j        rt          dt                    V  dS dS )zGIncrement page-number by 1 and generate a PageBreak element if enabled.rQ    )detection_originN)rn   rk   r%   r5   rx   s    r?   increment_page_numberz,DocxPartitionerOptions.increment_page_number   sR      a$ 	CB1ABBBBBBBB	C 	Cr@   c                    | j         S )zRTrue when partitioner should compute and apply `text_as_html` metadata for tables.)rl   rx   s    r?   rT   z,DocxPartitionerOptions.infer_table_structure   s     **r@   c                h    | j         sdS t          | j                   rdnt          | j                   S )zHThe best last-modified date available, None if no sources are available.N)rj   r1   r*   rx   s    r?   last_modifiedz$DocxPartitionerOptions.last_modified   s=      	4 &do66cDD<RSWSb<c<c	
r@   c                    | j         S )zHThe best available file-path for this document or `None` if unavailable.)rj   rx   s    r?   metadata_file_pathz)DocxPartitionerOptions.metadata_file_path   s     r@   
int | Nonec                "    | j         r| j        ndS )aK  The current page number to report in metadata, or None if we can't really tell.

        Page numbers are not added to element metadata if we can't find any page-breaks in the
        document (which may be a common case).

        In the DOCX format, determining page numbers is strictly a best-efforts attempt since
        actual page-breaks are determined at rendering time (e.g. printing) based on the
        font-metrics of the target device. Explicit (hard) page-breaks are always recorded in the
        docx file but the rendered page-breaks are only added optionally.
        N)_document_contains_pagebreaksrn   rx   s    r?   metadata_page_numberz+DocxPartitionerOptions.metadata_page_number   s     &*%GQt!!TQr@   c                    | j         S )aW  The current page number.

        Note this value may not represent the actual rendered page number when rendered page-break
        indicators are not present in the document (not uncommon). Use `.metadata_page_number` for
        metadata purposes, which is `None` when rendered page-breaks are not present in this
        document.
        )rn   rx   s    r?   page_numberz"DocxPartitionerOptions.page_number   s     !!r@   c                    | j         pt          S )z5The sub-partitioner to use for DOCX image extraction.)rt   _NullPicturePartitionerrx   s    r?   r8   z*DocxPartitionerOptions.picture_partitioner  s     *E.EEr@   r4   c                6    | j         t          j        n| j         S )zThe partitioning strategy for this document.

        One of "hi_res", "fast", and a few others. These are available as class attributes on
        `unstructured.partition.utils.constants.PartitionStrategy` but resolve to str values.
        )rm   r0   HI_RESrx   s    r?   rV   zDocxPartitionerOptions.strategy  s     ,0>+A ''t~Ur@   c                ^    d}t          | j        j                            |                    S )a>  True when there is at least one page-break detected in the document.

        Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably
        inserted by Microsoft Word, but probably don't appear in documents converted into .docx
        format from for example .odt format.
        z./w:body/w:p/w:r/w:lastRenderedPageBreak | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak)rZ   ry   elementxpath)ro   r   s     r?   r   z4DocxPartitionerOptions._document_contains_pagebreaks  s0    V 	 DM)//66777r@   str | IO[bytes]c                   | j         r| j         S t          | j        t          j                  rE| j                            d           t          j        | j                                                  S | j        J | j        S )zThe Word 2007+ document file to be partitioned.

        This is either a `str` path or a file-like object. `python-docx` accepts either for opening
        a document file.
        r   )	rj   
isinstanceri   tempfileSpooledTemporaryFileseekioBytesIOreadrx   s    r?   rw   z!DocxPartitionerOptions._docx_file,  su     ? 	#?"
 dj("?@@ 	1JOOA:djoo//000z%%%zr@   c                   | j         rt          j                            | j                   s$t	          dt          | j                              t          j        | j                   s$t          dt          | j                              nT| j	        r>t          j        | j	                  s$t          dt          | j	                             nt          d          | S )z6Raise on first invalide option, return self otherwise.zno such file or directory: z(not a ZIP archive (so not a DOCX file): zQno DOCX document specified, either `filename` or `file` argument must be provided)
rj   ospathisfileFileNotFoundErrorreprzipfile
is_zipfile
ValueErrorri   rx   s    r?   rr   z DocxPartitionerOptions._validate@  s     ? 	7>>$/22 _'(]d4?F[F[(](]^^^%do66 e !cDQUQ`LaLa!c!cdddeZ 	%dj11 ` !^DQUQ[L\L\!^!^___` c   r@   )rR   rY   r_   rX   rS   rZ   rT   rZ   rU   r[   rV   rX   )r\   r   r:   r=   )r8   r9   r:   r	   r:   rZ   )r:   r{   )r:   rX   )r:   r   r:   r[   )r:   r9   )r:   r4   )r:   r   )r:   r=   )rL   rM   rN   rO   rt   rp   rP   r`   r>   r2   ry   rS   r   rT   r   r   propertyr   r   r8   rV   r   rw   rr   rF   r@   r?   r=   r=      s#       ``!  %&#2 2 2 2 2 2$ ) ) ) [) 9 9 9 [9 . . . \. ) ) ) \)C C C C + + + \+ 
 
 
 \
    \ R R R XR " " " X" F F F \F V V V \V 8 8 8 \8(    \&     r@   r=   c                  "   e Zd ZdZd=dZed>d            Zd?d	Zd?d
Zd@dZ	dAdZ
edBd            ZedCd            ZdDdZdEdZd@dZdFdZdGdZdHd"ZdId$ZdJd(ZdKd*ZdLd+ZdMd-ZdNd/ZdOd1ZdPd3ZdQd4ZdRd5ZdSd7ZdTd9ZdTd:Z dUd;Z!d<S )Vra   z8Provides `.partition()` for MS-Word 2007+ (.docx) files.rC   r=   r:   r;   c                    || _         d S rh   )_opts)ro   rC   s     r?   rp   z_DocxPartitioner.__init__V  s    


r@   Iterator[Element]c                v     | |          }|j         r|                                n|                                S )zFPartition MS Word documents (.docx format) into its document elements.)_document_contains_sections_iter_document_elements#_iter_sectionless_document_elements)rH   rC   ro   s      r?   rb   z'_DocxPartitioner.iter_document_elementsY  sC     s4yy /<D((***99;;	
r@   c              #    K   t          | j        j                  D ]\  }}|                     ||          E d{V  |                     |          E d{V  |                                D ]c}t          |t                    r|                     |          E d{V  3t          |t                    r| 
                    |          E d{V  d|                     |          E d{V  dS )zFGenerate each document-element in (docx) `document` in document order.N)	enumerate	_documentsections_iter_section_page_breaks_iter_section_headersiter_inner_contentr   r   _iter_paragraph_elements	DocxTable_iter_table_element_iter_section_footers)ro   section_idxsection
block_items       r?   r   z(_DocxPartitioner._iter_document_elementsg  sF      %.dn.E$F$F 	; 	; K55k7KKKKKKKKK11':::::::::%88:: D D
 j)44 D#<<ZHHHHHHHHHH	  D  $77
CCCCCCCCC11'::::::::::	; 	;r@   c              #    K   | j                                         D ]c}t          |t                    r|                     |          E d{V  3t          |t
                    r|                     |          E d{V  ddS )zGenerate each document-element in a docx `document` that has no sections.

        A "section-less" DOCX must be iterated differently. Also it will have no headers or footers
        (because those live in a section).
        N)r   r   r   r   r   r   r   )ro   r   s     r?   r   z4_DocxPartitioner._iter_sectionless_document_elements  s       .;;== 	@ 	@J*i00 @88DDDDDDDDDDJ	22 @33J?????????	@ 	@r@   rB   r   c              #  ^  K   d                     d |j                            d          D                       }|                                sdS |                     |          }|                     |          r>t          |                                          }|rt          ||t                    V  dS | 	                    |          }|r |||t                    V  dS | 
                    |          }|r |||t                    V  dS t          ||t                    V  dS )a  Generate zero-or-one document element for `paragraph`.

        In Word, an empty paragraph is commonly used for inter-paragraph spacing. An empty paragraph
        does not contribute to the document-element stream and will not cause an element to be
        emitted.
        r}   c              3  $   K   | ]}|j         V  d S rh   text.0es     r?   	<genexpr>zB_DocxPartitioner._classify_paragraph_to_element.<locals>.<genexpr>  s8       
 
 F
 
 
 
 
 
r@   zJw:r | w:hyperlink | w:r/descendant::wp:inline[ancestor::w:drawing][1]//w:rN)r   metadatar~   )r   r~   )join_pr   strip_paragraph_metadata_is_list_itemr   r#   r5   _style_based_element_type&_parse_paragraph_text_for_element_typer&   )ro   rB   r   r   
clean_text
TextSubClss         r?   _classify_paragraph_to_elementz/_DocxPartitioner._classify_paragraph_to_element  s      ww 
 
\''\ 
 
 
 
 
 zz|| 	F++I66 i(( 	&t,,2244J #%%5     
 F 33I>>
 	*$L\]]]]]]F @@KK
 	*$L\]]]]]]F 4(=MNNNNNNNNr@   tabler   r4   c                ^    d
fddfdt          fd	|j        D                       S )a  HTML string version of `table`.

        Example:

            <table>
            <tbody>
            <tr><th>item  </th><th style="text-align: right;">  qty</th></tr>
            <tr><td>spam  </td><td style="text-align: right;">   42</td></tr>
            <tr><td>eggs  </td><td style="text-align: right;">  451</td></tr>
            <tr><td>bacon </td><td style="text-align: right;">    0</td></tr>
            </tbody>
            </table>

        `is_nested` is used for recursive calls when a nested table is encountered. Certain
        behaviors are different in that case, but the caller can safely ignore that parameter and
        allow it to take its default value.
        cellr   r:   Iterator[str]c              3     K   |                                  D ]U}t          |x}t                    r
|j        V  #t          |x}t                    r|j        D ]} |          E d{V  VdS )zGenerate the text of each paragraph or table in `cell` as a separate string.

            A table nested in `cell` is converted to the normalized text it contains.
            N)r   r   r   r   r   rows)r   r   rB   r   rowiter_row_cells_as_texts        r?   iter_cell_block_itemszF_DocxPartitioner._convert_table_to_html.<locals>.iter_cell_block_items  s      
 #5577 ? ?
:5iyAA ? $.(((( 3Y?? ?$z ? ?#9#9##>#>>>>>>>>>? ?r@   r   r   c              3    K   t          | j                  D ]}dV  | j        D ]I}d                     |                    }d                    |                                          V  Jt          | j                  D ]}dV  dS )aB  Generate the normalized text of each cell in `row` as a separate string.

            The text of each paragraph within a cell is not separated. A table nested in a cell is
            converted to a normalized string of its contents and combined with the text of the
            cell that contains the table.
            r}    N)rangegrid_cols_beforecellsr   splitgrid_cols_after)r   _r   	cell_textr   s       r?   r   zG_DocxPartitioner._convert_table_to_html.<locals>.iter_row_cells_as_text  s       3/00  	 2 2HH%:%:4%@%@AA	hhy00111111 3.//   r@   c                @    g | ]}t           |                    S rF   )rc   )r   rr   s     r?   
<listcomp>z;_DocxPartitioner._convert_table_to_html.<locals>.<listcomp>  s.    ,a,a,aQRT2H2H2K2K-L-L,a,a,ar@   r   r   r:   r   )r   r   r:   r   )r   r   )ro   r   r   r   s     @@r?   _convert_table_to_htmlz'_DocxPartitioner._convert_table_to_html  si    &	? 	? 	? 	? 	? 	?	 	 	 	 	 	( ,,a,a,a,aV[V`,a,a,abbbr@   r	   c                    | j         j        S rv   )r   ry   rx   s    r?   r   z_DocxPartitioner._document  s     z""r@   rZ   c                4    t          | j        j                  S )a2  True when there is at least one section in the document.

        This is always true for a document produced by Word, but may not always be the case when the
        document results from conversion or export. In particular, a Microsoft Teams chat-transcript
        export will have no sections.
        )rZ   r   r   rx   s    r?   r   z,_DocxPartitioner._document_contains_sections  s     DN+,,,r@   hdrftr_Header | _Footerc                `     d fd}d                     d  ||          D                       S )	am  The text enclosed in `hdrftr` as a single string.

        Each paragraph is included along with the text of each table cell. Empty text is omitted.
        Each paragraph text-item is separated by a newline ("
") although note that a paragraph
        that contains a line-break will also include a newline representing that line-break, so
        newlines do not necessarily distinguish separate paragraphs.

        The entire text of a table is included as a single string with a space separating the text
        of each cell.

        A header with no text or only whitespace returns the empty string ("").
        r   r   r:   r   c              3    K   |                                  D ]r}t          |t                    r|j                                        V  3t          |t
                    r*d                                        |                    V  sdS )zGenerate each text item in `hdrftr` stripped of leading and trailing whitespace.

            This includes paragraphs as well as table cell contents.
            r   N)r   r   r   r   r   r   r   _iter_table_texts)r   r   ro   s     r?   iter_hdrftr_textsz?_DocxPartitioner._header_footer_text.<locals>.iter_hdrftr_texts  s      
 %7799 G G
j)44 G$///111111	  G ((4#9#9*#E#EFFFFFG Gr@   
c              3     K   | ]}||V  	d S rh   rF   r   r   s     r?   r   z7_DocxPartitioner._header_footer_text.<locals>.<genexpr>#  s'      LL$tLLLLLLLr@   )r   r   r:   r   )r   )ro   r   r   s   `  r?   _header_footer_textz$_DocxPartitioner._header_footer_text  sV    	G 	G 	G 	G 	G 	G yyLL*;*;F*C*CLLLLLLr@   c                J    t          |j                  rdS d|j        j        v S )z7True when `paragraph` can be identified as a list-item.Tz	<w:numPr>)r+   r   r   xml)ro   rB   s     r?   r   z_DocxPartitioner._is_list_item%  s)    IN++ 	4il...r@   c              #  
  K   dfd |          D ]m}t          |t                    r7|                     |          E d{V  |                     |          E d{V  N| j                                        E d{V  ndS )zGenerate zero-or-more document elements for `paragraph`.

        The generated elements can be both textual elements and PageBreak elements. An empty
        paragraph produces no elements.
        rB   r   r:   'Iterator[Paragraph | RenderedPageBreak]c              3     K   | j         s| V  dS | j        d         }|j        }|r|V  |V  |j        }|r |          E d{V  dS dS )an  Generate Paragraph and RenderedPageBreak items from `paragraph`.

            Each generated paragraph is the portion of the paragraph on the same page. When the
            paragraph contains no page-breaks, it is iterated unchanged and iteration stops. When
            there is a page-break, in general there one paragraph "fragment" before the page break,
            the page break, and then the fragment after the page break. However many combinations
            are possible. The first item can be either a page-break or a paragraph, but the type
            always alternates throughout the sequence.
            Nr   )contains_page_breakrendered_page_breakspreceding_paragraph_fragmentfollowing_paragraph_fragment)rB   
page_breakr   r   iter_paragraph_itemss       r?   r   zG_DocxPartitioner._iter_paragraph_elements.<locals>.iter_paragraph_items3  s       0 "7:J ,6+R(+ 32222
 ,6+R( , N//0LMMMMMMMMMMMN Nr@   N)rB   r   r:   r   )r   r   r   _iter_paragraph_imagesr   r   )ro   rB   itemr   s      @r?   r   z)_DocxPartitioner._iter_paragraph_elements,  s      	N 	N 	N 	N 	N 	NB )(33 	> 	>D$	** >>>tDDDDDDDDD66t<<<<<<<<<<:;;==========	> 	>r@   Iterator[dict[str, str]]c              #     K   |j         D ]C}|j        r|j                                        nd}|s'|j        r|ddV  |j        r|ddV  DdS )zLGenerate e.g. {"text": "MUST", "tag": "b"} for each emphasis in `paragraph`.r}   b)r   tagiN)runsr   r   bolditalic)ro   rB   runr   s       r?   _iter_paragraph_emphasisz)_DocxPartitioner._iter_paragraph_emphasis[  s      > 	1 	1C'*x738>>###RD x 1#C00000z 1#C00000	1 	1r@   rD   c              #  d   K   | j         j        }|                    || j                   E d{V  dS )zRGenerate `Image` element for each picture shape in `paragraph` when so configured.N)r   r8   rI   )ro   rB   PicturePartitionerClss      r?   r   z'_DocxPartitioner._iter_paragraph_imagesf  sC       !%
 >(66y$*MMMMMMMMMMMr@   r   r   Iterator[Footer]c              #      K   d fd} ||j         d          E d	{V  |j        r ||j        d
          E d	{V   j        j        j        r ||j        d          E d	{V  d	S d	S )aO  Generate any `Footer` elements defined for this section.

        A Word document has up to three header and footer definition pairs for each document
        section, a primary, first-page, and even-page header and footer. The first-page pair
        applies only to the first page of the section (perhaps a title page or chapter start). The
        even-page pair is used in book-bound documents where there are both recto and verso pages
        (it is applied to verso (even-numbered) pages). A page where neither more specialized
        footer applies uses the primary footer.
        footerr   header_footer_typer4   r:   r  c           	   3     K   | j         rdS                     |           }|sdS t          |t          t	          j        j        |d                    V  dS )z2Generate zero-or-one Footer elements for `footer`.Nr   rW   r  category_depthr   r~   r   )is_linked_to_previousr   r   r5   r   r   r   )r  r  r   ro   s      r?   iter_footerz;_DocxPartitioner._iter_section_footers.<locals>.iter_footerx        + ++F33D !1(!Z:'9#$         r@   primaryN
first_page	even_page)r  r   r  r4   r:   r  )r  "different_first_page_header_footerfirst_page_footerr   settings odd_and_even_pages_header_footereven_page_footer)ro   r   r  s   `  r?   r   z&_DocxPartitioner._iter_section_footersm  s      	 	 	 	 	 	" ;w~y9999999995 	L"{7#<lKKKKKKKKK>"C 	J"{7#;[IIIIIIIIIII	J 	Jr@   Iterator[Header]c              #      K   d fd} ||j         d          E d	{V  |j        r ||j        d
          E d	{V   j        j        j        r ||j        d          E d	{V  d	S d	S )zGenerate `Header` elements for this section if it has them.

        See `._iter_section_footers()` docstring for more on docx headers and footers.
        headerr   r  r4   r:   r!  c           	   3     K   | j         rdS                     |           }|sdS t          |t          t	          j        j        |d                    V  dS )z2Generate zero-or-one Header elements for `header`.Nr   r  r  )r  r   r    r5   r   r   r   )r#  r  r   ro   s      r?   maybe_iter_headerzA_DocxPartitioner._iter_section_headers.<locals>.maybe_iter_header  r  r@   r  Nr  r  )r#  r   r  r4   r:   r!  )r#  r  first_page_headerr   r  r  even_page_header)ro   r   r%  s   `  r?   r   z&_DocxPartitioner._iter_section_headers  s      	 	 	 	 	 	" %$W^Y?????????5 	R(()BLQQQQQQQQQ>"C 	P(()A;OOOOOOOOOOO	P 	Pr@   r   r[   r{   c              #     K   d fd}|j         }|t          j        k    r* |            s j                                        E d{V  nA|t          j        k    r1|dk    rdS  |            r j                                        E d{V  dS )a  Generate zero-or-one `PageBreak` document elements for `section`.

        A docx section has a "start" type which can be "continuous" (no page-break), "nextPage",
        "evenPage", or "oddPage". For the next, even, and odd varieties, a `w:renderedPageBreak`
        element signals one page break. Here we only need to handle the case where we need to add
        another, for example to go from one odd page to another odd page and we need a total of
        two page-breaks.
        r:   rZ   c                 *     j         j        dz  dk    S )N   rQ   )r   r   rx   s   r?   page_is_oddz?_DocxPartitioner._iter_section_page_breaks.<locals>.page_is_odd  s    :)A-22r@   Nr   r   )
start_typer
   	EVEN_PAGEr   r   ODD_PAGE)ro   r   r   r+  r,  s   `    r?   r   z*_DocxPartitioner._iter_section_page_breaks  s      	3 	3 	3 	3 	3 	3 '
 )333 ;== >:;;=========+444a{}} >:;;========= 	r@   Iterator[Table]c              #  l  K   | j         j        r|                     |          nd}d                    |                     |                    }|                     |          \  }}t          |t          t          || j         j	        | j         j
        | j         j        |pd|pd                    V  dS )zBGenerate zero-or-one Table element for a DOCX `w:tbl` XML element.Nr   )text_as_htmlrW   r   r   emphasized_text_contentsemphasized_text_tags)r~   r   )r   rT   r   r   r   _table_emphasisr   r5   r   r   r   r   )ro   r   
html_table
text_tabler2  r3  s         r?   r   z$_DocxPartitioner._iter_table_element  s      
 37*2R\D''...X\ 	 XXd44U;;<<
9=9M9Me9T9T6 "6-$'6 J;"j6)A)IT%9%AT  
 
 
 	
 	
 	
 	
 	
r@   c              #  |   K   |j         D ]1}|j        D ]'}|j        D ]}|                     |          E d{V  (2dS )zHGenerate e.g. {"text": "word", "tag": "b"} for each emphasis in `table`.N)r   r   
paragraphsr  )ro   r   r   r   rB   s        r?   _iter_table_emphasisz%_DocxPartitioner._iter_table_emphasis  s      : 	H 	HC	 H H!% H HI#<<YGGGGGGGGGGHH	H 	Hr@   r   c           	   #      K   d	 fd}|j         D ]H}|j        }|j        D ]7}|j        dk    rd  |t	          ||                    D             E d{V  8IdS )
a  Generate text of each cell in `table` stripped of leading and trailing whitespace.

        Nested tables are recursed into and their text contributes to the output in depth-first
        pre-order. Empty strings due to empty or whitespace-only cells are dropped.
        r   r   r:   r   c              3     K   |                                  D ]c}t          |t                    r|j                                        V  3t          |t
                    r                    |          E d{V  ddS )zGenerate each text item in `cell` stripped of leading and trailing whitespace.

            This includes paragraphs as well as table cell contents.
            N)r   r   r   r   r   r   r   )r   r   ro   s     r?   iter_cell_textsz;_DocxPartitioner._iter_table_texts.<locals>.iter_cell_texts  s      
 #5577 B B
j)44 B$///111111	  B  $55jAAAAAAAAAB Br@   continuec              3     K   | ]}||V  	d S rh   rF   r   s     r?   r   z5_DocxPartitioner._iter_table_texts.<locals>.<genexpr>  s(      WWTRVWDWWWWWWr@   Nr   )r   _trtc_lstvMerger   )ro   r   r<  r   trtcs   `     r?   r   z"_DocxPartitioner._iter_table_texts  s      	B 	B 	B 	B 	B 	B : 	X 	XCBi X X9
**WW__U2u=M=M-N-NWWWWWWWWWWWX	X 	Xr@   tuple[list[str], list[str]]c                    t          j        |                     |                    \  }}d |D             d |D             fS )z@[contents, tags] pair describing emphasized text in `paragraph`.c                    g | ]
}|d          S r   rF   r   s     r?   r   z8_DocxPartitioner._paragraph_emphasis.<locals>.<listcomp>  s    000q6000r@   c                    g | ]
}|d          S r  rF   r   s     r?   r   z8_DocxPartitioner._paragraph_emphasis.<locals>.<listcomp>  s    2S2S2S1U82S2S2Sr@   )	itertoolsteer  )ro   rB   iter_p_emphiter_p_emph_2s       r?   _paragraph_emphasisz$_DocxPartitioner._paragraph_emphasis  sM    %.]43P3PQZ3[3[%\%\"]00K0002S2S]2S2S2STTr@   'tuple[list[str], list[str], list[Link]]c                    j         sg g g fS dfd}t           |                      }d |D             }d |D             }|||fS )z,Describes hyperlinks in `paragraph`, if any.r:   Iterator[Link]c               3  :  K   d}                                  D ]}t          |t                    r| t          |j                  z  } /t          |t
                    r<|j        }|j        }| }| t          |          z  } |si|slt          |||          V  dS )zGenerate `Link` typed-dict for each external link in `paragraph`.

            Word uses hyperlinks for internal "jumps" within the document, as well as for web and
            other external locations. Only generate the external ones.
            r   )r   urlstart_indexN)r   r   r   lenr   r   rR  r"   )offsetr  r   rR  rS  rB   s        r?   iter_paragraph_linkszC_DocxPartitioner._paragraph_link_meta.<locals>.iter_paragraph_links  s       F!4466 L LdC(( Lc$)nn,FFi00 L9D(C"(Kc$ii'F
  !    ! Dc{KKKKKK'L Lr@   c                "    g | ]}|d          pdS )r   r}   rF   r   links     r?   r   z9_DocxPartitioner._paragraph_link_meta.<locals>.<listcomp>7  s!    ;;;Td6l(b;;;r@   c                    g | ]
}|d          S )rR  rF   rX  s     r?   r   z9_DocxPartitioner._paragraph_link_meta.<locals>.<listcomp>8  s    333TT%[333r@   )r:   rP  )
hyperlinksrc   )ro   rB   rV  links
link_texts	link_urlss    `    r?   _paragraph_link_metaz%_DocxPartitioner._paragraph_link_meta  s    # 	r2:	L 	L 	L 	L 	L 	L8 ))++,, <;U;;;
33U333	9e++r@   r   c                    |                      |          }|                     |          \  }}|                     |          \  }}}t          ||pd|pd| j        j        | j        j        |pd|pd|pd| j        j        	  	        }d|_        |S )z.ElementMetadata object describing `paragraph`.N)	r  r2  r3  rW   r   r]  r^  r\  r   r3   )	_parse_category_depth_by_stylerM  r_  r   r   r   r   r   r~   )	ro   rB   r  r2  r3  r]  r^  r\  element_metadatas	            r?   r   z$_DocxPartitioner._paragraph_metadata;  s    <<YGG9=9Q9QR[9\9\6 "6'+'@'@'K'K$
Iu*)%=%E!5!=Z2*2!)T'4-4
7

 

 

 -3)r@   c                
   |j                             d          }|r"t          t          |d                             S |j        r|j        j        pd}|                     |          }|dk    r|S |                                 S )z0Determine category depth from paragraph metadataz./w:pPr/w:numPr/w:ilvl/@w:valr   Normal)_elementr   roundfloatstylename#_parse_category_depth_by_style_name#_parse_category_depth_by_style_ilvl)ro   rB   r   
style_namedepths        r?   ra  z/_DocxPartitioner._parse_category_depth_by_styleN  s     "(()HII 	*uQx)))  o>)/*>K8
88DD199L ;;===r@   c                    dS )Nr   rF   rx   s    r?   rk  z4_DocxPartitioner._parse_category_depth_by_style_ilvl`  s    qr@   rl  c                    dd}                     d          r |          S dk    rdS g d	}t          fd
|D                       r |          S dS )zParse category-depth from the style-name of `paragraph`.

        Category depth is 0-indexed and relative to the other element types in the document.
        suffixr4   r:   r[   c                    |                                  d                                         r*t          |                                  d                   dz
  ndS )NrQ   r   )r   isdigitr[   )rp  s    r?   _extract_numberzM_DocxPartitioner._parse_category_depth_by_style_name.<locals>._extract_numberj  sF    28,,..2D2L2L2N2NU3v||~~b)**Q..TUUr@   HeadingSubtitlerQ   )ListList BulletList ContinueList Numberc              3  B   K   | ]}                     |          V  d S rh   )
startswith)r   prefixrl  s     r?   r   zG_DocxPartitioner._parse_category_depth_by_style_name.<locals>.<genexpr>v  s1      IIz$$V,,IIIIIIr@   r   )rp  r4   r:   r[   )r|  any)ro   rl  rt  list_prefixess    `  r?   rj  z4_DocxPartitioner._parse_category_depth_by_style_named  s    	V 	V 	V 	V   ++ 	/"?:...##1 POOIIII=IIIII 	/"?:... qr@   Type[Text] | Nonec                   |j                                         }t          |          dk     rdS t          |          rt          S t          |          rt          S t          |          rt          S t          |          rt          S dS )zEAttempt to differentiate the element-type by inspecting the raw text.r*  N)r   r   rT  r/   r   r,   r   r-   r$   r.   r'   )ro   rB   r   s      r?   r   z7_DocxPartitioner._parse_paragraph_text_for_element_type|  s    ~##%%t99q==4%% 	ND!! 	 %d++ 	!  T"" 	Ltr@   c                   i dt           dt          dt          dt          dt          dt          dt          dt          d	t          d
t          dt           dt          dt          dt          dt          dt          dt          t          t          t          t          t          t          t          t           t           t           t          t          t          d}|j        r|j        j        pd}|                    |          S )zElement-type for `paragraph` based on its paragraph-style.

        Returns `None` when the style doesn't tell us anything useful, including when it
        is the default "Normal" style.
        Captionz	Heading 1z	Heading 2z	Heading 3z	Heading 4z	Heading 5z	Heading 6z	Heading 7z	Heading 8z	Heading 9zIntense Quoterw  zList 2zList 3rx  zList Bullet 2zList Bullet 3)ry  zList Continue 2zList Continue 3rz  zList Number 2zList Number 3zList Paragraphz
Macro Textz
No SpacingQuoterv  
TOCHeadingr'   rd  )r&   r'   r#   rh  ri  get)ro   rB   STYLE_TO_ELEMENT_MAPPINGrl  s       r?   r   z*_DocxPartitioner._style_based_element_type  s9   $
t$
$
 $
 	$

 $
 $
 $
 $
 $
 $
 T$
 H$
 h$
 h$
 8$
  X!$
" X#$
$ &''#%%&=$
 $
 $
 F  o>)/*>K8
 (++J777r@   c                    t          j        |                     |                    \  }}d |D             d |D             fS )z<[contents, tags] pair describing emphasized text in `table`.c                    g | ]
}|d          S r   rF   r   s     r?   r   z4_DocxPartitioner._table_emphasis.<locals>.<listcomp>  s    222q6222r@   c                    g | ]
}|d          S rH  rF   r   s     r?   r   z4_DocxPartitioner._table_emphasis.<locals>.<listcomp>  s    4W4W4W!QuX4W4W4Wr@   )rI  rJ  r9  )ro   r   iter_tbl_emphiter_tbl_emph_2s       r?   r4  z _DocxPartitioner._table_emphasis  sM    )2t7P7PQV7W7W)X)X&22M2224W4W4W4W4WXXr@   N)rC   r=   r:   r;   )rC   r=   r:   r   )r:   r   )rB   r   r:   r   )r   r   r:   r4   r   r   )r   r   r:   r4   )rB   r   r:   rZ   )rB   r   r:   r  )rB   r   r:   rD   )r   r   r:   r  )r   r   r:   r!  )r   r[   r   r   r:   r{   )r   r   r:   r/  )r   r   r:   r  )r   r   r:   r   )rB   r   r:   rD  )rB   r   r:   rN  )rB   r   r:   r   )rB   r   r:   r[   r   )rl  r4   r:   r[   )rB   r   r:   r  )r   r   r:   rD  )"rL   rM   rN   rO   rp   rP   rb   r   r   r   r   r2   r   r   r   r   r   r  r   r   r   r   r   r9  r   rM  r_  r   ra  rk  rj  r   r   r4  rF   r@   r?   ra   ra   S  s       BB    
 
 
 [
; ; ; ;:@ @ @ @.O .O .O .O`5c 5c 5c 5cn # # # \# - - - \-M M M M</ / / /-> -> -> ->^	1 	1 	1 	1N N N N J  J  J  JDP P P P:$ $ $ $L
 
 
 
.H H H HX X X X<U U U U
&, &, &, &,P       &> > > >$      0   "08 08 08 08dY Y Y Y Y Yr@   ra   c                  *    e Zd ZdZed
d            Zd	S )r   zWDoes not parse the provided paragraph for pictures and generates zero `Image` elements.rB   r   rC   r=   r:   rD   c              #  
   K   dS )zNo-op picture partitioner.NrF   rG   s      r?   rI   z%_NullPicturePartitioner.iter_elements  s       	r@   NrJ   rK   rF   r@   r?   r   r     s8        aa   [  r@   r   )r8   r9   r:   r;   rh   )rW   rX   rR   rY   rS   rZ   rT   rZ   rU   r[   rV   rX   r\   r   r:   r]   )X
__future__r   r   rI  r   r   r   typingr   r   r   r   r   r3   docx.documentr	   docx.enum.sectionr
   docx.oxml.tabler   docx.oxml.text.paragraphr   docx.sectionr   r   r   
docx.tabler   r   r   r   docx.text.hyperlinkr   docx.text.pagebreakr   docx.text.paragraphr   docx.text.runr   typing_extensionsr   unstructured.chunkingr   unstructured.cleaners.corer   unstructured.common.html_tabler   unstructured.documents.elementsr   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   unstructured.file_utils.modelr(   &unstructured.partition.common.metadatar)   r*    unstructured.partition.text_typer+   r,   r-   r.   r/   &unstructured.partition.utils.constantsr0   unstructured.utilsr1   r2   r5   __annotations__r6   r7   r>   r9   DOCXre   r=   ra   r   rF   r@   r?   <module>r     sL   # " " " " " " 				     				   4 4 4 4 4 4 4 4 4 4 4 4 4 4  " " " " " " . . . . . . " " " " " " ) ) ) ) ) ) 2 2 2 2 2 2 2 2 2 2 ) ) ) ) ) ) " " " " " " " " ) ) ) ) ) ) 1 1 1 1 1 1 ) ) ) ) ) )       ' ' ' ' ' ' 7 7 7 7 7 7 4 4 4 4 4 4 G G G G G G                                 3 2 2 2 2 2 Y Y Y Y Y Y Y Y              E D D D D D > > > > > > > >     ) ) ) ) ).	 . . . .M M M M    (   6 / " $"& !/ / / / /  /dx x x x x x x xvo	Y o	Y o	Y o	Y o	Y o	Y o	Y o	Yn         r@   