
    NgpT              
      &   d Z ddlmZ ddlZddlmZ ddlmZmZm	Z	m
Z
mZ ddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z; dZ<d8dZ= G d de
          Z> e1e/j?                  e 	 d9ddddde8j@        d d:d1                        ZA G d2 d3          ZB G d4 d5          ZC G d6 d7          ZDdS );zPartitioner for PPTX documents.

PPTX files are PowerPoint 2007+ documents. These are XML-based and "open" (documented ISO standard),
unlike the `.ppt` format which was binary and proprietary.
    )annotationsN)SpooledTemporaryFile)IOAnyIteratorProtocolSequence)Presentation)Shape)	BaseShape)GraphicFrame)
GroupShape)Picture)_BaseGroupShapes)Slide)
_Paragraph)add_chunking_strategy)	HtmlTablehtmlify_matrix_of_cell_texts)	ElementElementMetadataEmailAddressListItemNarrativeText	PageBreakTableTextTitle)FileType)apply_metadataget_last_modified_date)is_email_addressis_possible_narrative_textis_possible_title)PartitionStrategy)is_temp_file_pathlazypropertypptxpicture_partitionerAbstractPicturePartitionerreturnNonec                :    t                               |            dS zLSpecify a pluggable sub-partitioner to be used for partitioning PPTX images.N)PptxPartitionerOptionsregister_picture_partitioner)r)   s    W/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/pptx.pyr0   r0   2   s    778KLLLLL    c                  *    e Zd ZdZed
d            Zd	S )r*   a  Defines the interface for a pluggable sub-partitioner for PPTX Picture objects.

    A PPTX Picture object generally contains an image (e.g. JPG, PNG) but can also contain other
    media types like a video or sound file. The interface classmethod generates zero-or-more
    elements from the specified Picture object. If the media in the picture object is not supported
    then it will silently return without generating any elements.
    picturer   optsr/   r+   Iterator[Element]c                    dS )zHGenerate document elements derived from `picture`, a PPTX Picture shape.N clsr4   r5   s      r1   iter_elementsz(AbstractPicturePartitioner.iter_elementsE   s	     	r2   Nr4   r   r5   r/   r+   r6   __name__
__module____qualname____doc__classmethodr;   r8   r2   r1   r*   r*   <   s>             [  r2   T   )fileinclude_page_breaksinclude_slide_notesinfer_table_structurestarting_page_numberstrategyfilename
str | NonerD   IO[bytes] | NonerE   boolrF   bool | NonerG   rH   intrI   strkwargsr   list[Element]c          	     |    t          || |||||          }t          t                              |                    S )aL  Partition PowerPoint document in .pptx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, includes a PageBreak element between slides
    include_slide_notes
        If True, includes the slide notes as element
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    starting_page_number
        Indicates what page number should be assigned to the first slide in the presentation.
        This information will be reflected in elements' metadata and can be be especially
        useful when partitioning a document that is part of a larger document.
    )rD   	file_pathrE   rF   rG   rI   rH   )r/   list_PptxPartitioneriter_presentation_elements)	rJ   rD   rE   rF   rG   rH   rI   rQ   r5   s	            r1   partition_pptxrX   P   sM    H "//31  D  ;;DAABBBr2   c                      e Zd ZdZd"dZed#d            Zd$dZd%dZd&dZ	d'dZ
d(dZd)dZd(dZd*dZed+d            Zd,d Zd!S )-rV   z;Provides `.partition()` for PowerPoint 2007+ (.pptx) files.r5   r/   c                    || _         d S N)_opts)selfr5   s     r1   __init__z_PptxPartitioner.__init__   s    


r2   r+   r6   c                <     | |                                           S )zFPartition MS Word documents (.docx format) into its document elements.)_iter_presentation_elements)r:   r5   s     r1   rW   z+_PptxPartitioner.iter_presentation_elements   s     s4yy44666r2   c              #  f  K   | j         j        D ] }| j                                        E d{V  |                     |          E d{V  |                     |          \  }}|D ]}|j        r3t          |t                    sJ | 	                    |          E d{V  <|j
        rUt          |t                    sJ ||k    r|                     |          E d{V  ||                     |          E d{V  t          |t                    r|                     |          E d{V  ɐ"dS )zAGenerate each document-element in presentation in document order.N)_presentationslidesr\   increment_page_number_iter_maybe_slide_notes_order_shapes	has_table
isinstancer   _iter_table_elementhas_text_framer   _iter_title_shape_element_iter_shape_elementsr   _iter_picture_elements)r]   slidetitle_shapeshapesshapes        r1   r`   z,_PptxPartitioner._iter_presentation_elements   s      '. 	B 	BEz7799999999933E:::::::::"&"4"4U";";K B B? 
B%e\:::::#77>>>>>>>>>>) B%eU33333++#'#A#A%#H#HHHHHHHHH#'#<#<U#C#CCCCCCCCCw// B#::5AAAAAAAAAB	B 	Br2   	paragraphr   rM   c                P    t          |j                            d                    S )zTrue when `paragraph` has a bullet-charcter prefix.

        Bullet characters in the openxml schema are represented by buChar.
        z./a:pPr/a:buChar)rM   _pxpath)r]   rr   s     r1   _is_bulleted_paragraphz'_PptxPartitioner._is_bulleted_paragraph   s#     IL&&'9::;;;r2   rn   r   Iterator[NarrativeText]c              #     K   | j         j        sdS |j        sdS |j        }|j        }|sdS |j                                        }|sdS t          || j                                         t                    V  dS )z?Generate zero-or-one NarrativeText element for the slide-notes.Ntextmetadatadetection_origin)
r\   rF   has_notes_slidenotes_slidenotes_text_framerz   stripr   text_metadataDETECTION_ORIGIN)r]   rn   r~   r   
notes_texts        r1   re   z(_PptxPartitioner._iter_maybe_slide_notes   s       z- 	F $ 	F'&7   	F%*0022
  	Fdj&>&>&@&@Sc
 
 
 	
 	
 	
 	
 	
r2   r4   r   c              #  d   K   | j         j        }|                    || j                   E d{V  dS )z6Generate elements derived from the image in `picture`.N)r\   r)   r;   )r]   r4   PicturePartitionerClss      r1   rm   z'_PptxPartitioner._iter_picture_elements   sC       !%
 >(66w
KKKKKKKKKKKr2   rq   r   c              #    K   |                      |          rdS |j        j        D ](}|j        }|                                dk    r#|j        pd}| j                            |          }|                     |          rt          ||t                    V  vt          |          rt          |t                    V  t          |          rt          ||t                    V  t          |          r9| j                            |dz             }t!          ||t                    V  t#          ||t                    V  *dS )z?Generate Text or subtype element for each paragraph in `shape`.N r   category_depthry   rz   r|   rC   )_shape_is_off_slide
text_frame
paragraphsrz   r   levelr\   r   rv   r   r   r"   r   r#   r   r$   r   r   )r]   rq   rr   rz   r   r{   s         r1   rl   z%_PptxPartitioner._iter_shape_elements   s     ##E** 	F)4 	\ 	\I>Dzz||r!!O(qEz//u/EEH**955 \D8N^_______!$'' \"?OPPPPPPP+D11 \#%%5      
 #4(( \:335193MMK[\\\\\\\xJZ[[[[[[[/	\ 	\r2   graphfrmr   Iterator[Table]c              #  F  K   t          |j        j                  x}sdS t          d |D                       }t	          j        |          }|j        sdS | j                            | j        j	        r|j
        nd          }t          |j        |t                    V  dS )zzGenerate zero-or-one Table element for the table in `shape`.

        An empty table does not produce an element.
        Nc                0    g | ]}d  |j         D             S )c                    g | ]	}|j         
S r8   )rz   ).0cells     r1   
<listcomp>zC_PptxPartitioner._iter_table_element.<locals>.<listcomp>.<listcomp>   s    ...Ddi...r2   )cells)r   rows     r1   r   z8_PptxPartitioner._iter_table_element.<locals>.<listcomp>   s)    ???3..CI...???r2   ry   )rU   tablerowsr   r   from_html_textrz   r\   table_metadatarG   htmlr   r   )r]   r   r   	html_text
html_tabler{   s         r1   ri   z$_PptxPartitioner._iter_table_element   s      
 X^0111 	F0??$???
 
	 -i88
 	F:,,#z?IJOOT
 
 8N^________r2   c              #    K   |                      |          rdS d}|j        j        D ]}|j        }|                                dk    r"|                     |          r<|j        pd}t          || j        	                    |          t                    V  st          |          rt          |t                    V  t          || j        	                    |          t                    V  |dz  }dS )zGenerate Title element for each paragraph in title `shape`.

        Text is most likely a title, but in the rare case that the title shape was used
        for the slide body text, also check for bulleted paragraphs.Nr   r   r   ry   r   rC   )r   r   r   rz   r   rv   r   r   r\   r   r   r"   r   r   )r]   rq   depthrr   rz   bullet_depths         r1   rk   z*_PptxPartitioner._iter_title_shape_element  sC     
 ##E** 	F)4 	 	I>Dzz||r!!**955 (3!!Z55\5RR%5      
 "$'' 	"?OPPPPPPP !Z55U5KK%5     
 
+	 	r2   (tuple[Shape | None, Sequence[BaseShape]]c                n    dfddd	}|j         j        t           |j                   |
          fS )zOrders the shapes on `slide` from top to bottom and left to right.

        Returns the title shape if it exists and the ordered shapes.rp   r   r+   Iterator[BaseShape]c              3  v   K   | D ]2}t          |t                    r |j                  E d {V  .|V  3d S r[   )rh   r   rp   )rp   rq   iter_shapess     r1   r   z3_PptxPartitioner._order_shapes.<locals>.iter_shapes1  sd          eZ00  *{5<8888888888KKKK	   r2   rq   r   tuple[int, int]c                &    | j         pd| j        pdfS Nr   )topleft)rq   s    r1   sort_keyz0_PptxPartitioner._order_shapes.<locals>.sort_key8  s    9>5:?22r2   )key)rp   r   r+   r   )rq   r   r+   r   )rp   titlesorted)r]   rn   r   r   s      @r1   rf   z_PptxPartitioner._order_shapes,  s`    
	  	  	  	  	  	 	3 	3 	3 	3 |!6++el*C*C#R#R#RRRr2   r
   c                >    t          j        | j        j                  S )zKThe python-pptx `Presentation` object loaded from the provided source file.)r(   r
   r\   	pptx_filer]   s    r1   rb   z_PptxPartitioner._presentation=  s      !5666r2   c                d    t          |j        o|j        o|j        dk     p
|j        dk               S r   )rM   r   r   )r]   rq   s     r1   r   z$_PptxPartitioner._shape_is_off_slideB  s2     UY-5:TEIM4SUZRS^UUUr2   N)r5   r/   )r5   r/   r+   r6   )r+   r6   )rr   r   r+   rM   )rn   r   r+   rw   )r4   r   r+   r6   )rq   r   r+   r6   )r   r   r+   r   )rn   r   r+   r   )r+   r
   )rq   r   r+   rM   )r>   r?   r@   rA   r^   rB   rW   r`   rv   re   rm   rl   ri   rk   rf   r'   rb   r   r8   r2   r1   rV   rV      s>       EE    7 7 7 [7B B B BH< < < <
 
 
 
4L L L L\ \ \ \<` ` ` `,   @S S S S" 7 7 7 \7V V V V V Vr2   rV   c                  0   e Zd ZdZdZ	 ddd)dZed*d            Zed+d            Z	ed+d            Z
d,dZed+d            Zed-d            Zed-d            Zed.d            Zed/d            Zed0d!            Zed1d"            Zd2d$Zd3d4d(ZdS )5r/   zVEncapsulates partitioning option validation, computation, and application of defaults.NrC   )rH   rD   rL   rT   rK   rE   rM   rF   rN   rG   rI   rP   rH   rO   c               n    || _         || _        || _        || _        || _        || _        |dz
  | _        d S )NrC   )_file
_file_path_include_page_breaks_include_slide_notes_infer_table_structure	_strategy_page_counter)r]   rD   rT   rE   rF   rG   rI   rH   s           r1   r^   zPptxPartitionerOptions.__init__U  sD     
#$7!$7!&;#!1A5r2   r)   r*   c                    || _         dS r.   )_PicturePartitionerCls)r:   r)   s     r1   r0   z3PptxPartitionerOptions.register_picture_partitioneri  s     &9"""r2   r+   c                    | j         S )aG  When True, include `PageBreak` elements in element-stream.

        Note that regardless of this setting, page-breaks are detected, and page-number is tracked
        and included in element metadata. Only the presence of distinct `PageBreak` elements (which
        contain no text) in the element stream is affected.
        )r   r   s    r1   rE   z*PptxPartitionerOptions.include_page_breaksn  s     ((r2   c                "    | j         dn| j         S )zNWhen True, also partition any text found in slide notes as part of each slide.NF)r   r   s    r1   rF   z*PptxPartitionerOptions.include_slide_notesx  s     19uut?XXr2   Iterator[PageBreak]c           	   #     K   | xj         dz  c_         | j         dk     rdS | j        r7t          dt          t	          | j        | j        dz
                      V  dS dS )zGIncrement page-number by 1 and generate a PageBreak element if enabled.rC      Nr   )last_modifiedpage_number)r|   r{   )r   r   r   r   r   r   r   r   s    r1   rd   z,PptxPartitionerOptions.increment_page_number}  s      a!!F$ 	!1("&"4$BRUVBV         	 	r2   c                    | j         S )zRTrue when partitioner should compute and apply `text_as_html` metadata for tables.)r   r   s    r1   rG   z,PptxPartitionerOptions.infer_table_structure  s     **r2   c                h    | j         sdS t          | j                   rdnt          | j                   S )zHThe best last-modified date available, None if no sources are available.N)r   r&   r!   r   s    r1   r   z$PptxPartitionerOptions.last_modified  s=      	4 &do66cDD<RSWSb<c<c	
r2   c                    | j         S )zHThe best available file-path for this document or `None` if unavailable.)r   r   s    r1   metadata_file_pathz)PptxPartitionerOptions.metadata_file_path  s     r2   c                    | j         S )z The current page (slide) number.)r   r   s    r1   r   z"PptxPartitionerOptions.page_number  s     !!r2   c                ,    | j         t          n| j         S )z3The sub-partitioner to use for PPTX Picture shapes.)r   _NullPicturePartitionerr   s    r1   r)   z*PptxPartitionerOptions.picture_partitioner  s      *2 $#,	
r2   str | IO[bytes]c                   | j         r| j         S t          | j        t                    rE| j                            d           t          j        | j                                                  S | j        r| j        S t          d          )zThe PowerPoint document file to be partitioned.

        This is either a str path or a file-like object. `python-pptx` accepts either for opening a
        presentation file.
        r   zQNo PPTX document specified, either `filename` or `file` argument must be provided)	r   rh   r   r   seekioBytesIOread
ValueErrorr   s    r1   r   z PptxPartitionerOptions.pptx_file  s     ? 	#?"
 dj"677 	1JOOA:djoo//000: 	:_
 
 	
r2   c                    | j         S )a  The requested partitioning strategy.

        This indicates whether the partitioner should undertake expensive operations like inference
        and OCR to produce a more thorough and/or accurate partitioning of the document.

        Can take several values but for PPTX purposes there is only "hi_res" and not "hi_res".
        Depending on the picture-partitioner used, images may only be OCR'ed and added to the
        element-stream when this partitioning strategy is "hi_res".
        )r   r   s    r1   rI   zPptxPartitionerOptions.strategy  s     ~r2   text_as_htmlc                b    t          | j        | j        | j        |          }t          |_        |S )z=ElementMetadata instance suitable for use with Table element.)rJ   r   r   r   r   r   r   r   r   r|   )r]   r   element_metadatas      r1   r   z%PptxPartitionerOptions.table_metadata  s=    *,,(%	
 
 
 -=)r2   r   r   r   c                b    t          | j        | j        | j        |          }t          |_        |S )zAElementMetadata instance suitable for use with Text and subtypes.)rJ   r   r   r   r   )r]   r   r   s      r1   r   z$PptxPartitionerOptions.text_metadata  s=    *,,()	
 
 
 -=)r2   )rD   rL   rT   rK   rE   rM   rF   rN   rG   rM   rI   rP   rH   rO   )r)   r*   )r+   rM   )r+   r   )r+   rK   )r+   rO   )r+   r*   )r+   r   )r+   rP   )r   rK   )r   )r   rO   r+   r   )r>   r?   r@   rA   r   r^   rB   r0   r'   rE   rF   rd   rG   r   r   propertyr   r)   r   rI   r   r   r8   r2   r1   r/   r/   H  s       ``!$ %&6 6 6 6 6 6( 9 9 9 [9 ) ) ) \) Y Y Y \Y     + + + \+ 
 
 
 \
    \ " " " X" 
 
 
 \
  
 
 
 \
. 
 
 
 \
	  	  	  	 	  	  	  	  	  	  	 r2   r/   c                  *    e Zd ZdZed
d            Zd	S )r   zHDoes not parse the provided Picture element and generates zero elements.r4   r   r5   r/   r+   r6   c              #  
   K   dS )zNo-op picture partitioner.Nr8   r9   s      r1   r;   z%_NullPicturePartitioner.iter_elements  s       	r2   Nr<   r=   r8   r2   r1   r   r     s8        RR   [  r2   r   )r)   r*   r+   r,   r[   )rJ   rK   rD   rL   rE   rM   rF   rN   rG   rM   rH   rO   rI   rP   rQ   r   r+   rR   )ErA   
__future__r   r   tempfiler   typingr   r   r   r   r	   r(   pptx.presentationr
   pptx.shapes.autoshaper   pptx.shapes.baser   pptx.shapes.graphfrmr   pptx.shapes.groupr   pptx.shapes.picturer   pptx.shapes.shapetreer   
pptx.slider   pptx.text.textr   unstructured.chunkingr   unstructured.common.html_tabler   r   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   unstructured.file_utils.modelr   &unstructured.partition.common.metadatar    r!    unstructured.partition.text_typer"   r#   r$   &unstructured.partition.utils.constantsr%   unstructured.utilsr&   r'   r   r0   r*   PPTXFASTrX   rV   r/   r   r8   r2   r1   <module>r      s    # " " " " " 				 ) ) ) ) ) ) 8 8 8 8 8 8 8 8 8 8 8 8 8 8  * * * * * * ' ' ' ' ' ' & & & & & & - - - - - - ( ( ( ( ( ( ' ' ' ' ' ' 2 2 2 2 2 2       % % % % % % 7 7 7 7 7 7 R R R R R R R R
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 3 2 2 2 2 2 Y Y Y Y Y Y Y Y         
 E D D D D D > > > > > > > > M M M M       ( ,C " $'+"& !%*,C ,C ,C ,C ,C  ,C^DV DV DV DV DV DV DV DVNh  h  h  h  h  h  h  h `         r2   