
    NgC              	         U d Z ddlmZ ddlZddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlZddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- dZ.de/d<   dZ0de/d<    e%e#j1                  e	 d8ddddddd9d'                        Z2 G d( d)          Z3 G d* d+          Z4 G d, d-          Z5 G d. d/          Z6d:d2Z7d;d7Z8dS )<z0Partitioner for Excel 2007+ (XLSX) spreadsheets.    )annotationsN)SpooledTemporaryFile)IOAnyIteratorOptional)Self	TypeAlias)add_chunking_strategy)clean_bullets)	HtmlTable)ElementElementMetadataListItemNarrativeTextTableTextTitle)FileType)apply_metadataget_last_modified_date)is_bulleted_textis_possible_narrative_textis_possible_numbered_listis_possible_title)lazypropertyztuple[int, int]r
   _CellCoordinatexlsxstrDETECTION_ORIGINTF   )filefind_subtableinclude_headerinfer_table_structurestarting_page_numberfilenameOptional[str]r"   Optional[IO[bytes]]r#   boolr$   r%   r&   intkwargsr   returnlist[Element]c          	        t          | ||||          }g }t          |j                                        |          D ]\  }	\  }
}|j        st          j        |                    d|j        d                    }t          |r|j
        nd|
|	|j        |j                  }t          |_        |                    t!          |j        |                     t$                              |          D ]e}t)          |j                  }|                                D ]I}t/          t1          |                    }t3          |
|	|          |_        |                    |           J|j        }|t          j        |                    d|j        d                    }t!          |j        	          }t3          |
|	|          |_        |j        r|j
        nd|j        _        |                    |           |                                D ]I}t/          t1          |                    }t3          |
|	|          |_        |                    |           Jg|S )
a#  Partitions Microsoft Excel Documents in .xlsx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    find_subtable
        Detect "subtables" on each worksheet and partition each of those as a separate `Table`
        element. When `False`, each worksheet is partitioned as a single `Table` element. A
        subtable is a contiguous block of cells with more than two cells in each row.
    infer_table_structure
        If True, any Table elements that are extracted will also have a metadata field
        named "text_as_html" where the table's text content is rendered into an html string.
        I.e., rows and cells are preserved.
        Whether True or False, the "text" field is always present in any Table element
        and is the text content of the table (no structure).
    include_header
        Determines whether or not header info is included in text and medatada.text_as_html
    )	file_pathr"   r#   r$   r%   )startF )indexheaderna_repN)text_as_html	page_namepage_numberr'   last_modified)textmetadatar:   )_XlsxPartitionerOptions	enumeratesheetsitemsr#   r   from_html_textto_htmlr$   r   htmlmetadata_file_pathr9   r    detection_originappendr   r:   _ConnectedComponentsfrom_worksheet_df_SubtableParsersubtable#iter_leading_single_cell_rows_texts_create_elementr   _get_metadatar;   
core_tabler%   r6   $iter_trailing_single_cell_rows_texts)r'   r"   r#   r$   r%   r&   r,   optselementsr8   
sheet_namesheet
html_tabler;   	componentsubtable_parsercontentelementrN   s                      W/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/xlsx.pypartition_xlsxrZ   )   s   B ##%3  D !H,5#7- - - 0- 0-((j% ! -	-"1E$2EbQQ J '0EOZ__4$'0"0  H )9H%OOEzJJJKKKK 2CCEJJ - -	"1)2D"E"E  /RRTT - -G-c'll;;G'4Zd'S'SG$OOG,,,, -7
)!*!9"**t?R[]*^^" "J $999G'4Zd'S'SG$+/+EO
4 $1 OOG,,,
  /SSUU - -G-c'll;;G'4Zd'S'SG$OOG,,,,-3-< O    c                      e Zd ZdZdd
Zedd            Zedd            Zedd            Zedd            Z	edd            Z
edd            Zedd            ZdS )r=   zVEncapsulates partitioning option validation, computation, and application of defaults.r0   r(   r"   r)   r#   r*   r$   r%   c               L    || _         || _        || _        || _        || _        d S N)
_file_path_file_find_subtable_include_header_infer_table_structure)selfr0   r"   r#   r$   r%   s         rY   __init__z _XlsxPartitionerOptions.__init__   s0     $
+-&;###r[   r-   c                    | j         S )zTrue when partitioner should detect and emit separate `Table` elements for subtables.

        A subtable is (roughly) a contiguous rectangle of populated cells bounded by empty rows.
        )ra   rd   s    rY   r#   z%_XlsxPartitionerOptions.find_subtable   s     ""r[   
int | Nonec                    | j         rdndS )zNThe index of the row Pandas should treat as column-headings. Either 0 or None.r   Nrb   rg   s    rY   header_row_idxz&_XlsxPartitionerOptions.header_row_idx   s     (2qqd2r[   c                    | j         S )z6True when column headers should be included in tables.rj   rg   s    rY   r$   z&_XlsxPartitionerOptions.include_header   s     ##r[   c                    | j         S )zGTrue when partitioner should compute and apply `text_as_html` metadata.)rc   rg   s    rY   r%   z-_XlsxPartitionerOptions.infer_table_structure   s     **r[   c                <    | j         rt          | j                   ndS )zHThe best last-modified date available, None if no sources are available.N)r_   r   rg   s    rY   r9   z%_XlsxPartitionerOptions.last_modified   s!     ;?/S%do666tSr[   
str | Nonec                    | j         S )zHThe best available file-path for this document or `None` if unavailable.)r_   rg   s    rY   rD   z*_XlsxPartitionerOptions.metadata_file_path   s     r[   dict[str, pd.DataFrame]c                T   | j         x}rt          j        |d| j                  S | j        x}rlt          |t                    r;|                    d           t          j	        |
                                          }t          j        |d| j                  S t          d          )zFThe spreadsheet worksheets, each as a data-frame mapped by sheet-name.N)rR   r4   r   z7Either 'filename' or 'file' argument must be specified.)r_   pd
read_excelrk   r`   
isinstancer   seekioBytesIOread
ValueError)rd   r0   fs      rY   r?   z_XlsxPartitionerOptions.sheets   s     '9 	Y=tDDWXXXX
?1 	Q!122 )q			Jqvvxx((=tD<OPPPPRSSSr[   N)
r0   r(   r"   r)   r#   r*   r$   r*   r%   r*   )r-   r*   )r-   rh   )r-   r(   )r-   ro   )r-   rq   )__name__
__module____qualname____doc__re   r   r#   rk   r$   r%   r9   rD   r?    r[   rY   r=   r=      s       ``< < < < # # # \# 3 3 3 \3 $ $ $ \$ + + + \+ T T T \T    \ T T T \T T Tr[   r=   c                      e Zd ZdZddZedd	            ZddZedd            Zedd            Z	edd            Z
dS )_ConnectedComponentzA collection of cells that are "2d-connected" in a worksheet.

    2d-connected means there is a path from each cell to every other cell by traversing up, down,
    left, or right (not diagonally).
    	worksheetpd.DataFramecell_coordinate_setset[_CellCoordinate]c                "    || _         || _        d S r^   )
_worksheet_cell_coordinate_set)rd   r   r   s      rY   re   z_ConnectedComponent.__init__   s    #$7!!!r[   r-   r+   c                    | j         d         S )z7The right-most column index of the connected component.   _extentsrg   s    rY   max_xz_ConnectedComponent.max_x        }Qr[   otherc                f    t          | j        | j                            |j                            S )zProduce new instance with union of cells in `self` and `other`.

        Used to combine regions of workshet that are "overlapping" row-wise but not actually
        2D-connected.
        )r   r   r   union)rd   r   s     rY   mergez_ConnectedComponent.merge   s1     #OT6<<U=WXX
 
 	
r[   c                    | j         d         S )z6The left-most column index of the connected component.r   r   rg   s    rY   min_xz_ConnectedComponent.min_x   r   r[   c                V    | j         \  }}}}| j        j        ||dz   ||dz   f         S )zThe connected region of the worksheet as a `DataFrame`.

        The subtable is the rectangular region of the worksheet inside the connected-component
        bounding-box. Row-indices and column labels are preserved, not restarted at 0.
        r!   )r   r   iloc)rd   r   min_yr   max_ys        rY   rJ   z_ConnectedComponent.subtable   s;     &*]"ueU#EEAI$5uuqy7H$HIIr[   tuple[int, int, int, int]c                N   t          d          t          d          t          d          t          d          f\  }}}}| j        D ]%\  }}||k     r|}||k    r|}||k     r|}||k    r|}&t          |          t          |          t          |          t          |          fS )z1Compute bounding box of this connected component.infz-inf)floatr   r+   )rd   r   r   r   r   xys          rY   r   z_ConnectedComponent._extents   s     &+5\\5<<vPUV\P]P]%]"ueU- 	 	DAq5yy5yy5yy5yy5zz3u::s5zz3u::==r[   N)r   r   r   r   )r-   r+   )r   r   r-   r   )r-   r   )r-   r   )r|   r}   r~   r   re   r   r   r   r   rJ   r   r   r[   rY   r   r      s         8 8 8 8       \ 
 
 
 
       \  J J J \J > > > \> > >r[   r   c                  Z    e Zd ZdZddZddZedd	            Zedd            Z	ddZ
dS )rG   zThe collection of connected-components for a single worksheet.

    "Connected-components" refers to the graph algorithm we use to detect contiguous groups of
    non-empty cells in an excel sheet.
    worksheet_dfr   c                    || _         d S r^   )_worksheet_df)rd   r   s     rY   re   z_ConnectedComponents.__init__
  s    )r[   r-   Iterator[_ConnectedComponent]c                *    t          | j                  S r^   )iter_connected_componentsrg   s    rY   __iter__z_ConnectedComponents.__iter__  s    D.///r[   r	   c                     | |          S )zKConstruct from a worksheet dataframe produced by reading Excel with pandas.r   )clsr   s     rY   rH   z&_ConnectedComponents.from_worksheet_df  s     s<   r[   list[_ConnectedComponent]c                     j         j        \  }}t          j        ||f          j        } j                                         j        }d ||         D             }t          j        ||          }|                    |           t          j	        |          }t                                fd|D                                 S )z=The `_ConnectedComponent` objects comprising this collection.c                ,    g | ]}t          |          S r   tuple).0pairs     rY   
<listcomp>z>_ConnectedComponents._connected_components.<locals>.<listcomp>  s    KKK45;;KKKr[   c                :    g | ]}t          j        |          S r   )r   r   )r   component_node_setrd   s     rY   r   z>_ConnectedComponents._connected_components.<locals>.<listcomp>*  s7       * ((:<NOO  r[   )r   shapenpindicesTisnanxgrid_2d_graphremove_nodes_fromconnected_componentslist_merge_overlapping_tables)rd   max_rowmax_col
node_arrayempty_cellsnodes_to_removegraphconnected_node_setss   `       rY   r   z*_ConnectedComponents._connected_components  s    
  -3Z' 2335
(--//1KK:k3JKKK*7G<<000 !5
 
 **   .A   
 
 	
r[   r   c              #     K   t          |d           }d}|D ]3}||}|j        |j        k    r|                    |          }-|V  |}4||V  dS dS )a  Merge connected-components that overlap row-wise.

        A pair of overlapping components might look like one of these:

            x x x        x x
                x        x x   x x
            x   x   OR         x x
            x
            x x x
        c                    | j         S r^   )r   )r   s    rY   <lambda>z@_ConnectedComponents._merge_overlapping_tables.<locals>.<lambda>?  s    qw r[   )keyN)sortedr   r   r   )rd   r   sorted_componentscurrent_componentrU   s        rY   r   z._ConnectedComponents._merge_overlapping_tables1  s       ##7=N=NOOO * 	. 	.I ($-! "3"999$5$;$;I$F$F!! ('''$-!! (###### )(r[   N)r   r   )r-   r   )r   r   r-   r	   )r-   r   )r   r   r-   r   )r|   r}   r~   r   re   r   classmethodrH   r   r   r   r   r[   rY   rG   rG     s         * * * *0 0 0 0 ! ! ! [! 
 
 
 \
6$$ $$ $$ $$ $$ $$r[   rG   c                      e Zd ZdZddZedd            Zdd	Zdd
Zedd            Z	edd            Z
edd            ZdS )rI   a  Distinguishes core-table from leading and trailing title rows in a subtable.

    A *subtable* is a contiguous block of populated cells in the spreadsheet. Leading or trailing
    rows of that block containing only one populated cell are called "single-cell rows" and are
    not considered part of the core table. These are each emitted separately as a `Text`-subtype
    element.
    rJ   r   c                    || _         d S r^   )	_subtable)rd   rJ   s     rY   re   z_SubtableParser.__init__a  s    !r[   r-   pd.DataFrame | Nonec                    t          | j                  }|t          | j                  k    rdS t          | j                  t          | j                  z
  }| j        ||         S )zCThe part between the leading and trailing single-cell rows, if any.N)len _leading_single_cell_row_indicesr   !_trailing_single_cell_row_indices)rd   core_table_startcore_table_stops      rY   rN   z_SubtableParser.core_tabled  sg     tDEE s4>22224
 dn--D4Z0[0[[ ~.>??r[   Iterator[str]c              #     K   | j         D ]3}| j        j        |                                         j        d         V  4dS )z8Generate the cell-text for each leading single-cell row.r   N)r   r   r   dropnard   row_idxs     rY   rK   z3_SubtableParser.iter_leading_single_cell_rows_textsv  sS      < 	@ 	@G.%g.5577<Q?????	@ 	@r[   c              #     K   | j         D ]3}| j        j        |                                         j        d         V  4dS )z9Generate the cell-text for each trailing single-cell row.r   N)r   r   r   r   r   s     rY   rO   z4_SubtableParser.iter_trailing_single_cell_rows_texts{  sS      = 	@ 	@G.%g.5577<Q?????	@ 	@r[   tuple[int, ...]c                >     d fd}t           |                      S )zEIndex of each leading single-cell row in subtable, in top-down order.r-   Iterator[int]c               3  Z   K   t          j                  D ]\  } }|| k    r d S | V  d S r^   )r>   _single_cell_row_indicesnext_row_idxidxrd   s     rY   $iter_leading_single_cell_row_indicesz^_SubtableParser._leading_single_cell_row_indices.<locals>.iter_leading_single_cell_row_indices  sQ      %.t/L%M%M # #!c,&&FF"""""# #r[   r-   r   r   )rd   r   s   ` rY   r   z0_SubtableParser._leading_single_cell_row_indices  s;    	# 	# 	# 	# 	# 	# 99;;<<<r[   c                >     d fd}t           |                      S )z=Index of each single-cell row in subtable, in top-down order.r-   r   c               3     K   t          j                                                  D ]%\  } \  }}|                                dk    r!| V  &d S )Nr!   )r>   r   iterrowscount)r   _rowrd   s      rY   iter_single_cell_row_idxszK_SubtableParser._single_cell_row_indices.<locals>.iter_single_cell_row_idxs  s_      !*4>+B+B+D+D!E!E  Xa99;;!##				 r[   r   r   )rd   r   s   ` rY   r   z(_SubtableParser._single_cell_row_indices  s;    	 	 	 	 	 	 ..00111r[   c                     t           j                  t           j                  k    rdS d fd}t          t	          t           |                                          S )zFIndex of each trailing single-cell row in subtable, in top-down order.r   r-   r   c               3     K   t          j                  dz
  } j        ddd         D ]}|| k    r dS | V  | dz  } dS )z... moving from end upward ...r!   N)r   r   r   r   s     rY   %iter_trailing_single_cell_row_indicesz`_SubtableParser._trailing_single_cell_row_indices.<locals>.iter_trailing_single_cell_row_indices  sl      t~..2L4TTrT: " ",&&FF""""!	" "r[   r   )r   r   r   r   reversedr   )rd   r   s   ` rY   r   z1_SubtableParser._trailing_single_cell_row_indices  sr     t455T^9L9LLL2	" 	" 	" 	" 	" 	" Xd#H#H#J#JKKLLMMMr[   N)rJ   r   )r-   r   )r-   r   )r-   r   )r|   r}   r~   r   re   r   rN   rK   rO   r   r   r   r   r[   rY   rI   rI   X  s         " " " " @ @ @ \@"@ @ @ @
@ @ @ @
 	= 	= 	= \	= 	2 	2 	2 \	2 N N N \N N Nr[   rI   r:   r   c                4   t          |           rt          t          |                     S t          |           rt          |           S t	          |           rt          |           S t          |           rt          |           S t          |           S )z=Create `Text`-subtype document element appropriate to `text`.r<   )	r   r   r   r   r   r   r   r   r   r<   s    rY   rL   rL     s     	]4001111	"4	(	( T""""	#D	)	) $''''	4	 	  $r[   rR   r8   rP   r   c                <    t          | ||j        |j                  S )N)r7   r8   r'   r9   )r   rD   r9   )rR   r8   rP   s      rY   rM   rM     s-     ((	   r[   r^   )r'   r(   r"   r)   r#   r*   r$   r*   r%   r*   r&   r+   r,   r   r-   r.   )r:   r   r-   r   )rR   r   r8   r+   rP   r=   r-   r   )9r   
__future__r   rw   tempfiler   typingr   r   r   r   networkxr   numpyr   pandasrs   typing_extensionsr	   r
   unstructured.chunkingr   unstructured.cleaners.corer   unstructured.common.html_tabler   unstructured.documents.elementsr   r   r   r   r   r   r   unstructured.file_utils.modelr   &unstructured.partition.common.metadatar   r    unstructured.partition.text_typer   r   r   r   unstructured.utilsr   r   __annotations__r    XLSXrZ   r=   r   rG   rI   rL   rM   r   r[   rY   <module>r     s#   6 6 6 " " " " " " 				 ) ) ) ) ) ) . . . . . . . . . . . .             - - - - - - - - 7 7 7 7 7 7 4 4 4 4 4 4 4 4 4 4 4 4                  3 2 2 2 2 2 Y Y Y Y Y Y Y Y            , + + + + +. . . . .      "Z !% "& !Z Z Z Z Z  Zz?T ?T ?T ?T ?T ?T ?T ?TD6> 6> 6> 6> 6> 6> 6> 6>rR$ R$ R$ R$ R$ R$ R$ R$jPN PN PN PN PN PN PN PNf        r[   