
    Ng,                        d Z ddlmZ ddlZddlZddlZddlZddlZddl	m
Z
mZmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ  ed
          ZddZg dg ddZefd dZ	 d!d"dZd#dZd#dZdS )$z>Helpers used across multiple partitioners to compute metadata.    )annotationsN)AnyCallableIteratorSequence)	ParamSpec)ElementElementMetadata)FileType)apply_lang_metadata)get_call_args_applying_defaults_Pfilenamestrreturn
str | Nonec                    t           j                            |           sdS t          j                            t           j                            |                     }|                    d          S )a
  Modification time of file at path `filename`, if it exists.

    Returns `None` when `filename` is not a path to a file on the local filesystem.

    Otherwise returns date and time in ISO 8601 string format (YYYY-MM-DDTHH:MM:SS) like
    "2024-03-05T17:02:53".
    Nz%Y-%m-%dT%H:%M:%S%z)ospathisfiledtdatetimefromtimestampgetmtimestrftime)r   modify_dates     b/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/common/metadata.pyget_last_modified_dater      sY     7>>(## t+++BG,<,<X,F,FGGK 5666    )	TextUncategorizedTextNarrativeTextListItemBulletedTextTableFigureCaptionCheckBoxr%   )
Titler    r!   r"   r#   r$   r%   r&   r'   r%   )r(   HeaderelementsSequence[Element]rulesetdict[str, list[str]]list[Element]c                   g }| D ]}|j         j        d}t          |dd          }t          |j         dd          pd}|s=|rx|d         }t          |d          }t          |j         dd          pd}	||k    r|	|k     s||k    r ||                    |g           v r|j        }n|                                 |x||j         _        |                    |           t          |           S )aK  Sets `.metadata.parent_id` for each element it applies to.

    `parent_id` assignment is based on the element's category and depth. The importance of an
    element's category is determined by a rule set. The rule set trumps category_depth. That is,
    category_depth is only relevant when elements are of the same category.
    Ncategorycategory_depthr   )metadata	parent_idgetattrgetidpopappendlist)
r*   r,   stackelementr4   element_categoryelement_category_depthtop_elementtop_element_categorytop_element_category_depths
             r   set_element_hierarchyrB   @   s?    E $ $%1	"7J==!()9;KQ!O!O!TST   	 	#(9K#*;
#C#C ($  
  ' %(888.1GGG$(888$4H"(M(MMM'N	IIKKK-  	0 &/"W>>r   	file_typeFileType | NoneDCallable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]c                     d fd}|S )a  Post-process element-metadata for this document.

    This decorator adds a post-processing step to a partitioner, primarily to apply metadata that
    is common to all partitioners. It assumes the following responsibilities:

      - Hash element-ids. Computes and applies SHA1 hash element.id when `unique_element_ids`
        argument is False.

      - Element Hierarchy. Computes and applies `parent_id` metadata based on `category_depth`
        etc. added by partitioner.

      - Language metadata. Computes and applies `language` metadata based on a language detection
        model.

      - Apply `filetype` (MIME-type) metadata. There are three cases; first one in this order that
        applies is used:

          - `metadata_file_type` argument is present in call, use that.
          - `file_type` decorator argument is populated, use that.
          - `file_type` decorator argument is omitted or None, don't apply `.metadata.filetype`
            (assume the partitioner will do that for itself, like `partition_image()`.

      - Replace `filename` with `metadata_filename` when present.

      - Replace `last_modified` with `metadata_last_modified` when present.

      - Apply `url` metadata when present.
    funcCallable[_P, list[Element]]r   c                L     t          j                   d fd            }|S )	a  The decorator function itself.

        This function is returned by the `apply_metadata()` function and is the actual decorator.
        Think of `apply_metadata()` as a factory function that configures this decorator, in
        particular by setting its `file_type` value.
        args_P.argskwargs	_P.kwargsr   r.   c                     | i |}t          g| R i |}t          |          }|                    d          }|                    dd          }t          t	          |||                    }i }|                    d          p}|
|j        |d<   |                    d          p|                    d          }|r||d<   |                    d	          }	|	r|	|d
<   |                    d          }
|
r|
|d<   |D ]3}|j        j        r|j                            t          di |           4|                    dd          }|du rt          |          }t          |          }|S )N	languagesdetect_language_per_elementF)r*   rO   rP   metadata_file_typefiletypemetadata_filenamer   metadata_last_modifiedlast_modifiedurlunique_element_ids )r    _uniqueify_elements_and_metadatar6   r:   r   	mime_typer3   attached_to_filenameupdater
   _assign_hash_idsrB   )rJ   rL   r*   	call_argsrO   rP   metadata_kwargsrQ   r   rT   rV   r<   rW   rC   rG   s                r   wrapperz2apply_metadata.<locals>.decorator.<locals>.wrapper   s   tT,V,,H7NtNNNvNNI 8AAH "k22I*3--8UW\*]*]'#%'0K   H /1O "+/C!D!D!Q	!-.@.J
+ !}}%899VY]]:=V=VH 7.6
+ &/]]3K%L%L"% J3I0 --&&C -),& $ L L #8  ''(J(J/(J(JKKKK (1}}5I5'Q'Q!U**+H55 -X66HOr   )rJ   rK   rL   rM   r   r.   )	functoolswraps)rG   r`   rC   s   ` r   	decoratorz!apply_metadata.<locals>.decorator   sM     
		O	 O	 O	 O	 O	 O	 
	O	b r   )rG   rH   r   rH   rX   )rC   rc   s   ` r   apply_metadatard   x   s1    @Z Z Z Z Z Zx r   c                    d | D             }d t          j        |          D             }t          | |          D ]\  }}|                    |           | S )a.  Converts `.id` of each element from UUID to hash.

    The hash is based on the `.text` of the element, but also on its page-number and sequence number
    on that page. This provides for deterministic results even when the document is split into one
    or more fragments for parallel processing.
    c                &    g | ]}|j         j        S rX   )r3   page_number).0es     r   
<listcomp>z$_assign_hash_ids.<locals>.<listcomp>   s    ===qAJ*===r   c                B    g | ]\  }}t          |          D ]\  }}|S rX   )	enumerate)rh   _groupseq_on_pages       r   rj   z$_assign_hash_ids.<locals>.<listcomp>   sO       Au'..  K 	   r   )	itertoolsgroupbyzip
id_to_hash)r*   page_numberspage_seq_numbersr<   seq_on_page_counters        r   r]   r]      s     >=H===L !),77   ),H6F(G(G 0 0$$.////Or   c                :    dd}t           ||                     S )zEnsure each of `elements` and their metadata are unique instances.

    This prevents hard-to-diagnose bugs downstream when mutating one element unexpectedly also
    mutates others because they are the same instance.
    r*   r.   r   Iterator[Element]c              3    K   t                      }t                      }| D ]}t          |          |v rt          j        |          }t          |j                  |v rt          j        |j                  |_        |                    t          |                     |                    t          |j                             |V  dS )zLSubstitute deep-copies of any non-unique elements or metadata in `elements`.N)setr7   copydeepcopyr3   add)r*   seen_elementsseen_metadatar<   s       r   iter_unique_elementsz>_uniqueify_elements_and_metadata.<locals>.iter_unique_elements  s      "%%%"%%% 	 	G'{{m++-00'"##}44#'=1A#B#B bkk***b!122333MMMM	 	r   )r*   r.   r   rx   )r:   )r*   r   s     r   rY   rY     s2        $$X..///r   )r   r   r   r   )r*   r+   r,   r-   r   r.   )N)rC   rD   r   rE   )r*   r.   r   r.   ) __doc__
__future__r   r{   r   r   ra   rp   r   typingr   r   r   r   typing_extensionsr   unstructured.documents.elementsr	   r
   unstructured.file_utils.modelr   "unstructured.partition.common.langr   unstructured.utilsr   r   r   HIERARCHY_RULE_SETrB   rd   r]   rY   rX   r   r   <module>r      s   D D " " " " " "              				 4 4 4 4 4 4 4 4 4 4 4 4 ' ' ' ' ' ' D D D D D D D D 2 2 2 2 2 2 B B B B B B > > > > > >Yt__7 7 7 7 
 
 
    8 BT0 0 0 0 0r "&| | | | |~   *0 0 0 0 0 0r   