
    Ng                       U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZmZmZ d dlmZmZmZ d dlm Z  d d	l!m"Z"m#Z# d
Z$de%d<   dZ&de%d<   ej'         G d d                      Z(ej'         G d d                      Z) G d de          Z* G d de          Z+ G d de          Z, G d d          Z- G d dej.                  Z/ ed          Z0dVd!Z1dWd#Z2 G d$ d%          Z3 G d& d'ej4                  Z5 G d( d)e5          Z6 G d* d+e5          Z7 G d, d-e7          Z8 G d. d/e7          Z9 G d0 d1e7          Z: G d2 d3e7          Z; G d4 d5e7          Z< G d6 d7e7          Z= G d8 d9e7          Z> G d: d;e7          Z? G d< d=e7          Z@ G d> d?e7          ZA G d@ dAe7          ZB G dB dCeB          ZC G dD dEe7          ZD G dF dGe7          ZE G dH dIe7          ZF G dJ dKe7          ZG G dL dMe7          ZHi e3jI        e=e3jJ        e=e3jK        e=e3jL        e=e3jM        e=e3jN        e7e3jO        e9e3jP        e;e3jQ        e;e3jR        e;e3jS        e;e3jT        e;e3jU        e;e3jV        e;e3jW        e;e3jX        e<e3jY        e<i e3jZ        e<e3j[        eDe3j\        eDe3j]        eEe3j^        eEe3j_        eEe3j`        e:e3ja        e:e3jb        e@e3jc        e@e3jd        e@e3je        eBe3jf        e>e3jg        e?e3jh        e8e3ji        eAe3jj        eFe3jk        eGe3jl        eHiZmdNe%dO<   dXdSZndYdUZodS )Z    )annotationsN)groupby)MappingProxyType)AnyCallable	FrozenSetOptionalSequencecast)	ParamSpec	TypeAlias	TypedDict)TYPE_TO_COORDINATE_SYSTEM_MAPCoordinateSystemRelativeCoordinateSystem)#UNSTRUCTURED_INCLUDE_DEBUG_METADATA)get_call_args_applying_defaultslazypropertyztuple[float, float]r   Pointztuple[Point, ...]Pointsc                      e Zd ZU dZdZded<   dZded<   dZded<   dZded<   dZ	ded	<   dZ
ded
<   dZded<   d Zedd            ZdS )DataSourceMetadataz@Metadata fields that pertain to the data source of the document.NOptional[str]urlversionzOptional[dict[str, Any]]record_locatordate_createddate_modifieddate_processedzOptional[list[dict[str, Any]]]permissions_datac                H    d | j                                         D             S )Nc                    i | ]
\  }}|||S N ).0keyvalues      [/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/documents/elements.py
<dictcomp>z.DataSourceMetadata.to_dict.<locals>.<dictcomp>+   s#    XXXzsEeFWUFWFWFW    )__dict__itemsselfs    r(   to_dictzDataSourceMetadata.to_dict*   s$    XXT]-@-@-B-BXXXXr*   
input_dictdict[str, Any]c                    d t          j        |           D             fd|                                D             } | di |S )Nc                    g | ]	}|j         
S r$   )name)r%   fs     r(   
<listcomp>z0DataSourceMetadata.from_dict.<locals>.<listcomp>0   s    ;;;qAF;;;r*   c                $    i | ]\  }}|v 	||S r$   r$   )r%   kvsupported_fieldss      r(   r)   z0DataSourceMetadata.from_dict.<locals>.<dictcomp>1   s*    MMMAq<L7L7L17L7L7Lr*   r$   )dcfieldsr,   )clsr0   argsr:   s      @r(   	from_dictzDataSourceMetadata.from_dict-   sX     <;BIcNN;;;MMMM!1!1!3!3MMMs{{T{{r*   r0   r1   )__name__
__module____qualname____doc__r   __annotations__r   r   r   r   r   r    r/   classmethodr?   r$   r*   r(   r   r      s         JJC!G!!!!/3N3333"&L&&&&#'M''''$(N((((7;;;;;Y Y Y    [  r*   r   c                  V    e Zd ZU dZded<   ded<   ddZddZd Zedd            Z	dS )CoordinatesMetadataz?Metadata fields that pertain to the coordinates of the element.Optional[Points]pointsOptional[CoordinateSystem]systemc                P    ||||t          d          || _        || _        d S )NzNCoordinates points should not exist without coordinates system and vice versa.)
ValueErrorrJ   rL   )r.   rJ   rL   s      r(   __init__zCoordinatesMetadata.__init__=   s>    Nv1v7Ifn`   r*   otherr   returnboolc                    t          |t                    sdS t          | j        |j        k    | j        |j        k    g          S NF)
isinstancerH   allrJ   rL   r.   rP   s     r(   __eq__zCoordinatesMetadata.__eq__F   sI    %!455 	5,,
 
 	
r*   c                    | j         | j        d nt          | j        j        j                  | j        d n| j        j        | j        d n| j        j        dS )N)rJ   rL   layout_widthlayout_height)rJ   rL   str	__class__rA   widthheightr-   s    r(   r/   zCoordinatesMetadata.to_dictP   sZ    k"k1dds4;;P;Y7Z7Z$(K$7DDT[=N%)[%8TTdk>P	
 
 	
r*   r0   r1   c                d   dd}|                     d          }| ||          nd }|                     d          }|                     d          }|                     d	          }|d n8|d
k    rt                      n$|!||t          v rt          |         ||          nd } | ||          S )Nsequence_of_sequencesSequence[Sequence[float]]rQ   r   c           	     >   g }| D ]}t          |t                    r6|                    t          t          t          |                               Mt          |t
                    r(|                    t          t          |                     t          |          S r#   )rU   listappendr   r   tuple)ra   rJ   seqs      r(   convert_to_pointsz8CoordinatesMetadata.from_dict.<locals>.convert_to_points[   s    "$F, 4 4c4(( 4MM$ueCjj"9"9::::U++ 4MM$uc"2"2333== r*   rJ   rL   rZ   r[   r   rJ   rL   )ra   rb   rQ   r   )getr   r   )	r=   r0   rh   input_pointsrJ   system_namer^   r_   rL   s	            r(   r?   zCoordinatesMetadata.from_dictX   s    	! 	! 	! 	! "~~h//4@4L""<000RV !nnX..~..00 " D "<<< )***
 )".'+HHH	 2+>ufMMM  	$ s&0000r*   N)rJ   rI   rL   rK   )rP   r   rQ   rR   r@   )
rA   rB   rC   rD   rE   rO   rX   r/   rF   r?   r$   r*   r(   rH   rH   6   s         II&&&&   
 
 
 

 
 
 %1 %1 %1 [%1 %1 %1r*   rH   c                  2    e Zd ZU dZded<   ded<   ded<   dS )	Linkz#Metadata related to extracted linksr   textr\   r   intstart_indexN)rA   rB   rC   rD   rE   r$   r*   r(   rn   rn      s9         --HHHr*   rn   c                  .    e Zd ZU ded<   ded<   ded<   dS )FormKeyOrValuer\   ro   r   layout_element_idzOptional[Text]custom_elementNrA   rB   rC   rE   r$   r*   r(   rs   rs      s3         III$$$$""""""r*   rs   c                  .    e Zd ZU ded<   ded<   ded<   dS )FormKeyValuePairrs   r&   zOptional[FormKeyOrValue]r'   float
confidenceNrv   r$   r*   r(   rx   rx      s6         ####r*   rx   c                      e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded<   ded <   d!ed"<   d#ed$<   ded%<   ded&<   ded'<   ded(<   ded)<   ded*<   ded+<   ded,<   ded-<   ded.<   ded/<   d0ed1<   ded2<    edg          Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dMdNd7ZdOd;ZdPd>Z	dQ fdBZ
edRdE            ZedSdG            ZedSdH            ZdTdIZdUdJZedVdL            Z xZS )WElementMetadataz>Fully-dynamic replacement for dataclass-based ElementMetadata.r   attached_to_filenameOptional[int]category_depthOptional[CoordinatesMetadata]coordinatesOptional[DataSourceMetadata]data_sourceOptional[float]detection_class_probdetection_originOptional[list[str]]emphasized_text_contentsemphasized_text_tagsfile_directoryfilenamefiletype
image_pathimage_base64image_mime_typeheader_footer_typeOptional[bool]is_continuationz Optional[list[FormKeyValuePair]]key_value_pairs	languageslast_modified
link_texts	link_urlsOptional[list[int]]link_start_indexesOptional[list[Link]]linksOptional[list[Element]]orig_elements	page_namepage_number	parent_idbcc_recipientcc_recipientemail_message_id	sent_fromsent_tosubject	signaturetext_as_htmlOptional[dict[str, str | int]]table_as_cellsr   NOptional[str | pathlib.Path]rQ   Nonec$                   || _         || _        || _        || _        || _        || _        || _        || _        |	| _        t          |t          j                  rt          |          n|}t          j                            |pd          \  }$}%|
p|$pd | _        |%pd | _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _         || _!        || _"        || _#        || _$        || _%        | | _&        |"| _'        |!| _(        |#| _)        d S )N )*r}   r   r   r   r   r   r   r   r   rU   pathlibPathr\   ospathsplitr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )&r.   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   directory_path	file_names&                                         r(   rO   zElementMetadata.__init__   sb   L %9!*,(&&$8!(@%$8! %/x$F$FT3x===H$&GMM(.b$A$A!	 -FF$!)T "4(.$."*$""4
 0*"&"""(,r*   rP   objectrR   c                P    t          |t                    sdS | j        |j        k    S )zImplments equivalence, like meta == other_meta.

        All fields at all levels must match. Unpopulated fields are not considered except when
        populated in one and not the other.
        F)rU   r|   r<   rW   s     r(   rX   zElementMetadata.__eq__*  s*     %11 	5{el**r*   	attr_namer\   c                >    || j         v rdS t          d| d          )z)Only called when attribute doesn't exist.Nz+'ElementMetadata' object has no attribute '')_known_field_namesAttributeError)r.   r   s     r(   __getattr__zElementMetadata.__getattr__4  s/    ///4W9WWWXXXr*   _ElementMetadata__name_ElementMetadata__valuer   c                    ||| j         v rt          | |           d S t          s|| j        v rd S t	                                          ||           d S r#   )r+   delattrr   DEBUG_FIELD_NAMESsuper__setattr__)r.   r   r   r]   s      r(   r   zElementMetadata.__setattr__:  sd    ?&&f%%%F2 	vAW7W7WFFG,,,,,r*   	meta_dictr1   c                   ddl m} t          j        |          }t	                      }|                                D ]\  }}|dk    r t                              |          |_        +|dk    r t                              |          |_
        Q|dk    r ||          |_        h|dk    rt          |          |_        t          |||           |S )zConstruct from a metadata-dict.

        This would generally be a dict formed using the `.to_dict()` method and stored as JSON
        before "rehydrating" it using this method.
        r   )!elements_from_base64_gzipped_jsonr   r   r   r   )unstructured.staging.baser   copydeepcopyr|   r,   rH   r?   r   r   r   r   #_kvform_rehydrate_internal_elementsr   setattr)r=   r   r   r.   
field_namefield_values         r(   r?   zElementMetadata.from_dictD  s     	POOOOO M),,	  '0'8'8 
	7 
	7#J]**#6#@#@#M#M  },,#5#?#?#L#L  ..%F%F{%S%S""000'J;'W'W$$j+6666r*   MappingProxyType[str, Any]c                h     t           fd j                                        D                       S )zPopulated metadata fields in this object as a read-only dict.

        Basically `self.__dict__` but it needs a little filtering to remove entries like
        "_known_field_names". Note this is a *snapshot* and will not reflect later changes.
        c                X    i | ]&\  }}|                     d           s|j        v#||'S )_)
startswithr   )r%   r   r   r.   s      r(   r)   z*ElementMetadata.fields.<locals>.<dictcomp>f  sP       +J!,,S11 7AH^6^6^ K6^6^6^r*   )r   r+   r,   r-   s   `r(   r<   zElementMetadata.fields^  sK         /3}/B/B/D/D  
 
 	
r*   c                z      j         t           fd j                                        D                       S )a,  Populated non-ad-hoc fields in this object as a read-only dict.

        Only fields declared at the top of this class are included. Ad-hoc fields added to this
        instance by assignment are not. Note this is a *snapshot* and will not reflect changes that
        occur after this call.
        c                6    i | ]\  }}|v 	|j         v||S r$   )r   )r%   r   r   known_field_namesr.   s      r(   r)   z0ElementMetadata.known_fields.<locals>.<dictcomp>w  sB       +J"333
$J`8`8` K8`8`8`r*   )r   r   r+   r,   )r.   r   s   `@r(   known_fieldszElementMetadata.known_fieldsm  sZ     !3    /3}/B/B/D/D  
 
 	
r*   c                   ddl m} t          j        t	          | j                            }| j        D ]}|                    |d           d |                                D             }| j	        | j	        
                                |d<   | j        | j        
                                |d<   | j         || j                  |d<   | j        t          | j                  |d<   |S )	zConvert this metadata to dict form, suitable for JSON serialization.

        The returned dict is "sparse" in that no key-value pair appears for a field with value
        `None`.
        r   )elements_to_base64_gzipped_jsonNc                2    i | ]\  }}|g k    |i k    ||S r$   r$   )r%   r   r'   s      r(   r)   z+ElementMetadata.to_dict.<locals>.<dictcomp>  s7     %
 %
 %
!
E{{u{{ *{{r*   r   r   r   r   )r   r   r   r   dictr<   r   popr,   r   r/   r   r   r   _kvform_pairs_to_dict)r.   r   r   r   s       r(   r/   zElementMetadata.to_dict~  s    	NMMMMMM$t{"3"344	 0 	, 	,JMM*d++++%
 %
%.__%6%6%
 %
 %
	 ''+'7'?'?'A'AIm$''+'7'?'?'A'AIm$))H)HI[)\)\Io&++@AU+V+VI'(r*   c                    t          |t                    st          d          |j                                        D ]\  }}t          | ||           dS )ak  Update self with all fields present in `other`.

        Semantics are like those of `dict.update()`.

        - fields present in both `self` and `other` will be updated to the value in `other`.
        - fields present in `other` but not `self` will be added to `self`.
        - fields present in `self` but not `other` are unchanged.
        - `other` is unchanged.
        - both ad-hoc and known fields participate in update with the same semantics.

        Note that fields listed in DEBUG_FIELD_NAMES are skipped in this process. Those can only be
        updated by direct assignment to the instance.
        z@argument to '.update()' must be an instance of 'ElementMetadata'N)rU   r|   rN   r<   r,   r   )r.   rP   r   r   s       r(   updatezElementMetadata.update  sg     %11 	a_```',|'9'9';'; 	3 	3#JD*k2222	3 	3r*   FrozenSet[str]c                *    t          | j                  S )aK  field-names for non-user-defined fields, available on all ElementMetadata instances.

        Note that the first call to this lazyproperty adds a `"_known_field_names"` item to the
        `__dict__` of this instance, so this be called *before* iterating through `self.__dict__`
        to avoid a mid-iteration mutation.
        )	frozensetrE   r-   s    r(   r   z"ElementMetadata._known_field_names  s     -...r*   )#NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN)Hr}   r   r   r   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rQ   r   rP   r   rQ   rR   )r   r\   rQ   r   )r   r\   r   r   rQ   r   )r   r1   rQ   r|   )rQ   r   rQ   r1   )rP   r|   rQ   r   )rQ   r   )rA   rB   rC   rD   rE   r   r   rO   rX   r   r   rF   r?   propertyr<   r   r/   r   r   r   __classcell__r]   s   @r(   r|   r|      s}        HH ('''!!!!....----))))####1111----!!!!""""%%%%####5555""""    ####""""++++**** '&&&%%%%####""""      2222
 "	#5"677 /3-1(,,05948048<48(,15"&,0&*)-$(*.)-'+26*.)-&**.15#'%)#')-'+#'!%9=&*!IP P P P Pd+ + + +Y Y Y Y- - - - - -    [2 
 
 
 X
 
 
 
 X
    B3 3 3 3( 	/ 	/ 	/ \	/ 	/ 	/ 	/ 	/r*   r|   c                  H    e Zd ZdZdZ	 dZ	 dZ	 dZ	 dZ	 e	dd	            Z
d
S )ConsolidationStrategya  Methods by which a metadata field can be consolidated across a collection of elements.

    These are assigned to `ElementMetadata` field-names immediately below. Metadata consolidation is
    part of the chunking process and may arise elsewhere as well.
    dropfirststring_concatenateLIST_CONCATENATElist_uniquerQ    dict[str, ConsolidationStrategy]c                `   i d| j         d| j         d| j         d| j        d| j        d| j         d| j        d| j        d	| j        d
| j        d| j         d| j         d| j         d| j        d| j        d| j        d| j        i d| j        d| j        d| j         d| j        d| j        d| j        d| j        d| j         d| j        d| j        d| j         d| j         d| j        d| j         d | j         d!| j         d"| j         | j        | j         | j         | j        d#S )$a7  Mapping from ElementMetadata field-name to its consolidation strategy.

        Note that only _TextSection objects ("pre-chunks" containing only `Text` elements that are
        not `Table`) have their metadata consolidated, so these strategies are only applicable for
        non-Table Text elements.
        r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   max_charactersr   r   r   r   r   r   r   r   )r   r   r   r   )FIRSTDROPr   LIST_UNIQUESTRING_CONCATENATEr=   s    r(   field_consolidation_strategiesz4ConsolidationStrategy.field_consolidation_strategies  s    '
"CI'
CI'
 SY'
 ch	'

 38'
 39'
 #CH'
 '
 '(<'
 #C$8'
 ci'
 	'
 	'
 !#('
 #('
  CH!'
" sx#'
 '
$ sx%'
& ''
( SY)'
* #.+'
, --'
. !#(/'
0 SX1'
2 	3'
4 ch5'
6 SX7'
8 9'
: 39;'
< ='
> ?'
@ syA'
B C'
D syE'
 '
F  2!i9"xM'
 '
 '
 '	
r*   N)rQ   r   )rA   rB   rC   rD   r   r   r   r   r   rF   r   r$   r*   r(   r   r     sh          DHEO-c)_K_.
 .
 .
 [.
 .
 .
r*   r   _Pelementslist[Element]rQ   c                    d | D             }d t          |          D             }d t          | |          D             }| D ]'}|j        j        }|r||vr||         |j        _        (| S )a  Converts `id` and `parent_id` of elements from UUIDs to hashes.

    This function ensures deterministic IDs by:
    1. Converting each element's UUID into a hash.
    2. Updating the `parent_id` to match the new hash ID of parent elements.

    Args:
        elements: A list of Element objects to update.

    Returns:
        List of updated Element objects with hashes for `id` and `parent_id`.
    c                &    g | ]}|j         j        S r$   )metadatar   )r%   es     r(   r6   z+assign_and_map_hash_ids.<locals>.<listcomp>  s    ===qAJ*===r*   c                B    g | ]\  }}t          |          D ]\  }}|S r$   )	enumerate)r%   r   groupseq_on_pages       r(   r6   z+assign_and_map_hash_ids.<locals>.<listcomp>  sK        5PYZ_P`P` >Lk1   r*   c                J    i | ] \  }}|j         |                    |          !S r$   )id
id_to_hash)r%   elementseq_on_page_counters      r(   r)   z+assign_and_map_hash_ids.<locals>.<dictcomp>  s@       (G( 	
G&&':;;  r*   )r   zipr   r   )r   page_numberspage_seq_pairsold_to_new_mappingr   r   s         r(   assign_and_map_hash_idsr    s     >=H===L $+L$9$9  N
 ,/.,I,I    = =J(	 	I-???1)<
Or*   DCallable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]c                     dd} | S )a$  Post-process element-metadata for this document.

    This decorator adds a post-processing step to a document partitioner.

    - Adds `metadata_filename` parameter to docstring if not present.
    - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.

    funcCallable[_P, list[Element]]rQ   c                      j         r'd j        j        v rd j         vr xj         dz  c_         t          j                   d
 fd	            }|S )Nmetadata_filenamezT
Metadata Parameters:
	metadata_filename:
		The filename to use in element metadata.r>   _P.argskwargs	_P.kwargsrQ   r   c                      | i |}t          g| R i |}|                    dd          }|du rt          |          }|S )Nunique_element_idsF)r   rj   r  )r>   r  r   	call_argsr  r  s        r(   wrapperz4process_metadata.<locals>.decorator.<locals>.wrapperC  sf    tT,V,,H7NtNNNvNNI'0}}5I5'Q'Q!U**28<<Or*   )r>   r  r  r  rQ   r   )rD   __code__co_varnames	functoolswraps)r  r  s   ` r(   	decoratorz#process_metadata.<locals>.decorator8  s|    < 	#t}'@@@'t|;;G
 
			 	 	 	 	 
		 r*   )r  r  rQ   r  r$   )r  s    r(   process_metadatar  .  s       0 r*   c                      e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZ dZ!d Z"d!Z#d"Z$d#Z%d$Z&d%Z'd&Z(d'Z)d(Z*d)Z+d*Z,d+Z-e.d,             Z/d-S ).ElementTypeTitleTextUncategorizedTextNarrativeTextBulletedText	ParagraphAbstract	ThreadingFormz
Field-NameValuern   CompositeElementImagePictureFigureCaptionFigureCaptionListListItemz	List-itemChecked	UncheckedCheckBoxCheckedCheckBoxUncheckedRadioButtonCheckedRadioButtonUncheckedAddressEmailAddress	PageBreakFormulaTableHeaderHeadlineSubheadlinezPage-headerzSection-headerFooterFootnotezPage-footer
PageNumberCodeSnippetFormKeysValuesc                :      fdt                     D             S )z
        Convert class attributes to a dictionary.

        Returns:
            dict: A dictionary where keys are attribute names and values are attribute values.
        c                    i | ]E}t          t          |                    |                    d           4|t          |          FS )__)callablegetattrr   )r%   attrr=   s     r(   r)   z'ElementType.to_dict.<locals>.<dictcomp>  sc     
 
 
GC..//
 9=8M8M
'#t$$
 
 
r*   )dirr   s   `r(   r/   zElementType.to_dict  s4    
 
 
 
C
 
 
 	
r*   N)0rA   rB   rC   TITLETEXTUNCATEGORIZED_TEXTNARRATIVE_TEXTBULLETED_TEXT	PARAGRAPHABSTRACT	THREADINGFORM
FIELD_NAMEVALUELINKCOMPOSITE_ELEMENTIMAGEPICTUREFIGURE_CAPTIONFIGURECAPTIONLIST	LIST_ITEMLIST_ITEM_OTHERCHECKED	UNCHECKEDCHECK_BOX_CHECKEDCHECK_BOX_UNCHECKEDRADIO_BUTTON_CHECKEDRADIO_BUTTON_UNCHECKEDADDRESSEMAIL_ADDRESS
PAGE_BREAKFORMULATABLEHEADERHEADLINESUB_HEADLINEPAGE_HEADERSECTION_HEADERFOOTERFOOTNOTEPAGE_FOOTERPAGE_NUMBERCODE_SNIPPETFORM_KEYS_VALUESrF   r/   r$   r*   r(   r!  r!  S  s       ED,$N"MIHIDJED*EG$NFGDI!OGI)-/3G"MJGEFH LK%NFHKK L'
 
 [
 
 
r*   r!  c                  n    e Zd ZU dZded<   dZ	 	 	 	 	 dd dZd Z	 d!d"dZd#dZ	e
d             Zd$dZdS )%Elementa  An element is a semantically-coherent component of a document, often a paragraph.

    There are a few design principles that are followed when creating an element:
    1. It will always have an ID, which by default is a random UUID.
    2. Asking for an ID should always return a string, it can never be None.
    3. ID is lazy, meaning it will be generated when asked for the first time.
    4. When deterministic behavior is needed, the ID can be converted.
        to a hash based on its text `element.id_to_hash(position)`
    4. Even if the `text` attribute is not defined in a subclass, it will default to a blank string.
    6. Assigning a string ID manually is possible, but is meant to be used
        only for deserialization purposes.
    r\   ro   r$  N
element_idr   r   )Optional[tuple[tuple[float, float], ...]]coordinate_systemrK   r   Optional[ElementMetadata]r   c                    |$t          |t                    st          d          || _        |t	                      n|| _        ||t          ||          | j        _        || j        _        t          | d          r| j
        nd| _
        d S )Nz'element_id must be of type str or None.ri   ro   r   )rU   r\   rN   _element_idr|   r   rH   r   r   hasattrro   )r.   r{  r   r}  r   r   s         r(   rO   zElement.__init__  s     !*Z*E*E!FGGG%-5-=)))8"&7&C(;"+<) ) )DM% *:& ")v!6!6>DIIB			r*   c                    | j         S r#   ro   r-   s    r(   __str__zElement.__str__  
    yr*   T
new_systemr   in_placerR   rQ   rI   c                     j         j        " j         j        j         j         j        j        dS t	           fd j         j        j        D                       }|r"| j         j        _         j         j        _        |S )zConverts the element location coordinates to a new coordinate system.

        If inplace is true, changes the coordinates in place and updates the coordinate system.
        Nc              3  l   K   | ].\  }}j         j        j                            ||           V  /dS ))r  xyN)r   r   rL   !convert_coordinates_to_new_system)r%   r  r  r  r.   s      r(   	<genexpr>z<Element.convert_coordinates_to_new_system.<locals>.<genexpr>  sd        
  
 1 M%,NN% O   
  
  
  
  
  
r*   )r   r   rL   rJ   rf   )r.   r  r  new_coordinatess   ``  r(   r  z)Element.convert_coordinates_to_new_system  s     M%-}(/7}(/74  
  
  
  
  
 18 
  
  
 
 
  	:/>DM%,/9DM%,r*   sequence_numberrp   c                    | j         j         | j         | j         j         | }t	          j        |                                                                          dd         | _        | j	        S )a  Calculates and assigns a deterministic hash as an ID.

        The hash ID is based on element's text, sequence number on page,
        page number and its filename.

        Args:
            sequence_number: index on page

        Returns: new ID value
        N    )
r   r   ro   r   hashlibsha256encode	hexdigestr  r  )r.   r  datas      r(   r  zElement.id_to_hash  sa     -(a$)aT]5NaP_aa">$++--88BBDDSbSIwr*   c                h    | j         %t          t          j                              | _         | j         S r#   )r  r\   uuiduuid4r-   s    r(   r  z
Element.id  s*    #"4:<<00Dr*   r1   c                R    d | j         | j        | j                                        dS )N)typer{  ro   r   )r  ro   r   r/   r-   s    r(   r/   zElement.to_dict  s/    'I--//	
 
 	
r*   )NNNNN)
r{  r   r   r|  r}  rK   r   r~  r   r   )T)r  r   r  rR   rQ   rI   )r  rp   rQ   r\   r   )rA   rB   rC   rD   rE   categoryrO   r  r  r  r   r  r/   r$   r*   r(   rz  rz    s           III"H %)AE8<.2*.? ? ? ? ?,   >B    :        X 

 
 
 
 
 
r*   rz  c                  F     e Zd ZdZ	 	 	 	 	 	 dd fdZddZd fdZ xZS )CheckBoxzyA checkbox with an attribute indicating whether its checked or not.

    Primarily used in documents that are forms.
    NFr{  r   r   r|  r}  rK   checkedrR   r   r~  r   c                    |r|nt                      }t                                          |||||           || _        d S )N)r{  r   r}  r   r   )r|   r   rO   r  )r.   r{  r   r}  r  r   r   r]   s          r(   rO   zCheckBox.__init__  sV      (>88_->->!#/- 	 	
 	
 	
 %r*   rP   r   rQ   c                    t          |t                    sdS t          | j        |j        k    | j        j        |j        j        k    f          S rT   )rU   r  rV   r  r   r   rW   s     r(   rX   zCheckBox.__eq__  sN    %** 	5-)U^-GG
 
 	
r*   r1   c                z    t                                                      }d|d<   | j        |d<   | j        |d<   |S )-Serialize to JSON-compatible (str keys) dict.r  r  r  r{  )r   r/   r  r  r.   outr]   s     r(   r/   zCheckBox.to_dict  s;    ggoo FI GL
r*   )NNNFNN)r{  r   r   r|  r}  rK   r  rR   r   r~  r   r   r   r   )rA   rB   rC   rD   rO   rX   r/   r   r   s   @r(   r  r    s          %)AE8<.2*.% % % % % % %&
 
 
 
         r*   r  c                  T     e Zd ZdZ	 	 	 	 	 	 dd fdZddZd ZddZd fdZ xZ	S ) r#  z:Base element for capturing free text from within document.Nro   r\   r{  r   r   r|  r}  rK   r   r~  r   
embeddingsOptional[list[float]]c                    |r|nt                      }|| _        || _        t                                          |||||           d S )N)r{  r   r   r}  r   )r|   ro   r  r   rO   )	r.   ro   r{  r   r}  r   r   r  r]   s	           r(   rO   zText.__init__#  sa      (>88_->->	1;!#/- 	 	
 	
 	
 	
 	
r*   rP   r   c                    t          |t                    sdS t          | j        |j        k    | j        j        |j        j        k    | j        |j        k    | j        |j        k    f          S rT   )rU   r#  rV   ro   r   r   r  r  rW   s     r(   rX   zText.__eq__9  sg    %&& 	5	UZ')U^-GG/5#33	
 
 	
r*   c                    | j         S r#   r  r-   s    r(   r  zText.__str__E  r  r*   cleanersCallable[[str], str]c                    | j         }|D ]} ||          }t          |t                    st          d          || _         dS )zApplies a cleaning brick to the text element.

        The function that's passed in should take a string as input and produce a string as
        output.
        z%Cleaner produced a non-string output.N)ro   rU   r\   rN   )r.   r  cleaned_textcleaners       r(   applyz
Text.applyH  sX     y 	1 	1G"7<00LL,,, 	FDEEE 			r*   rQ   r1   c                    t                                                      }| j        |d<   | j        |d<   | j        |d<   | j        r
| j        |d<   |S )r  r{  r  ro   r  )r   r/   r  r  ro   r  r  s     r(   r/   zText.to_dictW  sS    ggoo GLmFiF? 	0 $C
r*   )NNNNNN)ro   r\   r{  r   r   r|  r}  rK   r   r~  r   r   r  r  )rP   r   )r  r  r   )
rA   rB   rC   rD   rO   rX   r  r  r/   r   r   s   @r(   r#  r#     s        DD
 %)AE8<.2*.,0
 
 
 
 
 
 
,

 

 

 

  ! ! ! !         r*   r#  c                      e Zd ZdZd ZdS )r=  z,An element containing formulas in a documentNrA   rB   rC   rD   r  r$   r*   r(   r=  r=  b  s        22HHHr*   r=  c                      e Zd ZdZd ZdS )r,  aA  A chunk formed from text (non-Table) elements.

    Only produced by chunking. An instance may be formed by combining one or more sequential
    elements produced by partitioning. It it also used when text-splitting an "oversized" element,
    a single element that by itself is larger than the requested chunk size.
    Nr  r$   r*   r(   r,  r,  h  s          "HHHr*   r,  c                      e Zd ZdZd ZdS )r/  z>An element for capturing text associated with figure captions.Nr  r$   r*   r(   r/  r/  s  s        HHHHHr*   r/  c                      e Zd ZdZd ZdS )r%  zNarrativeText is an element consisting of multiple, well-formulated sentences. This
    excludes elements such titles, headers, footers, and captions.Nr  r$   r*   r(   r%  r%  y  s!        F F HHHr*   r%  c                      e Zd ZdZd ZdS )r3  z;ListItem is a NarrativeText element that is part of a list.Nr  r$   r*   r(   r3  r3    s        EEHHHr*   r3  c                      e Zd ZdZd ZdS )r"  z$A text element for capturing titles.Nr  r$   r*   r(   r"  r"    s        ..HHHr*   r"  c                      e Zd ZdZd ZdS )r:  z'A text element for capturing addresses.Nr  r$   r*   r(   r:  r:    s        11HHHr*   r:  c                      e Zd ZdZd ZdS )r;  z&A text element for capturing addressesNr  r$   r*   r(   r;  r;    s        00HHHr*   r;  c                       e Zd ZdZej        ZdS )r-  z,A text element for capturing image metadata.N)rA   rB   rC   rD   r!  r[  r  r$   r*   r(   r-  r-    s        66 HHHr*   r-  c                      e Zd ZdZd ZdS )r<  z%An element for capturing page breaks.Nr  r$   r*   r(   r<  r<    s        //HHHr*   r<  c                      e Zd ZdZd ZdS )r>  z An element for capturing tables.Nr  r$   r*   r(   r>  r>    s        **HHHr*   r>  c                      e Zd ZdZdZdS )
TableChunkz*An element for capturing chunks of tables.r>  Nr  r$   r*   r(   r  r    s        44HHHr*   r  c                      e Zd ZdZd ZdS )r?  z*An element for capturing document headers.Nr  r$   r*   r(   r?  r?            44HHHr*   r?  c                      e Zd ZdZd ZdS )rB  z*An element for capturing document footers.Nr  r$   r*   r(   rB  rB    r  r*   rB  c                      e Zd ZdZd ZdS )rE  z'An element for capturing code snippets.Nr  r$   r*   r(   rE  rE    s        11HHHr*   rE  c                      e Zd ZdZd ZdS )rD  z&An element for capturing page numbers.Nr  r$   r*   r(   rD  rD    s        00HHHr*   rD  c                      e Zd ZdZd ZdS )rF  z1An element for capturing Key-Value dicts (forms).Nr  r$   r*   r(   rF  rF    s        ;;HHHr*   rF  zdict[str, type[Text]]TYPE_TO_TEXT_ELEMENT_MAPkv_pairslist[dict[str, Any]]list[FormKeyValuePair]c                ,   ddl m} | D ]l}|d         d         # ||d         d         g          \  |d         d<   |d         1|d         d         # ||d         d         g          \  |d         d<   mt          t          t                   |           S )z
    The key_value_pairs metadata field contains (in the vast majority of cases)
    nested Text elements. Those need to be turned from dicts into Elements explicitly,
    e.g. when partition_json is used.
    r   )elements_from_dictsr&   ru   Nr'   )r   r  r   rd   rx   )r  r  kv_pairs      r(   r   r     s     >=====   5>*+72E2E 0123 3/WU^,- 7'GG,<=M,N,Z4G4G!"2345 51WW./ %&111r*   orig_kv_pairsc                $   t          j        |           }|D ]x}|d         d         )|d         d                                         |d         d<   |d         7|d         d         )|d         d                                         |d         d<   y|S )z
    The key_value_pairs metadata field contains (in the vast majority of cases)
    nested Text elements. Those need to be turned from Elements to dicts recursively,
    e.g. when FormKeysValues.to_dict() is used.

    r&   ru   Nr'   )r   r   r/   )r  r  r  s      r(   r   r     s     &*]=%A%AH ^ ^5>*+7/6u~>N/O/W/W/Y/YGEN+,7'GG,<=M,N,Z181ABR1S1[1[1]1]GG-.Or*   )r   r   rQ   r   )rQ   r  )r  r  rQ   r  )r  r  rQ   r  )p
__future__r   abcr   dataclassesr;   enumr  r  r   r   r  	itertoolsr   typesr   typingr   r   r   r	   r
   r   typing_extensionsr   r   r   "unstructured.documents.coordinatesr   r   r   &unstructured.partition.utils.constantsr   unstructured.utilsr   r   r   rE   r   	dataclassr   rH   rn   rs   rx   r|   Enumr   r   r  r  r!  ABCrz  r  r#  r=  r,  r/  r%  r3  r"  r:  r;  r-  r<  r>  r  r?  rB  rE  rD  rF  rN  rr  ro  rp  rW  rP  rZ  rO  rQ  rS  rT  rU  rV  rX  rY  ra  rR  rb  rn  rq  rs  ru  rt  r]  r_  r[  r^  r\  rm  ri  rj  rl  rk  rw  rv  rx  r  r   r   r$   r*   r(   <module>r     s   " " " " " " " 



            				         " " " " " " E E E E E E E E E E E E E E E E = = = = = = = = = =         
 W V V V V V L L L L L L L L( ( ( ( (' ' ' ' '        . G1 G1 G1 G1 G1 G1 G1 G1T    9   # # # # #Y # # #    y   h/ h/ h/ h/ h/ h/ h/ h/V	E
 E
 E
 E
 E
DI E
 E
 E
P Yt__       F" " " "J9
 9
 9
 9
 9
 9
 9
 9
xb
 b
 b
 b
 b
cg b
 b
 b
J) ) ) ) )w ) ) )X? ? ? ? ?7 ? ? ?D    d   " " " " "t " " "    D       D       t       D       d       4   ! ! ! ! !D ! ! !           D              T       T       $                   T      &3u&3&3 %&3 e	&3
 E&3 "D&3 !#3&3 m&3 &3 =&3 -&3 =&3 m&3 }&3  m!&3" 8#&3$ x%&3 &3& '&3( )&3* V+&3, -&3. V/&30 &1&32 3&34 5&36 u7&38 9&3: ;&3< u=&3> ?&3@ |A&3B C&3D IE&3F kG&3 &3H Z .K&3 &3  & & & &R2 2 2 2*     r*   