
    NgA                    B   U d Z ddlmZ ddlZddlZddlZddlZddlZddl	Z	ddl
mZmZ ddlmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddl m!Z! dZ"de#d<   	 d(ddddddd)d!Z$ G d" d#          Z% G d$ d%          Z& G d& d'          Z'dS )*z{Provides `partition_email()` function.

Suitable for use with `.eml` files, which can be exported from many email clients.
    )annotationsN)EmailMessageMIMEPart)IOAnyFinalIteratorcast)ElementElementMetadata)FileType)UnsupportedFileFormatError)get_last_modified_date)partition_html)partition_text)lazyproperty)	text/html
text/plainzFinal[tuple[str, ...]]VALID_CONTENT_SOURCESr   T)filecontent_sourcemetadata_filenamemetadata_last_modifiedprocess_attachmentsfilename
str | Noner   IO[bytes] | Noner   strr   r   r   boolkwargsr   returnlist[Element]c          	         t                               | ||||||          }t          t                              |                    S )a5  Partitions an .eml file into document elements.

    Args:
        filename: str path of the target file.
        file: A file-like object open for reading bytes (not str) e.g. --> open(filename, "rb").
        content_source: The preferred message body. Many emails contain both a plain-text and an
            HTML version of the message body. By default, the HTML version will be used when
            available. Specifying "text/plain" will cause the plain-text version to be preferred.
            When the preferred version is not available, the other version will be used.
        metadata_filename: The file-path to use for metadata purposes. Useful when the target file
            is specified as a file-like object or when `filename` is a temporary file and the
            original file-path is known or a more meaningful file-path is desired.
        metadata_last_modified: The last-modified timestamp to be applied in metadata. Useful when
            a file-like object (which can have no last-modified date) target is used. The
            last-modified metadata is otherwise drawn from the filesystem when a path is provided.
        process_attachments: When True, also partition any attachments in the message after
            partitioning the message body. All document elements appear in the single returned
            element list. The filename of the attachment, when available, is used as the
            `filename` metadata value for elements arising from the attachment.

    Note that all global keyword arguments such as `unique_element_ids`, `language` and
    `chunking_strategy` can be used and will be passed along to the decorators that implement
    those functions. Further, any keyword arguments applicable to HTML will be passed along to the
    HTML partitioner when processing an HTML message body.
    	file_pathr   r   metadata_file_pathr   r   r    ctx)EmailPartitioningContextloadlist_EmailPartitioneriter_elements)r   r   r   r   r   r   r    r(   s           X/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/email.pypartition_emailr/      sV    F #
'
'%,5/ (  C !//C/88999    c                     e Zd ZdZddddddi fd*dZed+d            Zed,d            Zed-d            Z	ed,d            Z
ed.d            Zed/d            Zed0d            Zed0d            Zed0d            Zed0d             Zed1d"            Zed2d#            Zed3d$            Zed0d%            Zed,d&            Zed0d'            Zed0d(            Zd4d)ZdS )5r)   zVEncapsulates partitioning option validation, computation, and application of defaults.Nr   Fr%   r   r   r   r   r   r&   r   r   r   r    dict[str, Any]c                h    || _         || _        || _        || _        || _        || _        || _        d S N)
_file_path_file_content_source_metadata_file_path_metadata_last_modified_process_attachments_kwargs)selfr%   r   r   r&   r   r   r    s           r.   __init__z!EmailPartitioningContext.__init__O   s=     $
-#5 '=$$7!r0   r!   c           	     J     | |||||||                                           S )z#Construct and validate an instance.r$   )	_validate)clsr%   r   r   r&   r   r   r    s           r.   r*   zEmailPartitioningContext.loada   s=     s)1#9 3
 
 
 )++	r0   list[str] | Nonec                    | j                             d          }|sdS t          j                            |          }d |D             S )z6The "blind carbon-copy" Bcc: addresses of the message.BccNc                L    g | ]!}t           j                            |          "S  emailutils
formataddr.0addrs     r.   
<listcomp>z:EmailPartitioningContext.bcc_addresses.<locals>.<listcomp>~   (    ???&&t,,???r0   msgget_allrG   rH   getaddresses)r<   bccsaddrss      r.   bcc_addressesz&EmailPartitioningContext.bcc_addressesw   sO     x&& 	4((..??????r0   MIMEPart | Nonec                B    | j                             | j                  S )zThe message part containing the actual textual email message.

        This is as opposed to attachments or "related" parts like an image that appears in the
        message etc.
        )preferencelist)rP   get_bodycontent_type_preferencer<   s    r.   	body_partz"EmailPartitioningContext.body_part   s      x  0L MMMr0   c                    | j                             d          }|sdS t          j                            |          }d |D             S )z/The "carbon-copy" Cc: addresses of the message.CcNc                L    g | ]!}t           j                            |          "S rE   rF   rJ   s     r.   rM   z9EmailPartitioningContext.cc_addresses.<locals>.<listcomp>   rN   r0   rO   )r<   ccsrT   s      r.   cc_addressesz%EmailPartitioningContext.cc_addresses   O     ht$$ 	4((--??????r0   tuple[str, ...]c                     | j         dk    rdndS )zWhether to prefer HTML or plain-text body when message-body has both.

        The default order of preference is `("html", "plain")`. The order can be switched by
        specifying `"text/plain"` as the `content_source` arg value.
        r   )plainhtml)rf   re   )r7   r[   s    r.   rZ   z0EmailPartitioningContext.content_type_preference   s     %)$8L$H$H  N__r0   r   c                |    t          | j        | j        | j        | j        r| j        gnd| j        | j                  S )zThe email-specific metadata fields for this message.

        Suitable for use with `.metadata.update()` on the base metadata applied to message body
        elements by delegate partitioners for text and HTML.
        N)bcc_recipientcc_recipientemail_message_id	sent_fromsent_tosubject)r   rU   ra   
message_idfrom_addressto_addressesrm   r[   s    r.   email_metadataz'EmailPartitioningContext.email_metadata   sM     ,*!_-1->Ht())D%L
 
 
 	
r0   c                    | j                             d          }|sdS t          j                            |          }d |D             }|d         S )z"The address of the message sender.FromNc                L    g | ]!}t           j                            |          "S rE   rF   rJ   s     r.   rM   z9EmailPartitioningContext.from_address.<locals>.<listcomp>   s(    JJJD5;11$77JJJr0   r   rO   )r<   fromsrT   formatted_addrss       r.   ro   z%EmailPartitioningContext.from_address   sY       (( 	4((//JJEJJJq!!r0   c                    | j                             d          }|sdS |                                                    d          S )z2The value of the Message-ID: header, when present.z
Message-IDNz<>)rP   getstrip)r<   raw_ids     r.   rn   z#EmailPartitioningContext.message_id   s@     l++ 	4||~~##D)))r0   c                "    | j         p| j        pdS )a  The best available file-path information for this email message.

        It's value is computed according to these rules, applied in order:

          - The `metadata_filename` arg value when one was provided to `partition_email()`.
          - The `file_path` value when one was provided.
          - None otherwise.

        This value is used as the `filename` metadata value for elements produced by partitioning
        the email message (but not those from its attachments).
        N)r8   r5   r[   s    r.   r&   z+EmailPartitioningContext.metadata_file_path   s     'B4?BdBr0   c                ,    | j         p| j        p| j        S )au  The best available last-modified date for this message, as an ISO8601 string.

        It's value is computed according to these rules, applied in order:

          - The `metadata_last_modified` arg value when one was provided to `partition_email()`.
          - The date-time in the `Sent:` header of the message, when present.
          - The last-modified date recorded on the filesystem for `file_path` when it was provided.
          - None otherwise.

        This value is used as the `last_modified` metadata value for all elements produced by
        partitioning this email message, including any attachments.
        )r9   
_sent_date_filesystem_last_modifiedr[   s    r.   r   z/EmailPartitioningContext.metadata_last_modified   s     +`t`$B``r0   r   c                   | j         et          | j         d          5 }t          t          t	          j        |t          j        j                            cddd           S # 1 swxY w Y   | j        J | j        	                                }t          t          t	          j
        |t          j        j                            S )zOThe Python stdlib `email.message.EmailMessage` object parsed from the EML file.Nrb)policy)r5   openr
   r   rG   message_from_binary_filer   defaultr6   readmessage_from_bytes)r<   f
file_bytess      r.   rP   zEmailPartitioningContext.msg   s     ?&dot,,  %"@5<K_"`"`"`                
 z%%%Z__&&
L%"::elNb"c"c"cddds   8A""A&)A&c                    | j         S )zThe "extra" keyword arguments received by `partition_email()`.

        These are passed along to delegate partitioners which extract keyword args like
        `chunking_strategy` etc. in their decorators to control metadata behaviors, etc.
        )r;   r[   s    r.   partitioning_kwargsz,EmailPartitioningContext.partitioning_kwargs   s     |r0   c                    | j         S )zWhen True, partition attachments in addition to the email message body.

        Any attachment having file-format that cannot be partitioned by unstructured is silently
        skipped.
        )r:   r[   s    r.   r   z,EmailPartitioningContext.process_attachments   s     ((r0   c                B    | j                             d          }|sdS |S )z/The value of the Subject: header, when present.SubjectN)rP   rx   )r<   rm   s     r.   rm   z EmailPartitioningContext.subject   s)     (,,y)) 	4r0   c                    | j                             d          }|sdS t          j                            |          }d |D             S )z!The To: addresses of the message.ToNc                L    g | ]!}t           j                            |          "S rE   rF   rJ   s     r.   rM   z9EmailPartitioningContext.to_addresses.<locals>.<listcomp>  rN   r0   rO   )r<   tosrT   s      r.   rp   z%EmailPartitioningContext.to_addresses  rb   r0   c                <    | j         rt          | j                   ndS )zVLast-modified retrieved from filesystem when a file-path was provided, None otherwise.N)r5   r   r[   s    r.   r~   z2EmailPartitioningContext._filesystem_last_modified  s!     ;?/S%do666tSr0   c                    | j                             d          }|sdS t          j                            |          }|                    t          j        j                  	                    d          S )z?ISO-8601 str representation of message sent-date, if available.DateNseconds)timespec)
rP   rx   rG   rH   parsedate_to_datetime
astimezonedttimezoneutc	isoformat)r<   date_str	sent_dates      r.   r}   z#EmailPartitioningContext._sent_date  sb     8<<'' 	4K55h??	##BKO44>>	>RRRr0   c                l   | j         s| j        st          d          | j        rVt          | j                            d          t
                    st          d          | j                            d           | j        t          vr+t          t          | j                   dt                     | S )z5Raise on first invalid option, return self otherwise.zOno document specified; either a `filename` or `file` argument must be provided.r   z)file object must be opened in binary modez: is not a valid value for content_source; must be one of: )
r5   r6   
ValueError
isinstancer   bytesseekr7   r   reprr[   s    r.   r?   z"EmailPartitioningContext._validate  s     	tz 	a   : 	
""E  N !!LMMMJOOA'<<<,-- < <$9< <  
 r0   )r%   r   r   r   r   r   r&   r   r   r   r   r   r    r2   )r%   r   r   r   r   r   r&   r   r   r   r   r   r    r2   r!   r)   )r!   rA   )r!   rV   )r!   rc   )r!   r   r!   r   )r!   r   )r!   r2   )r!   r   )r!   r)   )__name__
__module____qualname____doc__r=   classmethodr*   r   rU   r\   ra   rZ   rq   ro   rn   r&   r   rP   r   r   rm   rp   r~   r}   r?   rE   r0   r.   r)   r)   L   s       `` !%!%))--1$)!#    $    [* @ @ @ \@ N N N \N @ @ @ \@ ` ` ` \` 
 
 
 \
 " " " \" * * * \* C C C \C a a a \a e e e \e    \ ) ) ) \)    \ @ @ @ \@ T T T \T S S S \S     r0   r)   c                  B    e Zd ZdZddZedd            ZddZdd	Zd
S )r,   z8Encapsulates the partitioning logic for email documents.r(   r)   c                    || _         d S r4   )_ctx)r<   r(   s     r.   r=   z_EmailPartitioner.__init__7  s    			r0   r!   Iterator[Element]c                >     | |                                           S )z@Generate the document elements for the email described by `ctx`.r'   _iter_elements)r@   r(   s     r.   r-   z_EmailPartitioner.iter_elements:  s      ss|||**,,,r0   c              #  2  K   |                                  D ]*}|j                            | j        j                   |V  +| j        j        sdS | j        j                                        D ](}t          	                    || j                  E d{V  )dS )zGenerate the document elements for the email described in the partitioning context.

        This optionally includes elements generated by partitioning any partitionable attachments
        in the message as well.
        N)
_iter_email_body_elementsmetadataupdater   rq   r   rP   iter_attachments_AttachmentPartitionerr-   )r<   e
attachments      r.   r   z _EmailPartitioner._iter_elements?  s       //11 	 	AJdi6777GGGGy, 	F)-88:: 	S 	SJ-;;J	RRRRRRRRRR	S 	Sr0   c              #    K   | j         j        }|dS |                                }|                                }t	          |t
                    sJ |dk    rAt          d|| j         j        t          j	        | j         j
        d| j         j        E d{V  dS t          d|| j         j        t          j	        | j         j
        d| j         j        E d{V  dS )z/Generate document elements from the email body.Nr   )textr   metadata_file_typer   rE   )r   r\   get_content_typeget_contentr   r   r   r&   r   EMLr   r   r   )r<   r\   content_typecontents       r.   r   z+_EmailPartitioner._iter_email_body_elementsO  s8     I'	 F 1133''))'3''''';&&% "&)">#+<'+y'G	 
 )/           & "&)">#+<'+y'G	 
 )/          r0   N)r(   r)   )r(   r)   r!   r   r!   r   )	r   r   r   r   r=   r   r-   r   r   rE   r0   r.   r,   r,   4  sz        BB    - - - [-S S S S      r0   r,   c                      e Zd ZdZddZedd	            Zdd
Zedd            Z	edd            Z
edd            ZdS )r   z'Partitions an attachment to a MSG file.r   r   r(   r)   c                "    || _         || _        d S r4   )_attachmentr   )r<   r   r(   s      r.   r=   z_AttachmentPartitioner.__init__p  s    %			r0   r!   r   c                >     | ||                                           S )zHPartition an attachment MIME-part from a MIME email message (.eml file).r   )r@   r   r(   s      r.   r-   z$_AttachmentPartitioner.iter_elementst  s"    
 s:s##22444r0   c              #     K   ddl m} t          j        | j                  }	  |d|| j        | j        j        d| j        j        }n# t          $ r Y dS w xY w|D ]}| j
        |j        _        |V  dS )zPartition the byte-stream in the attachment MIME-part into elements.

        Generates zero elements if the attachment is not partitionable.
        r   )	partition)r   r   r   NrE   )unstructured.partition.autor   ioBytesIO_file_bytes_attachment_file_namer   r   r   r   _attached_to_filenamer   attached_to_filename)r<   r   r   elementsr   s        r.   r   z%_AttachmentPartitioner._iter_elements{  s       	:99999z$*++
	 y "&"<'+y'G  )/	 HH * 	 	 	 FF	
  	 	A.2.HAJ+GGGG	 	s   %A	 	
AAr   c                `    | j         j        }|dS t          j                            |          S )z@The file-name (no path) of the message. `None` if not available.N)r   r&   ospathbasename)r<   r%   s     r.   r   z,_AttachmentPartitioner._attached_to_filename  s/     I0	4w	***r0   c                4    | j                                         S )zOThe original name of the attached file, `None` if not present in the MIME part.)r   get_filenamer[   s    r.   r   z,_AttachmentPartitioner._attachment_file_name  s     ,,...r0   r   c                    | j                                         }t          |t                    r|                    d          S t          |t
                    sJ |S )zThe bytes of the attached file.zutf-8)r   r   r   r   encoder   )r<   r   s     r.   r   z"_AttachmentPartitioner._file_bytes  sW     "..00gs## 	+>>'***'5)))))r0   N)r   r   r(   r)   )r   r   r(   r)   r!   r   r   r   )r!   r   )r   r   r   r   r=   r   r-   r   r   r   r   r   rE   r0   r.   r   r   m  s        11    5 5 5 [5   8 + + + \+ / / / \/    \  r0   r   r4   )r   r   r   r   r   r   r   r   r   r   r   r   r    r   r!   r"   )(r   
__future__r   datetimer   rG   email.policyemail.utilsr   r   email.messager   r   typingr   r   r   r	   r
   unstructured.documents.elementsr   r   unstructured.file_utils.modelr   unstructured.partition.commonr   &unstructured.partition.common.metadatar   unstructured.partition.htmlr   unstructured.partition.textr   unstructured.utilsr   r   __annotations__r/   r)   r,   r   rE   r0   r.   <module>r      s    
 # " " " " "              				 				 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 D D D D D D D D 2 2 2 2 2 2 D D D D D D I I I I I I 6 6 6 6 6 6 6 6 6 6 6 6 + + + + + +0K  K K K K  -: "%$()- $-: -: -: -: -: -:`e e e e e e e eP6 6 6 6 6 6 6 6r@ @ @ @ @ @ @ @ @ @r0   