
    Ng%                    R   d Z ddlmZ ddlmZmZmZmZmZm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlm Z m!Z!  eej"                  e	 d-ddddi dddddd
d.d(                        Z# G d) d*          Z$ G d+ d,          Z%dS )/zProvides `partition_html().    )annotations)IOAnyIteratorListLiteralOptionalcastN)etree)add_chunking_strategy)Element)read_txt_file)FileType)apply_metadataget_last_modified_date)Flowhtml_parser)!ontology_to_unstructured_elementsparse_html_to_ontology)is_temp_file_pathlazypropertyTFv1to_text)
filetextencodingurlheaders
ssl_verifyskip_headers_and_footersdetection_originhtml_parser_versionimage_alt_modefilenameOptional[str]r   Optional[IO[bytes]]r   r   r   r   dict[str, str]r   boolr    r!   r"   Literal['v1', 'v2']r#   Optional[Literal['to_text']]kwargsr   returnlist[Element]c       
            | |                                 dk    r|s| s|sg S t          | |||||||||	|
          }t          t                              |                    S )a  Partitions an HTML document into its constituent elements.

    HTML source parameters
    ----------------------
    The HTML to be partitioned can be specified four different ways:

    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the HTML document.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
    headers
        The HTTP headers to be used in the HTTP request when `url` is specified.
    ssl_verify
        If the URL parameter is set, determines whether or not SSL verification is performed
        on the HTTP request.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    skip_headers_and_footers
        If True, ignores any content that is within <header> or <footer> tags

    html_parser_version (Literal['v1', 'v2']):
        The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
        use the ontology schema to parse the HTML document.

    image_alt_mode (Literal['to_text']):
        When set 'to_text', the v2 parser will include the alternative text of images in the output.
    N )	file_pathr   r   r   r   r   r   r    r!   r"   r#   )stripHtmlPartitionerOptionslist_HtmlPartitioneriter_elements)r$   r   r   r   r   r   r   r    r!   r"   r#   r+   optss                a/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/html/partition.pypartition_htmlr8      s    b DJJLLB..t.H.UX.	!!9)/%  D  ..t44555    c                      e Zd ZdZdddd!dZed"d            Zed"d            Zed#d            Zed"d            Z	ed$d            Z
ed%d            Zed$d            Zd S )&r2   zVEncapsulates partitioning option validation, computation, and application of defaults.r   r   )r"   r#   r0   
str | Noner   IO[bytes] | Noner   r   r   r   r'   r   r(   r    r!   r"   r)   r#   r*   c                   || _         || _        || _        || _        || _        || _        || _        || _        |	| _        |
| _	        || _
        d S N)
_file_path_file_text	_encoding_url_headers_ssl_verify_skip_headers_and_footers_detection_origin_html_parser_version_image_alt_mode)selfr0   r   r   r   r   r   r   r    r!   r"   r#   s               r7   __init__zHtmlPartitionerOptions.__init__a   s\     $

!	%)A&!1$7!-r9   r,   c                    | j         S )zOTrace of initial partitioner to be included in metadata for debugging purposes.)rG   rJ   s    r7   r!   z'HtmlPartitionerOptions.detection_origin|   s     %%r9   c                    | j         S )zCaller-provided encoding used to store HTML character stream as bytes.

        `None` when no encoding was provided and encoding should be auto-detected.
        )rB   rM   s    r7   r   zHtmlPartitionerOptions.encoding   s     ~r9   strc                    | j         r!t          | j         | j                  d         S | j        r!t          | j        | j                  d         S | j        rt          | j                  S | j        rt          j        | j        | j	        | j
                  }|j        st          d|j                   |j                            dd          }|                    d          st          d	| d
          |j        S t          d          )zIThe HTML document as a string, loaded from wherever the caller specified.)r$   r      )r   r   )r   verifyz*Error status code on GET of provided URL: zContent-Typer/   z	text/htmlz%Expected content type text/html. Got .z>Exactly one of filename, file, text, or url must be specified.)r?   r   rB   r@   rA   rO   rC   requestsgetrD   rE   ok
ValueErrorstatus_coder   
startswithr   )rJ   responsecontent_types      r7   	html_textz HtmlPartitionerOptions.html_text   s    ? 	W $/DNSSSTUVV: 	N dj4>JJJ1MM: 	#tz??"9 
	!|DIt}TM]^^^H;  WAUWW   $+//CCL**;77 Z !X!X!X!XYYY= YZZZr9   c                d    | j         rt          | j                   rdnt          | j                   S )zHThe best last-modified date available, None if no sources are available.N)r?   r   r   rM   s    r7   last_modifiedz$HtmlPartitionerOptions.last_modified   s6    
 ?9&7&H&H9DD'88	
r9   c                    | j         S )zAWhen True, elements located within a header or footer are pruned.)rF   rM   s    r7   r    z/HtmlPartitionerOptions.skip_headers_and_footers   s     --r9   c                    | j         S )zEWhen html_parser_version=='v2', HTML elements follow ontology schema.)rH   rM   s    r7   r"   z*HtmlPartitionerOptions.html_parser_version   s     ((r9   c                    | j         dk    S )zDWhen True, the alternative text of images is included in the output.r   )rI   rM   s    r7   add_img_alt_textz'HtmlPartitionerOptions.add_img_alt_text   s     #y00r9   N)r0   r;   r   r<   r   r;   r   r;   r   r;   r   r'   r   r(   r    r(   r!   r;   r"   r)   r#   r*   )r,   r;   )r,   rO   )r,   r(   )r,   r)   )__name__
__module____qualname____doc__rK   r   r!   r   r\   r^   r    r"   rb    r9   r7   r2   r2   ^   s$       `` 487@. . . . . .6 & & & \&    \ [ [ [ \[2 
 
 
 \
 . . . \. ) ) ) \) 1 1 1 \1 1 1r9   r2   c                  j    e Zd ZdZddZedd            ZddZedd
            Z	edd            Z
dS )r4   z/Partition HTML document into document-elements.r6   r2   c                    || _         d S r>   )_opts)rJ   r6   s     r7   rK   z_HtmlPartitioner.__init__   s    


r9   r,   Iterator[Element]c              #  P   K    | |                                           E d{V  dS )zBPartition HTML document provided by `opts` into document-elements.N)_iter_elements)clsr6   s     r7   r5   z_HtmlPartitioner.iter_elements   s:       3t99++-----------r9   c              #     K   | j         j        dk    r| j                                        n| j        }|D ]2}| j         j        |j        _        | j         j        |j        _        |V  3dS )zGenerated document-elements (e.g. Title, NarrativeText, etc.) parsed from document.

        Elements appear in document order.
        r   N)rj   r"   _mainr5   _from_ontologyr^   metadatar!   )rJ   elements_iteres      r7   rm   z_HtmlPartitioner._iter_elements   s}       z-55 J$$&&&$ 	  	 	A'+z'?AJ$*.**EAJ'GGGG	 	r9   r   c                "   | j         j        }	 t          j        |t                    }n=# t
          $ r0 t          j        |                    d          t                    }Y nw xY wt          j        |g dd           | j         j        rt          j        |ddgd           |	                    d          x}t          t          |          S |	                    d	          x}t          t          |          S t          t          |          S )
zThe root HTML element.zutf-8)delimglinkmetanoscriptscriptstyleF)	with_tailheaderfooterz.//mainNz.//body)rj   r\   r   
fromstringr   rW   encodestrip_elementsr    findr
   r   )rJ   r\   rootmainbodys        r7   rp   z_HtmlPartitioner._main   s#   
 J(		L#I{;;DD 	L 	L 	L#I$4$4W$=$={KKDDD	L
 	OOO[`	
 	
 	
 	

 :. 	N (';uMMMM IIi(((D5d###IIi(((D5d###D$s   ) 7A#"A#List[Element]c                r    | j         j        }t          |          }t          || j         j                  }|S )zHConvert an ontology elements represented in HTML to an ontology element.)rb   )rj   r\   r   r   rb   )rJ   r\   ontologyunstructured_elementss       r7   rq   z_HtmlPartitioner._from_ontology   sB     J(	))44 Atz'B!
 !
 !
 %$r9   N)r6   r2   )r6   r2   r,   rk   )r,   rk   )r,   r   )r,   r   )rc   rd   re   rf   rK   classmethodr5   rm   r   rp   rq   rg   r9   r7   r4   r4      s        99    . . . [.           \ B % % % \% % %r9   r4   r>   )r$   r%   r   r&   r   r%   r   r%   r   r%   r   r'   r   r(   r    r(   r!   r%   r"   r)   r#   r*   r+   r   r,   r-   )&rf   
__future__r   typingr   r   r   r   r   r	   r
   rT   lxmlr   unstructured.chunkingr   unstructured.documents.elementsr    unstructured.file_utils.encodingr   unstructured.file_utils.modelr   &unstructured.partition.common.metadatar   r   "unstructured.partition.html.parserr   r   +unstructured.partition.html.transformationsr   r   unstructured.utilsr   r   HTMLr8   r2   r4   rg   r9   r7   <module>r      s	   " ! " " " " " " C C C C C C C C C C C C C C C C C C        7 7 7 7 7 7 3 3 3 3 3 3 : : : : : : 2 2 2 2 2 2 Y Y Y Y Y Y Y Y @ @ @ @ @ @ @ @        ? > > > > > > > "@6 !%" %*&*/33<@6 @6 @6 @6 @6  @6F[1 [1 [1 [1 [1 [1 [1 [1|E% E% E% E% E% E% E% E% E% E%r9   