
    NgA                        d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZmZ d dlmZmZ d dlmZmZmZmZ d	Z	 	 	 	 	 d2d3dZd4dZd5dZd6dZd6d Zd7d#Zd8d%Zd9d'Zd9d(Z	 d:d;d.Zd<d0Z d=d1Z!dS )>    )annotationsN)OrderedDict)chain)SequenceType)BeautifulSoupTag)elementsontology)CSS_CLASS_TO_ELEMENT_TYPE_MAP)HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP$HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP+ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE2   Tontology_elementontology.OntologyElement	parent_idstrpage_numberintdepthfilename
str | Noneadd_img_alt_textboolreturnlist[elements.Element]c                   g }| j         t          j        j        k    r|t          k    r|!t          | t          j                  r| j        }t          | t          j                  sL|t          j
        d| j        dt          j        ||                     d          |||                    gz  }g }| j        D ]@}t          || j        |t          | t          j                  rdn|d	z   ||
          }||z  }At!          |          }	||	z  }njt"          | j                 }
|                                 }|                     |          } |
|| j        dt          j        |||||                    }|g}|S )a5  
    Converts an OntologyElement object to a list of unstructured Element objects.

    To preserve the structure of the ontology, the function is recursive
    and the tree structure is represented in flatten list by the parent_id
    attribute in the metadata of each Element object.
    To preserve all the attributes of the ontology element, the HTML code
    is injected to unstructured Element in ElementMetadata.text_as_html attribute.

    For Layout elements, the function creates an empty Text Element (with the
    HTML code injected the same way).

    TODO (Pluto): Better way would be to have special Element type in Unstructured

    Args:
        ontology_element (OntologyElement): The ontology element to be converted.
        parent_id (str, optional): The ID of the parent element. Defaults to None.
        page_number (int, optional): The page number of the element. Defaults to None.
        depth (int, optional): The depth of the element in the hierarchy. Defaults to 0.
        filename (str, optional): The name of the file the element comes from. Defaults to None.
        add_img_alt_text (bool): Whether to include the alternative text of images
                                            in the output. Defaults to True.
    Returns:
        list[Element]: A list of unstructured Element objects.
    N vlm_partitionerF)add_children)r   text_as_htmlr   category_depthr   )text
element_iddetection_originmetadatar      )r   r   r   r   r   )r   )elementTyper   ElementTypeEnumlayoutRECURSION_LIMIT
isinstancePager   Documentr
   TextidElementMetadatato_htmlchildren!ontology_to_unstructured_elementscombine_inline_elementsr   	__class__to_text)r   r   r   r   r   r   elements_to_returnr4   childcombined_childrenelement_classhtml_code_of_ontology_elementelement_textunstructured_elements                 g/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/html/transformations.pyr5   r5      s   B #x'?'FFF5TcKcKc:.>#N#N*6K*H,=>> 	/2%6%5"+%5%=%=5%=%Q%Q$/',!)  	  #  %. 		 		E5*-'%&68IJJYaaPUXYPY!!1  E HH3H==//CDTD^_(8(@(@(B(B%'//AQ/RR,}'*.-#:'$!  	 
  
  
 33    r
   c                   g }d}| D ]i}||}t          ||          r;|xj        d|j        z   z  c_        |j        xj        d|j        j        z   z  c_        R|                    |           |}j||                    |           |S )a  
    Combines consecutive inline elements into a single element. Inline elements
    can be also combined with text elements.

    Combined elements contains multiple HTML tags together eg.
    {
        'text': "Text from element 1 Text from element 2",
        'metadata': {
            'text_as_html': "<p>Text from element 1</p><a>Text from element 2</a>"
        }
    }

    Args:
        elements (list[Element]): A list of elements to be combined.

    Returns:
        list[Element]: A list of combined elements.
    N )#can_unstructured_elements_be_mergedr$   r'   r"   append)r
   result_elementscurrent_elementnext_elements       r@   r6   r6   o   s    & OO  
+ 
+"*O.MM 	+  C,*;$;;  $11S<;P;]5]]111""?333*OO"///rA   rG   elements.ElementrH   c                   | j         j        |j         j        k    rdS t          | j         j        d                              d          }t          |j         j        d                              d          }d t          ||          D             }|D ]-}|j        r dS t          |          st          |          s dS .dS )z
    Elements can be merged when:
    - They are on the same level in the HTML tree
    - Neither of them has children
    - All elements are inline elements or text element
    Fhtml.parser	recursivec                ,    g | ]}t          |          S  )parse_html_to_ontology_element).0html_tags     r@   
<listcomp>z7can_unstructured_elements_be_merged.<locals>.<listcomp>   s0        	'x00  rA   T)	r'   r#   r   r"   find_allr   r4   is_inline_elementis_text_element)rG   rH   current_html_tagsnext_html_tagsontology_elementsr   s         r@   rD   rD      s     .,2G2VVVu% -} hh  #<#8#E}UU^^ _  N /@@  
 .  $ 	55!"233 	GW7X7X 	554rA   c                T    t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        g}t           j	        j
        g}t           fd|D                       rdS t           fd|D                       rdS dS )z>Categories or classes that we want to combine with inline textc              3  8   K   | ]}t          |          V  d S Nr-   rQ   class_r   s     r@   	<genexpr>z"is_text_element.<locals>.<genexpr>   s.      
K
KF:&//
K
K
K
K
K
KrA   Tc              3  .   K   | ]}j         |k    V  d S r\   r)   rQ   categoryr   s     r@   r`   z"is_text_element.<locals>.<genexpr>   s,      
T
T'83
T
T
T
T
T
TrA   F)r   NarrativeTextQuote	ParagraphFootnoteFootnoteReferenceCitationBibliographyGlossaryr*   r'   any)r   text_classestext_categoriess   `  r@   rV   rV      s     	"	L  /89O

K
K
K
Kl
K
K
KKK t

T
T
T
TO
T
T
TTT t5rA   c                     t           j        g}t           j        j        t           j        j        g}t           fd|D                       rdS t           fd|D                       rdS dS )z@Categories or classes that we want to combine with text elementsc              3  8   K   | ]}t          |          V  d S r\   r]   r^   s     r@   r`   z$is_inline_element.<locals>.<genexpr>   s.      
M
MF:&//
M
M
M
M
M
MrA   Tc              3  .   K   | ]}j         |k    V  d S r\   rb   rc   s     r@   r`   z$is_inline_element.<locals>.<genexpr>   s,      
V
V'83
V
V
V
V
V
VrA   F)r   	Hyperlinkr*   specialized_text
annotationrm   )r   inline_classesinline_categoriess   `  r@   rU   rU      s     ()N 1 +
 
M
M
M
Mn
M
M
MMM t

V
V
V
VDU
V
V
VVV t5rA   unstructured_elementsSequence[elements.Element]c                :   t                      }| d         j        j        }|0t          j                                        }|| d         j        _        t          j        d|i          ||<   | D ]}t          |j        j        d          	                    d          }|D ]_}t          |          }|||j        <   |j        j        r8|j        j        |v r*||j        j                 j                            |           `|                    d          \  }}|S )	a9  
    Converts a sequence of unstructured Element objects to an OntologyElement object.

    The function caches the elements in a dictionary and each element is assigned to its parent.
    At the end the root element is popped from the dictionary and returned.

    Such approach comes with limitations:
        - The parent element has to be in the list before the child element

    Args:
        unstructured_elements (Sequence[Element]): The sequence of unstructured Element objects.

    Returns:
        OntologyElement: The converted OntologyElement object.
    r   Nr1   )additional_attributesrK   FrL   )last)r   r'   r   r   OntologyElementgenerate_unique_idr/   r   r"   rT   rP   r1   r4   rE   popitem)	rx   id_to_element_mappingdocument_element_idelementhtml_as_tagshtml_as_tagr   root_idroot_elements	            r@   !unstructured_elements_to_ontologyr      sS   $ (MM/2;E"&6IIKK6Ia )3191B#%892 2 2-. ) d d$W%5%BMRR[[ \ 
 
 ( 	d 	dK=kJJ :J!"2"56) dg.>.HLa.a.a%g&6&@AJQQRbccc	d 299u9EEG\rA   	html_codec                   t          |           } t          |           } t          | d          }|                    dd          }|s|                    dd          }|st	          d          t          |          }|S )ag  
    Parses the given HTML code and converts it into an Element object.

    Args:
        html_code (str): The HTML code to be parsed.
            Parsing HTML will start from <div class="Page">.
    Returns:
        OntologyElement: The parsed Element object.

    Raises:
        ValueError: If no <body class="Document"> element is found in the HTML.
    rK   bodyr/   )r_   divr.   zKNo <body class='Document'> or <div class='Page'> element found in the HTML.)#remove_empty_divs_from_html_content#remove_empty_tags_from_html_contentr   find
ValueErrorrP   )r   soupdocumentdocument_elements       r@   parse_html_to_ontologyr     s     4I>>I3I>>IM22Dyy
y33H 399U6922 
Y
 
 	
 6h??rA   html_contentc                    t          | d          }|                    d          }t          |          D ]}|j        s|                                 t          |          S )NrK   r   )r   rT   reversedattrsunwrapr   )r   r   divsr   s       r@   r   r   3  s[    }55D==D~~  y 	JJLLLt99rA   c                h    t          | d          }d fd} ||           t          |          S )NrK   c                    | j         dvrdS |                                 rdS | j        rdS |                     d          sdS dS )N)	pspanr   h1h2h3h4h5h6FT)strip)namer   r   get_text)tags    r@   is_emptyz5remove_empty_tags_from_html_content.<locals>.is_empty?  sY    8SSS588:: 	59 	5||$|'' 	4urA   c                t    |                                  D ]!} |          r|                                 "d S r\   )rT   	decompose)r   r   r   s     r@   remove_empty_tagsz>remove_empty_tags_from_html_content.<locals>.remove_empty_tagsO  sB    ==?? 	  	 Cx}}  	  	 rA   )r   r   )r   r   r   r   s      @r@   r   r   <  sY    }55D            
 dt99rA   r(   r   r	   recursion_depthontology.OntologyElement | Nonec                   t          |           \  }}t          |           }| j        dk    rt          j        ddd|          S |t          j        k    rt          d | j        D                       p! |            j        t          j	        j
        k    }|o
t          k    }|rd}fd| j        D             }n8d                    d | j        D                                                       }g } |||||	          }	|	S )
a  
    Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive.
    First tries to recognize a class from Unstructured Ontology, then if class is matched tries
    to go deeper inside HTML tree. The recursive parsing is ended if the class is not recognized or
    there are no HTML Tags inside HTML - just text. Then it is parsed to
    Paragraph or UncategorizedText object.

    Args:
        soup (Tag): The BeautifulSoup Tag object to be converted.
        recursion_depth (int): Flag to control limit of recursion depth.
    Returns:
        OntologyElement: The converted OntologyElement object.
    brr   N)r$   css_class_namehtml_tag_namer{   c              3  @   K   | ]}t          |t                    V  d S r\   )r-   r	   rQ   contents     r@   r`   z1parse_html_to_ontology_element.<locals>.<genexpr>v  s,      FFW
7C((FFFFFFrA   c                   g | ]}t          |                                          #t          |t                    rt	          |d z             n3t          j        t          |                                                    S )r(   )r   )r$   )r   r   r-   r	   rP   r   rg   )rQ   r:   r   s     r@   rS   z2parse_html_to_ontology_element.<locals>.<listcomp>}  s     
 
 
 5zz!!
 eS))A.uoXYFYZZZZ'SZZ-=-=-?-?@@@	
 
 
rA   
c                P    g | ]#}t          |                                          $S rO   )r   r   r   s     r@   rS   z2parse_html_to_ontology_element.<locals>.<listcomp>  s*    LLL7#g,,,,..LLLrA   )r$   r4   r   r{   )'extract_tag_and_ontology_class_from_tagget_escaped_attributesr   r   rg   UncategorizedTextrm   contentsr)   r*   r+   r,   r4   joinr   )
r   r   ontology_html_tagontology_classescaped_attrshas_childrenshould_unwrap_htmlr$   r4   output_elements
    `        r@   rP   rP   Y  sY     )PPT(U(U%~*400MyD!"/	
 
 
 	
 
85	5 	GFFFFFFF	K>'8+C+JJ 
 &L/_*L 
 
 
 
 
 
 
 yyLLdmLLLMMSSUU#^'+	  N rA   *tuple[str, Type[ontology.OntologyElement]]c                    d\  }}| j                             d          r3| j        t          j        | j        | j         d         d         f          }}|sn| j                             d          rT| j         d         d         t          v r:t	          j        | j         d         d                   } |            j        d         }|s'| j        t          v r| j        t          | j                 }}|sd}t          j        }||fS )a  
    Extracts the HTML tag and corresponding ontology class
    from a BeautifulSoup Tag object. The CSS class is prioritized over
    the HTML tag. If not recognized soup.name and UnstructuredText is returned.

    Args:
        soup (Tag): The BeautifulSoup Tag object to extract information from.

    Returns:
        tuple: A tuple containing the HTML tag (str) and the ontology class (Type[OntologyElement]).
    )NNclassr   r   )	r   getr   r   r   allowed_tagsr   r   r   )r   rR   r<   s      r@   r   r     s    )Hm z~~g 
"&)-V-ZY
7+A./.
 .
- 3JNN7##3 Jw"&CCC59$*W:Ma:PQQ =??/2  ]TY*NNN"&)-QRVR[-\-  3  2]""rA   c                    i }| j                                         D ]X\  }}t          j        |          }d}|r6t	          |t
                    rd |D             }nt          j        |          }|||<   Y|S )z
    Escapes the attributes of a BeautifulSoup Tag object.

    Args:
        soup (Tag): The BeautifulSoup Tag object whose attributes need to be escaped.

    Returns:
        dict: A dictionary with escaped attribute names and values.
    Nc                6    g | ]}t          j        |          S rO   )htmlescape)rQ   vs     r@   rS   z*get_escaped_attributes.<locals>.<listcomp>  s      ? ? ?AQ ? ? ?rA   )r   itemsr   r   r-   list)r   r   keyvalueescaped_keyescaped_values         r@   r   r     s     Mj&&(( 3 3
Uk#&& 	3%&& 3 ? ? ? ? ? $E 2 2%2k""rA   )NNr   NT)r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r
   r   r   r   )rG   rI   rH   rI   r   r   )r   r   r   r   )rx   ry   r   r   )r   r   r   r   )r   r   r   r   )r(   )r   r	   r   r   r   r   )r   r	   r   r   )r   r	   )"
__future__r   r   collectionsr   	itertoolsr   typingr   r   bs4r   r	   unstructured.documentsr
   r   unstructured.documents.mappingsr   r   r   r   r,   r5   r6   rD   rV   rU   r   r   r   r   rP   r   r   rO   rA   r@   <module>r      s   " " " " " "  # # # # # #       ! ! ! ! ! ! ! ! " " " " " " " " 5 5 5 5 5 5 5 5            
 !W W W W Wt% % % %P   D   0   $0 0 0 0f   :      < '(8 8 8 8 8v-# -# -# -#`     rA   