
    Ng                        d Z ddlmZ ddlZddlmZmZmZmZ ddl	m
Z
 ddlmZ ddlmZ erddlmZ ddZ G d d          Z G d d          Z G d d          ZdS )zProvides operations related to the HTML table stored in `.metadata.text_as_html`.

Used during partitioning as well as chunking.
    )annotationsN)TYPE_CHECKINGIteratorSequencecast)etree)fragment_fromstring)lazyproperty)HtmlElementmatrixSequence[Sequence[str]]returnstrc                d    dfd}dd| r"d	d
                      ||                      dnd
S )a  Form an HTML table from "rows" and "columns" of `matrix`.

    Character overhead is minimized:
    - No whitespace padding is added for human readability
    - No newlines ("
") are added
    - No `<thead>`, `<tbody>`, or `<tfoot>` elements are used; we can't tell where those might be
      semantically appropriate anyway so at best they would consume unnecessary space and at worst
      would be misleading.
    rows_of_cell_strsr   r   Iterator[str]c              3  d   K   | D ])}|sdd                      |                     dV  *d S )Nz<tr> z</tr>join)r   row_cell_strsiter_tdss     Z/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/common/html_table.pyiter_trsz.htmlify_matrix_of_cell_texts.<locals>.iter_trs   s_      . 	A 	AM  @-!8!899@@@@@@@		A 	A    r   Sequence[str]c              3     K   | D ]q}t          j        |          }d                    |                    d                    }d                    |                                          }|rd| dndV  rd S )Nz<br/>
 z<td>z</td><td/>)htmlescaper   split)r   s	cell_texts      r   r   z.htmlify_matrix_of_cell_texts.<locals>.iter_tds&   s       	D 	DAAAQWWT]]++A++I-6C)))))GCCCC	D 	Dr   z<table>r   z</table>)r   r   r   r   )r   r   r   r   r   )r   r   r   s     @r   htmlify_matrix_of_cell_textsr&      st    A A A A A A	D 	D 	D 	D =CJ8RWWXXf--..8888Jr   c                  j    e Zd ZdZddZedd            Zedd	            ZddZ	edd            Z
dS )	HtmlTablezA `<table>` element.tabler   c                    || _         d S N)_table)selfr)   s     r   __init__zHtmlTable.__init__7   s    r   	html_textr   r   c                   t          |          }|                    d          }|st          d          |d         }|                    d          }|D ]}|                                 |                                D ]s}|j                                         |j        dk    rd|_        |j        r1d	                    |j        
                                          |_        |j        rd |_        t | |          S )Nz//tablez)`html_text` contains no `<table>` elementr   z.//thead | .//tbody | .//tfootthtdr   )r	   xpath
ValueErrordrop_tagiterattribcleartagtextr   r#   tail)clsr/   roottablesr)   noise_elementses          r   from_html_textzHtmlTable.from_html_text:   s    #9--I&& 	JHIIIq	 %EFF 	 	AJJLLLL  	 	A HNN u}} v 2!&,,..11 v s5zzr   c                B    t          j        | j        t                    S )a  The HTML-fragment for this `<table>` element, all on one line.

        Like: `<table><tr><td>foo</td></tr><tr><td>bar</td></tr></table>`

        The HTML contains no human-readability whitespace, attributes, or `<thead>`, `<tbody>`, or
        `<tfoot>` tags. It is made as compact as possible to maximize the semantic content in a
        given space. This is particularly important for chunking.
        encoding)r   tostringr,   r   r-   s    r   r!   zHtmlTable.html]   s     ~dkC8888r   Iterator[HtmlRow]c              #  z   K   d t          d| j                            d                    D             E d {V  d S )Nc              3  4   K   | ]}t          |          V  d S r+   )HtmlRow).0trs     r   	<genexpr>z&HtmlTable.iter_rows.<locals>.<genexpr>j   s(      __BGBKK______r   zlist[HtmlElement]z./tr)r   r,   r3   rF   s    r   	iter_rowszHtmlTable.iter_rowsi   sN      __$/BDKDUDUV\D]D]*^*^____________r   c                    d                     | j                                                  }d                     |                                          S )z-The clean, concatenated, text for this table.r   )r   r,   itertextr#   )r-   
table_texts     r   r:   zHtmlTable.textl   sA     XXdk224455
xx
((**+++r   N)r)   r   )r/   r   r   r(   r   r   )r   rG   )__name__
__module____qualname____doc__r.   classmethodrA   r
   r!   rN   r:    r   r   r(   r(   4   s                  [ D 	9 	9 	9 \	9` ` ` ` , , , \, , ,r   r(   c                  B    e Zd ZdZddZedd            Zdd	ZddZdS )rJ   zA `<tr>` element.rL   r   c                    || _         d S r+   )_tr)r-   rL   s     r   r.   zHtmlRow.__init__w       r   r   r   c                B    t          j        | j        t                    S )z*Like  "<tr><td>foo</td><td>bar</td></tr>".rC   )r   rE   r[   r   rF   s    r   r!   zHtmlRow.htmlz   s     ~dh5555r   Iterator[HtmlCell]c              #  @   K   | j         D ]}t          |          V  d S r+   )r[   HtmlCellr-   r2   s     r   
iter_cellszHtmlRow.iter_cells   s6      ( 	 	B2,,	 	r   r   c              #  h   K   | j         D ]'}|j        x}|                                }|s#|V  (dS )zGenerate contents of each cell of this row as a separate string.

        A cell that is empty or contains only whitespace does not generate a string.
        N)r[   r:   strip)r-   r2   r:   s      r   iter_cell_textszHtmlRow.iter_cell_texts   sU      
 ( 	 	B(::<<D JJJJ	 	r   N)rL   r   rR   )r   r^   )r   r   )	rS   rT   rU   rV   r.   r
   r!   rb   re   rX   r   r   rJ   rJ   t   sv            6 6 6 \6        r   rJ   c                  J    e Zd ZdZd
dZedd            Zedd            Zd	S )r`   zA `<td>` element.r2   r   c                    || _         d S r+   )_tdra   s     r   r.   zHtmlCell.__init__   r\   r   r   r   c                T    | j         r t          j        | j        t                    ndS )zLike  "<td>foo bar baz</td>".rC   r    )r:   r   rE   rh   r   rF   s    r   r!   zHtmlCell.html   s'     :>Ou~dh5555Or   c                J    | j         j        x}dS |                                S )z6Text inside `<td>` element, empty string when no text.Nr   )rh   r:   rd   )r-   r:   s     r   r:   zHtmlCell.text   s&     HM!D*2zz||r   N)r2   r   rR   )rS   rT   rU   rV   r.   r
   r!   r:   rX   r   r   r`   r`      sr            P P P \P    \  r   r`   )r   r   r   r   )rV   
__future__r   r!   typingr   r   r   r   lxmlr   	lxml.htmlr	   unstructured.utilsr
   r   r&   r(   rJ   r`   rX   r   r   <module>rp      s<   
 # " " " " "  : : : : : : : : : : : :       ) ) ) ) ) ) + + + + + + &%%%%%%K K K K@=, =, =, =, =, =, =, =,@       :         r   