
    Ng%                        d dl mZ d dlmZmZmZ d dlmZ d dlm	Z	 ddddZ
dd	Zd dZd!dZd dZd"dZd#dZd$dZ	 d%d&dZd'dZd(dZdS ))    )annotations)AnyDictList)BeautifulSoup)cells_to_html 	row_index	col_indexcontentcellsList[Dict[str, Any]]c                b  	 t          | d           }t                      }|D ]}|d         dk    s|d         dk    r}t          |d         |d         |d         z             D ]W}t          |d         |d         |d         z             D ]/}||f|d         |d         fk    r|                    ||f           0X|d         |d         f|v r^|d         |d         c		fd|D             }|D ]}|dxx         dz  cc<   |                    	f           |d         |d         f|v ^
|S )	a  Move cells to the right if spanned cells have an influence on the rendering.

    Args:
        cells: List of cells in the table in Deckerd format.

    Returns:
        List of cells in the table in Deckerd format with cells moved to the right if spanned.
    c                "    | d         | d         fS )Nyx )r   s    g/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/metrics/table/table_extraction.py<lambda>z/_move_cells_for_spanned_cells.<locals>.<lambda>   s    ##/?     keyw   hr   r   c                D    g | ]}|d          k    |d         k    |S )r   r   r   ).0ccell_xcell_ys     r   
<listcomp>z1_move_cells_for_spanned_cells.<locals>.<listcomp>"   s9    !c!c!cQsVv=M=MRSTWRX\bRbRb!RbRbRbr   )sortedsetrangeaddremove)
r   sorted_cellscells_occupied_by_spannedcellijcells_to_the_rightcell_to_mover    r!   s
           @@r   _move_cells_for_spanned_cellsr/      s    %%?%?@@@L # ? ?9q==DIMM49d3i$s)&;<< > >tCy$s)d3i*?@@ > >A1v$s)T#Y!777155q!f===> Cy$s)$(AAA!#YS	NFF!c!c!c!c!c\!c!c!c 2 ' 'S!!!Q&!!!!%,,ff-=>>> Cy$s)$(AAA r   r   strreturnc                   t          | d          }|                    d          }|                    dg          }g }t          |          D ]\  }}|                    ddg          }t          |          D ]s\  }}	||t	          |	j                            dd                    t	          |	j                            dd                    |	j        d	}
|                    |
           tt          |          S )
zConvert html format to Deckerd table structure.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table.
    zhtml.parsertabletrthtdcolspanr   rowspan)r   r   r   r   r   )
r   findfindAll	enumerateintattrsgettextappendr/   )r   soupr3   rows
table_datar+   rowr   r,   	cell_datar*   s              r   html_table_to_deckerdrF   )   s    -00DIIgE==$  DJD// 
$ 
$3T4L))%e,, 	$ 	$LAy,,Y::;;,,Y::;;$> D d####	$ )444r   c           
        g }d | D             }t          d |D                       }t          t          |                    }| D ]}t          t          |d         |d         |d         z                       t          t          |d         |d         |d         z                       |d         |d         |d         |d         |v d}|                    |           t          |          }|S )	zConvert Deckerd table structure to html format.

    Args:
        cells: List of dictionaries where each dictionary represents a cell in the table.

    Returns:
        A string with the html content of the table.
    c                *    g | ]}|d          dk    |S )r   r   r   r   r*   s     r   r"   z)deckerd_table_to_html.<locals>.<listcomp>R   s!    @@@catr   c              3  &   K   | ]}|d          V  dS )r   Nr   rI   s     r   	<genexpr>z(deckerd_table_to_html.<locals>.<genexpr>S   s&      >>dS	>>>>>>r   r   r   r   r   r   )row_numscolumn_numsr   r   z	cell textzcolumn header)maxr$   r%   listr@   r   )r   transformer_cellsfirst_row_cellsheader_lengthheader_rowsr*   rE   r3   s           r   deckerd_table_to_htmlrT   F   s     A@@@@O>>o>>>>>MeM**++K 	, 	,U49d3i$s).CDDEEd3icT#Y1F G GHHcci!#Y+5
 
	 	  +++++,,ELr   c                >    t          |           }t          |          S )a7  Convert html format to table structure. As a middle step it converts
    html to the Deckerd format as it's more convenient to work with.

    Args:
        content: The html content with a table to extract.

    Returns:
        A list of dictionaries where each dictionary represents a cell in the table.
    )rF   _convert_table_from_deckerd)r   deckerd_cellss     r   _convert_table_from_htmlrX   d   s     *'22M&}555r   c                    g }| D ]Z}	 |d         |d         |d         d}n)# t           $ r
 t          }Y nt          $ r
 t          }Y nw xY w|                    |           [|S )zConvert deckerd format to table structure.

    Args:
      content: The deckerd formatted content with a table to extract.

    Returns:
      A list of dictionaries where each dictionary represents a cell in the table.
    r   r   r   r
   )KeyError
EMPTY_CELL	TypeErrorr@   )r   rC   r3   rE   s       r   rV   rV   r   s     J % %		#"3Z"3Z + II
  	# 	# 	#"III 	# 	# 	#"III	#)$$$$s   !AAArC   List[List[Dict[str, Any]]]c                &    t          | d           S )Nc                "    | d         | d         fS )Nr   r   r   )r*   s    r   r   z#_sort_table_cells.<locals>.<lambda>   s    [0A4CT/U r   r   )r#   )rC   s    r   _sort_table_cellsr`      s    *"U"UVVVVr   file_elementsc                *   g }| D ]}d|v r|d         dk    r{d|v rw	 t          |d                   }|                    t          |                     O# t          $ r1}t	          d|            |                    i            Y d}~d}~ww xY w|S )a  Extracts and converts tables data to a structured format based on the specified table type.

    Args:
        file_elements: List of elements from the ground truth file.

    Returns:
        A list of tables with each table represented as a list of cell data dictionaries.

    typeTabler?   z$Error converting ground truth data: N)rV   r@   r`   	Exceptionprint)ra   ground_truth_table_dataelementconverted_dataes        r   ,extract_and_convert_tables_from_ground_truthrk      s     !  	3 	3WG!;!;'@Q@Q3!<FO" " (../@/P/PQQQQ 3 3 3@Q@@AAA'..r222222223 #"s   7A
B'B

Bhtmlsource_typec                F   t           t          d}||vrt          d| d          ||         }|dk    rt          nt           }g }| D ]Y}|                    d          dk    r> ||          }|s ||          }|r$t	          |          }|                    |           Z|S )ap  Extracts and converts table data to a structured format

    Args:
      file_elements: List of elements from the file.
      source_type: 'cells' or 'html'. 'cells' refers to reading 'table_as_cells' field while
        'html' is extracted from 'text_as_html'

    Returns:
      A list of tables with each table represented as a list of cell data dictionaries.

    )rl   r   zsource_type z: is not valid. Allowed source_types are "html" and "cells"r   rc   rd   )extract_cells_from_text_as_html!extract_cells_from_table_as_cells
ValueErrorr>   r`   r@   )	ra   rm   $source_type_to_extraction_strategiesextract_cells_fnfallback_extract_cells_fnpredicted_table_datarh   extracted_cellsr(   s	            r   *extract_and_convert_tables_from_predictionrw      s     02, ,( >>>b;bbb
 
 	
 <KH '!! 	*),    : :;;v'))..w77O" E";";G"D"D :0AA$++L999r   rh   Dict[str, Any]List[Dict[str, Any]] | Nonec                    | d                              d          }|rd|vrdS d}	 t          |          }n)# t          $ r}t          d|            Y d}~nd}~ww xY w|S )a  Extracts and parse cells from "text_as_html" field in Element structure

    Args:
        element: Example element:
        {
            "type": "Table",
            "metadata": {
                "text_as_html": "<table>
                                    <thead>
                                        <tr>
                                            <th>Month A.</th>
                                        </tr>
                                    </thead>
                                    </tbody>
                                        <tr>
                                            <td>22</td><
                                        </tr>
                                    </tbody>
                                </table>"
            }
        }

    Returns:
        List of extracted cells in a format:
        [
            {
                "row_index": 0,
                "col_index": 0,
                "content": "Month A.",
            },
            ...,
        ]
    metadatatext_as_htmlz<table>Nz*Error converting Unstructured table data: )r>   rX   re   rf   )rh   valpredicted_cellsrj   s       r   ro   ro      s    D *

!
!.
1
1C )3&&tO@2377 @ @ @>1>>????????@ s   7 
AAAc                b    | d                              d          }d}|rt          |          }|S )aj  Extracts and parse cells from "table_as_cells" field in Element structure

    Args:
        element: Example element:
        {
            "type": "Table",
            "metadata": {
                "table_as_cells": [{"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                                   {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"}]
            }
        }

    Returns:
        List of extracted cells in a format:
        [
            {
                "row_index": 0,
                "col_index": 0,
                "content": "Month A.",
            },
            ...,
        ]
    r{   table_as_cellsN)r>   rV   )rh   r~   converted_cellss      r   rp   rp     s>    0 j)--.>??OO G5oFFr   N)r   r   )r   r0   r1   r   )r   r   r1   r0   )r   r   r1   r   )rC   r]   r1   r]   )ra   r   r1   r]   )rl   )ra   r   rm   r0   r1   r]   )rh   rx   r1   ry   )rh   rx   r1   r   )
__future__r   typingr   r   r   bs4r   $unstructured_inference.models.tablesr   r[   r/   rF   rT   rX   rV   r`   rk   rw   ro   rp   r   r   r   <module>r      sS   " " " " " " " " " " " " " " " "       > > > > > >  
   45 5 5 5:   <6 6 6 6   2W W W W# # # #8 =C(  (  (  (  ( V, , , ,^     r   