"""Provides operations related to the HTML table stored in `.metadata.text_as_html`.

Used during partitioning as well as chunking.
"""

from __future__ import annotations

import html
from typing import TYPE_CHECKING, Iterator, Sequence, cast

from lxml import etree
from lxml.html import fragment_fromstring

from unstructured.utils import lazyproperty

if TYPE_CHECKING:
    from lxml.html import HtmlElement


def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
    """Render `matrix`, a sequence of rows of cell strings, as a compact HTML table.

    The markup is kept as small as possible:
    - nothing is added purely for readability, neither padding whitespace nor line breaks
    - rows are direct children of `<table>`; `<thead>`, `<tbody>`, and `<tfoot>` are never emitted
      since there is no way to know where they would be semantically correct.

    A row with no cells produces no `<tr>`. A cell whose text is empty after whitespace
    normalization is emitted as a void `<td/>`. An empty `matrix` produces the empty string rather
    than an empty table.
    """

    def td_markup(cell_str: str) -> str:
        # -- escape markup-significant characters such as '<', '>' and '&' --
        escaped = html.escape(cell_str)
        # -- a line-feed in the cell text becomes a `<br/>` element --
        with_breaks = escaped.replace("\n", "<br/>")
        # -- collapse each remaining run of whitespace to one space and trim both ends --
        normalized = " ".join(with_breaks.split())
        if not normalized:
            return "<td/>"
        return f"<td>{normalized}</td>"

    def tr_markup(cell_strs: Sequence[str]) -> str:
        return "<tr>" + "".join(map(td_markup, cell_strs)) + "</tr>"

    if not matrix:
        return ""

    # -- rows having no cells are skipped entirely --
    trs = [tr_markup(row) for row in matrix if row]
    return "<table>" + "".join(trs) + "</table>"


class HtmlTable:
    """A `<table>` element.

    Wraps an lxml `<table>` element and provides its serialized HTML, its rows, and its
    concatenated text. Use `.from_html_text()` to get an instance whose element has been normalized
    and compacted.
    """

    def __init__(self, table: HtmlElement):
        self._table = table

    @classmethod
    def from_html_text(cls, html_text: str) -> HtmlTable:
        """Construct an `HtmlTable` from the first `<table>` element found in `html_text`.

        The table element is normalized in place before it is wrapped:
        - comments and processing-instructions are removed
        - `<thead>`, `<tbody>`, and `<tfoot>` tags are dropped
        - all attributes are stripped
        - `<th>` elements become `<td>`
        - whitespace in element text is collapsed to single spaces
        - all tail text is discarded; this includes any text that follows an element nested
          inside a cell, such as the text after a `<br/>`.

        Raises `ValueError` when `html_text` contains no `<table>` element.
        """
        # -- root is always a `<table>` element so far but let's be robust --
        root = fragment_fromstring(html_text)
        tables = root.xpath("//table")
        if not tables:
            raise ValueError("`html_text` contains no `<table>` element")
        table = tables[0]

        # -- remove comments and processing-instructions; they carry no table content and only
        # -- consume space. They must go before the normalization loop below because
        # -- `table.iter()` yields these nodes too.
        # -- NOTE(review): lxml is understood to give these nodes a read-only `attrib`, in which
        # -- case `attrib.clear()` would raise on them -- confirm against the lxml API reference.
        for node in table.xpath(".//comment() | .//processing-instruction()"):
            # -- tails are discarded for every node (see below), so drop this one explicitly --
            node.tail = None
            node.getparent().remove(node)

        # -- remove `<thead>`, `<tbody>`, and `<tfoot>` noise elements when present --
        noise_elements = table.xpath(".//thead | .//tbody | .//tfoot")
        for e in noise_elements:
            e.drop_tag()

        # -- normalize and compactify the HTML --
        for e in table.iter():
            # -- Strip all attributes from elements, like border="1", class="dataframe" added
            # -- by pandas.DataFrame.to_html(), style="text-align: right;", etc.
            e.attrib.clear()

            # -- change any `<th>` elements to `<td>` so all cells have the same tag --
            if e.tag == "th":
                e.tag = "td"

            # -- normalize whitespace in element text; this removes indent whitespace before nested
            # -- elements and reduces whitespace between words to a single space.
            if e.text:
                e.text = " ".join(e.text.split())

            # -- remove all tails, those are newline + indent if anything --
            if e.tail:
                e.tail = None

        return cls(table)

    @lazyproperty
    def html(self) -> str:
        """The HTML-fragment for this `<table>` element, all on one line.

        Like: `<table><tr><td>foo</td></tr><tr><td>bar</td></tr></table>`

        The HTML contains no human-readability whitespace, attributes, or `<thead>`, `<tbody>`, or
        `<tfoot>` tags. It is made as compact as possible to maximize the semantic content in a
        given space. This is particularly important for chunking.
        """
        return etree.tostring(self._table, encoding=str)

    def iter_rows(self) -> Iterator[HtmlRow]:
        """Generate an `HtmlRow` for each `<tr>` that is a direct child of this table."""
        yield from (HtmlRow(tr) for tr in cast("list[HtmlElement]", self._table.xpath("./tr")))

    @lazyproperty
    def text(self) -> str:
        """The clean, concatenated, text for this table."""
        table_text = " ".join(self._table.itertext())
        # -- blank cells will introduce extra whitespace, so normalize after accumulating --
        return " ".join(table_text.split())


class HtmlRow:
    """A `<tr>` element.

    Provides the row's serialized HTML and access to its cells, either as `HtmlCell` objects or as
    bare text strings.
    """

    def __init__(self, tr: HtmlElement):
        self._tr = tr

    @lazyproperty
    def html(self) -> str:
        """Serialized HTML for this row, like "<tr><td>foo</td><td>bar</td></tr>"."""
        row_html = etree.tostring(self._tr, encoding=str)
        return row_html

    def iter_cells(self) -> Iterator[HtmlCell]:
        """Generate an `HtmlCell` for each child of this `<tr>`, in document order."""
        yield from (HtmlCell(cell_elm) for cell_elm in self._tr)

    def iter_cell_texts(self) -> Iterator[str]:
        """Generate the stripped text of each cell in this row that has some.

        Nothing is generated for a cell that has no text or whose text is only whitespace, so
        fewer strings than cells may be produced.
        """
        for cell_elm in self._tr:
            # -- `.text` is `None` for an element with no text; treat that like empty text --
            stripped = (cell_elm.text or "").strip()
            if stripped:
                yield stripped


class HtmlCell:
    """A `<td>` element.

    Provides the cell's serialized HTML and its stripped text.
    """

    def __init__(self, td: HtmlElement):
        self._td = td

    @lazyproperty
    def html(self) -> str:
        """Serialized HTML for this cell, like "<td>foo bar baz</td>".

        A cell for which `.text` is the empty string is rendered as the void element "<td/>".
        """
        if not self.text:
            return "<td/>"
        return etree.tostring(self._td, encoding=str)

    @lazyproperty
    def text(self) -> str:
        """Text of the `<td>` element with surrounding whitespace removed.

        The empty string when the element has no text.
        """
        raw_text = self._td.text
        return "" if raw_text is None else raw_text.strip()
