# pyright: reportPrivateUsage=false

"""Provides `partition_html()."""

from __future__ import annotations

from typing import IO, Any, Iterator, List, Literal, Optional, cast

import requests
from lxml import etree

from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.html.parser import Flow, html_parser
from unstructured.partition.html.transformations import (
    ontology_to_unstructured_elements,
    parse_html_to_ontology,
)
from unstructured.utils import is_temp_file_path, lazyproperty


@apply_metadata(FileType.HTML)
@add_chunking_strategy
def partition_html(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    text: Optional[str] = None,
    encoding: Optional[str] = None,
    url: Optional[str] = None,
    headers: dict[str, str] = {},
    ssl_verify: bool = True,
    skip_headers_and_footers: bool = False,
    detection_origin: Optional[str] = None,
    html_parser_version: Literal["v1", "v2"] = "v1",
    image_alt_mode: Optional[Literal["to_text"]] = "to_text",
    **kwargs: Any,
) -> list[Element]:
    """Partitions an HTML document into its constituent elements.

    HTML source parameters
    ----------------------
    The HTML to be partitioned can be specified four different ways:

    filename
        A string defining the target filename path.
    file
        A file-like object using "r" mode --> open(filename, "r").
    text
        The string representation of the HTML document.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
    headers
        The HTTP headers to be used in the HTTP request when `url` is specified.
    ssl_verify
        If the URL parameter is set, determines whether or not SSL verification is performed
        on the HTTP request.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    skip_headers_and_footers
        If True, ignores any content that is within <header> or <footer> tags

    html_parser_version (Literal['v1', 'v2']):
        The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
        use the ontology schema to parse the HTML document.

    image_alt_mode (Literal['to_text']):
        When set 'to_text', the v2 parser will include the alternative text of images in the output.
    """
    # -- parser rejects an empty str, nip that edge-case in the bud here --
    if text is not None and text.strip() == "" and not file and not filename and not url:
        return []

    opts = HtmlPartitionerOptions(
        file_path=filename,
        file=file,
        text=text,
        encoding=encoding,
        url=url,
        headers=headers,
        ssl_verify=ssl_verify,
        skip_headers_and_footers=skip_headers_and_footers,
        detection_origin=detection_origin,
        html_parser_version=html_parser_version,
        image_alt_mode=image_alt_mode,
    )

    return list(_HtmlPartitioner.iter_elements(opts))


class HtmlPartitionerOptions:
    """Encapsulates partitioning option validation, computation, and application of defaults."""

    def __init__(
        self,
        *,
        file_path: str | None,
        file: IO[bytes] | None,
        text: str | None,
        encoding: str | None,
        url: str | None,
        headers: dict[str, str],
        ssl_verify: bool,
        skip_headers_and_footers: bool,
        detection_origin: str | None,
        html_parser_version: Literal["v1", "v2"] = "v1",
        image_alt_mode: Optional[Literal["to_text"]] = "to_text",
    ):
        self._file_path = file_path
        self._file = file
        self._text = text
        self._encoding = encoding
        self._url = url
        self._headers = headers
        self._ssl_verify = ssl_verify
        self._skip_headers_and_footers = skip_headers_and_footers
        self._detection_origin = detection_origin
        self._html_parser_version = html_parser_version
        self._image_alt_mode = image_alt_mode

    @lazyproperty
    def detection_origin(self) -> str | None:
        """Trace of initial partitioner to be included in metadata for debugging purposes."""
        return self._detection_origin

    @lazyproperty
    def encoding(self) -> str | None:
        """Caller-provided encoding used to store HTML character stream as bytes.

        `None` when no encoding was provided and encoding should be auto-detected.
        """
        return self._encoding

    @lazyproperty
    def html_text(self) -> str:
        """The HTML document as a string, loaded from wherever the caller specified."""
        if self._file_path:
            return read_txt_file(filename=self._file_path, encoding=self._encoding)[1]

        if self._file:
            return read_txt_file(file=self._file, encoding=self._encoding)[1]

        if self._text:
            return str(self._text)

        if self._url:
            response = requests.get(self._url, headers=self._headers, verify=self._ssl_verify)
            if not response.ok:
                raise ValueError(
                    f"Error status code on GET of provided URL: {response.status_code}"
                )
            content_type = response.headers.get("Content-Type", "")
            if not content_type.startswith("text/html"):
                raise ValueError(f"Expected content type text/html. Got {content_type}.")

            return response.text

        raise ValueError("Exactly one of filename, file, text, or url must be specified.")

    @lazyproperty
    def last_modified(self) -> str | None:
        """The best last-modified date available, None if no sources are available."""
        return (
            None
            if not self._file_path or is_temp_file_path(self._file_path)
            else get_last_modified_date(self._file_path)
        )

    @lazyproperty
    def skip_headers_and_footers(self) -> bool:
        """When True, elements located within a header or footer are pruned."""
        return self._skip_headers_and_footers

    @lazyproperty
    def html_parser_version(self) -> Literal["v1", "v2"]:
        """When html_parser_version=='v2', HTML elements follow ontology schema."""
        return self._html_parser_version

    @lazyproperty
    def add_img_alt_text(self) -> bool:
        """When True, the alternative text of images is included in the output."""
        return self._image_alt_mode == "to_text"


class _HtmlPartitioner:
    """Partition HTML document into document-elements."""

    def __init__(self, opts: HtmlPartitionerOptions):
        self._opts = opts

    @classmethod
    def iter_elements(cls, opts: HtmlPartitionerOptions) -> Iterator[Element]:
        """Partition HTML document provided by `opts` into document-elements."""
        yield from cls(opts)._iter_elements()

    def _iter_elements(self) -> Iterator[Element]:
        """Generated document-elements (e.g. Title, NarrativeText, etc.) parsed from document.

        Elements appear in document order.
        """
        elements_iter = (
            self._main.iter_elements()
            if self._opts.html_parser_version == "v1"
            else self._from_ontology
        )

        for e in elements_iter:
            e.metadata.last_modified = self._opts.last_modified
            e.metadata.detection_origin = self._opts.detection_origin
            yield e

    @lazyproperty
    def _main(self) -> Flow:
        """The root HTML element."""
        # NOTE(scanny) - get `html_text` first so any encoding error raised is not confused with a
        # recoverable parsing error.
        html_text = self._opts.html_text

        # NOTE(scanny) - `lxml` will not parse a `str` that includes an XML encoding declaration
        # and will raise the following error:
        #     ValueError: Unicode strings with encoding declaration are not supported. ...
        # This is not valid HTML (would be in XHTML), but Chrome accepts it so we work around it
        # by UTF-8 encoding the str bytes and parsing those.
        try:
            root = etree.fromstring(html_text, html_parser)
        except ValueError:
            root = etree.fromstring(html_text.encode("utf-8"), html_parser)

        # -- remove a variety of HTML element types like <script> and <style> that we prefer not
        # -- to encounter while parsing.
        etree.strip_elements(
            root, ["del", "img", "link", "meta", "noscript", "script", "style"], with_tail=False
        )

        # -- remove <header> and <footer> tags if the caller doesn't want their contents --
        if self._opts.skip_headers_and_footers:
            etree.strip_elements(root, ["header", "footer"], with_tail=False)

        # -- jump to the core content if the document indicates where it is --
        if (main := root.find(".//main")) is not None:
            return cast(Flow, main)
        if (body := root.find(".//body")) is not None:
            return cast(Flow, body)
        return cast(Flow, root)

    @lazyproperty
    def _from_ontology(self) -> List[Element]:
        """Convert an ontology elements represented in HTML to an ontology element."""
        html_text = self._opts.html_text
        ontology = parse_html_to_ontology(html_text)
        unstructured_elements = ontology_to_unstructured_elements(
            ontology, add_img_alt_text=self._opts.add_img_alt_text
        )
        return unstructured_elements
