from __future__ import annotations

from typing import IO, Any, Optional

from unstructured.documents.elements import Element
from unstructured.file_utils.file_conversion import convert_file_to_html_text_using_pandoc
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import get_last_modified_date
from unstructured.partition.html import partition_html

DETECTION_ORIGIN: str = "epub"


def partition_epub(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    metadata_filename: Optional[str] = None,
    metadata_last_modified: Optional[str] = None,
    languages: Optional[list[str]] = ["auto"],
    detect_language_per_element: bool = False,
    **kwargs: Any,
) -> list[Element]:
    """Partitions an EPUB document. The document is first converted to HTML and then
    partitioned using partition_html.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    metadata_last_modified
        The last modified date for the document.
    languages
        User defined value for `metadata.languages` if provided. Otherwise language is detected
        using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be
        in either language.
        Additional Parameters:
            detect_language_per_element
                Detect language per element instead of at the document level.
    """
    exactly_one(filename=filename, file=file)

    last_modified = get_last_modified_date(filename) if filename else None

    html_text = convert_file_to_html_text_using_pandoc(
        source_format="epub", filename=filename, file=file
    )

    return partition_html(
        text=html_text,
        encoding="unicode",
        metadata_filename=metadata_filename or filename,
        metadata_file_type=FileType.EPUB,
        metadata_last_modified=metadata_last_modified or last_modified,
        languages=languages,
        detect_language_per_element=detect_language_per_element,
        detection_origin=DETECTION_ORIGIN,
        **kwargs,
    )
