from __future__ import annotations

import copy
import re
from typing import IO, Any, Callable, Literal

from unstructured.chunking import add_chunking_strategy
from unstructured.cleaners.core import (
    auto_paragraph_grouper,
    clean_bullets,
)
from unstructured.documents.coordinates import CoordinateSystem
from unstructured.documents.elements import (
    Address,
    Element,
    ElementMetadata,
    EmailAddress,
    Footer,
    Header,
    ListItem,
    NarrativeText,
    Text,
    Title,
)
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.model import FileType
from unstructured.nlp.patterns import PARAGRAPH_PATTERN, UNICODE_BULLETS_RE
from unstructured.partition.common.common import exactly_one
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.text_type import (
    is_bulleted_text,
    is_email_address,
    is_possible_narrative_text,
    is_possible_numbered_list,
    is_possible_title,
    is_us_city_state_zip,
)


@apply_metadata(FileType.TXT)
@add_chunking_strategy
def partition_text(
    filename: str | None = None,
    *,
    file: IO[bytes] | None = None,
    encoding: str | None = None,
    text: str | None = None,
    paragraph_grouper: Callable[[str], str] | Literal[False] | None = None,
    detection_origin: str | None = "text",
    **kwargs: Any,
) -> list[Element]:
    """Partition a plain-text document into its constituent paragraph elements.

    The text is optionally run through a paragraph grouper, split into paragraphs,
    and each non-empty paragraph is classified into an element by `element_from_text`.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    encoding
        The encoding passed to `read_txt_file` when the input is drawn from `filename`
        or `file`. Not used when `text` is provided.
    text
        The string representation of the .txt document.
    paragraph_grouper
        A str -> str function for fixing paragraphs that are interrupted by line breaks
        for formatting purposes. `None` selects `auto_paragraph_grouper`; `False` disables
        grouping entirely.
    detection_origin
        Value recorded as `detection_origin` on the metadata of every produced element.
    **kwargs
        Not read by this function body. NOTE(review): presumably consumed by the
        decorators -- confirm.

    Returns
    -------
    list[Element]
        One element per non-empty paragraph, each carrying its own deep copy of the shared
        metadata. An empty list when `text` is blank and no `file`/`filename` is given.
    """
    # -- blank `text` with no other source short-circuits before argument validation --
    if text is not None and text.strip() == "" and not (file or filename):
        return []

    # -- Verify that only one of the arguments was provided --
    exactly_one(filename=filename, file=file, text=text)

    # -- obtain the raw document text from whichever source was supplied; the encoding
    # -- reported back by `read_txt_file` is not needed here --
    if filename is not None:
        _, raw_text = read_txt_file(filename=filename, encoding=encoding)
    elif file is not None:
        _, raw_text = read_txt_file(file=file, encoding=encoding)
    else:
        raw_text = "" if text is None else str(text)

    # -- `None` means "use the default grouper"; `False` means "leave the text untouched" --
    if paragraph_grouper is None:
        raw_text = auto_paragraph_grouper(raw_text)
    elif paragraph_grouper is not False:
        raw_text = paragraph_grouper(raw_text)

    paragraphs = _split_by_paragraph(raw_text)

    # -- metadata shared by all elements; each element receives its own deep copy below --
    last_modified = get_last_modified_date(filename) if filename else None
    base_metadata = ElementMetadata(last_modified=last_modified)
    base_metadata.detection_origin = detection_origin

    elements: list[Element] = []
    for paragraph in paragraphs:
        stripped = paragraph.strip()
        # -- skip blank paragraphs and bullets that carry no text --
        if not stripped or _is_empty_bullet(stripped):
            continue

        element = element_from_text(stripped)
        element.metadata = copy.deepcopy(base_metadata)
        elements.append(element)

    return elements


def element_from_text(
    text: str,
    coordinates: tuple[tuple[float, float], ...] | None = None,
    coordinate_system: CoordinateSystem | None = None,
) -> Element:
    """Classify `text` and wrap it in the matching element type.

    The checks run in a fixed order and the first match wins: header position, footer
    position, bulleted text, e-mail address, US city/state/zip, numbered list, narrative
    text, title. Text matching none of these becomes a plain `Text` element.

    Parameters
    ----------
    text
        The text of the element. For bulleted text the bullet is removed with
        `clean_bullets` before the `ListItem` is created.
    coordinates
        Optional points locating the text; used for the header/footer position checks and
        passed on to the created element.
    coordinate_system
        Optional coordinate system the `coordinates` are expressed in; passed on to the
        created element.

    Returns
    -------
    Element
        The element built from `text`. Every element type, including `EmailAddress`,
        receives `coordinates` and `coordinate_system` so position information is not
        dropped for e-mail addresses.
    """
    # -- Pick the element class first, then construct once so that every element type is
    # -- built with the same arguments. --
    element_cls: type[Text]
    if _is_in_header_position(coordinates, coordinate_system):
        element_cls = Header
    elif _is_in_footer_position(coordinates, coordinate_system):
        element_cls = Footer
    elif is_bulleted_text(text):
        element_cls = ListItem
        # -- the bullet character itself is not part of the list-item text --
        text = clean_bullets(text)
    elif is_email_address(text):
        element_cls = EmailAddress
    elif is_us_city_state_zip(text):
        element_cls = Address
    elif is_possible_numbered_list(text):
        element_cls = ListItem
    elif is_possible_narrative_text(text):
        element_cls = NarrativeText
    elif is_possible_title(text):
        element_cls = Title
    else:
        element_cls = Text

    return element_cls(
        text=text,
        coordinates=coordinates,
        coordinate_system=coordinate_system,
    )


# ================================================================================================
# HELPER FUNCTIONS
# ================================================================================================


def _get_height_percentage(
    coordinates: tuple[tuple[float, float], ...],
    coordinate_system: CoordinateSystem,
) -> float:
    avg_y = sum(coordinate[1] for coordinate in coordinates) / len(coordinates)
    return avg_y / coordinate_system.height


def _is_empty_bullet(text: str) -> bool:
    """True when `text` is a single character that matches the unicode-bullet pattern."""
    if len(text) != 1:
        return False
    return UNICODE_BULLETS_RE.match(text) is not None


def _is_in_footer_position(
    coordinates: tuple[tuple[float, float], ...] | None,
    coordinate_system: CoordinateSystem | None,
    threshold: float = 0.93,
) -> bool:
    """Checks to see if the position of the text indicates that the text belongs
    to a footer."""
    if coordinates is None or coordinate_system is None:
        return False

    height_percentage = _get_height_percentage(coordinates, coordinate_system)
    return height_percentage > threshold


def _is_in_header_position(
    coordinates: tuple[tuple[float, float], ...] | None,
    coordinate_system: CoordinateSystem | None,
    threshold: float = 0.07,
) -> bool:
    """Checks to see if the position of the text indicates that the text belongs to a header."""
    if coordinates is None or coordinate_system is None:
        return False

    height_percentage = _get_height_percentage(coordinates, coordinate_system)
    return height_percentage < threshold


def _split_by_paragraph(file_text: str) -> list[str]:
    """Break `file_text` into paragraphs.

    Leading and trailing whitespace is stripped from the whole text first, then the text
    is split wherever `PARAGRAPH_PATTERN` matches.
    """
    trimmed_text = file_text.strip()
    paragraphs = re.split(PARAGRAPH_PATTERN, trimmed_text)
    return paragraphs