import logging
import os
import re
import statistics
from pathlib import Path
from typing import List, Optional, Union

import click
import pandas as pd

from unstructured.staging.base import elements_from_json, elements_to_text

# Module-level logger; helpers in this file report read failures through it.
logger = logging.getLogger("unstructured.eval")


def _prepare_output_cct(docpath: str, output_type: str) -> str:
    """
    Convert the document at `docpath` into cct-ready text.

    Only two input kinds are supported:
    - `json`: the file is loaded with `elements_from_json` and flattened with
      `elements_to_text`.
    - `txt`: the file content is returned as read by `_read_text_file`.

    Args:
        docpath (str): path of the document to convert.
        output_type (str): either `json` or `txt`.

    Raises:
        ValueError: if `output_type` is neither `json` nor `txt`, or if one of the
            readers raises a ValueError. The failure is logged before re-raising.
    """
    try:
        if output_type == "json":
            output_cct = elements_to_text(elements_from_json(docpath))
        elif output_type == "txt":
            output_cct = _read_text_file(docpath)
        else:
            # Adjacent literals instead of a backslash continuation, which used to
            # embed the next line's indentation inside the message.
            raise ValueError(
                "File type not supported. Expects one of `json` or `txt`, "
                f"but received {output_type} instead."
            )
    except ValueError:
        logger.error(f"Could not read the file {docpath}")
        # Bare raise keeps the original traceback untouched.
        raise
    return output_cct


def _listdir_recursive(dir: str) -> List[str]:
    """
    Walk `dir` and every directory below it, collecting all file names.

    Each entry is expressed relative to `dir`: files directly inside `dir` are
    returned as bare names, files in subdirectories carry their relative
    directory as a prefix.
    """
    collected: List[str] = []
    for current_dir, _subdirs, names in os.walk(dir):
        # Path of the directory being visited, relative to the starting point
        prefix = os.path.relpath(current_dir, dir)
        if prefix == ".":
            collected.extend(names)
        else:
            collected.extend(os.path.join(prefix, name) for name in names)
    return collected


def _rename_aggregated_columns(df):
    """
    Strip the leading underscore from the aggregate column labels.

    Parameters:
    df (pandas.DataFrame): frame whose columns may include `_mean`, `_stdev`,
        `_pstdev` and `_count`.

    Returns:
    pandas.DataFrame: the frame returned by `DataFrame.rename`, with those
        columns relabelled; any other column is left as is.
    """
    aggregated_to_public = {
        "_mean": "mean",
        "_stdev": "stdev",
        "_pstdev": "pstdev",
        "_count": "count",
    }
    renamed = df.rename(columns=aggregated_to_public)
    return renamed


def _format_grouping_output(*df):
    """
    Place the given pandas DataFrame objects next to each other (column-wise
    concatenation) and turn the resulting index back into a regular column.
    """
    side_by_side = pd.concat(df, axis=1)
    return side_by_side.reset_index()


def _display(df):
    """
    Displays the evaluation metrics in a formatted text table.

    Prints a header line, a dashed separator and one line per row through
    `click.echo`. Float cells are rendered with three decimals, every other
    cell with `str()`. Nothing is printed for an empty frame.
    """
    if len(df) == 0:
        return
    headers = df.columns.tolist()
    # Render every cell first so the widths below measure exactly what gets
    # printed. Measuring `str(item)` instead lets a float such as 0.5 (3 chars)
    # overflow its column once it is printed as "0.500".
    rows = [
        [f"{item:.3f}" if isinstance(item, float) else str(item) for item in row]
        for _, row in df.iterrows()
    ]
    col_widths = [
        max(len(header), max(len(cells[i]) for cells in rows))
        for i, header in enumerate(headers)
    ]
    click.echo(" ".join(header.ljust(width) for header, width in zip(headers, col_widths)))
    # One dash per column character plus one per separating space
    click.echo("-" * sum(col_widths) + "-" * (len(headers) - 1))
    for cells in rows:
        click.echo(" ".join(cell.ljust(width) for cell, width in zip(cells, col_widths)))


def _write_to_file(
    directory: str, filename: str, df: pd.DataFrame, mode: str = "w", overwrite: bool = True
):
    """
    Save the metrics report to a tsv file.

    Args:
        directory (str): output directory; created (including missing parents)
            when it is a non-empty string.
        filename (str): name of the tsv file inside `directory`.
        df (pd.DataFrame): report to write. It is modified in place: a `count`
            column is cast to int, and rows are sorted by `connector` then
            `filename` when both columns are present.
        mode (str): `w` (write) or `a` (append). The header row is only written
            in `w` mode.
        overwrite (bool): when False, a non-clashing name is obtained from
            `_get_non_duplicated_filename` instead of reusing `filename`.

    Raises:
        ValueError: if `mode` is not `w` or `a`.
    """
    if mode not in ["w", "a"]:
        raise ValueError("Mode not supported. Mode must be one of [w, a].")
    if directory:
        # parents=True: a nested export path must not fail just because an
        # intermediate directory does not exist yet.
        Path(directory).mkdir(parents=True, exist_ok=True)
    if "count" in df.columns:
        df["count"] = df["count"].astype(int)
    if "filename" in df.columns and "connector" in df.columns:
        df.sort_values(by=["connector", "filename"], inplace=True)
    if not overwrite:
        filename = _get_non_duplicated_filename(directory, filename)
    df.to_csv(
        os.path.join(directory, filename), sep="\t", mode=mode, index=False, header=(mode == "w")
    )


def _sorting_key(filename):
    """
    Sorting key for duplicated file names. For example, with
    filename.ext, filename (1).ext, filename (2).ext, filename (10).ext
    the key is the integer inside the brackets, so the names sort numerically
    (1, 2, 10) rather than lexically.

    Only bracketed numbers are considered; digits elsewhere in the name (for
    instance in an extension such as `.mp3`) are ignored. When several bracketed
    numbers are present the last one is used. Names without a bracketed number
    get 0 so they come first.
    """
    bracketed = re.findall(r"\((\d+)\)", filename)
    return int(bracketed[-1]) if bracketed else 0


def _uniquity_file(file_list, target_filename) -> str:
    """
    Find the smallest counter that makes `target_filename` unique among
    `file_list`, so that an existing file is not overwritten.

    Entries of `file_list` named `filename (<n>).ext` are treated as taken
    counters; the smallest positive integer not taken is used. The result is
    always numbered, i.e. `filename (<min number>).ext`, even when
    `target_filename` itself is absent from `file_list`. A target without an
    extension yields `filename (<min number>)`.
    """
    original_filename, dot, extension = target_filename.rpartition(".")
    if not dot:
        # No "." at all: rpartition leaves the whole name in the last slot.
        original_filename, extension = extension, ""
    suffix = f".{extension}" if dot else ""
    pattern = re.compile(
        rf"{re.escape(original_filename)}(?: \((\d+)\))?{re.escape(suffix)}"
    )

    # Take the counter from the anchored match itself. Re-searching the name for
    # any "(n)" would pick up a bracketed number that belongs to the base name,
    # e.g. the 2023 in "report (2023) (1).tsv".
    taken = set()
    for file in file_list:
        match = pattern.fullmatch(file)
        if match and match.group(1) is not None:
            taken.add(int(match.group(1)))

    # A set makes ordering irrelevant and keeps duplicate counters such as
    # "(1)" and "(01)" from cutting the gap search short.
    counter = 1
    while counter in taken:
        counter += 1

    return f"{original_filename} ({counter}){suffix}"


def _get_non_duplicated_filename(dir, filename) -> str:
    """
    Helper that lists the content of `dir` and asks `_uniquity_file` for a name
    derived from `filename` that does not clash with that listing.

    An empty `dir` is treated as the current working directory, because
    `os.listdir("")` raises FileNotFoundError.
    """
    return _uniquity_file(os.listdir(dir or os.curdir), filename)


def _mean(scores: Union[pd.Series, List[float]], rounding: Optional[int] = 3) -> Union[float, None]:
    """
    Find mean from the list. None entries are skipped, as in `_stdev` and
    `_pstdev`. Returns None if no element is left.

    Args:
        scores: values to average.
        rounding (int): optional number of decimal points. Default at 3. Pass None
            to get the unrounded mean; 0 rounds to zero decimals.
    """
    values = [score for score in scores if score is not None]
    if not values:
        return None
    mean = statistics.mean(values)
    # `is None` rather than truthiness: rounding=0 is a valid request.
    if rounding is None:
        return mean
    return round(mean, rounding)


def _stdev(scores: List[Optional[float]], rounding: Optional[int] = 3) -> Union[float, None]:
    """
    Find the sample standard deviation from the list. None entries are skipped.
    Returns None if only 0 or 1 element is left.

    Args:
        scores: values to measure; may contain None.
        rounding (int): optional number of decimal points. Default at 3. Pass None
            to get the unrounded value; 0 rounds to zero decimals.
    """
    values = [score for score in scores if score is not None]
    # The sample standard deviation needs at least two data points
    if len(values) <= 1:
        return None
    deviation = statistics.stdev(values)
    # `is None` rather than truthiness: rounding=0 is a valid request.
    if rounding is None:
        return deviation
    return round(deviation, rounding)


def _pstdev(scores: List[Optional[float]], rounding: Optional[int] = 3) -> Union[float, None]:
    """
    Find the population standard deviation from the list. None entries are
    skipped. Returns None if only 0 or 1 element is left.

    Args:
        scores: values to measure; may contain None.
        rounding (int): optional number of decimal points. Default at 3. Pass None
            to get the unrounded value; 0 rounds to zero decimals.
    """
    values = [score for score in scores if score is not None]
    if len(values) <= 1:
        return None
    deviation = statistics.pstdev(values)
    # `is None` rather than truthiness: rounding=0 is a valid request.
    if rounding is None:
        return deviation
    return round(deviation, rounding)


def _count(scores: Union[pd.Series, List[Optional[float]]]) -> int:
    """
    Returns the row count of the list, as an int.

    Every entry is counted, including None entries.
    """
    return len(scores)


def _read_text_file(path):
    """
    Reads the contents of a text file and returns it as a string.

    Bytes that cannot be decoded are dropped (`errors="ignore"`).

    Raises:
        FileNotFoundError: if nothing exists at `path`.
        IOError: for any other I/O failure while opening or reading the file.
    """
    try:
        # Open directly instead of checking existence first: the file could
        # disappear between the check and the open.
        # NOTE(review): no explicit encoding is passed, so the platform default
        # applies - confirm whether utf-8 should be forced here.
        with open(path, errors="ignore") as f:
            return f.read()
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The file at {path} does not exist.") from e
    except OSError as e:
        # Handle other I/O related errors, keeping the original one as the cause
        raise IOError(f"An error occurred when reading the file at {path}: {e}") from e