Source code for cdm_reader_mapper.mdf_reader.utils.utilities

"""Auxiliary functions and class for reading, converting, decoding and validating MDF files."""

from __future__ import annotations
import ast
import os
from collections.abc import Callable, Iterable
from pathlib import Path
from typing import Any

import pandas as pd

from cdm_reader_mapper.common import ProcessFunction, process_function



[docs]
def as_list(x: str | Iterable[Any] | None) -> list[Any] | None:
    """
    Normalise a value to a list while letting None pass through.

    Parameters
    ----------
    x : str, iterable, or None
        Value to normalise. A string is treated as one item (it is not
        split into characters); any other iterable is materialised in its
        iteration order.

    Returns
    -------
    list or None
        ``None`` when `x` is ``None``, ``[x]`` when `x` is a string,
        otherwise ``list(x)``.

    Notes
    -----
    Unordered inputs such as sets give a list whose order is not predictable.
    """
    if x is None:
        return None
    # A string is iterable, but must stay a single element.
    return [x] if isinstance(x, str) else list(x)




[docs]
def as_path(value: str | os.PathLike[str], name: str) -> Path:
    """
    Convert a string or path-like value to a ``pathlib.Path``.

    Parameters
    ----------
    value : str or os.PathLike
        Value to wrap in a Path.
    name : str
        Parameter name shown in the error message when `value` is rejected.

    Returns
    -------
    pathlib.Path
        ``Path(value)``.

    Raises
    ------
    TypeError
        If `value` is neither a string nor an ``os.PathLike`` instance.
    """
    # Guard clause: reject unsupported types before building the Path.
    if not isinstance(value, (str, os.PathLike)):
        raise TypeError(f"{name} must be str or Path-like")
    return Path(value)




[docs]
def join(col: Any | Iterable[Any]) -> str:
    """
    Flatten a (possibly multi-level) column label into one string.

    Parameters
    ----------
    col : any or iterable of any
        Column label. Only ``list`` and ``tuple`` inputs are treated as
        multi-level; every other value is handled as a single label.

    Returns
    -------
    str
        The stringified levels joined by ``":"`` for a list/tuple, otherwise
        ``str(col)``.
    """
    # Wrap a single label so both cases share the same join.
    levels = col if isinstance(col, (list, tuple)) else (col,)
    return ":".join(map(str, levels))




[docs]
def update_dtypes(dtypes: dict[str, Any] | pd.Series, columns: Iterable[str]) -> dict[str, Any] | pd.Series:
    """
    Filter a dtypes dictionary to the keys present in `columns`.

    Parameters
    ----------
    dtypes : dict or pd.Series
        Mapping of column names to data types. Anything that is not a
        ``dict`` (e.g. a ``pd.Series``) is returned unchanged.
    columns : iterable of str
        Columns to keep. One-shot iterators such as generators are supported.

    Returns
    -------
    dict or pd.Series
        A new dictionary with only the keys found in `columns`, or the
        original `dtypes` object if it is not a dictionary.
    """
    if not isinstance(dtypes, dict):
        return dtypes
    # An iterator returns itself from iter(); each `in` test would consume it,
    # so later keys would be looked up in an already exhausted iterator.
    # Materialise it once. Containers (list, set, pd.Index, ...) are left
    # untouched so their own membership semantics are preserved.
    if iter(columns) is columns:
        columns = list(columns)
    return {k: v for k, v in dtypes.items() if k in columns}




[docs]
def update_column_names(dtypes: dict[str, Any] | str, col_o: str, col_n: str) -> dict[str, Any] | str:
    """
    Rename one key of a dtypes dictionary, if that key is present.

    Parameters
    ----------
    dtypes : dict or str
        Mapping of column names to data types, or a plain string.
    col_o : str
        Current column name.
    col_n : str
        Replacement column name.

    Returns
    -------
    dict or str
        The same `dtypes` object. A dictionary is modified in place (the
        value stored under `col_o` is moved to `col_n`); a string is
        returned untouched.
    """
    if isinstance(dtypes, str):
        return dtypes
    rename_needed = col_o != col_n and col_o in dtypes
    if rename_needed:
        # Move the value to the new key; the old key disappears.
        dtypes[col_n] = dtypes.pop(col_o)
    return dtypes




[docs]
def update_column_labels(columns: Iterable[str | tuple[str, ...]]) -> pd.Index | pd.MultiIndex:
    """
    Convert string column labels to tuples if needed, producing a pandas Index or MultiIndex.

    Each string label is handled as follows:
    - If it parses as a Python literal (e.g. "('A','B')"), the parsed value
      is used as the label.
    - Otherwise, if it contains a colon (e.g. "A:B"), it is split into a
      tuple ("A", "B").
    - Otherwise, the label is left unchanged.

    Non-string labels are kept as they are.

    If there is at least one label and all resulting labels are tuples, a
    pandas MultiIndex is returned. Otherwise (including for empty input), a
    regular pandas Index is returned.

    Parameters
    ----------
    columns : iterable of str or tuple
        Column labels to convert.

    Returns
    -------
    pd.Index or pd.MultiIndex
        Converted column labels as a pandas Index or MultiIndex.
    """
    new_cols: list[str | tuple[str, ...]] = []

    for col in columns:
        col_ = col
        if isinstance(col, str):
            try:
                col_ = ast.literal_eval(col)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # literal_eval is documented to raise any of these on input
                # that is not a valid literal; fall back to the colon syntax.
                col_ = tuple(col.split(":")) if ":" in col else col
        new_cols.append(col_)

    # MultiIndex.from_tuples cannot infer the number of levels from an empty
    # list, so empty input must go through the plain Index constructor.
    if new_cols and all(isinstance(label, tuple) for label in new_cols):
        return pd.MultiIndex.from_tuples(new_cols)
    return pd.Index(new_cols)




[docs]
def update_and_select(
    df: pd.DataFrame,
    subset: str | list[str] | None = None,
    column_names: pd.Index | pd.MultiIndex | None = None,
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """
    Normalise the column labels of a DataFrame and optionally narrow it down.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to process. Its ``columns`` attribute is reassigned on the
        passed object with the output of `update_column_labels`.
    subset : str or list, optional
        Column selection applied as ``df[subset]``.
    column_names : pd.Index or pd.MultiIndex, optional
        Column labels to re-index to. Skipped when the selected data is empty.

    Returns
    -------
    tuple[pd.DataFrame, dict]
        - The relabelled, selected and re-indexed data.
        - Dictionary with the keys ``"columns"`` and ``"dtypes"`` holding the
          column labels and data types of the returned data.
    """
    df.columns = update_column_labels(df.columns)

    selected = df if subset is None else df[subset]

    # Re-index only when labels were requested and there is data to re-index.
    if not (column_names is None or selected.empty):
        selected = selected.reindex(columns=column_names)

    info = {"columns": selected.columns, "dtypes": selected.dtypes}
    return selected, info



@process_function()
def _read_data_from_file(
    filepath: Path,
    reader: Callable[..., Any],
    col_subset: str | list[str] | None = None,
    column_names: pd.Index | pd.MultiIndex | None = None,
    reader_kwargs: dict[str, Any] | None = None,
) -> ProcessFunction:
    """
    Read a file with the given reader and schedule column post-processing.

    Parameters
    ----------
    filepath : Path-like
        Path to the file to read.
    reader : Callable
        Function called as ``reader(filepath, **reader_kwargs)``.
    col_subset : str or list of str, optional
        Column selection forwarded to `update_and_select` as ``subset``.
    column_names : pd.Index or pd.MultiIndex, optional
        Column labels forwarded to `update_and_select` as ``column_names``.
    reader_kwargs : dict, optional
        Additional keyword arguments for `reader`.

    Returns
    -------
    ProcessFunction
        Instance bundling the data that has been read with the
        `update_and_select` call to apply to it.

    Raises
    ------
    FileNotFoundError
        If `filepath` is None or does not point to an existing file.
    """
    file_exists = filepath is not None and Path(filepath).is_file()
    if not file_exists:
        raise FileNotFoundError(f"File not found: {filepath}")

    options = reader_kwargs if reader_kwargs else {}
    loaded = reader(filepath, **options)

    post_kwargs = {"subset": col_subset, "column_names": column_names}
    return ProcessFunction(
        data=loaded,
        func=update_and_select,
        func_kwargs=post_kwargs,
        makecopy=False,
    )



[docs]
def read_csv(
    filepath: Path,
    delimiter: str = ",",
    col_subset: str | list[str] | None = None,
    column_names: pd.Index | pd.MultiIndex | None = None,
    **kwargs: Any,
) -> tuple[pd.DataFrame | Iterable[pd.DataFrame], dict[str, Any]]:
    r"""
    Read a CSV file through `_read_data_from_file` using ``pd.read_csv``.

    Parameters
    ----------
    filepath : str or Path or None
        Path to the CSV file.
    delimiter : str, default ","
        Separator of CSV columns.
    col_subset : list of str, optional
        Subset of columns to select from the CSV.
    column_names : pd.Index or pd.MultiIndex, optional
        Column labels for re-indexing.
    \**kwargs : any
        Additional keyword arguments passed to pd.read_csv.

    Returns
    -------
    tuple of pd.DataFrame and dict
        The result of `_read_data_from_file` converted to a tuple; per the
        return annotation this is the data and a dictionary containing the
        data column labels and data types.

    Notes
    -----
    Existence of `filepath` is checked inside `_read_data_from_file`.
    """
    # The explicit delimiter goes first, followed by the caller's extras.
    options: dict[str, Any] = {"delimiter": delimiter}
    options.update(kwargs)

    outcome = _read_data_from_file(
        filepath,
        reader=pd.read_csv,
        col_subset=col_subset,
        column_names=column_names,
        reader_kwargs=options,
    )
    return tuple(outcome)




[docs]
def read_parquet(
    filepath: Path,
    col_subset: str | list[str] | None = None,
    column_names: pd.Index | pd.MultiIndex | None = None,
    **kwargs: Any,
) -> tuple[pd.DataFrame | Iterable[pd.DataFrame], dict[str, Any]]:
    r"""
    Read a Parquet file through `_read_data_from_file` using ``pd.read_parquet``.

    Parameters
    ----------
    filepath : str or Path or None
        Path to the PARQUET file.
    col_subset : list of str, optional
        Subset of columns to select from the PARQUET.
    column_names : pd.Index or pd.MultiIndex, optional
        Column labels for re-indexing.
    \**kwargs : Any
        Additional keyword arguments passed to pd.read_parquet.

    Returns
    -------
    tuple of pd.DataFrame and dict
        The result of `_read_data_from_file` converted to a tuple; per the
        return annotation this is the data and a dictionary containing the
        data column labels and data types.

    Notes
    -----
    Existence of `filepath` is checked inside `_read_data_from_file`.
    """
    outcome = _read_data_from_file(
        filepath,
        reader=pd.read_parquet,
        col_subset=col_subset,
        column_names=column_names,
        reader_kwargs=kwargs,
    )
    return tuple(outcome)




[docs]
def read_feather(
    filepath: Path,
    col_subset: str | list[str] | None = None,
    column_names: pd.Index | pd.MultiIndex | None = None,
    **kwargs: Any,
) -> tuple[pd.DataFrame | Iterable[pd.DataFrame], dict[str, Any]]:
    r"""
    Read a Feather file through `_read_data_from_file` using ``pd.read_feather``.

    Parameters
    ----------
    filepath : str or Path or None
        Path to the FEATHER file.
    col_subset : list of str, optional
        Subset of columns to select from the FEATHER.
    column_names : pd.Index or pd.MultiIndex, optional
        Column labels for re-indexing.
    \**kwargs : Any
        Additional keyword arguments passed to pd.read_feather.

    Returns
    -------
    tuple of pd.DataFrame and dict
        The result of `_read_data_from_file` converted to a tuple; per the
        return annotation this is the data and a dictionary containing the
        data column labels and data types.

    Notes
    -----
    Existence of `filepath` is checked inside `_read_data_from_file`.
    """
    outcome = _read_data_from_file(
        filepath,
        reader=pd.read_feather,
        col_subset=col_subset,
        column_names=column_names,
        reader_kwargs=kwargs,
    )
    return tuple(outcome)




[docs]
def convert_dtypes(dtypes: dict[str, str]) -> tuple[dict[str, str], list[str]]:
    """
    Replace ``"datetime"`` dtype entries and collect the affected columns.

    Parameters
    ----------
    dtypes : dict[str, str]
        Dictionary mapping column names to dtypes. It is modified in place.

    Returns
    -------
    tuple
        - The same dictionary, with every ``"datetime"`` value replaced by
          ``"datetime64[ns]"``.
        - List of the columns that were marked as ``"datetime"``, in
          dictionary order.
    """
    # Collect first, then overwrite: values are replaced under existing keys,
    # so the dictionary never changes size.
    parse_dates = [column for column, kind in dtypes.items() if kind == "datetime"]
    for column in parse_dates:
        dtypes[column] = "datetime64[ns]"
    return dtypes, parse_dates




[docs]
def validate_arg(arg_name: str, arg_value: Any, arg_type: type) -> bool:
    """
    Validate that the input argument is of the expected type.

    Parameters
    ----------
    arg_name : str
        Name of the argument.
    arg_value : Any
        Value of the argument.
    arg_type : type
        Expected type of the argument.

    Returns
    -------
    bool
        True if `arg_value` is of type `arg_type` or None.

    Raises
    ------
    ValueError
        If `arg_value` is not of type `arg_type` and not None.
    """
    # Compare against None explicitly. A truthiness test would let falsy
    # values of the wrong type (0, "", [], False) pass, and would fail on
    # objects without an unambiguous truth value.
    if arg_value is not None and not isinstance(arg_value, arg_type):
        raise ValueError(f"Argument {arg_name} must be {arg_type} or None, not {type(arg_value)}")

    return True



def _adjust_dtype(dtype: Any, df: pd.DataFrame) -> Any:
    """
    Restrict a dtype dictionary to the columns of a DataFrame.

    Parameters
    ----------
    dtype : Any
        Data type specification. Only dictionaries (column name -> dtype)
        are filtered; any other value is passed through.
    df : pandas.DataFrame
        DataFrame whose ``columns`` decide which keys are kept.

    Returns
    -------
    Any
        A new dictionary with the entries whose key is in ``df.columns``,
        or `dtype` itself if it is not a dictionary.
    """
    if isinstance(dtype, dict):
        available = df.columns
        return {name: spec for name, spec in dtype.items() if name in available}
    return dtype



[docs]
def convert_str_boolean(x: Any) -> Any:
    """
    Translate the strings 'True' and 'False' into Python booleans.

    Parameters
    ----------
    x : Any
        Input value.

    Returns
    -------
    bool or original value
        ``True`` for 'True', ``False`` for 'False'. Missing values (as
        reported by ``pd.isna``) and everything else are returned unchanged.
    """
    if not pd.isna(x):
        # Checked in a fixed order: 'True' first, then 'False'.
        for text, flag in (("True", True), ("False", False)):
            if x == text:
                return flag
    return x



def _remove_boolean_values(x: Any) -> Any:
    """
    Replace booleans and their string forms with None.

    Parameters
    ----------
    x : Any
        Input value; may be a boolean, one of the strings handled by
        `convert_str_boolean`, or any other object.

    Returns
    -------
    Any
        ``None`` if the value is ``True`` or ``False`` after conversion by
        `convert_str_boolean`; otherwise the converted value, which is the
        input itself for anything that is not a boolean string.
    """
    converted = convert_str_boolean(x)
    # bool has exactly two instances, so this matches `is True or is False`.
    return None if isinstance(converted, bool) else converted



[docs]
def remove_boolean_values(data: pd.DataFrame, dtypes: dict[str, str]) -> pd.DataFrame:
    """
    Blank out boolean entries of a DataFrame and cast it to the given dtypes.

    Parameters
    ----------
    data : pd.DataFrame
        Input data.
    dtypes : dict
        Dictionary mapping column names to desired dtypes. Entries for
        columns that are not in `data` are ignored.

    Returns
    -------
    pd.DataFrame
        New DataFrame in which booleans (and the strings 'True'/'False') are
        replaced by None, columns whose dtype compares equal to ``"str"``
        have surrounding whitespace stripped, and the remaining dtypes are
        applied via ``astype``.
    """

    def _strip_text(series: pd.Series) -> pd.Series:
        # Only columns reporting the "str" dtype are stripped.
        if series.dtype == "str":
            return series.str.strip()
        return series

    cleaned = data.map(_remove_boolean_values)
    target = _adjust_dtype(dtypes, cleaned)
    cleaned = cleaned.apply(_strip_text)
    return cleaned.astype(target)