Source code for cdm_reader_mapper.mdf_reader.utils.validators

"""Data validation module."""

from __future__ import annotations
import logging
from collections.abc import Iterable
from typing import Any, get_args

import numpy as np
import pandas as pd

from .. import properties
from ..codes import codes
from .utilities import convert_str_boolean


# Tuple of the type arguments of `properties.NumericTypes`; `validate` tests each
# column's `column_type` attribute for membership in it to pick numeric validation.
numeric_types = get_args(properties.NumericTypes)


def _is_false(x: Any) -> bool:
    """
    Check if a value is exactly False.

    Parameters
    ----------
    x : Any
        Value to be checked if exactly False.

    Returns
    -------
    bool
        True if `x` is exactly False, otherwise False.
    """
    return x is False


def _is_true(x: Any) -> bool:
    """
    Check if a value is exactly True.

    Parameters
    ----------
    x : Any
        Value to be checked if exactly False.

    Returns
    -------
    bool
        True if `x` is exactly False, otherwise False.
    """
    return x is True


def validate_datetime(series: pd.Series) -> pd.Series:
    """
    Flag which entries of a pandas Series can be parsed as datetimes.

    Entries that are missing in the input count as valid.

    Parameters
    ----------
    series : pd.Series
        Series of object values to validate.

    Returns
    -------
    pd.Series
        Boolean Series, True where the entry is parseable or missing.
    """
    # Unparseable entries become NaT instead of raising.
    parsed = pd.to_datetime(series, errors="coerce")
    is_parseable = parsed.notna()
    is_missing = series.isna()
    return is_parseable | is_missing
def validate_numeric(series: pd.Series, valid_min: float, valid_max: float) -> pd.Series:
    """
    Flag which entries of a pandas Series are numeric and inside a range.

    Each value is first passed through ``convert_str_boolean`` (intended to
    turn boolean-like strings into bools), then coerced to a number. Entries
    that cannot be coerced, or that fall outside ``[valid_min, valid_max]``,
    are invalid. Entries that are missing in the input are valid.

    Parameters
    ----------
    series : pd.Series
        Series of object values to validate.
    valid_min : float
        Minimum valid value.
    valid_max : float
        Maximum valid value.

    Returns
    -------
    pd.Series
        Boolean Series indicating valid entries.
    """
    is_missing = series.isna()
    # Non-numeric entries become NaN, which ``between`` reports as out of range.
    as_numbers = pd.to_numeric(series.apply(convert_str_boolean), errors="coerce")
    within_bounds = as_numbers.between(valid_min, valid_max)
    return within_bounds | is_missing
def validate_str(series: pd.Series) -> pd.Series:
    """
    Validate string entries of a pandas Series.

    No actual check is performed at present: every entry is reported as valid.

    Parameters
    ----------
    series : pd.Series
        Series of object values to validate.

    Returns
    -------
    pd.Series
        All-True Series of nullable ``boolean`` dtype sharing the input's index.
    """
    row_labels = series.index
    all_valid = [True] * len(row_labels)
    return pd.Series(all_valid, index=row_labels, dtype="boolean")
def validate_codes(series: pd.Series, code_table: Iterable[Any], column_type: str) -> pd.Series:
    """
    Flag which entries of a pandas Series appear in a code table.

    Entries are cast to the pandas dtype registered for `column_type`, then
    compared as strings against the codes. Missing entries count as valid.
    When the code table is empty or falsy, an error is logged and every
    entry is reported as invalid.

    Parameters
    ----------
    series : pd.Series
        Series of object values to validate.
    code_table : Iterable
        Allowed codes for validation.
    column_type : str
        Column type for dtype lookup (via properties.pandas_dtypes).

    Returns
    -------
    pd.Series
        Boolean Series indicating valid entries.
    """
    if not code_table:
        logging.error("Code table not found for element %s", series.name)
        return pd.Series(False, index=series.index)

    allowed = set(code_table)
    # Unknown column types fall back to plain object dtype.
    target_dtype = properties.pandas_dtypes.get(column_type, object)
    typed = series.astype(target_dtype)
    is_missing = typed.isna()
    is_known_code = typed.astype(str).isin(allowed)
    return is_missing | is_known_code
[docs] def validate( data: pd.DataFrame, imodel: str | None, ext_table_path: str | None, attributes: dict[str, dict[str, Any]], disables: list[str] | None = None, ) -> pd.DataFrame | None: """ Validate a pandas DataFrame according to a data model and code tables. Each column is validated based on its `column_type` attribute. Supports: - Numeric types: checked against valid_min and valid_max - Keys: checked against a code table - Datetime and string: validated using simple validators - Explicit boolean literals ("True"/"False") override column validation Parameters ---------- data : pd.DataFrame Input data to validate. imodel : str Name of the internal data model, e.g., 'icoads_r300_d704'. ext_table_path : str, optional Path to external code tables for validation. attributes : dict[str, dict] Dictionary of column attributes (e.g., type, valid ranges, codetable). disables : list[str], optional Columns to skip during validation. Returns ------- pd.DataFrame Boolean mask of the same shape as `data`. True indicates a valid entry. 
""" if imodel is None: logging.error("imodel is not defined.") return None if not isinstance(data, pd.DataFrame): logging.error("input data must be a pandas DataFrame.") return None mask = pd.DataFrame(pd.NA, index=data.index, columns=data.columns, dtype="boolean") if data.empty: return mask disables = disables or [] elements = [col for col in data.columns if col not in disables] element_atts = {element: attributes[element] for element in elements if element in attributes} validated_columns = [] validated_dtypes = set(numeric_types) | {"datetime", "key"} basic_functions = { "datetime": validate_datetime, "str": validate_str, } for column in data.columns: if column in disables or column not in attributes: continue series = data[column] column_atts = element_atts.get(column, {}) column_type = column_atts.get("column_type") if column_type in numeric_types: valid_min = column_atts.get("valid_min", -np.inf) valid_max = column_atts.get("valid_max", np.inf) column_mask = validate_numeric(series, valid_min, valid_max) elif column_type == "key": code_table_name = column_atts.get("codetable") if not isinstance(code_table_name, str): logging.warning("codetable must be a str for column '{column}', got %s", type(code_table_name)) continue code_table = codes.read_table(code_table_name, imodel=imodel, ext_table_path=ext_table_path) column_mask = validate_codes(series, code_table, column_type) elif column_type in basic_functions: column_mask = basic_functions[column_type](series) else: logging.warning("Unknown column_type '%s' for column '%s'", column_type, column) continue mask[column] = column_mask if column_type in validated_dtypes: validated_columns.append(column) # Explicit boolean literals ("True"/"False") override validation results if validated_columns: validated_columns = list(dict.fromkeys(validated_columns)) to_bool = data[validated_columns].applymap(convert_str_boolean) false_mask = to_bool.applymap(_is_false) true_mask = to_bool.applymap(_is_true) 
mask[validated_columns] = mask[validated_columns].mask(false_mask, False) mask[validated_columns] = mask[validated_columns].mask(true_mask, True) return mask.astype("boolean")