Source code for cdm_reader_mapper.cdm_mapper.utils.conversions

"""Convert Common Datamodel (CDM) mapping table elements from/to string types."""

from __future__ import annotations
import ast
from collections.abc import Callable
from typing import Any, get_args

import numpy as np
import pandas as pd

from .. import properties
from ..tables.tables import get_cdm_atts, get_imodel_maps


class BaseConverter:
    """
    Registry that associates data type names with conversion callables.

    Parameters
    ----------
    converters : dict
        Conversion functions keyed by type name.
    args : dict
        Optional extra-argument names keyed by type name.
    """

    def __init__(self, converters: dict[str, Any], args: dict[str, Any] | None = None):
        """
        Store the converter table and the optional argument table.

        Parameters
        ----------
        converters : dict
            Conversion functions keyed by type name.
        args : dict, optional
            Extra-argument names keyed by type name.
        """
        # Any falsy input (None, empty mapping) is replaced by a fresh dict.
        self._converters = converters if converters else {}
        self._args = args if args else {}

    def __getitem__(self, key: str) -> Callable[..., Any] | None:
        """
        Look up the conversion function registered for a type name.

        Parameters
        ----------
        key : str
            Type name to search for.

        Returns
        -------
        callable or None
            Registered conversion function, or None for an unknown name.
        """
        registry = self._converters
        return registry.get(key)

    def get_args(self, key: str) -> str | None:
        """
        Look up the extra-argument name registered for a type name.

        Parameters
        ----------
        key : str
            Type name to search for.

        Returns
        -------
        str or None
            Registered argument name, or None for an unknown name.
        """
        arg_names = self._args
        return arg_names.get(key)

    def __contains__(self, key: str) -> bool:
        """
        Report whether a conversion function is registered for a type name.

        Parameters
        ----------
        key : str
            Type name to test.

        Returns
        -------
        bool
            True when a converter is registered under ``key``.
        """
        is_registered = key in self._converters
        return is_registered
class ConvertFromStr(BaseConverter):
    """
    Converter registry for turning string representations into Python types.

    Registers converters for integers, floats, timestamps, and strings,
    including the array variants of integers, floats, and strings.
    """

    def __init__(self) -> None:
        """Register the default string-to-type converters."""
        from_str_table = {
            "int": _convert_integer_from_str,
            "int[]": _convert_integer_array_from_str,
            "numeric": _convert_float_from_str,
            "numeric[]": _convert_float_array_from_str,
            "timestamp with timezone": _convert_datetime_from_str,
            "varchar": _convert_str_from_str,
            "varchar[]": _convert_str_array_from_str,
        }
        super().__init__(converters=from_str_table)
class ConvertToStr(BaseConverter):
    """
    Converter registry for turning Python types into string representations.

    Registers converters for integers, floats, timestamps, and strings,
    including the array variants of integers, floats, and strings. The
    numeric types additionally declare ``decimal_places`` as an extra
    converter argument.
    """

    def __init__(self) -> None:
        """Register the default type-to-string converters and their extra arguments."""
        to_str_table = {
            "int": _convert_integer_to_str,
            "int[]": _convert_integer_array_to_str,
            "numeric": _convert_float_to_str,
            "numeric[]": _convert_float_array_to_str,
            "timestamp with timezone": _convert_datetime_to_str,
            "varchar": _convert_str_to_str,
            "varchar[]": _convert_str_array_to_str,
        }
        extra_args = {
            "numeric": "decimal_places",
            "numeric[]": "decimal_places",
        }
        super().__init__(converters=to_str_table, args=extra_args)
def _convert_array_general_from_str(data: pd.Series, null_label: str, dtype: type | str) -> pd.Series:
    """
    Convert a series of string values (single or list) into an array.

    Parameters
    ----------
    data : pd.Series
        Series containing string values or lists of values.
    null_label : str
        Label that marks a missing value in the input.
    dtype : type or str
        Result data type handed to ``pd.array``.

    Returns
    -------
    pd.Series
        Series whose entries are lists of ``dtype`` values, or ``pd.NA``.
    """

    def _convert_value(x: Any) -> Any:
        """
        Convert one entry to a list of `dtype` values.

        Parameters
        ----------
        x : Any
            Value to be converted.

        Returns
        -------
        Any
            List of converted values, or ``pd.NA`` for a missing entry.
        """
        if isinstance(x, list):
            x_list = x
        elif pd.isna(x):
            return pd.NA
        elif x == null_label:
            return pd.NA
        else:
            # "{a,b,c}" -> ["a", "b", "c"]
            x_list = str(x).strip("{}").split(",")

        # Missing, null-labelled and empty elements all become pd.NA.
        v_list = [pd.NA if (pd.isna(x_) or x_ == null_label or not x_) else x_ for x_ in x_list]

        if len(v_list) == 0:
            return pd.NA
        if len(v_list) == 1 and pd.isna(v_list[0]):
            return pd.NA
        return list(pd.array(v_list, dtype=dtype))

    return data.apply(_convert_value)


def _convert_array_general_to_str(
    data: pd.Series,
    null_label: str,
    dtype: type | str,
    formatter: Callable[[Any], str] | None = None,
) -> pd.Series:
    """
    Convert a series of values (single or list) into a string array.

    Parameters
    ----------
    data : pd.Series
        Series containing values or lists of values.
    null_label : str
        Label to replace missing values with.
    dtype : type or str
        Data type each element is cast to before it is rendered.
    formatter : callable, optional
        Function rendering one cast element as text. Defaults to ``str``.

    Returns
    -------
    pd.Series
        Object-dtype series of strings in "{...}" format, or ``null_label``.
    """
    to_text = str if formatter is None else formatter

    def _convert_value(x: Any) -> str:
        """
        Convert one entry to its string-array representation.

        Parameters
        ----------
        x : Any
            Value to be converted.

        Returns
        -------
        str
            Converted value.
        """
        if isinstance(x, str):
            # A string may hold a Python literal such as "[1, 2]"; anything
            # that does not parse is treated as a single string element.
            try:
                x = ast.literal_eval(x)
            except (SyntaxError, ValueError):
                x = [x]

        if x is None:
            return null_label

        if isinstance(x, np.ndarray):
            x_list = x.tolist()
        elif isinstance(x, list):
            x_list = x
        else:
            x_list = [x]

        str_list = []
        for x_ in x_list:
            if pd.isna(x_):
                str_list.append(null_label)
            elif not x_ and pd.to_numeric(x_, errors="coerce"):
                # Falsy values whose numeric coercion is truthy (i.e. NaN,
                # as produced for an empty string) are nulls; a genuine
                # zero coerces to 0 and is kept as a value.
                str_list.append(null_label)
            else:
                t = pd.array([x_], dtype=dtype)[0]
                str_list.append(to_text(t))

        if len(str_list) == 0:
            return null_label
        if len(str_list) == 1 and str_list[0] == null_label:
            return null_label
        return "{" + ",".join(str_list) + "}"

    return data.apply(_convert_value).astype(object)


def _convert_str_to_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert elements to string representation.

    Parameters
    ----------
    data : pd.Series
        Series containing elements to convert.
    null_label : str
        Label to use for missing values.

    Returns
    -------
    pd.Series
        Series with string representations of elements.
    """

    def _return_str(x: Any, null_label: str) -> str:
        """
        Convert value to string.

        Parameters
        ----------
        x : Any
            Value to be converted.
        null_label : str
            Label to replace missing value with.

        Returns
        -------
        str
            Converted value.
        """
        # Lists are rendered before the NA check: pd.isna on a list yields
        # an array, which cannot be used as a single truth value.
        if isinstance(x, list):
            return str(x)
        if pd.isna(x):
            return null_label
        return str(x)

    return data.apply(lambda x: _return_str(x, null_label))


def _convert_str_from_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert elements from string representation.

    Parameters
    ----------
    data : pd.Series
        Series containing string elements to convert.
    null_label : str
        Label that marks a missing value in the input.

    Returns
    -------
    pd.Series
        Series of strings, with missing values set to ``pd.NA``.
    """

    def _return_str(x: Any, null_label: str) -> Any:
        """
        Convert value to a string or ``pd.NA``.

        Parameters
        ----------
        x : Any
            Value to be converted.
        null_label : str
            Label that marks a missing value.

        Returns
        -------
        str or pd.NA
            Converted value.
        """
        if pd.isna(x):
            return pd.NA
        if x == null_label:
            return pd.NA
        return str(x)

    return data.apply(lambda x: _return_str(x, null_label))


def _convert_str_array_to_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert a series of string values or lists to string array format.

    Parameters
    ----------
    data : pd.Series
        Series containing string values or lists of strings.
    null_label : str
        Label to use for missing or empty values.

    Returns
    -------
    pd.Series
        Series with string arrays in "{...}" format.
    """
    return _convert_array_general_to_str(data, null_label, dtype=str)


def _convert_str_array_from_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert a series of string arrays in "{...}" format to Python lists of strings.

    Parameters
    ----------
    data : pd.Series
        Series containing string arrays in "{...}" format.
    null_label : str
        Label that marks a missing value in the input.

    Returns
    -------
    pd.Series
        Series with Python lists of strings.
    """
    return _convert_array_general_from_str(data, null_label, object)


def _convert_integer_to_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert numeric elements to integer strings.

    Parameters
    ----------
    data : pd.Series
        Series of numeric values to convert.
    null_label : str
        Label to use for missing or invalid values.

    Returns
    -------
    pd.Series
        Object-dtype series of integer strings; missing, non-numeric and
        non-finite values are replaced by ``null_label``.
    """

    def _return_str(x: Any, null_label: str) -> str:
        """
        Convert value to an integer string.

        Parameters
        ----------
        x : Any
            Value to be converted.
        null_label : str
            Label to replace missing or invalid values with.

        Returns
        -------
        str
            Converted value.
        """
        if pd.isna(x):
            return null_label
        try:
            return str(int(float(x)))
        except (ValueError, TypeError, OverflowError):
            # ValueError: unparsable text; TypeError: non-numeric object;
            # OverflowError: infinity cannot be represented as an integer.
            return null_label

    return data.apply(lambda x: _return_str(x, null_label)).astype(object)


def _convert_integer_from_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert a series of string values to nullable integer type.

    Parameters
    ----------
    data : pd.Series
        Series containing string representations of numeric values.
    null_label : str
        Unused here; values that cannot be parsed are coerced to missing.

    Returns
    -------
    pd.Series
        Series with pandas nullable integer dtype ("Int64"). Values that
        cannot be parsed as numbers are set to pd.NA.

    Notes
    -----
    NOTE(review): values with a fractional part (e.g. "1.5") parse as floats
    and presumably make the cast to "Int64" fail - confirm whether such
    input can occur.
    """
    return pd.to_numeric(data, errors="coerce").astype("Int64")


def _convert_integer_array_to_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert a series of integer values or lists to string array format.

    Parameters
    ----------
    data : pd.Series
        Series containing integer values or lists of integers.
    null_label : str
        Label to use for missing or empty values.

    Returns
    -------
    pd.Series
        Series with integer arrays in "{...}" format.
    """
    return _convert_array_general_to_str(data, null_label, dtype="Int64")


def _convert_integer_array_from_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert a series of string arrays in "{...}" format to lists of integers.

    Parameters
    ----------
    data : pd.Series
        Series containing string arrays in "{...}" format.
    null_label : str
        Label that marks a missing value in the input.

    Returns
    -------
    pd.Series
        Series of lists built with pandas nullable integer dtype ("Int64");
        missing elements are pd.NA.
    """
    return _convert_array_general_from_str(data, null_label, "Int64")


def _convert_float_to_str(data: pd.Series, null_label: str, decimal_places: int) -> pd.Series:
    """
    Convert numeric elements to float strings with specified decimals.

    Parameters
    ----------
    data : pd.Series
        Series of numeric values to convert.
    null_label : str
        Label to use for missing or invalid values.
    decimal_places : int
        Number of decimal places for formatting.

    Returns
    -------
    pd.Series
        Object-dtype series of formatted float strings; missing and
        non-numeric values are replaced by ``null_label``.
    """

    def _return_str(x: Any, null_label: str, format_float: str) -> str:
        """
        Convert value to a formatted float string.

        Parameters
        ----------
        x : Any
            Value to be converted.
        null_label : str
            Label to replace missing or invalid values with.
        format_float : str
            ``str.format`` template for floating-point values
            (e.g., ``'{:.2f}'``).

        Returns
        -------
        str
            Converted value.
        """
        if pd.isna(x):
            return null_label
        try:
            return format_float.format(float(x))
        except (ValueError, TypeError):
            # ValueError: unparsable text; TypeError: non-numeric object.
            return null_label

    format_float = "{:." + str(decimal_places) + "f}"
    return data.apply(lambda x: _return_str(x, null_label, format_float)).astype(object)


def _convert_float_from_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert a series of string values to nullable float type.

    Parameters
    ----------
    data : pd.Series
        Series containing string representations of numeric values.
    null_label : str
        Unused here; values that cannot be parsed are coerced to missing.

    Returns
    -------
    pd.Series
        Series with pandas nullable float dtype ("Float64"). Values that
        cannot be parsed as numbers are set to missing.
    """
    return pd.to_numeric(data, errors="coerce").astype("Float64")


def _convert_float_array_to_str(
    data: pd.Series,
    null_label: str,
    decimal_places: int | None = None,
) -> pd.Series:
    """
    Convert a series of float values or lists to string array format.

    Parameters
    ----------
    data : pd.Series
        Series containing float values or lists of floats.
    null_label : str
        Label to use for missing or empty values.
    decimal_places : int, optional
        Number of decimal places used to render each element. When None,
        elements are rendered with ``str``.

    Returns
    -------
    pd.Series
        Series with float arrays in "{...}" format.

    Notes
    -----
    The converter registry declares ``decimal_places`` for "numeric[]", so
    this function has to accept that keyword.
    """
    formatter = None
    if decimal_places is not None:
        formatter = ("{:." + str(decimal_places) + "f}").format
    return _convert_array_general_to_str(data, null_label, dtype=float, formatter=formatter)


def _convert_float_array_from_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert a series of string arrays in "{...}" format to lists of floats.

    Parameters
    ----------
    data : pd.Series
        Series containing string arrays in "{...}" format.
    null_label : str
        Label that marks a missing value in the input.

    Returns
    -------
    pd.Series
        Series of lists built with pandas nullable float dtype ("Float64");
        missing elements are pd.NA.
    """
    return _convert_array_general_from_str(data, null_label, "Float64")


def _convert_datetime_to_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert datetime elements to string format "%Y-%m-%d %H:%M:%S".

    Parameters
    ----------
    data : pd.Series
        Series of datetime objects or strings.
    null_label : str
        Label to use for missing values.

    Returns
    -------
    pd.Series
        Object-dtype series of datetime strings, missing values replaced by
        ``null_label``. Strings are passed through unchanged.
    """

    def _return_str(x: Any, null_label: str) -> str:
        """
        Convert value to a datetime string.

        Parameters
        ----------
        x : Any
            Value to be converted.
        null_label : str
            Label to replace missing value with.

        Returns
        -------
        str
            Converted value.
        """
        if pd.isna(x):
            return null_label
        if isinstance(x, str):
            return x
        return str(x.strftime("%Y-%m-%d %H:%M:%S"))

    return data.apply(lambda x: _return_str(x, null_label)).astype(object)


def _convert_datetime_from_str(data: pd.Series, null_label: str) -> pd.Series:
    """
    Convert a series of string values to datetime objects.

    Parameters
    ----------
    data : pd.Series
        Series containing string representations of datetime values.
    null_label : str
        Unused here; values that cannot be parsed are coerced to missing.

    Returns
    -------
    pd.Series
        Series converted with ``pd.to_datetime``; values that cannot be
        parsed are set to NaT.
    """
    return pd.to_datetime(data, errors="coerce")


def _convert_column(
    series: pd.Series,
    column_atts: dict[str, Any],
    null_label: str,
    converters: ConvertFromStr | ConvertToStr,
) -> pd.Series:
    """
    Apply a type-specific converter to a pandas Series based on column attributes.

    Parameters
    ----------
    series : pd.Series
        Input series to be converted.
    column_atts : dict
        Dictionary containing metadata about the column. The key
        "data_type" selects the converter. May also include additional
        parameters such as "decimal_places".
    null_label : str
        Null label handed to the selected converter.
    converters : ConvertFromStr or ConvertToStr
        Converter registry that maps data types to conversion functions and
        optional argument requirements.

    Returns
    -------
    pd.Series
        Converted series if a matching converter is found; otherwise, the
        original series is returned unchanged.

    Raises
    ------
    TypeError
        If ``series`` is not a pandas Series.

    Notes
    -----
    - If ``column_atts`` does not define a "data_type", no conversion is
      applied.
    - If no converter exists for the given data type, the input series is
      returned unchanged.
    - For converters that declare "decimal_places", the value is taken from
      ``column_atts`` and falls back to
      ``properties.default_decimal_places``.
    """
    if not isinstance(series, pd.Series):
        raise TypeError("series must be a pd.Series.")

    data_type = column_atts.get("data_type")
    if data_type is None:
        return series

    converter = converters[data_type]
    if converter is None:
        return series

    converter_args = converters.get_args(data_type)

    kwargs = {}
    if converter_args == "decimal_places":
        kwargs["decimal_places"] = column_atts.get(
            "decimal_places",
            properties.default_decimal_places,
        )

    return converter(series, null_label, **kwargs)


def _convert_columns(
    data: pd.DataFrame,
    imodel: str,
    cdm_subset: str | list[str] | None,
    null_label: str,
    mode: str,
) -> pd.DataFrame:
    """
    Apply type-based conversions to columns in a DataFrame based on a data model.

    Parameters
    ----------
    data : pd.DataFrame
        Input DataFrame containing columns to be converted.
    imodel : str
        Input data model identifier. It is split on "_" and its first part
        must be one of ``properties.SupportedDataModels``.
    cdm_subset : str or list, optional
        Subset of CDM (Common Data Model) tables to process. If None or
        empty, ``properties.cdm_tables`` is used.
    null_label : str
        Label used to represent null values.
    mode : {"from_str", "to_str"}
        Conversion mode:

        - "from_str": Convert string representations to native pandas dtypes.
        - "to_str": Convert values to string representations.

    Returns
    -------
    pd.DataFrame
        DataFrame with selected columns converted according to the data
        model specifications.

    Raises
    ------
    TypeError
        If ``data`` is not a pandas DataFrame or ``imodel`` is not a string.
    ValueError
        If ``imodel`` is not defined or not supported, or if ``mode`` is
        invalid.

    Notes
    -----
    - Column conversion is driven by CDM metadata obtained via
      ``get_cdm_atts``.
    - If ``mode="to_str"``, additional mappings from ``get_imodel_maps``
      override or extend column attributes.
    - Columns can be referenced either by name or by (table, column) tuples.
    - Columns without matching metadata or converters are left unchanged.
    - For "from_str" mode, occurrences of ``null_label`` are replaced with
      ``pd.NA`` on a new DataFrame before conversion.
    - For "to_str" mode, converted columns are assigned onto the DataFrame
      object that was passed in.
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError("data must be a pd.DataFrame.")

    if imodel is None:
        raise ValueError("Input data model 'imodel' is not defined.")

    if not isinstance(imodel, str):
        raise TypeError(f"Input data model type is not supported: {type(imodel)}")

    data_model = imodel.split("_")
    if data_model[0] not in get_args(properties.SupportedDataModels):
        raise ValueError(f"Input data model {data_model[0]} not supported")

    if mode not in ("from_str", "to_str"):
        raise ValueError("mode must be one of {'from_str', 'to_str'}")

    converters: ConvertFromStr | ConvertToStr
    if mode == "from_str":
        data = data.replace(null_label, pd.NA)
        data = data.fillna(pd.NA)
        converters = ConvertFromStr()
        imodel_maps = {}
    else:
        converters = ConvertToStr()
        # Called with the caller's cdm_subset, before the default below.
        imodel_maps = get_imodel_maps(*data_model, cdm_tables=cdm_subset)

    if not cdm_subset:
        cdm_subset = properties.cdm_tables

    cdm_atts = get_cdm_atts(cdm_subset)

    data_column: str | tuple[str, str]
    for table, table_atts in cdm_atts.items():
        table_maps = imodel_maps.get(table, {})
        for column, column_atts in table_atts.items():
            # A plain column name takes precedence over a (table, column) key.
            if column in data.columns:
                data_column = column
            elif (table, column) in data.columns:
                data_column = (table, column)
            else:
                continue

            # Model-specific mappings override the generic CDM attributes.
            column_maps = table_maps.get(column, {})
            merged_atts = {**column_atts, **column_maps}

            data[data_column] = _convert_column(
                data[data_column],
                merged_atts,
                null_label,
                converters,
            )

    return data
def convert_from_str_df(
    data: pd.DataFrame,
    imodel: str | None,
    cdm_subset: str | list[str] | None = None,
    null_label: str = "null",
) -> pd.DataFrame:
    """
    Turn the string-encoded columns of a DataFrame into native pandas dtypes.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame holding string representations of values.
    imodel : str
        Identifier of the input data model that determines the column types.
    cdm_subset : str or list, optional
        CDM tables to process. Passed on unchanged; defaults to None.
    null_label : str, default "null"
        Label that stands for a null value in the input data.

    Returns
    -------
    pd.DataFrame
        Result of ``_convert_columns`` run in "from_str" mode.

    Raises
    ------
    ValueError
        If ``imodel`` is None.
    """
    # Reject a missing data model before delegating.
    if imodel is None:
        raise ValueError("imodel must be a string, not None.")

    converted = _convert_columns(
        data=data,
        imodel=imodel,
        cdm_subset=cdm_subset,
        null_label=null_label,
        mode="from_str",
    )
    return converted
def convert_from_str_series(
    series: pd.Series,
    column_atts: dict[Any, Any],
    null_label: str = "null",
) -> pd.Series:
    """
    Turn a Series of string values into a native pandas dtype.

    Parameters
    ----------
    series : pd.Series
        Series holding string representations of values.
    column_atts : dict
        Column metadata; its "data_type" entry selects the converter.
    null_label : str, default "null"
        Label that stands for a null value in the input data.

    Returns
    -------
    pd.Series
        Result of ``_convert_column`` using the ``ConvertFromStr`` registry.
    """
    # Missing entries are filled with pd.NA before the converter runs.
    filled = series.fillna(pd.NA)
    return _convert_column(
        series=filled,
        column_atts=column_atts,
        null_label=null_label,
        converters=ConvertFromStr(),
    )
def convert_to_str_df(
    data: pd.DataFrame,
    imodel: str | None,
    cdm_subset: str | list[str] | None = None,
    null_label: str = "null",
) -> pd.DataFrame:
    """
    Turn the values of a DataFrame into string representations.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame holding the values to convert.
    imodel : str
        Identifier of the input data model that determines the column types.
    cdm_subset : str or list, optional
        CDM tables to process. Passed on unchanged; defaults to None.
    null_label : str, default "null"
        Label written for null or missing values.

    Returns
    -------
    pd.DataFrame
        Result of ``_convert_columns`` run in "to_str" mode.

    Raises
    ------
    ValueError
        If ``imodel`` is None.
    """
    # Reject a missing data model before delegating.
    if imodel is None:
        raise ValueError("imodel must be a string, not None.")

    converted = _convert_columns(
        data=data,
        imodel=imodel,
        cdm_subset=cdm_subset,
        null_label=null_label,
        mode="to_str",
    )
    return converted
def convert_to_str_series(
    series: pd.Series,
    column_atts: dict[Any, Any],
    null_label: str = "null",
) -> pd.Series:
    """
    Turn a Series into string representations according to column metadata.

    Parameters
    ----------
    series : pd.Series
        Series holding the values to convert.
    column_atts : dict
        Column metadata; its "data_type" entry selects the converter.
    null_label : str, default "null"
        Null label handed to the converter.

    Returns
    -------
    pd.Series
        Result of ``_convert_column`` using the ``ConvertToStr`` registry.
    """
    to_str_registry = ConvertToStr()
    return _convert_column(
        series=series,
        column_atts=column_atts,
        null_label=null_label,
        converters=to_str_registry,
    )