Source code for cdm_reader_mapper.cdm_mapper.writer

"""
Write Common Data Model (CDM) mapping tables.

Created on Thu Apr 11 13:45:38 2019

Exports tables written in the C3S Climate Data Store Common Data Model (CDM) format to ascii files.
The table format is contained in a python dictionary, stored as an attribute in a pandas.DataFrame
(or Iterable[pd.DataFrame]).

This module uses a set of printer functions to "print" element values to a
string object before exporting them to a final ascii file.

Each of the CDM table elements has a data type (pseudo-sql as defined in the CDM documentation) which defines
which printer function needs to be used.

Numeric data types are printed with a specific number of decimal places, defined in the data element attributes. This
can vary according to each CDM, element, imodel and mapping .json file. If this is not defined in the input attributes
of the imodel, the number of decimal places used comes from a default tool defined in properties.py.

@author: iregon
"""

from __future__ import annotations
import logging
from pathlib import Path
from typing import Any, get_args

import pandas as pd

from cdm_reader_mapper.common import get_filename, logging_hdlr

from .properties import SupportedFileTypes
from .tables.tables import get_cdm_atts
from .utils.conversions import convert_from_str_df, convert_to_str_df
from .utils.utilities import adjust_filename, dict_to_tuple_list, get_cdm_subset


def _table_to_file(
    data: pd.DataFrame,
    filename: str | Path,
    data_format: SupportedFileTypes = "parquet",
    delimiter: str = "|",
    encoding: str = "utf-8",
    **kwargs: Any,
) -> None:
    r"""
    Serialize a pandas DataFrame to a file in one of the supported formats.

    Rows in which every value is missing are dropped before writing.

    Parameters
    ----------
    data : pd.DataFrame
        Table to serialize.
    filename : str or Path-like
        Path of the file to create.
    data_format : {"parquet", "csv", "feather"}, default "parquet"
        Serialization format of the output file.
    delimiter : str, default "|"
        Column separator; only used for CSV output.
    encoding : str, default "utf-8"
        Text encoding; only used for CSV output.
    \**kwargs : Any
        Extra keyword arguments passed through unchanged to the pandas
        writer selected by `data_format`.

    Returns
    -------
    None
        The function only writes to disk.

    Raises
    ------
    ValueError
        If `data_format` is none of the formats listed in
        ``SupportedFileTypes``.
    """
    # Rows that carry no information at all are never written out.
    non_empty = data.dropna(how="all")

    if data_format == "parquet":
        non_empty.to_parquet(filename, engine="pyarrow", compression="snappy", **kwargs)
        return

    if data_format == "feather":
        non_empty.to_feather(filename, **kwargs)
        return

    if data_format != "csv":
        raise ValueError(f"data_format must be one of {get_args(SupportedFileTypes)} not {data_format}.")

    # CSV output always starts a fresh file with a header row and no index column.
    non_empty.to_csv(
        filename,
        index=False,
        header=True,
        mode="w",
        sep=delimiter,
        encoding=encoding,
        **kwargs,
    )


def _write_table(
    data: pd.DataFrame,
    table: str,
    data_format: SupportedFileTypes,
    filename: dict[str, str | Path],
    separator: str | None,
    prefix: str | None,
    suffix: str | None,
    extension: str,
    delimiter: str,
    encoding: str,
    out_dir: Path,
    logger: logging.Logger,
) -> None:
    """
    Write one CDM table to its file on the file system.

    Parameters
    ----------
    data : pd.DataFrame
        Data holding the table to write.
    table : str
        Name of the CDM table.
    data_format : str
        Format of the output data file.
    filename : dict
        Mapping of CDM table names to their output file names. If `table` has
        no (non-empty) entry, a name is built from `prefix`, `table` and `suffix`.
    separator : str
        Separator of file name structure: ``<prefix><separator><table><separator>*<suffix>.<extension>``.
    prefix : str
        Prefix of file name structure: ``<prefix><separator><table><separator>*<suffix>.<extension>``.
    suffix : str
        Suffix of file name structure: ``<prefix><separator><table><separator>*<suffix>.<extension>``.
    extension : str
        Extension of file name structure: ``<prefix><separator><table><separator>*<suffix>.<extension>``.
    delimiter : str
        Field delimiter forwarded to the file writer.
    encoding : str
        Encoding forwarded to the file writer.
    out_dir : Path
        Output directory; prepended when the resolved file name has no
        directory component.
    logger : logging.Logger
        Logger instance used for logging.
    """
    expected_columns = pd.Index(get_cdm_atts(table)[table].keys())

    # Pick the content to write: the sub-frame stored under the table name,
    # the frame itself if its columns are exactly the table's columns, or an
    # empty frame with the table's columns otherwise.
    if table in data:
        table_data = data[table]
    elif data.columns.equals(expected_columns):
        table_data = data
    else:
        table_data = pd.DataFrame(columns=expected_columns)

    # An explicit file name wins; otherwise derive one from the name parts.
    target = filename.get(table) or get_filename(
        [prefix, table, suffix],
        path=out_dir,
        extension=extension,
        separator=separator,
    )
    target = adjust_filename(str(target), table=table, extension=extension)

    # A bare file name (single path component) is placed inside out_dir.
    is_bare_name = len(Path(target).parts) == 1
    if is_bare_name:
        target = out_dir / target

    logger.info("Writing table %s: %s", table, target)
    _table_to_file(
        table_data,
        filename=target,
        data_format=data_format,
        delimiter=delimiter,
        encoding=encoding,
    )



[docs]
def write_tables(
    data: pd.DataFrame,
    data_format: SupportedFileTypes = "parquet",
    out_dir: str | Path | None = None,
    prefix: str | None = None,
    suffix: str | None = None,
    extension: str | None = None,
    filename: str | Path | dict[str, str | Path] | None = None,
    separator: str | None = "-",
    cdm_subset: str | list[str] | None = None,
    col_subset: str | list[str] | dict[str, str] | None = None,
    delimiter: str = "|",
    encoding: str = "utf-8",
    from_str: bool | None = None,
    to_str: bool | None = None,
    imodel: str | None = None,
    **kwargs: Any,
) -> None:
    r"""
    Write pandas.DataFrame to CDM-table file(s) on file system.

    One file is written per selected CDM table. If `data` is empty (after
    applying `col_subset`), a warning is logged and nothing is written.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to export.
    data_format : {"csv", "parquet", "feather"}, default: "parquet"
        Format of output data file(s).
    out_dir : str, optional
        Path to the output directory.
        Defaults to current directory.
    prefix : str, optional
        Prefix of file name structure: ``<prefix><separator><table><separator>*<suffix>.<extension>``.
    suffix : str, optional
        Suffix of file name structure: ``<prefix><separator><table><separator>*<suffix>.<extension>``.
    extension : str, optional
        Extension of file name structure: ``<prefix><separator><table><separator>*<suffix>.<extension>``.
        Defaults to `data_format`.
    filename : str, Path-like or dict, optional
        Name of the output file name(s).
        List one filename for each table name in ``data`` ({<table>:<filename>}).
        A single str or Path is used for every selected table.
        If None, automatically create file name from table name, ``prefix`` and ``suffix``.
    separator : str, optional
        Separator of file name structure: ``<prefix><separator><table><separator>*<suffix>.<extension>``.
    cdm_subset : str or list of str, optional
        Name(s) of the CDM table(s) to write, resolved via ``get_cdm_subset``.
        If `filename` is a dict, its keys replace this selection.
    col_subset : str, list or dict, optional
        Specify the section or sections of the file to write.

        - For multiple sections of the tables:
          e.g col_subset = {table0:[columns0],...tableN:[columnsN]}

        - For a single section:
          e.g. list type object col_subset = [columns]
          This variable assumes that the column names all conform to the cdm field names.
    delimiter : str, default: "|"
        Field delimiter used when writing with df.to_csv.
        This is only relevant if `data_format` is "csv".
    encoding : str
        A string representing the encoding to use in the output file, defaults to utf-8.
        This is only relevant if `data_format` is "csv".
    from_str : bool, optional
        If True convert original string data to `imodel`-specific data types.
    to_str : bool, optional
        If True convert original `imodel`-specific data types to strings.
    imodel : str , optional
        Name of data model, e.g. icoads.
        Must be set if either `from_str` or `to_str` is set.
    \**kwargs : Any
        Additional keyword-arguments that will be ignored.

    Raises
    ------
    ValueError
        If `data_format` is not one of the formats defined by ``SupportedFileTypes``.

    See Also
    --------
    write: Write either MDF data or CDM tables to disk.
    write_data : Write MDF data and validation mask to disk.
    read: Read either original marine-meteorological data or MDF data or CDM tables from disk.
    read_tables : Read CDM tables from disk.
    read_data : Read MDF data and validation mask from disk.
    read_mdf : Read original marine-meteorological data from disk.

    Notes
    -----
    - Use this function after reading CDM tables.
    - `kwargs` will be ignored (a warning is logged if any are passed)!
    """
    logger = logging_hdlr.init_logger(__name__, level="INFO")
    supported_file_types = get_args(SupportedFileTypes)
    if data_format not in supported_file_types:
        raise ValueError(f"data_format must be one of {supported_file_types}, not {data_format}.")

    cdm_subset = get_cdm_subset(cdm_subset)

    if col_subset:
        to_select: str | list[str] | list[tuple[str, str]]
        if isinstance(col_subset, dict):
            # A {table: columns} mapping is converted into the selector
            # form used to index the DataFrame.
            to_select = dict_to_tuple_list(col_subset)
        else:
            # A str/list selects exactly the requested columns; the table
            # names in cdm_subset must not be used as the column selector.
            to_select = col_subset
        data = data[to_select]

    if data.empty:
        logger.warning("All CDM tables are empty")
        return

    if kwargs:
        # Use the function's own logger (not the root logger) so the message
        # is handled like every other message emitted here.
        logger.warning("'kwargs' will be ignored: %s", kwargs)

    if isinstance(filename, dict):
        # Explicit per-table file names define which tables are written.
        cdm_subset = list(filename.keys())
    elif isinstance(filename, (str, Path)):
        filename = {table_name: filename for table_name in cdm_subset}
    elif filename is None:
        filename = {}

    out_dir = Path(out_dir or ".")

    extension = extension or data_format

    if to_str is True:
        data = convert_to_str_df(data.copy(), imodel=imodel, cdm_subset=cdm_subset)

    if from_str is True:
        data = convert_from_str_df(data.copy(), imodel=imodel, cdm_subset=cdm_subset)

    for table in cdm_subset:
        _write_table(
            data,
            table,
            data_format,
            filename,
            separator,
            prefix,
            suffix,
            extension,
            delimiter,
            encoding,
            out_dir,
            logger,
        )