Source code for sbmlsim.data

"""Module handling data (experiment and simulation)."""
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Union

import pandas as pd
from sbmlutils import log

from sbmlsim.combine import mathml
from sbmlsim.units import DimensionalityError, Quantity, UnitRegistry, UnitsInformation
from sbmlsim.xresult import XResult


logger = log.get_logger(__name__)


class Data(object):
    """Data.

    Main data generator class which provides data either from experimental
    datasets, from simulation tasks, or via function calculations. Functions
    allow transformations of data and build a tree of data operations.

    A Data object is a promise for data which is fulfilled with data from
    the tasks of an experiment.
    """

    class Types(Enum):
        """Data types."""

        TASK = 1
        DATASET = 2
        FUNCTION = 3

    class Symbols(Enum):
        """Symbols."""

        TIME = 1
        AMOUNT = 2
        CONCENTRATION = 3

    def __init__(
        self,
        index: str,
        symbol: Optional[Symbols] = None,
        task: str = None,
        dataset: str = None,
        function: str = None,
        variables: Dict[str, "Data"] = None,
        parameters: Dict[str, float] = None,
        sid: str = None,
    ):
        """Construct data."""
        # FIXME: get rid of backwards compatibility
        if not symbol:
            if index.startswith("[") and index.endswith("]"):
                index = index[1:-1]
                symbol = Data.Symbols.CONCENTRATION
                logger.debug(
                    f"Encoding concentration '[{index}]' as 'index={index}' "
                    f"and 'symbol={symbol}'."
                )
            else:
                symbol = Data.Symbols.AMOUNT

        self.index: str = index
        self.symbol: "Symbols" = symbol  # noqa: F821
        self.task_id: str = task
        self.dset_id: str = dataset
        self.function: str = function
        self.variables: Dict[str, "Data"] = variables
        self.parameters: Dict[str, float] = parameters
        self.unit: Optional[str] = None
        self._sid = sid

        if (not self.task_id) and (not self.dset_id) and (not self.function):
            raise ValueError(
                "Either 'task_id', 'dset_id' or 'function' required for Data."
            )
        if self.symbol == Data.Symbols.CONCENTRATION and index.startswith("["):
            raise ValueError(
                "Use index without brackets in combination with "
                "'symbol=concentration'"
            )

    @property
    def selection(self) -> str:
        """Get selection string.

        Depending on symbol, different selections have to be performed.
        """
        if self.symbol and self.symbol == Data.Symbols.CONCENTRATION:
            return f"[{self.index}]"
        return self.index

    def __repr__(self) -> str:
        """Get string."""
        s: str
        if self.is_task():
            s = (
                f"Data(Task|index={self.index}, symbol={self.symbol}, "
                f"task_id={self.task_id})"
            )
        elif self.is_dataset():
            s = (
                f"Data(DataSet|index={self.index}, symbol={self.symbol}, "
                f"dset_id={self.dset_id})"
            )
        elif self.is_function():
            s = (
                f"Data(Function|index={self.index}, symbol={self.symbol}, "
                f"function={self.function})"
            )
        return s

    @property
    def sid(self) -> str:
        """Get id."""
        sid: str
        if self._sid:
            sid = self._sid
        elif self.task_id:
            sid = f"{self.task_id}__{self.index}"
        elif self.dset_id:
            sid = f"{self.dset_id}__{self.index}"
        elif self.function:
            sid = self.index
        return sid

    def is_task(self) -> bool:
        """Check if task."""
        return self.task_id is not None

    def is_dataset(self) -> bool:
        """Check if dataset."""
        return self.dset_id is not None

    def is_function(self) -> bool:
        """Check if function."""
        return self.function is not None

    @property
    def name(self) -> str:
        """Get name."""
        name: str
        dtype = self.dtype
        if dtype in [Data.Types.TASK, Data.Types.DATASET]:
            name = self.index
        elif dtype == Data.Types.FUNCTION:
            if len(self.variables) == 1:
                name = list(self.variables.values())[0].index
            else:
                name = self.index
        return name

    @property
    def dtype(self) -> "Data.Types":
        """Get data type."""
        if self.task_id:
            dtype = Data.Types.TASK
        elif self.dset_id:
            dtype = Data.Types.DATASET
        elif self.function:
            dtype = Data.Types.FUNCTION
        else:
            raise ValueError("DataType could not be determined!")
        return dtype

    # TODO: dimensions, data type
    # TODO: calculations
    # TODO: conversion factors for units, necessary to store
    # TODO: storage of definitions on simulation

    def to_dict(self):
        """Convert to dictionary."""
        # FIXME: ensure that the data is evaluated (via get_data) before
        #  it is serialized. Currently only the plotted variables are
        #  evaluated (-> units can not be resolved for the remainder).
        d = {
            "type": self.dtype,
            "index": self.index,
            "unit": self.unit,
            "task": self.task_id,
            "dataset": self.dset_id,
            "function": self.function,
            "variables": self.variables if self.variables else None,
        }
        return d

    def get_data(
        self, experiment: "SimulationExperiment", to_units: str = None  # noqa: F821
    ) -> Quantity:
        """Return actual data from the data object.

        The data is resolved from the available datasets and the injected
        Experiment.

        :param experiment: SimulationExperiment which resolves the data
        :param to_units: units to convert to
        :return: data as Quantity
        """
        # Necessary to resolve the data
        if self.dtype == Data.Types.DATASET:
            # read dataset data
            if not experiment._datasets:
                experiment._datasets = experiment.datasets()
            dset = experiment._datasets[self.dset_id]
            if not isinstance(dset, DataSet):
                raise ValueError(
                    f"DataSet '{self.dset_id}' is not a DataSet, but "
                    f"type '{type(dset)}'\n"
                    f"{dset}"
                )
            if dset.empty:
                logger.error(f"Adding empty dataset '{dset}' for '{self.dset_id}'.")

            # data with units
            if self.index.endswith("_se") or self.index.endswith("_sd"):
                uindex = self.index[:-3]
            else:
                uindex = self.index

            if self.index not in dset.columns:
                error_msg = (
                    f"Data column with key '{self.index}' does not "
                    f"exist in dataset: '{self.dset_id}'."
                )
                logger.error(error_msg)
                raise KeyError(error_msg)

            try:
                self.unit = dset.uinfo[uindex]
            except KeyError as err:
                logger.error(
                    f"Units missing for key '{uindex}' in dataset: "
                    f"'{self.dset_id}'. Add missing units to dataset."
                )
                raise err
            x = dset[self.index].values * dset.uinfo.ureg(dset.uinfo[uindex])

        elif self.dtype == Data.Types.TASK:
            # read results of task
            xres: XResult = experiment.results[self.task_id]
            if not isinstance(xres, XResult):
                raise ValueError("Only Result objects supported in task data.")

            # units match the symbols
            self.unit = xres.uinfo[self.selection]
            # x = xres.dim_mean(self.index)
            x = xres[self.selection].values * xres.uinfo.ureg(self.unit)

        elif self.dtype == Data.Types.FUNCTION:
            # evaluate with actual data
            astnode = mathml.formula_to_astnode(self.function)
            variables = {}
            for var_key, variable in self.variables.items():
                # lookup via key
                if isinstance(variable, str):
                    variables[var_key] = experiment._data[variable].data
                elif isinstance(variable, Data):
                    variables[var_key] = variable.get_data(experiment=experiment)
            for par_key, par_value in self.parameters.items():
                variables[par_key] = par_value

            x = mathml.evaluate(astnode=astnode, variables=variables)
            self.unit = str(x.units)

        # FIXME: check if this is correct
        # convert units to requested units
        if to_units is not None:
            try:
                x = x.to(to_units)
            except DimensionalityError as err:
                logger.error(
                    f"Could not convert '{str(self)}' to units '{to_units}' with "
                    f"data \n'{x}'"
                )
                raise err
            except AttributeError as err:
                logger.error(
                    f"Could not convert '{str(self)}' with "
                    f"data '{x} ({type(x)})' to "
                    f"units '{to_units}'"
                )
                raise err
        return x
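

# Example (illustrative sketch, not part of the module API): declaring Data
# promises. The task id 'task_glc' and dataset id 'Fig1_data' are
# hypothetical; real ids are defined by the SimulationExperiment which later
# fulfills these promises via `get_data`.
d_time = Data(index="time", task="task_glc")
d_cpep = Data(index="cpep", symbol=Data.Symbols.CONCENTRATION, task="task_glc")
d_mean = Data(index="mean", dataset="Fig1_data")
# function data: evaluates a formula on other Data objects and parameters
d_norm = Data(
    index="cpep_norm",
    function="cpep / f_scale",
    variables={"cpep": d_cpep},
    parameters={"f_scale": 2.0},
)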


class DataSeries(pd.Series):
    """DataSeries - a pd.Series with additional unit information."""

    # additional properties
    _metadata = ["uinfo"]

    @property
    def _constructor(self):
        return DataSeries

    @property
    def _constructor_expanddim(self):
        return DataSet


class DataSet(pd.DataFrame):
    """DataSet.

    pd.DataFrame with additional unit information in the form of
    UnitsInformation.
    """

    # additional properties
    _metadata = ["uinfo", "Q_"]

    @property
    def _constructor(self):
        return DataSet

    @property
    def _constructor_sliced(self):
        return DataSeries

    def get_quantity(self, key: str):
        """Return quantity for given key.

        Requires using the numpy data instead of the series.
        """
        return self.uinfo.ureg.Quantity(
            self[key].values,
            self.uinfo[key],
        )

    def __repr__(self) -> str:
        """Return DataFrame with all columns."""
        pd.set_option("display.max_columns", None)
        s = super().__repr__()
        pd.reset_option("display.max_columns")
        return str(s)

    @classmethod
    def from_df(
        cls, df: pd.DataFrame, ureg: UnitRegistry, udict: Dict[str, str] = None
    ) -> "DataSet":
        """Create DataSet from given pandas.DataFrame.

        The DataFrame can have various formats which should be handled.
        Standard formats are

        1. unit annotations based on '*_unit' columns, with additional
           '*_sd' or '*_se' columns
        2. unit annotations based on a 'unit' column which is applied to the
           'mean', 'value', 'sd' and 'se' columns

        :param df: pandas.DataFrame
        :param ureg: unit registry for the units information
        :param udict: optional external units dictionary
        :return: dataset
        """
        if not isinstance(ureg, UnitRegistry):
            raise ValueError(
                f"ureg must be a UnitRegistry, but '{ureg}' is '{type(ureg)}'"
            )
        if df.empty:
            raise ValueError(f"DataFrame cannot be empty, check DataFrame: {df}")

        if udict is None:
            udict = {}

        # all units from udict and DataFrame
        all_udict: Dict[str, str] = {}

        for key in df.columns:
            # handle '*_unit' columns
            if key.endswith("_unit"):
                # parse the item and unit in dict
                units = df[key].unique()
                if len(units) > 1:
                    logger.error(
                        f"Column '{key}' units are not unique: '{units}' in \n"
                        f"{df}"
                    )
                elif len(units) == 0:
                    logger.error(f"Column '{key}' units are missing: '{units}'")
                item_key = key[0:-5]
                if item_key not in df.columns:
                    logger.error(
                        f"Missing * column '{item_key}' for unit "
                        f"column: '{key}'"
                    )
                else:
                    all_udict[item_key] = units[0]

            elif key == "unit":
                # add unit to "mean", "value" and "median"
                for value_key in ["mean", "value", "median"]:
                    if (value_key in df.columns) and not (
                        f"{value_key}_unit" in df.columns
                    ):
                        # FIXME: probably not a good idea to add columns
                        #  while iterating over them
                        df[f"{value_key}_unit"] = df.unit
                        unit_keys = df.unit.unique()
                        if len(df.unit.unique()) > 1:
                            logger.error(
                                f"More than one unit in 'unit' column will "
                                f"create issues in unit conversion, filter data "
                                f"to reduce units: '{df.unit.unique()}'"
                            )
                        udict[value_key] = unit_keys[0]

                        # rename the sd and se columns to mean_sd and mean_se
                        if value_key == "mean":
                            for err_key in ["sd", "se"]:
                                if (
                                    err_key not in df.columns
                                    and f"mean_{err_key}" in df.columns
                                ):
                                    df[err_key] = df[f"mean_{err_key}"]

                                if f"mean_{err_key}" in df.columns:
                                    # remove existing mean_sd column
                                    del df[f"mean_{err_key}"]
                                    logger.warning(
                                        f"Removing existing column: 'mean_{err_key}' "
                                        f"from DataSet. Column should be named: "
                                        f"'{err_key}'"
                                    )

                                df.rename(
                                    columns={f"{err_key}": f"mean_{err_key}"},
                                    inplace=True,
                                )

                # remove unit column
                del df["unit"]

            elif key in ["count", "n"]:
                # add special units for count
                if f"{key}_unit" not in df.columns:
                    udict[key] = "dimensionless"

        # add external definitions
        if udict:
            for key, unit in udict.items():
                if key in all_udict:
                    logger.error(f"Duplicate unit definition for: '{key}'")
                else:
                    all_udict[key] = unit
                    # add the unit columns to the data frame
                    setattr(df, f"{key}_unit", unit)

        dset = DataSet(df)
        dset.uinfo = UnitsInformation(all_udict, ureg=ureg)
        dset.Q_ = dset.uinfo.ureg.Quantity
        return dset

    def unit_conversion(self, key, factor: Quantity) -> None:
        """Convert the units of the given key in the dataset via `key * factor`.

        Changes values in place in the DataSet.
        The quantity in the dataset is multiplied with the conversion factor.
        In addition to the key, also the respective error measures are
        converted with the same factor, i.e.

        - {key}
        - {key}_sd
        - {key}_se
        - {key}_min
        - {key}_max

        FIXME: in addition base keys should be updated in the table, i.e.
         if key in [mean, median, min, max, sd, se, cv] then the other keys
         should be updated; use default set of keys for automatic conversion

        :param key: column key in dataset (this column is unit converted)
        :param factor: multiplicative Quantity factor for conversion
        :return: None
        """
        if key in self.columns:
            if key not in self.uinfo:
                raise ValueError(
                    f"Unit conversion only possible on keys which have units! "
                    f"No unit defined for key '{key}'"
                )

            # unit conversion and simplification
            new_quantity = self.uinfo.Q_(self[key], self.uinfo[key]) * factor
            new_quantity = new_quantity.to_base_units().to_reduced_units()

            # updated values
            self[key] = new_quantity.magnitude

            # update error measures
            for err_key in [f"{key}_sd", f"{key}_se", f"{key}_min", f"{key}_max"]:
                if err_key in self.columns:
                    # error keys not stored in udict, only the base quantity
                    new_err_quantity = (
                        self.uinfo.Q_(self[err_key], self.uinfo[key]) * factor
                    )
                    new_err_quantity = (
                        new_err_quantity.to_base_units().to_reduced_units()
                    )
                    self[err_key] = new_err_quantity.magnitude

            # updated units
            new_units = new_quantity.units
            new_units_str = (
                str(new_units).replace("**", "^").replace(" ", "")
            )  # '{:~}'.format(new_units)
            self.uinfo[key] = new_units_str

            if f"{key}_unit" in self.columns:
                self[f"{key}_unit"] = new_units_str
        else:
            logger.error(
                f"Key '{key}' not in DataSet, unit conversion not applied: "
                f"'{factor}'"
            )
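

# Example (illustrative sketch, not part of the module API): building a
# DataSet from a plain DataFrame via format 2 (a single 'unit' column) and
# converting units. Values, units and the molar mass factor are hypothetical;
# this assumes sbmlsim's UnitRegistry can be instantiated directly like a
# pint UnitRegistry.
example_df = pd.DataFrame(
    {
        "time": [0.0, 1.0, 2.0],
        "time_unit": ["hr", "hr", "hr"],
        "mean": [0.0, 0.5, 0.8],
        "sd": [0.0, 0.1, 0.1],  # renamed to 'mean_sd' by from_df
        "unit": ["mmole/l", "mmole/l", "mmole/l"],
    }
)
example_dset = DataSet.from_df(df=example_df, ureg=UnitRegistry())
print(example_dset.uinfo["mean"])         # 'mmole/l'
print(example_dset.get_quantity("time"))  # pint Quantity with unit 'hr'

# multiplicative unit conversion of 'mean' (and its error measure 'mean_sd')
# with a hypothetical molar mass of 180 g/mole
example_dset.unit_conversion("mean", factor=example_dset.Q_(180, "g/mole"))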


# @deprecated
def load_pkdb_dataframe(
    sid, data_path: Union[Path, List[Path]], sep="\t", comment="#", **kwargs
) -> pd.DataFrame:
    """Load TSV data from PKDB figure or table id.

    This is a simple helper function for directly loading the TSV data.
    It is recommended to use `pkdb_analysis` methods instead.
    This function will be removed.

    E.g., for 'Amchin1999_Tab1' the file
    data_path / 'Amchin1999' / '.Amchin1999_Tab1.tsv' is loaded.

    :param sid: figure or table id
    :param data_path: base path of data or iterable of data paths
    :param sep: separator
    :param comment: comment characters
    :param kwargs: additional kwargs for csv parsing
    :return: pandas DataFrame
    """
    study = sid.split("_")[0]
    if isinstance(data_path, Path):
        data_path = [data_path]

    for p in data_path:
        path = p / study / f".{sid}.tsv"
        if path.exists():
            # use the first path which exists
            break

    if not path.exists():
        raise ValueError(f"file path not found in data_path: {data_path}")

    try:
        df = pd.read_csv(path, sep=sep, comment=comment, **kwargs)
    except pd.errors.ParserError as err:
        logger.error(f"Could not read DataFrame for '{sid}' at '{path}'.")
        raise err

    # FIXME: handle unnecessary UnitStrippedWarning: The unit of the quantity
    #  is stripped when downcasting to ndarray. At this point we only work
    #  with numpy arrays, units are not important here.
    df = df.dropna(how="all")  # drop all NA rows
    return df
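

# Example (illustrative sketch, not part of the module API): loading the TSV
# for the PKDB table id 'Amchin1999_Tab1' from a hypothetical data directory;
# this reads Path("./data") / 'Amchin1999' / '.Amchin1999_Tab1.tsv'.
df_tab1 = load_pkdb_dataframe(sid="Amchin1999_Tab1", data_path=Path("./data"))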


# @deprecated
def load_pkdb_dataframes_by_substance(
    sid, data_path, **kwargs
) -> Dict[str, pd.DataFrame]:
    """Load DataFrames for the given PKDB figure/table id, split by substance.

    The DataFrame is split on the 'substance' column.
    This is a simple helper function for directly loading the TSV data.
    It is recommended to use `pkdb_analysis` methods instead.
    This function will be removed.

    :param sid: figure or table id
    :param data_path: base path of data
    :param kwargs: additional kwargs for csv parsing
    :return: Dict[substance, pd.DataFrame]
    """
    df = load_pkdb_dataframe(sid=sid, data_path=data_path, na_values=["na"], **kwargs)

    frames = {}
    for substance in df.substance.unique():
        frames[substance] = df.copy()[df.substance == substance]
    return frames
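

# Example (illustrative sketch, not part of the module API): splitting the
# same hypothetical table on its 'substance' column.
frames = load_pkdb_dataframes_by_substance(
    sid="Amchin1999_Tab1", data_path=Path("./data")
)
for substance, df_substance in frames.items():
    print(substance, df_substance.shape)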