Source code for sbmlsim.data

"""Module handling data (experiment and simulation)."""
import logging
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd

from sbmlsim.combine import mathml
from sbmlsim.result import XResult
from sbmlsim.units import DimensionalityError, Quantity, UnitRegistry, UnitsInformation
from sbmlsim.utils import deprecated


logger = logging.getLogger(__name__)


class Data(object):
    """Data.

    Main data generator class which uses data either from experimental
    data, simulations, or function calculations. Handles transformations
    of data and trees of data operations.
    """

    class Types(Enum):
        """Data types."""

        TASK = 1
        DATASET = 2
        FUNCTION = 3

    def __init__(
        self,
        experiment,
        index: str,
        task: str = None,
        dataset: str = None,
        function=None,
        variables=None,
    ):
        """Construct data."""
        self.experiment = experiment
        self.index: str = index
        self.task_id: str = task
        self.dset_id: str = dataset
        self.function = function
        self.variables = variables
        self.unit: Optional[str] = None

        if (not self.task_id) and (not self.dset_id) and (not self.function):
            raise ValueError(
                "Either 'task_id', 'dset_id' or 'function' required for Data."
            )

        # register data in simulation
        self._register_data()

    def _register_data(self):
        """Register data in simulation."""
        # FIXME: this creates strange issues
        if self.experiment:
            if self.experiment._data is None:
                self.experiment._data = {}
            self.experiment._data[self.sid] = self
        else:
            logger.error("No experiment for data, registration failed.")

    def __str__(self) -> str:
        """Get string."""
        s: str
        if self.is_task():
            s = f"Data(index={self.index}, task_id={self.task_id})|Task"
        elif self.is_dataset():
            s = f"Data(index={self.index}, dset_id={self.dset_id})|DataSet"
        elif self.is_function():
            s = f"Data(index={self.index}, function={self.function})|Function"
        return s

    @property
    def sid(self) -> str:
        """Get id."""
        sid: str
        if self.task_id:
            sid = f"{self.task_id}__{self.index}"
        elif self.dset_id:
            sid = f"{self.dset_id}__{self.index}"
        elif self.function:
            sid = self.index
        return sid

    def is_task(self) -> bool:
        """Check if data is from a task."""
        return self.task_id is not None

    def is_dataset(self) -> bool:
        """Check if data is from a dataset."""
        return self.dset_id is not None

    def is_function(self) -> bool:
        """Check if data is calculated via a function."""
        return self.function is not None

    @property
    def dtype(self) -> "Data.Types":
        """Get data type."""
        if self.task_id:
            dtype = Data.Types.TASK
        elif self.dset_id:
            dtype = Data.Types.DATASET
        elif self.function:
            dtype = Data.Types.FUNCTION
        else:
            raise ValueError("DataType could not be determined!")
        return dtype

    # TODO: dimensions, data type
    # TODO: calculations
    # TODO: conversion factors for units, necessary to store
    # TODO: storage of definitions on simulation

    def to_dict(self):
        """Convert to dictionary."""
        # FIXME: ensure that the data is evaluated (via get_data) before
        # it is serialized. Currently only the plotted variables are
        # evaluated (-> units can not be resolved for the remainder).
        d = {
            "type": self.dtype,
            "index": self.index,
            "unit": self.unit,
            "task": self.task_id,
            "dataset": self.dset_id,
            "function": self.function,
            "variables": self.variables if self.variables else None,
        }
        return d

    def get_data(self, to_units: str = None):
        """Return actual data from the data object.

        :param to_units: units to convert to
        :return: data as pint Quantity
        """
        # necessary to resolve the data
        if self.dtype == Data.Types.DATASET:
            # read dataset data
            dset = self.experiment._datasets[self.dset_id]
            if not isinstance(dset, DataSet):
                raise ValueError(
                    f"DataSet '{self.dset_id}' is not a DataSet, but "
                    f"type '{type(dset)}'\n{dset}"
                )
            if dset.empty:
                logger.error(f"Adding empty dataset '{dset}' for '{self.dset_id}'.")

            # data with units; error columns ('*_se', '*_sd') use the
            # units of the base column
            if self.index.endswith("_se") or self.index.endswith("_sd"):
                uindex = self.index[:-3]
            else:
                uindex = self.index
            if self.index not in dset.columns:
                error_msg = (
                    f"Data column with key '{self.index}' does not "
                    f"exist in dataset: '{self.dset_id}'."
                )
                logger.error(error_msg)
                raise KeyError(error_msg)
            try:
                self.unit = dset.uinfo[uindex]
            except KeyError as err:
                logger.error(
                    f"Units missing for key '{uindex}' in dataset: "
                    f"'{self.dset_id}'. Add missing units to dataset."
                )
                raise err
            x = dset[self.index].values * dset.uinfo.ureg(dset.uinfo[uindex])

        elif self.dtype == Data.Types.TASK:
            # read results of task
            xres: XResult = self.experiment.results[self.task_id]
            if not isinstance(xres, XResult):
                raise ValueError("Only XResult objects supported in task data.")
            self.unit = xres.uinfo[self.index]
            # FIXME: complete data must be kept
            x = xres.dim_mean(self.index)

        elif self.dtype == Data.Types.FUNCTION:
            # evaluate function with actual data
            astnode = mathml.formula_to_astnode(self.function)
            variables = {}
            for k, v in self.variables.items():
                if isinstance(v, str):
                    # lookup via key
                    variables[k] = self.experiment._data[v].data
                elif isinstance(v, Data):
                    variables[k] = v.data
            x = mathml.evaluate(astnode=astnode, variables=variables)
            self.unit = str(x.units)  # check if this is correct

        # convert units to requested units
        if to_units is not None:
            try:
                x = x.to(to_units)
            except (DimensionalityError, AttributeError) as err:
                logger.error(
                    f"Could not convert '{str(self)}' with "
                    f"data '{x} ({type(x)})' to units '{to_units}'"
                )
                raise err
        return x

    data = property(get_data)


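# A minimal usage sketch for Data (not executed here). The names `exp`,
# "task_glc" and "dset_glc" are hypothetical and assume a simulation
# experiment with a registered task and dataset:
#
#     d_task = Data(experiment=exp, index="[glc]", task="task_glc")
#     d_dset = Data(experiment=exp, index="mean", dataset="dset_glc")
#     # function data combining other Data objects via a math expression
#     d_ratio = Data(
#         experiment=exp,
#         index="ratio",
#         function="x / y",
#         variables={"x": d_task, "y": d_dset},
#     )
#     quantity = d_task.get_data(to_units="mM")  # pint Quantity

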
class DataFunction(object):
    """Functional data calculation.

    The idea is to provide an object which can calculate a generic math
    function based on given input symbols. An important challenge is to
    handle the correct functional evaluation.
    """

    def __init__(self, index, formula, variables):
        self.index = index
        self.formula = formula
        self.variables = variables


class DataSeries(pd.Series):
    """DataSeries - a pd.Series with additional unit information."""

    # additional properties
    _metadata = ["uinfo"]

    @property
    def _constructor(self):
        return DataSeries

    @property
    def _constructor_expanddim(self):
        return DataSet


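# Note (sketch): the `_constructor`/`_constructor_expanddim` properties
# implement the pandas subclassing protocol, so that pandas operations
# return the subclass instead of plain pd.Series/pd.DataFrame, e.g.
#
#     s = DataSeries([1.0, 2.0])
#     type(s[:1])         # DataSeries, via _constructor
#     type(s.to_frame())  # DataSet, via _constructor_expanddim

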
class DataSet(pd.DataFrame):
    """DataSet.

    pd.DataFrame with additional unit information in the form of
    UnitsInformation.
    """

    # additional properties
    _metadata = ["uinfo", "Q_"]

    @property
    def _constructor(self):
        return DataSet

    @property
    def _constructor_sliced(self):
        return DataSeries

    def get_quantity(self, key: str):
        """Return quantity for given key.

        Requires using the numpy data instead of the series.
        """
        return self.uinfo.ureg.Quantity(
            self[key].values,
            self.uinfo[key],
        )

    def __repr__(self) -> str:
        """Return DataFrame representation with all columns."""
        pd.set_option("max_columns", None)
        s = super().__repr__()
        pd.reset_option("max_columns")
        return str(s)

    @classmethod
    def from_df(
        cls, df: pd.DataFrame, ureg: UnitRegistry, udict: Dict[str, str] = None
    ) -> "DataSet":
        """Create DataSet from given pandas.DataFrame.

        The DataFrame can have various formats which should be handled.
        Standard formats are

        1. unit annotations based on '*_unit' columns, with additional
           '*_sd' or '*_se' columns
        2. unit annotations based on a 'unit' column which is applied to
           the 'mean', 'value', 'sd' and 'se' columns

        :param df: pandas.DataFrame
        :param ureg: unit registry
        :param udict: optional dictionary of external unit definitions
        :return: dataset
        """
        if not isinstance(ureg, UnitRegistry):
            raise ValueError(
                f"ureg must be a UnitRegistry, but '{ureg}' is '{type(ureg)}'"
            )
        if df.empty:
            raise ValueError(f"DataFrame cannot be empty, check DataFrame: {df}")

        if udict is None:
            udict = {}

        # all units from udict and DataFrame
        all_udict: Dict[str, str] = {}

        for key in df.columns:
            if key.endswith("_unit"):
                # handle '*_unit' columns: parse the item and unit into dict
                units = df[key].unique()
                if len(units) > 1:
                    logger.error(
                        f"Column '{key}' units are not unique: '{units}' in \n"
                        f"{df}"
                    )
                elif len(units) == 0:
                    logger.error(f"Column '{key}' units are missing: '{units}'")
                    print(df.head())
                item_key = key[0:-5]
                if item_key not in df.columns:
                    logger.error(
                        f"Missing * column '{item_key}' for unit "
                        f"column: '{key}'"
                    )
                else:
                    all_udict[item_key] = units[0]

            elif key == "unit":
                # add unit to 'mean', 'value' and 'median' columns
                for key in ["mean", "value", "median"]:
                    if (key in df.columns) and (f"{key}_unit" not in df.columns):
                        # FIXME: probably not a good idea to add columns
                        # while iterating over them
                        df[f"{key}_unit"] = df.unit
                        unit_keys = df.unit.unique()
                        if len(unit_keys) > 1:
                            logger.error(
                                f"More than one unit in 'unit' column will create "
                                f"issues in unit conversion, filter data to reduce "
                                f"units: '{df.unit.unique()}'"
                            )
                        udict[key] = unit_keys[0]

                        # rename the sd and se columns to mean_sd and mean_se
                        if key == "mean":
                            for err_key in ["sd", "se"]:
                                if f"mean_{err_key}" in df.columns:
                                    # remove existing mean_sd/mean_se column
                                    del df[f"mean_{err_key}"]
                                    logger.warning(
                                        f"Removing existing column: "
                                        f"`mean_{err_key}` from DataSet."
                                    )
                                df.rename(
                                    columns={f"{err_key}": f"mean_{err_key}"},
                                    inplace=True,
                                )

                # remove unit column
                del df["unit"]

            elif key in ["count", "n"]:
                # add special units for count
                if f"{key}_unit" not in df.columns:
                    udict[key] = "dimensionless"

        # add external definitions
        if udict:
            for key, unit in udict.items():
                if key in all_udict:
                    logger.error(f"Duplicate unit definition for: '{key}'")
                else:
                    all_udict[key] = unit
                    # add the unit columns to the data frame
                    setattr(df, f"{key}_unit", unit)

        dset = DataSet(df)
        dset.uinfo = UnitsInformation(all_udict, ureg=ureg)
        dset.Q_ = dset.uinfo.ureg.Quantity
        return dset

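    # A minimal sketch of `from_df` using '*_unit' columns; `ureg` is
    # assumed to be an existing UnitRegistry:
    #
    #     df = pd.DataFrame({
    #         "time": [0, 10, 20], "time_unit": "min",
    #         "cpep": [0.4, 1.2, 0.9], "cpep_unit": "nmol/l",
    #     })
    #     dset = DataSet.from_df(df=df, ureg=ureg)
    #     dset.uinfo["cpep"]         # 'nmol/l'
    #     dset.get_quantity("cpep")  # pint Quantity carrying units
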
    def unit_conversion(self, key, factor: Quantity) -> None:
        """Convert the units of the given key in the dataset.

        Changes values in place in the DataSet. The quantity in the dataset
        is multiplied with the conversion factor. In addition to the key,
        the respective error measures are converted with the same factor,
        i.e.

        - {key}
        - {key}_sd
        - {key}_se
        - {key}_min
        - {key}_max

        FIXME: in addition base keys should be updated in the table, i.e.
        if key in [mean, median, min, max, sd, se, cv] then the other keys
        should be updated; use default set of keys for automatic conversion

        :param key: column key in dataset (this column is unit converted)
        :param factor: multiplicative Quantity factor for conversion
        :return: None
        """
        if key in self.columns:
            if key not in self.uinfo:
                raise ValueError(
                    f"Unit conversion only possible on keys which have units! "
                    f"No unit defined for key '{key}'"
                )

            # unit conversion and simplification
            new_quantity = self.uinfo.Q_(self[key], self.uinfo[key]) * factor
            new_quantity = new_quantity.to_base_units().to_reduced_units()

            # update values
            self[key] = new_quantity.magnitude

            # update error measures
            for err_key in [f"{key}_sd", f"{key}_se", f"{key}_min", f"{key}_max"]:
                if err_key in self.columns:
                    # error keys are not stored in uinfo, only the base quantity
                    new_err_quantity = (
                        self.uinfo.Q_(self[err_key], self.uinfo[key]) * factor
                    )
                    new_err_quantity = (
                        new_err_quantity.to_base_units().to_reduced_units()
                    )
                    self[err_key] = new_err_quantity.magnitude

            # update units
            new_units = new_quantity.units
            new_units_str = (
                str(new_units).replace("**", "^").replace(" ", "")
            )  # '{:~}'.format(new_units)
            self.uinfo[key] = new_units_str
            if f"{key}_unit" in self.columns:
                self[f"{key}_unit"] = new_units_str
        else:
            logger.error(
                f"Key '{key}' not in DataSet, unit conversion not applied: "
                f"'{factor}'"
            )


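# A sketch of in-place unit conversion on a DataSet column: the column
# (and any '*_sd'/'*_se'/'*_min'/'*_max' error columns) is multiplied by
# the factor and simplified. The molar mass value below is hypothetical:
#
#     dset.unit_conversion("cpep", factor=dset.Q_(3618, "g/mole"))

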
# @deprecated
def load_pkdb_dataframe(
    sid, data_path: Union[Path, List[Path]], sep="\t", comment="#", **kwargs
) -> pd.DataFrame:
    """Load TSV data for a PKDB figure or table id.

    This is a simple helper function for directly loading the TSV data.
    It is recommended to use `pkdb_analysis` methods instead.
    This function will be removed.

    E.g., for 'Amchin1999_Tab1' the file
    data_path / 'Amchin1999' / '.Amchin1999_Tab1.tsv' is loaded.

    :param sid: figure or table id
    :param data_path: base path of data or iterable of data paths
    :param sep: separator
    :param comment: comment characters
    :param kwargs: additional kwargs for csv parsing
    :return: pandas DataFrame
    """
    study = sid.split("_")[0]
    if isinstance(data_path, Path):
        data_path = [data_path]
    for p in data_path:
        path = p / study / f".{sid}.tsv"
        if path.exists():
            # use the first path which exists
            break
    if not path.exists():
        raise ValueError(f"file path not found in data_path: {data_path}")

    df = pd.read_csv(path, sep=sep, comment=comment, **kwargs)
    # FIXME: handle UnitStrippedWarning ("The unit of the quantity is
    # stripped when downcasting to ndarray"); at this point we only work
    # with numpy arrays, units are not important here
    df = df.dropna(how="all")  # drop all NA rows
    return df

# @deprecated
def load_pkdb_dataframes_by_substance(
    sid, data_path, **kwargs
) -> Dict[str, pd.DataFrame]:
    """Load DataFrames for given PKDB figure/table id, split by substance.

    The DataFrame is split on the 'substance' column. This is a simple
    helper function for directly loading the TSV data. It is recommended
    to use `pkdb_analysis` methods instead. This function will be removed.

    :param sid: figure or table id
    :param data_path: base path of data
    :param kwargs: additional kwargs for csv parsing
    :return: Dict[substance, pd.DataFrame]
    """
    df = load_pkdb_dataframe(sid=sid, data_path=data_path, na_values=["na"], **kwargs)
    frames = {}
    for substance in df.substance.unique():
        frames[substance] = df.copy()[df.substance == substance]
    return frames


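# A usage sketch, assuming a PKDB-style TSV with a 'substance' column at
# data_path / 'Amchin1999' / '.Amchin1999_Tab1.tsv':
#
#     frames = load_pkdb_dataframes_by_substance(
#         "Amchin1999_Tab1", data_path=Path("./data")
#     )
#     df_caffeine = frames["caffeine"]  # hypothetical substance key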