"""Reading NUML, CSV and TSV data from DataDescriptions."""
import http.client as httplib
import importlib
import os
import tempfile
import urllib.request
from pathlib import Path
from typing import Dict, Optional

import libsbml
import libsedml
import pandas as pd

from sbmlutils import log

from .numl import NumlParser
# Module-level logger (FIX: removed Sphinx "[docs]" viewcode artifact that
# made this line a syntax error).
logger = log.get_logger(__name__)
class DataDescriptionParser:
    """Class for parsing DataDescriptions.

    Resolves the data source (local file or URL), determines its format
    (CSV, TSV or NuML) and slices the loaded data into the DataSources
    defined on the SED-ML DataDescription.
    """

    # SED-ML format identifiers dispatched on below.
    # NOTE(review): values assumed from the SED-ML L1V3 specification;
    # confirm against `_determine_format` (defined outside this view).
    FORMAT_URN = "urn:sedml:format:"
    FORMAT_NUML = "urn:sedml:format:numl"
    FORMAT_CSV = "urn:sedml:format:csv"
    FORMAT_TSV = "urn:sedml:format:tsv"

    @classmethod
    def parse(
        cls, dd: libsedml.SedDataDescription, working_dir: Optional[Path] = None
    ) -> Dict[str, pd.Series]:
        """Parse single DataDescription.

        Returns dictionary of data sources {DataSource.id, slice_data}

        :param dd: SED-ML DataDescription
        :param working_dir: workingDir relative to which the sources are resolved
        :return: dictionary of pandas.Series
        :raises KeyError: if a slice references a column missing from the data
        """
        # NOTE(review): reload hack — presumably works around global state in
        # the libsedml bindings; confirm before removing.
        importlib.reload(libsedml)

        assert dd.getTypeCode() == libsedml.SEDML_DATA_DESCRIPTION

        did = dd.getId()
        name = dd.getName()
        source = dd.getSource()

        # -------------------------------
        # Resolve source
        # -------------------------------
        # FIXME: this must work for absolute paths and URL paths
        # TODO: refactor in general resource module (for resolving anyURI and resource)
        if working_dir is None:
            working_dir = "."

        tmp_file = None
        if source.lower().startswith("http"):
            # FIX: the previous code passed the complete URL to
            # http.client.HTTPConnection, which expects a bare host and cannot
            # handle HTTPS; urlopen resolves the full URL correctly.
            with urllib.request.urlopen(source) as response:
                data = response.read()
            try:
                file_str = str(data.decode("utf-8"))
            except UnicodeDecodeError:
                file_str = str(data)
            # FIX: delete=False + close() ensures the content is flushed to
            # disk and the path can be re-opened by the loaders below (the
            # previous open, unflushed handle could yield empty reads and
            # cannot be re-opened by name on Windows).
            tmp_file = tempfile.NamedTemporaryFile("w", delete=False)
            tmp_file.write(file_str)
            tmp_file.close()
            source_path = tmp_file.name
        else:
            source_path = os.path.join(working_dir, source)

        try:
            # -------------------------------
            # Find the format
            # -------------------------------
            format_ = None
            if hasattr(dd, "getFormat"):
                format_ = dd.getFormat()
            format_ = cls._determine_format(source_path=source_path, format=format_)

            # Log data description. FIX: use lazy %s formatting — the previous
            # calls passed extra positional args without placeholders, which
            # raises errors inside the logging machinery.
            logger.info("-" * 80)
            logger.info("DataDescription: %s", dd)
            logger.info("\tid: %s", did)
            logger.info("\tname: %s", name)
            logger.info("\tsource: %s", source)
            logger.info("\tformat: %s", format_)

            # -------------------------------
            # Parse DimensionDescription
            # -------------------------------
            # FIXME: use the data_types to check the actual data type
            dim_description = dd.getDimensionDescription()
            data_types = None
            if dim_description is not None:
                data_types = NumlParser.parse_dimension_description(
                    dim_description, library=NumlParser.Library.LIBSEDML
                )

            # -------------------------------
            # Load complete data
            # -------------------------------
            data = None
            if format_ == cls.FORMAT_CSV:
                data = cls._load_csv(path=source_path)
            elif format_ == cls.FORMAT_TSV:
                data = cls._load_tsv(path=source_path)
            elif format_ == cls.FORMAT_NUML:
                data = NumlParser.load_numl_data(path=source_path)

            # log data
            logger.info("-" * 80)
            logger.info("Data")
            logger.info("-" * 80)
            if format_ in [cls.FORMAT_CSV, cls.FORMAT_TSV]:
                logger.info(data.head(10))
            elif format_ == cls.FORMAT_NUML:
                # multiple result components via id
                for result in data:
                    logger.info(result[0])  # rc id
                    logger.info(result[1].head(10))  # DataFrame
            logger.info("-" * 80)

            # -------------------------------
            # Process DataSources
            # -------------------------------
            data_sources = {}
            for ds in dd.getListOfDataSources():
                dsid = ds.getId()

                # log DataSource
                logger.info("\n\t*** DataSource: %s", ds)
                logger.info("\t\tid: %s", ds.getId())
                logger.info("\t\tname: %s", ds.getName())
                logger.info("\t\tindexSet: %s", ds.getIndexSet())
                logger.info("\t\tslices")

                # CSV/TSV
                if format_ in [cls.FORMAT_CSV, cls.FORMAT_TSV]:
                    if ds.getIndexSet():
                        # if index set we return the index
                        data_sources[dsid] = pd.Series(data.index.tolist())
                    else:
                        # slice values are columns from the data frame
                        # FIXME: this does not handle multiple slices for rows
                        sids = [sl.getValue() for sl in ds.getListOfSlices()]
                        try:
                            data_sources[dsid] = data[sids].values
                        except KeyError:
                            # something does not fit between data and data sources
                            logger.error("-" * 80)
                            logger.error("Format: %s", format_)
                            logger.error("Source: %s", source_path)
                            logger.error("-" * 80)
                            logger.error(data)
                            logger.error("-" * 80)
                            raise

                # NUML
                elif format_ == cls.FORMAT_NUML:
                    # Using the first results component only in SED-ML L1V3
                    rc_id, rc, data_types = data[0]
                    index_set = ds.getIndexSet()
                    if index_set:
                        # data via indexSet
                        data_sources[dsid] = rc[index_set].drop_duplicates()
                    else:
                        # data via slices
                        for sl in ds.getListOfSlices():
                            reference = sl.getReference()
                            value = sl.getValue()
                            df = rc.loc[rc[reference] == value]
                            # select last column with values
                            data_sources[dsid] = df.iloc[:, -1]

            # log data sources
            logger.info("-" * 80)
            logger.info("DataSources")
            logger.info("-" * 80)
            for key, value in data_sources.items():
                logger.info("%s : %s; shape=%s", key, type(value), value.shape)
            logger.info("-" * 80)

            return data_sources
        finally:
            # Cleanup now runs in `finally` (resolves the original FIXME).
            # FIX: the previous `os.remove(tmp_file)` passed the file object
            # instead of its path, raising TypeError.
            if tmp_file is not None:
                os.remove(tmp_file.name)
            # NOTE(review): reload hack as above — presumably resets libsbml
            # global state; confirm before removing.
            importlib.reload(libsbml)
@classmethod
@classmethod
[docs] def _load_csv(cls, path: Path) -> pd.DataFrame:
"""Read CSV data from file."""
return cls._load_sv(path, separator=",")
@classmethod
[docs] def _load_tsv(cls, path: Path) -> pd.DataFrame:
"""Read TSV data from file."""
return cls._load_sv(path, separator="\t")
@classmethod
[docs] def _load_sv(cls, path: Path, separator: str) -> pd.DataFrame:
"""Load tsv/csv data from given source.
CSV files must have a header. Handles file and online resources.
"""
df = pd.read_csv(
path,
sep=separator,
index_col=False,
skip_blank_lines=True,
quotechar='"',
comment="#",
skipinitialspace=True,
na_values="nan",
)
return df