# Source code for sbmlsim.combine.sedml.data

"""Reading NUML, CSV and TSV data from DataDescriptions."""
import http.client as httplib
import importlib
import os
import tempfile
from pathlib import Path
from typing import Dict, Optional

import libsbml
import libsedml
import pandas as pd
from sbmlutils import log

from .numl import NumlParser


# Module-level logger shared by all parsers in this file.
logger = log.get_logger(__name__)
class DataDescriptionParser:
    """Class for parsing DataDescriptions.

    Resolves the source of a SED-ML DataDescription (local file or http(s)
    URL), determines its format (NuML, CSV or TSV), loads the data and
    extracts the entries requested by the DataSources.
    """

    # Common prefix of the SED-ML format URNs below.
    FORMAT_URN = "urn:sedml:format:"
    FORMAT_NUML = "urn:sedml:format:numl"
    FORMAT_CSV = "urn:sedml:format:csv"
    FORMAT_TSV = "urn:sedml:format:tsv"
    SUPPORTED_FORMATS = [FORMAT_NUML, FORMAT_CSV, FORMAT_TSV]

    @classmethod
    def parse(
        cls, dd: "libsedml.SedDataDescription", working_dir: Optional[Path] = None
    ) -> Dict[str, pd.Series]:
        """Parse single DataDescription.

        Returns dictionary of data sources {DataSource.id, slice_data}.
        For CSV/TSV slice selections the value is the NumPy array obtained
        from ``DataFrame.values``; otherwise it is a ``pandas.Series``.

        :param dd: SED-ML DataDescription
        :param working_dir: workingDir relative to which the sources are
            resolved; defaults to the current directory
        :return: dictionary of pandas.Series
        :raises NotImplementedError: if the format is not supported
        :raises KeyError: if a CSV/TSV slice refers to a missing column
        """
        # NOTE(review): the reason for reloading libsedml here (and libsbml
        # below) is not visible in this file - confirm before removing.
        importlib.reload(libsedml)
        assert dd.getTypeCode() == libsedml.SEDML_DATA_DESCRIPTION

        source = dd.getSource()

        # -------------------------------
        # Resolve source
        # -------------------------------
        # TODO: refactor in general resource module (for resolving anyURI and resource)
        if working_dir is None:
            working_dir = "."

        tmp_path = None
        if source.lower().startswith(("http://", "https://")):
            tmp_path = cls._download_to_tempfile(source)
            source_path = tmp_path
        else:
            # os.path.join keeps an absolute `source` unchanged
            source_path = os.path.join(working_dir, source)

        try:
            data_sources = cls._parse_source(dd, source_path)
        finally:
            # always remove the downloaded copy, also if parsing failed
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except FileNotFoundError:
                    # already gone, nothing left to clean up
                    pass

        importlib.reload(libsbml)
        return data_sources

    @staticmethod
    def _download_to_tempfile(url: str) -> str:
        """Download an http(s) resource into a temporary file.

        The raw response bytes are written unchanged. The file is created
        with ``delete=False`` so that it can be re-opened by path; the caller
        is responsible for removing it.

        :param url: http or https URL (scheme checked by the caller)
        :return: path of the temporary file
        """
        # local import: urllib.request is not imported at module level
        from urllib.request import urlopen

        with urlopen(url) as response:
            payload = response.read()
        with tempfile.NamedTemporaryFile("wb", delete=False) as tmp_file:
            tmp_file.write(payload)
        return tmp_file.name

    @classmethod
    def _parse_source(
        cls, dd: "libsedml.SedDataDescription", source_path: str
    ) -> Dict[str, pd.Series]:
        """Load the data at ``source_path`` and extract the DataSources.

        :param dd: SED-ML DataDescription
        :param source_path: resolved local path of the data file
        :return: dictionary {DataSource.id: data}
        """
        # -------------------------------
        # Find the format
        # -------------------------------
        data_format = None
        if hasattr(dd, "getFormat"):
            data_format = dd.getFormat()
        data_format = cls._determine_format(
            source_path=source_path, format=data_format
        )
        is_tabular = data_format in (cls.FORMAT_CSV, cls.FORMAT_TSV)

        # log data description
        logger.info("-" * 80)
        logger.info("DataDescription: %s", dd)
        logger.info("\tid: %s", dd.getId())
        logger.info("\tname: %s", dd.getName())
        logger.info("\tsource: %s", dd.getSource())
        logger.info("\tformat: %s", data_format)

        # -------------------------------
        # Parse DimensionDescription
        # -------------------------------
        # FIXME: uses the data_types to check the actual data type
        dim_description = dd.getDimensionDescription()
        if dim_description is not None:
            # Result is currently unused; the call is kept to preserve the
            # original behavior (any exception it raises still propagates).
            NumlParser.parse_dimension_description(
                dim_description, library=NumlParser.Library.LIBSEDML
            )

        # -------------------------------
        # Load complete data
        # -------------------------------
        data = None
        if data_format == cls.FORMAT_CSV:
            data = cls._load_csv(path=source_path)
        elif data_format == cls.FORMAT_TSV:
            data = cls._load_tsv(path=source_path)
        elif data_format == cls.FORMAT_NUML:
            data = NumlParser.load_numl_data(path=source_path)

        # log data
        logger.info("-" * 80)
        logger.info("Data")
        logger.info("-" * 80)
        if is_tabular:
            logger.info(data.head(10))
        elif data_format == cls.FORMAT_NUML:
            # multiple result components via id
            for result in data:
                logger.info(result[0])  # rc id
                logger.info(result[1].head(10))  # DataFrame
        logger.info("-" * 80)

        # -------------------------------
        # Process DataSources
        # -------------------------------
        data_sources = {}
        for ds in dd.getListOfDataSources():
            dsid = ds.getId()
            index_set = ds.getIndexSet()

            # log DataSource
            logger.info("\n\t*** DataSource: %s", ds)
            logger.info("\t\tid: %s", dsid)
            logger.info("\t\tname: %s", ds.getName())
            logger.info("\t\tindexSet: %s", index_set)
            logger.info("\t\tslices")

            # CSV/TSV
            if is_tabular:
                if index_set:
                    # if index set we return the index
                    data_sources[dsid] = pd.Series(data.index.tolist())
                else:
                    # slice values are columns from data frame
                    # FIXME: this does not handle multiple slices for rows
                    sids = [
                        data_slice.getValue()
                        for data_slice in ds.getListOfSlices()
                    ]
                    try:
                        data_sources[dsid] = data[sids].values
                    except KeyError:
                        # something does not fit between data and data sources
                        logger.error("-" * 80)
                        logger.error(f"Format: {data_format}")
                        logger.error(f"Source: {source_path}")
                        logger.error("-" * 80)
                        logger.error(data)
                        logger.error("-" * 80)
                        raise

            # NUML
            elif data_format == cls.FORMAT_NUML:
                # Using the first results component only in SED-ML L1V3
                _rc_id, rc, _rc_data_types = data[0]
                if index_set:
                    # data via indexSet
                    data_sources[dsid] = rc[index_set].drop_duplicates()
                else:
                    # data via slices
                    # NOTE(review): every slice filters the full result
                    # component, so only the last slice determines the result.
                    for data_slice in ds.getListOfSlices():
                        reference = data_slice.getReference()
                        value = data_slice.getValue()
                        df = rc.loc[rc[reference] == value]
                        # select last column with values
                        data_sources[dsid] = df.iloc[:, -1]

        # log data sources
        logger.info("-" * 80)
        logger.info("DataSources")
        logger.info("-" * 80)
        for key, value in data_sources.items():
            logger.info("%s : %s; shape=%s", key, type(value), value.shape)
        logger.info("-" * 80)

        return data_sources

    @classmethod
    def _determine_format(cls, source_path: Path, format: Optional[str] = None) -> str:
        """Determine format of file.

        If no format is given, the start of the file is inspected: content
        starting with ``<`` is treated as NuML, otherwise the file is loaded
        both as CSV and as TSV and the variant yielding more columns wins
        (CSV on a tie). Any format starting with the NuML URN is reduced to
        the base NuML format.

        :param source_path: path of file
        :param format: format given in the DataDescription
        :return: format str
        :raises NotImplementedError: if the format is not supported
        """
        if not format:
            with open(source_path) as unknown_file:
                start_str = unknown_file.read(1024).strip()

            if start_str.startswith("<"):
                # xml format is numl
                format = cls.FORMAT_NUML
            else:
                # format is either csv or tsv
                df_csv = cls._load_csv(source_path)
                df_tsv = cls._load_tsv(source_path)
                if df_csv.shape[1] >= df_tsv.shape[1]:
                    format = cls.FORMAT_CSV
                else:
                    format = cls.FORMAT_TSV

        # base format
        if format.startswith(cls.FORMAT_NUML):
            format = cls.FORMAT_NUML

        # check supported formats
        if format not in cls.SUPPORTED_FORMATS:
            raise NotImplementedError(
                "Format '{}' not supported for DataDescription. Format must be in: {}".format(
                    format, cls.SUPPORTED_FORMATS
                )
            )

        return format

    @classmethod
    def _load_csv(cls, path: Path) -> pd.DataFrame:
        """Read CSV data from file."""
        return cls._load_sv(path, separator=",")

    @classmethod
    def _load_tsv(cls, path: Path) -> pd.DataFrame:
        """Read TSV data from file."""
        return cls._load_sv(path, separator="\t")

    @classmethod
    def _load_sv(cls, path: Path, separator: str) -> pd.DataFrame:
        """Load tsv/csv data from given source.

        CSV files must have a header. Handles file and online resources.
        Blank lines are skipped, ``#`` starts a comment, whitespace after a
        separator is ignored and ``nan`` is read as a missing value.

        :param path: path of the file to read
        :param separator: column separator
        :return: loaded data
        """
        return pd.read_csv(
            path,
            sep=separator,
            index_col=False,
            skip_blank_lines=True,
            quotechar='"',
            comment="#",
            skipinitialspace=True,
            na_values="nan",
        )