Source code for tsgm.dataset

import typing as T
import logging
import numpy as np

import tsgm.types


logger = logging.getLogger('dataset')
logger.setLevel(logging.DEBUG)


class DatasetProperties:
    """
    Stores the properties of a dataset. Along with the dimensions, it can store properties of the covariates.
    """
    def __init__(self, N: int, D: int, T: int, variables: T.Optional[T.List] = None) -> None:
        """
        :param N: The number of samples.
        :type N: int
        :param D: The number of dimensions.
        :type D: int
        :param T: The number of timesteps.
        :type T: int
        :param variables: The properties of each covariate.
        :type variables: typing.Optional[typing.List]
        """
        self.N = N
        self.D = D
        self.T = T
        self._variables = variables

        assert variables is None or self.D == len(variables)

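# Illustrative sketch (added for documentation, not part of the original tsgm.dataset
# module): DatasetProperties only records shape metadata. The covariate names below
# are invented for the example; when `variables` is provided it must contain exactly
# D entries.
def _dataset_properties_example() -> DatasetProperties:
    # 100 samples, 3 covariates, 24 timesteps.
    return DatasetProperties(N=100, D=3, T=24, variables=["temp", "humidity", "pressure"])
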
class Dataset(DatasetProperties):
    """
    Wrapper for time-series datasets. Additional information is stored in the `metadata` field.
    """
    def __init__(self, x: tsgm.types.Tensor, y: tsgm.types.Tensor, metadata: T.Optional[T.Dict] = None):
        """
        :param x: The matrix of time series with dimensions N x D x T.
        :type x: tsgm.types.Tensor
        :param y: The labels of the time series.
        :type y: tsgm.types.Tensor
        :param metadata: Additional info for the dataset.
        :type metadata: typing.Optional[typing.Dict]
        """
        self._x = x
        self._y = y
        assert self._y is None or self._x.shape[0] == self._y.shape[0]

        self._metadata = metadata or {}
        self._graph = self._metadata.get("graph")
        super().__init__(N=self._x.shape[0], D=self._x.shape[1], T=self._x.shape[2])

    @property
    def X(self) -> tsgm.types.Tensor:
        """
        Returns the time series tensor in format: n_samples x seq_len x feat_dim.
        """
        return self._x

    @property
    def y(self) -> tsgm.types.Tensor:
        """
        Returns labels tensor.
        """
        return self._y

    @property
    def Xy(self) -> tuple:
        """
        Returns a tuple of a time series tensor and labels tensor.
        """
        return self._x, self._y

    @property
    def Xy_concat(self) -> tsgm.types.Tensor:
        """
        Returns the time series and labels concatenated in one tensor. Output shape is n_samples x seq_len x (feat_dim + y_dim).
        """
        if self._y is None:
            return self._x
        elif len(self._y.shape) == 1:
            return np.concatenate((self._x, np.repeat(self._y[:, None, None], self._x.shape[1], axis=1)), axis=2)
        elif len(self._y.shape) == 2:
            if self._y.shape[1] == 1:
                return np.concatenate((self._x, np.repeat(self._y[:, :, None], self._x.shape[1], axis=1)), axis=2)
            elif self._y.shape[1] == self._x.shape[1]:
                return np.concatenate((self._x, self._y[:, :, None]), axis=2)
            else:
                return np.concatenate((self._x, np.repeat(self._y[:, None, :], self._x.shape[1], axis=1)), axis=2)
        else:
            raise ValueError("X & y are not compatible for Xy_concat operation")

    def _compatible(self, other_ds: "Dataset") -> bool:
        if self.X.shape[1:] == other_ds.X.shape[1:]:
            return (self.y is None and other_ds.y is None) or self.y.shape[1:] == other_ds.y.shape[1:]
        else:
            return False

    def _merge_meta(self, other_meta: dict) -> dict:
        return {**self._metadata, **other_meta}

    def _concatenate_dataset(self, other_ds: "Dataset") -> "Dataset":
        assert self._compatible(other_ds)
        return Dataset(
            np.concatenate((self.X, other_ds.X), axis=0),
            np.concatenate((self.y, other_ds.y), axis=0) if self.y is not None else None,
            self._merge_meta(other_ds._metadata)
        )

    def __add__(self, other_ds: "Dataset") -> "Dataset":
        """
        Concatenates two compatible datasets along the sample axis and returns the result as a new `Dataset`.
        """
        assert self._compatible(other_ds)
        logger.warning("Operator '+' concatenates dataset objects")
        return self._concatenate_dataset(other_ds)

    def __or__(self, other_ds: "Dataset") -> "Dataset":
        return self._concatenate_dataset(other_ds)

    @property
    def shape(self) -> tuple:
        """
        Returns the shape of the time series in the dataset.
        """
        return self.X.shape

    def __len__(self) -> int:
        return self.X.shape[0]

    @property
    def seq_len(self) -> int:
        """
        Returns the length of sequences in the dataset.
        """
        return self.X.shape[1]

    @property
    def feat_dim(self) -> int:
        """
        Returns the size of feature dimension in the time series.
        """
        return self.X.shape[2]

    @property
    def output_dim(self) -> int:
        """
        Returns the number of classes in the dataset.
        """
        output_dim = len(set(self.y))
        if output_dim > len(self.y) * 0.5:
            logger.warning("Either the number of classes is huge or it is not a classification dataset")
        return output_dim
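
# Illustrative usage sketch (added for documentation, not part of the original
# tsgm.dataset module): the function below shows one way the Dataset wrapper
# might be exercised with a toy random tensor. All names and shapes here are
# invented for the example.
def _dataset_usage_example() -> None:
    x = np.random.random((100, 24, 3))   # 100 samples, seq_len 24, 3 features
    y = np.random.randint(0, 2, 100)     # one binary label per sample
    ds = Dataset(x, y, metadata={"source": "synthetic-example"})

    assert ds.shape == (100, 24, 3)
    assert ds.seq_len == 24 and ds.feat_dim == 3
    assert ds.Xy_concat.shape == (100, 24, 4)  # label is broadcast along the time axis

    # "+" (or "|") concatenates two compatible datasets along the sample axis.
    doubled = ds + ds
    assert len(doubled) == 200
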
DatasetOrTensor = T.Union[Dataset, tsgm.types.Tensor]
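
# Illustrative sketch (hypothetical helper, not part of the original module):
# DatasetOrTensor is the alias used for arguments that may be either a Dataset
# or a raw tensor. A helper that normalizes such an argument could look like this.
def _as_tensor_example(data: DatasetOrTensor) -> tsgm.types.Tensor:
    # Unwrap a Dataset to its underlying tensor; pass raw tensors through unchanged.
    return data.X if isinstance(data, Dataset) else data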