From 125c078d199df75c55f02df298530ee526881f2a Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 12 Apr 2022 20:31:39 -0600 Subject: [PATCH 01/26] added methods ``from_longitudinal_dataframe` and `add_static_covariates` --- darts/timeseries.py | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/darts/timeseries.py b/darts/timeseries.py index 8f31a8d80f..a71f82f5ec 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -46,6 +46,7 @@ DIMS = ("time", "component", "sample") VALID_INDEX_TYPES = (pd.DatetimeIndex, pd.RangeIndex) +STATIC_COV_TAG = "static_covariates" class TimeSeries: @@ -496,6 +497,73 @@ def from_dataframe( fillna_value=fillna_value, ) + @classmethod + def from_longitudinal_dataframe( + cls, + df: pd.DataFrame, + group_cols: Union[List[str], str], + time_col: Optional[str] = None, + value_cols: Optional[Union[List[str], str]] = None, + fill_missing_dates: Optional[bool] = False, + freq: Optional[str] = None, + fillna_value: Optional[float] = None, + ) -> Union["TimeSeries", List["TimeSeries"]]: + """ + Build a list of TimeSeries instances grouped by a selection of columns from a DataFrame. + One column (or the DataFrame index) has to represent the time, + a list of columns `group_cols` must be used for extracting the individual TimeSeries by groups, + and a list of columns `value_cols` has to represent the values for the individual time series. + + Parameters + ---------- + df + The DataFrame + group_cols + A string or list of strings representing the columns from the DataFrame by which to extract the + individual TimeSeries groups. + time_col + The time column name. If set, the column will be cast to a pandas DatetimeIndex. + If not set, the DataFrame index will be used. In this case the DataFrame must contain an index that is + either a pandas DatetimeIndex or a pandas RangeIndex. 
If a DatetimeIndex is + used, it is better if it has no holes; alternatively setting `fill_missing_dates` can in some cases solve + these issues (filling holes with NaN, or with the provided `fillna_value` numeric value, if any). + value_cols + A string or list of strings representing the value column(s) to be extracted from the DataFrame. If set to + `None`, the whole DataFrame will be used. + fill_missing_dates + Optionally, a boolean value indicating whether to fill missing dates with NaN values. This requires + either a provided `freq` or the possibility to infer the frequency from the provided timestamps. See + :meth:`_fill_missing_dates() <darts.timeseries.TimeSeries._fill_missing_dates>` for more info. + freq + Optionally, a string representing the frequency of the Pandas DateTimeIndex. This is useful in order to + fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`. + fillna_value + Optionally, a numeric value to fill missing values (NaNs) with. + + Returns + ------- + TimeSeries + A univariate or multivariate deterministic TimeSeries constructed from the inputs.
+ """ + # split df by groups and store group values (static covariates) + splits = [ + (pd.Series(static_covs, index=group_cols), group.drop(columns=group_cols)) + for static_covs, group in df.groupby(group_cols) + ] + + # create a list with multiple TimeSeries and add static covariates + return [ + TimeSeries.from_dataframe( + df=split, + time_col=time_col, + value_cols=value_cols, + fill_missing_dates=fill_missing_dates, + freq=freq, + fillna_value=fillna_value, + ).add_static_covariates(static_covs) + for static_covs, split in splits + ] + @classmethod def from_series( cls, @@ -705,6 +773,10 @@ def from_pickle(cls, path: str) -> "TimeSeries": ========== """ + @property + def static_covariates(self): + return self._xa.attrs.get(STATIC_COV_TAG, None) + @property def n_samples(self): """Number of samples contained in the series.""" @@ -2011,6 +2083,10 @@ def with_values(self, values: np.ndarray) -> "TimeSeries": return self.__class__(new_xa) + def add_static_covariates(self, covariates: pd.Series): + self._xa.attrs["static_covariates"] = covariates + return self + def stack(self, other: "TimeSeries") -> "TimeSeries": """ Stacks another univariate or multivariate TimeSeries with the same time index on top of From fde974e68908282ecdea0caa87ce14107a944306 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sat, 23 Apr 2022 11:39:40 -0600 Subject: [PATCH 02/26] dataset adaption for static covs --- darts/models/forecasting/tft_model.py | 7 ++-- .../forecasting/torch_forecasting_model.py | 2 +- darts/utils/data/sequential_dataset.py | 32 +++++++++++++++---- darts/utils/data/shifted_dataset.py | 11 +++++-- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/darts/models/forecasting/tft_model.py b/darts/models/forecasting/tft_model.py index 0282e3dc97..0a23b75301 100644 --- a/darts/models/forecasting/tft_model.py +++ b/darts/models/forecasting/tft_model.py @@ -31,7 +31,7 @@ logger = get_logger(__name__) MixedCovariatesTrainTensorType = Tuple[ - torch.Tensor, 
torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor ] @@ -415,7 +415,7 @@ def forward(self, x: Tuple[torch.Tensor, Optional[torch.Tensor]]) -> torch.Tenso # Embedding and variable selection if static_covariates is not None: - # TODO: impelement static covariates + # TODO: implement static covariates # # static embeddings will be constant over entire batch # static_embedding = {name: input_vectors[name][:, 0] for name in self.static_variables} # static_embedding, static_covariate_var = self.static_covariates_vsn(static_embedding) @@ -775,6 +775,7 @@ def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Modu historic_future_covariate, future_covariate, future_target, + static_covariates, ) = train_sample # add a covariate placeholder so that relative index will be included @@ -800,7 +801,7 @@ def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Modu axis=1, ) - static_covariates = None # placeholder for future + # static_covariates = None # placeholder for future self.output_dim = ( (future_target.shape[1], 1) diff --git a/darts/models/forecasting/torch_forecasting_model.py b/darts/models/forecasting/torch_forecasting_model.py index d1423f1b1c..d8eca0c486 100644 --- a/darts/models/forecasting/torch_forecasting_model.py +++ b/darts/models/forecasting/torch_forecasting_model.py @@ -846,7 +846,7 @@ def fit_from_dataset( train_sample = train_dataset[0] if self.model is None: # Build model, based on the dimensions of the first series in the train set. - self.train_sample, self.output_dim = train_sample, train_sample[-1].shape[1] + self.train_sample, self.output_dim = train_sample, train_sample[-2].shape[1] self._init_model(trainer) else: # Check existing model has input/output dims matching what's provided in the training set. 
diff --git a/darts/utils/data/sequential_dataset.py b/darts/utils/data/sequential_dataset.py index dc23bd14ba..418adee3a1 100644 --- a/darts/utils/data/sequential_dataset.py +++ b/darts/utils/data/sequential_dataset.py @@ -233,10 +233,22 @@ def __len__(self): def __getitem__( self, idx - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: - past_target, past_covariate, future_target = self.ds_past[idx] - _, future_covariate, _ = self.ds_future[idx] - return past_target, past_covariate, future_covariate, future_target + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + np.ndarray, + Optional[np.ndarray], + ]: + past_target, past_covariate, future_target, static_covariate = self.ds_past[idx] + _, future_covariate, _, static_covariate = self.ds_future[idx] + return ( + past_target, + past_covariate, + future_covariate, + future_target, + static_covariate, + ) class MixedCovariatesSequentialDataset(MixedCovariatesTrainingDataset): @@ -322,16 +334,24 @@ def __getitem__( Optional[np.ndarray], Optional[np.ndarray], np.ndarray, + Optional[np.ndarray], ]: - past_target, past_covariate, future_target = self.ds_past[idx] - _, historic_future_covariate, future_covariate, _ = self.ds_dual[idx] + past_target, past_covariate, future_target, static_covariate = self.ds_past[idx] + ( + _, + historic_future_covariate, + future_covariate, + _, + static_covariate, + ) = self.ds_dual[idx] return ( past_target, past_covariate, historic_future_covariate, future_covariate, future_target, + static_covariate, ) diff --git a/darts/utils/data/shifted_dataset.py b/darts/utils/data/shifted_dataset.py index b79c26533d..063de5134f 100644 --- a/darts/utils/data/shifted_dataset.py +++ b/darts/utils/data/shifted_dataset.py @@ -518,7 +518,9 @@ def __init__( def __len__(self): return self.ideal_nr_samples - def __getitem__(self, idx) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray]: + def __getitem__( + self, idx + ) -> Tuple[np.ndarray, 
Optional[np.ndarray], np.ndarray, Optional[np.ndarray]]: # determine the index of the time series. ts_idx = idx // self.max_samples_per_ts ts_target = self.target_series[ts_idx] @@ -598,4 +600,9 @@ def __getitem__(self, idx) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray f"target series.", ) - return past_target, covariate, future_target + static_covariate = ( + ts_target.static_covariates.values + if ts_target.static_covariates is not None + else None + ) + return past_target, covariate, future_target, static_covariate From d6d4885d57cd98ce219e032f39c77cbf9cdc5fa9 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Thu, 19 May 2022 15:16:04 +0200 Subject: [PATCH 03/26] extended datasets for static covariates support and unified variable names Please enter the commit message for your changes. Lines starting --- .../forecasting/pl_forecasting_module.py | 1 - darts/timeseries.py | 44 ++++- darts/utils/data/horizon_based_dataset.py | 44 +++-- darts/utils/data/inference_dataset.py | 179 +++++++++++++----- darts/utils/data/sequential_dataset.py | 61 +++--- darts/utils/data/shifted_dataset.py | 120 +++++++----- darts/utils/data/training_dataset.py | 120 +++++++----- 7 files changed, 381 insertions(+), 188 deletions(-) diff --git a/darts/models/forecasting/pl_forecasting_module.py b/darts/models/forecasting/pl_forecasting_module.py index a5fd3f7330..d18301b696 100644 --- a/darts/models/forecasting/pl_forecasting_module.py +++ b/darts/models/forecasting/pl_forecasting_module.py @@ -478,7 +478,6 @@ def _process_input_batch( tuple ``(x_past, x_future)`` the input/past and output/future chunks. 
""" - ( past_target, past_covariates, diff --git a/darts/timeseries.py b/darts/timeseries.py index a71f82f5ec..197f144d3f 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -504,6 +504,7 @@ def from_longitudinal_dataframe( group_cols: Union[List[str], str], time_col: Optional[str] = None, value_cols: Optional[Union[List[str], str]] = None, + static_cols: Optional[Union[List[str], str]] = None, fill_missing_dates: Optional[bool] = False, freq: Optional[str] = None, fillna_value: Optional[float] = None, @@ -530,6 +531,11 @@ def from_longitudinal_dataframe( value_cols A string or list of strings representing the value column(s) to be extracted from the DataFrame. If set to `None`, the whole DataFrame will be used. + static_cols + A string or list of strings representing static variable columns from the DataFrame that should be + appended as static covariates to the resulting TimeSeries groups. Different to `group_cols`, the + DataFrame is not grouped by these columns. Note that for every group, there must be exactly one + unique value. fill_missing_dates Optionally, a boolean value indicating whether to fill missing dates with NaN values. This requires either a provided `freq` or the possibility to infer the frequency from the provided timestamps. See @@ -545,11 +551,37 @@ def from_longitudinal_dataframe( TimeSeries A univariate or multivariate deterministic TimeSeries constructed from the inputs. 
""" - # split df by groups and store group values (static covariates) - splits = [ - (pd.Series(static_covs, index=group_cols), group.drop(columns=group_cols)) - for static_covs, group in df.groupby(group_cols) - ] + group_cols = [group_cols] if not isinstance(group_cols, list) else group_cols + static_cols = ( + [static_cols] + if not isinstance(static_cols, list) and static_cols is not None + else [] + ) + static_cov_cols = group_cols + static_cols + + # split df by groups, and store group values and static values (static covariates) + splits = [] + for static_cov_vals, group in df.groupby(group_cols): + # check that for each group there is only one unique value per column in `static_cols` + if static_cols: + static_cols_valid = [ + len(group[col].unique()) == 1 for col in static_cols + ] + raise_if_not( + all(static_cols_valid), + f"Encountered more than one unique value in group {group} for given static columns: " + f"{[static_col for static_col, is_valid in zip(static_cols, static_cols_valid) if not is_valid]}.", + logger, + ) + # add the static covariates to the group values + static_cov_vals += tuple(group[static_cols].values[0]) + # store static covariate Series and group DataFrame (without static cov columns) + splits.append( + ( + pd.DataFrame(static_cov_vals, index=static_cov_cols), + group.drop(columns=static_cov_cols), + ) + ) # create a list with multiple TimeSeries and add static covariates return [ @@ -2083,7 +2115,7 @@ def with_values(self, values: np.ndarray) -> "TimeSeries": return self.__class__(new_xa) - def add_static_covariates(self, covariates: pd.Series): + def add_static_covariates(self, covariates: Union[pd.Series, pd.DataFrame]): self._xa.attrs["static_covariates"] = covariates return self diff --git a/darts/utils/data/horizon_based_dataset.py b/darts/utils/data/horizon_based_dataset.py index d49465b9c7..7091d113d1 100644 --- a/darts/utils/data/horizon_based_dataset.py +++ b/darts/utils/data/horizon_based_dataset.py @@ -26,7 +26,8 @@ def 
__init__( lookback: int = 3, ) -> None: """ - A time series dataset containing tuples of (past_target, past_covariates, future_target) arrays, + A time series dataset containing tuples of (past_target, past_covariates, static_covariates, future_target) + arrays, in a way inspired by the N-BEATS way of training on the M4 dataset: https://arxiv.org/abs/1905.10437. The "past" series have length `lookback * output_chunk_length`, and the "future" series has length @@ -105,31 +106,33 @@ def __len__(self): def __getitem__( self, idx: int - ) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray]: + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: # determine the index of the time series. - ts_idx = idx // self.nr_samples_per_ts - ts_target = self.target_series[ts_idx] - target_vals = ts_target.random_component_values(copy=False) + target_idx = idx // self.nr_samples_per_ts + target_series = self.target_series[target_idx] + target_vals = target_series.random_component_values(copy=False) raise_if_not( len(target_vals) >= (self.lookback + self.max_lh) * self.output_chunk_length, "The dataset contains some input/target series that are shorter than " - "`(lookback + max_lh) * H` ({}-th series)".format(ts_idx), + "`(lookback + max_lh) * H` ({}-th series)".format(target_idx), ) # determine the index lh_idx of the forecasting point (the last point of the input series, before the target) # lh_idx should be in [0, self.nr_samples_per_ts) - lh_idx = idx - (ts_idx * self.nr_samples_per_ts) + lh_idx = idx - (target_idx * self.nr_samples_per_ts) # determine the index at the end of the output chunk - end_of_output_idx = len(ts_target) - ( + end_of_output_idx = len(target_series) - ( (self.min_lh - 1) * self.output_chunk_length + lh_idx ) # optionally, load covariates - ts_covariate = self.covariates[ts_idx] if self.covariates is not None else None - main_cov_type = ( + covariate_series = ( + self.covariates[target_idx] if self.covariates is not None else 
None + ) + main_covariate_type = ( CovariateType.NONE if self.covariates is None else CovariateType.PAST ) @@ -145,14 +148,14 @@ def __getitem__( cov_start, cov_end, ) = self._memory_indexer( - ts_idx=ts_idx, - ts_target=ts_target, + target_idx=target_idx, + target_series=target_series, shift=shift, input_chunk_length=input_chunk_length, output_chunk_length=self.output_chunk_length, end_of_output_idx=end_of_output_idx, - ts_covariate=ts_covariate, - cov_type=main_cov_type, + covariate_series=covariate_series, + covariate_type=main_covariate_type, ) # extract sample target @@ -163,12 +166,12 @@ def __getitem__( covariate = None if self.covariates is not None: raise_if_not( - cov_end <= len(ts_covariate), + cov_end <= len(covariate_series), f"The dataset contains 'past' covariates that don't extend far enough into the future. " f"({idx}-th sample)", ) - covariate = ts_covariate.random_component_values(copy=False)[ + covariate = covariate_series.random_component_values(copy=False)[ cov_start:cov_end ] @@ -178,4 +181,11 @@ def __getitem__( "input (or output) chunk relative to the target series.", ) - return past_target, covariate, future_target + # TODO: we need think about the dimensionality of static covariates + static_covariate = ( + target_series.static_covariates.T.values + if target_series.static_covariates is not None + else None + ) + + return past_target, covariate, static_covariate, future_target diff --git a/darts/utils/data/inference_dataset.py b/darts/utils/data/inference_dataset.py index 57460154c5..e3eecde1b6 100644 --- a/darts/utils/data/inference_dataset.py +++ b/darts/utils/data/inference_dataset.py @@ -38,31 +38,31 @@ def __getitem__(self, idx: int): @staticmethod def _covariate_indexer( - ts_idx: int, + target_idx: int, target_series: TimeSeries, covariate_series: TimeSeries, - cov_type: CovariateType, + covariate_type: CovariateType, input_chunk_length: int, output_chunk_length: int, n: int, ): """returns tuple of (past_start, past_end, 
future_start, future_end)""" # get the main covariate type: CovariateType.PAST or CovariateType.FUTURE - main_cov_type = ( + main_covariate_type = ( CovariateType.PAST - if cov_type is CovariateType.PAST + if covariate_type is CovariateType.PAST else CovariateType.FUTURE ) raise_if_not( - main_cov_type in [CovariateType.PAST, CovariateType.FUTURE], - "`main_cov_type` must be one of `(CovariateType.PAST, CovariateType.FUTURE)`", + main_covariate_type in [CovariateType.PAST, CovariateType.FUTURE], + "`main_covariate_type` must be one of `(CovariateType.PAST, CovariateType.FUTURE)`", ) # we need to use the time index (datetime or integer) here to match the index with the covariate series past_start = target_series.time_index[-input_chunk_length] past_end = target_series.time_index[-1] - if main_cov_type is CovariateType.PAST: + if main_covariate_type is CovariateType.PAST: future_end = past_end + max(0, n - output_chunk_length) * target_series.freq else: # CovariateType.FUTURE future_end = past_end + max(n, output_chunk_length) * target_series.freq @@ -75,26 +75,29 @@ def _covariate_indexer( past_start, past_end = future_start, future_start # check if case specific indexes are available - case_start = future_start if cov_type is CovariateType.FUTURE else past_start + case_start = ( + future_start if covariate_type is CovariateType.FUTURE else past_start + ) raise_if_not( covariate_series.start_time() <= case_start, - f"For the given forecasting case, the provided {main_cov_type.value} covariates at dataset index " - f"`{ts_idx}` do not extend far enough into the past. The {main_cov_type.value} covariates must start at " - f"time step `{case_start}`, whereas now they start at time step `{covariate_series.start_time()}`.", + f"For the given forecasting case, the provided {main_covariate_type.value} covariates at dataset index " + f"`{target_idx}` do not extend far enough into the past. 
The {main_covariate_type.value} covariates " + f"must start at time step `{case_start}`, whereas now they start at time step " + f"`{covariate_series.start_time()}`.", ) raise_if_not( covariate_series.end_time() >= future_end, - f"For the given forecasting horizon `n={n}`, the provided {main_cov_type.value} covariates " - f"at dataset index `{ts_idx}` do not extend far enough into the future. As `" + f"For the given forecasting horizon `n={n}`, the provided {main_covariate_type.value} covariates " + f"at dataset index `{target_idx}` do not extend far enough into the future. As `" f"{'n > output_chunk_length' if n > output_chunk_length else 'n <= output_chunk_length'}" - f"` the {main_cov_type.value} covariates must end at time step `{future_end}`, " + f"` the {main_covariate_type.value} covariates must end at time step `{future_end}`, " f"whereas now they end at time step `{covariate_series.end_time()}`.", ) # extract the index position (index) from time_index value - cov_start = covariate_series.time_index.get_loc(past_start) - cov_end = covariate_series.time_index.get_loc(future_end) + 1 - return cov_start, cov_end + covariate_start = covariate_series.time_index.get_loc(past_start) + covariate_end = covariate_series.time_index.get_loc(future_end) + 1 + return covariate_start, covariate_end class GenericInferenceDataset(InferenceDataset): @@ -108,7 +111,8 @@ def __init__( covariate_type: CovariateType = CovariateType.PAST, ): """ - Contains (past_target, past_covariates | historic_future_covariates, future_past_covariates | future_covariate). + Contains (past_target, past_covariates | historic_future_covariates, future_past_covariates | future_covariate, + static_covariates). "future_past_covariates" are past covariates that happen to be also known in the future - those are needed for forecasting with n > output_chunk_length by any model relying on past covariates. 
@@ -155,7 +159,13 @@ def __len__(self): def __getitem__( self, idx: int - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], TimeSeries]: + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], + TimeSeries, + ]: target_series = self.target_series[idx] raise_if_not( len(target_series) >= self.input_chunk_length, @@ -168,15 +178,15 @@ def __getitem__( ] # optionally, extract covariates - cov_past, cov_future = None, None + past_covariate, future_covariate = None, None covariate_series = None if self.covariates is None else self.covariates[idx] if covariate_series is not None: # get start and end indices (integer) of the covariates including historic and future parts - cov_start, cov_end = self._covariate_indexer( - ts_idx=idx, + covariate_start, covariate_end = self._covariate_indexer( + target_idx=idx, target_series=target_series, covariate_series=covariate_series, - cov_type=self.covariate_type, + covariate_type=self.covariate_type, input_chunk_length=self.input_chunk_length, output_chunk_length=self.output_chunk_length, n=self.n, @@ -184,23 +194,41 @@ def __getitem__( # extract covariate values and split into a past (historic) and future part covariate = covariate_series.random_component_values(copy=False)[ - cov_start:cov_end + covariate_start:covariate_end ] if self.input_chunk_length != 0: # regular models - cov_past, cov_future = ( + past_covariate, future_covariate = ( covariate[: self.input_chunk_length], covariate[self.input_chunk_length :], ) else: # regression ensemble models have a input_chunk_length == 0 part for using predictions as input - cov_past, cov_future = covariate, covariate + past_covariate, future_covariate = covariate, covariate # set to None if empty array - cov_past = cov_past if cov_past is not None and len(cov_past) > 0 else None - cov_future = ( - cov_future if cov_future is not None and len(cov_future) > 0 else None + past_covariate = ( + past_covariate + if past_covariate 
is not None and len(past_covariate) > 0 + else None + ) + future_covariate = ( + future_covariate + if future_covariate is not None and len(future_covariate) > 0 + else None ) - return past_target, cov_past, cov_future, target_series + # TODO: we need think about the dimensionality of static covariates + static_covariate = ( + target_series.static_covariates.T.values + if target_series.static_covariates is not None + else None + ) + return ( + past_target, + past_covariate, + future_covariate, + static_covariate, + target_series, + ) class PastCovariatesInferenceDataset(InferenceDataset): @@ -214,7 +242,7 @@ def __init__( covariate_type: CovariateType = CovariateType.PAST, ): """ - Contains (past_target, past_covariates, future_past_covariates). + Contains (past_target, past_covariates, future_past_covariates, static_covariates). "future_past_covariates" are past covariates that happen to be also known in the future - those are needed for forecasting with n > output_chunk_length by any model relying on past covariates. 
@@ -252,7 +280,13 @@ def __len__(self): def __getitem__( self, idx: int - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], TimeSeries]: + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], + TimeSeries, + ]: return self.ds[idx] @@ -266,7 +300,7 @@ def __init__( covariate_type: CovariateType = CovariateType.FUTURE, ): """ - Contains (past_target, future_covariates) tuples + Contains (past_target, future_covariates, static_covariates) tuples Parameters ---------- @@ -296,9 +330,9 @@ def __len__(self): def __getitem__( self, idx: int - ) -> Tuple[np.ndarray, Optional[np.ndarray], TimeSeries]: - past_target_vals, _, cov_future, target_series = self.ds[idx] - return past_target_vals, cov_future, target_series + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], TimeSeries]: + past_target, _, future_covariate, static_covariate, target_series = self.ds[idx] + return past_target, future_covariate, static_covariate, target_series class DualCovariatesInferenceDataset(InferenceDataset): @@ -311,7 +345,7 @@ def __init__( output_chunk_length: int = 1, ): """ - Contains (past_target, historic_future_covariates, future_covariates) tuples. + Contains (past_target, historic_future_covariates, future_covariates, static_covariates) tuples. 
Parameters ---------- @@ -353,10 +387,28 @@ def __len__(self): def __getitem__( self, idx - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], TimeSeries]: - past_target, historic_future_covs, _, ts_target = self.ds_past[idx] - _, future_covs, _ = self.ds_future[idx] - return past_target, historic_future_covs, future_covs, ts_target + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], + TimeSeries, + ]: + ( + past_target, + historic_future_covariate, + _, + static_covariate, + ts_target, + ) = self.ds_past[idx] + _, future_covariate, _, _ = self.ds_future[idx] + return ( + past_target, + historic_future_covariate, + future_covariate, + static_covariate, + ts_target, + ) class MixedCovariatesInferenceDataset(InferenceDataset): @@ -370,7 +422,8 @@ def __init__( output_chunk_length: int = 1, ): """ - Contains (past_target, past_covariates, historic_future_covariates, future_covariates, future_past_covariates) + Contains (past_target, past_covariates, historic_future_covariates, future_covariates, future_past_covariates, + static_covariates) tuples. "future_past_covariates" are past covariates that happen to be also known in the future - those are needed for forecasting with n > output_chunk_length by any model relying on past covariates. 
@@ -423,17 +476,25 @@ def __getitem__( Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray], + Optional[np.ndarray], TimeSeries, ]: - past_target, past_covs, future_past_covs, ts_target = self.ds_past[idx] - _, historic_future_covs, future_covs, _ = self.ds_future[idx] + ( + past_target, + past_covariate, + future_past_covariate, + static_covariate, + ts_target, + ) = self.ds_past[idx] + _, historic_future_covariate, future_covariate, _, _ = self.ds_future[idx] return ( past_target, - past_covs, - historic_future_covs, - future_covs, - future_past_covs, + past_covariate, + historic_future_covariate, + future_covariate, + future_past_covariate, + static_covariate, ts_target, ) @@ -449,7 +510,7 @@ def __init__( output_chunk_length: int = 1, ): """ - Contains (past_target, past_covariates, future_covariates, future_past_covariates) tuples. + Contains (past_target, past_covariates, future_covariates, future_past_covariates, static_covariates) tuples. "future_past_covariates" are past covariates that happen to be also known in the future - those are needed for forecasting with n > output_chunk_length by any model relying on past covariates. 
@@ -501,9 +562,23 @@ def __getitem__( Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray], + Optional[np.ndarray], TimeSeries, ]: - past_target, past_covs, future_past_covs, ts_target = self.ds_past[idx] - _, future_covs, _ = self.ds_future[idx] - return past_target, past_covs, future_covs, future_past_covs, ts_target + ( + past_target, + past_covariate, + future_past_covariate, + static_covariate, + ts_target, + ) = self.ds_past[idx] + _, future_covariate, _, _ = self.ds_future[idx] + return ( + past_target, + past_covariate, + future_covariate, + future_past_covariate, + static_covariate, + ts_target, + ) diff --git a/darts/utils/data/sequential_dataset.py b/darts/utils/data/sequential_dataset.py index 418adee3a1..49d8630bfa 100644 --- a/darts/utils/data/sequential_dataset.py +++ b/darts/utils/data/sequential_dataset.py @@ -30,7 +30,7 @@ def __init__( max_samples_per_ts: Optional[int] = None, ): """ - A time series dataset containing tuples of (past_target, past_covariates, future_target). + A time series dataset containing tuples of (past_target, past_covariates, static_covariates, future_target). The "past" series have length `input_chunk_length` and the "future" series have length `output_chunk_length`. The "future" series are immediately consecutive to the "past" series. The slicing of past and future covariates matches that of past and future targets, respectively. The slicing @@ -83,7 +83,9 @@ def __init__( def __len__(self): return len(self.ds) - def __getitem__(self, idx) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray]: + def __getitem__( + self, idx + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: return self.ds[idx] @@ -97,7 +99,7 @@ def __init__( max_samples_per_ts: Optional[int] = None, ): """ - A time series dataset containing tuples of (past_target, future_covariates, future_target). + A time series dataset containing tuples of (past_target, future_covariates, static_covariates, future_target). 
The "past" series have length `input_chunk_length` and the "future" series have length `output_chunk_length`. The "future" series are immediately consecutive to the "past" series. The slicing of past and future covariates matches that of past and future targets, respectively. The slicing @@ -150,7 +152,9 @@ def __init__( def __len__(self): return len(self.ds) - def __getitem__(self, idx) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray]: + def __getitem__( + self, idx + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: return self.ds[idx] @@ -165,7 +169,7 @@ def __init__( ): """ A time series dataset containing tuples of - (past_target, historic_future_covariates, future_covariates, future_target). + (past_target, historic_future_covariates, future_covariates, static_covariates, future_target). The "past" series (incl `historic_future_covariates`) have length `input_chunk_length` and the "future" series have length `output_chunk_length`. The "future" series are immediately consecutive to the "past" series. The slicing of past and future covariates matches that of past and future targets, @@ -237,17 +241,17 @@ def __getitem__( np.ndarray, Optional[np.ndarray], Optional[np.ndarray], - np.ndarray, Optional[np.ndarray], + np.ndarray, ]: - past_target, past_covariate, future_target, static_covariate = self.ds_past[idx] - _, future_covariate, _, static_covariate = self.ds_future[idx] + past_target, past_covariate, static_covariate, future_target = self.ds_past[idx] + _, future_covariate, _, _ = self.ds_future[idx] return ( past_target, past_covariate, future_covariate, - future_target, static_covariate, + future_target, ) @@ -263,7 +267,7 @@ def __init__( ): """ A time series dataset containing tuples of - (past_target, past_covariates, historic_future_covariates, future_covariates, future_target). + (past_target, past_covariates, historic_future_covariates, future_covariates, static_covariates, future_target). 
The "past" series (incl `historic_future_covariates`) have length `input_chunk_length` and the "future" series have length `output_chunk_length`. The "future" series are immediately consecutive to the "past" series. The slicing of past and future covariates matches that of past and future targets, @@ -333,25 +337,19 @@ def __getitem__( Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray], - np.ndarray, Optional[np.ndarray], + np.ndarray, ]: - past_target, past_covariate, future_target, static_covariate = self.ds_past[idx] - ( - _, - historic_future_covariate, - future_covariate, - _, - static_covariate, - ) = self.ds_dual[idx] + past_target, past_covariate, static_covariate, future_target = self.ds_past[idx] + _, historic_future_covariate, future_covariate, _, _ = self.ds_dual[idx] return ( past_target, past_covariate, historic_future_covariate, future_covariate, - future_target, static_covariate, + future_target, ) @@ -366,7 +364,8 @@ def __init__( max_samples_per_ts: Optional[int] = None, ): """ - A time series dataset containing tuples of (past_target, past_covariates, future_covariates, future_target). + A time series dataset containing tuples of (past_target, past_covariates, future_covariates, static_covariates, + future_target). The "past" series have length `input_chunk_length` and the "future" series have length `output_chunk_length`. The "future" series are immediately consecutive to the "past" series. The slicing of past and future covariates matches that of past and future targets, respectively. 
The slicing @@ -436,7 +435,19 @@ def __len__(self): def __getitem__( self, idx - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: - past_target, past_covariate, future_target = self.ds_past[idx] - _, future_covariate, _ = self.ds_future[idx] - return past_target, past_covariate, future_covariate, future_target + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], + np.ndarray, + ]: + past_target, past_covariate, static_covariate, future_target = self.ds_past[idx] + _, future_covariate, _, _ = self.ds_future[idx] + return ( + past_target, + past_covariate, + future_covariate, + static_covariate, + future_target, + ) diff --git a/darts/utils/data/shifted_dataset.py b/darts/utils/data/shifted_dataset.py index 063de5134f..ad882165b5 100644 --- a/darts/utils/data/shifted_dataset.py +++ b/darts/utils/data/shifted_dataset.py @@ -31,7 +31,7 @@ def __init__( max_samples_per_ts: Optional[int] = None, ): """ - A time series dataset containing tuples of (past_target, past_covariates, future_target) + A time series dataset containing tuples of (past_target, past_covariates, static_covariates, future_target) arrays, which all have length `length`. The "future_target" is the "past_target" target shifted by `shift` time steps forward. 
So if an emitted "past_target" (and "past_covariates") goes from position `i` to `i+length`, @@ -83,7 +83,9 @@ def __init__( def __len__(self): return len(self.ds) - def __getitem__(self, idx) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray]: + def __getitem__( + self, idx + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: return self.ds[idx] @@ -97,7 +99,7 @@ def __init__( max_samples_per_ts: Optional[int] = None, ): """ - A time series dataset containing tuples of (past_target, future_covariates, future_target) + A time series dataset containing tuples of (past_target, future_covariates, static_covariates, future_target) arrays, which all have length `length`. The "future_target" is the "past_target" target shifted by `shift` time steps forward. So if an emitted "past_target" goes from position `i` to `i+length`, @@ -152,7 +154,9 @@ def __init__( def __len__(self): return len(self.ds) - def __getitem__(self, idx) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray]: + def __getitem__( + self, idx + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: return self.ds[idx] @@ -167,7 +171,7 @@ def __init__( ): """ A time series dataset containing tuples of - (past_target, historic_future_covariates, future_covariates, future_target) + (past_target, historic_future_covariates, future_covariates, static_covariates, future_target) arrays, which all have length `length`. The "future_target" is the "past_target" target shifted by `shift` time steps forward. 
So if an emitted "past_target" goes from position `i` to `i+length`, @@ -238,10 +242,22 @@ def __len__(self): def __getitem__( self, idx - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: - past_target, past_covariate, future_target = self.ds_past[idx] - _, future_covariate, _ = self.ds_future[idx] - return past_target, past_covariate, future_covariate, future_target + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], + np.ndarray, + ]: + past_target, past_covariate, static_covariate, future_target = self.ds_past[idx] + _, future_covariate, _, _ = self.ds_future[idx] + return ( + past_target, + past_covariate, + future_covariate, + static_covariate, + future_target, + ) class MixedCovariatesShiftedDataset(MixedCovariatesTrainingDataset): @@ -256,8 +272,7 @@ def __init__( ): """ A time series dataset containing tuples of (past_target, past_covariates, historic_future_covariates, - future_covariates, future_target) - arrays, which all have length `length`. + future_covariates, static_covariates, future_target) arrays, which all have length `length`. The "future_target" is the "past_target" target shifted by `shift` time steps forward. So if an emitted "past_target" goes from position `i` to `i+length`, the emitted "future_target" will go from position `i+shift` to `i+shift+length`. 
@@ -330,16 +345,18 @@ def __getitem__( Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray], + Optional[np.ndarray], np.ndarray, ]: - past_target, past_covariate, future_target = self.ds_past[idx] - _, historic_future_covariate, future_covariate, _ = self.ds_dual[idx] + past_target, past_covariate, static_covariate, future_target = self.ds_past[idx] + _, historic_future_covariate, future_covariate, _, _ = self.ds_dual[idx] return ( past_target, past_covariate, historic_future_covariate, future_covariate, + static_covariate, future_target, ) @@ -355,8 +372,8 @@ def __init__( max_samples_per_ts: Optional[int] = None, ): """ - A time series dataset containing tuples of (past_target, past_covariates, future_covariates, future_target) - arrays, which all have length `length`. + A time series dataset containing tuples of (past_target, past_covariates, future_covariates, static_covariates, + future_target) arrays, which all have length `length`. The "future_target" is the "past_target" target shifted by `shift` time steps forward. So if an emitted "past_target" goes from position `i` to `i+length`, the emitted "future_target" will go from position `i+shift` to `i+shift+length`. 
@@ -428,10 +445,22 @@ def __len__(self): def __getitem__( self, idx - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: - past_target, past_covariate, future_target = self.ds_past[idx] - _, future_covariate, _ = self.ds_future[idx] - return past_target, past_covariate, future_covariate, future_target + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], + np.ndarray, + ]: + past_target, past_covariate, static_covariate, future_target = self.ds_past[idx] + _, future_covariate, _, _ = self.ds_future[idx] + return ( + past_target, + past_covariate, + future_covariate, + static_covariate, + future_target, + ) class GenericShiftedDataset(TrainingDataset): @@ -447,8 +476,8 @@ def __init__( covariate_type: CovariateType = CovariateType.NONE, ): """ - Contains (past_target, _covariate, future_target), where "" is past if `shift_covariates = False` - and future otherwise. + Contains (past_target, _covariates, static_covariates, future_target), where "" is past if + `shift_covariates = False` and future otherwise. The past chunks have length `input_chunk_length` and the future chunks have length `output_chunk_length`. The future chunks start `shift` after the past chunks' start. @@ -522,9 +551,9 @@ def __getitem__( self, idx ) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray, Optional[np.ndarray]]: # determine the index of the time series. 
- ts_idx = idx // self.max_samples_per_ts - ts_target = self.target_series[ts_idx] - target_vals = ts_target.random_component_values(copy=False) + target_idx = idx // self.max_samples_per_ts + target_series = self.target_series[target_idx] + target_vals = target_series.random_component_values(copy=False) # determine the actual number of possible samples in this time series n_samples_in_ts = len(target_vals) - self.size_of_both_chunks + 1 @@ -533,22 +562,24 @@ def __getitem__( n_samples_in_ts >= 1, "The dataset contains some time series that are too short to contain " "`max(self.input_chunk_length, self.shift + self.output_chunk_length)` " - "({}-th series)".format(ts_idx), + "({}-th series)".format(target_idx), ) # determine the index at the end of the output chunk # it is originally in [0, self.max_samples_per_ts), so we use a modulo to have it in [0, n_samples_in_ts) end_of_output_idx = ( - len(ts_target) - - (idx - (ts_idx * self.max_samples_per_ts)) % n_samples_in_ts + len(target_series) + - (idx - (target_idx * self.max_samples_per_ts)) % n_samples_in_ts ) # optionally, load covariates - ts_covariate = self.covariates[ts_idx] if self.covariates is not None else None + covariate_series = ( + self.covariates[target_idx] if self.covariates is not None else None + ) - main_cov_type = CovariateType.NONE + main_covariate_type = CovariateType.NONE if self.covariates is not None: - main_cov_type = ( + main_covariate_type = ( CovariateType.FUTURE if self.shift_covariates else CovariateType.PAST ) @@ -558,17 +589,17 @@ def __getitem__( past_end, future_start, future_end, - cov_start, - cov_end, + covariate_start, + covariate_end, ) = self._memory_indexer( - ts_idx=ts_idx, - ts_target=ts_target, + target_idx=target_idx, + target_series=target_series, shift=self.shift, input_chunk_length=self.input_chunk_length, output_chunk_length=self.output_chunk_length, end_of_output_idx=end_of_output_idx, - ts_covariate=ts_covariate, - cov_type=main_cov_type, + 
covariate_series=covariate_series, + covariate_type=main_covariate_type, ) # extract sample target @@ -579,13 +610,13 @@ def __getitem__( covariate = None if self.covariates is not None: raise_if_not( - cov_end <= len(ts_covariate), - f"The dataset contains {main_cov_type.value} covariates " + covariate_end <= len(covariate_series), + f"The dataset contains {main_covariate_type.value} covariates " f"that don't extend far enough into the future. ({idx}-th sample)", ) - covariate = ts_covariate.random_component_values(copy=False)[ - cov_start:cov_end + covariate = covariate_series.random_component_values(copy=False)[ + covariate_start:covariate_end ] raise_if_not( @@ -595,14 +626,15 @@ def __getitem__( if self.shift_covariates else self.input_chunk_length ), - f"The dataset contains {main_cov_type.value} covariates " + f"The dataset contains {main_covariate_type.value} covariates " f"whose time axis doesn't allow to obtain the input (or output) chunk relative to the " f"target series.", ) + # TODO: we need think about the dimensionality of static covariates static_covariate = ( - ts_target.static_covariates.values - if ts_target.static_covariates is not None + target_series.static_covariates.T.values + if target_series.static_covariates is not None else None ) - return past_target, covariate, future_target, static_covariate + return past_target, covariate, static_covariate, future_target diff --git a/darts/utils/data/training_dataset.py b/darts/utils/data/training_dataset.py index 281a9b00fe..d485ee6159 100644 --- a/darts/utils/data/training_dataset.py +++ b/darts/utils/data/training_dataset.py @@ -25,22 +25,27 @@ def __init__(self): * "PastCovariates" datasets (for PastCovariatesTorchModel): containing (past_target, past_covariates, + static_covariates, future_target) * "FutureCovariates" datasets (for FutureCovariatesTorchModel): containing (past_target, future_covariates, + static_covariates, future_target) * "DualCovariates" datasets (for 
DualCovariatesTorchModel): containing (past_target, historic_future_covariates, future_covariates, + static_covariates, future_target) * "MixedCovariates" datasets (for MixedCovariatesTorchModel): containing (past_target, past_covariates, historic_future_covariates, future_covariates, + static_covariates, future_target) * "SplitCovariates" datasets (for SplitCovariatesTorchModel): containing (past_target, past_covariates, future_covariates, + static_covariates, future_target) The covariates are optional and can be `None`. @@ -71,30 +76,30 @@ def __getitem__(self, idx: int): def _memory_indexer( self, - ts_idx: int, - ts_target: TimeSeries, + target_idx: int, + target_series: TimeSeries, shift: int, input_chunk_length: int, output_chunk_length: int, end_of_output_idx: int, - ts_covariate: TimeSeries, - cov_type: CovariateType = CovariateType.NONE, + covariate_series: TimeSeries, + covariate_type: CovariateType = CovariateType.NONE, ) -> SampleIndexType: """Returns the (start, end) indices for past target, future target and covariates (sub sets) of the current - sample `i` from `ts_idx`. + sample `i` from `target_idx`. Works for all TimeSeries index types: pd.DatetimeIndex, pd.RangeIndex (and the deprecated Int64Index) - When `ts_idx` is observed for the first time, it stores the position of the sample `0` within the full target - time series and the (start, end) indices of all sub sets. + When `target_idx` is observed for the first time, it stores the position of the sample `0` within the full + target time series and the (start, end) indices of all sub sets. This allows to calculate the sub set indices for all future samples `i` by simply adjusting for the difference between the positions of sample `i` and sample `0`. Parameters ---------- - ts_idx + target_idx index of the current target TimeSeries. - ts_target + target_series current target TimeSeries. shift The number of time steps by which to shift the output chunks relative to the input chunks. 
@@ -103,18 +108,18 @@ def _memory_indexer( output_chunk_length The length of the emitted future output series. end_of_output_idx - the index where the output chunk of the current sample ends in `ts_target`. - ts_covariate + the index where the output chunk of the current sample ends in `target_series`. + covariate_series current covariate TimeSeries. - cov_type: + covariate_type: the type of covariate to extract. Instance of `CovariateType`: One of (`CovariateType.PAST`, `CovariateType.FUTURE`, `CovariateType.NONE`). """ - cov_start, cov_end = None, None + covariate_start, covariate_end = None, None - # the first time ts_idx is observed - if ts_idx not in self._index_memory: + # the first time target_idx is observed + if target_idx not in self._index_memory: start_of_output_idx = end_of_output_idx - output_chunk_length start_of_input_idx = start_of_output_idx - shift @@ -130,42 +135,46 @@ def _memory_indexer( start_of_input_idx + input_chunk_length, ) - if cov_type is not CovariateType.NONE: + if covariate_type is not CovariateType.NONE: # not CovariateType.Future -> both CovariateType.PAST and CovariateType.HISTORIC_FUTURE - start = future_start if cov_type is CovariateType.FUTURE else past_start - end = future_end if cov_type is CovariateType.FUTURE else past_end + start = ( + future_start + if covariate_type is CovariateType.FUTURE + else past_start + ) + end = future_end if covariate_type is CovariateType.FUTURE else past_end # we need to be careful with getting ranges and indexes: # to get entire range, full_range = ts[:len(ts)]; to get last index: last_idx = ts[len(ts) - 1] # extract actual index value (respects datetime- and integer-based indexes; also from non-zero start) - start_time = ts_target.time_index[start] - end_time = ts_target.time_index[end - 1] + start_time = target_series.time_index[start] + end_time = target_series.time_index[end - 1] raise_if_not( - start_time in ts_covariate.time_index - and end_time in ts_covariate.time_index, - f"Missing 
covariates; could not find {cov_type.value} covariates in index value range: " + start_time in covariate_series.time_index + and end_time in covariate_series.time_index, + f"Missing covariates; could not find {covariate_type.value} covariates in index value range: " f"{start_time} - {end_time}.", ) # extract the index position (index) from index value - cov_start = ts_covariate.time_index.get_loc(start_time) - cov_end = ts_covariate.time_index.get_loc(end_time) + 1 + covariate_start = covariate_series.time_index.get_loc(start_time) + covariate_end = covariate_series.time_index.get_loc(end_time) + 1 # store position of initial sample and all relevant sub set indices - self._index_memory[ts_idx] = { + self._index_memory[target_idx] = { "end_of_output_idx": end_of_output_idx, "past_target": (past_start, past_end), "future_target": (future_start, future_end), - "covariate": (cov_start, cov_end), + "covariate": (covariate_start, covariate_end), } else: # load position of initial sample and its sub set indices - end_of_output_idx_last = self._index_memory[ts_idx]["end_of_output_idx"] - past_start, past_end = self._index_memory[ts_idx]["past_target"] - future_start, future_end = self._index_memory[ts_idx]["future_target"] - cov_start, cov_end = self._index_memory[ts_idx]["covariate"] + end_of_output_idx_last = self._index_memory[target_idx]["end_of_output_idx"] + past_start, past_end = self._index_memory[target_idx]["past_target"] + future_start, future_end = self._index_memory[target_idx]["future_target"] + covariate_start, covariate_end = self._index_memory[target_idx]["covariate"] # evaluate how much the new sample needs to be shifted, and shift all indexes idx_shift = end_of_output_idx - end_of_output_idx_last @@ -173,17 +182,28 @@ def _memory_indexer( past_end += idx_shift future_start += idx_shift future_end += idx_shift - cov_start = cov_start + idx_shift if cov_start is not None else None - cov_end = cov_end + idx_shift if cov_end is not None else None + 
covariate_start = ( + covariate_start + idx_shift if covariate_start is not None else None + ) + covariate_end = ( + covariate_end + idx_shift if covariate_end is not None else None + ) - return past_start, past_end, future_start, future_end, cov_start, cov_end + return ( + past_start, + past_end, + future_start, + future_end, + covariate_start, + covariate_end, + ) class PastCovariatesTrainingDataset(TrainingDataset, ABC): def __init__(self): """ Abstract class for a PastCovariatesTorchModel training dataset. It contains 3-tuples of - `(past_target, past_covariate, future_target)` `np.ndarray`. + `(past_target, past_covariate, static_covariates, future_target)` `np.ndarray`. The covariates are optional and can be `None`. """ super().__init__() @@ -191,7 +211,7 @@ def __init__(self): @abstractmethod def __getitem__( self, idx: int - ) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray]: + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: pass @@ -199,7 +219,7 @@ class FutureCovariatesTrainingDataset(TrainingDataset, ABC): def __init__(self): """ Abstract class for a FutureCovariatesTorchModel training dataset. It contains 3-tuples of - `(past_target, future_covariate, future_target)` `np.ndarray`. + `(past_target, future_covariate, static_covariates, future_target)` `np.ndarray`. The covariates are optional and can be `None`. """ super().__init__() @@ -207,7 +227,7 @@ def __init__(self): @abstractmethod def __getitem__( self, idx: int - ) -> Tuple[np.ndarray, Optional[np.ndarray], np.ndarray]: + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: pass @@ -215,7 +235,7 @@ class DualCovariatesTrainingDataset(TrainingDataset, ABC): def __init__(self): """ Abstract class for a DualCovariatesTorchModel training dataset. It contains 4-tuples of - `(past_target, historic_future_covariates, future_covariates, future_target)` `np.ndarray`. 
+ `(past_target, historic_future_covariates, future_covariates, static_covariates, future_target)` `np.ndarray`. The covariates are optional and can be `None`. """ super().__init__() @@ -223,7 +243,13 @@ def __init__(self): @abstractmethod def __getitem__( self, idx: int - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], + np.ndarray, + ]: pass @@ -231,7 +257,8 @@ class MixedCovariatesTrainingDataset(TrainingDataset, ABC): def __init__(self): """ Abstract class for a MixedCovariatesTorchModel training dataset. It contains 5-tuples of - `(past_target, past_covariates, historic_future_covariates, future_covariates, future_target)` `np.ndarray`. + `(past_target, past_covariates, historic_future_covariates, future_covariates, static_covariates, + future_target)` `np.ndarray`. The covariates are optional and can be `None`. """ super().__init__() @@ -244,6 +271,7 @@ def __getitem__( Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray], + Optional[np.ndarray], np.ndarray, ]: pass @@ -253,7 +281,7 @@ class SplitCovariatesTrainingDataset(TrainingDataset, ABC): def __init__(self): """ Abstract class for a SplitCovariatesTorchModel training dataset. It contains 4-tuples of - `(past_target, past_covariates, future_covariates, future_target)` `np.ndarray`. + `(past_target, past_covariates, future_covariates, static_covariates, future_target)` `np.ndarray`. The covariates are optional and can be `None`. 
""" super().__init__() @@ -261,5 +289,11 @@ def __init__(self): @abstractmethod def __getitem__( self, idx: int - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray]: + ) -> Tuple[ + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[np.ndarray], + np.ndarray, + ]: pass From 4717ff1ce4d8f6db0113a21c68039c6f6e83f4fe Mon Sep 17 00:00:00 2001 From: dennisbader Date: Thu, 19 May 2022 15:26:12 +0200 Subject: [PATCH 04/26] adapted PLXCovariatesModules with static covariates --- .../forecasting/pl_forecasting_module.py | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/darts/models/forecasting/pl_forecasting_module.py b/darts/models/forecasting/pl_forecasting_module.py index d18301b696..f7e97c1154 100644 --- a/darts/models/forecasting/pl_forecasting_module.py +++ b/darts/models/forecasting/pl_forecasting_module.py @@ -339,7 +339,16 @@ def epochs_trained(self): class PLPastCovariatesModule(PLForecastingModule, ABC): def _produce_train_output(self, input_batch: Tuple): - past_target, past_covariate = input_batch + """ + Feeds PastCovariatesTorchModel with input and output chunks of a PastCovariatesSequentialDataset for + training. + + Parameters: + ---------- + input_batch + ``(past_target, past_covariates, static_covariates)`` + """ + past_target, past_covariate, _ = input_batch # Currently all our PastCovariates models require past target and covariates concatenated inpt = ( torch.cat([past_target, past_covariate], dim=2) @@ -360,13 +369,13 @@ def _get_batch_prediction( n prediction length input_batch - (past_target, past_covariates, future_past_covariates) + ``(past_target, past_covariates, future_past_covariates, static_covariates)`` roll_size roll input arrays after every sequence by ``roll_size``. 
Initially, ``roll_size`` is equivalent to ``self.output_chunk_length`` """ dim_component = 2 - past_target, past_covariates, future_past_covariates = input_batch + past_target, past_covariates, future_past_covariates, _ = input_batch n_targets = past_target.shape[dim_component] n_past_covs = ( @@ -459,6 +468,15 @@ class PLMixedCovariatesModule(PLForecastingModule, ABC): def _produce_train_output( self, input_batch: Tuple ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Feeds MixedCovariatesTorchModel with input and output chunks of a MixedCovariatesSequentialDataset for + training. + + Parameters: + ---------- + input_batch + ``(past_target, past_covariates, historic_future_covariates, future_covariates, static_covariates)``. + """ return self(self._process_input_batch(input_batch)) def _process_input_batch( @@ -471,7 +489,7 @@ def _process_input_batch( Parameters ---------- input_batch - ``(past_target, past_covariates, historic_future_covariates, future_covariates)``. + ``(past_target, past_covariates, historic_future_covariates, future_covariates, static_covariates)``. 
Returns ------- @@ -483,12 +501,10 @@ def _process_input_batch( past_covariates, historic_future_covariates, future_covariates, + static_covariates, ) = input_batch dim_variable = 2 - # TODO: impelement static covariates - static_covariates = None - x_past = torch.cat( [ tensor @@ -541,6 +557,7 @@ def _get_batch_prediction( historic_future_covariates, future_covariates, future_past_covariates, + static_covariates, ) = input_batch n_targets = past_target.shape[dim_component] From 5b6b781c82839fb9e9b2be65aacdd9e4330f0b97 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Fri, 20 May 2022 14:51:46 +0200 Subject: [PATCH 05/26] adapted TFTModel for static covariate support --- .../forecasting/pl_forecasting_module.py | 57 ++++++++++++------- darts/models/forecasting/tft_model.py | 38 ++++++------- darts/models/forecasting/tft_submodels.py | 16 ++++-- .../forecasting/torch_forecasting_model.py | 2 +- darts/timeseries.py | 7 ++- darts/utils/data/horizon_based_dataset.py | 2 +- darts/utils/data/inference_dataset.py | 2 +- darts/utils/data/shifted_dataset.py | 2 +- 8 files changed, 75 insertions(+), 51 deletions(-) diff --git a/darts/models/forecasting/pl_forecasting_module.py b/darts/models/forecasting/pl_forecasting_module.py index f7e97c1154..c66eb20e05 100644 --- a/darts/models/forecasting/pl_forecasting_module.py +++ b/darts/models/forecasting/pl_forecasting_module.py @@ -481,7 +481,7 @@ def _produce_train_output( def _process_input_batch( self, input_batch - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: """ Converts output of MixedCovariatesDataset (training dataset) into an input/past- and output/future chunk. @@ -494,7 +494,7 @@ def _process_input_batch( Returns ------- tuple - ``(x_past, x_future)`` the input/past and output/future chunks. + ``(x_past, x_future, x_static)`` the input/past and output/future chunks. 
""" ( past_target, @@ -505,6 +505,32 @@ def _process_input_batch( ) = input_batch dim_variable = 2 + # TODO: remove when everything works + # x_past = torch.cat( + # [ + # tensor + # for tensor in [ + # past_target, + # past_covariates, + # historic_future_covariates, + # static_covariates, + # ] + # if tensor is not None + # ], + # dim=dim_variable, + # ) + # + # x_future = None + # if future_covariates is not None or static_covariates is not None: + # x_future = torch.cat( + # [ + # tensor + # for tensor in [future_covariates, static_covariates] + # if tensor is not None + # ], + # dim=dim_variable, + # ) + x_past = torch.cat( [ tensor @@ -512,25 +538,15 @@ def _process_input_batch( past_target, past_covariates, historic_future_covariates, - static_covariates, ] if tensor is not None ], dim=dim_variable, ) - x_future = None - if future_covariates is not None or static_covariates is not None: - x_future = torch.cat( - [ - tensor - for tensor in [future_covariates, static_covariates] - if tensor is not None - ], - dim=dim_variable, - ) - - return x_past, x_future + x_future = future_covariates + x_static = static_covariates + return x_past, x_future, x_static def _get_batch_prediction( self, n: int, input_batch: Tuple, roll_size: int @@ -570,7 +586,7 @@ def _get_batch_prediction( else 0 ) - input_past, input_future = self._process_input_batch( + input_past, input_future, input_static = self._process_input_batch( ( past_target, past_covariates, @@ -578,10 +594,11 @@ def _get_batch_prediction( future_covariates[:, :roll_size, :] if future_covariates is not None else None, + static_covariates, ) ) - out = self._produce_predict_output(x=(input_past, input_future))[ + out = self._produce_predict_output(x=(input_past, input_future, input_static))[ :, self.first_prediction_index :, : ] @@ -649,9 +666,9 @@ def _get_batch_prediction( input_future = future_covariates[:, left_future:right_future, :] # take only last part of the output sequence where needed - out = 
self._produce_predict_output(x=(input_past, input_future))[ - :, self.first_prediction_index :, : - ] + out = self._produce_predict_output( + x=(input_past, input_future, input_static) + )[:, self.first_prediction_index :, :] batch_prediction.append(out) prediction_length += self.output_chunk_length diff --git a/darts/models/forecasting/tft_model.py b/darts/models/forecasting/tft_model.py index 0a23b75301..14edf7685b 100644 --- a/darts/models/forecasting/tft_model.py +++ b/darts/models/forecasting/tft_model.py @@ -334,13 +334,15 @@ def get_attention_mask_future( ) return mask - def forward(self, x: Tuple[torch.Tensor, Optional[torch.Tensor]]) -> torch.Tensor: + def forward( + self, x: Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]] + ) -> torch.Tensor: """TFT model forward pass. Parameters ---------- x - comes as tuple `(x_past, x_future)` where `x_past` is the input/past chunk and `x_future` + comes as tuple `(x_past, x_future, x_static)` where `x_past` is the input/past chunk and `x_future` is the output/future chunk. 
Input dimensions are `(n_samples, n_time_steps, n_variables)` Returns @@ -348,12 +350,9 @@ def forward(self, x: Tuple[torch.Tensor, Optional[torch.Tensor]]) -> torch.Tenso torch.Tensor the output tensor """ - x_cont_past, x_cont_future = x + x_cont_past, x_cont_future, x_static = x dim_samples, dim_time, dim_variable = 0, 1, 2 - # TODO: impelement static covariates - static_covariates = None - batch_size = x_cont_past.shape[dim_samples] encoder_length = self.input_chunk_length decoder_length = self.output_chunk_length @@ -414,12 +413,14 @@ def forward(self, x: Tuple[torch.Tensor, Optional[torch.Tensor]]) -> torch.Tenso } # Embedding and variable selection - if static_covariates is not None: - # TODO: implement static covariates - # # static embeddings will be constant over entire batch - # static_embedding = {name: input_vectors[name][:, 0] for name in self.static_variables} - # static_embedding, static_covariate_var = self.static_covariates_vsn(static_embedding) - raise NotImplementedError("Static covariates have yet to be defined") + if x_static is not None: + static_embedding = { + name: x_static[:, 0, i].unsqueeze(-1) + for i, name in enumerate(self.static_variables) + } + static_embedding, static_covariate_var = self.static_covariates_vsn( + static_embedding + ) else: static_embedding = torch.zeros( (x_cont_past.shape[0], self.hidden_size), @@ -427,14 +428,6 @@ def forward(self, x: Tuple[torch.Tensor, Optional[torch.Tensor]]) -> torch.Tenso device=self.device, ) - # # TODO: implement below when static covariates are supported - # # this is only to interpret the output - # static_covariate_var = torch.zeros( - # (x_cont_past.shape[0], 0), - # dtype=x_cont_past.dtype, - # device=x_cont_past.device, - # ) - static_context_expanded = self.expand_static_context( context=self.static_context_grn(static_embedding), time_steps=time_steps ) @@ -754,7 +747,8 @@ def __init__( def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Module: """ 
`train_sample` contains the following tensors: - (past_target, past_covariates, historic_future_covariates, future_covariates, future_target) + (past_target, past_covariates, historic_future_covariates, future_covariates, static_covariates, + future_target) each tensor has shape (n_timesteps, n_variables) - past/historic tensors have shape (input_chunk_length, n_variables) @@ -774,8 +768,8 @@ def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Modu past_covariate, historic_future_covariate, future_covariate, - future_target, static_covariates, + future_target, ) = train_sample # add a covariate placeholder so that relative index will be included diff --git a/darts/models/forecasting/tft_submodels.py b/darts/models/forecasting/tft_submodels.py index 4c97e8329b..16945fe1d2 100644 --- a/darts/models/forecasting/tft_submodels.py +++ b/darts/models/forecasting/tft_submodels.py @@ -20,7 +20,7 @@ ' """ -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -378,17 +378,25 @@ def __init__( self, input_sizes: Dict[str, int], hidden_size: int, - input_embedding_flags: Dict[str, bool] = {}, + input_embedding_flags: Optional[Dict[str, bool]] = None, dropout: float = 0.1, context_size: int = None, - single_variable_grns: Dict[str, _GatedResidualNetwork] = {}, - prescalers: Dict[str, nn.Linear] = {}, + single_variable_grns: Optional[Dict[str, _GatedResidualNetwork]] = None, + prescalers: Optional[Dict[str, nn.Linear]] = None, ): """ Calcualte weights for ``num_inputs`` variables which are each of size ``input_size`` """ super().__init__() + input_embedding_flags = ( + input_embedding_flags if input_embedding_flags is not None else {} + ) + single_variable_grns = ( + single_variable_grns if single_variable_grns is not None else {} + ) + prescalers = prescalers if prescalers is not None else {} + self.hidden_size = hidden_size self.input_sizes = input_sizes 
self.input_embedding_flags = input_embedding_flags diff --git a/darts/models/forecasting/torch_forecasting_model.py b/darts/models/forecasting/torch_forecasting_model.py index d8eca0c486..d1423f1b1c 100644 --- a/darts/models/forecasting/torch_forecasting_model.py +++ b/darts/models/forecasting/torch_forecasting_model.py @@ -846,7 +846,7 @@ def fit_from_dataset( train_sample = train_dataset[0] if self.model is None: # Build model, based on the dimensions of the first series in the train set. - self.train_sample, self.output_dim = train_sample, train_sample[-2].shape[1] + self.train_sample, self.output_dim = train_sample, train_sample[-1].shape[1] self._init_model(trainer) else: # Check existing model has input/output dims matching what's provided in the training set. diff --git a/darts/timeseries.py b/darts/timeseries.py index 197f144d3f..def8b1de38 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -894,6 +894,11 @@ def has_range_index(self) -> bool: """Whether this series is indexed with an RangeIndex (otherwise it is indexed with a DatetimeIndex).""" return not self._has_datetime_index + @property + def has_static_covariates(self) -> bool: + """Whether this series contains static covariates.""" + return self.static_covariates is not None + @property def duration(self) -> Union[pd.Timedelta, int]: """The duration of this time series (as a time delta or int).""" @@ -2116,7 +2121,7 @@ def with_values(self, values: np.ndarray) -> "TimeSeries": return self.__class__(new_xa) def add_static_covariates(self, covariates: Union[pd.Series, pd.DataFrame]): - self._xa.attrs["static_covariates"] = covariates + self._xa.attrs["static_covariates"] = covariates.astype(self.dtype) return self def stack(self, other: "TimeSeries") -> "TimeSeries": diff --git a/darts/utils/data/horizon_based_dataset.py b/darts/utils/data/horizon_based_dataset.py index 7091d113d1..10d836a736 100644 --- a/darts/utils/data/horizon_based_dataset.py +++ 
b/darts/utils/data/horizon_based_dataset.py @@ -184,7 +184,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( target_series.static_covariates.T.values - if target_series.static_covariates is not None + if target_series.has_static_covariates else None ) diff --git a/darts/utils/data/inference_dataset.py b/darts/utils/data/inference_dataset.py index e3eecde1b6..92426aa789 100644 --- a/darts/utils/data/inference_dataset.py +++ b/darts/utils/data/inference_dataset.py @@ -219,7 +219,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( target_series.static_covariates.T.values - if target_series.static_covariates is not None + if target_series.has_static_covariates else None ) return ( diff --git a/darts/utils/data/shifted_dataset.py b/darts/utils/data/shifted_dataset.py index ad882165b5..aa29f3f6a9 100644 --- a/darts/utils/data/shifted_dataset.py +++ b/darts/utils/data/shifted_dataset.py @@ -634,7 +634,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( target_series.static_covariates.T.values - if target_series.static_covariates is not None + if target_series.has_static_covariates else None ) return past_target, covariate, static_covariate, future_target From 55eaf3bc81a21e968746ba022ecedc03a2233710 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Fri, 20 May 2022 16:07:51 +0200 Subject: [PATCH 06/26] added temporary fix for static covariates with scalers --- darts/dataprocessing/transformers/boxcox.py | 4 ++-- darts/dataprocessing/transformers/scaler.py | 4 ++-- darts/timeseries.py | 10 +++++++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/darts/dataprocessing/transformers/boxcox.py b/darts/dataprocessing/transformers/boxcox.py index 22067ec6e5..dbf1eb8598 100644 --- a/darts/dataprocessing/transformers/boxcox.py +++ b/darts/dataprocessing/transformers/boxcox.py @@ -167,7 
+167,7 @@ def ts_transform( ) return series.with_values( BoxCox._reshape_out(series, transformed_vals, component_mask=component_mask) - ) + ).set_static_covariates(series.static_covariates) @staticmethod def ts_inverse_transform( @@ -185,7 +185,7 @@ def ts_inverse_transform( BoxCox._reshape_out( series, inv_transformed_vals, component_mask=component_mask ) - ) + ).set_static_covariates(series.static_covariates) def fit( self, series: Union[TimeSeries, Sequence[TimeSeries]], **kwargs diff --git a/darts/dataprocessing/transformers/scaler.py b/darts/dataprocessing/transformers/scaler.py index 2246c29ca3..c2b1c32383 100644 --- a/darts/dataprocessing/transformers/scaler.py +++ b/darts/dataprocessing/transformers/scaler.py @@ -106,7 +106,7 @@ def ts_transform(series: TimeSeries, transformer, **kwargs) -> TimeSeries: values=transformed_vals, fill_missing_dates=False, columns=series.columns, - ) + ).set_static_covariates(series.static_covariates) @staticmethod def ts_inverse_transform( @@ -126,7 +126,7 @@ def ts_inverse_transform( values=inv_transformed_vals, fill_missing_dates=False, columns=series.columns, - ) + ).set_static_covariates(series.static_covariates) @staticmethod def ts_fit(series: TimeSeries, transformer, *args, **kwargs) -> Any: diff --git a/darts/timeseries.py b/darts/timeseries.py index def8b1de38..4a4cdb408d 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -592,7 +592,7 @@ def from_longitudinal_dataframe( fill_missing_dates=fill_missing_dates, freq=freq, fillna_value=fillna_value, - ).add_static_covariates(static_covs) + ).set_static_covariates(static_covs) for static_covs, split in splits ] @@ -2120,8 +2120,12 @@ def with_values(self, values: np.ndarray) -> "TimeSeries": return self.__class__(new_xa) - def add_static_covariates(self, covariates: Union[pd.Series, pd.DataFrame]): - self._xa.attrs["static_covariates"] = covariates.astype(self.dtype) + def set_static_covariates( + self, covariates: Optional[Union[pd.Series, pd.DataFrame]] + ): 
+ self._xa.attrs["static_covariates"] = ( + covariates.astype(self.dtype) if covariates is not None else covariates + ) return self def stack(self, other: "TimeSeries") -> "TimeSeries": From 29924f49c7c6e2a01078f642f29edce45527647e Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 24 May 2022 16:32:41 +0200 Subject: [PATCH 07/26] unittests for from_longitudinal_dataframe() and set_static_covariates --- .../test_timeseries_static_covariates.py | 170 ++++++++++++++++++ darts/timeseries.py | 53 ++++-- 2 files changed, 212 insertions(+), 11 deletions(-) create mode 100644 darts/tests/test_timeseries_static_covariates.py diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py new file mode 100644 index 0000000000..62deb37e3c --- /dev/null +++ b/darts/tests/test_timeseries_static_covariates.py @@ -0,0 +1,170 @@ +import copy + +import numpy as np +import pandas as pd +import pytest + +from darts import TimeSeries +from darts.tests.base_test_class import DartsBaseTestClass +from darts.utils.timeseries_generation import _generate_index, linear_timeseries + + +class TimeSeriesMultivariateTestCase(DartsBaseTestClass): + @classmethod + def setUpClass(cls): + super().setUpClass() + + n_groups = 5 + len_ts = 10 + times = ( + pd.concat( + [ + pd.DataFrame( + _generate_index(start=pd.Timestamp(2010, 1, 1), length=len_ts) + ) + ] + * n_groups, + axis=0, + ) + .reset_index(drop=True) + .rename(columns={0: "times"}) + ) + + x = pd.DataFrame(np.random.randn(n_groups * len_ts, 3), columns=["a", "b", "c"]) + static_multivar = pd.DataFrame( + [ + [i, 0 if j < (len_ts // 2) else 1] + for i in range(n_groups) + for j in range(len_ts) + ], + columns=["st1", "st2"], + ) + + df_long_multi = pd.DataFrame( + pd.concat([times, x, static_multivar], axis=1), + ) + df_long_multi.loc[:, "constant"] = 1 + df_long_uni = df_long_multi.drop(columns=["st2"]) + + cls.n_groups = n_groups + cls.len_ts = len_ts + cls.df_long_multi = df_long_multi + 
cls.df_long_uni = df_long_uni + + def test_timeseries_from_longitudinal_df(self): + # univariate static covs: only group by "st1", keep static covs "st1" + value_cols = ["a", "b", "c"] + ts_groups1 = TimeSeries.from_longitudinal_dataframe( + df=self.df_long_uni, + group_cols="st1", + static_cols=None, + time_col="times", + value_cols=value_cols, + ) + assert len(ts_groups1) == self.n_groups + for i, ts in enumerate(ts_groups1): + assert ts.static_covariates.shape == (1, 1) + assert ts.static_covariates.index.equals(pd.Index(["st1"])) + assert (ts.static_covariates.values == [[i]]).all() + + # multivariate static covs: only group by "st1", keep static covs "st1", "constant" + ts_groups2 = TimeSeries.from_longitudinal_dataframe( + df=self.df_long_multi, + group_cols=["st1"], + static_cols="constant", + time_col="times", + value_cols=value_cols, + ) + assert len(ts_groups2) == self.n_groups + for i, ts in enumerate(ts_groups2): + assert ts.static_covariates.shape == (2, 1) + assert ts.static_covariates.index.equals(pd.Index(["st1", "constant"])) + assert (ts.static_covariates.values == [[i], [1]]).all() + + # multivariate static covs: group by "st1" and "st2", keep static covs "st1", "st2", "constant" + ts_groups3 = TimeSeries.from_longitudinal_dataframe( + df=self.df_long_multi, + group_cols=["st1", "st2"], + static_cols=["constant"], + time_col="times", + value_cols=value_cols, + ) + assert len(ts_groups3) == self.n_groups * 2 + for idx, ts in enumerate(ts_groups3): + i = idx // 2 + j = idx % 2 + assert ts.static_covariates.shape == (3, 1) + assert ts.static_covariates.index.equals( + pd.Index(["st1", "st2", "constant"]) + ) + assert (ts.static_covariates.values == [[i], [j], [1]]).all() + + df = copy.deepcopy(self.df_long_multi) + df.loc[:, "non_static"] = np.arange(len(df)) + # non static columns as static columns should raise an error + with pytest.raises(ValueError): + _ = TimeSeries.from_longitudinal_dataframe( + df=df, + group_cols=["st1"], + 
static_cols=["non_static"], + time_col="times", + value_cols=value_cols, + ) + + # groups that are too short for TimeSeries requirements should raise an error + with pytest.raises(ValueError): + _ = TimeSeries.from_longitudinal_dataframe( + df=df, + group_cols=["st1", "non_static"], + static_cols=None, + time_col="times", + value_cols=value_cols, + ) + + def test_set_static_covariates_univariate(self): + ts = linear_timeseries(length=10) + static_covs = pd.Series([0.0, 1.0], index=["st1", "st2"]) + + # inplace from Series for chained calls + ts.set_static_covariates(static_covs) + assert ts.static_covariates.equals(static_covs.to_frame()) + + # from Series + ts = ts.set_static_covariates(static_covs) + assert ts.static_covariates.equals(static_covs.to_frame()) + + # from DataFrame + ts = ts.set_static_covariates(static_covs.to_frame()) + assert ts.static_covariates.equals(static_covs.to_frame()) + + # with None + ts = ts.set_static_covariates(None) + assert ts.static_covariates is None + + # only pd.Series, pd.DataFrame or None + with pytest.raises(ValueError): + ts.set_static_covariates([1, 2, 3]) + + # multivariate does not work with univariate TimeSeries + with pytest.raises(ValueError): + static_covs_multi = pd.concat([static_covs] * 2, axis=1) + ts.set_static_covariates(static_covs_multi) + + def test_set_static_covariates_multivariate(self): + ts = linear_timeseries(length=10) + ts_multi = ts.stack(ts) + static_covs = pd.DataFrame([[0.0, 1.0], [0.0, 1.0]], index=["st1", "st2"]) + + # from univariate static covariates + ts_multi = ts_multi.set_static_covariates(static_covs[static_covs.columns[0]]) + assert ts_multi.static_covariates.equals( + static_covs[static_covs.columns[0]].to_frame() + ) + + # from univariate static covariates + ts_multi = ts_multi.set_static_covariates(static_covs) + assert ts_multi.static_covariates.equals(static_covs) + + # raise an error if multivariate static covariates columns don't match the number of components in the series + 
with pytest.raises(ValueError): + _ = ts_multi.set_static_covariates(pd.concat([static_covs] * 2, axis=1)) diff --git a/darts/timeseries.py b/darts/timeseries.py index c42de4f605..2bbf611714 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -552,27 +552,41 @@ def from_longitudinal_dataframe( A univariate or multivariate deterministic TimeSeries constructed from the inputs. """ group_cols = [group_cols] if not isinstance(group_cols, list) else group_cols - static_cols = ( - [static_cols] - if not isinstance(static_cols, list) and static_cols is not None - else [] - ) + if static_cols is not None: + static_cols = ( + [static_cols] if not isinstance(static_cols, list) else static_cols + ) + else: + static_cols = [] static_cov_cols = group_cols + static_cols # split df by groups, and store group values and static values (static covariates) splits = [] for static_cov_vals, group in df.groupby(group_cols): + static_cov_vals = ( + (static_cov_vals,) + if not isinstance(static_cov_vals, tuple) + else static_cov_vals + ) # check that for each group there is only one unique value per column in `static_cols` if static_cols: static_cols_valid = [ len(group[col].unique()) == 1 for col in static_cols ] - raise_if_not( - all(static_cols_valid), - f"Encountered more than one unique value in group {group} for given static columns: " - f"{[static_col for static_col, is_valid in zip(static_cols, static_cols_valid) if not is_valid]}.", - logger, - ) + if not all(static_cols_valid): + # encountered performance issues when evaluating the error message from below in every + # iteration with `raise_if_not(all(static_cols_valid), message, logger)` + invalid_cols = [ + static_col + for static_col, is_valid in zip(static_cols, static_cols_valid) + if not is_valid + ] + raise_if( + True, + f"Encountered more than one unique value in group {group} for given static columns: " + f"{invalid_cols}.", + logger, + ) # add the static covariates to the group values static_cov_vals += 
tuple(group[static_cols].values[0]) # store static covariate Series and group DataFrame (without static cov columns) @@ -2131,6 +2145,23 @@ def with_values(self, values: np.ndarray) -> "TimeSeries": def set_static_covariates( self, covariates: Optional[Union[pd.Series, pd.DataFrame]] ): + raise_if( + not isinstance(covariates, (pd.Series, pd.DataFrame)) + and covariates is not None, + "`covariates` must be either a pandas Series, DataFrame or None", + logger, + ) + # check that multi-column (multivariate) static covariates match the number of TimeSeries components + if isinstance(covariates, pd.DataFrame): + n_components = len(covariates.columns) + raise_if( + n_components > 1 and n_components != self.n_components, + "When passing a multi-column pandas DataFrame, the number of columns must match the number of " + "components of the TimeSeries object (multivariate static covariates must map to each component).", + logger, + ) + elif isinstance(covariates, pd.Series): + covariates = covariates.to_frame() self._xa.attrs["static_covariates"] = ( covariates.astype(self.dtype) if covariates is not None else covariates ) From 079d969918ec7c5f660f7933c8c80c7ff74872b5 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 24 May 2022 17:00:10 +0200 Subject: [PATCH 08/26] updated dataset tests --- darts/tests/test_datasets.py | 505 +++++++++++++++++++++++------------ 1 file changed, 341 insertions(+), 164 deletions(-) diff --git a/darts/tests/test_datasets.py b/darts/tests/test_datasets.py index d52ede538e..ac1cd1a9e8 100644 --- a/darts/tests/test_datasets.py +++ b/darts/tests/test_datasets.py @@ -36,25 +36,37 @@ if TORCH_AVAILABLE: class DatasetTestCase(DartsBaseTestClass): - target1, target2 = gaussian_timeseries(length=100), gaussian_timeseries( - length=150 + target1 = gaussian_timeseries(length=100).set_static_covariates( + pd.Series([0, 1], index=["st1", "st2"]) ) + target2 = gaussian_timeseries(length=150).set_static_covariates( + pd.Series([2, 3], index=["st1", "st2"]) + ) + cov_st1 = 
target1.static_covariates.T.values + cov_st2 = target2.static_covariates.T.values + cov_st2_df = pd.Series([2, 3], index=["st1", "st2"]) vals1, vals2 = target1.values(), target2.values() cov1, cov2 = gaussian_timeseries(length=100), gaussian_timeseries(length=150) - def _assert_eq(self, tup_ar, tup_series): - l1 = [] - for ar_element in tup_ar: - l1.append(None if ar_element is None else list(ar_element)) - l2 = [] - for series_element in tup_series: - l2.append( - None - if series_element is None - else list(series_element.values(copy=False)) + def _assert_eq(self, lefts: tuple, rights: tuple): + for left, right in zip(lefts, rights): + left = left.values() if isinstance(left, TimeSeries) else left + right = right.values() if isinstance(right, TimeSeries) else right + assert type(left) == type(right) + assert ( + isinstance( + left, (TimeSeries, pd.Series, pd.DataFrame, np.ndarray, list) + ) + or left is None ) - - self.assertEqual(l1, l2) + if isinstance(left, (pd.Series, pd.DataFrame)): + assert left.equals(right) + elif isinstance(left, np.ndarray): + assert np.array_equal(left, right) + elif isinstance(left, (list, TimeSeries)): + assert left == right + else: + assert right is None def test_past_covariates_inference_dataset(self): # one target series @@ -62,7 +74,7 @@ def test_past_covariates_inference_dataset(self): target_series=self.target1, input_chunk_length=len(self.target1) ) np.testing.assert_almost_equal(ds[0][0], self.vals1) - self.assertEqual(ds[0][1:], (None, None, self.target1)) + self._assert_eq(ds[0][1:], (None, None, self.cov_st1, self.target1)) # two target series ds = PastCovariatesInferenceDataset( @@ -70,7 +82,7 @@ def test_past_covariates_inference_dataset(self): input_chunk_length=max(len(self.target1), len(self.target2)), ) np.testing.assert_almost_equal(ds[1][0], self.vals2) - self.assertEqual(ds[1][1:], (None, None, self.target2)) + self._assert_eq(ds[1][1:], (None, None, self.cov_st2, self.target2)) # fail if covariates do not have 
same size with self.assertRaises(ValueError): @@ -86,8 +98,8 @@ def test_past_covariates_inference_dataset(self): ) np.testing.assert_almost_equal(ds[1][0], self.vals2) np.testing.assert_almost_equal(ds[1][1], self.cov2.values()) - self.assertEqual( - ds[1][2:], (None, self.target2) + self._assert_eq( + ds[1][2:], (None, self.cov_st2, self.target2) ) # no "future past" covariate here # more complex case with future past covariates: @@ -98,7 +110,7 @@ def test_past_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) short_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -116,7 +128,7 @@ def test_past_covariates_inference_dataset(self): # should fail if covariates are too short with self.assertRaises(ValueError): - ds[0] + _ = ds[0] # Should return correct values when covariates is long enough ds = PastCovariatesInferenceDataset( @@ -130,12 +142,13 @@ def test_past_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][0], target.values()[-10:]) np.testing.assert_almost_equal(ds[0][1], long_cov.values()[-60:-50]) np.testing.assert_almost_equal(ds[0][2], long_cov.values()[-50:-30]) - self.assertEqual(ds[0][3], target) + np.testing.assert_almost_equal(ds[0][3], self.cov_st2) + self.assertEqual(ds[0][4], target) # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ) + ).set_static_covariates(self.cov_st2_df) covariate = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -151,7 +164,8 @@ def test_past_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][0], target.values()[-10:]) np.testing.assert_almost_equal(ds[0][1], covariate.values()[20:30]) np.testing.assert_almost_equal(ds[0][2], covariate.values()[30:40]) - self.assertEqual(ds[0][3], 
target) + np.testing.assert_almost_equal(ds[0][3], self.cov_st2) + self.assertEqual(ds[0][4], target) def test_future_covariates_inference_dataset(self): # one target series @@ -159,7 +173,7 @@ def test_future_covariates_inference_dataset(self): target_series=self.target1, input_chunk_length=len(self.target1) ) np.testing.assert_almost_equal(ds[0][0], self.vals1) - self.assertEqual(ds[0][1:], (None, self.target1)) + self._assert_eq(ds[0][1:], (None, self.cov_st1, self.target1)) # two target series ds = FutureCovariatesInferenceDataset( @@ -167,7 +181,7 @@ def test_future_covariates_inference_dataset(self): input_chunk_length=max(len(self.target1), len(self.target2)), ) np.testing.assert_almost_equal(ds[1][0], self.vals2) - self.assertEqual(ds[1][1:], (None, self.target2)) + self._assert_eq(ds[1][1:], (None, self.cov_st2, self.target2)) # fail if covariates do not have same size with self.assertRaises(ValueError): @@ -183,7 +197,7 @@ def test_future_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) short_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -197,7 +211,7 @@ def test_future_covariates_inference_dataset(self): # should fail if covariates are too short with self.assertRaises(ValueError): - ds[0] + _ = ds[0] # Should return correct values when covariates is long enough ds = FutureCovariatesInferenceDataset( @@ -206,12 +220,13 @@ def test_future_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][0], target.values()[-10:]) np.testing.assert_almost_equal(ds[0][1], long_cov.values()[-50:-20]) - self.assertEqual(ds[0][2], target) + np.testing.assert_almost_equal(ds[0][2], self.cov_st2) + self.assertEqual(ds[0][3], target) # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ) + 
).set_static_covariates(self.cov_st2_df) covariate = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -222,7 +237,8 @@ def test_future_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][0], target.values()[-10:]) np.testing.assert_almost_equal(ds[0][1], covariate.values()[30:50]) - self.assertEqual(ds[0][2], target) + np.testing.assert_almost_equal(ds[0][2], self.cov_st2) + self.assertEqual(ds[0][3], target) def test_dual_covariates_inference_dataset(self): # one target series @@ -230,7 +246,7 @@ def test_dual_covariates_inference_dataset(self): target_series=self.target1, input_chunk_length=len(self.target1) ) np.testing.assert_almost_equal(ds[0][0], self.vals1) - self.assertEqual(ds[0][1:], (None, None, self.target1)) + self._assert_eq(ds[0][1:], (None, None, self.cov_st1, self.target1)) # two target series ds = DualCovariatesInferenceDataset( @@ -238,7 +254,7 @@ def test_dual_covariates_inference_dataset(self): input_chunk_length=max(len(self.target1), len(self.target2)), ) np.testing.assert_almost_equal(ds[1][0], self.vals2) - self.assertEqual(ds[1][1:], (None, None, self.target2)) + self._assert_eq(ds[1][1:], (None, None, self.cov_st2, self.target2)) # fail if covariates do not have same size with self.assertRaises(ValueError): @@ -254,7 +270,7 @@ def test_dual_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) short_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -272,7 +288,7 @@ def test_dual_covariates_inference_dataset(self): # should fail if covariates are too short with self.assertRaises(ValueError): - ds[0] + _ = ds[0] # Should return correct values when covariates is long enough ds = DualCovariatesInferenceDataset( @@ -286,12 +302,13 @@ def test_dual_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][0], 
target.values()[-10:]) np.testing.assert_almost_equal(ds[0][1], long_cov.values()[-60:-50]) np.testing.assert_almost_equal(ds[0][2], long_cov.values()[-50:-20]) - self.assertEqual(ds[0][3], target) + np.testing.assert_almost_equal(ds[0][3], self.cov_st2) + self.assertEqual(ds[0][4], target) # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ) + ).set_static_covariates(self.cov_st2_df) covariate = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -307,7 +324,8 @@ def test_dual_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][0], target.values()[-10:]) np.testing.assert_almost_equal(ds[0][1], covariate.values()[20:30]) np.testing.assert_almost_equal(ds[0][2], covariate.values()[30:50]) - self.assertEqual(ds[0][3], target) + np.testing.assert_almost_equal(ds[0][3], self.cov_st2) + self.assertEqual(ds[0][4], target) def test_mixed_covariates_inference_dataset(self): # With future past covariates: @@ -318,7 +336,7 @@ def test_mixed_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) past_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -340,7 +358,7 @@ def test_mixed_covariates_inference_dataset(self): # should fail if future covariates are too short with self.assertRaises(ValueError): - ds[0] + _ = ds[0] # Should return correct values when covariates is long enough ds = MixedCovariatesInferenceDataset( @@ -359,12 +377,13 @@ def test_mixed_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][2], future_cov.values()[-60:-50]) np.testing.assert_almost_equal(ds[0][3], future_cov.values()[-50:-20]) np.testing.assert_almost_equal(ds[0][4], long_past_cov.values()[-50:-30]) - self.assertEqual(ds[0][5], target) + 
np.testing.assert_almost_equal(ds[0][5], self.cov_st2) + self.assertEqual(ds[0][6], target) # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ) + ).set_static_covariates(self.cov_st2_df) past_cov = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -386,7 +405,8 @@ def test_mixed_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][2], future_cov.values()[10:20]) np.testing.assert_almost_equal(ds[0][3], future_cov.values()[20:40]) np.testing.assert_almost_equal(ds[0][4], past_cov.values()[30:40]) - self.assertEqual(ds[0][5], target) + np.testing.assert_almost_equal(ds[0][5], self.cov_st2) + self.assertEqual(ds[0][6], target) def test_split_covariates_inference_dataset(self): # With future past covariates: @@ -397,7 +417,7 @@ def test_split_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) past_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -419,7 +439,7 @@ def test_split_covariates_inference_dataset(self): # should fail if future covariates are too short with self.assertRaises(ValueError): - ds[0] + _ = ds[0] # Should return correct values when covariates is long enough ds = SplitCovariatesInferenceDataset( @@ -437,12 +457,13 @@ def test_split_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][1], long_past_cov.values()[-60:-50]) np.testing.assert_almost_equal(ds[0][2], future_cov.values()[-50:-20]) np.testing.assert_almost_equal(ds[0][3], long_past_cov.values()[-50:-30]) - self.assertEqual(ds[0][4], target) + np.testing.assert_almost_equal(ds[0][4], self.cov_st2) + self.assertEqual(ds[0][5], target) # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), 
np.random.randn(40) - ) + ).set_static_covariates(self.cov_st2_df) past_cov = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -463,7 +484,8 @@ def test_split_covariates_inference_dataset(self): np.testing.assert_almost_equal(ds[0][1], past_cov.values()[20:30]) np.testing.assert_almost_equal(ds[0][2], future_cov.values()[20:40]) np.testing.assert_almost_equal(ds[0][3], past_cov.values()[30:40]) - self.assertEqual(ds[0][4], target) + np.testing.assert_almost_equal(ds[0][4], self.cov_st2) + self.assertEqual(ds[0][5], target) def test_past_covariates_sequential_dataset(self): # one target series @@ -473,7 +495,9 @@ def test_past_covariates_sequential_dataset(self): output_chunk_length=10, ) self.assertEqual(len(ds), 81) - self._assert_eq(ds[5], (self.target1[75:85], None, self.target1[85:95])) + self._assert_eq( + ds[5], (self.target1[75:85], None, self.cov_st1, self.target1[85:95]) + ) # two target series ds = PastCovariatesSequentialDataset( @@ -482,9 +506,12 @@ def test_past_covariates_sequential_dataset(self): output_chunk_length=10, ) self.assertEqual(len(ds), 262) - self._assert_eq(ds[5], (self.target1[75:85], None, self.target1[85:95])) self._assert_eq( - ds[136], (self.target2[125:135], None, self.target2[135:145]) + ds[5], (self.target1[75:85], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[136], + (self.target2[125:135], None, self.cov_st2, self.target2[135:145]), ) # two target series with custom max_nr_samples @@ -495,9 +522,12 @@ def test_past_covariates_sequential_dataset(self): max_samples_per_ts=50, ) self.assertEqual(len(ds), 100) - self._assert_eq(ds[5], (self.target1[75:85], None, self.target1[85:95])) self._assert_eq( - ds[55], (self.target2[125:135], None, self.target2[135:145]) + ds[5], (self.target1[75:85], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[55], + (self.target2[125:135], None, self.cov_st2, self.target2[135:145]), ) # two targets and one 
covariate @@ -514,11 +544,22 @@ def test_past_covariates_sequential_dataset(self): output_chunk_length=10, ) self._assert_eq( - ds[5], (self.target1[75:85], self.cov1[75:85], self.target1[85:95]) + ds[5], + ( + self.target1[75:85], + self.cov1[75:85], + self.cov_st1, + self.target1[85:95], + ), ) self._assert_eq( ds[136], - (self.target2[125:135], self.cov2[125:135], self.target2[135:145]), + ( + self.target2[125:135], + self.cov2[125:135], + self.cov_st2, + self.target2[135:145], + ), ) # should fail if covariates do not have the required time span, even though covariates are longer @@ -526,7 +567,7 @@ def test_past_covariates_sequential_dataset(self): times2 = pd.date_range(start="20120101", end="20150101", freq="D") target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) ds = PastCovariatesSequentialDataset( target_series=target, @@ -535,14 +576,14 @@ def test_past_covariates_sequential_dataset(self): output_chunk_length=10, ) with self.assertRaises(ValueError): - ds[5] + _ = ds[5] # the same should fail when series are integer-indexed times1 = pd.RangeIndex(start=0, stop=100, step=1) times2 = pd.RangeIndex(start=200, stop=400, step=1) target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) ds = PastCovariatesSequentialDataset( target_series=target, @@ -551,14 +592,14 @@ def test_past_covariates_sequential_dataset(self): output_chunk_length=10, ) with self.assertRaises(ValueError): - ds[5] + _ = ds[5] # we should get the correct covariate slice even when target and covariates are not aligned times1 = pd.date_range(start="20100101", end="20110101", freq="D") times2 = pd.date_range(start="20090101", end="20110106", freq="D") target = TimeSeries.from_times_and_values( 
times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) ds = PastCovariatesSequentialDataset( target_series=target, @@ -567,15 +608,15 @@ def test_past_covariates_sequential_dataset(self): output_chunk_length=10, ) - self.assertTrue(all(ds[0][1] == cov.values()[-25:-15])) - self.assertTrue(all(ds[5][1] == cov.values()[-30:-20])) + np.testing.assert_almost_equal(ds[0][1], cov.values()[-25:-15]) + np.testing.assert_almost_equal(ds[5][1], cov.values()[-30:-20]) # This should also be the case when series are integer indexed times1 = pd.RangeIndex(start=100, stop=200, step=1) times2 = pd.RangeIndex(start=50, stop=250, step=1) target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) ds = PastCovariatesSequentialDataset( target_series=target, @@ -584,8 +625,8 @@ def test_past_covariates_sequential_dataset(self): output_chunk_length=10, ) - self.assertTrue(all(ds[0][1] == cov.values()[-70:-60])) - self.assertTrue(all(ds[5][1] == cov.values()[-75:-65])) + np.testing.assert_almost_equal(ds[0][1], cov.values()[-70:-60]) + np.testing.assert_almost_equal(ds[5][1], cov.values()[-75:-65]) def test_future_covariates_sequential_dataset(self): # one target series @@ -595,7 +636,9 @@ def test_future_covariates_sequential_dataset(self): output_chunk_length=10, ) self.assertEqual(len(ds), 81) - self._assert_eq(ds[5], (self.target1[75:85], None, self.target1[85:95])) + self._assert_eq( + ds[5], (self.target1[75:85], None, self.cov_st1, self.target1[85:95]) + ) # two target series ds = FutureCovariatesSequentialDataset( @@ -604,9 +647,12 @@ def test_future_covariates_sequential_dataset(self): output_chunk_length=10, ) self.assertEqual(len(ds), 262) - self._assert_eq(ds[5], (self.target1[75:85], None, self.target1[85:95])) self._assert_eq( - 
ds[136], (self.target2[125:135], None, self.target2[135:145]) + ds[5], (self.target1[75:85], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[136], + (self.target2[125:135], None, self.cov_st2, self.target2[135:145]), ) # two target series with custom max_nr_samples @@ -617,9 +663,12 @@ def test_future_covariates_sequential_dataset(self): max_samples_per_ts=50, ) self.assertEqual(len(ds), 100) - self._assert_eq(ds[5], (self.target1[75:85], None, self.target1[85:95])) self._assert_eq( - ds[55], (self.target2[125:135], None, self.target2[135:145]) + ds[5], (self.target1[75:85], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[55], + (self.target2[125:135], None, self.cov_st2, self.target2[135:145]), ) # two targets and one covariate @@ -629,8 +678,12 @@ def test_future_covariates_sequential_dataset(self): ) # two targets and two covariates; covariates not aligned, must contain correct values - target1 = TimeSeries.from_values(np.random.randn(100)) - target2 = TimeSeries.from_values(np.random.randn(50)) + target1 = TimeSeries.from_values( + np.random.randn(100) + ).set_static_covariates(self.cov_st2_df) + target2 = TimeSeries.from_values(np.random.randn(50)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(120)) cov2 = TimeSeries.from_values(np.random.randn(80)) @@ -641,20 +694,22 @@ def test_future_covariates_sequential_dataset(self): output_chunk_length=10, ) - self.assertTrue(all(ds[0][0] == target1.values()[-20:-10])) - self.assertTrue(all(ds[0][1] == cov1.values()[-30:-20])) - self.assertTrue(all(ds[0][2] == target1.values()[-10:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-20:-10]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-30:-20]) + np.testing.assert_almost_equal(ds[0][2], self.cov_st2) + np.testing.assert_almost_equal(ds[0][3], target1.values()[-10:]) - self.assertTrue(all(ds[101][0] == target2.values()[-40:-30])) - 
self.assertTrue(all(ds[101][1] == cov2.values()[-60:-50])) - self.assertTrue(all(ds[101][2] == target2.values()[-30:-20])) + np.testing.assert_almost_equal(ds[101][0], target2.values()[-40:-30]) + np.testing.assert_almost_equal(ds[101][1], cov2.values()[-60:-50]) + np.testing.assert_almost_equal(ds[101][2], self.cov_st2) + np.testing.assert_almost_equal(ds[101][3], target2.values()[-30:-20]) # Should also contain correct values when time-indexed with covariates not aligned times1 = pd.date_range(start="20090201", end="20090220", freq="D") times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) @@ -666,12 +721,15 @@ def test_future_covariates_sequential_dataset(self): output_chunk_length=2, ) - self.assertTrue(all(ds[0][0] == target1.values()[-4:-2])) - self.assertTrue(all(ds[0][1] == cov1.values()[-4:-2])) - self.assertTrue(all(ds[0][2] == target1.values()[-2:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-4:-2]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-4:-2]) + np.testing.assert_almost_equal(ds[0][2], self.cov_st2) + np.testing.assert_almost_equal(ds[0][3], target1.values()[-2:]) # Should fail if covariates are not long enough - target1 = TimeSeries.from_values(np.random.randn(8)) + target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(7)) ds = FutureCovariatesSequentialDataset( @@ -682,7 +740,7 @@ def test_future_covariates_sequential_dataset(self): ) with self.assertRaises(ValueError): - ds[0] + _ = ds[0] def test_dual_covariates_sequential_dataset(self): # Must contain (past_target, historic_future_covariates, future_covariates, future_target) @@ -695,7 +753,8 @@ def test_dual_covariates_sequential_dataset(self): ) 
self.assertEqual(len(ds), 81) self._assert_eq( - ds[5], (self.target1[75:85], None, None, self.target1[85:95]) + ds[5], + (self.target1[75:85], None, None, self.cov_st1, self.target1[85:95]), ) # two target series @@ -706,10 +765,18 @@ def test_dual_covariates_sequential_dataset(self): ) self.assertEqual(len(ds), 262) self._assert_eq( - ds[5], (self.target1[75:85], None, None, self.target1[85:95]) + ds[5], + (self.target1[75:85], None, None, self.cov_st1, self.target1[85:95]), ) self._assert_eq( - ds[136], (self.target2[125:135], None, None, self.target2[135:145]) + ds[136], + ( + self.target2[125:135], + None, + None, + self.cov_st2, + self.target2[135:145], + ), ) # two target series with custom max_nr_samples @@ -721,10 +788,18 @@ def test_dual_covariates_sequential_dataset(self): ) self.assertEqual(len(ds), 100) self._assert_eq( - ds[5], (self.target1[75:85], None, None, self.target1[85:95]) + ds[5], + (self.target1[75:85], None, None, self.cov_st1, self.target1[85:95]), ) self._assert_eq( - ds[55], (self.target2[125:135], None, None, self.target2[135:145]) + ds[55], + ( + self.target2[125:135], + None, + None, + self.cov_st2, + self.target2[135:145], + ), ) # two targets and one covariate @@ -734,8 +809,12 @@ def test_dual_covariates_sequential_dataset(self): ) # two targets and two covariates; covariates not aligned, must contain correct values - target1 = TimeSeries.from_values(np.random.randn(100)) - target2 = TimeSeries.from_values(np.random.randn(50)) + target1 = TimeSeries.from_values( + np.random.randn(100) + ).set_static_covariates(self.cov_st2_df) + target2 = TimeSeries.from_values(np.random.randn(50)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(120)) cov2 = TimeSeries.from_values(np.random.randn(80)) @@ -746,22 +825,24 @@ def test_dual_covariates_sequential_dataset(self): output_chunk_length=10, ) - self.assertTrue(all(ds[0][0] == target1.values()[-20:-10])) - self.assertTrue(all(ds[0][1] == 
cov1.values()[-40:-30])) - self.assertTrue(all(ds[0][2] == cov1.values()[-30:-20])) - self.assertTrue(all(ds[0][3] == target1.values()[-10:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-20:-10]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-40:-30]) + np.testing.assert_almost_equal(ds[0][2], cov1.values()[-30:-20]) + np.testing.assert_almost_equal(ds[0][3], self.cov_st2) + np.testing.assert_almost_equal(ds[0][4], target1.values()[-10:]) - self.assertTrue(all(ds[101][0] == target2.values()[-40:-30])) - self.assertTrue(all(ds[101][1] == cov2.values()[-70:-60])) - self.assertTrue(all(ds[101][2] == cov2.values()[-60:-50])) - self.assertTrue(all(ds[101][3] == target2.values()[-30:-20])) + np.testing.assert_almost_equal(ds[101][0], target2.values()[-40:-30]) + np.testing.assert_almost_equal(ds[101][1], cov2.values()[-70:-60]) + np.testing.assert_almost_equal(ds[101][2], cov2.values()[-60:-50]) + np.testing.assert_almost_equal(ds[101][3], self.cov_st2) + np.testing.assert_almost_equal(ds[101][4], target2.values()[-30:-20]) # Should also contain correct values when time-indexed with covariates not aligned times1 = pd.date_range(start="20090201", end="20090220", freq="D") times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) @@ -773,13 +854,16 @@ def test_dual_covariates_sequential_dataset(self): output_chunk_length=2, ) - self.assertTrue(all(ds[0][0] == target1.values()[-4:-2])) - self.assertTrue(all(ds[0][1] == cov1.values()[-6:-4])) - self.assertTrue(all(ds[0][2] == cov1.values()[-4:-2])) - self.assertTrue(all(ds[0][3] == target1.values()[-2:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-4:-2]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-6:-4]) + np.testing.assert_almost_equal(ds[0][2], 
cov1.values()[-4:-2]) + np.testing.assert_almost_equal(ds[0][3], self.cov_st2) + np.testing.assert_almost_equal(ds[0][4], target1.values()[-2:]) # Should fail if covariates are not long enough - target1 = TimeSeries.from_values(np.random.randn(8)) + target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(7)) ds = DualCovariatesSequentialDataset( @@ -790,7 +874,7 @@ def test_dual_covariates_sequential_dataset(self): ) with self.assertRaises(ValueError): - ds[0] + _ = ds[0] def test_past_covariates_shifted_dataset(self): # one target series @@ -798,16 +882,21 @@ def test_past_covariates_shifted_dataset(self): target_series=self.target1, length=10, shift=5 ) self.assertEqual(len(ds), 86) - self._assert_eq(ds[5], (self.target1[80:90], None, self.target1[85:95])) + self._assert_eq( + ds[5], (self.target1[80:90], None, self.cov_st1, self.target1[85:95]) + ) # two target series ds = PastCovariatesShiftedDataset( target_series=[self.target1, self.target2], length=10, shift=5 ) self.assertEqual(len(ds), 272) - self._assert_eq(ds[5], (self.target1[80:90], None, self.target1[85:95])) self._assert_eq( - ds[141], (self.target2[130:140], None, self.target2[135:145]) + ds[5], (self.target1[80:90], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[141], + (self.target2[130:140], None, self.cov_st2, self.target2[135:145]), ) # two target series with custom max_nr_samples @@ -818,9 +907,12 @@ def test_past_covariates_shifted_dataset(self): max_samples_per_ts=50, ) self.assertEqual(len(ds), 100) - self._assert_eq(ds[5], (self.target1[80:90], None, self.target1[85:95])) self._assert_eq( - ds[55], (self.target2[130:140], None, self.target2[135:145]) + ds[5], (self.target1[80:90], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[55], + (self.target2[130:140], None, self.cov_st2, self.target2[135:145]), ) # two targets and one covariate @@ -837,47 +929,64 @@ def 
test_past_covariates_shifted_dataset(self): shift=5, ) self._assert_eq( - ds[5], (self.target1[80:90], self.cov1[80:90], self.target1[85:95]) + ds[5], + ( + self.target1[80:90], + self.cov1[80:90], + self.cov_st1, + self.target1[85:95], + ), ) self._assert_eq( ds[141], - (self.target2[130:140], self.cov2[130:140], self.target2[135:145]), + ( + self.target2[130:140], + self.cov2[130:140], + self.cov_st2, + self.target2[135:145], + ), ) # Should contain correct values even when covariates are not aligned - target1 = TimeSeries.from_values(np.random.randn(8)) + target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(10)) ds = PastCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) - self.assertTrue(all(ds[0][0] == target1.values()[-5:-2])) - self.assertTrue(all(ds[0][1] == cov1.values()[-7:-4])) - self.assertTrue(all(ds[0][2] == target1.values()[-3:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-7:-4]) + np.testing.assert_almost_equal(ds[0][2], self.cov_st2) + np.testing.assert_almost_equal(ds[0][3], target1.values()[-3:]) # Should also contain correct values when time-indexed with covariates not aligned times1 = pd.date_range(start="20090201", end="20090220", freq="D") times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) ds = PastCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) - self.assertTrue(all(ds[0][0] == target1.values()[-5:-2])) - self.assertTrue(all(ds[0][1] == cov1.values()[-7:-4])) - self.assertTrue(all(ds[0][2] == target1.values()[-3:])) + np.testing.assert_almost_equal(ds[0][0], 
target1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-7:-4]) + np.testing.assert_almost_equal(ds[0][2], self.cov_st2) + np.testing.assert_almost_equal(ds[0][3], target1.values()[-3:]) # Should fail if covariates are too short - target1 = TimeSeries.from_values(np.random.randn(8)) + target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(5)) ds = PastCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) with self.assertRaises(ValueError): - ds[0] + _ = ds[0] def test_future_covariates_shifted_dataset(self): # one target series @@ -885,16 +994,21 @@ def test_future_covariates_shifted_dataset(self): target_series=self.target1, length=10, shift=5 ) self.assertEqual(len(ds), 86) - self._assert_eq(ds[5], (self.target1[80:90], None, self.target1[85:95])) + self._assert_eq( + ds[5], (self.target1[80:90], None, self.cov_st1, self.target1[85:95]) + ) # two target series ds = FutureCovariatesShiftedDataset( target_series=[self.target1, self.target2], length=10, shift=5 ) self.assertEqual(len(ds), 272) - self._assert_eq(ds[5], (self.target1[80:90], None, self.target1[85:95])) self._assert_eq( - ds[141], (self.target2[130:140], None, self.target2[135:145]) + ds[5], (self.target1[80:90], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[141], + (self.target2[130:140], None, self.cov_st2, self.target2[135:145]), ) # two target series with custom max_nr_samples @@ -905,9 +1019,12 @@ def test_future_covariates_shifted_dataset(self): max_samples_per_ts=50, ) self.assertEqual(len(ds), 100) - self._assert_eq(ds[5], (self.target1[80:90], None, self.target1[85:95])) self._assert_eq( - ds[55], (self.target2[130:140], None, self.target2[135:145]) + ds[5], (self.target1[80:90], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[55], + (self.target2[130:140], None, self.cov_st2, self.target2[135:145]), ) 
# two targets and one covariate @@ -924,47 +1041,64 @@ def test_future_covariates_shifted_dataset(self): shift=5, ) self._assert_eq( - ds[5], (self.target1[80:90], self.cov1[85:95], self.target1[85:95]) + ds[5], + ( + self.target1[80:90], + self.cov1[85:95], + self.cov_st1, + self.target1[85:95], + ), ) self._assert_eq( ds[141], - (self.target2[130:140], self.cov2[135:145], self.target2[135:145]), + ( + self.target2[130:140], + self.cov2[135:145], + self.cov_st2, + self.target2[135:145], + ), ) # Should contain correct values even when covariates are not aligned - target1 = TimeSeries.from_values(np.random.randn(8)) + target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(10)) ds = FutureCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) - self.assertTrue(all(ds[0][0] == target1.values()[-5:-2])) - self.assertTrue(all(ds[0][1] == cov1.values()[-5:-2])) - self.assertTrue(all(ds[0][2] == target1.values()[-3:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][2], self.cov_st2) + np.testing.assert_almost_equal(ds[0][3], target1.values()[-3:]) # Should also contain correct values when time-indexed with covariates not aligned times1 = pd.date_range(start="20090201", end="20090220", freq="D") times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) ds = FutureCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) - self.assertTrue(all(ds[0][0] == target1.values()[-5:-2])) - self.assertTrue(all(ds[0][1] == cov1.values()[-5:-2])) - self.assertTrue(all(ds[0][2] == 
target1.values()[-3:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][2], self.cov_st2) + np.testing.assert_almost_equal(ds[0][3], target1.values()[-3:]) # Should fail if covariates are too short - target1 = TimeSeries.from_values(np.random.randn(8)) + target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(7)) ds = FutureCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) with self.assertRaises(ValueError): - ds[0] + _ = ds[0] def test_dual_covariates_shifted_dataset(self): # one target series @@ -973,7 +1107,8 @@ def test_dual_covariates_shifted_dataset(self): ) self.assertEqual(len(ds), 86) self._assert_eq( - ds[5], (self.target1[80:90], None, None, self.target1[85:95]) + ds[5], + (self.target1[80:90], None, None, self.cov_st1, self.target1[85:95]), ) # two target series @@ -982,10 +1117,18 @@ def test_dual_covariates_shifted_dataset(self): ) self.assertEqual(len(ds), 272) self._assert_eq( - ds[5], (self.target1[80:90], None, None, self.target1[85:95]) + ds[5], + (self.target1[80:90], None, None, self.cov_st1, self.target1[85:95]), ) self._assert_eq( - ds[141], (self.target2[130:140], None, None, self.target2[135:145]) + ds[141], + ( + self.target2[130:140], + None, + None, + self.cov_st2, + self.target2[135:145], + ), ) # two target series with custom max_nr_samples @@ -997,10 +1140,18 @@ def test_dual_covariates_shifted_dataset(self): ) self.assertEqual(len(ds), 100) self._assert_eq( - ds[5], (self.target1[80:90], None, None, self.target1[85:95]) + ds[5], + (self.target1[80:90], None, None, self.cov_st1, self.target1[85:95]), ) self._assert_eq( - ds[55], (self.target2[130:140], None, None, self.target2[135:145]) + ds[55], + ( + self.target2[130:140], + None, + None, + self.cov_st2, + self.target2[135:145], + ), ) 
# two targets and one covariate @@ -1022,6 +1173,7 @@ def test_dual_covariates_shifted_dataset(self): self.target1[80:90], self.cov1[80:90], self.cov1[85:95], + self.cov_st1, self.target1[85:95], ), ) @@ -1031,46 +1183,53 @@ def test_dual_covariates_shifted_dataset(self): self.target2[130:140], self.cov2[130:140], self.cov2[135:145], + self.cov_st2, self.target2[135:145], ), ) # Should contain correct values even when covariates are not aligned - target1 = TimeSeries.from_values(np.random.randn(8)) + target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(10)) ds = DualCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) - self.assertTrue(all(ds[0][0] == target1.values()[-5:-2])) - self.assertTrue(all(ds[0][1] == cov1.values()[-7:-4])) - self.assertTrue(all(ds[0][2] == cov1.values()[-5:-2])) - self.assertTrue(all(ds[0][3] == target1.values()[-3:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-7:-4]) + np.testing.assert_almost_equal(ds[0][2], cov1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][3], self.cov_st2) + np.testing.assert_almost_equal(ds[0][4], target1.values()[-3:]) # Should also contain correct values when time-indexed with covariates not aligned times1 = pd.date_range(start="20090201", end="20090220", freq="D") times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) ds = DualCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) - self.assertTrue(all(ds[0][0] == target1.values()[-5:-2])) - self.assertTrue(all(ds[0][1] == cov1.values()[-7:-4])) - self.assertTrue(all(ds[0][2] == 
cov1.values()[-5:-2])) - self.assertTrue(all(ds[0][3] == target1.values()[-3:])) + np.testing.assert_almost_equal(ds[0][0], target1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][1], cov1.values()[-7:-4]) + np.testing.assert_almost_equal(ds[0][2], cov1.values()[-5:-2]) + np.testing.assert_almost_equal(ds[0][3], self.cov_st2) + np.testing.assert_almost_equal(ds[0][4], target1.values()[-3:]) # Should fail if covariates are too short - target1 = TimeSeries.from_values(np.random.randn(8)) + target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + self.cov_st2_df + ) cov1 = TimeSeries.from_values(np.random.randn(7)) ds = DualCovariatesShiftedDataset( target_series=[target1], covariates=[cov1], length=3, shift=2 ) with self.assertRaises(ValueError): - ds[0] + _ = ds[0] def test_horizon_based_dataset(self): # one target series @@ -1081,7 +1240,9 @@ def test_horizon_based_dataset(self): lookback=2, ) self.assertEqual(len(ds), 20) - self._assert_eq(ds[5], (self.target1[65:85], None, self.target1[85:95])) + self._assert_eq( + ds[5], (self.target1[65:85], None, self.cov_st1, self.target1[85:95]) + ) # two target series ds = HorizonBasedDataset( @@ -1091,9 +1252,12 @@ def test_horizon_based_dataset(self): lookback=2, ) self.assertEqual(len(ds), 40) - self._assert_eq(ds[5], (self.target1[65:85], None, self.target1[85:95])) self._assert_eq( - ds[25], (self.target2[115:135], None, self.target2[135:145]) + ds[5], (self.target1[65:85], None, self.cov_st1, self.target1[85:95]) + ) + self._assert_eq( + ds[25], + (self.target2[115:135], None, self.cov_st2, self.target2[135:145]), ) # two targets and one covariate @@ -1111,11 +1275,22 @@ def test_horizon_based_dataset(self): lookback=2, ) self._assert_eq( - ds[5], (self.target1[65:85], self.cov1[65:85], self.target1[85:95]) + ds[5], + ( + self.target1[65:85], + self.cov1[65:85], + self.cov_st1, + self.target1[85:95], + ), ) self._assert_eq( ds[25], - (self.target2[115:135], self.cov2[115:135], 
self.target2[135:145]), + ( + self.target2[115:135], + self.cov2[115:135], + self.cov_st2, + self.target2[135:145], + ), ) def test_get_matching_index(self): @@ -1126,7 +1301,7 @@ def test_get_matching_index(self): times2 = pd.date_range(start="20100101", end="20100320", freq="D") target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) self.assertEqual(_get_matching_index(target, cov, idx=15), 5) @@ -1135,12 +1310,14 @@ def test_get_matching_index(self): times2 = pd.date_range(start="20090101", end="20110601", freq="M") target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ) + ).set_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) self.assertEqual(_get_matching_index(target, cov, idx=15), 15 - 7) # check integer-indexed series times2 = pd.RangeIndex(start=10, stop=90) - target = TimeSeries.from_values(np.random.randn(100)) + target = TimeSeries.from_values(np.random.randn(100)).set_static_covariates( + self.cov_st2_df + ) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) self.assertEqual(_get_matching_index(target, cov, idx=15), 5) From 3511b81191781be74d22e1275ca2e2a4918267cc Mon Sep 17 00:00:00 2001 From: dennisbader Date: Fri, 27 May 2022 10:18:14 +0200 Subject: [PATCH 09/26] fixed all downstream issues from new static covariates in datasets --- .../forecasting/pl_forecasting_module.py | 31 +------------------ darts/models/forecasting/rnn_model.py | 4 +-- darts/models/forecasting/tft_model.py | 2 -- darts/timeseries.py | 5 +-- 4 files changed, 6 insertions(+), 36 deletions(-) diff --git a/darts/models/forecasting/pl_forecasting_module.py b/darts/models/forecasting/pl_forecasting_module.py index 5d84542317..0a6d937ced 100644 --- a/darts/models/forecasting/pl_forecasting_module.py +++ 
b/darts/models/forecasting/pl_forecasting_module.py @@ -508,32 +508,6 @@ def _process_input_batch( ) = input_batch dim_variable = 2 - # TODO: remove when everything works - # x_past = torch.cat( - # [ - # tensor - # for tensor in [ - # past_target, - # past_covariates, - # historic_future_covariates, - # static_covariates, - # ] - # if tensor is not None - # ], - # dim=dim_variable, - # ) - # - # x_future = None - # if future_covariates is not None or static_covariates is not None: - # x_future = torch.cat( - # [ - # tensor - # for tensor in [future_covariates, static_covariates] - # if tensor is not None - # ], - # dim=dim_variable, - # ) - x_past = torch.cat( [ tensor @@ -546,10 +520,7 @@ def _process_input_batch( ], dim=dim_variable, ) - - x_future = future_covariates - x_static = static_covariates - return x_past, x_future, x_static + return x_past, future_covariates, static_covariates def _get_batch_prediction( self, n: int, input_batch: Tuple, roll_size: int diff --git a/darts/models/forecasting/rnn_model.py b/darts/models/forecasting/rnn_model.py index 424671250b..33fd9fc66c 100644 --- a/darts/models/forecasting/rnn_model.py +++ b/darts/models/forecasting/rnn_model.py @@ -103,7 +103,7 @@ def forward(self, x, h=None): return predictions, last_hidden_state def _produce_train_output(self, input_batch: Tuple): - past_target, historic_future_covariates, future_covariates = input_batch + past_target, historic_future_covariates, future_covariates, _ = input_batch # For the RNN we concatenate the past_target with the future_covariates # (they have the same length because we enforce a Shift dataset for RNNs) model_input = ( @@ -127,7 +127,7 @@ def _get_batch_prediction( """ This model is recurrent, so we have to write a specific way to obtain the time series forecasts of length n. 
""" - past_target, historic_future_covariates, future_covariates = input_batch + past_target, historic_future_covariates, future_covariates, _ = input_batch if historic_future_covariates is not None: # RNNs need as inputs (target[t] and covariates[t+1]) so here we shift the covariates diff --git a/darts/models/forecasting/tft_model.py b/darts/models/forecasting/tft_model.py index d0a1511139..0c9e368318 100644 --- a/darts/models/forecasting/tft_model.py +++ b/darts/models/forecasting/tft_model.py @@ -792,8 +792,6 @@ def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Modu axis=1, ) - # static_covariates = None # placeholder for future - self.output_dim = ( (future_target.shape[1], 1) if self.likelihood is None diff --git a/darts/timeseries.py b/darts/timeseries.py index 2bbf611714..34932d2c45 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -2151,13 +2151,14 @@ def set_static_covariates( "`covariates` must be either a pandas Series, DataFrame or None", logger, ) - # check if valid static covariates multivariate TimeSeries static covariatesx + # check if valid static covariates for multivariate TimeSeries if isinstance(covariates, pd.DataFrame): n_components = len(covariates.columns) raise_if( n_components > 1 and n_components != self.n_components, "When passing a multi-column pandas DataFrame, the number of columns must match the number of " - "components of the TimeSeries object (multivariate static covariates must map to each component).", + "components of the TimeSeries object (multivariate static covariates must map to each TimeSeries " + "component).", logger, ) elif isinstance(covariates, pd.Series): From eacaf3b9344b5caa89ffa3b3168b149b466d6324 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sat, 28 May 2022 10:45:38 +0200 Subject: [PATCH 10/26] added check for equal static covariates between fit and predict --- darts/models/forecasting/tft_model.py | 2 +- .../forecasting/torch_forecasting_model.py | 57 
+++++++++++-------- darts/tests/models/forecasting/test_TFT.py | 30 ++++++++++ .../test_timeseries_static_covariates.py | 2 +- 4 files changed, 66 insertions(+), 25 deletions(-) diff --git a/darts/models/forecasting/tft_model.py b/darts/models/forecasting/tft_model.py index 0c9e368318..3c6a616e86 100644 --- a/darts/models/forecasting/tft_model.py +++ b/darts/models/forecasting/tft_model.py @@ -410,7 +410,7 @@ def forward( } # Embedding and variable selection - if x_static is not None: + if self.static_variables: static_embedding = { name: x_static[:, 0, i].unsqueeze(-1) for i, name in enumerate(self.static_variables) diff --git a/darts/models/forecasting/torch_forecasting_model.py b/darts/models/forecasting/torch_forecasting_model.py index 8e3a23ab79..49c12c3cd4 100644 --- a/darts/models/forecasting/torch_forecasting_model.py +++ b/darts/models/forecasting/torch_forecasting_model.py @@ -1496,29 +1496,37 @@ def _basic_compare_sample(train_sample: Tuple, predict_sample: Tuple): For all models relying on one type of covariates only (Past, Future, Dual), we can rely on the fact that training/inference datasets have target and a covariate in first and second position to do the checks. """ - tgt_train, cov_train = train_sample[:2] - tgt_pred, cov_pred = predict_sample[:2] + tgt_train, cov_train, static_train = train_sample[:2] + (train_sample[-2],) + tgt_pred, cov_pred, static_pred = predict_sample[:2] + (predict_sample[-2],) raise_if_not( tgt_train.shape[-1] == tgt_pred.shape[-1], "The provided target has a dimension (width) that does not match the dimension " "of the target this model has been trained on.", ) - raise_if( - cov_train is not None and cov_pred is None, - "This model has been trained with covariates; some covariates of matching dimensionality are needed " - "for prediction.", - ) - raise_if( - cov_train is None and cov_pred is not None, - "This model has been trained without covariates. 
No covariates should be provided for prediction.", - ) - raise_if( - cov_train is not None - and cov_pred is not None - and cov_train.shape[-1] != cov_pred.shape[-1], - "The provided covariates must have dimensionality matching that of the covariates used for training " - "the model.", - ) + for (c_train, c_pred), c_descr in zip( + [(cov_train, cov_pred), (static_train, static_pred)], + ["past or future covariates", "static covariates"], + ): + raise_if( + c_train is not None and c_pred is None, + f"This model has been trained with {c_descr}; covariates of matching dimensionality are required " + f"for prediction.", + ) + raise_if( + c_train is None and c_pred is not None, + f"This model has been trained without {c_descr}. No {c_descr} should be provided for prediction.", + ) + raise_if( + c_train is not None + and c_pred is not None + and ( + c_train.shape[-1] != c_pred.shape[-1] + if c_descr != "static covariates" + else c_train.shape == c_pred.shape + ), + f"The provided {c_descr} must have dimensionality matching that of the covariates used for training " + "the model.", + ) def _mixed_compare_sample(train_sample: Tuple, predict_sample: Tuple): @@ -1538,13 +1546,16 @@ def _mixed_compare_sample(train_sample: Tuple, predict_sample: Tuple): "past_covariates", "historic_future_covariates", "future_covariates", + "static_covariates", ] train_has_ds = [ds is not None for ds in train_sample[:-1]] - predict_has_ds = [ds is not None for ds in predict_sample[:4]] + predict_has_ds = [ + ds is not None for ds in predict_sample[:4] + (predict_sample[5],) + ] train_datasets = train_sample[:-1] - predict_datasets = predict_sample[:4] + predict_datasets = predict_sample[:4] + (predict_sample[5],) tgt_train, tgt_pred = train_datasets[0], predict_datasets[0] raise_if_not( @@ -1558,18 +1569,18 @@ def _mixed_compare_sample(train_sample: Tuple, predict_sample: Tuple): ): raise_if( ds_in_train and not ds_in_predict and ds_in_train, - f"This model has been trained with {ds_name}; 
some {ds_name} of matching dimensionality are needed " + f"This model has been trained with `{ds_name}`; some `{ds_name}` of matching dimensionality are needed " f"for prediction.", ) raise_if( ds_in_train and not ds_in_predict and ds_in_predict, - f"This model has been trained without {ds_name}; No {ds_name} should be provided for prediction.", + f"This model has been trained without `{ds_name}`; No `{ds_name}` should be provided for prediction.", ) raise_if( ds_in_train and ds_in_predict and train_datasets[idx].shape[-1] != predict_datasets[idx].shape[-1], - f"The provided {ds_name} must have dimensionality that of the {ds_name} used for training the model.", + f"The provided `{ds_name}` must have equal dimensionality as the `{ds_name}` used for training the model.", ) diff --git a/darts/tests/models/forecasting/test_TFT.py b/darts/tests/models/forecasting/test_TFT.py index 3cda144c16..5cdd35b2e7 100644 --- a/darts/tests/models/forecasting/test_TFT.py +++ b/darts/tests/models/forecasting/test_TFT.py @@ -1,4 +1,6 @@ import numpy as np +import pandas as pd +import pytest from darts import TimeSeries from darts.dataprocessing.transformers import Scaler @@ -163,6 +165,34 @@ def test_mixed_covariates_and_accuracy(self): kwargs_tft=kwargs_TFT_full_coverage, ) + def test_static_covariates_support(self): + target = tg.sine_timeseries(length=2, freq="h") + target.set_static_covariates(pd.Series([0.0, 1.0], index=["st1", "st2"])) + + # should work with cyclic encoding for time index + model = TFTModel( + input_chunk_length=1, + output_chunk_length=1, + add_encoders={"cyclic": {"future": "hour"}}, + pl_trainer_kwargs={"fast_dev_run": True}, + ) + model.fit(target, verbose=False) + assert len(model.model.static_variables) == len(target.static_covariates) + + model.predict(n=1, series=target, verbose=False) + + # raise an error when trained with static covariates of wrong dimensionality + target.set_static_covariates( + pd.concat([target.static_covariates] * 2, axis=0) + ) + 
with pytest.raises(ValueError): + model.predict(n=1, series=target, verbose=False) + + # raise an error when trained with static covariates and trying to predict without + target.set_static_covariates(None) + with pytest.raises(ValueError): + model.predict(n=1, series=target, verbose=False) + def helper_generate_multivariate_case_data(self, season_length, n_repeat): """generates multivariate test case data. Target series is a sine wave stacked with a repeating linear curve of equal seasonal length. Covariates are datetime attributes for 'hours'. diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index 62deb37e3c..a994e577b7 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -161,7 +161,7 @@ def test_set_static_covariates_multivariate(self): static_covs[static_covs.columns[0]].to_frame() ) - # from univariate static covariates + # from multivariate static covariates ts_multi = ts_multi.set_static_covariates(static_covs) assert ts_multi.static_covariates.equals(static_covs) From 55c5090ef1e184b94d6f38e26c3efd90c6a66ebe Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sat, 28 May 2022 11:48:17 +0200 Subject: [PATCH 11/26] added tests for passing static covariates in TimeSeries methods --- darts/tests/models/forecasting/test_TFT.py | 8 +++-- .../test_timeseries_static_covariates.py | 32 +++++++++++++++++-- darts/timeseries.py | 4 +++ 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/darts/tests/models/forecasting/test_TFT.py b/darts/tests/models/forecasting/test_TFT.py index 5cdd35b2e7..23a8dd7c3d 100644 --- a/darts/tests/models/forecasting/test_TFT.py +++ b/darts/tests/models/forecasting/test_TFT.py @@ -167,7 +167,9 @@ def test_mixed_covariates_and_accuracy(self): def test_static_covariates_support(self): target = tg.sine_timeseries(length=2, freq="h") - target.set_static_covariates(pd.Series([0.0, 1.0], index=["st1", "st2"])) 
+ target = target.set_static_covariates( + pd.Series([0.0, 1.0], index=["st1", "st2"]) + ) # should work with cyclic encoding for time index model = TFTModel( @@ -182,14 +184,14 @@ def test_static_covariates_support(self): model.predict(n=1, series=target, verbose=False) # raise an error when trained with static covariates of wrong dimensionality - target.set_static_covariates( + target = target.set_static_covariates( pd.concat([target.static_covariates] * 2, axis=0) ) with pytest.raises(ValueError): model.predict(n=1, series=target, verbose=False) # raise an error when trained with static covariates and trying to predict without - target.set_static_covariates(None) + target = target.set_static_covariates(None) with pytest.raises(ValueError): model.predict(n=1, series=target, verbose=False) diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index a994e577b7..d4d50adfbc 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -126,7 +126,7 @@ def test_set_static_covariates_univariate(self): static_covs = pd.Series([0.0, 1.0], index=["st1", "st2"]) # inplace from Series for chained calls - ts.set_static_covariates(static_covs) + ts = ts.set_static_covariates(static_covs) assert ts.static_covariates.equals(static_covs.to_frame()) # from Series @@ -143,12 +143,12 @@ def test_set_static_covariates_univariate(self): # only pd.Series, pd.DataFrame or None with pytest.raises(ValueError): - ts.set_static_covariates([1, 2, 3]) + _ = ts.set_static_covariates([1, 2, 3]) # multivariate does not work with univariate TimeSeries with pytest.raises(ValueError): static_covs_multi = pd.concat([static_covs] * 2, axis=1) - ts.set_static_covariates(static_covs_multi) + _ = ts.set_static_covariates(static_covs_multi) def test_set_static_covariates_multivariate(self): ts = linear_timeseries(length=10) @@ -168,3 +168,29 @@ def 
test_set_static_covariates_multivariate(self): # raise an error if multivariate static covariates columns don't match the number of components in the series with pytest.raises(ValueError): _ = ts_multi.set_static_covariates(pd.concat([static_covs] * 2, axis=1)) + + def test_ts_methods_with_static_covariates(self): + ts = linear_timeseries(length=10).astype("float64") + static_covs = pd.Series([0, 1], index=["st1", "st2"]).astype(int) + ts = ts.set_static_covariates(static_covs) + + assert ts.static_covariates.dtypes[0] == "float64" + # ts = ts.astype("float32") + # assert ts.static_covariates.dtypes[0] == "float32" + + ts_stochastic = ts.from_times_and_values( + times=ts.time_index, values=np.random.randn(10, 1, 3) + ) + ts_stochastic = ts_stochastic.set_static_covariates(static_covs) + + ts_check = ts.copy() + assert ts_check.static_covariates.equals(ts.static_covariates) + + ts_check = ts.head() + assert ts_check.static_covariates.equals(ts.static_covariates) + + ts_check = ts.tail() + assert ts_check.static_covariates.equals(ts.static_covariates) + + ts_check = ts_stochastic.quantile_timeseries() + assert ts_check.static_covariates.equals(ts_stochastic.static_covariates) diff --git a/darts/timeseries.py b/darts/timeseries.py index 34932d2c45..7511dc7205 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -198,6 +198,9 @@ def __init__(self, xa: xr.DataArray): self._freq = 1 self._freq_str = None + if STATIC_COV_TAG not in self._xa.attrs: + self._xa.attrs[STATIC_COV_TAG] = None + """ Factory Methods =============== @@ -1151,6 +1154,7 @@ def quantile_timeseries(self, quantile=0.5, **kwargs) -> "TimeSeries": new_data, dims=self._xa.dims, coords={self._xa.dims[0]: self.time_index, DIMS[1]: pd.Index(cnames)}, + attrs=self._xa.attrs, ) return self.__class__(new_xa) From cc07f5f45f4704b63363d9345e6eafb009b48143 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sat, 28 May 2022 16:15:44 +0200 Subject: [PATCH 12/26] added static covariate support for stacking 
TimeSeries --- darts/dataprocessing/transformers/boxcox.py | 4 +- darts/dataprocessing/transformers/scaler.py | 6 +- darts/tests/models/forecasting/test_TFT.py | 6 +- darts/tests/test_datasets.py | 84 ++++----- .../test_timeseries_static_covariates.py | 90 +++++++-- darts/timeseries.py | 177 ++++++++++++++---- 6 files changed, 266 insertions(+), 101 deletions(-) diff --git a/darts/dataprocessing/transformers/boxcox.py b/darts/dataprocessing/transformers/boxcox.py index dbf1eb8598..22067ec6e5 100644 --- a/darts/dataprocessing/transformers/boxcox.py +++ b/darts/dataprocessing/transformers/boxcox.py @@ -167,7 +167,7 @@ def ts_transform( ) return series.with_values( BoxCox._reshape_out(series, transformed_vals, component_mask=component_mask) - ).set_static_covariates(series.static_covariates) + ) @staticmethod def ts_inverse_transform( @@ -185,7 +185,7 @@ def ts_inverse_transform( BoxCox._reshape_out( series, inv_transformed_vals, component_mask=component_mask ) - ).set_static_covariates(series.static_covariates) + ) def fit( self, series: Union[TimeSeries, Sequence[TimeSeries]], **kwargs diff --git a/darts/dataprocessing/transformers/scaler.py b/darts/dataprocessing/transformers/scaler.py index c2b1c32383..ae2da28d29 100644 --- a/darts/dataprocessing/transformers/scaler.py +++ b/darts/dataprocessing/transformers/scaler.py @@ -106,7 +106,8 @@ def ts_transform(series: TimeSeries, transformer, **kwargs) -> TimeSeries: values=transformed_vals, fill_missing_dates=False, columns=series.columns, - ).set_static_covariates(series.static_covariates) + static_covariates=series.static_covariates, + ) @staticmethod def ts_inverse_transform( @@ -126,7 +127,8 @@ def ts_inverse_transform( values=inv_transformed_vals, fill_missing_dates=False, columns=series.columns, - ).set_static_covariates(series.static_covariates) + static_covariates=series.static_covariates, + ) @staticmethod def ts_fit(series: TimeSeries, transformer, *args, **kwargs) -> Any: diff --git 
a/darts/tests/models/forecasting/test_TFT.py b/darts/tests/models/forecasting/test_TFT.py index 23a8dd7c3d..5488717046 100644 --- a/darts/tests/models/forecasting/test_TFT.py +++ b/darts/tests/models/forecasting/test_TFT.py @@ -167,7 +167,7 @@ def test_mixed_covariates_and_accuracy(self): def test_static_covariates_support(self): target = tg.sine_timeseries(length=2, freq="h") - target = target.set_static_covariates( + target = target.with_static_covariates( pd.Series([0.0, 1.0], index=["st1", "st2"]) ) @@ -184,14 +184,14 @@ def test_static_covariates_support(self): model.predict(n=1, series=target, verbose=False) # raise an error when trained with static covariates of wrong dimensionality - target = target.set_static_covariates( + target = target.with_static_covariates( pd.concat([target.static_covariates] * 2, axis=0) ) with pytest.raises(ValueError): model.predict(n=1, series=target, verbose=False) # raise an error when trained with static covariates and trying to predict without - target = target.set_static_covariates(None) + target = target.with_static_covariates(None) with pytest.raises(ValueError): model.predict(n=1, series=target, verbose=False) diff --git a/darts/tests/test_datasets.py b/darts/tests/test_datasets.py index ac1cd1a9e8..8b9af75fb6 100644 --- a/darts/tests/test_datasets.py +++ b/darts/tests/test_datasets.py @@ -36,10 +36,10 @@ if TORCH_AVAILABLE: class DatasetTestCase(DartsBaseTestClass): - target1 = gaussian_timeseries(length=100).set_static_covariates( + target1 = gaussian_timeseries(length=100).with_static_covariates( pd.Series([0, 1], index=["st1", "st2"]) ) - target2 = gaussian_timeseries(length=150).set_static_covariates( + target2 = gaussian_timeseries(length=150).with_static_covariates( pd.Series([2, 3], index=["st1", "st2"]) ) cov_st1 = target1.static_covariates.T.values @@ -110,7 +110,7 @@ def test_past_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - 
).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) short_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -148,7 +148,7 @@ def test_past_covariates_inference_dataset(self): # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) covariate = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -197,7 +197,7 @@ def test_future_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) short_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -226,7 +226,7 @@ def test_future_covariates_inference_dataset(self): # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) covariate = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -270,7 +270,7 @@ def test_dual_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) short_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -308,7 +308,7 @@ def test_dual_covariates_inference_dataset(self): # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) covariate = 
TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -336,7 +336,7 @@ def test_mixed_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) past_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -383,7 +383,7 @@ def test_mixed_covariates_inference_dataset(self): # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) past_cov = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -417,7 +417,7 @@ def test_split_covariates_inference_dataset(self): target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) past_cov = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) ) @@ -463,7 +463,7 @@ def test_split_covariates_inference_dataset(self): # Should also work for integer-indexed series target = TimeSeries.from_times_and_values( pd.RangeIndex(start=10, stop=50, step=1), np.random.randn(40) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) past_cov = TimeSeries.from_times_and_values( pd.RangeIndex(start=20, stop=80, step=1), np.random.randn(60) ) @@ -567,7 +567,7 @@ def test_past_covariates_sequential_dataset(self): times2 = pd.date_range(start="20120101", end="20150101", freq="D") target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) ds = PastCovariatesSequentialDataset( 
target_series=target, @@ -583,7 +583,7 @@ def test_past_covariates_sequential_dataset(self): times2 = pd.RangeIndex(start=200, stop=400, step=1) target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) ds = PastCovariatesSequentialDataset( target_series=target, @@ -599,7 +599,7 @@ def test_past_covariates_sequential_dataset(self): times2 = pd.date_range(start="20090101", end="20110106", freq="D") target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) ds = PastCovariatesSequentialDataset( target_series=target, @@ -616,7 +616,7 @@ def test_past_covariates_sequential_dataset(self): times2 = pd.RangeIndex(start=50, stop=250, step=1) target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) ds = PastCovariatesSequentialDataset( target_series=target, @@ -680,10 +680,10 @@ def test_future_covariates_sequential_dataset(self): # two targets and two covariates; covariates not aligned, must contain correct values target1 = TimeSeries.from_values( np.random.randn(100) - ).set_static_covariates(self.cov_st2_df) - target2 = TimeSeries.from_values(np.random.randn(50)).set_static_covariates( - self.cov_st2_df - ) + ).with_static_covariates(self.cov_st2_df) + target2 = TimeSeries.from_values( + np.random.randn(50) + ).with_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_values(np.random.randn(120)) cov2 = TimeSeries.from_values(np.random.randn(80)) @@ -709,7 +709,7 @@ def test_future_covariates_sequential_dataset(self): 
times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) @@ -727,7 +727,7 @@ def test_future_covariates_sequential_dataset(self): np.testing.assert_almost_equal(ds[0][3], target1.values()[-2:]) # Should fail if covariates are not long enough - target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + target1 = TimeSeries.from_values(np.random.randn(8)).with_static_covariates( self.cov_st2_df ) cov1 = TimeSeries.from_values(np.random.randn(7)) @@ -811,10 +811,10 @@ def test_dual_covariates_sequential_dataset(self): # two targets and two covariates; covariates not aligned, must contain correct values target1 = TimeSeries.from_values( np.random.randn(100) - ).set_static_covariates(self.cov_st2_df) - target2 = TimeSeries.from_values(np.random.randn(50)).set_static_covariates( - self.cov_st2_df - ) + ).with_static_covariates(self.cov_st2_df) + target2 = TimeSeries.from_values( + np.random.randn(50) + ).with_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_values(np.random.randn(120)) cov2 = TimeSeries.from_values(np.random.randn(80)) @@ -842,7 +842,7 @@ def test_dual_covariates_sequential_dataset(self): times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) @@ -861,7 +861,7 @@ def test_dual_covariates_sequential_dataset(self): np.testing.assert_almost_equal(ds[0][4], target1.values()[-2:]) # Should fail if covariates are not long enough - target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + target1 = 
TimeSeries.from_values(np.random.randn(8)).with_static_covariates( self.cov_st2_df ) cov1 = TimeSeries.from_values(np.random.randn(7)) @@ -948,7 +948,7 @@ def test_past_covariates_shifted_dataset(self): ) # Should contain correct values even when covariates are not aligned - target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + target1 = TimeSeries.from_values(np.random.randn(8)).with_static_covariates( self.cov_st2_df ) cov1 = TimeSeries.from_values(np.random.randn(10)) @@ -965,7 +965,7 @@ def test_past_covariates_shifted_dataset(self): times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) @@ -978,7 +978,7 @@ def test_past_covariates_shifted_dataset(self): np.testing.assert_almost_equal(ds[0][3], target1.values()[-3:]) # Should fail if covariates are too short - target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + target1 = TimeSeries.from_values(np.random.randn(8)).with_static_covariates( self.cov_st2_df ) cov1 = TimeSeries.from_values(np.random.randn(5)) @@ -1060,7 +1060,7 @@ def test_future_covariates_shifted_dataset(self): ) # Should contain correct values even when covariates are not aligned - target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + target1 = TimeSeries.from_values(np.random.randn(8)).with_static_covariates( self.cov_st2_df ) cov1 = TimeSeries.from_values(np.random.randn(10)) @@ -1077,7 +1077,7 @@ def test_future_covariates_shifted_dataset(self): times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( 
times2, np.random.randn(len(times2)) ) @@ -1090,7 +1090,7 @@ def test_future_covariates_shifted_dataset(self): np.testing.assert_almost_equal(ds[0][3], target1.values()[-3:]) # Should fail if covariates are too short - target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + target1 = TimeSeries.from_values(np.random.randn(8)).with_static_covariates( self.cov_st2_df ) cov1 = TimeSeries.from_values(np.random.randn(7)) @@ -1189,7 +1189,7 @@ def test_dual_covariates_shifted_dataset(self): ) # Should contain correct values even when covariates are not aligned - target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + target1 = TimeSeries.from_values(np.random.randn(8)).with_static_covariates( self.cov_st2_df ) cov1 = TimeSeries.from_values(np.random.randn(10)) @@ -1207,7 +1207,7 @@ def test_dual_covariates_shifted_dataset(self): times2 = pd.date_range(start="20090201", end="20090222", freq="D") target1 = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov1 = TimeSeries.from_times_and_values( times2, np.random.randn(len(times2)) ) @@ -1221,7 +1221,7 @@ def test_dual_covariates_shifted_dataset(self): np.testing.assert_almost_equal(ds[0][4], target1.values()[-3:]) # Should fail if covariates are too short - target1 = TimeSeries.from_values(np.random.randn(8)).set_static_covariates( + target1 = TimeSeries.from_values(np.random.randn(8)).with_static_covariates( self.cov_st2_df ) cov1 = TimeSeries.from_values(np.random.randn(7)) @@ -1301,7 +1301,7 @@ def test_get_matching_index(self): times2 = pd.date_range(start="20100101", end="20100320", freq="D") target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) 
self.assertEqual(_get_matching_index(target, cov, idx=15), 5) @@ -1310,14 +1310,14 @@ def test_get_matching_index(self): times2 = pd.date_range(start="20090101", end="20110601", freq="M") target = TimeSeries.from_times_and_values( times1, np.random.randn(len(times1)) - ).set_static_covariates(self.cov_st2_df) + ).with_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) self.assertEqual(_get_matching_index(target, cov, idx=15), 15 - 7) # check integer-indexed series times2 = pd.RangeIndex(start=10, stop=90) - target = TimeSeries.from_values(np.random.randn(100)).set_static_covariates( - self.cov_st2_df - ) + target = TimeSeries.from_values( + np.random.randn(100) + ).with_static_covariates(self.cov_st2_df) cov = TimeSeries.from_times_and_values(times2, np.random.randn(len(times2))) self.assertEqual(_get_matching_index(target, cov, idx=15), 5) diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index d4d50adfbc..28d3c42eb6 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -5,11 +5,12 @@ import pytest from darts import TimeSeries +from darts.dataprocessing.transformers import BoxCox, Scaler from darts.tests.base_test_class import DartsBaseTestClass from darts.utils.timeseries_generation import _generate_index, linear_timeseries -class TimeSeriesMultivariateTestCase(DartsBaseTestClass): +class TimeSeriesStaticCovariateTestCase(DartsBaseTestClass): @classmethod def setUpClass(cls): super().setUpClass() @@ -121,67 +122,111 @@ def test_timeseries_from_longitudinal_df(self): value_cols=value_cols, ) - def test_set_static_covariates_univariate(self): + def test_with_static_covariates_univariate(self): ts = linear_timeseries(length=10) static_covs = pd.Series([0.0, 1.0], index=["st1", "st2"]) # inplace from Series for chained calls - ts = ts.set_static_covariates(static_covs) + ts = 
ts.with_static_covariates(static_covs) assert ts.static_covariates.equals(static_covs.to_frame()) # from Series - ts = ts.set_static_covariates(static_covs) + ts = ts.with_static_covariates(static_covs) assert ts.static_covariates.equals(static_covs.to_frame()) # from DataFrame - ts = ts.set_static_covariates(static_covs.to_frame()) + ts = ts.with_static_covariates(static_covs.to_frame()) assert ts.static_covariates.equals(static_covs.to_frame()) # with None - ts = ts.set_static_covariates(None) + ts = ts.with_static_covariates(None) assert ts.static_covariates is None # only pd.Series, pd.DataFrame or None with pytest.raises(ValueError): - _ = ts.set_static_covariates([1, 2, 3]) + _ = ts.with_static_covariates([1, 2, 3]) # multivariate does not work with univariate TimeSeries with pytest.raises(ValueError): static_covs_multi = pd.concat([static_covs] * 2, axis=1) - _ = ts.set_static_covariates(static_covs_multi) + _ = ts.with_static_covariates(static_covs_multi) - def test_set_static_covariates_multivariate(self): + def test_with_static_covariates_multivariate(self): ts = linear_timeseries(length=10) ts_multi = ts.stack(ts) static_covs = pd.DataFrame([[0.0, 1.0], [0.0, 1.0]], index=["st1", "st2"]) # from univariate static covariates - ts_multi = ts_multi.set_static_covariates(static_covs[static_covs.columns[0]]) + ts_multi = ts_multi.with_static_covariates(static_covs[static_covs.columns[0]]) assert ts_multi.static_covariates.equals( static_covs[static_covs.columns[0]].to_frame() ) # from multivariate static covariates - ts_multi = ts_multi.set_static_covariates(static_covs) + ts_multi = ts_multi.with_static_covariates(static_covs) assert ts_multi.static_covariates.equals(static_covs) # raise an error if multivariate static covariates columns don't match the number of components in the series with pytest.raises(ValueError): - _ = ts_multi.set_static_covariates(pd.concat([static_covs] * 2, axis=1)) + _ = ts_multi.with_static_covariates(pd.concat([static_covs] * 2, 
axis=1)) + + def test_stack(self): + ts_uni = linear_timeseries(length=10) + ts_multi = ts_uni.stack(ts_uni) + + static_covs_uni1 = pd.Series([0, 1], index=["st1", "st2"]).astype(int) + static_covs_uni2 = pd.Series([3, 4], index=["st3", "st4"]).astype(int) + static_covs_uni3 = pd.Series([2, 3, 4], index=["st1", "st2", "st3"]).astype(int) + + static_covs_multi = pd.DataFrame([[0, 0], [1, 1]], index=["st1", "st2"]).astype( + int + ) + + ts_uni = ts_uni.with_static_covariates(static_covs_uni1) + ts_multi = ts_multi.with_static_covariates(static_covs_multi) + + # valid static covariates for concatenation/stack + ts_stacked1 = ts_uni.stack(ts_uni) + assert ts_stacked1.static_covariates.equals( + pd.concat([ts_uni.static_covariates] * 2, axis=1) + ) + + # valid static covariates for concatenation/stack: first only has static covs + # -> this gives multivar ts with univar static covs + ts_stacked2 = ts_uni.stack(ts_uni.with_static_covariates(None)) + assert ts_stacked2.static_covariates.equals(ts_uni.static_covariates) + + # mismatch between column names + with pytest.raises(ValueError): + _ = ts_uni.stack(ts_uni.with_static_covariates(static_covs_uni2)) + + # mismatch between number of covariates + with pytest.raises(ValueError): + _ = ts_uni.stack(ts_uni.with_static_covariates(static_covs_uni3)) + + # valid univar ts with univar static covariates + multivar ts with multivar static covariates + ts_stacked3 = ts_uni.stack(ts_multi) + assert ts_stacked3.static_covariates.equals( + pd.concat([ts_uni.static_covariates, ts_multi.static_covariates], axis=1) + ) + + # invalid univar ts with univar static covariates + multivar ts with univar static covariates + with pytest.raises(ValueError): + _ = ts_uni.stack(ts_multi.with_static_covariates(static_covs_uni1)) def test_ts_methods_with_static_covariates(self): ts = linear_timeseries(length=10).astype("float64") static_covs = pd.Series([0, 1], index=["st1", "st2"]).astype(int) - ts = ts.set_static_covariates(static_covs) + ts = 
ts.with_static_covariates(static_covs) assert ts.static_covariates.dtypes[0] == "float64" - # ts = ts.astype("float32") - # assert ts.static_covariates.dtypes[0] == "float32" + ts = ts.astype("float32") + assert ts.static_covariates.dtypes[0] == "float32" ts_stochastic = ts.from_times_and_values( times=ts.time_index, values=np.random.randn(10, 1, 3) ) - ts_stochastic = ts_stochastic.set_static_covariates(static_covs) + ts_stochastic = ts_stochastic.with_static_covariates(static_covs) ts_check = ts.copy() assert ts_check.static_covariates.equals(ts.static_covariates) @@ -194,3 +239,16 @@ def test_ts_methods_with_static_covariates(self): ts_check = ts_stochastic.quantile_timeseries() assert ts_check.static_covariates.equals(ts_stochastic.static_covariates) + + def test_scalers_with_static_covariates(self): + ts = linear_timeseries(start_value=1.0, end_value=2.0, length=10) + static_covs = pd.Series([0.0, 2.0], index=["st1", "st2"]) + ts = ts.with_static_covariates(static_covs) + + for scaler_cls in [Scaler, BoxCox]: + scaler = scaler_cls() + ts_scaled = scaler.fit_transform(ts) + assert ts_scaled.static_covariates.equals(ts.static_covariates) + + ts_inv = scaler.inverse_transform(ts_scaled) + assert ts_inv.static_covariates.equals(ts.static_covariates) diff --git a/darts/timeseries.py b/darts/timeseries.py index 7511dc7205..a7fca6492d 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -198,8 +198,33 @@ def __init__(self, xa: xr.DataArray): self._freq = 1 self._freq_str = None - if STATIC_COV_TAG not in self._xa.attrs: - self._xa.attrs[STATIC_COV_TAG] = None + # check static covariates + static_covariates = self._xa.attrs.get(STATIC_COV_TAG, None) + raise_if_not( + isinstance(static_covariates, (pd.Series, pd.DataFrame)) + or static_covariates is None, + "`static_covariates` must be either a pandas Series, DataFrame or None", + logger, + ) + # check if valid static covariates for multivariate TimeSeries + if isinstance(static_covariates, pd.DataFrame): + 
n_components = len(static_covariates.columns) + raise_if( + n_components > 1 and n_components != self.n_components, + "When passing a multi-column pandas DataFrame, the number of columns must match the number of " + "components of the TimeSeries object (multivariate static covariates must map to each TimeSeries " + "component).", + logger, + ) + elif isinstance(static_covariates, pd.Series): + static_covariates = static_covariates.to_frame() + else: # None + pass + self._xa.attrs[STATIC_COV_TAG] = ( + static_covariates.astype(self.dtype) + if static_covariates is not None + else static_covariates + ) """ Factory Methods @@ -305,6 +330,7 @@ def _clean_component_list(columns) -> List[str]: time_index_name: xa_.get_index(time_index_name), DIMS[1]: columns_list, }, + attrs=xa_.attrs, ) # We cast the array to float @@ -324,6 +350,7 @@ def from_csv( fill_missing_dates: Optional[bool] = False, freq: Optional[str] = None, fillna_value: Optional[float] = None, + static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, **kwargs, ) -> "TimeSeries": """ @@ -350,6 +377,9 @@ def from_csv( fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`. fillna_value Optionally, a numeric value to fill missing values (NaNs) with. + static_covariates + Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column + pandas DataFrame with index representing the uni/multivariate static variables. 
**kwargs Optional arguments to be passed to `pandas.read_csv` function @@ -367,6 +397,7 @@ def from_csv( fill_missing_dates=fill_missing_dates, freq=freq, fillna_value=fillna_value, + static_covariates=static_covariates, ) @classmethod @@ -378,6 +409,7 @@ def from_dataframe( fill_missing_dates: Optional[bool] = False, freq: Optional[str] = None, fillna_value: Optional[float] = None, + static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, ) -> "TimeSeries": """ Build a deterministic TimeSeries instance built from a selection of columns of a DataFrame. @@ -406,6 +438,9 @@ def from_dataframe( fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`. fillna_value Optionally, a numeric value to fill missing values (NaNs) with. + static_covariates + Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column + pandas DataFrame with index representing the uni/multivariate static variables. Returns ------- @@ -491,6 +526,7 @@ def from_dataframe( series_df.values[:, :, np.newaxis], dims=(time_index.name,) + DIMS[-2:], coords={time_index.name: time_index, DIMS[1]: series_df.columns}, + attrs={STATIC_COV_TAG: static_covariates}, ) return cls.from_xarray( @@ -609,7 +645,8 @@ def from_longitudinal_dataframe( fill_missing_dates=fill_missing_dates, freq=freq, fillna_value=fillna_value, - ).set_static_covariates(static_covs) + static_covariates=static_covs, + ) for static_covs, split in splits ] @@ -620,6 +657,7 @@ def from_series( fill_missing_dates: Optional[bool] = False, freq: Optional[str] = None, fillna_value: Optional[float] = None, + static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, ) -> "TimeSeries": """ Build a univariate deterministic series from a pandas Series. @@ -642,6 +680,9 @@ def from_series( fill in missing values if some dates are missing and `fill_missing_dates` is set to `True`. 
fillna_value Optionally, a numeric value to fill missing values (NaNs) with. + static_covariates + Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column + pandas DataFrame with index representing the uni/multivariate static variables. Returns ------- @@ -657,6 +698,7 @@ def from_series( fill_missing_dates=fill_missing_dates, freq=freq, fillna_value=fillna_value, + static_covariates=static_covariates, ) @classmethod @@ -668,6 +710,7 @@ def from_times_and_values( freq: Optional[str] = None, columns: Optional[pd._typing.Axes] = None, fillna_value: Optional[float] = None, + static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, ) -> "TimeSeries": """ Build a series from a time index and value array. @@ -694,6 +737,9 @@ def from_times_and_values( Columns to be used by the underlying pandas DataFrame. fillna_value Optionally, a numeric value to fill missing values (NaNs) with. + static_covariates + Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column + pandas DataFrame with index representing the uni/multivariate static variables. Returns ------- @@ -720,7 +766,12 @@ def from_times_and_values( if columns is not None: coords[DIMS[1]] = columns - xa = xr.DataArray(values, dims=(times_name,) + DIMS[-2:], coords=coords) + xa = xr.DataArray( + values, + dims=(times_name,) + DIMS[-2:], + coords=coords, + attrs={STATIC_COV_TAG: static_covariates}, + ) return cls.from_xarray( xa=xa, @@ -735,6 +786,7 @@ def from_values( values: np.ndarray, columns: Optional[pd._typing.Axes] = None, fillna_value: Optional[float] = None, + static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, ) -> "TimeSeries": """ Build an integer-indexed series from an array of values. @@ -750,6 +802,9 @@ def from_values( Columns to be used by the underlying pandas DataFrame. fillna_value Optionally, a numeric value to fill missing values (NaNs) with. 
+ static_covariates + Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column + pandas DataFrame with index representing the uni/multivariate static variables. Returns ------- @@ -768,6 +823,7 @@ def from_values( freq=None, columns=columns, fillna_value=fillna_value, + static_covariates=static_covariates, ) @classmethod @@ -2081,6 +2137,7 @@ def append(self, other: "TimeSeries") -> "TimeSeries": self._time_dim: self._time_index.append(other.time_index), DIMS[1]: self.components, }, + attrs=self._xa.attrs, ) # new_xa = xr.concat(objs=[self._xa, other_xa], dim=str(self._time_dim)) @@ -2142,35 +2199,23 @@ def with_values(self, values: np.ndarray) -> "TimeSeries": "Received: {}, expected: {}".format(values.shape, self._xa.values.shape), ) - new_xa = xr.DataArray(values, dims=self._xa.dims, coords=self._xa.coords) + new_xa = xr.DataArray( + values, dims=self._xa.dims, coords=self._xa.coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) - def set_static_covariates( + def with_static_covariates( self, covariates: Optional[Union[pd.Series, pd.DataFrame]] ): - raise_if( - not isinstance(covariates, (pd.Series, pd.DataFrame)) - and covariates is not None, - "`covariates` must be either a pandas Series, DataFrame or None", - logger, - ) - # check if valid static covariates for multivariate TimeSeries - if isinstance(covariates, pd.DataFrame): - n_components = len(covariates.columns) - raise_if( - n_components > 1 and n_components != self.n_components, - "When passing a multi-column pandas DataFrame, the number of columns must match the number of " - "components of the TimeSeries object (multivariate static covariates must map to each TimeSeries " - "component).", - logger, + return self.__class__( + xr.DataArray( + self._xa.values, + dims=self._xa.dims, + coords=self._xa.coords, + attrs={STATIC_COV_TAG: covariates}, ) - elif isinstance(covariates, pd.Series): - covariates = covariates.to_frame() - 
self._xa.attrs["static_covariates"] = ( - covariates.astype(self.dtype) if covariates is not None else covariates ) - return self def stack(self, other: "TimeSeries") -> "TimeSeries": """ @@ -2215,7 +2260,7 @@ def stack(self, other: "TimeSeries") -> "TimeSeries": new_other_xa = other_xa new_xa = xr.concat((self._xa, new_other_xa), dim=DIMS[1]) - + new_xa.attrs[STATIC_COV_TAG] = _concat_static_covs([self, other]) # we call the factory method here to disambiguate column names if needed. return self.__class__.from_xarray(new_xa, fill_missing_dates=False) @@ -2680,6 +2725,7 @@ def with_columns_renamed( self._xa.values, dims=self._xa.dims, coords={self._xa.dims[0]: self.time_index, DIMS[1]: pd.Index(cols)}, + attrs=self._xa.attrs, ) return self.__class__(new_xa) @@ -2721,7 +2767,9 @@ def mean(self, axis: int = 2) -> "TimeSeries": new_coords = self._get_agg_coords("components_mean", axis) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=new_coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=new_coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) def median(self, axis: int = 2) -> "TimeSeries": @@ -2748,7 +2796,9 @@ def median(self, axis: int = 2) -> "TimeSeries": ) new_coords = self._get_agg_coords("components_median", axis) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=new_coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=new_coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) def sum(self, axis: int = 2) -> "TimeSeries": @@ -2774,7 +2824,9 @@ def sum(self, axis: int = 2) -> "TimeSeries": new_coords = self._get_agg_coords("components_sum", axis) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=new_coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=new_coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) def min(self, axis: int = 2) -> "TimeSeries": @@ -2800,7 +2852,9 @@ def min(self, axis: int = 2) -> "TimeSeries": new_data = 
self._xa.values.min(axis=axis, keepdims=True) new_coords = self._get_agg_coords("components_min", axis) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=new_coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=new_coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) def max(self, axis: int = 2) -> "TimeSeries": @@ -2825,7 +2879,9 @@ def max(self, axis: int = 2) -> "TimeSeries": new_data = self._xa.values.max(axis=axis, keepdims=True) new_coords = self._get_agg_coords("components_max", axis) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=new_coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=new_coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) def var(self, ddof: int = 1) -> "TimeSeries": @@ -2848,7 +2904,9 @@ def var(self, ddof: int = 1) -> "TimeSeries": """ self._assert_stochastic() new_data = self._xa.values.var(axis=2, ddof=ddof, keepdims=True) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=self._xa.coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=self._xa.coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) def std(self, ddof: int = 1) -> "TimeSeries": @@ -2871,7 +2929,9 @@ def std(self, ddof: int = 1) -> "TimeSeries": """ self._assert_stochastic() new_data = self._xa.values.std(axis=2, ddof=ddof, keepdims=True) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=self._xa.coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=self._xa.coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) def skew(self, **kwargs) -> "TimeSeries": @@ -2893,7 +2953,9 @@ def skew(self, **kwargs) -> "TimeSeries": """ self._assert_stochastic() new_data = np.expand_dims(skew(self._xa.values, axis=2, **kwargs), axis=2) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=self._xa.coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=self._xa.coords, attrs=self._xa.attrs + ) return 
self.__class__(new_xa) def kurtosis(self, **kwargs) -> "TimeSeries": @@ -2915,7 +2977,9 @@ def kurtosis(self, **kwargs) -> "TimeSeries": """ self._assert_stochastic() new_data = np.expand_dims(kurtosis(self._xa.values, axis=2, **kwargs), axis=2) - new_xa = xr.DataArray(new_data, dims=self._xa.dims, coords=self._xa.coords) + new_xa = xr.DataArray( + new_data, dims=self._xa.dims, coords=self._xa.coords, attrs=self._xa.attrs + ) return self.__class__(new_xa) def quantile(self, quantile: float, **kwargs) -> "TimeSeries": @@ -3117,6 +3181,7 @@ def _restore_xarray_from_frequency(xa: xr.DataArray, freq: str) -> xr.DataArray: data=np.empty(shape=((len(resampled_time_index),) + xa.shape[1:])), dims=xa.dims, coords=coords, + attrs=xa.attrs, ) resampled_xa[:] = np.nan resampled_xa[resampled_time_index.index.isin(time_index)] = sorted_xa.data @@ -3519,6 +3584,46 @@ def _set_freq_in_xa(xa_: xr.DataArray): raise_log(IndexError("The type of your index was not matched."), logger) +def _concat_static_covs(series: List[TimeSeries]) -> Optional[pd.DataFrame]: + """Concatenates static covariates.""" + + if not any([ts.has_static_covariates for ts in series]): + return None + + only_first = series[0].has_static_covariates and not any( + [ts.has_static_covariates for ts in series[1:]] + ) + all_have = all([ts.has_static_covariates for ts in series]) + + raise_if_not( + only_first or all_have, + "Either none, only the first or all TimeSeries must have `static_covariates`.", + logger, + ) + + if only_first: + return series[0].static_covariates + + raise_if_not( + all([len(ts.static_covariates.columns) == ts.n_components for ts in series]) + and all( + [ + ts.static_covariates.index.equals(series[0].static_covariates.index) + for ts in series + ] + ), + "Concatenation of multiple TimeSeries with static covariates requires all `static_covariates` " + "DataFrames to have identical columns (static variable names), and the number of each TimeSeries' " + "components must match the number of 
corresponding static covariate components (the number of rows " + "in `series.static_covariates`).", + logger, + ) + + return pd.concat( + [ts.static_covariates for ts in series if ts.has_static_covariates], axis=1 + ) + + def concatenate( series: Sequence["TimeSeries"], axis: Union[str, int] = 0, From 0aacd5aed944a4d07a86cdabc61d531fef898c49 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sun, 29 May 2022 14:42:41 +0200 Subject: [PATCH 13/26] transpose static covariates --- darts/tests/models/forecasting/test_TFT.py | 6 +- darts/tests/test_datasets.py | 4 +- .../test_timeseries_static_covariates.py | 112 ++++++++++++------ darts/timeseries.py | 85 ++++++++----- darts/utils/data/horizon_based_dataset.py | 2 +- darts/utils/data/inference_dataset.py | 2 +- darts/utils/data/shifted_dataset.py | 2 +- 7 files changed, 141 insertions(+), 72 deletions(-) diff --git a/darts/tests/models/forecasting/test_TFT.py b/darts/tests/models/forecasting/test_TFT.py index 5488717046..441d331699 100644 --- a/darts/tests/models/forecasting/test_TFT.py +++ b/darts/tests/models/forecasting/test_TFT.py @@ -179,13 +179,15 @@ def test_static_covariates_support(self): pl_trainer_kwargs={"fast_dev_run": True}, ) model.fit(target, verbose=False) - assert len(model.model.static_variables) == len(target.static_covariates) + assert len(model.model.static_variables) == len( + target.static_covariates.columns + ) model.predict(n=1, series=target, verbose=False) # raise an error when trained with static covariates of wrong dimensionality target = target.with_static_covariates( - pd.concat([target.static_covariates] * 2, axis=0) + pd.concat([target.static_covariates] * 2, axis=1) ) with pytest.raises(ValueError): model.predict(n=1, series=target, verbose=False) diff --git a/darts/tests/test_datasets.py b/darts/tests/test_datasets.py index 8b9af75fb6..7a7c4b8749 100644 --- a/darts/tests/test_datasets.py +++ b/darts/tests/test_datasets.py @@ -42,8 +42,8 @@ class DatasetTestCase(DartsBaseTestClass): 
target2 = gaussian_timeseries(length=150).with_static_covariates( pd.Series([2, 3], index=["st1", "st2"]) ) - cov_st1 = target1.static_covariates.T.values - cov_st2 = target2.static_covariates.T.values + cov_st1 = target1.static_covariates.values + cov_st2 = target2.static_covariates.values cov_st2_df = pd.Series([2, 3], index=["st1", "st2"]) vals1, vals2 = target1.values(), target2.values() cov1, cov2 = gaussian_timeseries(length=100), gaussian_timeseries(length=150) diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index 28d3c42eb6..8e199a109b 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -7,6 +7,7 @@ from darts import TimeSeries from darts.dataprocessing.transformers import BoxCox, Scaler from darts.tests.base_test_class import DartsBaseTestClass +from darts.timeseries import DEFAULT_GLOBAL_STATIC_COV_NAME from darts.utils.timeseries_generation import _generate_index, linear_timeseries @@ -64,8 +65,11 @@ def test_timeseries_from_longitudinal_df(self): ) assert len(ts_groups1) == self.n_groups for i, ts in enumerate(ts_groups1): + assert ts.static_covariates.index.equals( + pd.Index([DEFAULT_GLOBAL_STATIC_COV_NAME]) + ) assert ts.static_covariates.shape == (1, 1) - assert ts.static_covariates.index.equals(pd.Index(["st1"])) + assert ts.static_covariates.columns.equals(pd.Index(["st1"])) assert (ts.static_covariates.values == [[i]]).all() # multivariate static covs: only group by "st1", keep static covs "st1", "constant" @@ -78,9 +82,9 @@ def test_timeseries_from_longitudinal_df(self): ) assert len(ts_groups2) == self.n_groups for i, ts in enumerate(ts_groups2): - assert ts.static_covariates.shape == (2, 1) - assert ts.static_covariates.index.equals(pd.Index(["st1", "constant"])) - assert (ts.static_covariates.values == [[i], [1]]).all() + assert ts.static_covariates.shape == (1, 2) + assert 
ts.static_covariates.columns.equals(pd.Index(["st1", "constant"])) + assert (ts.static_covariates.values == [[i, 1]]).all() # multivariate static covs: group by "st1" and "st2", keep static covs "st1", "st2", "constant" ts_groups3 = TimeSeries.from_longitudinal_dataframe( @@ -94,11 +98,11 @@ def test_timeseries_from_longitudinal_df(self): for idx, ts in enumerate(ts_groups3): i = idx // 2 j = idx % 2 - assert ts.static_covariates.shape == (3, 1) - assert ts.static_covariates.index.equals( + assert ts.static_covariates.shape == (1, 3) + assert ts.static_covariates.columns.equals( pd.Index(["st1", "st2", "constant"]) ) - assert (ts.static_covariates.values == [[i], [j], [1]]).all() + assert (ts.static_covariates.values == [[i, j, 1]]).all() df = copy.deepcopy(self.df_long_multi) df.loc[:, "non_static"] = np.arange(len(df)) @@ -124,23 +128,34 @@ def test_timeseries_from_longitudinal_df(self): def test_with_static_covariates_univariate(self): ts = linear_timeseries(length=10) - static_covs = pd.Series([0.0, 1.0], index=["st1", "st2"]) + static_covs_series = pd.Series([0.0, 1.0], index=["st1", "st2"]) + static_covs_df = pd.DataFrame([[0.0, 1.0]], columns=["st1", "st2"]) - # inplace from Series for chained calls - ts = ts.with_static_covariates(static_covs) - assert ts.static_covariates.equals(static_covs.to_frame()) + # check immutable + ts.with_static_covariates(static_covs_series) + assert not ts.has_static_covariates # from Series - ts = ts.with_static_covariates(static_covs) - assert ts.static_covariates.equals(static_covs.to_frame()) + ts = ts.with_static_covariates(static_covs_series) + assert ts.has_static_covariates + np.testing.assert_almost_equal( + ts.static_covariates.values, np.expand_dims(static_covs_series.values, -1).T + ) + assert ts.static_covariates.index.equals(ts.components) # from DataFrame - ts = ts.with_static_covariates(static_covs.to_frame()) - assert ts.static_covariates.equals(static_covs.to_frame()) + ts = 
ts.with_static_covariates(static_covs_df) + assert ts.has_static_covariates + np.testing.assert_almost_equal( + ts.static_covariates.values, static_covs_df.values + ) + assert ts.static_covariates.index.equals(ts.components) # with None ts = ts.with_static_covariates(None) - assert ts.static_covariates is None + assert isinstance(ts.static_covariates, pd.DataFrame) + assert ts.static_covariates.empty + assert not ts.has_static_covariates # only pd.Series, pd.DataFrame or None with pytest.raises(ValueError): @@ -148,53 +163,67 @@ def test_with_static_covariates_univariate(self): # multivariate does not work with univariate TimeSeries with pytest.raises(ValueError): - static_covs_multi = pd.concat([static_covs] * 2, axis=1) + static_covs_multi = pd.concat([static_covs_series] * 2, axis=1).T _ = ts.with_static_covariates(static_covs_multi) def test_with_static_covariates_multivariate(self): ts = linear_timeseries(length=10) ts_multi = ts.stack(ts) - static_covs = pd.DataFrame([[0.0, 1.0], [0.0, 1.0]], index=["st1", "st2"]) + static_covs = pd.DataFrame([[0.0, 1.0], [0.0, 1.0]], columns=["st1", "st2"]) # from univariate static covariates - ts_multi = ts_multi.with_static_covariates(static_covs[static_covs.columns[0]]) - assert ts_multi.static_covariates.equals( - static_covs[static_covs.columns[0]].to_frame() + ts_multi = ts_multi.with_static_covariates(static_covs.loc[0]) + assert ts_multi.static_covariates.index.equals( + pd.Index([DEFAULT_GLOBAL_STATIC_COV_NAME]) + ) + assert ts_multi.static_covariates.columns.equals(static_covs.columns) + np.testing.assert_almost_equal( + ts_multi.static_covariates.values, static_covs.loc[0:0].values ) # from multivariate static covariates ts_multi = ts_multi.with_static_covariates(static_covs) - assert ts_multi.static_covariates.equals(static_covs) + assert ts_multi.static_covariates.index.equals(ts_multi.components) + assert ts_multi.static_covariates.columns.equals(static_covs.columns) + np.testing.assert_almost_equal( + 
ts_multi.static_covariates.values, static_covs.values + ) # raise an error if multivariate static covariates columns don't match the number of components in the series with pytest.raises(ValueError): - _ = ts_multi.with_static_covariates(pd.concat([static_covs] * 2, axis=1)) + _ = ts_multi.with_static_covariates(pd.concat([static_covs] * 2, axis=0)) def test_stack(self): ts_uni = linear_timeseries(length=10) ts_multi = ts_uni.stack(ts_uni) - static_covs_uni1 = pd.Series([0, 1], index=["st1", "st2"]).astype(int) - static_covs_uni2 = pd.Series([3, 4], index=["st3", "st4"]).astype(int) - static_covs_uni3 = pd.Series([2, 3, 4], index=["st1", "st2", "st3"]).astype(int) + static_covs_uni1 = pd.DataFrame([[0, 1]], columns=["st1", "st2"]).astype(int) + static_covs_uni2 = pd.DataFrame([[3, 4]], columns=["st3", "st4"]).astype(int) + static_covs_uni3 = pd.DataFrame( + [[2, 3, 4]], columns=["st1", "st2", "st3"] + ).astype(int) - static_covs_multi = pd.DataFrame([[0, 0], [1, 1]], index=["st1", "st2"]).astype( - int - ) + static_covs_multi = pd.DataFrame( + [[0, 0], [1, 1]], columns=["st1", "st2"] + ).astype(int) ts_uni = ts_uni.with_static_covariates(static_covs_uni1) ts_multi = ts_multi.with_static_covariates(static_covs_multi) # valid static covariates for concatenation/stack ts_stacked1 = ts_uni.stack(ts_uni) - assert ts_stacked1.static_covariates.equals( - pd.concat([ts_uni.static_covariates] * 2, axis=1) + assert ts_stacked1.static_covariates.index.equals(ts_stacked1.components) + np.testing.assert_almost_equal( + ts_stacked1.static_covariates.values, + pd.concat([ts_uni.static_covariates] * 2, axis=0).values, ) # valid static covariates for concatenation/stack: first only has static covs # -> this gives multivar ts with univar static covs ts_stacked2 = ts_uni.stack(ts_uni.with_static_covariates(None)) - assert ts_stacked2.static_covariates.equals(ts_uni.static_covariates) + np.testing.assert_almost_equal( + ts_stacked2.static_covariates.values, 
ts_uni.static_covariates.values + ) # mismatch between column names with pytest.raises(ValueError): @@ -206,8 +235,11 @@ def test_stack(self): # valid univar ts with univar static covariates + multivar ts with multivar static covariates ts_stacked3 = ts_uni.stack(ts_multi) - assert ts_stacked3.static_covariates.equals( - pd.concat([ts_uni.static_covariates, ts_multi.static_covariates], axis=1) + np.testing.assert_almost_equal( + ts_stacked3.static_covariates.values, + pd.concat( + [ts_uni.static_covariates, ts_multi.static_covariates], axis=0 + ).values, ) # invalid univar ts with univar static covariates + multivar ts with univar static covariates @@ -224,9 +256,10 @@ def test_ts_methods_with_static_covariates(self): assert ts.static_covariates.dtypes[0] == "float32" ts_stochastic = ts.from_times_and_values( - times=ts.time_index, values=np.random.randn(10, 1, 3) + times=ts.time_index, + values=np.random.randn(10, 1, 3), + static_covariates=static_covs, ) - ts_stochastic = ts_stochastic.with_static_covariates(static_covs) ts_check = ts.copy() assert ts_check.static_covariates.equals(ts.static_covariates) @@ -237,8 +270,13 @@ def test_ts_methods_with_static_covariates(self): ts_check = ts.tail() assert ts_check.static_covariates.equals(ts.static_covariates) + # same values but different component names ("0" vs. 
"0_quantiles") ts_check = ts_stochastic.quantile_timeseries() - assert ts_check.static_covariates.equals(ts_stochastic.static_covariates) + assert not ts_check.components.equals(ts_stochastic.components) + assert ts_stochastic.static_covariates.index.equals(ts_stochastic.components) + np.testing.assert_almost_equal( + ts_check.static_covariates.values, ts_stochastic.static_covariates.values + ) def test_scalers_with_static_covariates(self): ts = linear_timeseries(start_value=1.0, end_value=2.0, length=10) diff --git a/darts/timeseries.py b/darts/timeseries.py index a7fca6492d..cd714d4b13 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -47,6 +47,7 @@ VALID_INDEX_TYPES = (pd.DatetimeIndex, pd.RangeIndex) STATIC_COV_TAG = "static_covariates" +DEFAULT_GLOBAL_STATIC_COV_NAME = "global_components" class TimeSeries: @@ -208,23 +209,28 @@ def __init__(self, xa: xr.DataArray): ) # check if valid static covariates for multivariate TimeSeries if isinstance(static_covariates, pd.DataFrame): - n_components = len(static_covariates.columns) + n_components = len(static_covariates) raise_if( n_components > 1 and n_components != self.n_components, - "When passing a multi-column pandas DataFrame, the number of columns must match the number of " - "components of the TimeSeries object (multivariate static covariates must map to each TimeSeries " - "component).", + "When passing a multi-row pandas DataFrame, the number of rows must match the number of " + "components of the TimeSeries object (multi-component/multi-row static covariates must map to each " + "TimeSeries component).", logger, ) + static_covariates = static_covariates.copy() elif isinstance(static_covariates, pd.Series): - static_covariates = static_covariates.to_frame() + static_covariates = static_covariates.to_frame().T else: # None - pass - self._xa.attrs[STATIC_COV_TAG] = ( - static_covariates.astype(self.dtype) - if static_covariates is not None - else static_covariates - ) + static_covariates = 
pd.DataFrame() + + if not static_covariates.empty: + static_covariates.index = ( + self.components + if len(static_covariates) == self.n_components + else [DEFAULT_GLOBAL_STATIC_COV_NAME] + ) + + self._xa.attrs[STATIC_COV_TAG] = static_covariates.astype(self.dtype) """ Factory Methods @@ -378,8 +384,13 @@ def from_csv( fillna_value Optionally, a numeric value to fill missing values (NaNs) with. static_covariates - Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column - pandas DataFrame with index representing the uni/multivariate static variables. + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas + DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied' + to all components of the TimeSeries. If a DataFrame, the columns represent the static variables and the + rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates + are globally 'applied' to all components of the TimeSeries. If a multi-row DataFrame, the number of + rows must match the number of components of the TimeSeries (in this case, the number of columns in the CSV + file). This adds control for component-specific static covariates. **kwargs Optional arguments to be passed to `pandas.read_csv` function @@ -439,8 +450,13 @@ def from_dataframe( fillna_value Optionally, a numeric value to fill missing values (NaNs) with. static_covariates - Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column - pandas DataFrame with index representing the uni/multivariate static variables. + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas + DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied' + to all components of the TimeSeries. 
If a DataFrame, the columns represent the static variables and the + rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates + are globally 'applied' to all components of the TimeSeries. If a multi-row DataFrame, the number of + rows must match the number of components of the TimeSeries (in this case, the number of columns in + ``value_cols``). This adds control for component-specific static covariates. Returns ------- @@ -553,6 +569,8 @@ def from_longitudinal_dataframe( One column (or the DataFrame index) has to represent the time, a list of columns `group_cols` must be used for extracting the individual TimeSeries by groups, and a list of columns `value_cols` has to represent the values for the individual time series. + Add additional static covariates from columns `static_cols` to the individual TimeSeries without grouping by + these columns. Parameters ---------- @@ -631,7 +649,7 @@ def from_longitudinal_dataframe( # store static covariate Series and group DataFrame (without static cov columns) splits.append( ( - pd.DataFrame(static_cov_vals, index=static_cov_cols), + pd.DataFrame([static_cov_vals], columns=static_cov_cols), group.drop(columns=static_cov_cols), ) ) @@ -681,8 +699,9 @@ def from_series( fillna_value Optionally, a numeric value to fill missing values (NaNs) with. static_covariates - Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column - pandas DataFrame with index representing the uni/multivariate static variables. + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a + single-row pandas DataFrame. If a Series, the index represents the static variables. If a DataFrame, the + columns represent the static variables and the single row represents the univariate TimeSeries component. 
Returns ------- @@ -738,8 +757,13 @@ def from_times_and_values( fillna_value Optionally, a numeric value to fill missing values (NaNs) with. static_covariates - Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column - pandas DataFrame with index representing the uni/multivariate static variables. + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas + DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied' + to all components of the TimeSeries. If a DataFrame, the columns represent the static variables and the + rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates + are globally 'applied' to all components of the TimeSeries. If a multi-row DataFrame, the number of + rows must match the number of components of the TimeSeries (in this case, the number of columns in + ``values``). This adds control for component-specific static covariates. Returns ------- @@ -803,8 +827,13 @@ def from_values( fillna_value Optionally, a numeric value to fill missing values (NaNs) with. static_covariates - Optionally, a set of static covariates to add to the TimeSeries. Either a pandas Series or a single-column - pandas DataFrame with index representing the uni/multivariate static variables. + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas + DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied' + to all components of the TimeSeries. If a DataFrame, the columns represent the static variables and the + rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates + are globally 'applied' to all components of the TimeSeries. 
If a multi-row DataFrame, the number of + rows must match the number of components of the TimeSeries (in this case, the number of columns in + ``values``). This adds control for component-specific static covariates. Returns ------- @@ -879,8 +908,8 @@ def from_pickle(cls, path: str) -> "TimeSeries": """ @property - def static_covariates(self): - return self._xa.attrs.get(STATIC_COV_TAG, None) + def static_covariates(self) -> pd.DataFrame: + return self._xa.attrs.get(STATIC_COV_TAG, pd.DataFrame()) @property def n_samples(self): @@ -970,7 +999,7 @@ def has_range_index(self) -> bool: @property def has_static_covariates(self) -> bool: """Whether this series contains static covariates.""" - return self.static_covariates is not None + return not self.static_covariates.empty @property def duration(self) -> Union[pd.Timedelta, int]: @@ -3605,10 +3634,10 @@ def _concat_static_covs(series: List[TimeSeries]) -> Optional[pd.DataFrame]: return series[0].static_covariates raise_if_not( - all([len(ts.static_covariates.columns) == ts.n_components for ts in series]) + all([len(ts.static_covariates) == ts.n_components for ts in series]) and all( [ - ts.static_covariates.index.equals(series[0].static_covariates.index) + ts.static_covariates.columns.equals(series[0].static_covariates.columns) for ts in series ] ), @@ -3620,7 +3649,7 @@ def _concat_static_covs(series: List[TimeSeries]) -> Optional[pd.DataFrame]: ) return pd.concat( - [ts.static_covariates for ts in series if ts.has_static_covariates], axis=1 + [ts.static_covariates for ts in series if ts.has_static_covariates], axis=0 ) diff --git a/darts/utils/data/horizon_based_dataset.py b/darts/utils/data/horizon_based_dataset.py index 10d836a736..8a8c841bf3 100644 --- a/darts/utils/data/horizon_based_dataset.py +++ b/darts/utils/data/horizon_based_dataset.py @@ -183,7 +183,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( - target_series.static_covariates.T.values + 
target_series.static_covariates.values if target_series.has_static_covariates else None ) diff --git a/darts/utils/data/inference_dataset.py b/darts/utils/data/inference_dataset.py index 92426aa789..7226acf81b 100644 --- a/darts/utils/data/inference_dataset.py +++ b/darts/utils/data/inference_dataset.py @@ -218,7 +218,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( - target_series.static_covariates.T.values + target_series.static_covariates.values if target_series.has_static_covariates else None ) diff --git a/darts/utils/data/shifted_dataset.py b/darts/utils/data/shifted_dataset.py index aa29f3f6a9..cba6724753 100644 --- a/darts/utils/data/shifted_dataset.py +++ b/darts/utils/data/shifted_dataset.py @@ -633,7 +633,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( - target_series.static_covariates.T.values + target_series.static_covariates.values if target_series.has_static_covariates else None ) From 2845f861d89662a9c0a4db6c09e5a6ccf325d6b9 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sun, 29 May 2022 15:27:46 +0200 Subject: [PATCH 14/26] added method `static_covariates_values()` --- .../test_timeseries_static_covariates.py | 40 ++++++++++++++----- darts/timeseries.py | 29 ++++++++++++-- darts/utils/data/horizon_based_dataset.py | 2 +- darts/utils/data/inference_dataset.py | 2 +- darts/utils/data/shifted_dataset.py | 2 +- 5 files changed, 57 insertions(+), 18 deletions(-) diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index 8e199a109b..c8eaeaf86a 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -70,7 +70,7 @@ def test_timeseries_from_longitudinal_df(self): ) assert ts.static_covariates.shape == (1, 1) assert ts.static_covariates.columns.equals(pd.Index(["st1"])) - assert (ts.static_covariates.values == 
[[i]]).all() + assert (ts.static_covariate_values(copy=False) == [[i]]).all() # multivariate static covs: only group by "st1", keep static covs "st1", "constant" ts_groups2 = TimeSeries.from_longitudinal_dataframe( @@ -84,7 +84,7 @@ def test_timeseries_from_longitudinal_df(self): for i, ts in enumerate(ts_groups2): assert ts.static_covariates.shape == (1, 2) assert ts.static_covariates.columns.equals(pd.Index(["st1", "constant"])) - assert (ts.static_covariates.values == [[i, 1]]).all() + assert (ts.static_covariate_values(copy=False) == [[i, 1]]).all() # multivariate static covs: group by "st1" and "st2", keep static covs "st1", "st2", "constant" ts_groups3 = TimeSeries.from_longitudinal_dataframe( @@ -102,7 +102,7 @@ def test_timeseries_from_longitudinal_df(self): assert ts.static_covariates.columns.equals( pd.Index(["st1", "st2", "constant"]) ) - assert (ts.static_covariates.values == [[i, j, 1]]).all() + assert (ts.static_covariate_values(copy=False) == [[i, j, 1]]).all() df = copy.deepcopy(self.df_long_multi) df.loc[:, "non_static"] = np.arange(len(df)) @@ -139,7 +139,8 @@ def test_with_static_covariates_univariate(self): ts = ts.with_static_covariates(static_covs_series) assert ts.has_static_covariates np.testing.assert_almost_equal( - ts.static_covariates.values, np.expand_dims(static_covs_series.values, -1).T + ts.static_covariate_values(copy=False), + np.expand_dims(static_covs_series.values, -1).T, ) assert ts.static_covariates.index.equals(ts.components) @@ -147,7 +148,7 @@ def test_with_static_covariates_univariate(self): ts = ts.with_static_covariates(static_covs_df) assert ts.has_static_covariates np.testing.assert_almost_equal( - ts.static_covariates.values, static_covs_df.values + ts.static_covariate_values(copy=False), static_covs_df.values ) assert ts.static_covariates.index.equals(ts.components) @@ -166,6 +167,21 @@ def test_with_static_covariates_univariate(self): static_covs_multi = pd.concat([static_covs_series] * 2, axis=1).T _ = 
ts.with_static_covariates(static_covs_multi) + def test_static_covariate_values(self): + ts = linear_timeseries(length=10) + static_covs = pd.DataFrame([[0.0, 1.0]], columns=["st1", "st2"]) + ts = ts.with_static_covariates(static_covs) + + # changing values of copy should not change original DataFrame + vals = ts.static_covariate_values(copy=True) + vals[:] = -1.0 + assert (ts.static_covariate_values(copy=False) != -1.0).all() + + # changing values of view should change original DataFrame + vals = ts.static_covariate_values(copy=False) + vals[:] = -1.0 + assert (ts.static_covariate_values(copy=False) == -1.0).all() + def test_with_static_covariates_multivariate(self): ts = linear_timeseries(length=10) ts_multi = ts.stack(ts) @@ -178,7 +194,7 @@ def test_with_static_covariates_multivariate(self): ) assert ts_multi.static_covariates.columns.equals(static_covs.columns) np.testing.assert_almost_equal( - ts_multi.static_covariates.values, static_covs.loc[0:0].values + ts_multi.static_covariate_values(copy=False), static_covs.loc[0:0].values ) # from multivariate static covariates @@ -186,7 +202,7 @@ def test_with_static_covariates_multivariate(self): assert ts_multi.static_covariates.index.equals(ts_multi.components) assert ts_multi.static_covariates.columns.equals(static_covs.columns) np.testing.assert_almost_equal( - ts_multi.static_covariates.values, static_covs.values + ts_multi.static_covariate_values(copy=False), static_covs.values ) # raise an error if multivariate static covariates columns don't match the number of components in the series @@ -214,7 +230,7 @@ def test_stack(self): ts_stacked1 = ts_uni.stack(ts_uni) assert ts_stacked1.static_covariates.index.equals(ts_stacked1.components) np.testing.assert_almost_equal( - ts_stacked1.static_covariates.values, + ts_stacked1.static_covariate_values(copy=False), pd.concat([ts_uni.static_covariates] * 2, axis=0).values, ) @@ -222,7 +238,8 @@ def test_stack(self): # -> this gives multivar ts with univar static covs 
ts_stacked2 = ts_uni.stack(ts_uni.with_static_covariates(None)) np.testing.assert_almost_equal( - ts_stacked2.static_covariates.values, ts_uni.static_covariates.values + ts_stacked2.static_covariate_values(copy=False), + ts_uni.static_covariate_values(copy=False), ) # mismatch between column names @@ -236,7 +253,7 @@ def test_stack(self): # valid univar ts with univar static covariates + multivar ts with multivar static covariates ts_stacked3 = ts_uni.stack(ts_multi) np.testing.assert_almost_equal( - ts_stacked3.static_covariates.values, + ts_stacked3.static_covariate_values(copy=False), pd.concat( [ts_uni.static_covariates, ts_multi.static_covariates], axis=0 ).values, @@ -275,7 +292,8 @@ def test_ts_methods_with_static_covariates(self): assert not ts_check.components.equals(ts_stochastic.components) assert ts_stochastic.static_covariates.index.equals(ts_stochastic.components) np.testing.assert_almost_equal( - ts_check.static_covariates.values, ts_stochastic.static_covariates.values + ts_check.static_covariate_values(copy=False), + ts_stochastic.static_covariate_values(copy=False), ) def test_scalers_with_static_covariates(self): diff --git a/darts/timeseries.py b/darts/timeseries.py index cd714d4b13..1c65b08833 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -1360,7 +1360,7 @@ def last_values(self) -> np.ndarray: self._assert_deterministic() return self._xa.values[-1, :, 0].copy() - def values(self, copy=True, sample=0) -> np.ndarray: + def values(self, copy: bool = True, sample: int = 0) -> np.ndarray: """ Return a 2-D array of shape (time, component), containing this series' values for one `sample`. 
@@ -1388,7 +1388,7 @@ def values(self, copy=True, sample=0) -> np.ndarray: else: return self._xa.values[:, :, sample] - def random_component_values(self, copy=True) -> np.array: + def random_component_values(self, copy: bool = True) -> np.array: """ Return a 2-D array of shape (time, component), containing the values for one sample taken uniformly at random among this series' samples. @@ -1410,7 +1410,7 @@ def random_component_values(self, copy=True) -> np.array: else: return self._xa.values[:, :, sample] - def all_values(self, copy=True) -> np.ndarray: + def all_values(self, copy: bool = True) -> np.ndarray: """ Return a 3-D array of dimension (time, component, sample), containing this series' values for all samples. @@ -1431,7 +1431,7 @@ def all_values(self, copy=True) -> np.ndarray: else: return self._xa.values - def univariate_values(self, copy=True, sample=0) -> np.ndarray: + def univariate_values(self, copy: bool = True, sample: int = 0) -> np.ndarray: """ Return a 1-D Numpy array of shape (time,), containing this univariate series' values for one `sample`. @@ -1440,6 +1440,8 @@ def univariate_values(self, copy=True, sample=0) -> np.ndarray: ---------- copy Whether to return a copy of the values. Leave it to True unless you know what you are doing. + sample + For stochastic series, the sample for which to return values. Default: 0 (first sample). Returns ------- @@ -1453,6 +1455,25 @@ def univariate_values(self, copy=True, sample=0) -> np.ndarray: else: return self._xa[:, 0, sample].values + def static_covariate_values(self, copy: bool = True) -> np.ndarray: + """ + Return a 2-D array of dimension (component, static variable), + containing the static covariate values of the TimeSeries. + + Parameters + ---------- + copy + Whether to return a copy of the values, otherwise returns a view. + Can only return a view if all values have the same dtype. + Leave it to True unless you know what you are doing. 
+ + Returns + ------- + numpy.ndarray + The values composing the time series. + """ + return self.static_covariates.to_numpy(copy=copy) + def head( self, size: Optional[int] = 5, axis: Optional[Union[int, str]] = 0 ) -> "TimeSeries": diff --git a/darts/utils/data/horizon_based_dataset.py b/darts/utils/data/horizon_based_dataset.py index 8a8c841bf3..b0baafec29 100644 --- a/darts/utils/data/horizon_based_dataset.py +++ b/darts/utils/data/horizon_based_dataset.py @@ -183,7 +183,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( - target_series.static_covariates.values + target_series.static_covariate_values(copy=False) if target_series.has_static_covariates else None ) diff --git a/darts/utils/data/inference_dataset.py b/darts/utils/data/inference_dataset.py index 7226acf81b..6f11125e4c 100644 --- a/darts/utils/data/inference_dataset.py +++ b/darts/utils/data/inference_dataset.py @@ -218,7 +218,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( - target_series.static_covariates.values + target_series.static_covariate_values(copy=False) if target_series.has_static_covariates else None ) diff --git a/darts/utils/data/shifted_dataset.py b/darts/utils/data/shifted_dataset.py index cba6724753..7564d718c5 100644 --- a/darts/utils/data/shifted_dataset.py +++ b/darts/utils/data/shifted_dataset.py @@ -633,7 +633,7 @@ def __getitem__( # TODO: we need think about the dimensionality of static covariates static_covariate = ( - target_series.static_covariates.values + target_series.static_covariate_values(copy=False) if target_series.has_static_covariates else None ) From 2ac58e4ab480ccbb4fcd0f7524b346ea1ff408fb Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sun, 29 May 2022 17:03:49 +0200 Subject: [PATCH 15/26] updated docs --- .../test_timeseries_static_covariates.py | 10 +-- darts/timeseries.py | 67 ++++++++++++++++++- 2 files changed, 69 insertions(+), 8 
deletions(-) diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index c8eaeaf86a..2d0b141bf2 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -56,7 +56,7 @@ def setUpClass(cls): def test_timeseries_from_longitudinal_df(self): # univariate static covs: only group by "st1", keep static covs "st1" value_cols = ["a", "b", "c"] - ts_groups1 = TimeSeries.from_longitudinal_dataframe( + ts_groups1 = TimeSeries.from_group_dataframe( df=self.df_long_uni, group_cols="st1", static_cols=None, @@ -73,7 +73,7 @@ def test_timeseries_from_longitudinal_df(self): assert (ts.static_covariate_values(copy=False) == [[i]]).all() # multivariate static covs: only group by "st1", keep static covs "st1", "constant" - ts_groups2 = TimeSeries.from_longitudinal_dataframe( + ts_groups2 = TimeSeries.from_group_dataframe( df=self.df_long_multi, group_cols=["st1"], static_cols="constant", @@ -87,7 +87,7 @@ def test_timeseries_from_longitudinal_df(self): assert (ts.static_covariate_values(copy=False) == [[i, 1]]).all() # multivariate static covs: group by "st1" and "st2", keep static covs "st1", "st2", "constant" - ts_groups3 = TimeSeries.from_longitudinal_dataframe( + ts_groups3 = TimeSeries.from_group_dataframe( df=self.df_long_multi, group_cols=["st1", "st2"], static_cols=["constant"], @@ -108,7 +108,7 @@ def test_timeseries_from_longitudinal_df(self): df.loc[:, "non_static"] = np.arange(len(df)) # non static columns as static columns should raise an error with pytest.raises(ValueError): - _ = TimeSeries.from_longitudinal_dataframe( + _ = TimeSeries.from_group_dataframe( df=df, group_cols=["st1"], static_cols=["non_static"], @@ -118,7 +118,7 @@ def test_timeseries_from_longitudinal_df(self): # groups that are too short for TimeSeries requirements should raise an error with pytest.raises(ValueError): - _ = TimeSeries.from_longitudinal_dataframe( + _ = 
TimeSeries.from_group_dataframe( df=df, group_cols=["st1", "non_static"], static_cols=None, diff --git a/darts/timeseries.py b/darts/timeseries.py index 1c65b08833..26caa1ca6f 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -23,6 +23,11 @@ - Have distinct components/columns names - Have a well defined frequency (for ``DateTimeIndex``) - Be non-empty. + +``TimeSeries`` can contain global or component-specific static covariate data. Static covariates in `darts` refers +to external time-invariant data that can be used by some forecasting models to help improve predictions. +Read our `user guide on covariates `__ and the +``TimeSeries`` documentation for more information on covariates. """ import pickle @@ -59,6 +64,7 @@ def __init__(self, xa: xr.DataArray): See Also -------- TimeSeries.from_dataframe : Create from a :class:`pandas.DataFrame`. + TimeSeries.from_group_dataframe : Create multiple TimeSeries by groups from a :class:`pandas.DataFrame`. TimeSeries.from_series : Create from a :class:`pandas.Series`. TimeSeries.from_values : Create from a NumPy :class:`ndarray`. TimeSeries.from_times_and_values : Create from a time index and a Numpy :class:`ndarray`. @@ -553,7 +559,7 @@ def from_dataframe( ) @classmethod - def from_longitudinal_dataframe( + def from_group_dataframe( cls, df: pd.DataFrame, group_cols: Union[List[str], str], @@ -569,8 +575,9 @@ def from_longitudinal_dataframe( One column (or the DataFrame index) has to represent the time, a list of columns `group_cols` must be used for extracting the individual TimeSeries by groups, and a list of columns `value_cols` has to represent the values for the individual time series. - Add additional static covariates from columns `static_cols` to the individual TimeSeries without grouping by - these columns. + Values from columns ``group_cols`` and ``static_cols`` are added as static covariates to the resulting + TimeSeries objects. These can be viewed with `my_series.static_covariates`. 
Different to `group_cols`, + `static_cols` only adds the static values without using them to extract the TimeSeries groups. Parameters ---------- @@ -909,6 +916,24 @@ def from_pickle(cls, path: str) -> "TimeSeries": @property def static_covariates(self) -> pd.DataFrame: + """ + Returns the static covariates contained in the series as a pandas DataFrame. + The columns represent the static variables and the rows represent the components of the uni/multivariate + series. If a single-row DataFrame, the covariates are globally 'applied' to all components of the + TimeSeries. If a multi-row DataFrame, the static covariates are component-specific, with the number of rows + matching the number of components of the series. Use the methods below to add static covariates to your TimeSeries + objects. + + See Also + -------- + TimeSeries.with_static_covariates : Return a copy of a series with added static covariates + TimeSeries.from_dataframe : Create from a :class:`pandas.DataFrame`. + TimeSeries.from_group_dataframe : Create multiple TimeSeries by groups from a :class:`pandas.DataFrame`. + TimeSeries.from_series : Create from a :class:`pandas.Series`. + TimeSeries.from_values : Create from a NumPy :class:`ndarray`. + TimeSeries.from_times_and_values : Create from a time index and a Numpy :class:`ndarray`. + TimeSeries.from_csv : Create from a CSV file. + """ return self._xa.attrs.get(STATIC_COV_TAG, pd.DataFrame()) @property @@ -2258,6 +2283,42 @@ def with_values(self, values: np.ndarray) -> "TimeSeries": def with_static_covariates( self, covariates: Optional[Union[pd.Series, pd.DataFrame]] ): + """Returns a new TimeSeries object with added static covariates. + + Parameters + ---------- + covariates + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series, a pandas + DataFrame, or `None`. If `None`, will set the static covariates to `None`. If a Series, the index + represents the static variables. 
The covariates are then globally 'applied' to all components of the + TimeSeries. If a DataFrame, the columns represent the static variables and the rows represent the + components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates are globally + 'applied' to all components of the TimeSeries. If a multi-row DataFrame, the number of rows must match the + number of components of the TimeSeries. This adds component-specific static covariates. + + Examples + -------- + >>> import pandas as pd + >>> from darts.utils.timeseries_generation import linear_timeseries + >>> # add global static covariates + >>> static_covs = pd.Series([0., 1.], index=["static_cov_1", "static_cov_2"]) + >>> series = linear_timeseries(length=3) + >>> series_new1 = series.with_static_covariates(static_covs) + >>> series_new1.static_covariates + static_cov_1 static_cov_2 + component + linear 0.0 1.0 + + >>> # add component specific static covariates + >>> static_covs_multi = pd.DataFrame([[0., 1.], [2., 3.]], columns=["static_cov_1", "static_cov_2"]) + >>> series_multi = series.stack(series) + >>> series_new2 = series_multi.with_static_covariates(static_covs_multi) + >>> series_new2.static_covariates + static_cov_1 static_cov_2 + component + linear 0.0 1.0 + linear_1 2.0 3.0 + """ return self.__class__( xr.DataArray( self._xa.values, From a6fa4fbd6a9249020d0a7dbf8682408092c7d0cd Mon Sep 17 00:00:00 2001 From: dennisbader Date: Mon, 30 May 2022 18:48:04 +0200 Subject: [PATCH 16/26] static covariate support for concatenation --- darts/dataprocessing/transformers/scaler.py | 8 ++ .../test_timeseries_static_covariates.py | 123 +++++++++++++++++- darts/timeseries.py | 41 ++++-- darts/utils/data/horizon_based_dataset.py | 8 +- darts/utils/data/inference_dataset.py | 7 +- darts/utils/data/shifted_dataset.py | 7 +- 6 files changed, 160 insertions(+), 34 deletions(-) diff --git a/darts/dataprocessing/transformers/scaler.py b/darts/dataprocessing/transformers/scaler.py index 
ae2da28d29..9d3f4ae8ff 100644 --- a/darts/dataprocessing/transformers/scaler.py +++ b/darts/dataprocessing/transformers/scaler.py @@ -31,6 +31,14 @@ def __init__( The transformation is applied independently for each dimension (component) of the time series, effectively merging all samples of a component in order to compute the transform. + Notes + ----- + The scaler will not scale the series' static covariates. This has to be done either before constructing the + series, or later on by extracting the covariates, transforming the values and then reapplying them to the + series. For this, see Timeseries properties/methods + `TimeSeries.static_covariates `_ and + `TimeSeries.static_covariates() `_ + Parameters ---------- scaler diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index 2d0b141bf2..e3378d46e4 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from darts import TimeSeries +from darts import TimeSeries, concatenate from darts.dataprocessing.transformers import BoxCox, Scaler from darts.tests.base_test_class import DartsBaseTestClass from darts.timeseries import DEFAULT_GLOBAL_STATIC_COV_NAME @@ -154,8 +154,7 @@ def test_with_static_covariates_univariate(self): # with None ts = ts.with_static_covariates(None) - assert isinstance(ts.static_covariates, pd.DataFrame) - assert ts.static_covariates.empty + assert ts.static_covariates is None assert not ts.has_static_covariates # only pd.Series, pd.DataFrame or None @@ -182,6 +181,9 @@ def test_static_covariate_values(self): vals[:] = -1.0 assert (ts.static_covariate_values(copy=False) == -1.0).all() + ts = ts.with_static_covariates(None) + assert ts.static_covariate_values() is None + def test_with_static_covariates_multivariate(self): ts = linear_timeseries(length=10) ts_multi = ts.stack(ts) @@ -263,6 +265,121 @@ def test_stack(self): 
with pytest.raises(ValueError): _ = ts_uni.stack(ts_multi.with_static_covariates(static_covs_uni1)) + def test_concatenate_dim_component(self): + """ + test concatenation with static covariates along component dimension (axis=1) + Along component dimension, we concatenate/transfer the static covariates of the series only if one of + the cases below applies: + 1) concatenate when for each series the number of static cov components is equal to the number of + components in the series. The static variable names (columns in series.static_covariates) must be + identical across all series + 2) if only the first series contains static covariates transfer only those + 3) if `ignore_static_covariates=True`, case 1) is ignored and only the static covariates of the first + series are transferred + """ + ts_uni = linear_timeseries(length=10) + ts_multi = ts_uni.stack(ts_uni) + + static_covs_uni1 = pd.DataFrame([[0, 1]], columns=["st1", "st2"]).astype(int) + static_covs_uni2 = pd.DataFrame([[3, 4]], columns=["st3", "st4"]).astype(int) + static_covs_uni3 = pd.DataFrame( + [[2, 3, 4]], columns=["st1", "st2", "st3"] + ).astype(int) + + static_covs_multi = pd.DataFrame( + [[0, 0], [1, 1]], columns=["st1", "st2"] + ).astype(int) + + ts_uni_static_uni1 = ts_uni.with_static_covariates(static_covs_uni1) + ts_uni_static_uni2 = ts_uni.with_static_covariates(static_covs_uni2) + ts_uni_static_uni3 = ts_uni.with_static_covariates(static_covs_uni3) + + ts_multi_static_uni1 = ts_multi.with_static_covariates(static_covs_uni1) + ts_multi_static_multi = ts_multi.with_static_covariates(static_covs_multi) + + # concatenation without covariates + ts_concat = concatenate([ts_uni, ts_uni], axis=1) + assert ts_concat.static_covariates is None + + # concatenation along component dimension results in multi component static covariates + ts_concat = concatenate([ts_uni_static_uni1, ts_uni_static_uni1], axis=1) + assert ts_concat.static_covariates.shape == (2, 2) + assert 
ts_concat.components.equals(ts_concat.static_covariates.index) + np.testing.assert_almost_equal( + ts_concat.static_covariate_values(copy=False), + pd.concat([static_covs_uni1] * 2, axis=0).values, + ) + + # concatenation with inconsistent static variable names should fail ... + with pytest.raises(ValueError): + _ = concatenate([ts_uni_static_uni1, ts_uni_static_uni2], axis=1) + + # ... by ignoring the static covariates, it should work and take only the covariates of the first series + ts_concat = concatenate( + [ts_uni_static_uni1, ts_uni_static_uni2], + axis=1, + ignore_static_covariates=True, + ) + assert ts_concat.static_covariates.shape == (1, 2) + assert ts_concat.static_covariates.index.equals( + pd.Index([DEFAULT_GLOBAL_STATIC_COV_NAME]) + ) + np.testing.assert_almost_equal( + ts_concat.static_covariate_values(copy=False), + ts_uni_static_uni1.static_covariate_values(copy=False), + ) + + # concatenation with inconsistent number of static covariates should fail ... + with pytest.raises(ValueError): + _ = concatenate([ts_uni_static_uni1, ts_uni_static_uni3], axis=1) + + # concatenation will only work if for each series the number of static cov components is equal to the + # number of components in the series + with pytest.raises(ValueError): + _ = concatenate([ts_uni_static_uni1, ts_multi_static_uni1], axis=1) + + ts_concat = concatenate([ts_uni_static_uni1, ts_multi_static_multi], axis=1) + assert ts_concat.static_covariates.shape == (ts_concat.n_components, 2) + assert ts_concat.components.equals(ts_concat.static_covariates.index) + np.testing.assert_almost_equal( + ts_concat.static_covariate_values(copy=False), + pd.concat([static_covs_uni1, static_covs_multi], axis=0), + ) + + def test_concatenate_dim_time(self): + """ + Test concatenation with static covariates along time dimension (axis=0) + Along time dimension, we only take the static covariates of the first series (as static covariates are + time-independent). 
+ """ + static_covs_left = pd.DataFrame([[0, 1]], columns=["st1", "st2"]).astype(int) + static_covs_right = pd.DataFrame([[3, 4]], columns=["st3", "st4"]).astype(int) + + ts_left = linear_timeseries(length=10).with_static_covariates(static_covs_left) + ts_right = linear_timeseries( + length=10, start=ts_left.end_time() + ts_left.freq + ).with_static_covariates(static_covs_right) + + ts_concat = concatenate([ts_left, ts_right], axis=0) + assert ts_concat.static_covariates.equals(ts_left.static_covariates) + + def test_concatenate_dim_samples(self): + """ + Test concatenation with static covariates along sample dimension (axis=2) + Along sample dimension, we only take the static covariates of the first series (as we components and + time don't change). + """ + static_covs_left = pd.DataFrame([[0, 1]], columns=["st1", "st2"]).astype(int) + static_covs_right = pd.DataFrame([[3, 4]], columns=["st3", "st4"]).astype(int) + + ts_left = linear_timeseries(length=10).with_static_covariates(static_covs_left) + ts_right = linear_timeseries(length=10).with_static_covariates( + static_covs_right + ) + + ts_concat = concatenate([ts_left, ts_right], axis=2) + assert ts_concat.static_covariates.equals(ts_left.static_covariates) + def test_ts_methods_with_static_covariates(self): ts = linear_timeseries(length=10).astype("float64") static_covs = pd.Series([0, 1], index=["st1", "st2"]).astype(int) diff --git a/darts/timeseries.py b/darts/timeseries.py index 26caa1ca6f..c728a3ebec 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -227,16 +227,17 @@ def __init__(self, xa: xr.DataArray): elif isinstance(static_covariates, pd.Series): static_covariates = static_covariates.to_frame().T else: # None - static_covariates = pd.DataFrame() + pass - if not static_covariates.empty: + if static_covariates is None: + self._xa.attrs[STATIC_COV_TAG] = None + else: static_covariates.index = ( self.components if len(static_covariates) == self.n_components else 
[DEFAULT_GLOBAL_STATIC_COV_NAME] ) - - self._xa.attrs[STATIC_COV_TAG] = static_covariates.astype(self.dtype) + self._xa.attrs[STATIC_COV_TAG] = static_covariates.astype(self.dtype) """ Factory Methods @@ -915,7 +916,7 @@ def from_pickle(cls, path: str) -> "TimeSeries": """ @property - def static_covariates(self) -> pd.DataFrame: + def static_covariates(self) -> Optional[pd.DataFrame]: """ Returns the static covariates contained in the series as a pandas DataFrame. The columns represent the static variables and the rows represent the components of the uni/multivariate @@ -934,7 +935,7 @@ def static_covariates(self) -> pd.DataFrame: TimeSeries.from_times_and_values : Create from a time index and a Numpy :class:`ndarray`. TimeSeries.from_csv : Create from a CSV file. """ - return self._xa.attrs.get(STATIC_COV_TAG, pd.DataFrame()) + return self._xa.attrs.get(STATIC_COV_TAG, None) @property def n_samples(self): @@ -1024,7 +1025,7 @@ def has_range_index(self) -> bool: @property def has_static_covariates(self) -> bool: """Whether this series contains static covariates.""" - return not self.static_covariates.empty + return self.static_covariates is not None @property def duration(self) -> Union[pd.Timedelta, int]: @@ -1480,7 +1481,7 @@ def univariate_values(self, copy: bool = True, sample: int = 0) -> np.ndarray: else: return self._xa[:, 0, sample].values - def static_covariate_values(self, copy: bool = True) -> np.ndarray: + def static_covariate_values(self, copy: bool = True) -> Optional[np.ndarray]: """ Return a 2-D array of dimension (component, static variable), containing the static covariate values of the TimeSeries. @@ -1494,10 +1495,14 @@ def static_covariate_values(self, copy: bool = True) -> np.ndarray: Returns ------- - numpy.ndarray - The values composing the time series. + Optional[numpy.ndarray] + The static covariate values if the series has static covariates, else `None`. 
""" - return self.static_covariates.to_numpy(copy=copy) + return ( + self.static_covariates.to_numpy(copy=copy) + if self.has_static_covariates + else self.static_covariates + ) def head( self, size: Optional[int] = 5, axis: Optional[Union[int, str]] = 0 @@ -3695,7 +3700,7 @@ def _set_freq_in_xa(xa_: xr.DataArray): raise_log(IndexError("The type of your index was not matched."), logger) -def _concat_static_covs(series: List[TimeSeries]) -> Optional[pd.DataFrame]: +def _concat_static_covs(series: Sequence["TimeSeries"]) -> Optional[pd.DataFrame]: """Concatenates static covariates.""" if not any([ts.has_static_covariates for ts in series]): @@ -3739,6 +3744,7 @@ def concatenate( series: Sequence["TimeSeries"], axis: Union[str, int] = 0, ignore_time_axis: bool = False, + ignore_static_covariates: bool = False, ): """Concatenates multiple ``TimeSeries`` along a given axis. @@ -3758,6 +3764,9 @@ def concatenate( provided series). When done along time dimension, concatenation will work even if the time axes are not contiguous (in this case, the resulting series will have a start time matching the start time of the first provided series). Default: False. + ignore_static_covariates : bool + whether to ignore all requirements for static covariate concatenation and only transfer the static covariates + of the first TimeSeries element in `series` to the concatenated TimeSeries. Only effective when `axis=1`. 
Return ------- @@ -3823,6 +3832,7 @@ def concatenate( ) da_concat = da_concat.assign_coords({time_dim_name: tindex}) + da_concat.attrs[STATIC_COV_TAG] = series[0].static_covariates else: time_axes_equal = all( @@ -3870,13 +3880,20 @@ def concatenate( component_coords.append(new_comp_name) existing_components.add(new_comp_name) component_index = pd.Index(component_coords) + static_covariates = ( + _concat_static_covs(series) + if not ignore_static_covariates + else series[0].static_covariates + ) else: component_index = da_sequence[0].get_index(DIMS[1]) + static_covariates = series[0].static_covariates da_concat = xr.DataArray( concat_vals, dims=(time_dim_name,) + DIMS[-2:], coords={time_dim_name: series[0].time_index, DIMS[1]: component_index}, + attrs={STATIC_COV_TAG: static_covariates}, ) return TimeSeries(da_concat) diff --git a/darts/utils/data/horizon_based_dataset.py b/darts/utils/data/horizon_based_dataset.py index b0baafec29..afba689de0 100644 --- a/darts/utils/data/horizon_based_dataset.py +++ b/darts/utils/data/horizon_based_dataset.py @@ -181,11 +181,5 @@ def __getitem__( "input (or output) chunk relative to the target series.", ) - # TODO: we need think about the dimensionality of static covariates - static_covariate = ( - target_series.static_covariate_values(copy=False) - if target_series.has_static_covariates - else None - ) - + static_covariate = target_series.static_covariate_values(copy=False) return past_target, covariate, static_covariate, future_target diff --git a/darts/utils/data/inference_dataset.py b/darts/utils/data/inference_dataset.py index 6f11125e4c..e1d3fd9230 100644 --- a/darts/utils/data/inference_dataset.py +++ b/darts/utils/data/inference_dataset.py @@ -216,12 +216,7 @@ def __getitem__( else None ) - # TODO: we need think about the dimensionality of static covariates - static_covariate = ( - target_series.static_covariate_values(copy=False) - if target_series.has_static_covariates - else None - ) + static_covariate = 
target_series.static_covariate_values(copy=False) return ( past_target, past_covariate, diff --git a/darts/utils/data/shifted_dataset.py b/darts/utils/data/shifted_dataset.py index 7564d718c5..d53b7efc19 100644 --- a/darts/utils/data/shifted_dataset.py +++ b/darts/utils/data/shifted_dataset.py @@ -631,10 +631,5 @@ def __getitem__( f"target series.", ) - # TODO: we need think about the dimensionality of static covariates - static_covariate = ( - target_series.static_covariate_values(copy=False) - if target_series.has_static_covariates - else None - ) + static_covariate = target_series.static_covariate_values(copy=False) return past_target, covariate, static_covariate, future_target From a4ba6171a27602d3ff51b5913346460db3c0b155 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Mon, 30 May 2022 18:54:33 +0200 Subject: [PATCH 17/26] static covariate support for concatenation --- darts/dataprocessing/transformers/boxcox.py | 7 +++++++ darts/dataprocessing/transformers/scaler.py | 5 ++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/darts/dataprocessing/transformers/boxcox.py b/darts/dataprocessing/transformers/boxcox.py index 22067ec6e5..f1402781c9 100644 --- a/darts/dataprocessing/transformers/boxcox.py +++ b/darts/dataprocessing/transformers/boxcox.py @@ -38,6 +38,13 @@ def __init__( For stochastic series, it is done jointly over all samples, effectively merging all samples of a component in order to compute the transform. + Notes + ----- + The scaler will not scale the series' static covariates. This has to be done either before constructing the + series, or later on by extracting the covariates, transforming the values and then reapplying them to the + series. 
For this, see Timeseries properties `TimeSeries.static_covariates` and method + `TimeSeries.with_static_covariates()` + Parameters ---------- name diff --git a/darts/dataprocessing/transformers/scaler.py b/darts/dataprocessing/transformers/scaler.py index 9d3f4ae8ff..8efa93ab9b 100644 --- a/darts/dataprocessing/transformers/scaler.py +++ b/darts/dataprocessing/transformers/scaler.py @@ -35,9 +35,8 @@ def __init__( ----- The scaler will not scale the series' static covariates. This has to be done either before constructing the series, or later on by extracting the covariates, transforming the values and then reapplying them to the - series. For this, see Timeseries properties/methods - `TimeSeries.static_covariates `_ and - `TimeSeries.static_covariates() `_ + series. For this, see Timeseries properties `TimeSeries.static_covariates` and method + `TimeSeries.with_static_covariates()` Parameters ---------- From 0586b7da8069fc12502c9bfd8a030d71730b43ca Mon Sep 17 00:00:00 2001 From: dennisbader Date: Mon, 30 May 2022 19:16:35 +0200 Subject: [PATCH 18/26] static covariates are now passed to the torch models --- darts/models/forecasting/block_rnn_model.py | 3 +- darts/models/forecasting/nbeats.py | 3 +- darts/models/forecasting/nhits.py | 3 +- .../forecasting/pl_forecasting_module.py | 22 +++++++++------ darts/models/forecasting/rnn_model.py | 28 ++++++++++++++----- darts/models/forecasting/tcn_model.py | 3 +- darts/models/forecasting/tft_model.py | 6 ++-- darts/models/forecasting/transformer_model.py | 3 +- darts/tests/models/forecasting/test_TCN.py | 16 ++++++++--- 9 files changed, 60 insertions(+), 27 deletions(-) diff --git a/darts/models/forecasting/block_rnn_model.py b/darts/models/forecasting/block_rnn_model.py index eccfc656e8..f0f907c143 100644 --- a/darts/models/forecasting/block_rnn_model.py +++ b/darts/models/forecasting/block_rnn_model.py @@ -103,7 +103,8 @@ def __init__( last = feature self.fc = nn.Sequential(*feats) - def forward(self, x): + def 
forward(self, x_in: Tuple): + x, _ = x_in # data is of size (batch_size, input_chunk_length, input_size) batch_size = x.size(0) diff --git a/darts/models/forecasting/nbeats.py b/darts/models/forecasting/nbeats.py index 7b462fe7ac..b9a0bf957c 100644 --- a/darts/models/forecasting/nbeats.py +++ b/darts/models/forecasting/nbeats.py @@ -488,7 +488,8 @@ def __init__( self.stacks_list[-1].blocks[-1].backcast_linear_layer.requires_grad_(False) self.stacks_list[-1].blocks[-1].backcast_g.requires_grad_(False) - def forward(self, x): + def forward(self, x_in: Tuple): + x, _ = x_in # if x1, x2,... y1, y2... is one multivariate ts containing x and y, and a1, a2... one covariate ts # we reshape into x1, y1, a1, x2, y2, a2... etc diff --git a/darts/models/forecasting/nhits.py b/darts/models/forecasting/nhits.py index 7e9326d977..79e6771176 100644 --- a/darts/models/forecasting/nhits.py +++ b/darts/models/forecasting/nhits.py @@ -415,7 +415,8 @@ def __init__( # on this params (the last block backcast is not part of the final output of the net). self.stacks_list[-1].blocks[-1].backcast_linear_layer.requires_grad_(False) - def forward(self, x): + def forward(self, x_in: Tuple): + x, _ = x_in # if x1, x2,... y1, y2... is one multivariate ts containing x and y, and a1, a2... one covariate ts # we reshape into x1, y1, a1, x2, y2, a2... 
etc diff --git a/darts/models/forecasting/pl_forecasting_module.py b/darts/models/forecasting/pl_forecasting_module.py index 0a6d937ced..3efb63b8f3 100644 --- a/darts/models/forecasting/pl_forecasting_module.py +++ b/darts/models/forecasting/pl_forecasting_module.py @@ -300,7 +300,7 @@ def _sample_tiling(input_data_tuple, batch_sample_size): def _is_probabilistic(self) -> bool: return self.likelihood is not None - def _produce_predict_output(self, x): + def _produce_predict_output(self, x: Tuple): if self.likelihood: output = self(x) return self.likelihood.sample(output) @@ -351,12 +351,13 @@ def _produce_train_output(self, input_batch: Tuple): input_batch ``(past_target, past_covariates, static_covariates)`` """ - past_target, past_covariate, _ = input_batch + past_target, past_covariates, static_covariates = input_batch # Currently all our PastCovariates models require past target and covariates concatenated inpt = ( - torch.cat([past_target, past_covariate], dim=2) - if past_covariate is not None - else past_target + torch.cat([past_target, past_covariates], dim=2) + if past_covariates is not None + else past_target, + static_covariates, ) return self(inpt) @@ -378,7 +379,12 @@ def _get_batch_prediction( ``self.output_chunk_length`` """ dim_component = 2 - past_target, past_covariates, future_past_covariates, _ = input_batch + ( + past_target, + past_covariates, + future_past_covariates, + static_covariates, + ) = input_batch n_targets = past_target.shape[dim_component] n_past_covs = ( @@ -390,7 +396,7 @@ def _get_batch_prediction( dim=dim_component, ) - out = self._produce_predict_output(input_past)[ + out = self._produce_predict_output((input_past, static_covariates))[ :, self.first_prediction_index :, : ] @@ -439,7 +445,7 @@ def _get_batch_prediction( ] = future_past_covariates[:, left_past:right_past, :] # take only last part of the output sequence where needed - out = self._produce_predict_output(input_past)[ + out = 
self._produce_predict_output((input_past, static_covariates))[ :, self.first_prediction_index :, : ] batch_prediction.append(out) diff --git a/darts/models/forecasting/rnn_model.py b/darts/models/forecasting/rnn_model.py index 33fd9fc66c..047cfa9580 100644 --- a/darts/models/forecasting/rnn_model.py +++ b/darts/models/forecasting/rnn_model.py @@ -86,7 +86,8 @@ def __init__( # The RNN module needs a linear layer V that transforms hidden states into outputs, individually self.V = nn.Linear(hidden_dim, target_size * nr_params) - def forward(self, x, h=None): + def forward(self, x_in: Tuple, h=None): + x, _ = x_in # data is of size (batch_size, input_length, input_size) batch_size = x.shape[0] @@ -103,17 +104,23 @@ def forward(self, x, h=None): return predictions, last_hidden_state def _produce_train_output(self, input_batch: Tuple): - past_target, historic_future_covariates, future_covariates, _ = input_batch + ( + past_target, + historic_future_covariates, + future_covariates, + static_covariates, + ) = input_batch # For the RNN we concatenate the past_target with the future_covariates # (they have the same length because we enforce a Shift dataset for RNNs) model_input = ( torch.cat([past_target, future_covariates], dim=2) if future_covariates is not None - else past_target + else past_target, + static_covariates, ) return self(model_input)[0] - def _produce_predict_output(self, x, last_hidden_state=None): + def _produce_predict_output(self, x: Tuple, last_hidden_state=None): """overwrite parent classes `_produce_predict_output` method""" output, hidden = self(x, last_hidden_state) if self.likelihood: @@ -127,7 +134,12 @@ def _get_batch_prediction( """ This model is recurrent, so we have to write a specific way to obtain the time series forecasts of length n. 
""" - past_target, historic_future_covariates, future_covariates, _ = input_batch + ( + past_target, + historic_future_covariates, + future_covariates, + static_covariates, + ) = input_batch if historic_future_covariates is not None: # RNNs need as inputs (target[t] and covariates[t+1]) so here we shift the covariates @@ -144,7 +156,9 @@ def _get_batch_prediction( cov_future = None batch_prediction = [] - out, last_hidden_state = self._produce_predict_output(input_series) + out, last_hidden_state = self._produce_predict_output( + (input_series, static_covariates) + ) batch_prediction.append(out[:, -1:, :]) prediction_length = 1 @@ -165,7 +179,7 @@ def _get_batch_prediction( # feed new input to model, including the last hidden state from the previous iteration out, last_hidden_state = self._produce_predict_output( - new_input, last_hidden_state + (new_input, static_covariates), last_hidden_state ) # append prediction to batch prediction array, increase counter diff --git a/darts/models/forecasting/tcn_model.py b/darts/models/forecasting/tcn_model.py index da96cafe1f..3c4d66261e 100644 --- a/darts/models/forecasting/tcn_model.py +++ b/darts/models/forecasting/tcn_model.py @@ -230,7 +230,8 @@ def __init__( self.res_blocks_list.append(res_block) self.res_blocks = nn.ModuleList(self.res_blocks_list) - def forward(self, x): + def forward(self, x_in: Tuple): + x, _ = x_in # data is of size (batch_size, input_chunk_length, input_size) batch_size = x.size(0) x = x.transpose(1, 2) diff --git a/darts/models/forecasting/tft_model.py b/darts/models/forecasting/tft_model.py index 3c6a616e86..76ea645e58 100644 --- a/darts/models/forecasting/tft_model.py +++ b/darts/models/forecasting/tft_model.py @@ -332,13 +332,13 @@ def get_attention_mask_future( return mask def forward( - self, x: Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]] + self, x_in: Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]] ) -> torch.Tensor: """TFT model forward pass. 
Parameters ---------- - x + x_in comes as tuple `(x_past, x_future, x_static)` where `x_past` is the input/past chunk and `x_future` is the output/future chunk. Input dimensions are `(n_samples, n_time_steps, n_variables)` @@ -347,7 +347,7 @@ def forward( torch.Tensor the output tensor """ - x_cont_past, x_cont_future, x_static = x + x_cont_past, x_cont_future, x_static = x_in dim_samples, dim_time, dim_variable = 0, 1, 2 batch_size = x_cont_past.shape[dim_samples] diff --git a/darts/models/forecasting/transformer_model.py b/darts/models/forecasting/transformer_model.py index 37f355db0c..77a68bcf2e 100644 --- a/darts/models/forecasting/transformer_model.py +++ b/darts/models/forecasting/transformer_model.py @@ -161,7 +161,8 @@ def _create_transformer_inputs(self, data): return src, tgt - def forward(self, data): + def forward(self, x_in: Tuple): + data, _ = x_in # Here we create 'src' and 'tgt', the inputs for the encoder and decoder # side of the Transformer architecture src, tgt = self._create_transformer_inputs(data) diff --git a/darts/tests/models/forecasting/test_TCN.py b/darts/tests/models/forecasting/test_TCN.py index 8393014c64..b880e07aea 100644 --- a/darts/tests/models/forecasting/test_TCN.py +++ b/darts/tests/models/forecasting/test_TCN.py @@ -106,12 +106,16 @@ def test_coverage(self): input_tensor = torch.zeros( [1, input_chunk_length, 1], dtype=torch.float64 ) - zero_output = model.model.forward(input_tensor)[0, -1, 0] + zero_output = model.model.forward((input_tensor, None))[ + 0, -1, 0 + ] # test for full coverage for i in range(input_chunk_length): input_tensor[0, i, 0] = 1 - curr_output = model.model.forward(input_tensor)[0, -1, 0] + curr_output = model.model.forward((input_tensor, None))[ + 0, -1, 0 + ] self.assertNotEqual(zero_output, curr_output) input_tensor[0, i, 0] = 0 @@ -145,7 +149,9 @@ def test_coverage(self): input_tensor = torch.zeros( [1, input_chunk_length, 1], dtype=torch.float64 ) - zero_output = 
model_2.model.forward(input_tensor)[0, -1, 0] + zero_output = model_2.model.forward((input_tensor, None))[ + 0, -1, 0 + ] # test for incomplete coverage uncovered_input_found = False @@ -153,7 +159,9 @@ def test_coverage(self): continue for i in range(input_chunk_length): input_tensor[0, i, 0] = 1 - curr_output = model_2.model.forward(input_tensor)[0, -1, 0] + curr_output = model_2.model.forward((input_tensor, None))[ + 0, -1, 0 + ] if zero_output == curr_output: uncovered_input_found = True break From c18e806599e02dd673fbcf9b1851bafaaae18f19 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 31 May 2022 19:04:13 +0200 Subject: [PATCH 19/26] non-numerical dtype support for static covariates --- .../test_timeseries_static_covariates.py | 16 +++++++++++- darts/timeseries.py | 26 +++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index e3378d46e4..f3e42452e4 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -268,7 +268,7 @@ def test_stack(self): def test_concatenate_dim_component(self): """ test concatenation with static covariates along component dimension (axis=1) - Along component dimension, we concatenate/transfer the static covariates the series only if one of + Along component dimension, we concatenate/transfer the static covariates of the series only if one of below cases applies: 1) concatenate when for each series the number of static cov components is equal to the number of components in the series. 
The static variable names (columns in series.static_covariates) must be @@ -425,3 +425,17 @@ def test_scalers_with_static_covariates(self): ts_inv = scaler.inverse_transform(ts_scaled) assert ts_inv.static_covariates.equals(ts.static_covariates) + + def test_non_numerical_static_covariates(self): + static_covs = pd.DataFrame([["a", 0], ["b", 1]], columns=["cat", "num"]) + assert static_covs.dtypes["num"] == "int64" + + ts = TimeSeries.from_values( + values=np.random.random((10, 2)) + ).with_static_covariates(static_covs) + assert ts.static_covariates.dtypes["num"] == ts.dtype == "float64" + assert ts.static_covariates.dtypes["cat"] == object + + ts = ts.astype(np.float32) + assert ts.static_covariates.dtypes["num"] == ts.dtype == "float32" + assert ts.static_covariates.dtypes["cat"] == object diff --git a/darts/timeseries.py b/darts/timeseries.py index c728a3ebec..dbb60d77b6 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -237,7 +237,11 @@ def __init__(self, xa: xr.DataArray): if len(static_covariates) == self.n_components else [DEFAULT_GLOBAL_STATIC_COV_NAME] ) - self._xa.attrs[STATIC_COV_TAG] = static_covariates.astype(self.dtype) + # convert numerical columns to same dtype as series + numeric_cols = static_covariates.select_dtypes(include=np.number).columns + self._xa.attrs[STATIC_COV_TAG] = static_covariates.astype( + {col: self.dtype for col in numeric_cols} + ) """ Factory Methods @@ -3701,7 +3705,25 @@ def _set_freq_in_xa(xa_: xr.DataArray): def _concat_static_covs(series: Sequence["TimeSeries"]) -> Optional[pd.DataFrame]: - """Concatenates static covariates.""" + """Concatenates static covariates. Some context for stacking or concatenating two or more TimeSeries with + static covariates: + + Concat along axis=0 (time) + Along time dimension, we only take the static covariates of the first series (as static covariates are + time-independent). 
+ Concat along axis=1 (components) or stacking + Along component dimension, we concatenate/transfer the static covariates of the series only if one of + below cases applies: + 1) concatenate when for each series the number of static covariate components is equal to the number of + components in the series. The static variable names (columns in series.static_covariates) must be + identical across all series + 2) if only the first series contains static covariates transfer only those + 3) if `ignore_static_covariates=True` (with `concatenate()`), case 1) is ignored and only the static + covariates of the first series are transferred + Concat along axis=2 (samples) + Along sample dimension, we only take the static covariates of the first series (as components and + time don't change). + """ if not any([ts.has_static_covariates for ts in series]): return None From a048ecc28f3e94c112a7544882e68f75ab13dea4 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 31 May 2022 19:42:45 +0200 Subject: [PATCH 20/26] added slicing support for static covariates --- .../test_timeseries_static_covariates.py | 61 +++++++++++++++++++ darts/timeseries.py | 26 ++++++-- 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index f3e42452e4..1373e65159 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -439,3 +439,64 @@ def test_non_numerical_static_covariates(self): ts = ts.astype(np.float32) assert ts.static_covariates.dtypes["num"] == ts.dtype == "float32" assert ts.static_covariates.dtypes["cat"] == object + + def test_get_item(self): + # multi component static covariates + static_covs = pd.DataFrame([["a", 0], ["b", 1]], columns=["cat", "num"]) + ts = TimeSeries.from_values( + values=np.random.random((10, 2)), columns=["comp1", "comp2"] + ).with_static_covariates(static_covs) + + assert 
ts.static_covariates.index.equals(ts.components) + + ts0 = ts[0] + assert ts0.static_covariates.index.equals(ts.components) + assert isinstance(ts0.static_covariates, pd.DataFrame) + ts1 = ts["comp1"] + assert ts1.static_covariates.index.equals(pd.Index(["comp1"])) + assert isinstance(ts1.static_covariates, pd.DataFrame) + ts2 = ts["comp2"] + assert ts2.static_covariates.index.equals(pd.Index(["comp2"])) + assert isinstance(ts2.static_covariates, pd.DataFrame) + ts3 = ts["comp1":"comp2"] + assert ts3.static_covariates.index.equals(pd.Index(["comp1", "comp2"])) + assert isinstance(ts3.static_covariates, pd.DataFrame) + ts4 = ts[["comp1", "comp2"]] + assert ts4.static_covariates.index.equals(pd.Index(["comp1", "comp2"])) + assert isinstance(ts4.static_covariates, pd.DataFrame) + + # uni/global component static covariates + static_covs = pd.DataFrame([["a", 0]], columns=["cat", "num"]) + ts = TimeSeries.from_values( + values=np.random.random((10, 3)), columns=["comp1", "comp2", "comp3"] + ).with_static_covariates(static_covs) + + # 1) when static covs have 1 component but series is multivariate -> static covariate component name is set to + # "global_components" + assert ts.static_covariates.index.equals( + pd.Index([DEFAULT_GLOBAL_STATIC_COV_NAME]) + ) + ts0 = ts[0] + assert ts0.static_covariates.index.equals( + pd.Index([DEFAULT_GLOBAL_STATIC_COV_NAME]) + ) + assert isinstance(ts0.static_covariates, pd.DataFrame) + ts1 = ts["comp1":"comp3"] + assert ts1.static_covariates.index.equals( + pd.Index([DEFAULT_GLOBAL_STATIC_COV_NAME]) + ) + assert isinstance(ts1.static_covariates, pd.DataFrame) + ts2 = ts[["comp1", "comp2", "comp3"]] + assert ts2.static_covariates.index.equals( + pd.Index([DEFAULT_GLOBAL_STATIC_COV_NAME]) + ) + assert isinstance(ts2.static_covariates, pd.DataFrame) + + # 2) if number of static cov components match the number of components in the series -> static covariate + # component names are set to be equal to series component names + ts3 = 
ts["comp1"] + assert ts3.static_covariates.index.equals(pd.Index(["comp1"])) + assert isinstance(ts3.static_covariates, pd.DataFrame) + ts4 = ts["comp2"] + assert ts4.static_covariates.index.equals(pd.Index(["comp2"])) + assert isinstance(ts4.static_covariates, pd.DataFrame) diff --git a/darts/timeseries.py b/darts/timeseries.py index dbb60d77b6..817251b522 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -3596,6 +3596,12 @@ def _set_freq_in_xa(xa_: xr.DataArray): else: xa_.get_index(self._time_dim).freq = self._freq + adapt_covs_on_component = ( + True + if self.has_static_covariates and len(self.static_covariates) > 1 + else False + ) + # handle DatetimeIndex and RangeIndex: if isinstance(key, pd.DatetimeIndex): _check_dt() @@ -3619,7 +3625,12 @@ def _set_freq_in_xa(xa_: xr.DataArray): # handle slices: elif isinstance(key, slice): if isinstance(key.start, str) or isinstance(key.stop, str): - return self.__class__(self._xa.sel({DIMS[1]: key})) + xa_ = self._xa.sel({DIMS[1]: key}) + if adapt_covs_on_component: + xa_.attrs[STATIC_COV_TAG] = xa_.attrs[STATIC_COV_TAG][ + key.start : key.stop + ] + return self.__class__(xa_) elif isinstance(key.start, (int, np.int64)) or isinstance( key.stop, (int, np.int64) ): @@ -3640,9 +3651,11 @@ def _set_freq_in_xa(xa_: xr.DataArray): # handle simple types: elif isinstance(key, str): - return self.__class__( - self._xa.sel({DIMS[1]: [key]}) - ) # have to put key in a list not to drop the dimension + # have to put key in a list not to drop the dimension + xa_ = self._xa.sel({DIMS[1]: [key]}) + if adapt_covs_on_component: + xa_.attrs[STATIC_COV_TAG] = xa_.attrs[STATIC_COV_TAG].loc[[key]] + return self.__class__(xa_) elif isinstance(key, (int, np.int64)): xa_ = self._xa.isel({self._time_dim: [key]}) @@ -3669,7 +3682,10 @@ def _set_freq_in_xa(xa_: xr.DataArray): if isinstance(key, list): if all(isinstance(s, str) for s in key): # when string(s) are provided, we consider it as (a list of) component(s) - return 
self.__class__(self._xa.sel({DIMS[1]: key})) + xa_ = self._xa.sel({DIMS[1]: key}) + if adapt_covs_on_component: + xa_.attrs[STATIC_COV_TAG] = xa_.attrs[STATIC_COV_TAG].loc[key] + return self.__class__(xa_) elif all(isinstance(i, (int, np.int64)) for i in key): xa_ = self._xa.isel({self._time_dim: key}) From 3661385fe74bddff4c70a059e180fa99ceec9000 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 31 May 2022 20:59:50 +0200 Subject: [PATCH 21/26] multicomponent static covariate support for TFT --- darts/models/forecasting/tft_model.py | 20 +++++++++++--- darts/tests/models/forecasting/test_TFT.py | 31 ++++++++++++---------- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/darts/models/forecasting/tft_model.py b/darts/models/forecasting/tft_model.py index 76ea645e58..ce5e324cd5 100644 --- a/darts/models/forecasting/tft_model.py +++ b/darts/models/forecasting/tft_model.py @@ -40,6 +40,7 @@ def __init__( self, output_dim: Tuple[int, int], variables_meta: Dict[str, Dict[str, List[str]]], + num_static_components: int, hidden_size: Union[int, List[int]] = 16, lstm_layers: int = 1, num_attention_heads: int = 4, @@ -60,6 +61,9 @@ def __init__( shape of output given by (n_targets, loss_size). (loss_size corresponds to nr_params in other models). variables_meta : Dict[str, Dict[str, List[str]]] dict containing variable enocder, decoder variable names for mapping tensors in `_TFTModule.forward()` + num_static_components + the number of static components (not variables) of the input target series. This is either equal to the + number of target components or 1. hidden_size : int hidden state size of the TFT. It is the main hyper-parameter and common across the internal TFT architecture. 
@@ -90,6 +94,7 @@ def __init__( self.n_targets, self.loss_size = output_dim self.variables_meta = variables_meta + self.num_static_components = num_static_components self.hidden_size = hidden_size self.hidden_continuous_size = hidden_continuous_size self.lstm_layers = lstm_layers @@ -113,7 +118,11 @@ def __init__( # # processing inputs # continuous variable processing self.prescalers_linear = { - name: nn.Linear(1, self.hidden_continuous_size) for name in self.reals + name: nn.Linear( + 1 if name not in self.static_variables else self.num_static_components, + self.hidden_continuous_size, + ) + for name in self.reals } static_input_sizes = { @@ -412,8 +421,7 @@ def forward( # Embedding and variable selection if self.static_variables: static_embedding = { - name: x_static[:, 0, i].unsqueeze(-1) - for i, name in enumerate(self.static_variables) + name: x_static[:, :, i] for i, name in enumerate(self.static_variables) } static_embedding, static_covariate_var = self.static_covariates_vsn( static_embedding @@ -864,9 +872,13 @@ def _create_model(self, train_sample: MixedCovariatesTrainTensorType) -> nn.Modu dict.fromkeys(static_input) ) + n_static_components = ( + len(static_covariates) if static_covariates is not None else 0 + ) return _TFTModule( - variables_meta=variables_meta, output_dim=self.output_dim, + variables_meta=variables_meta, + num_static_components=n_static_components, hidden_size=self.hidden_size, lstm_layers=self.lstm_layers, dropout=self.dropout, diff --git a/darts/tests/models/forecasting/test_TFT.py b/darts/tests/models/forecasting/test_TFT.py index 441d331699..9f378dc7d1 100644 --- a/darts/tests/models/forecasting/test_TFT.py +++ b/darts/tests/models/forecasting/test_TFT.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from darts import TimeSeries +from darts import TimeSeries, concatenate from darts.dataprocessing.transformers import Scaler from darts.logging import get_logger from darts.tests.base_test_class import DartsBaseTestClass @@ 
-166,36 +166,39 @@ def test_mixed_covariates_and_accuracy(self): ) def test_static_covariates_support(self): - target = tg.sine_timeseries(length=2, freq="h") - target = target.with_static_covariates( - pd.Series([0.0, 1.0], index=["st1", "st2"]) + target_multi = concatenate( + [tg.sine_timeseries(length=10, freq="h")] * 2, axis=1 + ) + + target_multi = target_multi.with_static_covariates( + pd.DataFrame([[0.0, 1.0], [2.0, 3.0]], index=["st1", "st2"]) ) # should work with cyclic encoding for time index model = TFTModel( - input_chunk_length=1, - output_chunk_length=1, + input_chunk_length=3, + output_chunk_length=4, add_encoders={"cyclic": {"future": "hour"}}, pl_trainer_kwargs={"fast_dev_run": True}, ) - model.fit(target, verbose=False) + model.fit(target_multi, verbose=False) assert len(model.model.static_variables) == len( - target.static_covariates.columns + target_multi.static_covariates.columns ) - model.predict(n=1, series=target, verbose=False) + model.predict(n=1, series=target_multi, verbose=False) # raise an error when trained with static covariates of wrong dimensionality - target = target.with_static_covariates( - pd.concat([target.static_covariates] * 2, axis=1) + target_multi = target_multi.with_static_covariates( + pd.concat([target_multi.static_covariates] * 2, axis=1) ) with pytest.raises(ValueError): - model.predict(n=1, series=target, verbose=False) + model.predict(n=1, series=target_multi, verbose=False) # raise an error when trained with static covariates and trying to predict without - target = target.with_static_covariates(None) + target_multi = target_multi.with_static_covariates(None) with pytest.raises(ValueError): - model.predict(n=1, series=target, verbose=False) + model.predict(n=1, series=target_multi, verbose=False) def helper_generate_multivariate_case_data(self, season_length, n_repeat): """generates multivariate test case data. 
Target series is a sine wave stacked with a repeating From 3a9ad83f756289f017cfe90a8c33adff9f782dc1 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Tue, 31 May 2022 21:28:19 +0200 Subject: [PATCH 22/26] added arithmetic static covariate support --- .../test_timeseries_static_covariates.py | 39 +++++++++++++++++++ darts/timeseries.py | 20 +++++++--- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index 1373e65159..cd395f4b7d 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -500,3 +500,42 @@ def test_get_item(self): ts4 = ts["comp2"] assert ts4.static_covariates.index.equals(pd.Index(["comp2"])) assert isinstance(ts4.static_covariates, pd.DataFrame) + + def test_operations(self): + static_covs = pd.DataFrame([[0, 1]], columns=["st1", "st2"]) + ts = TimeSeries.from_values( + values=np.random.random((10, 2)) + ).with_static_covariates(static_covs) + + ts_new = ts / 3 + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = ts * 3 + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = ts**3 + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = ts + 3 + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = ts - 3 + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = abs(ts) + assert ts_new.static_covariates.equals(ts.static_covariates) + + ts_new = 3 * ts + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = 3 + ts + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = 3 - ts + assert ts_new.static_covariates.equals(ts.static_covariates) + + ts_new = ts / ts + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = ts * ts + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = ts**ts + assert 
ts_new.static_covariates.equals(ts.static_covariates) + ts_new = ts + ts + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = ts - ts + assert ts_new.static_covariates.equals(ts.static_covariates) + ts_new = abs(ts) + assert ts_new.static_covariates.equals(ts.static_covariates) diff --git a/darts/timeseries.py b/darts/timeseries.py index 817251b522..8c88e9c647 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -3354,7 +3354,9 @@ def __len__(self): def __add__(self, other): if isinstance(other, (int, float, np.integer)): - return self.__class__(self._xa + other) + xa_ = self._xa + other + xa_.attrs[STATIC_COV_TAG] = self.static_covariates + return self.__class__(xa_) elif isinstance(other, (TimeSeries, xr.DataArray, np.ndarray)): return self._combine_arrays(other, lambda s1, s2: s1 + s2) else: @@ -3372,7 +3374,9 @@ def __radd__(self, other): def __sub__(self, other): if isinstance(other, (int, float, np.integer)): - return self.__class__(self._xa - other) + xa_ = self._xa - other + xa_.attrs[STATIC_COV_TAG] = self.static_covariates + return self.__class__(xa_) elif isinstance(other, (TimeSeries, xr.DataArray, np.ndarray)): return self._combine_arrays(other, lambda s1, s2: s1 - s2) else: @@ -3390,7 +3394,9 @@ def __rsub__(self, other): def __mul__(self, other): if isinstance(other, (int, float, np.integer)): - return self.__class__(self._xa * other) + xa_ = self._xa * other + xa_.attrs[STATIC_COV_TAG] = self.static_covariates + return self.__class__(xa_) elif isinstance(other, (TimeSeries, xr.DataArray, np.ndarray)): return self._combine_arrays(other, lambda s1, s2: s1 * s2) else: @@ -3409,7 +3415,9 @@ def __rmul__(self, other): def __pow__(self, n): if isinstance(n, (int, float, np.integer)): raise_if(n < 0, "Attempted to raise a series to a negative power.", logger) - return self.__class__(self._xa ** float(n)) + xa_ = self._xa ** float(n) + xa_.attrs[STATIC_COV_TAG] = self.static_covariates + return self.__class__(xa_) if 
isinstance(n, (TimeSeries, xr.DataArray, np.ndarray)): return self._combine_arrays(n, lambda s1, s2: s1**s2) # elementwise power else: @@ -3426,7 +3434,9 @@ def __truediv__(self, other): if isinstance(other, (int, float, np.integer)): if other == 0: raise_log(ZeroDivisionError("Cannot divide by 0."), logger) - return self.__class__(self._xa / other) + xa_ = self._xa / other + xa_.attrs[STATIC_COV_TAG] = self.static_covariates + return self.__class__(xa_) elif isinstance(other, (TimeSeries, xr.DataArray, np.ndarray)): if not (other.all_values(copy=False) != 0).all(): raise_log( From f5fa989855a23b5729c4eea13cf9845e0dc9fb10 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sat, 4 Jun 2022 11:40:20 +0200 Subject: [PATCH 23/26] updated all timeseries methods/operations with static cov transfer --- .../filtering/gaussian_process_filter.py | 6 +- darts/models/filtering/kalman_filter.py | 5 +- darts/models/filtering/moving_average.py | 4 +- darts/models/forecasting/baselines.py | 8 +- darts/models/forecasting/forecasting_model.py | 2 + darts/models/forecasting/kalman_forecaster.py | 6 +- darts/models/forecasting/varima.py | 3 +- darts/tests/dataprocessing/dtw/test_dtw.py | 8 +- .../test_timeseries_static_covariates.py | 248 +++++++++++++----- darts/timeseries.py | 175 ++++++++---- darts/utils/statistics.py | 8 +- darts/utils/timeseries_generation.py | 1 + 12 files changed, 342 insertions(+), 132 deletions(-) diff --git a/darts/models/filtering/gaussian_process_filter.py b/darts/models/filtering/gaussian_process_filter.py index 047ff8953a..db3b4f3db2 100644 --- a/darts/models/filtering/gaussian_process_filter.py +++ b/darts/models/filtering/gaussian_process_filter.py @@ -72,4 +72,8 @@ def filter(self, series: TimeSeries, num_samples: int = 1) -> TimeSeries: filtered_values = self.model.sample_y(times, n_samples=num_samples) filtered_values = filtered_values.reshape(len(times), -1, num_samples) - return TimeSeries.from_times_and_values(series.time_index, filtered_values) + 
return TimeSeries.from_times_and_values( + series.time_index, + filtered_values, + static_covariates=series.static_covariates, + ) diff --git a/darts/models/filtering/kalman_filter.py b/darts/models/filtering/kalman_filter.py index cfab623071..748357a31c 100644 --- a/darts/models/filtering/kalman_filter.py +++ b/darts/models/filtering/kalman_filter.py @@ -250,5 +250,8 @@ def filter( ).T return TimeSeries.from_times_and_values( - series.time_index, sampled_outputs, columns=series.columns + series.time_index, + sampled_outputs, + columns=series.columns, + static_covariates=series.static_covariates, ) diff --git a/darts/models/filtering/moving_average.py b/darts/models/filtering/moving_average.py index 2f2318e63b..d6ad95e2ab 100644 --- a/darts/models/filtering/moving_average.py +++ b/darts/models/filtering/moving_average.py @@ -49,4 +49,6 @@ def filter(self, series: TimeSeries): .rolling(window=self.window, min_periods=1, center=self.centered) .mean() ) - return TimeSeries.from_dataframe(filtered_df) + return TimeSeries.from_dataframe( + filtered_df, static_covariates=series.static_covariates + ) diff --git a/darts/models/forecasting/baselines.py b/darts/models/forecasting/baselines.py index b4eb5ca21e..0de8ee91e8 100644 --- a/darts/models/forecasting/baselines.py +++ b/darts/models/forecasting/baselines.py @@ -158,10 +158,14 @@ def ensemble( ) -> Union[TimeSeries, Sequence[TimeSeries]]: if isinstance(predictions, Sequence): return [ - TimeSeries.from_series(p.pd_dataframe().sum(axis=1) / len(self.models)) + TimeSeries.from_series( + p.pd_dataframe().sum(axis=1) / len(self.models), + static_covariates=p.static_covariates, + ) for p in predictions ] else: return TimeSeries.from_series( - predictions.pd_dataframe().sum(axis=1) / len(self.models) + predictions.pd_dataframe().sum(axis=1) / len(self.models), + static_covariates=predictions.static_covariates, ) diff --git a/darts/models/forecasting/forecasting_model.py b/darts/models/forecasting/forecasting_model.py index 
821700e3cd..5870a0e49c 100644 --- a/darts/models/forecasting/forecasting_model.py +++ b/darts/models/forecasting/forecasting_model.py @@ -461,6 +461,7 @@ def historical_forecasts( return TimeSeries.from_times_and_values( pd.DatetimeIndex(last_points_times, freq=series.freq * stride), np.array(last_points_values), + static_covariates=series.static_covariates, ) else: return TimeSeries.from_times_and_values( @@ -470,6 +471,7 @@ def historical_forecasts( step=1, ), np.array(last_points_values), + static_covariates=series.static_covariates, ) return forecasts diff --git a/darts/models/forecasting/kalman_forecaster.py b/darts/models/forecasting/kalman_forecaster.py index 3de23bdb36..d02cd7352e 100644 --- a/darts/models/forecasting/kalman_forecaster.py +++ b/darts/models/forecasting/kalman_forecaster.py @@ -76,7 +76,11 @@ def _predict( time_index = self._generate_new_dates(n) placeholder_vals = np.zeros((n, self.training_series.width)) * np.nan - series_future = TimeSeries.from_times_and_values(time_index, placeholder_vals) + series_future = TimeSeries.from_times_and_values( + time_index, + placeholder_vals, + static_covariates=self.training_series.static_covariates, + ) whole_series = self.training_series.append(series_future) filtered_series = self.darts_kf.filter( whole_series, covariates=future_covariates, num_samples=num_samples diff --git a/darts/models/forecasting/varima.py b/darts/models/forecasting/varima.py index 05bc2d3673..bccadda5b8 100644 --- a/darts/models/forecasting/varima.py +++ b/darts/models/forecasting/varima.py @@ -64,7 +64,8 @@ def fit(self, series: TimeSeries, future_covariates: Optional[TimeSeries] = None ) # needed for back-transformation when d=1 for _ in range(self.d): series = TimeSeries.from_dataframe( - series.pd_dataframe(copy=False).diff().dropna() + df=series.pd_dataframe(copy=False).diff().dropna(), + static_covariates=series.static_covariates, ) super().fit(series, future_covariates) diff --git 
a/darts/tests/dataprocessing/dtw/test_dtw.py b/darts/tests/dataprocessing/dtw/test_dtw.py index 340b2c05c0..0cf59149ff 100644 --- a/darts/tests/dataprocessing/dtw/test_dtw.py +++ b/darts/tests/dataprocessing/dtw/test_dtw.py @@ -1,6 +1,7 @@ import unittest import numpy as np +import pandas as pd from darts.dataprocessing import dtw from darts.metrics import dtw_metric, mae, mape @@ -149,13 +150,16 @@ def test_warp(self): xa1 = self.series1.data_array().rename({"time": "time1"}) xa2 = self.series2.data_array().rename({"time": "time2"}) - series1 = TimeSeries.from_xarray(xa1) - series2 = TimeSeries.from_xarray(xa2) + static_covs = pd.DataFrame([[0.0, 1.0]], columns=["st1", "st2"]) + series1 = TimeSeries.from_xarray(xa1).with_static_covariates(static_covs) + series2 = TimeSeries.from_xarray(xa2).with_static_covariates(static_covs) alignment = dtw.dtw(series1, series2) warped1, warped2 = alignment.warped() self.assertAlmostEqual(alignment.mean_distance(), mae(warped1, warped2)) + assert warped1.static_covariates.equals(series1.static_covariates) + assert warped2.static_covariates.equals(series2.static_covariates) """ See DTWAlignment.warped for why this functionality is currently disabled diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index cd395f4b7d..8865427faa 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -1,4 +1,7 @@ import copy +import os +import shutil +import tempfile import numpy as np import pandas as pd @@ -7,7 +10,7 @@ from darts import TimeSeries, concatenate from darts.dataprocessing.transformers import BoxCox, Scaler from darts.tests.base_test_class import DartsBaseTestClass -from darts.timeseries import DEFAULT_GLOBAL_STATIC_COV_NAME +from darts.timeseries import DEFAULT_GLOBAL_STATIC_COV_NAME, STATIC_COV_TAG from darts.utils.timeseries_generation import _generate_index, linear_timeseries @@ -53,6 +56,68 @@ def 
setUpClass(cls): cls.df_long_multi = df_long_multi cls.df_long_uni = df_long_uni + def setUp(self): + self.temp_work_dir = tempfile.mkdtemp(prefix="darts") + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.temp_work_dir) + + def test_ts_from_x(self): + ts = linear_timeseries(length=10).with_static_covariates( + pd.Series([0.0, 1.0], index=["st1", "st2"]) + ) + + self.helper_test_cov_transfer(ts, TimeSeries.from_xarray(ts.data_array())) + self.helper_test_cov_transfer( + ts, + TimeSeries.from_dataframe( + ts.pd_dataframe(), static_covariates=ts.static_covariates + ), + ) + # ts.pd_series() loses component names -> static covariates have different components names + self.helper_test_cov_transfer_values( + ts, + TimeSeries.from_series( + ts.pd_series(), static_covariates=ts.static_covariates + ), + ) + self.helper_test_cov_transfer( + ts, + TimeSeries.from_times_and_values( + times=ts.time_index, + values=ts.all_values(), + columns=ts.components, + static_covariates=ts.static_covariates, + ), + ) + + self.helper_test_cov_transfer( + ts, + TimeSeries.from_values( + values=ts.all_values(), + columns=ts.components, + static_covariates=ts.static_covariates, + ), + ) + + f_csv = os.path.join(self.temp_work_dir, "temp_ts.csv") + f_pkl = os.path.join(self.temp_work_dir, "temp_ts.pkl") + ts.to_csv(f_csv) + ts.to_pickle(f_pkl) + ts_json = ts.to_json() + + self.helper_test_cov_transfer( + ts, + TimeSeries.from_csv( + f_csv, time_col="time", static_covariates=ts.static_covariates + ), + ) + self.helper_test_cov_transfer(ts, TimeSeries.from_pickle(f_pkl)) + self.helper_test_cov_transfer( + ts, TimeSeries.from_json(ts_json, static_covariates=ts.static_covariates) + ) + def test_timeseries_from_longitudinal_df(self): # univariate static covs: only group by "st1", keep static covs "st1" value_cols = ["a", "b", "c"] @@ -380,39 +445,6 @@ def test_concatenate_dim_samples(self): ts_concat = concatenate([ts_left, ts_right], axis=2) assert 
ts_concat.static_covariates.equals(ts_left.static_covariates) - def test_ts_methods_with_static_covariates(self): - ts = linear_timeseries(length=10).astype("float64") - static_covs = pd.Series([0, 1], index=["st1", "st2"]).astype(int) - ts = ts.with_static_covariates(static_covs) - - assert ts.static_covariates.dtypes[0] == "float64" - ts = ts.astype("float32") - assert ts.static_covariates.dtypes[0] == "float32" - - ts_stochastic = ts.from_times_and_values( - times=ts.time_index, - values=np.random.randn(10, 1, 3), - static_covariates=static_covs, - ) - - ts_check = ts.copy() - assert ts_check.static_covariates.equals(ts.static_covariates) - - ts_check = ts.head() - assert ts_check.static_covariates.equals(ts.static_covariates) - - ts_check = ts.tail() - assert ts_check.static_covariates.equals(ts.static_covariates) - - # same values but different component names ("0" vs. "0_quantiles") - ts_check = ts_stochastic.quantile_timeseries() - assert not ts_check.components.equals(ts_stochastic.components) - assert ts_stochastic.static_covariates.index.equals(ts_stochastic.components) - np.testing.assert_almost_equal( - ts_check.static_covariate_values(copy=False), - ts_stochastic.static_covariate_values(copy=False), - ) - def test_scalers_with_static_covariates(self): ts = linear_timeseries(start_value=1.0, end_value=2.0, length=10) static_covs = pd.Series([0.0, 2.0], index=["st1", "st2"]) @@ -507,35 +539,125 @@ def test_operations(self): values=np.random.random((10, 2)) ).with_static_covariates(static_covs) - ts_new = ts / 3 - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = ts * 3 - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = ts**3 - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = ts + 3 - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = ts - 3 - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = abs(ts) - assert 
ts_new.static_covariates.equals(ts.static_covariates) + # arithmetics with series (left) and non-series (right) + self.helper_test_cov_transfer(ts, ts / 3) + self.helper_test_cov_transfer(ts, ts * 3) + self.helper_test_cov_transfer(ts, ts**3) + self.helper_test_cov_transfer(ts, ts + 3) + self.helper_test_cov_transfer(ts, ts - 3) + + # conditions + self.helper_test_cov_transfer_xa(ts, ts < 3) + self.helper_test_cov_transfer_xa(ts, ts >= 3) + self.helper_test_cov_transfer_xa(ts, ts > 3) + self.helper_test_cov_transfer_xa(ts, ts >= 3) + + # arithmetics with non-series (left) and series (right) + self.helper_test_cov_transfer(ts, 3 * ts) + self.helper_test_cov_transfer(ts, 3 + ts) + self.helper_test_cov_transfer(ts, 3 - ts) + # conditions + self.helper_test_cov_transfer_xa(ts, 3 > ts) + self.helper_test_cov_transfer_xa(ts, 3 >= ts) + self.helper_test_cov_transfer_xa(ts, 3 < ts) + self.helper_test_cov_transfer_xa(ts, 3 <= ts) + + # arithmetics with two series + self.helper_test_cov_transfer(ts, ts / ts) + self.helper_test_cov_transfer(ts, ts * ts) + self.helper_test_cov_transfer(ts, ts**ts) + self.helper_test_cov_transfer(ts, ts + ts) + self.helper_test_cov_transfer(ts, ts - ts) + # conditions + self.helper_test_cov_transfer_xa(ts, ts > ts) + self.helper_test_cov_transfer_xa(ts, ts >= ts) + self.helper_test_cov_transfer_xa(ts, ts < ts) + self.helper_test_cov_transfer_xa(ts, ts <= ts) + + # other operations + self.helper_test_cov_transfer(ts, abs(ts)) + self.helper_test_cov_transfer(ts, -ts) + self.helper_test_cov_transfer(ts, round(ts, 2)) - ts_new = 3 * ts - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = 3 + ts - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = 3 - ts - assert ts_new.static_covariates.equals(ts.static_covariates) + def test_ts_methods_with_static_covariates(self): + ts = linear_timeseries(length=10, start_value=1.0, end_value=2.0).astype( + "float64" + ) + static_covs = pd.Series([0, 1], index=["st1", 
"st2"]).astype(int) + ts = ts.with_static_covariates(static_covs) - ts_new = ts / ts - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = ts * ts - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = ts**ts - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = ts + ts - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = ts - ts - assert ts_new.static_covariates.equals(ts.static_covariates) - ts_new = abs(ts) + assert ts.static_covariates.dtypes[0] == "float64" + ts = ts.astype("float32") + assert ts.static_covariates.dtypes[0] == "float32" + + ts_stoch = ts.from_times_and_values( + times=ts.time_index, + values=np.ones((10, 1, 3)), + static_covariates=static_covs, + ) + assert ts_stoch.static_covariates.index.equals(ts_stoch.components) + + self.helper_test_cov_transfer(ts, ts.with_values(ts.all_values())) + self.helper_test_cov_transfer( + ts, ts.with_columns_renamed(ts.components.tolist(), ts.components.tolist()) + ) + self.helper_test_cov_transfer(ts, ts.copy()) + self.helper_test_cov_transfer(ts, ts.mean()) + self.helper_test_cov_transfer(ts, ts.median()) + self.helper_test_cov_transfer(ts, ts.sum()) + self.helper_test_cov_transfer(ts, ts.min()) + self.helper_test_cov_transfer(ts, ts.max()) + self.helper_test_cov_transfer(ts, ts.head()) + self.helper_test_cov_transfer(ts, ts.tail()) + self.helper_test_cov_transfer(ts, ts.split_after(0.5)[0]) + self.helper_test_cov_transfer(ts, ts.split_after(0.5)[1]) + self.helper_test_cov_transfer(ts, ts.split_before(0.5)[0]) + self.helper_test_cov_transfer(ts, ts.split_before(0.5)[1]) + self.helper_test_cov_transfer(ts, ts.drop_before(0.5)) + self.helper_test_cov_transfer(ts, ts.drop_after(0.5)) + self.helper_test_cov_transfer( + ts, ts.slice(ts.start_time() + ts.freq, ts.end_time() - ts.freq) + ) + self.helper_test_cov_transfer(ts, ts.slice_n_points_after(ts.start_time(), 5)) + self.helper_test_cov_transfer(ts, 
ts.slice_n_points_before(ts.end_time(), 5)) + self.helper_test_cov_transfer(ts, ts.slice_intersect(ts[2:])) + self.helper_test_cov_transfer(ts, ts.strip()) + self.helper_test_cov_transfer(ts, ts.longest_contiguous_slice()) + self.helper_test_cov_transfer(ts, ts.rescale_with_value(2.0)) + self.helper_test_cov_transfer(ts, ts.shift(2.0)) + self.helper_test_cov_transfer(ts, ts.diff()) + self.helper_test_cov_transfer(ts, ts.univariate_component(0)) + self.helper_test_cov_transfer(ts, ts.map(lambda x: x + 1)) + self.helper_test_cov_transfer(ts, ts.resample(ts.freq)) + self.helper_test_cov_transfer(ts, ts[:5].append(ts[5:])) + self.helper_test_cov_transfer(ts, ts.append_values(ts.all_values())) + + self.helper_test_cov_transfer(ts_stoch, ts_stoch.var()) + self.helper_test_cov_transfer(ts_stoch, ts_stoch.std()) + self.helper_test_cov_transfer(ts_stoch, ts_stoch.skew()) + self.helper_test_cov_transfer(ts_stoch, ts_stoch.kurtosis()) + + # will append "_quantile" to component names + self.helper_test_cov_transfer_values(ts_stoch, ts_stoch.quantile_timeseries()) + self.helper_test_cov_transfer_values(ts_stoch, ts_stoch.quantile(0.5)) + # will change component names + self.helper_test_cov_transfer_values(ts, ts.add_datetime_attribute("hour")) + self.helper_test_cov_transfer_values(ts, ts.add_holidays("US")) + + def helper_test_cov_transfer(self, ts, ts_new): + """static cov dataframes must be identical""" assert ts_new.static_covariates.equals(ts.static_covariates) + + def helper_test_cov_transfer_xa(self, ts, xa_new): + """static cov dataframes must be identical between xarray and TimeSeries""" + assert xa_new.attrs[STATIC_COV_TAG].equals(ts.static_covariates) + + def helper_test_cov_transfer_values(self, ts, ts_new): + """values of static cov dataframes must match but not row index (component names). + I.e. 
series.quantile_timeseries() adds "_quantiles" to component names + """ + assert not ts_new.static_covariates.index.equals(ts.components) + np.testing.assert_almost_equal( + ts_new.static_covariate_values(copy=False), + ts.static_covariate_values(copy=False), + ) diff --git a/darts/timeseries.py b/darts/timeseries.py index 8c88e9c647..1f9f6eda91 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -230,17 +230,19 @@ def __init__(self, xa: xr.DataArray): pass if static_covariates is None: - self._xa.attrs[STATIC_COV_TAG] = None + self._xa = _xarray_with_static_covariates(self._xa, None) else: static_covariates.index = ( self.components if len(static_covariates) == self.n_components else [DEFAULT_GLOBAL_STATIC_COV_NAME] ) + static_covariates.columns.name = STATIC_COV_TAG # convert numerical columns to same dtype as series numeric_cols = static_covariates.select_dtypes(include=np.number).columns - self._xa.attrs[STATIC_COV_TAG] = static_covariates.astype( - {col: self.dtype for col in numeric_cols} + self._xa = _xarray_with_static_covariates( + self._xa, + static_covariates.astype({col: self.dtype for col in numeric_cols}), ) """ @@ -868,7 +870,11 @@ def from_values( ) @classmethod - def from_json(cls, json_str: str) -> "TimeSeries": + def from_json( + cls, + json_str: str, + static_covariates: Optional[Union[pd.Series, pd.DataFrame]] = None, + ) -> "TimeSeries": """ Build a series from the JSON String representation of a ``TimeSeries`` (produced using :func:`TimeSeries.to_json()`). @@ -879,6 +885,14 @@ def from_json(cls, json_str: str) -> "TimeSeries": ---------- json_str The JSON String to convert + static_covariates + Optionally, a set of static covariates to be added to the TimeSeries. Either a pandas Series or a pandas + DataFrame. If a Series, the index represents the static variables. The covariates are globally 'applied' + to all components of the TimeSeries. 
If a DataFrame, the columns represent the static variables and the + rows represent the components of the uni/multivariate TimeSeries. If a single-row DataFrame, the covariates + are globally 'applied' to all components of the TimeSeries. If a multi-row DataFrame, the number of + rows must match the number of components of the TimeSeries (in this case, the number of columns in + ``value_cols``). This adds control for component-specific static covariates. Returns ------- @@ -886,7 +900,7 @@ def from_json(cls, json_str: str) -> "TimeSeries": The time series object converted from the JSON String """ df = pd.read_json(json_str, orient="split") - return cls.from_dataframe(df) + return cls.from_dataframe(df, static_covariates=static_covariates) @classmethod def from_pickle(cls, path: str) -> "TimeSeries": @@ -1987,7 +2001,9 @@ def strip(self) -> "TimeSeries": new_start_idx = df.first_valid_index() new_end_idx = df.last_valid_index() new_series = df.loc[new_start_idx:new_end_idx] - return self.__class__.from_dataframe(new_series) + return self.__class__.from_dataframe( + new_series, static_covariates=self.static_covariates + ) def longest_contiguous_slice(self, max_gap_size: int = 0) -> "TimeSeries": """ @@ -2258,7 +2274,10 @@ def append_values(self, values: np.ndarray) -> "TimeSeries": return self.append( self.__class__.from_times_and_values( - values=values, times=idx, fill_missing_dates=False + values=values, + times=idx, + fill_missing_dates=False, + static_covariates=self.static_covariates, ) ) @@ -2380,7 +2399,9 @@ def stack(self, other: "TimeSeries") -> "TimeSeries": new_other_xa = other_xa new_xa = xr.concat((self._xa, new_other_xa), dim=DIMS[1]) - new_xa.attrs[STATIC_COV_TAG] = _concat_static_covs([self, other]) + new_xa = _xarray_with_static_covariates( + new_xa, _concat_static_covs([self, other]) + ) # we call the factory method here to disambiguate column names if needed. 
return self.__class__.from_xarray(new_xa, fill_missing_dates=False) @@ -2626,6 +2647,11 @@ def to_json(self) -> str: At the moment this function works only on deterministic time series (i.e., made of 1 sample). + Notes + ----- + Static covariates are not returned in the JSON string. When using `TimeSeries.from_json()`, the static + covariates can be added with input argument `static_covariates`. + Returns ------- str @@ -3298,7 +3324,9 @@ def _restore_xarray_from_frequency(xa: xr.DataArray, freq: str) -> xr.DataArray: } resampled_xa = xr.DataArray( - data=np.empty(shape=((len(resampled_time_index),) + xa.shape[1:])), + data=np.empty( + shape=((len(resampled_time_index),) + xa.shape[1:]), dtype=xa.dtype + ), dims=xa.dims, coords=coords, attrs=xa.attrs, @@ -3354,8 +3382,9 @@ def __len__(self): def __add__(self, other): if isinstance(other, (int, float, np.integer)): - xa_ = self._xa + other - xa_.attrs[STATIC_COV_TAG] = self.static_covariates + xa_ = _xarray_with_static_covariates( + self._xa + other, self.static_covariates + ) return self.__class__(xa_) elif isinstance(other, (TimeSeries, xr.DataArray, np.ndarray)): return self._combine_arrays(other, lambda s1, s2: s1 + s2) @@ -3374,8 +3403,9 @@ def __radd__(self, other): def __sub__(self, other): if isinstance(other, (int, float, np.integer)): - xa_ = self._xa - other - xa_.attrs[STATIC_COV_TAG] = self.static_covariates + xa_ = _xarray_with_static_covariates( + self._xa - other, self.static_covariates + ) return self.__class__(xa_) elif isinstance(other, (TimeSeries, xr.DataArray, np.ndarray)): return self._combine_arrays(other, lambda s1, s2: s1 - s2) @@ -3394,8 +3424,9 @@ def __rsub__(self, other): def __mul__(self, other): if isinstance(other, (int, float, np.integer)): - xa_ = self._xa * other - xa_.attrs[STATIC_COV_TAG] = self.static_covariates + xa_ = _xarray_with_static_covariates( + self._xa * other, self.static_covariates + ) return self.__class__(xa_) elif isinstance(other, (TimeSeries, xr.DataArray, 
np.ndarray)): return self._combine_arrays(other, lambda s1, s2: s1 * s2) @@ -3415,8 +3446,9 @@ def __rmul__(self, other): def __pow__(self, n): if isinstance(n, (int, float, np.integer)): raise_if(n < 0, "Attempted to raise a series to a negative power.", logger) - xa_ = self._xa ** float(n) - xa_.attrs[STATIC_COV_TAG] = self.static_covariates + xa_ = _xarray_with_static_covariates( + self._xa ** float(n), self.static_covariates + ) return self.__class__(xa_) if isinstance(n, (TimeSeries, xr.DataArray, np.ndarray)): return self._combine_arrays(n, lambda s1, s2: s1**s2) # elementwise power @@ -3434,8 +3466,9 @@ def __truediv__(self, other): if isinstance(other, (int, float, np.integer)): if other == 0: raise_log(ZeroDivisionError("Cannot divide by 0."), logger) - xa_ = self._xa / other - xa_.attrs[STATIC_COV_TAG] = self.static_covariates + xa_ = _xarray_with_static_covariates( + self._xa / other, self.static_covariates + ) return self.__class__(xa_) elif isinstance(other, (TimeSeries, xr.DataArray, np.ndarray)): if not (other.all_values(copy=False) != 0).all(): @@ -3471,9 +3504,13 @@ def __round__(self, n=None): def __lt__(self, other) -> xr.DataArray: if isinstance(other, (int, float, np.integer, np.ndarray, xr.DataArray)): - series = self._xa < other + return _xarray_with_static_covariates( + self._xa < other, self.static_covariates + ) elif isinstance(other, TimeSeries): - series = self._xa < other.data_array(copy=False) + return _xarray_with_static_covariates( + self._xa < other.data_array(copy=False), self.static_covariates + ) else: raise_log( TypeError( @@ -3483,15 +3520,18 @@ def __lt__(self, other) -> xr.DataArray: ), logger, ) - return series # Note: we return a DataArray def __gt__(self, other) -> xr.DataArray: + if isinstance(other, (int, float, np.integer, np.ndarray, xr.DataArray)): - series = self._xa > other + return _xarray_with_static_covariates( + self._xa > other, self.static_covariates + ) elif isinstance(other, TimeSeries): - series = self._xa 
> other.data_array(copy=False) + return _xarray_with_static_covariates( + self._xa > other.data_array(copy=False), self.static_covariates + ) else: - series = None raise_log( TypeError( "unsupported operand type(s) for < : '{}' and '{}'.".format( @@ -3500,15 +3540,17 @@ def __gt__(self, other) -> xr.DataArray: ), logger, ) - return series # Note: we return a DataArray def __le__(self, other) -> xr.DataArray: if isinstance(other, (int, float, np.integer, np.ndarray, xr.DataArray)): - series = self._xa <= other + return _xarray_with_static_covariates( + self._xa <= other, self.static_covariates + ) elif isinstance(other, TimeSeries): - series = self._xa <= other.data_array(copy=False) + return _xarray_with_static_covariates( + self._xa <= other.data_array(copy=False), self.static_covariates + ) else: - series = None raise_log( TypeError( "unsupported operand type(s) for < : '{}' and '{}'.".format( @@ -3517,15 +3559,17 @@ def __le__(self, other) -> xr.DataArray: ), logger, ) - return series # Note: we return a DataArray def __ge__(self, other) -> xr.DataArray: if isinstance(other, (int, float, np.integer, np.ndarray, xr.DataArray)): - series = self._xa >= other + return _xarray_with_static_covariates( + self._xa >= other, self.static_covariates + ) elif isinstance(other, TimeSeries): - series = self._xa >= other.data_array(copy=False) + return _xarray_with_static_covariates( + self._xa >= other.data_array(copy=False), self.static_covariates + ) else: - series = None raise_log( TypeError( "unsupported operand type(s) for < : '{}' and '{}'.".format( @@ -3534,7 +3578,6 @@ def __ge__(self, other) -> xr.DataArray: ), logger, ) - return series # Note: we return a DataArray def __str__(self): return str(self._xa).replace("xarray.DataArray", "TimeSeries (DataArray)") @@ -3637,9 +3680,9 @@ def _set_freq_in_xa(xa_: xr.DataArray): if isinstance(key.start, str) or isinstance(key.stop, str): xa_ = self._xa.sel({DIMS[1]: key}) if adapt_covs_on_component: - xa_.attrs[STATIC_COV_TAG] 
= xa_.attrs[STATIC_COV_TAG][ - key.start : key.stop - ] + xa_ = _xarray_with_static_covariates( + xa_, xa_.attrs[STATIC_COV_TAG][key.start : key.stop] + ) return self.__class__(xa_) elif isinstance(key.start, (int, np.int64)) or isinstance( key.stop, (int, np.int64) @@ -3664,7 +3707,9 @@ def _set_freq_in_xa(xa_: xr.DataArray): # have to put key in a list not to drop the dimension xa_ = self._xa.sel({DIMS[1]: [key]}) if adapt_covs_on_component: - xa_.attrs[STATIC_COV_TAG] = xa_.attrs[STATIC_COV_TAG].loc[[key]] + xa_ = _xarray_with_static_covariates( + xa_, xa_.attrs[STATIC_COV_TAG].loc[[key]] + ) return self.__class__(xa_) elif isinstance(key, (int, np.int64)): xa_ = self._xa.isel({self._time_dim: [key]}) @@ -3694,7 +3739,9 @@ def _set_freq_in_xa(xa_: xr.DataArray): # when string(s) are provided, we consider it as (a list of) component(s) xa_ = self._xa.sel({DIMS[1]: key}) if adapt_covs_on_component: - xa_.attrs[STATIC_COV_TAG] = xa_.attrs[STATIC_COV_TAG].loc[key] + xa_ = _xarray_with_static_covariates( + xa_, xa_.attrs[STATIC_COV_TAG].loc[key] + ) return self.__class__(xa_) elif all(isinstance(i, (int, np.int64)) for i in key): xa_ = self._xa.isel({self._time_dim: key}) @@ -3730,25 +3777,35 @@ def _set_freq_in_xa(xa_: xr.DataArray): raise_log(IndexError("The type of your index was not matched."), logger) +def _xarray_with_static_covariates(xa_, static_covariates): + """Return an DataArray instance with static covariates stored in the array's attributes. + Warning: This is an inplace operation (mutable) and should only be called from within TimeSeries construction + or to restore static covariates after operations in which static covariates did not get transferred. + """ + xa_.attrs[STATIC_COV_TAG] = static_covariates + return xa_ + + def _concat_static_covs(series: Sequence["TimeSeries"]) -> Optional[pd.DataFrame]: - """Concatenates static covariates. 
Some context for stacking or concatenating two or more TimeSeries with - static covariates: - - Concat along axis=0 (time) - Along time dimension, we only take the static covariates of the first series (as static covariates are - time-independant). - Concat along axis=1 (components) or stacking - Along component dimension, we concatenate/transfer the static covariates of the series only if one of - below cases applies: - 1) concatenate when for each series the number of static covariate components is equal to the number of - components in the series. The static variable names (columns in series.static_covariates) must be - identical across all series - 2) if only the first series contains static covariates transfer only those - 3) if `ignore_static_covarites=True` (with `concatenate()`), case 1) is ignored and only the static - covariates of the first series are transferred - Concat along axis=2 (samples) - Along sample dimension, we only take the static covariates of the first series (as we components and - time don't change). + """Concatenates static covariates along component dimension (rows of static covariates). For stacking or + concatenating TimeSeries along component dimension (axis=1). + + Some context for stacking or concatenating two or more TimeSeries with static covariates: + Concat along axis=0 (time) + Along time dimension, we only take the static covariates of the first series (as static covariates are + time-independent). + Concat along axis=1 (components) or stacking + Along component dimension, we either concatenate or transfer the static covariates of the series if one + of below cases applies: + 1) concatenate along component dimension (rows of static covariates) when for each series the number of + static covariate components is equal to the number of components in the series. 
The static variable + names (columns in series.static_covariates) must be identical across all series + 2) if only the first series contains static covariates transfer only those + 3) if `ignore_static_covarites=True` (with `concatenate()`), case 1) is ignored and only the static + covariates of the first series are transferred + Concat along axis=2 (samples) + Along sample dimension, we only take the static covariates of the first series (as we components and + time don't change). """ if not any([ts.has_static_covariates for ts in series]): @@ -3880,7 +3937,9 @@ def concatenate( ) da_concat = da_concat.assign_coords({time_dim_name: tindex}) - da_concat.attrs[STATIC_COV_TAG] = series[0].static_covariates + da_concat = _xarray_with_static_covariates( + da_concat, series[0].static_covariates + ) else: time_axes_equal = all( diff --git a/darts/utils/statistics.py b/darts/utils/statistics.py index 0db4014443..77d52b7e72 100644 --- a/darts/utils/statistics.py +++ b/darts/utils/statistics.py @@ -192,8 +192,12 @@ def extract_trend_and_seasonality( else: raise_log(ValueError(f"Unknown value for method: {method}"), logger) - season = TimeSeries.from_times_and_values(ts.time_index, decomp.seasonal) - trend = TimeSeries.from_times_and_values(ts.time_index, decomp.trend) + season = TimeSeries.from_times_and_values( + ts.time_index, decomp.seasonal, static_covariates=ts.static_covariates + ) + trend = TimeSeries.from_times_and_values( + ts.time_index, decomp.trend, static_covariates=ts.static_covariates + ) return trend, season diff --git a/darts/utils/timeseries_generation.py b/darts/utils/timeseries_generation.py index acd0b58526..8b0e719062 100644 --- a/darts/utils/timeseries_generation.py +++ b/darts/utils/timeseries_generation.py @@ -715,6 +715,7 @@ def _build_forecast_series( np.stack(points_preds, axis=2), freq=input_series.freq_str, columns=input_series.columns, + static_covariates=input_series.static_covariates, ) From 41adf3f647194148b2c9344973c18b66fed0c1d3 Mon Sep 
17 00:00:00 2001 From: dennisbader Date: Sat, 4 Jun 2022 16:41:06 +0200 Subject: [PATCH 24/26] applied suggestion from PR review part 1 --- .../forecasting/torch_forecasting_model.py | 12 +++-- .../test_global_forecasting_models.py | 11 ++++- .../test_timeseries_static_covariates.py | 46 +++++++++---------- darts/timeseries.py | 28 +++++------ darts/utils/data/horizon_based_dataset.py | 2 +- darts/utils/data/inference_dataset.py | 2 +- darts/utils/data/shifted_dataset.py | 2 +- 7 files changed, 56 insertions(+), 47 deletions(-) diff --git a/darts/models/forecasting/torch_forecasting_model.py b/darts/models/forecasting/torch_forecasting_model.py index 49c12c3cd4..ecde4fc8b9 100644 --- a/darts/models/forecasting/torch_forecasting_model.py +++ b/darts/models/forecasting/torch_forecasting_model.py @@ -1522,7 +1522,7 @@ def _basic_compare_sample(train_sample: Tuple, predict_sample: Tuple): and ( c_train.shape[-1] != c_pred.shape[-1] if c_descr != "static covariates" - else c_train.shape == c_pred.shape + else c_train.shape != c_pred.shape ), f"The provided {c_descr} must have dimensionality matching that of the covariates used for training " "the model.", @@ -1568,18 +1568,22 @@ def _mixed_compare_sample(train_sample: Tuple, predict_sample: Tuple): zip(train_has_ds, predict_has_ds, ds_names) ): raise_if( - ds_in_train and not ds_in_predict and ds_in_train, + ds_in_train and not ds_in_predict, f"This model has been trained with `{ds_name}`; some `{ds_name}` of matching dimensionality are needed " f"for prediction.", ) raise_if( - ds_in_train and not ds_in_predict and ds_in_predict, + not ds_in_train and ds_in_predict, f"This model has been trained without `{ds_name}`; No `{ds_name}` should be provided for prediction.", ) raise_if( ds_in_train and ds_in_predict - and train_datasets[idx].shape[-1] != predict_datasets[idx].shape[-1], + and ( + train_datasets[idx].shape[-1] != predict_datasets[idx].shape[-1] + if ds_name != "static_covariates" + else 
train_datasets[idx].shape != predict_datasets[idx].shape + ), f"The provided `{ds_name}` must have equal dimensionality as the `{ds_name}` used for training the model.", ) diff --git a/darts/tests/models/forecasting/test_global_forecasting_models.py b/darts/tests/models/forecasting/test_global_forecasting_models.py index ed6d70f883..3afcb345f4 100644 --- a/darts/tests/models/forecasting/test_global_forecasting_models.py +++ b/darts/tests/models/forecasting/test_global_forecasting_models.py @@ -2,6 +2,7 @@ from unittest.mock import ANY, patch import numpy as np +import pandas as pd from darts.dataprocessing.transformers import Scaler from darts.datasets import AirPassengersDataset @@ -107,8 +108,13 @@ class GlobalForecastingModelsTestCase(DartsBaseTestClass): np.random.seed(42) torch.manual_seed(42) + # some arbitrary static covariates + static_covariates = pd.DataFrame([[0.0, 1.0]], columns=["st1", "st2"]) + # real timeseries for functionality tests - ts_passengers = AirPassengersDataset().load() + ts_passengers = ( + AirPassengersDataset().load().with_static_covariates(static_covariates) + ) scaler = Scaler() ts_passengers = scaler.fit_transform(ts_passengers) ts_pass_train, ts_pass_val = ts_passengers[:-36], ts_passengers[-36:] @@ -174,6 +180,9 @@ def test_single_ts(self): "Model {} produces errors too high (one time " "series). 
Error = {}".format(model_cls, mape_err), ) + self.assertTrue( + pred.static_covariates.equals(self.ts_passengers.static_covariates) + ) def test_multi_ts(self): for model_cls, kwargs, err in models_cls_kwargs_errs: diff --git a/darts/tests/test_timeseries_static_covariates.py b/darts/tests/test_timeseries_static_covariates.py index 8865427faa..0791ecc327 100644 --- a/darts/tests/test_timeseries_static_covariates.py +++ b/darts/tests/test_timeseries_static_covariates.py @@ -135,7 +135,7 @@ def test_timeseries_from_longitudinal_df(self): ) assert ts.static_covariates.shape == (1, 1) assert ts.static_covariates.columns.equals(pd.Index(["st1"])) - assert (ts.static_covariate_values(copy=False) == [[i]]).all() + assert (ts.static_covariates_values(copy=False) == [[i]]).all() # multivariate static covs: only group by "st1", keep static covs "st1", "constant" ts_groups2 = TimeSeries.from_group_dataframe( @@ -149,7 +149,7 @@ def test_timeseries_from_longitudinal_df(self): for i, ts in enumerate(ts_groups2): assert ts.static_covariates.shape == (1, 2) assert ts.static_covariates.columns.equals(pd.Index(["st1", "constant"])) - assert (ts.static_covariate_values(copy=False) == [[i, 1]]).all() + assert (ts.static_covariates_values(copy=False) == [[i, 1]]).all() # multivariate static covs: group by "st1" and "st2", keep static covs "st1", "st2", "constant" ts_groups3 = TimeSeries.from_group_dataframe( @@ -167,7 +167,7 @@ def test_timeseries_from_longitudinal_df(self): assert ts.static_covariates.columns.equals( pd.Index(["st1", "st2", "constant"]) ) - assert (ts.static_covariate_values(copy=False) == [[i, j, 1]]).all() + assert (ts.static_covariates_values(copy=False) == [[i, j, 1]]).all() df = copy.deepcopy(self.df_long_multi) df.loc[:, "non_static"] = np.arange(len(df)) @@ -204,7 +204,7 @@ def test_with_static_covariates_univariate(self): ts = ts.with_static_covariates(static_covs_series) assert ts.has_static_covariates np.testing.assert_almost_equal( - 
ts.static_covariate_values(copy=False), + ts.static_covariates_values(copy=False), np.expand_dims(static_covs_series.values, -1).T, ) assert ts.static_covariates.index.equals(ts.components) @@ -213,7 +213,7 @@ def test_with_static_covariates_univariate(self): ts = ts.with_static_covariates(static_covs_df) assert ts.has_static_covariates np.testing.assert_almost_equal( - ts.static_covariate_values(copy=False), static_covs_df.values + ts.static_covariates_values(copy=False), static_covs_df.values ) assert ts.static_covariates.index.equals(ts.components) @@ -231,23 +231,23 @@ def test_with_static_covariates_univariate(self): static_covs_multi = pd.concat([static_covs_series] * 2, axis=1).T _ = ts.with_static_covariates(static_covs_multi) - def test_static_covariate_values(self): + def test_static_covariates_values(self): ts = linear_timeseries(length=10) static_covs = pd.DataFrame([[0.0, 1.0]], columns=["st1", "st2"]) ts = ts.with_static_covariates(static_covs) # changing values of copy should not change original DataFrame - vals = ts.static_covariate_values(copy=True) + vals = ts.static_covariates_values(copy=True) vals[:] = -1.0 - assert (ts.static_covariate_values(copy=False) != -1.0).all() + assert (ts.static_covariates_values(copy=False) != -1.0).all() # changing values of view should change original DataFrame - vals = ts.static_covariate_values(copy=False) + vals = ts.static_covariates_values(copy=False) vals[:] = -1.0 - assert (ts.static_covariate_values(copy=False) == -1.0).all() + assert (ts.static_covariates_values(copy=False) == -1.0).all() ts = ts.with_static_covariates(None) - assert ts.static_covariate_values() is None + assert ts.static_covariates_values() is None def test_with_static_covariates_multivariate(self): ts = linear_timeseries(length=10) @@ -261,7 +261,7 @@ def test_with_static_covariates_multivariate(self): ) assert ts_multi.static_covariates.columns.equals(static_covs.columns) np.testing.assert_almost_equal( - 
ts_multi.static_covariate_values(copy=False), static_covs.loc[0:0].values + ts_multi.static_covariates_values(copy=False), static_covs.loc[0:0].values ) # from multivariate static covariates @@ -269,7 +269,7 @@ def test_with_static_covariates_multivariate(self): assert ts_multi.static_covariates.index.equals(ts_multi.components) assert ts_multi.static_covariates.columns.equals(static_covs.columns) np.testing.assert_almost_equal( - ts_multi.static_covariate_values(copy=False), static_covs.values + ts_multi.static_covariates_values(copy=False), static_covs.values ) # raise an error if multivariate static covariates columns don't match the number of components in the series @@ -297,7 +297,7 @@ def test_stack(self): ts_stacked1 = ts_uni.stack(ts_uni) assert ts_stacked1.static_covariates.index.equals(ts_stacked1.components) np.testing.assert_almost_equal( - ts_stacked1.static_covariate_values(copy=False), + ts_stacked1.static_covariates_values(copy=False), pd.concat([ts_uni.static_covariates] * 2, axis=0).values, ) @@ -305,8 +305,8 @@ def test_stack(self): # -> this gives multivar ts with univar static covs ts_stacked2 = ts_uni.stack(ts_uni.with_static_covariates(None)) np.testing.assert_almost_equal( - ts_stacked2.static_covariate_values(copy=False), - ts_uni.static_covariate_values(copy=False), + ts_stacked2.static_covariates_values(copy=False), + ts_uni.static_covariates_values(copy=False), ) # mismatch between column names @@ -320,7 +320,7 @@ def test_stack(self): # valid univar ts with univar static covariates + multivar ts with multivar static covariates ts_stacked3 = ts_uni.stack(ts_multi) np.testing.assert_almost_equal( - ts_stacked3.static_covariate_values(copy=False), + ts_stacked3.static_covariates_values(copy=False), pd.concat( [ts_uni.static_covariates, ts_multi.static_covariates], axis=0 ).values, @@ -371,7 +371,7 @@ def test_concatenate_dim_component(self): assert ts_concat.static_covariates.shape == (2, 2) assert 
ts_concat.components.equals(ts_concat.static_covariates.index) np.testing.assert_almost_equal( - ts_concat.static_covariate_values(copy=False), + ts_concat.static_covariates_values(copy=False), pd.concat([static_covs_uni1] * 2, axis=0).values, ) @@ -390,8 +390,8 @@ def test_concatenate_dim_component(self): pd.Index([DEFAULT_GLOBAL_STATIC_COV_NAME]) ) np.testing.assert_almost_equal( - ts_concat.static_covariate_values(copy=False), - ts_uni_static_uni1.static_covariate_values(copy=False), + ts_concat.static_covariates_values(copy=False), + ts_uni_static_uni1.static_covariates_values(copy=False), ) # concatenation with inconsistent number of static covariates should fail ... @@ -407,7 +407,7 @@ def test_concatenate_dim_component(self): assert ts_concat.static_covariates.shape == (ts_concat.n_components, 2) assert ts_concat.components.equals(ts_concat.static_covariates.index) np.testing.assert_almost_equal( - ts_concat.static_covariate_values(copy=False), + ts_concat.static_covariates_values(copy=False), pd.concat([static_covs_uni1, static_covs_multi], axis=0), ) @@ -658,6 +658,6 @@ def helper_test_cov_transfer_values(self, ts, ts_new): """ assert not ts_new.static_covariates.index.equals(ts.components) np.testing.assert_almost_equal( - ts_new.static_covariate_values(copy=False), - ts.static_covariate_values(copy=False), + ts_new.static_covariates_values(copy=False), + ts.static_covariates_values(copy=False), ) diff --git a/darts/timeseries.py b/darts/timeseries.py index 1f9f6eda91..6ed5687aea 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -938,20 +938,7 @@ def static_covariates(self) -> Optional[pd.DataFrame]: """ Returns the static covariates contained in the series as a pandas DataFrame. The columns represent the static variables and the rows represent the components of the uni/multivariate - series. If a single-row DataFrame, the covariates are globally 'applied' to all components of the - TimeSeries. 
If a multi-row DataFrame, the static covariates are component-specific, with the number of rows - matching the number of components of the series. Use below methods to add static covariates to your TimeSeries - objects. - - See Also - -------- - TimeSeries.with_static_covariates : Return a copy of a series with added static covariates - TimeSeries.from_dataframe : Create from a :class:`pandas.DataFrame`. - TimeSeries.from_group_dataframe : Create multiple TimeSeries by groups from a :class:`pandas.DataFrame`. - TimeSeries.from_series : Create from a :class:`pandas.Series`. - TimeSeries.from_values : Create from a NumPy :class:`ndarray`. - TimeSeries.from_times_and_values : Create from a time index and a Numpy :class:`ndarray`. - TimeSeries.from_csv : Create from a CSV file. + series. """ return self._xa.attrs.get(STATIC_COV_TAG, None) @@ -1499,7 +1486,7 @@ def univariate_values(self, copy: bool = True, sample: int = 0) -> np.ndarray: else: return self._xa[:, 0, sample].values - def static_covariate_values(self, copy: bool = True) -> Optional[np.ndarray]: + def static_covariates_values(self, copy: bool = True) -> Optional[np.ndarray]: """ Return a 2-D array of dimension (component, static variable), containing the static covariate values of the TimeSeries. 
@@ -3323,9 +3310,18 @@ def _restore_xarray_from_frequency(xa: xr.DataArray, freq: str) -> xr.DataArray: xa.dims[1]: xa.coords[DIMS[1]], } + # convert to float as for instance integer arrays cannot accept nans + dtype = ( + xa.dtype + if ( + np.issubdtype(xa.values.dtype, np.float32) + or np.issubdtype(xa.values.dtype, np.float64) + ) + else np.float64 + ) resampled_xa = xr.DataArray( data=np.empty( - shape=((len(resampled_time_index),) + xa.shape[1:]), dtype=xa.dtype + shape=((len(resampled_time_index),) + xa.shape[1:]), dtype=dtype ), dims=xa.dims, coords=coords, diff --git a/darts/utils/data/horizon_based_dataset.py b/darts/utils/data/horizon_based_dataset.py index afba689de0..d132794b42 100644 --- a/darts/utils/data/horizon_based_dataset.py +++ b/darts/utils/data/horizon_based_dataset.py @@ -181,5 +181,5 @@ def __getitem__( "input (or output) chunk relative to the target series.", ) - static_covariate = target_series.static_covariate_values(copy=False) + static_covariate = target_series.static_covariates_values(copy=False) return past_target, covariate, static_covariate, future_target diff --git a/darts/utils/data/inference_dataset.py b/darts/utils/data/inference_dataset.py index e1d3fd9230..c2835ba575 100644 --- a/darts/utils/data/inference_dataset.py +++ b/darts/utils/data/inference_dataset.py @@ -216,7 +216,7 @@ def __getitem__( else None ) - static_covariate = target_series.static_covariate_values(copy=False) + static_covariate = target_series.static_covariates_values(copy=False) return ( past_target, past_covariate, diff --git a/darts/utils/data/shifted_dataset.py b/darts/utils/data/shifted_dataset.py index d53b7efc19..9e9a139139 100644 --- a/darts/utils/data/shifted_dataset.py +++ b/darts/utils/data/shifted_dataset.py @@ -631,5 +631,5 @@ def __getitem__( f"target series.", ) - static_covariate = target_series.static_covariate_values(copy=False) + static_covariate = target_series.static_covariates_values(copy=False) return past_target, covariate, 
static_covariate, future_target From 6dc7ff886099e775699e5e2dd60fbf8ccd38e75e Mon Sep 17 00:00:00 2001 From: Dennis Bader Date: Sat, 4 Jun 2022 16:44:42 +0200 Subject: [PATCH 25/26] apply suggestions from code review part 2 Co-authored-by: Julien Herzen --- darts/dataprocessing/transformers/boxcox.py | 2 +- darts/dataprocessing/transformers/scaler.py | 2 +- darts/timeseries.py | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/darts/dataprocessing/transformers/boxcox.py b/darts/dataprocessing/transformers/boxcox.py index f1402781c9..eab7bcfe6a 100644 --- a/darts/dataprocessing/transformers/boxcox.py +++ b/darts/dataprocessing/transformers/boxcox.py @@ -42,7 +42,7 @@ def __init__( ----- The scaler will not scale the series' static covariates. This has to be done either before constructing the series, or later on by extracting the covariates, transforming the values and then reapplying them to the - series. For this, see Timeseries properties `TimeSeries.static_covariates` and method + series. For this, see TimeSeries properties `TimeSeries.static_covariates` and method `TimeSeries.with_static_covariates()` Parameters diff --git a/darts/dataprocessing/transformers/scaler.py b/darts/dataprocessing/transformers/scaler.py index 8efa93ab9b..de56d29947 100644 --- a/darts/dataprocessing/transformers/scaler.py +++ b/darts/dataprocessing/transformers/scaler.py @@ -35,7 +35,7 @@ def __init__( ----- The scaler will not scale the series' static covariates. This has to be done either before constructing the series, or later on by extracting the covariates, transforming the values and then reapplying them to the - series. For this, see Timeseries properties `TimeSeries.static_covariates` and method + series. 
For this, see TimeSeries properties `TimeSeries.static_covariates` and method `TimeSeries.with_static_covariates()` Parameters diff --git a/darts/timeseries.py b/darts/timeseries.py index 6ed5687aea..bc79de503b 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -25,7 +25,7 @@ - Be non-empty. ``TimeSeries`` can contain global or component-specific static covariate data. Static covariates in `darts` refers -to external time-invariant data that can be used by some forecasting models to help improve predictions. +to external time-invariant data that can be used by some models to help improve predictions. Read our `user guide on covariates `__ and the ``TimeSeries`` documentation for more information on covariates. """ @@ -584,7 +584,7 @@ def from_group_dataframe( and a list of columns `value_cols` has to represent the values for the individual time series. Values from columns ``group_cols`` and ``static_cols`` are added as static covariates to the resulting TimeSeries objects. These can be viewed with `my_series.static_covariates`. Different to `group_cols`, - `static_cols` only adds the static values without using the to extract the TimeSeries groups. + `static_cols` only adds the static values but are not used to extract the TimeSeries groups. Parameters ---------- @@ -620,7 +620,7 @@ def from_group_dataframe( Returns ------- TimeSeries - A univariate or multivariate deterministic TimeSeries constructed from the inputs. + A list containing a univariate or multivariate deterministic TimeSeries per group in the DataFrame. """ group_cols = [group_cols] if not isinstance(group_cols, list) else group_cols if static_cols is not None: @@ -2299,6 +2299,8 @@ def with_static_covariates( self, covariates: Optional[Union[pd.Series, pd.DataFrame]] ): """Returns a new TimeSeries object with added static covariates. + + Static covariates contain data attached to the time series, but which are not varying with time. 
Parameters ---------- From d001e173c1e0b5274da3ae5c07d33cdae38ee851 Mon Sep 17 00:00:00 2001 From: dennisbader Date: Sat, 4 Jun 2022 17:08:55 +0200 Subject: [PATCH 26/26] fix black issue from PR suggestion --- darts/timeseries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/darts/timeseries.py b/darts/timeseries.py index bc79de503b..8e1afad7c7 100644 --- a/darts/timeseries.py +++ b/darts/timeseries.py @@ -2299,7 +2299,7 @@ def with_static_covariates( self, covariates: Optional[Union[pd.Series, pd.DataFrame]] ): """Returns a new TimeSeries object with added static covariates. - + Static covariates contain data attached to the time series, but which are not varying with time. Parameters