Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/sample weights #2404

Merged
merged 42 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
35ac768
Adding building blocks of weight samples
Dec 19, 2023
5668c2e
Adding exponential decay logic
Jan 4, 2024
4ed8058
Linter
Jan 4, 2024
7d46079
Linter flake
Jan 4, 2024
b669439
Linter flake 2
Jan 4, 2024
2bdb71e
Linter isort
Jan 4, 2024
2a33bbb
Adding Timeseries support
Feb 12, 2024
f33a429
Adding first test for equal weights
Feb 19, 2024
48126e6
Adding first round of tests
Mar 4, 2024
a0a61fb
merge
Mar 4, 2024
75a41be
working session with le M
Apr 11, 2024
a916a1f
Adding other tests
Apr 15, 2024
a2a9d9b
Resolve linter issues
Apr 24, 2024
605172e
Resolve flake
Apr 30, 2024
dee5df9
Merge branch 'master' into master
madtoinou May 1, 2024
b17ec26
Merge branch 'master' into master
AntonRagot May 9, 2024
c8a287b
Resolving conflicts
May 10, 2024
8b4d006
Conflicts again
May 10, 2024
5639bff
Removing conflict mistake
May 10, 2024
c3c61d8
fixing some tests
May 10, 2024
2ffc577
fixing catboost tests
May 10, 2024
c6395e8
Merge branch 'master' into master
AntonRagot May 15, 2024
fb0be36
Merge branch 'master' into feat/sample_weights
dennisbader Jun 5, 2024
849099d
fix tests from new val set logic
dennisbader Jun 5, 2024
c272064
some cleaning up of unused functions
dennisbader Jun 5, 2024
8d49c58
correct sample weight options in docs
dennisbader Jun 5, 2024
172514b
make simple sample weights work with fit
dennisbader Jun 5, 2024
41191bd
integrate sample weights into lagged data creation
dennisbader Jun 6, 2024
c5754fc
added support for multi horizon per time step weights
dennisbader Jun 6, 2024
08c5c97
add lgbm catboost to tests
dennisbader Jun 6, 2024
e41c7e4
remove unused tests
dennisbader Jun 6, 2024
8d95774
add tabularization tests
dennisbader Jun 6, 2024
a4ca131
remove unused test
dennisbader Jun 6, 2024
f3cd56a
update docs
dennisbader Jun 6, 2024
daab756
update regression model tests
dennisbader Jun 6, 2024
67ae30e
support val set weights
dennisbader Jun 7, 2024
be92763
use correct static covariates shape in lagged data creation
dennisbader Jun 7, 2024
974e376
update docs
dennisbader Jun 7, 2024
ae06f47
update changelog
dennisbader Jun 7, 2024
4897a18
update docstrings
dennisbader Jun 17, 2024
06cdb39
update changelog
dennisbader Jun 17, 2024
0b9b9cc
Merge branch 'master' into feat/sample_weights
dennisbader Jun 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Adding first round of tests
  • Loading branch information
Anton Ragot committed Mar 4, 2024
commit 48126e646a65eabb3c231ca021c39323feebe3c3
60 changes: 59 additions & 1 deletion darts/tests/models/forecasting/test_regression_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1487,7 +1487,7 @@ def test_multiple_ts(self, mode):
),
],
)
def test_correct_generated_weights(self, config):
def test_correct_generated_weights_equal(self, config):
model, training_size = config
train_y = self.sine_univariate1[:training_size]
_, _, weights = model._create_lagged_data(
Expand All @@ -1503,6 +1503,64 @@ def test_correct_generated_weights(self, config):
assert len(weights) == weights_size
assert (weights == [1] * weights_size).all()

@pytest.mark.parametrize(
    "config",
    [
        (RegressionModel(lags=4), 10),
        (RegressionModel(lags=8, model=LinearRegression()), 20),
        (RegressionModel(lags=16, model=RandomForestRegressor()), 50),
        (
            RegressionModel(lags=2, model=HistGradientBoostingRegressor()),
            100,
        ),
    ],
)
def test_correct_generated_weights_linear(self, config):
    """Check the weights generated for `sample_weight="linear_decay"`.

    The expected weights are the last `weights_size` points of an evenly
    spaced grid of `weights_size + 1` points on [0, 1]; i.e. they increase
    linearly, end at 1 and never contain 0.
    """
    model, training_size = config
    # one weight is expected per training point left after removing the target lags
    weights_size = training_size - len(model.lags["target"])

    # drop the leading 0 of the grid so that every sample keeps a non-zero weight
    expected_weights = np.linspace(0, 1, weights_size + 1)[1:]

    train_y = self.sine_univariate1[:training_size]
    _, _, weights = model._create_lagged_data(
        target_series=train_y,
        past_covariates=None,
        future_covariates=None,
        max_samples_per_ts=None,
        sample_weight="linear_decay",
    )

    assert len(weights) == weights_size
    # compare floats with a tolerance (as done in the exponential decay test)
    # instead of relying on bit-exact equality of two float computations
    np.testing.assert_array_almost_equal(weights, expected_weights)

@pytest.mark.parametrize(
    "config",
    [
        (RegressionModel(lags=4), 10, 10),
        (RegressionModel(lags=8, model=LinearRegression()), 20, 10),
        (RegressionModel(lags=16, model=RandomForestRegressor()), 50, 10),
        (RegressionModel(lags=2, model=HistGradientBoostingRegressor()), 100, 10),
    ],
)
def test_correct_generated_weights_exponential(self, config):
    """Check the weights generated for `sample_weight="exponential_decay"`.

    The reference weights are `exp(-decay_rate * (1 - t))` with `t` evenly
    spaced on [0, 1], so the last weight equals 1 and earlier weights are
    smaller.
    """
    model, training_size, decay_rate = config
    n_expected = training_size - len(model.lags["target"])

    series = self.sine_univariate1[:training_size]
    lagged_data = model._create_lagged_data(
        target_series=series,
        past_covariates=None,
        future_covariates=None,
        max_samples_per_ts=None,
        sample_weight="exponential_decay",
    )
    # only the third returned element (the sample weights) is of interest here
    weights = lagged_data[2]

    # reference values: distance to the end of the series, mapped through exp
    distance_to_end = 1 - np.linspace(0, 1, n_expected)
    reference = np.exp(-decay_rate * distance_to_end)

    assert len(weights) == n_expected
    np.testing.assert_array_almost_equal(weights, reference)

@pytest.mark.parametrize("mode", [True, False])
def test_only_future_covariates(self, mode):
model = RegressionModel(lags_future_covariates=[-2], multi_models=mode)
Expand Down
6 changes: 3 additions & 3 deletions darts/utils/data/tabularization.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from darts.utils.timeseries_generation import (
constant_timeseries,
exponential_timeseries,
linear_timeseries,
non_zero_linear_timeseries,
)
from darts.utils.utils import get_single_series, series2seq

Expand Down Expand Up @@ -336,13 +336,13 @@ def create_lagged_data(
1, start=times_i[0], end=times_i[-1], freq=times_i.freq
).values()
elif sample_weight == "linear_decay":
weights = linear_timeseries(
weights = non_zero_linear_timeseries(
start=times_i[0], end=times_i[-1], freq=times_i.freq
).values()
elif sample_weight == "exponential_decay":
weights = exponential_timeseries(
start=times_i[0], end=times_i[-1], freq=times_i.freq
).values()
).values()[::-1]
elif isinstance(sample_weight, TimeSeries):
weights = sample_weight.values()
else:
Expand Down
56 changes: 56 additions & 0 deletions darts/utils/timeseries_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,62 @@ def autoregressive_timeseries(
)


def non_zero_linear_timeseries(
    start_value: float = 0,
    end_value: float = 1,
    start: Optional[Union[pd.Timestamp, int]] = pd.Timestamp("2000-01-01"),
    end: Optional[Union[pd.Timestamp, int]] = None,
    length: Optional[int] = None,
    freq: Optional[Union[str, int]] = None,
    column_name: Optional[str] = "linear",
    dtype: np.dtype = np.float64,
) -> TimeSeries:
    """
    Creates a univariate TimeSeries whose values increase linearly up to `end_value`, **excluding**
    `start_value` itself. The values are the last `length` points of an evenly spaced grid of `length + 1`
    points between `start_value` and `end_value`. This means that the first entry is equal to
    `start_value` + (`end_value` - `start_value`) / `length`, the last entry is equal to `end_value`, and
    the difference between two adjacent entries is equal to (`end_value` - `start_value`) / `length`.

    With the default `start_value=0` and `end_value=1`, all values lie in (0, 1], so no entry is zero.

    Parameters
    ----------
    start_value
        The lower bound of the value grid. It is *not* part of the returned values; the first entry lies
        one step above it.
    end_value
        The value of the last entry in the TimeSeries.
    start
        The start of the returned TimeSeries' index. If a pandas Timestamp is passed, the TimeSeries will have a pandas
        DatetimeIndex. If an integer is passed, the TimeSeries will have a pandas RangeIndex index. Works only with
        either `length` or `end`.
    end
        Optionally, the end of the returned index. Works only with either `start` or `length`. If `start` is
        set, `end` must be of same type as `start`. Else, it can be either a pandas Timestamp or an integer.
    length
        Optionally, the length of the returned index. Works only with either `start` or `end`.
    freq
        The time difference between two adjacent entries in the returned index. In case `start` is a timestamp,
        a DateOffset alias is expected; see
        `docs <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`_.
        By default, "D" (daily) is used.
        If `start` is an integer, `freq` will be interpreted as the step size in the underlying RangeIndex.
        The freq is optional for generating an integer index (if not specified, 1 is used).
    column_name
        Optionally, the name of the value column for the returned TimeSeries
    dtype
        The desired NumPy dtype (np.float32 or np.float64) for the resulting series

    Returns
    -------
    TimeSeries
        A linear TimeSeries created as indicated above.
    """

    index = generate_index(start=start, end=end, freq=freq, length=length)
    # build one extra grid point and drop the first one (`start_value`), so that the
    # series has exactly `len(index)` values and ends at `end_value`
    values = np.linspace(start_value, end_value, len(index) + 1, dtype=dtype)[1:]
    return TimeSeries.from_times_and_values(
        index, values, freq=freq, columns=pd.Index([column_name])
    )


def _extend_time_index_until(
time_index: Union[pd.DatetimeIndex, pd.RangeIndex],
until: Optional[Union[int, str, pd.Timestamp]],
Expand Down