[Algorithm] CrossQ #2033

Merged: 49 commits, merged Jul 10, 2024
Changes from 1 commit (of 49)
0a23ae8
add crossQ examples
BY571 Mar 20, 2024
9bdee71
add loss
BY571 Mar 20, 2024
570a20e
Update naming experiment
BY571 Mar 21, 2024
5086249
update
BY571 Mar 21, 2024
c3a927f
update add tests
BY571 Mar 21, 2024
d1c9c34
detach
BY571 Mar 21, 2024
e879b7c
update tests
BY571 Mar 21, 2024
75255e7
update run_test.sh
BY571 Mar 21, 2024
a7b79c3
move crossq to sota-implementations
BY571 Mar 21, 2024
be84f3f
update loss
BY571 Mar 26, 2024
2170ad8
update cat prediction
BY571 Mar 26, 2024
75d4cee
Merge branch 'main' into crossQ
vmoens Jun 12, 2024
7711a4e
Merge branch 'main' into crossQ
BY571 Jun 26, 2024
f0ac167
add batchrenorm to crossq
BY571 Jun 26, 2024
37abb14
Merge branch 'crossQ' of github.com:BY571/rl into crossQ
BY571 Jun 26, 2024
bc7675a
small fixes
BY571 Jun 26, 2024
9543f2e
update docs and sota checks
BY571 Jun 26, 2024
53e35f7
hyperparam fix
BY571 Jun 26, 2024
172e1c0
test
BY571 Jun 27, 2024
fdb7e8b
update batch norm tests
BY571 Jun 27, 2024
5501d43
tests
BY571 Jul 3, 2024
c47ac84
cleanup
BY571 Jul 5, 2024
e718c3f
Merge branch 'main' into crossQ
BY571 Jul 5, 2024
f94165e
update
BY571 Jul 7, 2024
02c94ff
update lr param
BY571 Jul 8, 2024
93b6a7b
Merge branch 'crossQ' of https://github.com/BY571/rl into crossQ
BY571 Jul 8, 2024
4b914e6
Apply suggestions from code review
vmoens Jul 8, 2024
af8c64a
Merge remote-tracking branch 'origin/main' into crossQ
vmoens Jul 8, 2024
845c8a9
Merge branch 'crossQ' of https://github.com/BY571/rl into crossQ
vmoens Jul 8, 2024
7b4a69d
set qnet eval in actor loss
BY571 Jul 8, 2024
77de044
Merge branch 'crossQ' of https://github.com/BY571/rl into crossQ
BY571 Jul 8, 2024
35c7a98
take off comment
BY571 Jul 8, 2024
68a1a9f
amend
vmoens Jul 8, 2024
c04eb3b
Merge branch 'crossQ' of https://github.com/BY571/rl into crossQ
vmoens Jul 8, 2024
12672ee
Merge remote-tracking branch 'origin/main' into crossQ
vmoens Jul 8, 2024
7fbb27d
amend
vmoens Jul 8, 2024
ff80481
amend
vmoens Jul 8, 2024
caf702e
amend
vmoens Jul 8, 2024
70e2882
amend
vmoens Jul 8, 2024
ccd1b7f
amend
vmoens Jul 8, 2024
d3c8b0e
Merge remote-tracking branch 'origin/main' into crossQ
vmoens Jul 9, 2024
d3e0bb1
Apply suggestions from code review
vmoens Jul 9, 2024
349cb28
amend
vmoens Jul 9, 2024
75a43e7
amend
vmoens Jul 9, 2024
abada6c
fix device error
BY571 Jul 9, 2024
c878b81
Update objective delay actor
BY571 Jul 9, 2024
f222b11
Update tests not expecting target update
BY571 Jul 9, 2024
067b560
update example utils
BY571 Jul 9, 2024
c010e39
amend
vmoens Jul 9, 2024
update loss
BY571 committed Mar 26, 2024
commit be84f3fcba545541cffa91a5997c71b7e92985c5
114 changes: 35 additions & 79 deletions torchrl/objectives/crossq.py
@@ -519,7 +519,7 @@ def _cached_detached_qvalue_params(self):
@_cache_values
def _cached_qvalue_params(self):
return torch.cat(
[self.qvalue_network_params, self.qvalue_network_params.detach()], 0
[self.qvalue_network_params, self.qvalue_network_params], 0 # .detach()
)

def _actor_loss(
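The parameter concatenation in the hunk above (stacking the Q-value parameters with a second copy of themselves) appears intended to let one vmapped forward pass evaluate two input batches at once, which is also what the commented-out TODO further down in this diff gestures at. A minimal sketch of that mechanic under stated assumptions: a toy two-member ensemble built with torch.func; the layer sizes, helper names and 32-sample batch are illustrative, not torchrl internals.

import torch
from torch import nn
from torch.func import stack_module_state, functional_call, vmap

# Toy two-member Q ensemble (illustrative only).
qnets = [nn.Linear(6, 1) for _ in range(2)]
params, buffers = stack_module_state(qnets)  # every tensor gains a leading ensemble dim of 2

def q_forward(p, b, x):
    # Functional forward of one ensemble member with externally supplied parameters.
    return functional_call(qnets[0], (p, b), (x,))

obs_act = torch.randn(32, 6)       # current (s, a) inputs
next_obs_act = torch.randn(32, 6)  # next (s', a') inputs

# Duplicate the stacked parameters and concatenate the two input batches so a single
# vmapped call computes Q(s, a) and Q(s', a') for both ensemble members in one pass.
double_params = {k: torch.cat([v, v], 0) for k, v in params.items()}
double_buffers = {k: torch.cat([v, v], 0) for k, v in buffers.items()}
inputs = torch.cat([obs_act.expand(2, 32, 6), next_obs_act.expand(2, 32, 6)], 0)

q_all = vmap(q_forward)(double_params, double_buffers, inputs)  # shape [4, 32, 1]
q_current, q_next = q_all.split(2, dim=0)

In the CrossQ setting the payoff of such a joint pass is that normalization layers see current and next observations in the same batch, which is presumably why later commits in this PR ("update cat prediction", "add batchrenorm to crossq") build on it.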
@@ -549,76 +549,22 @@ def _actor_loss(

return self._alpha * log_prob - min_q_logprob, {"log_prob": log_prob.detach()}

def _compute_target(self, tensordict) -> Tensor:
r"""Value network for CrossQ.

CrossQ is based on a value estimate of the form:

.. math::

V = Q(s,a) - \alpha * \log p(a | s)

This class computes this value given the actor and qvalue network
def _qvalue_loss(
self, tensordict: TensorDictBase
) -> Tuple[Tensor, Dict[str, Tensor]]:

"""
tensordict = tensordict.clone(False)
# get actions and log-probs
# # compute next action
with torch.no_grad():
with set_exploration_type(
ExplorationType.RANDOM
), self.actor_network_params.to_module(self.actor_network):
next_tensordict = tensordict.get("next").clone(False)
next_dist = self.actor_network.get_dist(next_tensordict)
next_action = next_dist.rsample()
next_action = next_dist.sample()
next_tensordict.set(self.tensor_keys.action, next_action)
next_sample_log_prob = next_dist.log_prob(next_action)

# get q-values
next_tensordict_expand = self._vmap_qnetworkN0(
next_tensordict, self.qvalue_network_params
)
state_action_value = next_tensordict_expand.get(
self.tensor_keys.state_action_value
)
if (
state_action_value.shape[-len(next_sample_log_prob.shape) :]
!= next_sample_log_prob.shape
):
next_sample_log_prob = next_sample_log_prob.unsqueeze(-1)
next_state_value = state_action_value - self._alpha * next_sample_log_prob
next_state_value = next_state_value.min(0)[0]
tensordict.set(
("next", self.value_estimator.tensor_keys.value), next_state_value
)
target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1)
return target_value.detach()

def _qvalue_loss(
self, tensordict: TensorDictBase
) -> Tuple[Tensor, Dict[str, Tensor]]:
# we pass the alpha value to the tensordict. Since it's a scalar, we must erase the batch-size first.

target_value = self._compute_target(tensordict)
tensordict_expand = self._vmap_qnetworkN0(
tensordict.select(*self.qvalue_network.in_keys, strict=False),
self.qvalue_network_params,
)
pred_val = tensordict_expand.get(self.tensor_keys.state_action_value).squeeze(
-1
)

# ############################
# # compute next action
# with torch.no_grad():
# with set_exploration_type(
# ExplorationType.MODE
# ), self.actor_network_params.to_module(self.actor_network):
# next_tensordict = tensordict.get("next").clone(False)
# next_dist = self.actor_network.get_dist(next_tensordict)
# next_action = next_dist.loc #.rsample()
# next_tensordict.set(self.tensor_keys.action, next_action)
# next_sample_log_prob = next_dist.log_prob(next_action)

# TODO: we should pass them together to the qvalue network
# q_values_tensordict = torch.cat(
# [
# tensordict.select(*self.qvalue_network.in_keys, strict=False).expand(
@@ -645,24 +591,34 @@ def _qvalue_loss(
# ],
# dim=0,
# )
# # compute target value
# next_state_action_value = next_state_action_value.detach()
# if (
# next_state_action_value.shape[-len(next_sample_log_prob.shape) :]
# != next_sample_log_prob.shape
# ):
# next_sample_log_prob = next_sample_log_prob.unsqueeze(-1)
# next_state_action_value = (
# next_state_action_value - self._alpha * next_sample_log_prob
# )
# next_state_action_value = next_state_action_value.min(0)[0]
# tensordict.set(
# ("next", self.value_estimator.tensor_keys.value), next_state_action_value
# )
# target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1).detach()
# # get current q-values
# pred_val = current_state_action_value.squeeze(-1)
# ###########################

next_state_action_value = self._vmap_qnetworkN0(
next_tensordict.select(*self.qvalue_network.in_keys, strict=False),
self.qvalue_network_params,
).get(self.tensor_keys.state_action_value)

current_state_action_value = self._vmap_qnetworkN0(
tensordict.select(*self.qvalue_network.in_keys, strict=False),
self.qvalue_network_params,
).get(self.tensor_keys.state_action_value)

# compute target value
if (
next_state_action_value.shape[-len(next_sample_log_prob.shape) :]
!= next_sample_log_prob.shape
):
next_sample_log_prob = next_sample_log_prob.unsqueeze(-1)
next_state_action_value = next_state_action_value.min(0)[0]
next_state_action_value = (
next_state_action_value - self._alpha * next_sample_log_prob
).detach()
tensordict.set(
("next", self.value_estimator.tensor_keys.value), next_state_action_value
)
target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1)
# get current q-values
pred_val = current_state_action_value.squeeze(-1)

# compute loss
td_error = abs(pred_val - target_value)
loss_qval = distance_loss(
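Taken together, the new _qvalue_loss above samples a next action from the current policy without gradients, evaluates next and current state-action values with the same (non-target) Q parameters, takes the minimum over the ensemble, subtracts the entropy term, and detaches only the target side before the distance loss. A stripped-down sketch of that computation follows, with hypothetical names (actor, q_ensemble, alpha, gamma) standing in for torchrl's tensordict, vmap and value-estimator machinery, and a plain one-step TD target in place of the value estimator.

import torch

def crossq_qvalue_loss(actor, q_ensemble, obs, act, next_obs, reward, done,
                       alpha=0.2, gamma=0.99):
    # Sample the next action and its log-probability from the current policy (no grad).
    with torch.no_grad():
        dist = actor(next_obs)  # assumed to return a torch.distributions.Distribution
        next_act = dist.sample()
        next_log_prob = dist.log_prob(next_act).sum(-1, keepdim=True)
        # Next-state values from the *same* Q parameters (no target network):
        # minimum over the ensemble, entropy-regularised, fixed by no_grad.
        next_q = torch.stack([q(next_obs, next_act) for q in q_ensemble]).min(0).values
        target = reward + gamma * (1.0 - done) * (next_q - alpha * next_log_prob)
    # Current Q predictions keep their gradients; L2 distance to the fixed target.
    pred = torch.stack([q(obs, act) for q in q_ensemble])
    return ((pred - target) ** 2).mean()

The commented-out block and the TODO in the hunk above point at the CrossQ-specific refinement adopted later in this PR: concatenating current and next transitions into a single batch before the Q forward pass, so that batch(re)norm statistics are shared between the two and no target network or Polyak update is needed (compare the later commit "Update tests not expecting target update").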