diff --git a/cleanrl/atari/.python-version b/cleanrl/atari/.python-version deleted file mode 100644 index 5bc3dc8b9..000000000 --- a/cleanrl/atari/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.9.5/envs/atari \ No newline at end of file diff --git a/cleanrl/atari/dqn_atari.py b/cleanrl/atari/dqn_atari.py deleted file mode 100644 index a96222848..000000000 --- a/cleanrl/atari/dqn_atari.py +++ /dev/null @@ -1,243 +0,0 @@ -# Reference: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf - -import argparse -import os -import random -import time -from distutils.util import strtobool - -import gym -import numpy as np -import torch -import torch.nn as nn -import torch.optim as optim -from gym.spaces import Discrete -from gym.wrappers import Monitor -from stable_baselines3.common.atari_wrappers import ( - ClipRewardEnv, - EpisodicLifeEnv, - FireResetEnv, - MaxAndSkipEnv, - NoopResetEnv, - WarpFrame, -) -from stable_baselines3.common.buffers import ReplayBuffer -from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack -from torch.utils.tensorboard import SummaryWriter - - -def parse_args(): - # Common arguments - # fmt: off - parser = argparse.ArgumentParser(description='PPO agent') - parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), - help='the name of this experiment') - parser.add_argument('--gym-id', type=str, default="BreakoutNoFrameskip-v4", - help='the id of the gym environment') - parser.add_argument('--learning-rate', type=float, default=1e-4, - help='the learning rate of the optimizer') - parser.add_argument('--seed', type=int, default=1, - help='seed of the experiment') - parser.add_argument('--total-timesteps', type=int, default=10000000, - help='total timesteps of the experiments') - parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, `torch.backends.cudnn.deterministic=False`') - parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, cuda will not be enabled by default') - parser.add_argument('--track', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='run the script in production mode and use wandb to log outputs') - parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='weather to capture videos of the agent performances (check out `videos` folder)') - parser.add_argument('--wandb-project-name', type=str, default="cleanRL", - help="the wandb's project name") - parser.add_argument('--wandb-entity', type=str, default=None, - help="the entity (team) of wandb's project") - - # Algorithm specific arguments - parser.add_argument('--buffer-size', type=int, default=1000000, - help='the replay memory buffer size') - parser.add_argument('--gamma', type=float, default=0.99, - help='the discount factor gamma') - parser.add_argument('--target-network-frequency', type=int, default=1000, - help="the timesteps it takes to update the target network") - parser.add_argument('--max-grad-norm', type=float, default=0.5, - help='the maximum norm for the gradient clipping') - parser.add_argument('--batch-size', type=int, default=32, - help="the batch size of sample from the reply memory") - parser.add_argument('--start-e', type=float, default=1., - help="the starting epsilon for exploration") - parser.add_argument('--end-e', type=float, default=0.02, - help="the ending epsilon for exploration") - 
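A note on the flag pattern that recurs throughout these removed scripts: `type=lambda x: bool(strtobool(x)), nargs='?', const=True` lets a boolean option act as a bare switch while still accepting an explicit value. A minimal standalone sketch (not part of the diff; `strtobool` lives in `distutils.util` on the Python 3.9 these scripts pin):

```python
import argparse
from distutils.util import strtobool

parser = argparse.ArgumentParser()
# `nargs='?'` makes the value optional: `const=True` is used when the flag is
# passed with no value, `default=False` when the flag is absent entirely.
parser.add_argument('--track', type=lambda x: bool(strtobool(x)),
                    default=False, nargs='?', const=True)

print(parser.parse_args([]).track)                    # False (default)
print(parser.parse_args(['--track']).track)           # True  (const)
print(parser.parse_args(['--track', 'false']).track)  # False (parsed by strtobool)
```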
parser.add_argument('--exploration-fraction', type=float, default=0.10, - help="the fraction of `total-timesteps` it takes from start-e to go end-e") - parser.add_argument('--learning-starts', type=int, default=80000, - help="timestep to start learning") - parser.add_argument('--train-frequency', type=int, default=4, - help="the frequency of training") - args = parser.parse_args() - if not args.seed: - args.seed = int(time.time()) - # fmt: on - return args - - -def make_env(gym_id, seed, idx): - def thunk(): - env = gym.make(gym_id) - env = NoopResetEnv(env, noop_max=30) - env = MaxAndSkipEnv(env, skip=4) - env = gym.wrappers.RecordEpisodeStatistics(env) - if args.capture_video: - if idx == 0: - env = Monitor(env, f"videos/{experiment_name}") - env = EpisodicLifeEnv(env) - if "FIRE" in env.unwrapped.get_action_meanings(): - env = FireResetEnv(env) - env = WarpFrame(env, width=84, height=84) - env = ClipRewardEnv(env) - env.seed(seed) - env.action_space.seed(seed) - env.observation_space.seed(seed) - return env - - return thunk - - -class Linear0(nn.Linear): - def reset_parameters(self): - nn.init.constant_(self.weight, 0.0) - if self.bias is not None: - nn.init.constant_(self.bias, 0.0) - - -class Scale(nn.Module): - def __init__(self, scale): - super().__init__() - self.scale = scale - - def forward(self, x): - return x * self.scale - - -class QNetwork(nn.Module): - def __init__(self, env, frames=4): - super(QNetwork, self).__init__() - self.network = nn.Sequential( - Scale(1 / 255), - nn.Conv2d(frames, 32, 8, stride=4), - nn.ReLU(), - nn.Conv2d(32, 64, 4, stride=2), - nn.ReLU(), - nn.Conv2d(64, 64, 3, stride=1), - nn.ReLU(), - nn.Flatten(), - nn.Linear(3136, 512), - nn.ReLU(), - Linear0(512, env.action_space.n), - ) - - def forward(self, x): - return self.network(x.permute((0, 3, 1, 2))) - - -def linear_schedule(start_e: float, end_e: float, duration: int, t: int): - slope = (end_e - start_e) / duration - return max(slope * t + start_e, end_e) - - -if __name__ == "__main__": - args = parse_args() - experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - writer = SummaryWriter(f"runs/{experiment_name}") - writer.add_text( - "hyperparameters", "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])) - ) - if args.track: - import wandb - - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args), - name=experiment_name, - monitor_gym=True, - save_code=True, - ) - writer = SummaryWriter(f"/tmp/{experiment_name}") - - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - envs = VecFrameStack( - DummyVecEnv([make_env(args.gym_id, args.seed, 0)]), - 4, - ) - assert isinstance(envs.action_space, Discrete), "only discrete action space is supported" - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - # ALGO LOGIC: initialize agent here: - rb = ReplayBuffer(args.buffer_size, envs.observation_space, envs.action_space, device=device, optimize_memory_usage=True) - q_network = QNetwork(envs).to(device) - target_network = QNetwork(envs).to(device) - target_network.load_state_dict(q_network.state_dict()) - optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate) - loss_fn = nn.MSELoss() - - # TRY NOT TO MODIFY: start the game - obs = envs.reset() - for global_step in range(args.total_timesteps): 
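For reference, the exploration schedule defined in the removed dqn_atari.py above, restated as a self-contained sketch with a few sample values (hyperparameters match the script's defaults of `start_e=1.0`, `end_e=0.02`, `exploration_fraction=0.10`):

```python
def linear_schedule(start_e: float, end_e: float, duration: int, t: int) -> float:
    # Decay epsilon linearly from start_e to end_e over `duration` steps, then hold at end_e.
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)

total_timesteps, exploration_fraction = 10_000_000, 0.10
duration = int(exploration_fraction * total_timesteps)
for t in (0, duration // 2, duration, total_timesteps):
    print(t, linear_schedule(1.0, 0.02, duration, t))
# epsilon goes 1.0 -> 0.51 -> 0.02 and stays at 0.02 for the rest of training
```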
- epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step) - if random.random() < epsilon: - actions = [envs.action_space.sample()] - else: - logits = q_network.forward(torch.Tensor(obs).to(device)) - actions = torch.argmax(logits, dim=1).cpu().numpy() - - next_obs, rewards, dones, infos = envs.step(actions) - - # TRY NOT TO MODIFY: record rewards for plotting purposes - for info in infos: - if "episode" in info.keys(): - print(f"global_step={global_step}, episode_reward={info['episode']['r']}") - writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) - writer.add_scalar("charts/epsilon", epsilon, global_step) - break - - # TRY NOT TO MODIFY: save data to reply buffer; handle `terminal_observation` - real_next_obs = next_obs.copy() - for idx, d in enumerate(dones): - if d: - real_next_obs[idx] = infos[idx]["terminal_observation"] - rb.add(obs, real_next_obs, actions, rewards, dones) - - # TRY NOT TO MODIFY: CRUCIAL step easy to overlook - obs = next_obs - - # ALGO LOGIC: training. - if global_step > args.learning_starts and global_step % args.train_frequency == 0: - data = rb.sample(args.batch_size) - with torch.no_grad(): - target_max, _ = target_network.forward(data.next_observations).max(dim=1) - td_target = data.rewards.flatten() + args.gamma * target_max * (1 - data.dones.flatten()) - old_val = q_network.forward(data.observations).gather(1, data.actions).squeeze() - loss = loss_fn(td_target, old_val) - - if global_step % 100 == 0: - writer.add_scalar("losses/td_loss", loss, global_step) - - # optimize the midel - optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(list(q_network.parameters()), args.max_grad_norm) - optimizer.step() - - # update the target network - if global_step % args.target_network_frequency == 0: - target_network.load_state_dict(q_network.state_dict()) - - envs.close() - writer.close() diff --git a/cleanrl/atari/readme.md b/cleanrl/atari/readme.md deleted file mode 100644 index 3cce33459..000000000 --- a/cleanrl/atari/readme.md +++ /dev/null @@ -1,14 +0,0 @@ -``` -cd cleanrl/atari -pyenv install -s $(sed "s/\/envs.*//" .python-version) -pyenv virtualenv $(sed "s/\/envs\// /" .python-version) -poetry install - -isort . --skip wandb -autoflake -r --exclude wandb --in-place --remove-unused-variables --remove-all-unused-imports . -black -l 127 --exclude wandb . - -pyenv shell $(cat .python-version)-prod - -python dqn_atari_cpprb.py --track -``` \ No newline at end of file diff --git a/cleanrl/brax/.python-version b/cleanrl/brax/.python-version deleted file mode 100644 index bd44e2e98..000000000 --- a/cleanrl/brax/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.9.5/envs/brax \ No newline at end of file diff --git a/cleanrl/brax/brax_test.py b/cleanrl/brax/brax_test.py deleted file mode 100644 index 9fc2bfcae..000000000 --- a/cleanrl/brax/brax_test.py +++ /dev/null @@ -1,44 +0,0 @@ -import os -import time -from functools import partial - -import gym -import numpy as np -import torch - -from brax.envs.to_torch import JaxToTorchWrapper -from brax.envs import _envs, create_gym_env - -if 'COLAB_TPU_ADDR' in os.environ: - from jax.tools import colab_tpu - colab_tpu.setup_tpu() - -CUDA_AVAILABLE = torch.cuda.is_available() -if CUDA_AVAILABLE: - # BUG: (@lebrice): Getting a weird "CUDA error: out of memory" RuntimeError - # during JIT, which can be "fixed" by first creating a dummy cuda tensor! 
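Stepping back to the DQN script deleted above: its training step regresses Q(s, a) toward a one-step TD target computed with the frozen target network. A standalone restatement of that step, assuming `data` is a stable-baselines3 `ReplayBuffer` sample batch with the field names used in the script:

```python
import torch
import torch.nn.functional as F

def dqn_td_loss(q_network, target_network, data, gamma=0.99):
    # data: observations, actions, next_observations, rewards, dones (SB3 naming).
    with torch.no_grad():
        # Bootstrap with the target network's greedy value; mask it out on terminal steps.
        target_max, _ = target_network(data.next_observations).max(dim=1)
        td_target = data.rewards.flatten() + gamma * target_max * (1 - data.dones.flatten())
    # Q-value of the action actually taken, regressed toward the TD target.
    current_q = q_network(data.observations).gather(1, data.actions).squeeze()
    return F.mse_loss(current_q, td_target)
```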
- v = torch.ones(1, device="cuda") -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -for env_name, env_class in _envs.items(): - env_id = f"brax_{env_name}-v0" - entry_point = partial(create_gym_env, env_name=env_name) - if env_id not in gym.envs.registry.env_specs: - print(f"Registring brax's '{env_name}' env under id '{env_id}'.") - gym.register(env_id, entry_point=entry_point) - - -total_timesteps = 0 -num_envs = 2048 -env = gym.make("brax_halfcheetah-v0", batch_size=num_envs) -env = JaxToTorchWrapper(env) -obs = env.reset() -start_time = time.time() -for _ in range(10000): - total_timesteps += num_envs - env.step(env.action_space.sample()) - - -print(f"\nDevice used: {device}") -print(f"Number of parallel environments: {num_envs}") -print(f"FPS: {total_timesteps / (time.time()-start_time)}") diff --git a/cleanrl/brax/gymapi.py b/cleanrl/brax/gymapi.py deleted file mode 100644 index 8df161b76..000000000 --- a/cleanrl/brax/gymapi.py +++ /dev/null @@ -1,325 +0,0 @@ -import argparse -import os -import random -import time -from distutils.util import strtobool - -import gym -import numpy as np -import torch -import torch.nn as nn -import torch.optim as optim -from gym.spaces import Discrete -from gym.wrappers import Monitor -from gym.vector import SyncVectorEnv -from torch.distributions.categorical import Categorical -from torch.utils.tensorboard import SummaryWriter - - -def parse_args(): - # fmt: off - parser = argparse.ArgumentParser(description='PPO agent') - parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), - help='the name of this experiment') - parser.add_argument('--gym-id', type=str, default="CartPole-v1", - help='the id of the gym environment') - parser.add_argument('--learning-rate', type=float, default=2.5e-4, - help='the learning rate of the optimizer') - parser.add_argument('--seed', type=int, default=1, - help='seed of the experiment') - parser.add_argument('--total-timesteps', type=int, default=25000, - help='total timesteps of the experiments') - parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, `torch.backends.cudnn.deterministic=False`') - parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, cuda will not be enabled by default') - parser.add_argument('--track', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='run the script in production mode and use wandb to log outputs') - parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='weather to capture videos of the agent performances (check out `videos` folder)') - parser.add_argument('--wandb-project-name', type=str, default="cleanRL", - help="the wandb's project name") - parser.add_argument('--wandb-entity', type=str, default=None, - help="the entity (team) of wandb's project") - - # Algorithm specific arguments - parser.add_argument('--n-minibatch', type=int, default=4, - help='the number of mini batch') - parser.add_argument('--num-envs', type=int, default=4, - help='the number of parallel game environment') - parser.add_argument('--num-steps', type=int, default=128, - help='the number of steps per game environment') - parser.add_argument('--gamma', type=float, default=0.99, - help='the discount factor gamma') - parser.add_argument('--gae-lambda', type=float, default=0.95, - help='the lambda for the general 
advantage estimation') - parser.add_argument('--ent-coef', type=float, default=0.01, - help="coefficient of the entropy") - parser.add_argument('--vf-coef', type=float, default=0.5, - help="coefficient of the value function") - parser.add_argument('--max-grad-norm', type=float, default=0.5, - help='the maximum norm for the gradient clipping') - parser.add_argument('--clip-coef', type=float, default=0.2, - help="the surrogate clipping coefficient") - parser.add_argument('--update-epochs', type=int, default=4, - help="the K epochs to update the policy") - parser.add_argument('--kle-stop', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='If toggled, the policy updates will be early stopped w.r.t target-kl') - parser.add_argument('--target-kl', type=float, default=0.03, - help='the target-kl variable that is referred by --kl') - parser.add_argument('--gae', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Use GAE for advantage computation') - parser.add_argument('--norm-adv', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggles advantages normalization") - parser.add_argument('--anneal-lr', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggle learning rate annealing for policy and value networks") - parser.add_argument('--clip-vloss', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') - args = parser.parse_args() - if not args.seed: - args.seed = int(time.time()) - args.batch_size = int(args.num_envs * args.num_steps) - args.minibatch_size = int(args.batch_size // args.n_minibatch) - # fmt: on - return args - - -def make_env(gym_id, seed, idx, capture_video, run_name): - def thunk(): - env = gym.make(gym_id) - env = gym.wrappers.RecordEpisodeStatistics(env) - if capture_video: - if idx == 0: - env = Monitor(env, f'videos/{run_name}') - env.seed(seed) - env.action_space.seed(seed) - env.observation_space.seed(seed) - return env - return thunk - -def layer_init(layer, std=np.sqrt(2), bias_const=0.0): - torch.nn.init.orthogonal_(layer.weight, std) - torch.nn.init.constant_(layer.bias, bias_const) - return layer - - -class Agent(nn.Module): - def __init__(self, envs, frames=4): - super(Agent, self).__init__() - self.critic = nn.Sequential( - layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 1), std=1.), - ) - self.actor = nn.Sequential( - layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, envs.single_action_space.n), std=0.01), - ) - - def get_action_and_value(self, x, action=None): - logits = self.actor(x) - probs = Categorical(logits=logits) - if action is None: - action = probs.sample() - return action, probs.log_prob(action), probs.entropy(), self.critic(x) - - def get_value(self, x): - return self.critic(x) - - -if __name__ == "__main__": - args = parse_args() - run_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - writer = SummaryWriter(f"runs/{run_name}") - writer.add_text( - 
"hyperparameters", - "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), - ) - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - # env setup - envs = SyncVectorEnv( - [make_env(args.gym_id, args.seed + i, i, args.capture_video, run_name) - for i in range(args.num_envs)]) - assert isinstance(envs.single_action_space, Discrete), "only discrete action space is supported" - - agent = Agent(envs).to(device) - optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) - if args.anneal_lr: - # https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/ppo2/defaults.py#L20 - lr = lambda f: f * args.learning_rate - - # ALGO Logic: Storage for epoch data - obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) - actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) - logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) - rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) - dones = torch.zeros((args.num_steps, args.num_envs)).to(device) - values = torch.zeros((args.num_steps, args.num_envs)).to(device) - - # TRY NOT TO MODIFY: start the game - global_step = 0 - start_time = time.time() - # Note how `next_obs` and `next_done` are used; their usage is equivalent to - # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60 - next_obs = torch.Tensor(envs.reset()).to(device) - next_done = torch.zeros(args.num_envs).to(device) - num_updates = args.total_timesteps // args.batch_size - for update in range(1, num_updates + 1): - # Annealing the rate if instructed to do so. - if args.anneal_lr: - frac = 1.0 - (update - 1.0) / num_updates - lrnow = lr(frac) - optimizer.param_groups[0]["lr"] = lrnow - - # TRY NOT TO MODIFY: prepare the execution of the game. - for step in range(0, args.num_steps): - global_step += 1 * args.num_envs - obs[step] = next_obs - dones[step] = next_done - - # ALGO LOGIC: put action logic here - with torch.no_grad(): - action, logproba, _, vs = agent.get_action_and_value(next_obs) - values[step] = vs.flatten() - - actions[step] = action - logprobs[step] = logproba - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rs, ds, infos = envs.step(action.cpu().numpy()) - next_obs = torch.Tensor(next_obs).to(device) - rewards[step], next_done = torch.tensor(rs).to(device).view(-1), torch.Tensor(ds).to(device) - - for info in infos: - if "episode" in info.keys(): - print(f"global_step={global_step}, episode_reward={info['episode']['r']}") - writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) - break - - # bootstrap reward if not done. 
reached the batch limit - with torch.no_grad(): - last_value = agent.get_value(next_obs.to(device)).reshape(1, -1) - if args.gae: - advantages = torch.zeros_like(rewards).to(device) - lastgaelam = 0 - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - nextvalues = last_value - else: - nextnonterminal = 1.0 - dones[t + 1] - nextvalues = values[t + 1] - delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] - advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam - returns = advantages + values - else: - returns = torch.zeros_like(rewards).to(device) - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - next_return = last_value - else: - nextnonterminal = 1.0 - dones[t + 1] - next_return = returns[t + 1] - returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return - advantages = returns - values - - # flatten the batch - b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) - b_logprobs = logprobs.reshape(-1) - b_actions = actions.reshape((-1,) + envs.single_action_space.shape) - b_advantages = advantages.reshape(-1) - b_returns = returns.reshape(-1) - b_values = values.reshape(-1) - - # Optimizaing the policy and value network - inds = np.arange( - args.batch_size, - ) - for i_epoch_pi in range(args.update_epochs): - np.random.shuffle(inds) - for start in range(0, args.batch_size, args.minibatch_size): - end = start + args.minibatch_size - minibatch_ind = inds[start:end] - mb_advantages = b_advantages[minibatch_ind] - if args.norm_adv: - mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) - - _, newlogproba, entropy, new_values = agent.get_action_and_value( - b_obs[minibatch_ind], b_actions.long()[minibatch_ind].to(device) - ) - ratio = (newlogproba - b_logprobs[minibatch_ind]).exp() - print(ratio) - raise - # calculate approx_kl http://joschu.net/blog/kl-approx.html - with torch.no_grad(): - log_ratio = newlogproba - b_logprobs[minibatch_ind] - approx_kl = ((log_ratio.exp() - 1) - log_ratio).mean() - - # Policy loss - pg_loss1 = -mb_advantages * ratio - pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) - pg_loss = torch.max(pg_loss1, pg_loss2).mean() - entropy_loss = entropy.mean() - - # Value loss - new_values = new_values.view(-1) - if args.clip_vloss: - v_loss_unclipped = (new_values - b_returns[minibatch_ind]) ** 2 - v_clipped = b_values[minibatch_ind] + torch.clamp( - new_values - b_values[minibatch_ind], - -args.clip_coef, - args.clip_coef, - ) - v_loss_clipped = (v_clipped - b_returns[minibatch_ind]) ** 2 - v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) - v_loss = 0.5 * v_loss_max.mean() - else: - v_loss = 0.5 * ((new_values - b_returns[minibatch_ind]) ** 2).mean() - - loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef - - optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) - optimizer.step() - - if args.kle_stop: - if approx_kl > args.target_kl: - break - - # TRY NOT TO MODIFY: record rewards for plotting purposes - writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) - writer.add_scalar("losses/value_loss", v_loss.item(), global_step) - writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) - writer.add_scalar("losses/entropy", entropy.mean().item(), global_step) - 
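The update loop above estimates the policy KL with the (r − 1) − log r estimator from http://joschu.net/blog/kl-approx.html. Restated as a small standalone sketch:

```python
import torch

def approx_kl(new_logprob: torch.Tensor, old_logprob: torch.Tensor) -> torch.Tensor:
    # k3 estimator of KL(old || new): with r = exp(new_logprob - old_logprob),
    # (r - 1) - log r is non-negative for every sample and unbiased in
    # expectation over actions drawn from the old policy.
    log_ratio = new_logprob - old_logprob
    return ((log_ratio.exp() - 1) - log_ratio).mean()

old = torch.tensor([0.5, 0.25, 0.25]).log()
new = torch.tensor([0.4, 0.4, 0.2]).log()
print(approx_kl(new, old))  # ~0.06
```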
writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) - print("SPS:", int(global_step / (time.time() - start_time))) - writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) - - envs.close() - writer.close() diff --git a/cleanrl/brax/old_ppo_that_works.py b/cleanrl/brax/old_ppo_that_works.py deleted file mode 100755 index 94a5475e1..000000000 --- a/cleanrl/brax/old_ppo_that_works.py +++ /dev/null @@ -1,405 +0,0 @@ -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F -from torch.distributions.normal import Normal -from torch.utils.tensorboard import SummaryWriter - -import argparse -from distutils.util import strtobool -import numpy as np -import gym -from gym.wrappers import TimeLimit, Monitor -import pybullet_envs -from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete, Space -import time -import random -import os -from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnvWrapper - -# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py -class RunningMeanStd(object): - def __init__(self, epsilon=1e-4, shape=()): - self.mean = np.zeros(shape, 'float64') - self.var = np.ones(shape, 'float64') - self.count = epsilon - - def update(self, x): - batch_mean = np.mean([x], axis=0) - batch_var = np.var([x], axis=0) - batch_count = 1 - self.update_from_moments(batch_mean, batch_var, batch_count) - - def update_from_moments(self, batch_mean, batch_var, batch_count): - self.mean, self.var, self.count = update_mean_var_count_from_moments( - self.mean, self.var, self.count, batch_mean, batch_var, batch_count) - -def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): - delta = batch_mean - mean - tot_count = count + batch_count - - new_mean = mean + delta * batch_count / tot_count - m_a = var * count - m_b = batch_var * batch_count - M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count - new_var = M2 / tot_count - new_count = tot_count - - return new_mean, new_var, new_count - -class NormalizedEnv(gym.core.Wrapper): - def __init__(self, env, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): - super(NormalizedEnv, self).__init__(env) - self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None - self.ret_rms = RunningMeanStd(shape=(1,)) if ret else None - self.clipob = clipob - self.cliprew = cliprew - self.ret = np.zeros(()) - self.gamma = gamma - self.epsilon = epsilon - - def step(self, action): - obs, rews, dones, infos = self.env.step(action) - infos['real_reward'] = rews - self.ret = self.ret * self.gamma + rews - obs = self._obfilt(obs) - if self.ret_rms: - self.ret_rms.update(np.array([self.ret].copy())) - rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) - self.ret = self.ret * (1-float(dones)) - return obs, rews, dones, infos - - def _obfilt(self, obs): - if self.ob_rms: - self.ob_rms.update(obs) - obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) - return obs - else: - return obs - - def reset(self): - self.ret = np.zeros(()) - obs = self.env.reset() - return self._obfilt(obs) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='PPO agent') - # Common arguments - parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), - help='the name of this experiment') - 
parser.add_argument('--gym-id', type=str, default="HopperBulletEnv-v0", - help='the id of the gym environment') - parser.add_argument('--learning-rate', type=float, default=3e-4, - help='the learning rate of the optimizer') - parser.add_argument('--seed', type=int, default=1, - help='seed of the experiment') - parser.add_argument('--total-timesteps', type=int, default=2000000, - help='total timesteps of the experiments') - parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, `torch.backends.cudnn.deterministic=False`') - parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, cuda will not be enabled by default') - parser.add_argument('--track', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='run the script in production mode and use wandb to log outputs') - parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='weather to capture videos of the agent performances (check out `videos` folder)') - parser.add_argument('--wandb-project-name', type=str, default="cleanRL", - help="the wandb's project name") - parser.add_argument('--wandb-entity', type=str, default=None, - help="the entity (team) of wandb's project") - - # Algorithm specific arguments - parser.add_argument('--n-minibatch', type=int, default=32, - help='the number of mini batch') - parser.add_argument('--num-envs', type=int, default=1, - help='the number of parallel game environment') - parser.add_argument('--num-steps', type=int, default=2048, - help='the number of steps per game environment') - parser.add_argument('--gamma', type=float, default=0.99, - help='the discount factor gamma') - parser.add_argument('--gae-lambda', type=float, default=0.95, - help='the lambda for the general advantage estimation') - parser.add_argument('--ent-coef', type=float, default=0.0, - help="coefficient of the entropy") - parser.add_argument('--vf-coef', type=float, default=0.5, - help="coefficient of the value function") - parser.add_argument('--max-grad-norm', type=float, default=0.5, - help='the maximum norm for the gradient clipping') - parser.add_argument('--clip-coef', type=float, default=0.2, - help="the surrogate clipping coefficient") - parser.add_argument('--update-epochs', type=int, default=10, - help="the K epochs to update the policy") - parser.add_argument('--kle-stop', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='If toggled, the policy updates will be early stopped w.r.t target-kl') - parser.add_argument('--kle-rollback', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='If toggled, the policy updates will roll back to previous policy if KL exceeds target-kl') - parser.add_argument('--target-kl', type=float, default=0.03, - help='the target-kl variable that is referred by --kl') - parser.add_argument('--gae', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Use GAE for advantage computation') - parser.add_argument('--norm-adv', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggles advantages normalization") - parser.add_argument('--anneal-lr', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggle learning rate annealing for policy and value networks") - parser.add_argument('--clip-vloss', type=lambda x:bool(strtobool(x)), default=True, 
nargs='?', const=True, - help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') - - args = parser.parse_args() - if not args.seed: - args.seed = int(time.time()) - -args.batch_size = int(args.num_envs * args.num_steps) -args.minibatch_size = int(args.batch_size // args.n_minibatch) - -class ClipActionsWrapper(gym.Wrapper): - def step(self, action): - import numpy as np - action = np.nan_to_num(action) - action = np.clip(action, self.action_space.low, self.action_space.high) - return self.env.step(action) - -class VecPyTorch(VecEnvWrapper): - def __init__(self, venv, device): - super(VecPyTorch, self).__init__(venv) - self.device = device - - def reset(self): - obs = self.venv.reset() - obs = torch.from_numpy(obs).float().to(self.device) - return obs - - def step_async(self, actions): - actions = actions.cpu().numpy() - self.venv.step_async(actions) - - def step_wait(self): - obs, reward, done, info = self.venv.step_wait() - obs = torch.from_numpy(obs).float().to(self.device) - reward = torch.from_numpy(reward).unsqueeze(dim=1).float() - return obs, reward, done, info - - -# TRY NOT TO MODIFY: setup the environment -experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" -writer = SummaryWriter(f"runs/{experiment_name}") -writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % ( - '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()]))) -if args.track: - import wandb - wandb.init(project=args.wandb_project_name, entity=args.wandb_entity, sync_tensorboard=True, config=vars(args), name=experiment_name, monitor_gym=True, save_code=True) - writer = SummaryWriter(f"/tmp/{experiment_name}") - -# TRY NOT TO MODIFY: seeding -device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu') -random.seed(args.seed) -np.random.seed(args.seed) -torch.manual_seed(args.seed) -torch.backends.cudnn.deterministic = args.torch_deterministic -def make_env(gym_id, seed, idx): - def thunk(): - env = gym.make(gym_id) - env = ClipActionsWrapper(env) - env = gym.wrappers.RecordEpisodeStatistics(env) - if args.capture_video: - if idx == 0: - env = Monitor(env, f'videos/{experiment_name}') - env = NormalizedEnv(env) - env.seed(seed) - env.action_space.seed(seed) - env.observation_space.seed(seed) - return env - return thunk -envs = VecPyTorch(DummyVecEnv([make_env(args.gym_id, args.seed+i, i) for i in range(args.num_envs)]), device) -# if args.track: -# envs = VecPyTorch( -# SubprocVecEnv([make_env(args.gym_id, args.seed+i, i) for i in range(args.num_envs)], "fork"), -# device -# ) -assert isinstance(envs.action_space, Box), "only continuous action space is supported" - -# ALGO LOGIC: initialize agent here: -def layer_init(layer, std=np.sqrt(2), bias_const=0.0): - torch.nn.init.orthogonal_(layer.weight, std) - torch.nn.init.constant_(layer.bias, bias_const) - return layer - -class Agent(nn.Module): - def __init__(self, envs): - super(Agent, self).__init__() - self.critic = nn.Sequential( - layer_init(nn.Linear(np.array(envs.observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 1), std=1.), - ) - self.actor_mean = nn.Sequential( - layer_init(nn.Linear(np.array(envs.observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, np.prod(envs.action_space.shape)), std=0.01), - ) - self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.action_space.shape))) - - def 
get_action(self, x, action=None): - action_mean = self.actor_mean(x) - action_logstd = self.actor_logstd.expand_as(action_mean) - action_std = torch.exp(action_logstd) - probs = Normal(action_mean, action_std) - if action is None: - action = probs.sample() - return action, probs.log_prob(action).sum(1), probs.entropy().sum(1) - - def get_value(self, x): - return self.critic(x) - -agent = Agent(envs).to(device) -optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) -if args.anneal_lr: - # https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/ppo2/defaults.py#L20 - lr = lambda f: f * args.learning_rate - -# ALGO Logic: Storage for epoch data -obs = torch.zeros((args.num_steps, args.num_envs) + envs.observation_space.shape).to(device) -actions = torch.zeros((args.num_steps, args.num_envs) + envs.action_space.shape).to(device) -logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) -rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) -dones = torch.zeros((args.num_steps, args.num_envs)).to(device) -values = torch.zeros((args.num_steps, args.num_envs)).to(device) - -# TRY NOT TO MODIFY: start the game -global_step = 0 -# Note how `next_obs` and `next_done` are used; their usage is equivalent to -# https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60 -next_obs = envs.reset() -next_done = torch.zeros(args.num_envs).to(device) -num_updates = args.total_timesteps // args.batch_size -for update in range(1, num_updates+1): - # Annealing the rate if instructed to do so. - if args.anneal_lr: - frac = 1.0 - (update - 1.0) / num_updates - lrnow = lr(frac) - optimizer.param_groups[0]['lr'] = lrnow - - # TRY NOT TO MODIFY: prepare the execution of the game. - for step in range(0, args.num_steps): - global_step += 1 * args.num_envs - obs[step] = next_obs - dones[step] = next_done - - # ALGO LOGIC: put action logic here - with torch.no_grad(): - values[step] = agent.get_value(obs[step]).flatten() - action, logproba, _ = agent.get_action(obs[step]) - - actions[step] = action - logprobs[step] = logproba - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rs, ds, infos = envs.step(action) - rewards[step], next_done = rs.view(-1), torch.Tensor(ds).to(device) - - for info in infos: - if 'episode' in info.keys(): - print(f"global_step={global_step}, episode_reward={info['episode']['r']}") - writer.add_scalar("charts/episodic_return", info['episode']['r'], global_step) - break - - # bootstrap reward if not done. 
reached the batch limit - with torch.no_grad(): - last_value = agent.get_value(next_obs.to(device)).reshape(1, -1) - if args.gae: - advantages = torch.zeros_like(rewards).to(device) - lastgaelam = 0 - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - nextvalues = last_value - else: - nextnonterminal = 1.0 - dones[t+1] - nextvalues = values[t+1] - delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] - advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam - returns = advantages + values - else: - returns = torch.zeros_like(rewards).to(device) - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - next_return = last_value - else: - nextnonterminal = 1.0 - dones[t+1] - next_return = returns[t+1] - returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return - advantages = returns - values - - # flatten the batch - b_obs = obs.reshape((-1,)+envs.observation_space.shape) - b_logprobs = logprobs.reshape(-1) - b_actions = actions.reshape((-1,)+envs.action_space.shape) - b_advantages = advantages.reshape(-1) - b_returns = returns.reshape(-1) - b_values = values.reshape(-1) - - # Optimizaing the policy and value network - target_agent = Agent(envs).to(device) - inds = np.arange(args.batch_size,) - for i_epoch_pi in range(args.update_epochs): - np.random.shuffle(inds) - target_agent.load_state_dict(agent.state_dict()) - for start in range(0, args.batch_size, args.minibatch_size): - end = start + args.minibatch_size - minibatch_ind = inds[start:end] - mb_advantages = b_advantages[minibatch_ind] - if args.norm_adv: - mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) - - _, newlogproba, entropy = agent.get_action(b_obs[minibatch_ind], b_actions[minibatch_ind]) - ratio = (newlogproba - b_logprobs[minibatch_ind]).exp() - - # Stats - approx_kl = (b_logprobs[minibatch_ind] - newlogproba).mean() - - # Policy loss - pg_loss1 = -mb_advantages * ratio - pg_loss2 = -mb_advantages * torch.clamp(ratio, 1-args.clip_coef, 1+args.clip_coef) - pg_loss = torch.max(pg_loss1, pg_loss2).mean() - entropy_loss = entropy.mean() - - # Value loss - new_values = agent.get_value(b_obs[minibatch_ind]).view(-1) - if args.clip_vloss: - v_loss_unclipped = ((new_values - b_returns[minibatch_ind]) ** 2) - v_clipped = b_values[minibatch_ind] + torch.clamp(new_values - b_values[minibatch_ind], -args.clip_coef, args.clip_coef) - v_loss_clipped = (v_clipped - b_returns[minibatch_ind])**2 - v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) - v_loss = 0.5 * v_loss_max.mean() - else: - v_loss = 0.5 * ((new_values - b_returns[minibatch_ind]) ** 2).mean() - - loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef - optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) - optimizer.step() - - if args.kle_stop: - if approx_kl > args.target_kl: - break - if args.kle_rollback: - if (b_logprobs[minibatch_ind] - agent.get_action(b_obs[minibatch_ind], b_actions[minibatch_ind])[1]).mean() > args.target_kl: - agent.load_state_dict(target_agent.state_dict()) - break - - # TRY NOT TO MODIFY: record rewards for plotting purposes - writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]['lr'], global_step) - writer.add_scalar("losses/value_loss", v_loss.item(), global_step) - writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) - 
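Both PPO prototypes in this diff use the same clipped surrogate objective; restated on its own, negated so it is minimized as in the scripts:

```python
import torch

def ppo_clip_loss(new_logprob, old_logprob, advantages, clip_coef=0.2):
    # Pessimistic (element-wise max) combination of the unclipped and clipped
    # policy-gradient terms, averaged over the minibatch.
    ratio = (new_logprob - old_logprob).exp()
    pg_loss1 = -advantages * ratio
    pg_loss2 = -advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
    return torch.max(pg_loss1, pg_loss2).mean()
```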
writer.add_scalar("losses/entropy", entropy.mean().item(), global_step) - writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) - if args.kle_stop or args.kle_rollback: - writer.add_scalar("debug/pg_stop_iter", i_epoch_pi, global_step) - -envs.close() -writer.close() diff --git a/cleanrl/brax/ppo_brax.py b/cleanrl/brax/ppo_brax.py deleted file mode 100644 index 6a6714ad2..000000000 --- a/cleanrl/brax/ppo_brax.py +++ /dev/null @@ -1,489 +0,0 @@ -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F -from torch.distributions.normal import Normal -from torch.utils.tensorboard import SummaryWriter -from functools import partial - -import argparse -from distutils.util import strtobool -import numpy as np -import gym -from brax.envs.to_torch import JaxToTorchWrapper -from brax.envs import _envs, create_gym_env -from gym.wrappers import TimeLimit, Monitor -from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete, Space -import time -import random -import os -from stable_baselines3.common.vec_env import DummyVecEnv, VecEnvWrapper, VecNormalize -import jax.numpy as jnp - -def parse_args(): - # fmt: off - parser = argparse.ArgumentParser(description='PPO agent') - parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), - help='the name of this experiment') - parser.add_argument('--gym-id', type=str, default="halfcheetah", - help='the id of the gym environment') - parser.add_argument('--learning-rate', type=float, default=3e-4, - help='the learning rate of the optimizer') - parser.add_argument('--seed', type=int, default=1, - help='seed of the experiment') - parser.add_argument('--total-timesteps', type=int, default=50000000, - help='total timesteps of the experiments') - parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, `torch.backends.cudnn.deterministic=False`') - parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, cuda will not be enabled by default') - parser.add_argument('--track', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='run the script in production mode and use wandb to log outputs') - parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='weather to capture videos of the agent performances (check out `videos` folder)') - parser.add_argument('--wandb-project-name', type=str, default="cleanRL", - help="the wandb's project name") - parser.add_argument('--wandb-entity', type=str, default=None, - help="the entity (team) of wandb's project") - - # Algorithm specific arguments - parser.add_argument('--n-minibatch', type=int, default=8, - help='the number of mini batch') - parser.add_argument('--num-envs', type=int, default=64, - help='the number of parallel game environment') - parser.add_argument('--num-steps', type=int, default=256, - help='the number of steps per game environment') - parser.add_argument('--gamma', type=float, default=0.99, - help='the discount factor gamma') - parser.add_argument('--gae-lambda', type=float, default=0.95, - help='the lambda for the general advantage estimation') - parser.add_argument('--ent-coef', type=float, default=0.01, - help="coefficient of the entropy") - parser.add_argument('--vf-coef', type=float, default=0.5, - help="coefficient of the value function") - parser.add_argument('--max-grad-norm', 
type=float, default=0.5, - help='the maximum norm for the gradient clipping') - parser.add_argument('--clip-coef', type=float, default=0.2, - help="the surrogate clipping coefficient") - parser.add_argument('--update-epochs', type=int, default=3, - help="the K epochs to update the policy") - parser.add_argument('--kle-stop', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='If toggled, the policy updates will be early stopped w.r.t target-kl') - parser.add_argument('--target-kl', type=float, default=0.03, - help='the target-kl variable that is referred by --kl') - parser.add_argument('--gae', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Use GAE for advantage computation') - parser.add_argument('--norm-adv', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggles advantages normalization") - parser.add_argument('--anneal-lr', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggle learning rate annealing for policy and value networks") - parser.add_argument('--clip-vloss', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') - args = parser.parse_args() - if not args.seed: - args.seed = int(time.time()) - args.batch_size = int(args.num_envs * args.num_steps) - args.minibatch_size = int(args.batch_size // args.n_minibatch) - # fmt: on - return args - - - -def layer_init(layer, std=np.sqrt(2), bias_const=0.0): - torch.nn.init.orthogonal_(layer.weight, std) - torch.nn.init.constant_(layer.bias, bias_const) - return layer - - -class Agent(nn.Module): - def __init__(self, envs): - super(Agent, self).__init__() - self.critic = nn.Sequential( - layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 1), std=1.), - ) - self.actor_mean = nn.Sequential( - layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01), - ) - self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape))) - - def get_action_and_value(self, x, action=None): - action_mean = self.actor_mean(x) - action_logstd = self.actor_logstd.expand_as(action_mean) - action_std = torch.exp(action_logstd) - probs = Normal(action_mean, action_std) - if action is None: - action = probs.sample() - return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x) - - def get_value(self, x): - return self.critic(x) - - -# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py -class RunningMeanStd(object): - def __init__(self, epsilon=1e-4, shape=()): - self.mean = np.zeros(shape, 'float64') - self.var = np.ones(shape, 'float64') - self.count = epsilon - - def update(self, x): - batch_mean = np.mean(x, axis=0) - batch_var = np.var(x, axis=0) - batch_count = x.shape[0] - self.update_from_moments(batch_mean, batch_var, batch_count) - - def update_from_moments(self, batch_mean, batch_var, batch_count): - self.mean, self.var, self.count = update_mean_var_count_from_moments( - self.mean, self.var, self.count, batch_mean, batch_var, batch_count) - -def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): - delta = batch_mean 
- mean - tot_count = count + batch_count - - new_mean = mean + delta * batch_count / tot_count - m_a = var * count - m_b = batch_var * batch_count - M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count - new_var = M2 / tot_count - new_count = tot_count - - return new_mean, new_var, new_count - -class NormalizedEnv(gym.core.Wrapper): - def __init__(self, env, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): - super(NormalizedEnv, self).__init__(env) - self.num_envs = getattr(env, "num_envs", 1) - self.is_vector_env = getattr(env, "is_vector_env", False) - self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None - self.ret_rms = RunningMeanStd(shape=()) if ret else None - self.clipob = clipob - self.cliprew = cliprew - self.ret = np.zeros(self.num_envs) - self.gamma = gamma - self.epsilon = epsilon - - def step(self, action): - obs, rews, dones, infos = self.env.step(action) - if not self.is_vector_env: - obs, rews, dones = np.array([obs]), np.array([rews]), np.array([dones]) - self.ret = self.ret * self.gamma + rews - obs = self._obfilt(obs) - if self.ret_rms: - self.ret_rms.update(self.ret) - rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) - self.ret[dones] = 0. - if not self.is_vector_env: - return obs[0], rews[0], dones[0], infos - return obs, rews, dones, infos - - def _obfilt(self, obs): - if self.ob_rms: - self.ob_rms.update(obs) - obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) - return obs - else: - return obs - - def reset(self): - self.ret = np.zeros(self.num_envs) - obs = self.env.reset() - if not self.is_vector_env: - obs = np.array([obs]) - obs = self._obfilt(obs) - if not self.is_vector_env: - return obs[0] - - -# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py -class RunningMeanStd(object): - def __init__(self, epsilon=1e-4, shape=()): - self.mean = np.zeros(shape, 'float64') - self.var = np.ones(shape, 'float64') - self.count = epsilon - - def update(self, x): - batch_mean = np.mean(x, axis=0) - batch_var = np.var(x, axis=0) - batch_count = x.shape[0] - self.update_from_moments(batch_mean, batch_var, batch_count) - - def update_from_moments(self, batch_mean, batch_var, batch_count): - self.mean, self.var, self.count = update_mean_var_count_from_moments( - self.mean, self.var, self.count, batch_mean, batch_var, batch_count) - -def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): - delta = batch_mean - mean - tot_count = count + batch_count - - new_mean = mean + delta * batch_count / tot_count - m_a = var * count - m_b = batch_var * batch_count - M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count - new_var = M2 / tot_count - new_count = tot_count - - return new_mean, new_var, new_count - -class NormalizedEnv(gym.core.Wrapper): - def __init__(self, env, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): - super(NormalizedEnv, self).__init__(env) - self.num_envs = getattr(env, "num_envs", 1) - self.is_vector_env = getattr(env, "is_vector_env", False) - self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None - self.ret_rms = RunningMeanStd(shape=()) if ret else None - self.clipob = clipob - self.cliprew = cliprew - self.ret = jnp.zeros(self.num_envs) - self.gamma = gamma - self.epsilon = epsilon - - def step(self, action): - obs, rews, dones, infos = 
self.env.step(action) - if not self.is_vector_env: - obs, rews, dones = jnp.array([obs]), jnp.array([rews]), jnp.array([dones]) - self.ret = self.ret * self.gamma + rews - obs = self._obfilt(obs) - if self.ret_rms: - self.ret_rms.update(self.ret) - rews = jnp.clip(rews / jnp.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) - self.ret.at[dones.astype(bool)].set(0) - if not self.is_vector_env: - return obs[0], rews[0], dones[0], infos - return obs, rews, dones, infos - - def _obfilt(self, obs): - if self.ob_rms: - self.ob_rms.update(obs) - obs = jnp.clip((obs - self.ob_rms.mean) / jnp.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) - return obs - else: - return obs - - def reset(self): - self.ret = jnp.zeros(self.num_envs) - obs = self.env.reset() - if not self.is_vector_env: - obs = jnp.array([obs]) - obs = self._obfilt(obs) - if not self.is_vector_env: - return obs[0] - return obs - - -def make_env(gym_id, seed, idx, capture_video, run_name): - def thunk(): - env = gym.make(gym_id) - env = gym.wrappers.RecordEpisodeStatistics(env) - if args.capture_video: - if idx == 0: - env = Monitor(env, f'videos/{run_name}') - env.seed(seed) - env.action_space.seed(seed) - env.observation_space.seed(seed) - return env - return thunk - -if __name__ == "__main__": - args = parse_args() - run_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - writer = SummaryWriter(f"runs/{run_name}") - writer.add_text( - "hyperparameters", - "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), - ) - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - # env setup - dummy_value = torch.ones(1, device=device) - envs = create_gym_env(args.gym_id, batch_size=args.num_envs) - envs.is_vector_env = True - envs = gym.wrappers.RecordEpisodeStatistics(envs) - envs = NormalizedEnv(envs) - envs = JaxToTorchWrapper(envs, device=device) - # envs = VecNormalize( - # envs, - # norm_obs=True, norm_reward=True, clip_obs=10. 
- # ) - assert isinstance(envs.single_action_space, Box), "only continuous action space is supported" - - agent = Agent(envs).to(device) - optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) - if args.anneal_lr: - # https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/ppo2/defaults.py#L20 - lr = lambda f: f * args.learning_rate - - # ALGO Logic: Storage for epoch data - obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) - actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) - logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) - rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) - dones = torch.zeros((args.num_steps, args.num_envs)).to(device) - values = torch.zeros((args.num_steps, args.num_envs)).to(device) - - # TRY NOT TO MODIFY: start the game - global_step = 0 - start_time = time.time() - # Note how `next_obs` and `next_done` are used; their usage is equivalent to - # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60 - next_obs = envs.reset() - next_done = torch.zeros(args.num_envs).to(device) - num_updates = args.total_timesteps // args.batch_size - for update in range(1, num_updates + 1): - # Annealing the rate if instructed to do so. - if args.anneal_lr: - frac = 1.0 - (update - 1.0) / num_updates - lrnow = lr(frac) - optimizer.param_groups[0]["lr"] = lrnow - - # TRY NOT TO MODIFY: prepare the execution of the game. - for step in range(0, args.num_steps): - global_step += 1 * args.num_envs - obs[step] = next_obs - dones[step] = next_done - - # ALGO LOGIC: put action logic here - with torch.no_grad(): - action, logproba, _, vs = agent.get_action_and_value(next_obs) - values[step] = vs.flatten() - - actions[step] = action - logprobs[step] = logproba - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rs, ds, infos = envs.step(action) - rewards[step], next_done = rs, ds - - for info in infos: - if "episode" in info.keys(): - print(f"global_step={global_step}, episode_reward={info['episode']['r']}") - writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) - break - - # bootstrap reward if not done. 
reached the batch limit - with torch.no_grad(): - last_value = agent.get_value(next_obs).reshape(1, -1) - if args.gae: - advantages = torch.zeros_like(rewards).to(device) - lastgaelam = 0 - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - nextvalues = last_value - else: - nextnonterminal = 1.0 - dones[t + 1] - nextvalues = values[t + 1] - delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] - advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam - returns = advantages + values - else: - returns = torch.zeros_like(rewards).to(device) - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - next_return = last_value - else: - nextnonterminal = 1.0 - dones[t + 1] - next_return = returns[t + 1] - returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return - advantages = returns - values - - # flatten the batch - b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) - b_logprobs = logprobs.reshape(-1) - b_actions = actions.reshape((-1,) + envs.single_action_space.shape) - b_advantages = advantages.reshape(-1) - b_returns = returns.reshape(-1) - b_values = values.reshape(-1) - - # Optimizaing the policy and value network - inds = np.arange( - args.batch_size, - ) - for i_epoch_pi in range(args.update_epochs): - np.random.shuffle(inds) - for start in range(0, args.batch_size, args.minibatch_size): - end = start + args.minibatch_size - minibatch_ind = inds[start:end] - mb_advantages = b_advantages[minibatch_ind] - if args.norm_adv: - mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) - - _, newlogproba, entropy, new_values = agent.get_action_and_value( - b_obs[minibatch_ind], b_actions[minibatch_ind] - ) - ratio = (newlogproba - b_logprobs[minibatch_ind]).exp() - - # calculate approx_kl http://joschu.net/blog/kl-approx.html - with torch.no_grad(): - log_ratio = newlogproba - b_logprobs[minibatch_ind] - approx_kl = ((log_ratio.exp() - 1) - log_ratio).mean() - - # Policy loss - pg_loss1 = -mb_advantages * ratio - pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) - pg_loss = torch.max(pg_loss1, pg_loss2).mean() - entropy_loss = entropy.mean() - - # Value loss - new_values = new_values.view(-1) - if args.clip_vloss: - v_loss_unclipped = (new_values - b_returns[minibatch_ind]) ** 2 - v_clipped = b_values[minibatch_ind] + torch.clamp( - new_values - b_values[minibatch_ind], - -args.clip_coef, - args.clip_coef, - ) - v_loss_clipped = (v_clipped - b_returns[minibatch_ind]) ** 2 - v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) - v_loss = 0.5 * v_loss_max.mean() - else: - v_loss = 0.5 * ((new_values - b_returns[minibatch_ind]) ** 2).mean() - - loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef - - optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) - optimizer.step() - - if args.kle_stop: - if approx_kl > args.target_kl: - break - - # TRY NOT TO MODIFY: record rewards for plotting purposes - writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) - writer.add_scalar("losses/value_loss", v_loss.item(), global_step) - writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) - writer.add_scalar("losses/entropy", entropy.mean().item(), global_step) - writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) 
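The advantage computation shared by these PPO scripts is the GAE(λ) backward recursion over the rollout. A self-contained restatement, with tensors shaped (num_steps, num_envs) to match the storage buffers above:

```python
import torch

def compute_gae(rewards, values, dones, next_value, next_done,
                gamma=0.99, gae_lambda=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t), scanned in reverse.
    num_steps = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(num_steps)):
        if t == num_steps - 1:
            nextnonterminal = 1.0 - next_done
            nextvalues = next_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
        advantages[t] = lastgaelam
    returns = advantages + values
    return advantages, returns
```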
- print("SPS:", int(global_step / (time.time() - start_time))) - writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) - - envs.close() - writer.close() diff --git a/cleanrl/brax/ppo_brax_test.py b/cleanrl/brax/ppo_brax_test.py deleted file mode 100644 index a7ed8e65a..000000000 --- a/cleanrl/brax/ppo_brax_test.py +++ /dev/null @@ -1,363 +0,0 @@ -import argparse -import os -import random -import time -from distutils.util import strtobool - -import gym -import numpy as np -from brax.envs import create_gym_env -import torch -import torch.nn as nn -import torch.optim as optim -from gym.spaces import Box -from brax.envs.to_torch import JaxToTorchWrapper -from torch.distributions.normal import Normal -from torch.utils.tensorboard import SummaryWriter - - -def parse_args(): - # fmt: off - parser = argparse.ArgumentParser(description='PPO agent') - parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), - help='the name of this experiment') - parser.add_argument('--gym-id', type=str, default="halfcheetah", - help='the id of the gym environment') - parser.add_argument('--learning-rate', type=float, default=3e-4, - help='the learning rate of the optimizer') - parser.add_argument('--seed', type=int, default=1, - help='seed of the experiment') - parser.add_argument('--total-timesteps', type=int, default=2000000, - help='total timesteps of the experiments') - parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, `torch.backends.cudnn.deterministic=False`') - parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, cuda will not be enabled by default') - parser.add_argument('--track', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='run the script in production mode and use wandb to log outputs') - parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='weather to capture videos of the agent performances (check out `videos` folder)') - parser.add_argument('--wandb-project-name', type=str, default="cleanRL", - help="the wandb's project name") - parser.add_argument('--wandb-entity', type=str, default=None, - help="the entity (team) of wandb's project") - - # Algorithm specific arguments - parser.add_argument('--n-minibatch', type=int, default=32, - help='the number of mini batch') - parser.add_argument('--num-envs', type=int, default=1, - help='the number of parallel game environment') - parser.add_argument('--num-steps', type=int, default=2048, - help='the number of steps per game environment') - parser.add_argument('--gamma', type=float, default=0.99, - help='the discount factor gamma') - parser.add_argument('--gae-lambda', type=float, default=0.95, - help='the lambda for the general advantage estimation') - parser.add_argument('--ent-coef', type=float, default=0.0, - help="coefficient of the entropy") - parser.add_argument('--vf-coef', type=float, default=0.5, - help="coefficient of the value function") - parser.add_argument('--max-grad-norm', type=float, default=0.5, - help='the maximum norm for the gradient clipping') - parser.add_argument('--clip-coef', type=float, default=0.2, - help="the surrogate clipping coefficient") - parser.add_argument('--update-epochs', type=int, default=10, - help="the K epochs to update the policy") - parser.add_argument('--kle-stop', type=lambda 
x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='If toggled, the policy updates will be early stopped w.r.t target-kl') - parser.add_argument('--target-kl', type=float, default=0.03, - help='the target-kl variable that is referred by --kl') - parser.add_argument('--gae', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Use GAE for advantage computation') - parser.add_argument('--norm-adv', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggles advantages normalization") - parser.add_argument('--anneal-lr', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggle learning rate annealing for policy and value networks") - parser.add_argument('--clip-vloss', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') - args = parser.parse_args() - if not args.seed: - args.seed = int(time.time()) - args.batch_size = int(args.num_envs * args.num_steps) - args.minibatch_size = int(args.batch_size // args.n_minibatch) - # fmt: on - return args - - -def layer_init(layer, std=np.sqrt(2), bias_const=0.0): - torch.nn.init.orthogonal_(layer.weight, std) - torch.nn.init.constant_(layer.bias, bias_const) - return layer - - -class Agent(nn.Module): - def __init__(self, envs): - super(Agent, self).__init__() - self.critic = nn.Sequential( - layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 1), std=1.0), - ) - self.actor_mean = nn.Sequential( - layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01), - ) - self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape))) - - def get_action_and_value(self, x, action=None): - action_mean = self.actor_mean(x) - action_logstd = self.actor_logstd.expand_as(action_mean) - action_std = torch.exp(action_logstd) - probs = Normal(action_mean, action_std) - if action is None: - action = probs.sample() - return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x) - - def get_value(self, x): - return self.critic(x) - -import time -from collections import deque -import numpy as np -import gym - - -class RecordEpisodeStatistics(gym.wrappers.RecordEpisodeStatistics): - """ - have to override `self.episode_returns += np.array(rewards)` - to `self.episode_returns += rewards` - because jax would modify the `self.episode_returns` to be a - jax array. 
- """ - - def step(self, action): - observations, rewards, dones, infos = super(RecordEpisodeStatistics, self).step( - action - ) - self.episode_returns += np.array(rewards) - self.episode_lengths += 1 - if not self.is_vector_env: - infos = [infos] - dones = [dones] - for i in range(len(dones)): - if dones[i]: - infos[i] = infos[i].copy() - episode_return = self.episode_returns[i] - episode_length = self.episode_lengths[i] - episode_info = { - "r": episode_return, - "l": episode_length, - "t": round(time.time() - self.t0, 6), - } - infos[i]["episode"] = episode_info - self.return_queue.append(episode_return) - self.length_queue.append(episode_length) - self.episode_count += 1 - self.episode_returns[i] = 0 - self.episode_lengths[i] = 0 - return ( - observations, - rewards, - dones if self.is_vector_env else dones[0], - infos if self.is_vector_env else infos[0], - ) - - - -if __name__ == "__main__": - args = parse_args() - run_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - writer = SummaryWriter(f"runs/{run_name}") - writer.add_text( - "hyperparameters", - "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), - ) - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - # env setup - dummy_value = torch.ones(1, device=device) - envs = create_gym_env(args.gym_id, batch_size=args.num_envs) - envs.is_vector_env = True - envs = RecordEpisodeStatistics(envs) - envs = JaxToTorchWrapper(envs, device=device) - assert isinstance(envs.single_action_space, Box), "only continuous action space is supported" - - agent = Agent(envs).to(device) - optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) - if args.anneal_lr: - # https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/ppo2/defaults.py#L20 - lr = lambda f: f * args.learning_rate - - # ALGO Logic: Storage for epoch data - obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) - actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) - logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) - rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) - dones = torch.zeros((args.num_steps, args.num_envs)).to(device) - values = torch.zeros((args.num_steps, args.num_envs)).to(device) - - # TRY NOT TO MODIFY: start the game - global_step = 0 - start_time = time.time() - # Note how `next_obs` and `next_done` are used; their usage is equivalent to - # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60 - next_obs = torch.Tensor(envs.reset()).to(device) - next_done = torch.zeros(args.num_envs).to(device) - num_updates = args.total_timesteps // args.batch_size - for update in range(1, num_updates + 1): - # Annealing the rate if instructed to do so. 
- if args.anneal_lr: - frac = 1.0 - (update - 1.0) / num_updates - lrnow = lr(frac) - optimizer.param_groups[0]["lr"] = lrnow - - # TRY NOT TO MODIFY: prepare the execution of the game. - for step in range(0, args.num_steps): - global_step += 1 * args.num_envs - obs[step] = next_obs - dones[step] = next_done - - # ALGO LOGIC: put action logic here - with torch.no_grad(): - action, logproba, _, vs = agent.get_action_and_value(next_obs) - values[step] = vs.flatten() - - actions[step] = action - logprobs[step] = logproba - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rs, ds, infos = envs.step(action.cpu().numpy()) - next_obs = torch.Tensor(next_obs).to(device) - rewards[step], next_done = torch.tensor(rs).to(device).view(-1), torch.Tensor(ds).to(device) - - for info in infos: - if "episode" in info.keys(): - print(f"global_step={global_step}, episode_reward={info['episode']['r']}") - writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) - break - - # bootstrap reward if not done. reached the batch limit - with torch.no_grad(): - last_value = agent.get_value(next_obs.to(device)).reshape(1, -1) - if args.gae: - advantages = torch.zeros_like(rewards).to(device) - lastgaelam = 0 - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - nextvalues = last_value - else: - nextnonterminal = 1.0 - dones[t + 1] - nextvalues = values[t + 1] - delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] - advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam - returns = advantages + values - else: - returns = torch.zeros_like(rewards).to(device) - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - next_return = last_value - else: - nextnonterminal = 1.0 - dones[t + 1] - next_return = returns[t + 1] - returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return - advantages = returns - values - - # flatten the batch - b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) - b_logprobs = logprobs.reshape(-1) - b_actions = actions.reshape((-1,) + envs.single_action_space.shape) - b_advantages = advantages.reshape(-1) - b_returns = returns.reshape(-1) - b_values = values.reshape(-1) - - # Optimizaing the policy and value network - inds = np.arange( - args.batch_size, - ) - for i_epoch_pi in range(args.update_epochs): - np.random.shuffle(inds) - for start in range(0, args.batch_size, args.minibatch_size): - end = start + args.minibatch_size - minibatch_ind = inds[start:end] - mb_advantages = b_advantages[minibatch_ind] - if args.norm_adv: - mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) - - _, newlogproba, entropy, new_values = agent.get_action_and_value( - b_obs[minibatch_ind], b_actions[minibatch_ind] - ) - ratio = (newlogproba - b_logprobs[minibatch_ind]).exp() - - # calculate approx_kl http://joschu.net/blog/kl-approx.html - with torch.no_grad(): - log_ratio = newlogproba - b_logprobs[minibatch_ind] - approx_kl = ((log_ratio.exp() - 1) - log_ratio).mean() - - # Policy loss - pg_loss1 = -mb_advantages * ratio - pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) - pg_loss = torch.max(pg_loss1, pg_loss2).mean() - entropy_loss = entropy.mean() - - # Value loss - new_values = new_values.view(-1) - if args.clip_vloss: - v_loss_unclipped = (new_values - b_returns[minibatch_ind]) ** 2 - v_clipped = 
b_values[minibatch_ind] + torch.clamp( - new_values - b_values[minibatch_ind], - -args.clip_coef, - args.clip_coef, - ) - v_loss_clipped = (v_clipped - b_returns[minibatch_ind]) ** 2 - v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) - v_loss = 0.5 * v_loss_max.mean() - else: - v_loss = 0.5 * ((new_values - b_returns[minibatch_ind]) ** 2).mean() - - loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef - - optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) - optimizer.step() - - if args.kle_stop: - if approx_kl > args.target_kl: - break - - # TRY NOT TO MODIFY: record rewards for plotting purposes - writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) - writer.add_scalar("losses/value_loss", v_loss.item(), global_step) - writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) - writer.add_scalar("losses/entropy", entropy.mean().item(), global_step) - writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step) - print("SPS:", int(global_step / (time.time() - start_time))) - writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) - - envs.close() - writer.close() diff --git a/cleanrl/brax/ppo_continuous_action.py b/cleanrl/brax/ppo_continuous_action.py deleted file mode 100644 index 9f6eb0a91..000000000 --- a/cleanrl/brax/ppo_continuous_action.py +++ /dev/null @@ -1,408 +0,0 @@ -import argparse -import os -import random -import time -from distutils.util import strtobool - -import gym -import numpy as np -import pybullet_envs # fmt: off -import torch -import torch.nn as nn -import torch.optim as optim -from gym.spaces import Box -from gym.wrappers import Monitor -from torch.distributions.normal import Normal -from torch.utils.tensorboard import SummaryWriter - - -def parse_args(): - # fmt: off - parser = argparse.ArgumentParser(description='PPO agent') - parser.add_argument('--exp-name', type=str, default=os.path.basename(__file__).rstrip(".py"), - help='the name of this experiment') - parser.add_argument('--gym-id', type=str, default="HalfCheetahBulletEnv-v0", - help='the id of the gym environment') - parser.add_argument('--learning-rate', type=float, default=3e-4, - help='the learning rate of the optimizer') - parser.add_argument('--seed', type=int, default=1, - help='seed of the experiment') - parser.add_argument('--total-timesteps', type=int, default=2000000, - help='total timesteps of the experiments') - parser.add_argument('--torch-deterministic', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, `torch.backends.cudnn.deterministic=False`') - parser.add_argument('--cuda', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='if toggled, cuda will not be enabled by default') - parser.add_argument('--track', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='run the script in production mode and use wandb to log outputs') - parser.add_argument('--capture-video', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='weather to capture videos of the agent performances (check out `videos` folder)') - parser.add_argument('--wandb-project-name', type=str, default="cleanRL", - help="the wandb's project name") - parser.add_argument('--wandb-entity', type=str, default=None, - help="the entity (team) of wandb's project") - - # Algorithm specific arguments - parser.add_argument('--n-minibatch', type=int, 
default=32, - help='the number of mini batch') - parser.add_argument('--num-envs', type=int, default=1, - help='the number of parallel game environment') - parser.add_argument('--num-steps', type=int, default=2048, - help='the number of steps per game environment') - parser.add_argument('--gamma', type=float, default=0.99, - help='the discount factor gamma') - parser.add_argument('--gae-lambda', type=float, default=0.95, - help='the lambda for the general advantage estimation') - parser.add_argument('--ent-coef', type=float, default=0.0, - help="coefficient of the entropy") - parser.add_argument('--vf-coef', type=float, default=0.5, - help="coefficient of the value function") - parser.add_argument('--max-grad-norm', type=float, default=0.5, - help='the maximum norm for the gradient clipping') - parser.add_argument('--clip-coef', type=float, default=0.2, - help="the surrogate clipping coefficient") - parser.add_argument('--update-epochs', type=int, default=10, - help="the K epochs to update the policy") - parser.add_argument('--kle-stop', type=lambda x:bool(strtobool(x)), default=False, nargs='?', const=True, - help='If toggled, the policy updates will be early stopped w.r.t target-kl') - parser.add_argument('--target-kl', type=float, default=0.03, - help='the target-kl variable that is referred by --kl') - parser.add_argument('--gae', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Use GAE for advantage computation') - parser.add_argument('--norm-adv', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggles advantages normalization") - parser.add_argument('--anneal-lr', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help="Toggle learning rate annealing for policy and value networks") - parser.add_argument('--clip-vloss', type=lambda x:bool(strtobool(x)), default=True, nargs='?', const=True, - help='Toggles wheter or not to use a clipped loss for the value function, as per the paper.') - args = parser.parse_args() - if not args.seed: - args.seed = int(time.time()) - args.batch_size = int(args.num_envs * args.num_steps) - args.minibatch_size = int(args.batch_size // args.n_minibatch) - # fmt: on - return args - - -# taken from https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py -class RunningMeanStd(object): - def __init__(self, epsilon=1e-4, shape=()): - self.mean = np.zeros(shape, "float64") - self.var = np.ones(shape, "float64") - self.count = epsilon - - def update(self, x): - batch_mean = np.mean([x], axis=0) - batch_var = np.var([x], axis=0) - batch_count = 1 - self.update_from_moments(batch_mean, batch_var, batch_count) - - def update_from_moments(self, batch_mean, batch_var, batch_count): - self.mean, self.var, self.count = update_mean_var_count_from_moments( - self.mean, self.var, self.count, batch_mean, batch_var, batch_count - ) - - -def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): - delta = batch_mean - mean - tot_count = count + batch_count - - new_mean = mean + delta * batch_count / tot_count - m_a = var * count - m_b = batch_var * batch_count - M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count - new_var = M2 / tot_count - new_count = tot_count - - return new_mean, new_var, new_count - - -class NormalizedEnv(gym.core.Wrapper): - def __init__(self, env, ob=True, ret=True, clipob=10.0, cliprew=10.0, gamma=0.99, epsilon=1e-8): - super(NormalizedEnv, self).__init__(env) - self.ob_rms = 
RunningMeanStd(shape=self.observation_space.shape) if ob else None - self.ret_rms = RunningMeanStd(shape=(1,)) if ret else None - self.clipob = clipob - self.cliprew = cliprew - self.ret = np.zeros(()) - self.gamma = gamma - self.epsilon = epsilon - - def step(self, action): - obs, rews, dones, infos = self.env.step(action) - self.ret = self.ret * self.gamma + rews - obs = self._obfilt(obs) - if self.ret_rms: - self.ret_rms.update(np.array([self.ret].copy())) - rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) - self.ret = self.ret * (1 - float(dones)) - return obs, rews, dones, infos - - def _obfilt(self, obs): - if self.ob_rms: - self.ob_rms.update(obs) - obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) - return obs - else: - return obs - - def reset(self): - self.ret = np.zeros(()) - obs = self.env.reset() - return self._obfilt(obs) - - -class ClipActionsWrapper(gym.Wrapper): - def step(self, action): - import numpy as np - - action = np.nan_to_num(action) - action = np.clip(action, self.action_space.low, self.action_space.high) - return self.env.step(action) - - -def make_env(gym_id, seed, idx, capture_video, run_name): - def thunk(): - env = gym.make(gym_id) - env = ClipActionsWrapper(env) - env = gym.wrappers.RecordEpisodeStatistics(env) - if args.capture_video: - if idx == 0: - env = Monitor(env, f"videos/{run_name}") - env = NormalizedEnv(env) - env.seed(seed) - env.action_space.seed(seed) - env.observation_space.seed(seed) - return env - - return thunk - - -def layer_init(layer, std=np.sqrt(2), bias_const=0.0): - torch.nn.init.orthogonal_(layer.weight, std) - torch.nn.init.constant_(layer.bias, bias_const) - return layer - - -class Agent(nn.Module): - def __init__(self, envs): - super(Agent, self).__init__() - self.critic = nn.Sequential( - layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 1), std=1.0), - ) - self.actor_mean = nn.Sequential( - layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)), - nn.Tanh(), - layer_init(nn.Linear(64, 64)), - nn.Tanh(), - layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01), - ) - self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape))) - - def get_action_and_value(self, x, action=None): - action_mean = self.actor_mean(x) - action_logstd = self.actor_logstd.expand_as(action_mean) - action_std = torch.exp(action_logstd) - probs = Normal(action_mean, action_std) - if action is None: - action = probs.sample() - return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x) - - def get_value(self, x): - return self.critic(x) - - -if __name__ == "__main__": - args = parse_args() - run_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - writer = SummaryWriter(f"runs/{run_name}") - writer.add_text( - "hyperparameters", - "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), - ) - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - 
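The `RunningMeanStd` update above merges per-batch moments using the parallel-variance (Chan) formula, which is what lets `NormalizedEnv` maintain observation and return statistics online. A minimal sketch that checks the merge against the concatenated data; `merge_moments` is just a hypothetical renaming of `update_mean_var_count_from_moments`:

```python
import numpy as np

def merge_moments(mean, var, count, b_mean, b_var, b_count):
    """Combine running (mean, var, count) with a new batch's moments."""
    delta = b_mean - mean
    tot = count + b_count
    new_mean = mean + delta * b_count / tot
    m2 = var * count + b_var * b_count + np.square(delta) * count * b_count / tot
    return new_mean, m2 / tot, tot

# Merging two batches should reproduce the statistics of their concatenation.
a, b = np.random.randn(100), np.random.randn(50) + 3.0
m, v, c = merge_moments(a.mean(), a.var(), len(a), b.mean(), b.var(), len(b))
both = np.concatenate([a, b])
assert np.allclose([m, v], [both.mean(), both.var()])
```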
torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - # env setup - envs = gym.vector.SyncVectorEnv( - [make_env(args.gym_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)] - ) - assert isinstance(envs.single_action_space, Box), "only continuous action space is supported" - - agent = Agent(envs).to(device) - optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5) - if args.anneal_lr: - # https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/ppo2/defaults.py#L20 - lr = lambda f: f * args.learning_rate - - # ALGO Logic: Storage for epoch data - obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device) - actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device) - logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device) - rewards = torch.zeros((args.num_steps, args.num_envs)).to(device) - dones = torch.zeros((args.num_steps, args.num_envs)).to(device) - values = torch.zeros((args.num_steps, args.num_envs)).to(device) - - # TRY NOT TO MODIFY: start the game - global_step = 0 - start_time = time.time() - # Note how `next_obs` and `next_done` are used; their usage is equivalent to - # https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/84a7582477fb0d5c82ad6d850fe476829dddd2e1/a2c_ppo_acktr/storage.py#L60 - next_obs = torch.Tensor(envs.reset()).to(device) - next_done = torch.zeros(args.num_envs).to(device) - num_updates = args.total_timesteps // args.batch_size - for update in range(1, num_updates + 1): - # Annealing the rate if instructed to do so. - if args.anneal_lr: - frac = 1.0 - (update - 1.0) / num_updates - lrnow = lr(frac) - optimizer.param_groups[0]["lr"] = lrnow - - # TRY NOT TO MODIFY: prepare the execution of the game. - for step in range(0, args.num_steps): - global_step += 1 * args.num_envs - obs[step] = next_obs - dones[step] = next_done - - # ALGO LOGIC: put action logic here - with torch.no_grad(): - action, logproba, _, vs = agent.get_action_and_value(next_obs) - values[step] = vs.flatten() - - actions[step] = action - logprobs[step] = logproba - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rs, ds, infos = envs.step(action.cpu().numpy()) - next_obs = torch.Tensor(next_obs).to(device) - rewards[step], next_done = torch.tensor(rs).to(device).view(-1), torch.Tensor(ds).to(device) - - for info in infos: - if "episode" in info.keys(): - print(f"global_step={global_step}, episode_reward={info['episode']['r']}") - writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) - break - - # bootstrap reward if not done. 
reached the batch limit - with torch.no_grad(): - last_value = agent.get_value(next_obs.to(device)).reshape(1, -1) - if args.gae: - advantages = torch.zeros_like(rewards).to(device) - lastgaelam = 0 - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - nextvalues = last_value - else: - nextnonterminal = 1.0 - dones[t + 1] - nextvalues = values[t + 1] - delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t] - advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam - returns = advantages + values - else: - returns = torch.zeros_like(rewards).to(device) - for t in reversed(range(args.num_steps)): - if t == args.num_steps - 1: - nextnonterminal = 1.0 - next_done - next_return = last_value - else: - nextnonterminal = 1.0 - dones[t + 1] - next_return = returns[t + 1] - returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return - advantages = returns - values - - # flatten the batch - b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) - b_logprobs = logprobs.reshape(-1) - b_actions = actions.reshape((-1,) + envs.single_action_space.shape) - b_advantages = advantages.reshape(-1) - b_returns = returns.reshape(-1) - b_values = values.reshape(-1) - - # Optimizaing the policy and value network - inds = np.arange( - args.batch_size, - ) - for i_epoch_pi in range(args.update_epochs): - np.random.shuffle(inds) - for start in range(0, args.batch_size, args.minibatch_size): - end = start + args.minibatch_size - minibatch_ind = inds[start:end] - mb_advantages = b_advantages[minibatch_ind] - if args.norm_adv: - mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8) - - _, newlogproba, entropy, new_values = agent.get_action_and_value( - b_obs[minibatch_ind], b_actions[minibatch_ind] - ) - ratio = (newlogproba - b_logprobs[minibatch_ind]).exp() - - # calculate approx_kl http://joschu.net/blog/kl-approx.html - with torch.no_grad(): - log_ratio = newlogproba - b_logprobs[minibatch_ind] - approx_kl = ((log_ratio.exp() - 1) - log_ratio).mean() - - # Policy loss - pg_loss1 = -mb_advantages * ratio - pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef) - pg_loss = torch.max(pg_loss1, pg_loss2).mean() - entropy_loss = entropy.mean() - - # Value loss - new_values = new_values.view(-1) - if args.clip_vloss: - v_loss_unclipped = (new_values - b_returns[minibatch_ind]) ** 2 - v_clipped = b_values[minibatch_ind] + torch.clamp( - new_values - b_values[minibatch_ind], - -args.clip_coef, - args.clip_coef, - ) - v_loss_clipped = (v_clipped - b_returns[minibatch_ind]) ** 2 - v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) - v_loss = 0.5 * v_loss_max.mean() - else: - v_loss = 0.5 * ((new_values - b_returns[minibatch_ind]) ** 2).mean() - - loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef - - optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm) - optimizer.step() - - if args.kle_stop: - if approx_kl > args.target_kl: - break - - # TRY NOT TO MODIFY: record rewards for plotting purposes - writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step) - writer.add_scalar("losses/value_loss", v_loss.item(), global_step) - writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step) - writer.add_scalar("losses/entropy", entropy.mean().item(), global_step) - writer.add_scalar("losses/approx_kl", approx_kl.item(), 
global_step) - print("SPS:", int(global_step / (time.time() - start_time))) - writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) - - envs.close() - writer.close() diff --git a/cleanrl/brax/pyproject.toml b/cleanrl/brax/pyproject.toml deleted file mode 100644 index d0a30c9c6..000000000 --- a/cleanrl/brax/pyproject.toml +++ /dev/null @@ -1,14 +0,0 @@ -[tool.poetry] -name = "brax-cleanrl" -version = "0.1.0" -description = "" -authors = ["Costa Huang "] - -[tool.poetry.dependencies] -python = "^3.9" - -[tool.poetry.dev-dependencies] - -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" diff --git a/cleanrl/brax/readme.md b/cleanrl/brax/readme.md deleted file mode 100644 index dfafe930a..000000000 --- a/cleanrl/brax/readme.md +++ /dev/null @@ -1,7 +0,0 @@ -This folder contains experimental script for brax - -```bash -python old_ppo_that_works.py --capture-video --track --wandb-project brax --gym-id HalfCheetahBulletEnv-v0 -python ppo_continuous_action.py --capture-video --track --wandb-project brax --gym-id HalfCheetahBulletEnv-v0 --cuda False -python ppo_brax_test.py --track --wandb-project brax --cuda False -``` diff --git a/cleanrl/brax/requirements.txt b/cleanrl/brax/requirements.txt deleted file mode 100644 index cbe31889f..000000000 --- a/cleanrl/brax/requirements.txt +++ /dev/null @@ -1,89 +0,0 @@ -absl-py==0.12.0 -astunparse==1.6.3 -attrs==21.2.0 -brax==0.0.4 -cachetools==4.2.2 -certifi==2021.5.30 -charset-normalizer==2.0.4 -chex==0.0.8 -clang==5.0 -click==8.0.1 -cloudpickle==1.6.0 -clu==0.0.6 -configparser==5.0.2 -contextlib2==21.6.0 -cycler==0.10.0 -dataclasses==0.6 -decorator==5.0.9 -dill==0.3.4 -dm-tree==0.1.6 -docker-pycreds==0.4.0 -flatbuffers==1.12 -flax==0.3.4 -future==0.18.2 -gast==0.4.0 -gitdb==4.0.7 -GitPython==3.1.18 -google-auth==1.35.0 -google-auth-oauthlib==0.4.5 -google-pasta==0.2.0 -googleapis-common-protos==1.53.0 -grpcio==1.39.0 -gym @ git+https://github.com/openai/gym.git@2853ce4797a4dadb1f8702c8e35143d9fbd43ea6 -h5py==3.1.0 -idna==3.2 -jax==0.2.19 -jaxlib==0.1.70 -keras==2.6.0 -Keras-Preprocessing==1.1.2 -kiwisolver==1.3.1 -Markdown==3.3.4 -matplotlib==3.4.3 -ml-collections==0.1.0 -msgpack==1.0.2 -numpy==1.19.5 -oauthlib==3.1.1 -opt-einsum==3.3.0 -optax==0.0.9 -packaging==21.0 -pandas==1.3.2 -pathtools==0.1.2 -Pillow==8.3.1 -promise==2.3 -protobuf==3.17.3 -psutil==5.8.0 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pybullet==3.1.7 -pyglet==1.5.19 -pyparsing==2.4.7 -python-dateutil==2.8.2 -pytz==2021.1 -PyYAML==5.4.1 -requests==2.26.0 -requests-oauthlib==1.3.0 -rsa==4.7.2 -scipy==1.7.1 -sentry-sdk==1.3.1 -shortuuid==1.0.1 -six==1.15.0 -smmap==4.0.0 -stable-baselines3==1.1.0 -subprocess32==3.5.4 -tensorboard==2.6.0 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.0 -tensorflow==2.6.0 -tensorflow-datasets==4.4.0 -tensorflow-estimator==2.6.0 -tensorflow-metadata==1.2.0 -termcolor==1.1.0 -tfp-nightly==0.13.0.dev20210422 -toolz==0.11.1 -torch==1.9.0 -tqdm==4.62.1 -typing-extensions==3.7.4.3 -urllib3==1.26.6 -wandb==0.12.0 -Werkzeug==2.0.1 -wrapt==1.12.1
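All of the removed PPO variants share the same clipped surrogate objective and (optionally clipped) value loss. Factored out of the minibatch loop, the computation looks like the sketch below; the function and its signature are illustrative reference code, not part of any deleted file.

```python
import torch

def ppo_losses(newlogprob, oldlogprob, advantages, new_values, old_values, returns,
               clip_coef=0.2, clip_vloss=True):
    """Clipped policy loss, value loss, and the approx-KL diagnostic used for early stopping."""
    log_ratio = newlogprob - oldlogprob
    ratio = log_ratio.exp()
    # k3 estimator from http://joschu.net/blog/kl-approx.html
    approx_kl = ((ratio - 1) - log_ratio).mean()

    # Clipped surrogate: take the pessimistic (larger) of the two losses.
    pg_loss1 = -advantages * ratio
    pg_loss2 = -advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
    pg_loss = torch.max(pg_loss1, pg_loss2).mean()

    if clip_vloss:
        v_unclipped = (new_values - returns) ** 2
        v_clipped = old_values + torch.clamp(new_values - old_values, -clip_coef, clip_coef)
        v_loss = 0.5 * torch.max(v_unclipped, (v_clipped - returns) ** 2).mean()
    else:
        v_loss = 0.5 * ((new_values - returns) ** 2).mean()
    return pg_loss, v_loss, approx_kl
```

The scripts then combine these terms as `loss = pg_loss - ent_coef * entropy_loss + vf_coef * v_loss`, clip gradients to `max_grad_norm`, and step Adam, optionally breaking out of the update epochs once `approx_kl` exceeds `target_kl`.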