Add Atari
werner-duvaud committed Apr 1, 2020
1 parent f706376 commit 6306b5d
Showing 13 changed files with 371 additions and 75 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,6 @@ MuZero is a model based reinforcement learning algorithm, successor of AlphaZero
* [x] Easily adaptable for new games
* [x] [Examples](https://github.com/werner-duvaud/muzero-general/blob/master/games/cartpole.py) of board and Gym games (See [list of implemented games](https://github.com/werner-duvaud/muzero-general#games-already-implemented))
* [x] [Pretrained weights](https://github.com/werner-duvaud/muzero-general/tree/master/results) available
-* [ ] Atari games
* [ ] Appendix Reanalyse of the paper
* [ ] Windows support ([workaround by ihexx](https://github.com/ihexx/muzero-general) or use the [notebook](https://github.com/werner-duvaud/muzero-general/blob/master/notebook.ipynb) in Google Colab)

@@ -44,6 +43,7 @@ Testing Lunar Lander :
* Tic-tac-toe (Tested with the fully connected network and the residual network)
* Connect4 (Slightly tested with the residual network)
* Gomoku
+* Atari Breakout

Tests are done on Ubuntu with 16 GB RAM / Intel i7 / GTX 1050Ti Max-Q. We make sure the agent progresses and reaches a level that shows it has learned, but it does not systematically reach human level. For some environments, we notice a regression after a certain time. The proposed configurations are certainly not optimal and we have not yet focused on hyperparameter optimization. Any help is welcome.

4 changes: 2 additions & 2 deletions games/abstract_game.py
@@ -72,7 +72,7 @@ def render(self):
pass

@abstractmethod
-def human_action(self):
+def human_to_action(self):
"""
For multiplayer games, ask the user for a legal action
and return the corresponding action number.
@@ -88,7 +88,7 @@ def human_action(self):
return int(choice)

@abstractmethod
-def print_action(self, action_number):
+def action_to_string(self, action_number):
"""
Convert an action number to a string representing the action.
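The two renamed hooks above are the console-play entry points of the `AbstractGame` interface. As a hedged illustration (not part of this commit, and `play_one_human_move` is an invented name, not a function from this repository), a caller might use them like this:

```python
# Hypothetical usage sketch of the renamed AbstractGame hooks.
# human_to_action() maps user input to an action number;
# action_to_string() maps an action number back to readable text.
def play_one_human_move(game):
    game.render()
    action = game.human_to_action()                 # ask the user for a legal action
    print("Played:", game.action_to_string(action))
    observation, reward, done = game.step(action)   # step() returns (observation, reward, done)
    return observation, reward, done
```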
199 changes: 199 additions & 0 deletions games/breakout.py
@@ -0,0 +1,199 @@
import datetime
import os

import cv2
import gym
import numpy
import torch

from .abstract_game import AbstractGame


class MuZeroConfig:
    def __init__(self):
        self.seed = 0 # Seed for numpy, torch and the game

        ### Game
        self.observation_shape = (3, 96, 96) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
        self.action_space = [i for i in range(4)] # Fixed list of all possible actions. You should only edit the length
        self.players = [i for i in range(1)] # List of players. You should only edit the length
        self.stacked_observations = 2 # Number of previous observations and previous actions to add to the current observation

        ### Self-Play
        self.num_actors = 2 # Number of simultaneous threads self-playing to feed the replay buffer
        self.max_moves = 500 # Maximum number of moves if game is not finished before
        self.num_simulations = 20 # Number of future moves self-simulated
        self.discount = 0.997 # Chronological discount of the reward
        self.temperature_threshold = 500 # Number of moves before dropping temperature to 0 (ie playing according to the max)

        # Root prior exploration noise
        self.root_dirichlet_alpha = 0.25
        self.root_exploration_fraction = 0.25

        # UCB formula
        self.pb_c_base = 19652
        self.pb_c_init = 1.25

        ### Network
        self.network = "resnet" # "resnet" / "fullyconnected"
        self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size

        # Residual Network
        self.downsample = True # Downsample observations before representation network (See paper appendix Network Architecture)
        self.blocks = 1 # Number of blocks in the ResNet
        self.channels = 128 # Number of channels in the ResNet
        self.reduced_channels = 16 # Number of channels before heads of dynamic and prediction networks
        self.resnet_fc_reward_layers = [16] # Define the hidden layers in the reward head of the dynamic network
        self.resnet_fc_value_layers = [16] # Define the hidden layers in the value head of the prediction network
        self.resnet_fc_policy_layers = [16] # Define the hidden layers in the policy head of the prediction network

        # Fully Connected Network
        self.encoding_size = 10
        self.fc_reward_layers = [16] # Define the hidden layers in the reward network
        self.fc_value_layers = [] # Define the hidden layers in the value network
        self.fc_policy_layers = [] # Define the hidden layers in the policy network
        self.fc_representation_layers = [] # Define the hidden layers in the representation network
        self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network

        ### Training
        self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
        self.training_steps = 5000 # Total number of training steps (ie weights update according to a batch)
        self.batch_size = 128 # Number of parts of games to train on at each training step
        self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
        self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
        self.window_size = 1000 # Number of self-play games to keep in the replay buffer
        self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
        self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
        self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

        self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
        self.weight_decay = 1e-4 # L2 weights regularization
        self.momentum = 0.9 # Used only if optimizer is SGD

        # Prioritized Replay (See paper appendix Training)
        self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network
        self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
        self.PER_beta = 1.0

        # Exponential learning rate schedule
        self.lr_init = 0.05 # Initial learning rate
        self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
        self.lr_decay_steps = 1000

        ## Adjust the self play / training ratio to avoid over/underfitting
        self.self_play_delay = 0 # Number of seconds to wait after each played game
        self.training_delay = 0 # Number of seconds to wait after each training step
        self.ratio = None # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it


    def visit_softmax_temperature_fn(self, trained_steps):
        """
        Parameter to alter the visit count distribution to ensure that the action selection becomes greedier as training progresses.
        The smaller it is, the more likely the best action (ie with the highest visit count) is chosen.

        Returns:
            Positive float.
        """
        if trained_steps < 0.5 * self.training_steps:
            return 1.0
        elif trained_steps < 0.75 * self.training_steps:
            return 0.5
        else:
            return 0.25


class Game(AbstractGame):
    """
    Game wrapper.
    """

    def __init__(self, seed=None):
        self.env = gym.make("Breakout-v4")
        if seed is not None:
            self.env.seed(seed)

    def step(self, action):
        """
        Apply action to the game.

        Args:
            action : action of the action_space to take.

        Returns:
            The new observation, the reward and a boolean if the game has ended.
        """
        observation, reward, done, _ = self.env.step(action)
        observation = cv2.resize(observation, (96, 96), interpolation=cv2.INTER_AREA)
        observation = numpy.asarray(observation, dtype=numpy.float32) / 255.0
        observation = numpy.moveaxis(observation, -1, 0)
        return observation, reward, done

    def to_play(self):
        """
        Return the current player.

        Returns:
            The current player, it should be an element of the players list in the config.
        """
        return 0

    def legal_actions(self):
        """
        Should return the legal actions at each turn, if it is not available, it can return
        the whole action space. At each turn, the game has to be able to handle one of the returned actions.

        For complex games where calculating legal moves is too long, the idea is to define the legal actions
        equal to the action space but to return a negative reward if the action is illegal.

        Returns:
            An array of integers, subset of the action space.
        """
        return [i for i in range(4)]

    def reset(self):
        """
        Reset the game for a new game.

        Returns:
            Initial observation of the game.
        """
        observation = self.env.reset()
        observation = cv2.resize(observation, (96, 96), interpolation=cv2.INTER_AREA)
        observation = numpy.asarray(observation, dtype=numpy.float32) / 255.0
        observation = numpy.moveaxis(observation, -1, 0)
        return observation

    def close(self):
        """
        Properly close the game.
        """
        self.env.close()

    def render(self):
        """
        Display the game observation.
        """
        self.env.render()
        input("Press enter to take a step ")

    def human_to_action(self):
        """
        For multiplayer games, ask the user for a legal action
        and return the corresponding action number.

        Returns:
            An integer from the action space.
        """
        pass

    def action_to_string(self, action_number):
        """
        Convert an action number to a string representing the action.

        Args:
            action_number: an integer from the action space.

        Returns:
            String representing the action.
        """
        return str(action_number)
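A short, hedged usage sketch of the `Game` wrapper defined above, handy for checking the observation preprocessing (resize to 96×96, scale to [0, 1], channels-first). It assumes `gym` is installed with the Atari environments available and that the `games` package is importable from the repository root; it is not part of the commit:

```python
# Hypothetical smoke test for games/breakout.py.
import numpy

from games.breakout import Game

game = Game(seed=0)
observation = game.reset()
assert observation.shape == (3, 96, 96)                       # channels-first, resized frame
assert 0.0 <= observation.min() <= observation.max() <= 1.0   # pixels scaled by 255

done = False
total_reward = 0
while not done:
    action = numpy.random.choice(game.legal_actions())   # random policy
    observation, reward, done = game.step(action)
    total_reward += reward

game.close()
print("Random-play return:", total_reward)
```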
14 changes: 7 additions & 7 deletions games/cartpole.py
@@ -14,10 +14,10 @@ def __init__(self):


### Game
-self.observation_shape = (1, 1, 4) # Dimensions of the game observation, must be 3D. For a 1D array, please reshape it to (1, 1, length of array)
+self.observation_shape = (1, 1, 4) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(2)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(1)] # List of players. You should only edit the length
-self.stacked_observations = 0 # Number of previous observation to add to the current observation
+self.stacked_observations = 0 # Number of previous observation and previous actions to add to the current observation


### Self-Play
@@ -44,9 +44,9 @@ def __init__(self):
self.blocks = 1 # Number of blocks in the ResNet
self.channels = 2 # Number of channels in the ResNet
self.reduced_channels = 2 # Number of channels before heads of dynamic and prediction networks
-self.fc_reward_layers = [] # Define the hidden layers in the reward head of the dynamic network
-self.fc_value_layers = [] # Define the hidden layers in the value head of the prediction network
-self.fc_policy_layers = [] # Define the hidden layers in the policy head of the prediction network
+self.resnet_fc_reward_layers = [] # Define the hidden layers in the reward head of the dynamic network
+self.resnet_fc_value_layers = [] # Define the hidden layers in the value head of the prediction network
+self.resnet_fc_policy_layers = [] # Define the hidden layers in the policy head of the prediction network

# Fully Connected Network
self.encoding_size = 8
@@ -172,7 +172,7 @@ def render(self):
self.env.render()
input("Press enter to take a step ")

-def human_action(self):
+def human_to_action(self):
"""
For multiplayer games, ask the user for a legal action
and return the corresponding action number.
Expand All @@ -182,7 +182,7 @@ def human_action(self):
"""
pass

-def print_action(self, action_number):
+def action_to_string(self, action_number):
"""
Convert an action number to a string representing the action.
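The updated `observation_shape` comment spells out the (channel, height, width) convention; for 1D environments like CartPole the flat observation has to be wrapped into that 3D shape. A minimal sketch, illustrative only, since the repository's own wrapper may do this slightly differently:

```python
import numpy

# CartPole observations are flat arrays of length 4; the config above
# declares observation_shape = (1, 1, 4), i.e. one channel, height 1, width 4.
flat = numpy.array([0.01, -0.02, 0.03, 0.04], dtype=numpy.float32)
observation = flat.reshape(1, 1, -1)   # or numpy.array([[flat]])
print(observation.shape)               # (1, 1, 4)
```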
33 changes: 17 additions & 16 deletions games/connect4.py
@@ -14,10 +14,10 @@ def __init__(self):


### Game
-self.observation_shape = (3, 6, 7) # Dimensions of the game observation, must be 3D. For a 1D array, please reshape it to (1, 1, length of array)
+self.observation_shape = (3, 6, 7) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(7)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(2)] # List of players. You should only edit the length
-self.stacked_observations = 0 # Number of previous observation to add to the current observation
+self.stacked_observations = 0 # Number of previous observation and previous actions to add to the current observation


### Self-Play
@@ -41,12 +41,13 @@ def __init__(self):
self.support_size = 10 # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size

# Residual Network
+self.downsample = False # Downsample observations before representation network (See paper appendix Network Architecture)
self.blocks = 2 # Number of blocks in the ResNet
-self.channels = 8 # Number of channels in the ResNet
-self.reduced_channels = 8 # Number of channels before heads of dynamic and prediction networks
-self.resnet_fc_reward_layers = [] # Define the hidden layers in the reward head of the dynamic network
-self.resnet_fc_value_layers = [] # Define the hidden layers in the value head of the prediction network
-self.resnet_fc_policy_layers = [] # Define the hidden layers in the policy head of the prediction network
+self.channels = 16 # Number of channels in the ResNet
+self.reduced_channels = 16 # Number of channels before heads of dynamic and prediction networks
+self.resnet_fc_reward_layers = [8] # Define the hidden layers in the reward head of the dynamic network
+self.resnet_fc_value_layers = [8] # Define the hidden layers in the value head of the prediction network
+self.resnet_fc_policy_layers = [8] # Define the hidden layers in the policy head of the prediction network

# Fully Connected Network
self.encoding_size = 32
@@ -59,26 +60,26 @@ def __init__(self):

### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
-self.training_steps = 40000 # Total number of training steps (ie weights update according to a batch)
-self.batch_size = 128*3 # Number of parts of games to train on at each training step
-self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
+self.training_steps = 100000 # Total number of training steps (ie weights update according to a batch)
+self.batch_size = 64 # Number of parts of games to train on at each training step
+self.num_unroll_steps = 20 # Number of game moves to keep for every batch element
self.checkpoint_interval = 10 # Number of training steps before using the model for sef-playing
-self.window_size = 1000 # Number of self-play games to keep in the replay buffer
+self.window_size = 3000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
-self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
+self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD

# Prioritized Replay (See paper appendix Training)
-self.PER = False # Select in priority the elements in the replay buffer which are unexpected for the network
+self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
-self.lr_init = 0.1 # Initial learning rate
+self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000

@@ -168,7 +169,7 @@ def render(self):
def encode_board(self):
return self.env.encode_board()

-def human_action(self):
+def human_to_action(self):
"""
For multiplayer games, ask the user for a legal action
and return the corresponding action number.
Expand All @@ -181,7 +182,7 @@ def human_action(self):
choice = input("Enter another column : ")
return int(choice)

-def print_action(self, action_number):
+def action_to_string(self, action_number):
"""
Convert an action number to a string representing the action.
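Both this config and the new Breakout one keep `support_size = 10`, whose comment ("scaled (with almost sqrt) and encoded on a vector") refers to the invertible value/reward transform from the MuZero paper appendix. A sketch of that transform, assuming the paper's formula with ε = 0.001; the helper names below are illustrative, not the repository's:

```python
import math

EPS = 0.001  # epsilon from the MuZero paper appendix


def scale(x):
    # h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x  -- the "almost sqrt" compression
    return math.copysign(1, x) * (math.sqrt(abs(x) + 1) - 1) + EPS * x


def unscale(y):
    # Inverse transform h^{-1}(y), as given in the paper appendix.
    return math.copysign(1, y) * (
        ((math.sqrt(1 + 4 * EPS * (abs(y) + 1 + EPS)) - 1) / (2 * EPS)) ** 2 - 1
    )


# A raw return of 50 is compressed to about 6.19, which fits on the 21-bin
# support [-10, 10]; the scaled value is then split between the two nearest
# integer bins as a categorical target.
print(round(scale(50), 2), round(unscale(scale(50)), 2))   # 6.19 50.0
```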
8 changes: 4 additions & 4 deletions games/gomoku.py
@@ -15,10 +15,10 @@ def __init__(self):


### Game
-self.observation_shape = (3, 11, 11) # Dimensions of the game observation, must be 3. For a 1D array, please reshape it to (1, 1, length of array)
+self.observation_shape = (3, 11, 11) # Dimensions of the game observation, must be 3 (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(11 * 11)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(2)] # List of players. You should only edit the length
-self.stacked_observations = 2 # Number of previous observation to add to the current observation
+self.stacked_observations = 2 # Number of previous observation and previous actions to add to the current observation


### Self-Play
@@ -170,7 +170,7 @@ def render(self):
self.env.render()
input("Press enter to take a step ")

-def human_action(self):
+def human_to_action(self):
"""
For multiplayer games, ask the user for a legal action
and return the corresponding action number.
Expand All @@ -183,7 +183,7 @@ def human_action(self):
valid, action = self.env.human_input_to_action()
return action

-def print_action(self, action):
+def action_to_string(self, action):
"""
Convert an action number to a string representing the action.
Args:
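Each of these game configs also carries the exponential learning-rate block (`lr_init`, `lr_decay_rate`, `lr_decay_steps`); Connect4's initial rate drops from 0.1 to 0.01 above, while the new Breakout config uses 0.05 with a decay rate of 0.9. A sketch of the schedule these three values most plausibly describe, stated as an assumption consistent with the "Set it to 1 to use a constant learning rate" comment rather than a quote of the trainer code:

```python
def learning_rate(training_step, lr_init, lr_decay_rate, lr_decay_steps):
    # Exponential decay: the rate is multiplied by lr_decay_rate
    # once per lr_decay_steps training steps.
    return lr_init * lr_decay_rate ** (training_step / lr_decay_steps)

print(learning_rate(0, 0.05, 0.9, 1000))      # 0.05   (Breakout, start)
print(learning_rate(2000, 0.05, 0.9, 1000))   # 0.0405 (Breakout, after 2000 steps)
print(learning_rate(50000, 0.01, 1, 10000))   # 0.01   (Connect4, constant since the decay rate is 1)
```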