Add Atari

smanolloff · Apr 1, 2020 · 6306b5d · 6306b5d
1 parent f706376
commit 6306b5d
Show file tree

Hide file tree

Showing 13 changed files with 371 additions and 75 deletions.
diff --git a/README.md b/README.md
@@ -23,7 +23,6 @@ MuZero is a model based reinforcement learning algorithm, successor of AlphaZero
 * [x] Easily adaptable for new games
 * [x] [Examples](https://github.com/werner-duvaud/muzero-general/blob/master/games/cartpole.py) of board and Gym games (See [list of implemented games](https://github.com/werner-duvaud/muzero-general#games-already-implemented))
 * [x] [Pretrained weights](https://github.com/werner-duvaud/muzero-general/tree/master/results) available
-* [ ] Atari games
 * [ ] Appendix Reanalyse of the paper
 * [ ] Windows support ([workaround by ihexx](https://github.com/ihexx/muzero-general) or use the [notebook](https://github.com/werner-duvaud/muzero-general/blob/master/notebook.ipynb) in Google Colab)
 
@@ -44,6 +43,7 @@ Testing Lunar Lander :
 * Tic-tac-toe   (Tested with the fully connected network and the residual network)
 * Connect4      (Slightly tested with the residual network)
 * Gomoku
+* Atari Breakout
 
 Tests are done on Ubuntu with 16 GB RAM / Intel i7 / GTX 1050Ti Max-Q. We make sure to obtain a progression and a level which ensures that it has learned. But we do not systematically reach a human level. For certain environments, we notice a regression after a certain time. The proposed configurations are certainly not optimal and we do not focus for now on the optimization of hyperparameters. Any help is welcome.
 

diff --git a/games/abstract_game.py b/games/abstract_game.py
@@ -72,7 +72,7 @@ def render(self):
         pass
 
     @abstractmethod
-    def human_action(self):
+    def human_to_action(self):
         """
         For multiplayer games, ask the user for a legal action
         and return the corresponding action number.
@@ -88,7 +88,7 @@ def human_action(self):
         return int(choice)
 
     @abstractmethod
-    def print_action(self, action_number):
+    def action_to_string(self, action_number):
         """
         Convert an action number to a string representing the action.
 

diff --git a/games/breakout.py b/games/breakout.py
@@ -0,0 +1,199 @@
+import datetime
+import os
+
+import cv2
+import gym
+import numpy
+import torch
+
+from .abstract_game import AbstractGame
+
+
+class MuZeroConfig:
+    """Hyperparameters and settings used to train MuZero on the Breakout game defined in this file."""
+    def __init__(self):
+        self.seed = 0  # Seed shared by numpy, torch and the game
+
+        ### Game
+        self.observation_shape = (3, 96, 96)  # Shape of a game observation, always 3D (channel, height, width). Reshape a 1D array to (1, 1, length of array)
+        self.action_space = list(range(4))  # Fixed list of all possible actions. Only the length should be edited
+        self.players = list(range(1))  # List of players. Only the length should be edited
+        self.stacked_observations = 2  # How many previous observations and previous actions are added to the current observation
+
+        ### Self-Play
+        self.num_actors = 2  # How many threads self-play simultaneously to feed the replay buffer
+        self.max_moves = 500  # Upper bound on the number of moves when the game has not finished before
+        self.num_simulations = 20  # How many future moves are self-simulated
+        self.discount = 0.997  # Chronological discount applied to the reward
+        self.temperature_threshold = 500  # After this many moves the temperature drops to 0 (ie playing according to the max)
+
+        # Exploration noise on the root prior
+        self.root_dirichlet_alpha = 0.25
+        self.root_exploration_fraction = 0.25
+
+        # Constants of the UCB formula
+        self.pb_c_base = 19652
+        self.pb_c_init = 1.25
+
+        ### Network
+        self.network = "resnet"  # Either "resnet" or "fullyconnected"
+        self.support_size = 10  # Value and reward are scaled (with almost sqrt) and encoded on a vector ranging from -support_size to support_size
+
+        # Settings of the residual network
+        self.downsample = True  # Downsample observations ahead of the representation network (See paper appendix Network Architecture)
+        self.blocks = 1  # How many blocks the ResNet has
+        self.channels = 128  # How many channels the ResNet has
+        self.reduced_channels = 16  # How many channels come before the heads of the dynamic and prediction networks
+        self.resnet_fc_reward_layers = [16]  # Hidden layers of the reward head of the dynamic network
+        self.resnet_fc_value_layers = [16]  # Hidden layers of the value head of the prediction network
+        self.resnet_fc_policy_layers = [16]  # Hidden layers of the policy head of the prediction network
+
+        # Settings of the fully connected network
+        self.encoding_size = 10
+        self.fc_reward_layers = [16]  # Hidden layers of the reward network
+        self.fc_value_layers = []  # Hidden layers of the value network
+        self.fc_policy_layers = []  # Hidden layers of the policy network
+        self.fc_representation_layers = []  # Hidden layers of the representation network
+        self.fc_dynamics_layers = [16]  # Hidden layers of the dynamics network
+
+        ### Training
+        game_name = os.path.basename(__file__)[:-3]  # This file's name minus its last three characters (the ".py" extension)
+        timestamp = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
+        self.results_path = os.path.join(os.path.dirname(__file__), "../results", game_name, timestamp)  # Where the model weights and TensorBoard logs are stored
+        self.training_steps = 5000  # Total count of training steps (ie weights update according to a batch)
+        self.batch_size = 128  # How many parts of games are trained on at each training step
+        self.num_unroll_steps = 10  # How many game moves are kept for every batch element
+        self.checkpoint_interval = 10  # How many training steps pass before the model is used for self-playing
+        self.window_size = 1000  # How many self-play games the replay buffer keeps
+        self.td_steps = 50  # How many future steps are taken into account when calculating the target value
+        self.value_loss_weight = 1  # Scaling of the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
+        self.training_device = "cuda" if torch.cuda.is_available() else "cpu"  # Use the GPU for training when one is available
+
+        self.optimizer = "Adam"  # Either "Adam" or "SGD". Paper uses SGD
+        self.weight_decay = 1e-4  # L2 regularization of the weights
+        self.momentum = 0.9  # Only relevant when the optimizer is SGD
+
+        # Prioritized Replay (See paper appendix Training)
+        self.PER = True  # Pick in priority the replay buffer elements which are unexpected for the network
+        self.PER_alpha = 0.5  # Amount of prioritization, 0 corresponding to the uniform case, paper suggests 1
+        self.PER_beta = 1.0
+
+        # Learning rate following an exponential schedule
+        self.lr_init = 0.05  # Learning rate at the start
+        self.lr_decay_rate = 0.9  # A value of 1 gives a constant learning rate
+        self.lr_decay_steps = 1000
+
+        ## Tune the self play / training ratio to avoid over/underfitting
+        self.self_play_delay = 0  # Seconds to wait after each played game
+        self.training_delay = 0  # Seconds to wait after each training step
+        self.ratio = None  # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. None disables it
+
+    def visit_softmax_temperature_fn(self, trained_steps):
+        """
+        Temperature altering the visit count distribution: the smaller it is, the more likely the most visited action is chosen, and it shrinks as training progresses.
+
+        Returns:
+            Positive float (1.0, then 0.5, then 0.25 as trained_steps grows).
+        """
+        if trained_steps < 0.5 * self.training_steps:
+            return 1.0
+        if trained_steps < 0.75 * self.training_steps:
+            return 0.5
+        return 0.25
+
+
+class Game(AbstractGame):
+    """Game wrapper around the gym "Breakout-v4" environment."""
+
+    ACTION_NAMES = ("NOOP", "FIRE", "RIGHT", "LEFT")  # Breakout action meanings indexed by action number (see the ALE/gym Breakout documentation)
+
+    def __init__(self, seed=None):
+        self.env = gym.make("Breakout-v4")
+        if seed is not None:
+            self.env.seed(seed)
+
+    @staticmethod
+    def _preprocess(frame):
+        """Resize a raw frame to 96x96, convert it to float32 divided by 255 and move its last (channel) axis first."""
+        frame = cv2.resize(frame, (96, 96), interpolation=cv2.INTER_AREA)
+        frame = numpy.asarray(frame, dtype=numpy.float32) / 255.0
+        return numpy.moveaxis(frame, -1, 0)
+
+    def step(self, action):
+        """
+        Apply action to the game.
+
+        Args:
+            action : action of the action_space to take.
+
+        Returns:
+            The new (preprocessed) observation, the reward and a boolean if the game has ended.
+        """
+        observation, reward, done, _ = self.env.step(action)
+        return self._preprocess(observation), reward, done
+
+    def to_play(self):
+        """
+        Return the current player.
+
+        Returns:
+            Always 0: this wrapper exposes a single player.
+        """
+        return 0
+
+    def legal_actions(self):
+        """
+        Return the legal actions at each turn. No per-turn restriction is computed here,
+        so the whole action space is returned every time.
+
+        Returns:
+            An array of integers, subset of the action space.
+        """
+        return list(range(len(self.ACTION_NAMES)))
+
+    def reset(self):
+        """
+        Reset the game for a new game.
+
+        Returns:
+            Initial (preprocessed) observation of the game.
+        """
+        return self._preprocess(self.env.reset())
+
+    def close(self):
+        """Properly close the game."""
+        self.env.close()
+
+    def render(self):
+        """
+        Display the game observation.
+        """
+        self.env.render()
+        input("Press enter to take a step ")
+
+    def human_to_action(self):
+        """
+        For multiplayer games, ask the user for a legal action
+        and return the corresponding action number.
+
+        Not implemented here: to_play always returns player 0, so no human opponent is asked.
+
+        Returns:
+            None.
+        """
+        pass
+
+    def action_to_string(self, action_number):
+        """
+        Convert an action number to a string representing the action.
+
+        Args:
+            action_number: an integer from the action space.
+
+        Returns:
+            String representing the action, e.g. "1. FIRE" (it is returned, not printed).
+        """
+        if 0 <= action_number < len(self.ACTION_NAMES):
+            return "{}. {}".format(action_number, self.ACTION_NAMES[action_number])
+        # Unknown action numbers still yield a string instead of raising.
+        return str(action_number)
diff --git a/games/cartpole.py b/games/cartpole.py
@@ -14,10 +14,10 @@ def __init__(self):
 
 
         ### Game
-        self.observation_shape = (1, 1, 4)  # Dimensions of the game observation, must be 3D. For a 1D array, please reshape it to (1, 1, length of array)
+        self.observation_shape = (1, 1, 4)  # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
         self.action_space = [i for i in range(2)]  # Fixed list of all possible actions. You should only edit the length
         self.players = [i for i in range(1)]  # List of players. You should only edit the length
-        self.stacked_observations = 0  # Number of previous observation to add to the current observation
+        self.stacked_observations = 0  # Number of previous observation and previous actions to add to the current observation
 
 
         ### Self-Play
@@ -44,9 +44,9 @@ def __init__(self):
         self.blocks = 1  # Number of blocks in the ResNet
         self.channels = 2  # Number of channels in the ResNet
         self.reduced_channels = 2  # Number of channels before heads of dynamic and prediction networks
-        self.fc_reward_layers = []  # Define the hidden layers in the reward head of the dynamic network
-        self.fc_value_layers = []  # Define the hidden layers in the value head of the prediction network
-        self.fc_policy_layers = []  # Define the hidden layers in the policy head of the prediction network
+        self.resnet_fc_reward_layers = []  # Define the hidden layers in the reward head of the dynamic network
+        self.resnet_fc_value_layers = []  # Define the hidden layers in the value head of the prediction network
+        self.resnet_fc_policy_layers = []  # Define the hidden layers in the policy head of the prediction network
 
         # Fully Connected Network
         self.encoding_size = 8
@@ -172,7 +172,7 @@ def render(self):
         self.env.render()
         input("Press enter to take a step ")
 
-    def human_action(self):
+    def human_to_action(self):
         """
         For multiplayer games, ask the user for a legal action
         and return the corresponding action number.
@@ -182,7 +182,7 @@ def human_action(self):
         """
         pass
 
-    def print_action(self, action_number):
+    def action_to_string(self, action_number):
         """
         Convert an action number to a string representing the action.
 

diff --git a/games/connect4.py b/games/connect4.py
@@ -14,10 +14,10 @@ def __init__(self):
 
 
         ### Game
-        self.observation_shape = (3, 6, 7)  # Dimensions of the game observation, must be 3D. For a 1D array, please reshape it to (1, 1, length of array)
+        self.observation_shape = (3, 6, 7)  # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
         self.action_space = [i for i in range(7)]  # Fixed list of all possible actions. You should only edit the length
         self.players = [i for i in range(2)]  # List of players. You should only edit the length
-        self.stacked_observations = 0  # Number of previous observation to add to the current observation
+        self.stacked_observations = 0  # Number of previous observation and previous actions to add to the current observation
 
 
         ### Self-Play
@@ -41,12 +41,13 @@ def __init__(self):
         self.support_size = 10  # Value and reward are scaled (with almost sqrt) and encoded on a vector with a range of -support_size to support_size
 
         # Residual Network
+        self.downsample = False  # Downsample observations before representation network (See paper appendix Network Architecture)
         self.blocks = 2  # Number of blocks in the ResNet
-        self.channels = 8  # Number of channels in the ResNet
-        self.reduced_channels = 8  # Number of channels before heads of dynamic and prediction networks
-        self.resnet_fc_reward_layers = []  # Define the hidden layers in the reward head of the dynamic network
-        self.resnet_fc_value_layers = []  # Define the hidden layers in the value head of the prediction network
-        self.resnet_fc_policy_layers = []  # Define the hidden layers in the policy head of the prediction network
+        self.channels = 16  # Number of channels in the ResNet
+        self.reduced_channels = 16  # Number of channels before heads of dynamic and prediction networks
+        self.resnet_fc_reward_layers = [8]  # Define the hidden layers in the reward head of the dynamic network
+        self.resnet_fc_value_layers = [8]  # Define the hidden layers in the value head of the prediction network
+        self.resnet_fc_policy_layers = [8]  # Define the hidden layers in the policy head of the prediction network
 
         # Fully Connected Network
         self.encoding_size = 32
@@ -59,26 +60,26 @@ def __init__(self):
 
         ### Training
         self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S"))  # Path to store the model weights and TensorBoard logs
-        self.training_steps = 40000  # Total number of training steps (ie weights update according to a batch)
-        self.batch_size = 128*3  # Number of parts of games to train on at each training step
-        self.num_unroll_steps = 10  # Number of game moves to keep for every batch element
+        self.training_steps = 100000  # Total number of training steps (ie weights update according to a batch)
+        self.batch_size = 64  # Number of parts of games to train on at each training step
+        self.num_unroll_steps = 20  # Number of game moves to keep for every batch element
         self.checkpoint_interval = 10  # Number of training steps before using the model for sef-playing
-        self.window_size = 1000  # Number of self-play games to keep in the replay buffer
+        self.window_size = 3000  # Number of self-play games to keep in the replay buffer
         self.td_steps = 50  # Number of steps in the future to take into account for calculating the target value
-        self.value_loss_weight = 1  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
+        self.value_loss_weight = 0.25  # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
         self.training_device = "cuda" if torch.cuda.is_available() else "cpu"  # Train on GPU if available
 
         self.optimizer = "Adam"  # "Adam" or "SGD". Paper uses SGD
         self.weight_decay = 1e-4  # L2 weights regularization
         self.momentum = 0.9  # Used only if optimizer is SGD
 
         # Prioritized Replay (See paper appendix Training)
-        self.PER = False  # Select in priority the elements in the replay buffer which are unexpected for the network
+        self.PER = True  # Select in priority the elements in the replay buffer which are unexpected for the network
         self.PER_alpha = 0.5  # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
         self.PER_beta = 1.0
 
         # Exponential learning rate schedule
-        self.lr_init = 0.1  # Initial learning rate
+        self.lr_init = 0.01  # Initial learning rate
         self.lr_decay_rate = 1  # Set it to 1 to use a constant learning rate
         self.lr_decay_steps = 10000
 
@@ -168,7 +169,7 @@ def render(self):
     def encode_board(self):
         return self.env.encode_board()
 
-    def human_action(self):
+    def human_to_action(self):
         """
         For multiplayer games, ask the user for a legal action
         and return the corresponding action number.
@@ -181,7 +182,7 @@ def human_action(self):
             choice = input("Enter another column : ")
         return int(choice)
 
-    def print_action(self, action_number):
+    def action_to_string(self, action_number):
         """
         Convert an action number to a string representing the action.
 

diff --git a/games/gomoku.py b/games/gomoku.py
@@ -15,10 +15,10 @@ def __init__(self):
 
 
         ### Game
-        self.observation_shape = (3, 11, 11)  # Dimensions of the game observation, must be 3. For a 1D array, please reshape it to (1, 1, length of array)
+        self.observation_shape = (3, 11, 11)  # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
         self.action_space = [i for i in range(11 * 11)]  # Fixed list of all possible actions. You should only edit the length
         self.players = [i for i in range(2)]  # List of players. You should only edit the length
-        self.stacked_observations = 2  # Number of previous observation to add to the current observation
+        self.stacked_observations = 2  # Number of previous observation and previous actions to add to the current observation
 
 
         ### Self-Play
@@ -170,7 +170,7 @@ def render(self):
         self.env.render()
         input("Press enter to take a step ")
 
-    def human_action(self):
+    def human_to_action(self):
         """
         For multiplayer games, ask the user for a legal action
         and return the corresponding action number.
@@ -183,7 +183,7 @@ def human_action(self):
             valid, action = self.env.human_input_to_action()
         return action
 
-    def print_action(self, action):
+    def action_to_string(self, action):
         """
         Convert an action number to a string representing the action.
         Args: