Add value reanalyze
werner-duvaud committed Apr 4, 2020
1 parent 6bc5bfe commit df0e407
Showing 10 changed files with 205 additions and 148 deletions.
31 changes: 23 additions & 8 deletions games/breakout.py
@@ -13,12 +13,16 @@ class MuZeroConfig:
def __init__(self):
self.seed = 0 # Seed for numpy, torch and the game



### Game
self.observation_shape = (3, 96, 96) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(4)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(1)] # List of players. You should only edit the length
self.stacked_observations = 2 # Number of previous observations and previous actions to add to the current observation



### Self-Play
self.num_actors = 2 # Number of simultaneous threads self-playing to feed the replay buffer
self.max_moves = 500 # Maximum number of moves if game is not finished before
@@ -34,6 +38,8 @@ def __init__(self):
self.pb_c_base = 19652
self.pb_c_init = 1.25
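
The two PUCT constants above come from the MuZero pseudocode. As a reference, here is a minimal sketch of the exploration score they parameterize, assuming the child's value is already discounted and min-max normalized (the helper name is illustrative, not this repository's exact code):

    import math

    def ucb_score(parent_visits, child_visits, child_prior, child_value,
                  pb_c_base=19652, pb_c_init=1.25):
        # Exploration weight grows slowly (log) with the parent's visit count
        # and shrinks for children that have already been visited often.
        pb_c = math.log((parent_visits + pb_c_base + 1) / pb_c_base) + pb_c_init
        pb_c *= math.sqrt(parent_visits) / (child_visits + 1)
        return pb_c * child_prior + child_value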



### Network
self.network = "resnet" # "resnet" / "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size
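
support_size controls the categorical encoding of values and rewards from the paper appendix: the scalar is squashed with the "almost sqrt" transform h(x) = sign(x)(sqrt(|x|+1) - 1) + eps*x and then projected onto 2*support_size+1 bins. A self-contained sketch of that encoding; the function names are illustrative, not the repository's helpers:

    import torch

    def scale_scalar(x, eps=0.001):
        # h(x): compress the scale of values/rewards (roughly a signed sqrt)
        return torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + eps * x

    def scalar_to_support(x, support_size):
        # Split each scaled scalar between the two nearest of the 2*support_size+1 bins
        x = torch.clamp(scale_scalar(x), -support_size, support_size)
        floor = x.floor()
        upper_weight = x - floor
        support = torch.zeros(*x.shape, 2 * support_size + 1)
        lower_idx = (floor + support_size).long()
        support.scatter_(-1, lower_idx.unsqueeze(-1), (1 - upper_weight).unsqueeze(-1))
        upper_idx = torch.clamp(lower_idx + 1, max=2 * support_size)
        # scatter_add_ keeps the boundary case upper_idx == lower_idx correct
        support.scatter_add_(-1, upper_idx.unsqueeze(-1), upper_weight.unsqueeze(-1))
        return support

For example, scalar_to_support(torch.tensor([2.7]), 10) returns a length-21 distribution whose mass sits on the two bins bracketing h(2.7) ≈ 0.93.
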
@@ -55,33 +61,42 @@ def __init__(self):
self.fc_representation_layers = [] # Define the hidden layers in the representation network
self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network
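
The fc_* lists above give hidden-layer widths for the fully connected sub-networks; an empty list means a single linear layer. A sketch of how such a list could be turned into an MLP (the builder name and the ELU activation are assumptions, not necessarily what the repository uses):

    import torch

    def mlp(input_size, hidden_sizes, output_size, activation=torch.nn.ELU):
        # e.g. hidden_sizes = [16] builds input -> 16 -> output with one activation
        sizes = [input_size] + list(hidden_sizes) + [output_size]
        layers = []
        for i in range(len(sizes) - 1):
            layers.append(torch.nn.Linear(sizes[i], sizes[i + 1]))
            if i < len(sizes) - 2:
                layers.append(activation())
        return torch.nn.Sequential(*layers)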



### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
self.training_steps = 5000 # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 128 # Number of parts of games to train on at each training step
self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available
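
value_loss_weight scales only the value head of the combined objective; the Reanalyze appendix recommends 0.25 so the value function does not dominate and overfit. A one-line sketch of the weighting, with hypothetical names for the three loss terms:

    def muzero_loss(value_loss, reward_loss, policy_loss, value_loss_weight):
        # Only the value term is down-weighted; reward and policy keep weight 1
        return value_loss_weight * value_loss + reward_loss + policy_loss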

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD
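
A sketch of how these optimizer settings could be consumed (weight_decay maps to L2 regularization, momentum is only read for SGD); the builder function is illustrative:

    import torch

    def build_optimizer(model, config):
        if config.optimizer == "SGD":
            return torch.optim.SGD(model.parameters(), lr=config.lr_init,
                                   momentum=config.momentum,
                                   weight_decay=config.weight_decay)
        if config.optimizer == "Adam":
            return torch.optim.Adam(model.parameters(), lr=config.lr_init,
                                    weight_decay=config.weight_decay)
        raise ValueError(f"Unsupported optimizer: {config.optimizer}")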

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000
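
These three parameters describe the usual exponential decay lr = lr_init * lr_decay_rate ** (training_step / lr_decay_steps); a minimal sketch, assuming that formula:

    def current_learning_rate(config, training_step):
        # With lr_decay_rate = 0.9 and lr_decay_steps = 1000, the learning rate
        # shrinks by 10% every 1000 training steps; lr_decay_rate = 1 keeps it constant.
        return config.lr_init * config.lr_decay_rate ** (training_step / config.lr_decay_steps)

    # Applying it to a PyTorch optimizer before each training step:
    # for param_group in optimizer.param_groups:
    #     param_group["lr"] = current_learning_rate(config, training_step)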



### Replay Buffer
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.use_last_model_value = False # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)
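
use_last_model_value is the Reanalyze-style option added by this commit: when building the n-step value target, the bootstrap value is recomputed with the latest network instead of being reused from self-play time. A simplified single-player sketch; argument names and indexing conventions are illustrative:

    def n_step_value_target(rewards, root_values, index, td_steps, discount,
                            fresh_value_fn=None):
        # Discounted sum of the next td_steps rewards plus a discounted
        # bootstrap value taken td_steps into the future.
        bootstrap_index = index + td_steps
        if bootstrap_index < len(root_values):
            if fresh_value_fn is not None:
                # use_last_model_value=True: query the latest model for a fresh
                # value of the bootstrap state (Reanalyze)
                bootstrap_value = fresh_value_fn(bootstrap_index)
            else:
                bootstrap_value = root_values[bootstrap_index]
            value = bootstrap_value * discount ** td_steps
        else:
            value = 0.0
        for i, reward in enumerate(rewards[index + 1 : bootstrap_index + 1]):
            value += reward * discount ** i
        return value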

# Prioritized Replay (See paper appendix Training)
self.PER = True # Preferentially sample the replay buffer elements that are most unexpected for the network
self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0
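
A sketch of how PER_alpha and PER_beta are typically used, following the Prioritized Experience Replay paper (proportional prioritization plus importance-sampling correction); the sampler below is illustrative, not the repository's implementation:

    import numpy as np

    def sample_indices(priorities, batch_size, per_alpha, per_beta):
        # Sample proportionally to priority**alpha; alpha = 0 is uniform sampling
        scaled = np.asarray(priorities, dtype=np.float64) ** per_alpha
        probabilities = scaled / scaled.sum()
        indices = np.random.choice(len(probabilities), size=batch_size, p=probabilities)
        # Importance-sampling weights correct the sampling bias; beta = 1 is the
        # full correction, and the weights are normalized by their maximum
        weights = (len(probabilities) * probabilities[indices]) ** (-per_beta)
        return indices, weights / weights.max()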

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000

## Adjust the self play / training ratio to avoid over/underfitting

### Adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0 # Number of seconds to wait after each played game
self.training_delay = 0 # Number of seconds to wait after each training step
self.ratio = None # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it
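
These three settings slow down whichever side is running ahead: the two delays are plain sleeps, and the ratio throttles training until enough games have been played. A rough sketch of how the ratio could be enforced on the training side, with a hypothetical object holding the two shared counters:

    import time

    def wait_for_self_play(counters, config):
        # counters is assumed to expose num_played_games and training_step
        if config.ratio is None:
            return
        while (counters.num_played_games / max(1, counters.training_step)
               < config.ratio):
            time.sleep(0.5)  # let the self-play workers catch up
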
28 changes: 19 additions & 9 deletions games/cartpole.py
@@ -13,13 +13,15 @@ def __init__(self):
self.seed = 0 # Seed for numpy, torch and the game



### Game
self.observation_shape = (1, 1, 4) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(2)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(1)] # List of players. You should only edit the length
self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation
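
As the comment above notes, CartPole's flat 4-value observation must be reshaped into the 3D (channel, height, width) layout expected by the config, for example:

    import numpy as np

    observation = np.array([0.01, -0.02, 0.03, 0.04], dtype=np.float32)  # raw Gym state
    observation = observation.reshape(1, 1, 4)  # matches observation_shape = (1, 1, 4)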



### Self-Play
self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer
self.max_moves = 500 # Maximum number of moves if game is not finished before
@@ -36,6 +38,7 @@ def __init__(self):
self.pb_c_init = 1.25



### Network
self.network = "fullyconnected" # "resnet" / "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size
@@ -58,37 +61,44 @@ def __init__(self):
self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network



### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
self.training_steps = 5000 # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 128 # Number of parts of games to train on at each training step
self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000



### Replay Buffer
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)

# Prioritized Replay (See paper appendix Training)
self.PER = True # Preferentially sample the replay buffer elements that are most unexpected for the network
self.use_max_priority = False # Use the n-step TD error as initial priority. Better for large replay buffer
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000


## Adjust the self play / training ratio to avoid over/underfitting
### Adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0 # Number of seconds to wait after each played game
self.training_delay = 0 # Number of seconds to wait after each training step
self.ratio = 1/100 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it
self.ratio = 1/50 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it


def visit_softmax_temperature_fn(self, trained_steps):
28 changes: 19 additions & 9 deletions games/connect4.py
@@ -13,13 +13,15 @@ def __init__(self):
self.seed = 0 # Seed for numpy, torch and the game



### Game
self.observation_shape = (3, 6, 7) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(7)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(2)] # List of players. You should only edit the length
self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation



### Self-Play
self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer
self.max_moves = 50 # Maximum number of moves if game is not finished before
@@ -36,6 +38,7 @@ def __init__(self):
self.pb_c_init = 1.25



### Network
self.network = "resnet" # "resnet" / "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size
@@ -58,37 +61,44 @@ def __init__(self):
self.fc_dynamics_layers = [64] # Define the hidden layers in the dynamics network



### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
self.training_steps = 100000 # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 64 # Number of parts of games to train on at each training step
self.num_unroll_steps = 20 # Number of game moves to keep for every batch element
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.window_size = 3000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000



### Replay Buffer
self.window_size = 3000 # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 20 # Number of game moves to keep for every batch element
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.use_last_model_value = False # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)

# Prioritized Replay (See paper appendix Training)
self.PER = True # Preferentially sample the replay buffer elements that are most unexpected for the network
self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000


## Adjust the self play / training ratio to avoid over/underfitting
### Adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0 # Number of seconds to wait after each played game
self.training_delay = 0 # Number of seconds to wait after each training step
self.ratio = None # Desired self played games per training step ratio. Set it to None to disable it.
self.ratio = None # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it


def visit_softmax_temperature_fn(self, trained_steps):
30 changes: 20 additions & 10 deletions games/gomoku.py
@@ -14,13 +14,15 @@ def __init__(self):
self.seed = 0 # Seed for numpy, torch and the game



### Game
self.observation_shape = (3, 11, 11) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(11 * 11)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(2)] # List of players. You should only edit the length
self.stacked_observations = 2 # Number of previous observations and previous actions to add to the current observation



### Self-Play
self.num_actors = 2 # Number of simultaneous threads self-playing to feed the replay buffer
self.max_moves = 70 # Maximum number of moves if game is not finished before
@@ -37,6 +39,7 @@ def __init__(self):
self.pb_c_init = 1.25



### Network
self.network = "resnet" # "resnet" / "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size
@@ -59,37 +62,44 @@ def __init__(self):
self.fc_dynamics_layers = [64] # Define the hidden layers in the dynamics network



### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
self.training_steps = 10 # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 128*3 # Number of parts of games to train on at each training step
self.num_unroll_steps = 5 # Number of game moves to keep for every batch element
self.batch_size = 128 # Number of parts of games to train on at each training step
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000



### Replay Buffer
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 5 # Number of game moves to keep for every batch element
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.use_last_model_value = False # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)

# Prioritized Replay (See paper appendix Training)
self.PER = True # Preferentially sample the replay buffer elements that are most unexpected for the network
self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000


## Adjust the self play / training ratio to avoid over/underfitting
### Adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0 # Number of seconds to wait after each played game
self.training_delay = 0 # Number of seconds to wait after each training step
self.ratio = None # Desired self played games per training step ratio. Set it to None to disable it.
self.ratio = None # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it


def visit_softmax_temperature_fn(self, trained_steps):
