Add value reanalyze
werner-duvaud committed Apr 4, 2020
1 parent 6bc5bfe commit df0e407
Showing 10 changed files with 205 additions and 148 deletions.
31 changes: 23 additions & 8 deletions games/breakout.py
@@ -13,12 +13,16 @@ class MuZeroConfig:
def __init__(self):
self.seed = 0 # Seed for numpy, torch and the game



### Game
self.observation_shape = (3, 96, 96) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(4)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(1)] # List of players. You should only edit the length
self.stacked_observations = 2 # Number of previous observations and previous actions to add to the current observation



### Self-Play
self.num_actors = 2 # Number of simultaneous threads self-playing to feed the replay buffer
self.max_moves = 500 # Maximum number of moves if game is not finished before
@@ -34,6 +38,8 @@ def __init__(self):
self.pb_c_base = 19652
self.pb_c_init = 1.25
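
The two PUCT constants above come from the MuZero pseudocode. As a reference, here is a minimal sketch of the exploration score they parameterize, assuming the child's value is already discounted and min-max normalized (the helper name is illustrative, not this repository's exact code):

    import math

    def ucb_score(parent_visits, child_visits, child_prior, child_value,
                  pb_c_base=19652, pb_c_init=1.25):
        # Exploration weight grows slowly (log) with the parent's visit count
        # and shrinks for children that have already been visited often.
        pb_c = math.log((parent_visits + pb_c_base + 1) / pb_c_base) + pb_c_init
        pb_c *= math.sqrt(parent_visits) / (child_visits + 1)
        return pb_c * child_prior + child_value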



### Network
self.network = "resnet" # "resnet" / "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size
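
support_size controls the categorical encoding of values and rewards from the paper appendix: the scalar is squashed with the "almost sqrt" transform h(x) = sign(x)(sqrt(|x|+1) - 1) + eps*x and then projected onto 2*support_size+1 bins. A self-contained sketch of that encoding; the function names are illustrative, not the repository's helpers:

    import torch

    def scale_scalar(x, eps=0.001):
        # h(x): compress the scale of values/rewards (roughly a signed sqrt)
        return torch.sign(x) * (torch.sqrt(torch.abs(x) + 1) - 1) + eps * x

    def scalar_to_support(x, support_size):
        # Split each scaled scalar between the two nearest of the 2*support_size+1 bins
        x = torch.clamp(scale_scalar(x), -support_size, support_size)
        floor = x.floor()
        upper_weight = x - floor
        support = torch.zeros(*x.shape, 2 * support_size + 1)
        lower_idx = (floor + support_size).long()
        support.scatter_(-1, lower_idx.unsqueeze(-1), (1 - upper_weight).unsqueeze(-1))
        upper_idx = torch.clamp(lower_idx + 1, max=2 * support_size)
        # scatter_add_ keeps the boundary case upper_idx == lower_idx correct
        support.scatter_add_(-1, upper_idx.unsqueeze(-1), upper_weight.unsqueeze(-1))
        return support

For example, scalar_to_support(torch.tensor([2.7]), 10) returns a length-21 distribution whose mass sits on the two bins bracketing h(2.7) ≈ 0.93.
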
@@ -55,33 +61,42 @@ def __init__(self):
self.fc_representation_layers = [] # Define the hidden layers in the representation network
self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network
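
The fc_* lists above give hidden-layer widths for the fully connected sub-networks; an empty list means a single linear layer. A sketch of how such a list could be turned into an MLP (the builder name and the ELU activation are assumptions, not necessarily what the repository uses):

    import torch

    def mlp(input_size, hidden_sizes, output_size, activation=torch.nn.ELU):
        # e.g. hidden_sizes = [16] builds input -> 16 -> output with one activation
        sizes = [input_size] + list(hidden_sizes) + [output_size]
        layers = []
        for i in range(len(sizes) - 1):
            layers.append(torch.nn.Linear(sizes[i], sizes[i + 1]))
            if i < len(sizes) - 2:
                layers.append(activation())
        return torch.nn.Sequential(*layers)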



### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
self.training_steps = 5000 # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 128 # Number of parts of games to train on at each training step
self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available
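
value_loss_weight scales only the value head of the combined objective; the Reanalyze appendix recommends 0.25 so the value function does not dominate and overfit. A one-line sketch of the weighting, with hypothetical names for the three loss terms:

    def muzero_loss(value_loss, reward_loss, policy_loss, value_loss_weight):
        # Only the value term is down-weighted; reward and policy keep weight 1
        return value_loss_weight * value_loss + reward_loss + policy_loss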

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD
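
A sketch of how these optimizer settings could be consumed (weight_decay maps to L2 regularization, momentum is only read for SGD); the builder function is illustrative:

    import torch

    def build_optimizer(model, config):
        if config.optimizer == "SGD":
            return torch.optim.SGD(model.parameters(), lr=config.lr_init,
                                   momentum=config.momentum,
                                   weight_decay=config.weight_decay)
        if config.optimizer == "Adam":
            return torch.optim.Adam(model.parameters(), lr=config.lr_init,
                                    weight_decay=config.weight_decay)
        raise ValueError(f"Unsupported optimizer: {config.optimizer}")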

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000
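
These three parameters describe the usual exponential decay lr = lr_init * lr_decay_rate ** (training_step / lr_decay_steps); a minimal sketch, assuming that formula:

    def current_learning_rate(config, training_step):
        # With lr_decay_rate = 0.9 and lr_decay_steps = 1000, the learning rate
        # shrinks by 10% every 1000 training steps; lr_decay_rate = 1 keeps it constant.
        return config.lr_init * config.lr_decay_rate ** (training_step / config.lr_decay_steps)

    # Applying it to a PyTorch optimizer before each training step:
    # for param_group in optimizer.param_groups:
    #     param_group["lr"] = current_learning_rate(config, training_step)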



### Replay Buffer
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.use_last_model_value = False # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)
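
use_last_model_value is the Reanalyze-style option added by this commit: when building the n-step value target, the bootstrap value is recomputed with the latest network instead of being reused from self-play time. A simplified single-player sketch; argument names and indexing conventions are illustrative:

    def n_step_value_target(rewards, root_values, index, td_steps, discount,
                            fresh_value_fn=None):
        # Discounted sum of the next td_steps rewards plus a discounted
        # bootstrap value taken td_steps into the future.
        bootstrap_index = index + td_steps
        if bootstrap_index < len(root_values):
            if fresh_value_fn is not None:
                # use_last_model_value=True: query the latest model for a fresh
                # value of the bootstrap state (Reanalyze)
                bootstrap_value = fresh_value_fn(bootstrap_index)
            else:
                bootstrap_value = root_values[bootstrap_index]
            value = bootstrap_value * discount ** td_steps
        else:
            value = 0.0
        for i, reward in enumerate(rewards[index + 1 : bootstrap_index + 1]):
            value += reward * discount ** i
        return value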

# Prioritized Replay (See paper appendix Training)
self.PER = True # Preferentially sample the replay buffer elements that are most unexpected for the network
self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0
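
A sketch of how PER_alpha and PER_beta are typically used, following the Prioritized Experience Replay paper (proportional prioritization plus importance-sampling correction); the sampler below is illustrative, not the repository's implementation:

    import numpy as np

    def sample_indices(priorities, batch_size, per_alpha, per_beta):
        # Sample proportionally to priority**alpha; alpha = 0 is uniform sampling
        scaled = np.asarray(priorities, dtype=np.float64) ** per_alpha
        probabilities = scaled / scaled.sum()
        indices = np.random.choice(len(probabilities), size=batch_size, p=probabilities)
        # Importance-sampling weights correct the sampling bias; beta = 1 is the
        # full correction, and the weights are normalized by their maximum
        weights = (len(probabilities) * probabilities[indices]) ** (-per_beta)
        return indices, weights / weights.max()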

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000

## Adjust the self play / training ratio to avoid over/underfitting

### Adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0 # Number of seconds to wait after each played game
self.training_delay = 0 # Number of seconds to wait after each training step
self.ratio = None # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it
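
These three settings slow down whichever side is running ahead: the two delays are plain sleeps, and the ratio throttles training until enough games have been played. A rough sketch of how the ratio could be enforced on the training side, with a hypothetical object holding the two shared counters:

    import time

    def wait_for_self_play(counters, config):
        # counters is assumed to expose num_played_games and training_step
        if config.ratio is None:
            return
        while (counters.num_played_games / max(1, counters.training_step)
               < config.ratio):
            time.sleep(0.5)  # let the self-play workers catch up
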
28 changes: 19 additions & 9 deletions games/cartpole.py
@@ -13,13 +13,15 @@ def __init__(self):
self.seed = 0 # Seed for numpy, torch and the game



### Game
self.observation_shape = (1, 1, 4) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(2)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(1)] # List of players. You should only edit the length
self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation
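
As the comment above notes, CartPole's flat 4-value observation must be reshaped into the 3D (channel, height, width) layout expected by the config, for example:

    import numpy as np

    observation = np.array([0.01, -0.02, 0.03, 0.04], dtype=np.float32)  # raw Gym state
    observation = observation.reshape(1, 1, 4)  # matches observation_shape = (1, 1, 4)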



### Self-Play
self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer
self.max_moves = 500 # Maximum number of moves if game is not finished before
@@ -36,6 +38,7 @@ def __init__(self):
self.pb_c_init = 1.25



### Network
self.network = "fullyconnected" # "resnet" / "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size
@@ -58,37 +61,44 @@ def __init__(self):
self.fc_dynamics_layers = [16] # Define the hidden layers in the dynamics network



### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
self.training_steps = 5000 # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 128 # Number of parts of games to train on at each training step
self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000



### Replay Buffer
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 10 # Number of game moves to keep for every batch element
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.use_last_model_value = True # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)

# Prioritized Replay (See paper appendix Training)
self.PER = True # Preferentially sample the replay buffer elements that are most unexpected for the network
self.use_max_priority = False # Use the n-step TD error as initial priority. Better for large replay buffer
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 1000


## Adjust the self play / training ratio to avoid over/underfitting
### Adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0 # Number of seconds to wait after each played game
self.training_delay = 0 # Number of seconds to wait after each training step
self.ratio = 1/100 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it
self.ratio = 1/50 # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it


def visit_softmax_temperature_fn(self, trained_steps):
28 changes: 19 additions & 9 deletions games/connect4.py
@@ -13,13 +13,15 @@ def __init__(self):
self.seed = 0 # Seed for numpy, torch and the game



### Game
self.observation_shape = (3, 6, 7) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(7)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(2)] # List of players. You should only edit the length
self.stacked_observations = 0 # Number of previous observations and previous actions to add to the current observation



### Self-Play
self.num_actors = 1 # Number of simultaneous threads self-playing to feed the replay buffer
self.max_moves = 50 # Maximum number of moves if game is not finished before
@@ -36,6 +38,7 @@ def __init__(self):
self.pb_c_init = 1.25



### Network
self.network = "resnet" # "resnet" / "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size
@@ -58,37 +61,44 @@ def __init__(self):
self.fc_dynamics_layers = [64] # Define the hidden layers in the dynamics network



### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
self.training_steps = 100000 # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 64 # Number of parts of games to train on at each training step
self.num_unroll_steps = 20 # Number of game moves to keep for every batch element
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.window_size = 3000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000



### Replay Buffer
self.window_size = 3000 # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 20 # Number of game moves to keep for every batch element
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.use_last_model_value = False # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)

# Prioritized Replay (See paper appendix Training)
self.PER = True # Preferentially sample the replay buffer elements that are most unexpected for the network
self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000


## Adjust the self play / training ratio to avoid over/underfitting
### Adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0 # Number of seconds to wait after each played game
self.training_delay = 0 # Number of seconds to wait after each training step
self.ratio = None # Desired self played games per training step ratio. Set it to None to disable it.
self.ratio = None # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it


def visit_softmax_temperature_fn(self, trained_steps):
30 changes: 20 additions & 10 deletions games/gomoku.py
@@ -14,13 +14,15 @@ def __init__(self):
self.seed = 0 # Seed for numpy, torch and the game



### Game
self.observation_shape = (3, 11, 11) # Dimensions of the game observation, must be 3D (channel, height, width). For a 1D array, please reshape it to (1, 1, length of array)
self.action_space = [i for i in range(11 * 11)] # Fixed list of all possible actions. You should only edit the length
self.players = [i for i in range(2)] # List of players. You should only edit the length
self.stacked_observations = 2 # Number of previous observations and previous actions to add to the current observation



### Self-Play
self.num_actors = 2 # Number of simultaneous threads self-playing to feed the replay buffer
self.max_moves = 70 # Maximum number of moves if game is not finished before
@@ -37,6 +39,7 @@ def __init__(self):
self.pb_c_init = 1.25



### Network
self.network = "resnet" # "resnet" / "fullyconnected"
self.support_size = 10 # Value and reward are scaled (with an almost-sqrt transform) and encoded on a vector with a range of -support_size to support_size
@@ -59,37 +62,44 @@ def __init__(self):
self.fc_dynamics_layers = [64] # Define the hidden layers in the dynamics network



### Training
self.results_path = os.path.join(os.path.dirname(__file__), "../results", os.path.basename(__file__)[:-3], datetime.datetime.now().strftime("%Y-%m-%d--%H-%M-%S")) # Path to store the model weights and TensorBoard logs
self.training_steps = 10 # Total number of training steps (i.e. weight updates, one per batch)
self.batch_size = 128*3 # Number of parts of games to train on at each training step
self.num_unroll_steps = 5 # Number of game moves to keep for every batch element
self.batch_size = 128 # Number of parts of games to train on at each training step
self.checkpoint_interval = 10 # Number of training steps before using the model for self-playing
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.value_loss_weight = 1 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.optimizer = "Adam" # "Adam" or "SGD". Paper uses SGD
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9 # Used only if optimizer is SGD

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000



### Replay Buffer
self.window_size = 1000 # Number of self-play games to keep in the replay buffer
self.num_unroll_steps = 5 # Number of game moves to keep for every batch element
self.td_steps = 50 # Number of steps in the future to take into account for calculating the target value
self.use_last_model_value = False # Use the last model to provide a fresher, stable n-step value (See paper appendix Reanalyze)

# Prioritized Replay (See paper appendix Training)
self.PER = True # Preferentially sample the replay buffer elements that are most unexpected for the network
self.use_max_priority = True # Use the n-step TD error as initial priority. Better for large replay buffer
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
self.lr_decay_steps = 10000


## Adjust the self play / training ratio to avoid over/underfitting
### Adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0 # Number of seconds to wait after each played game
self.training_delay = 0 # Number of seconds to wait after each training step
self.ratio = None # Desired self played games per training step ratio. Set it to None to disable it.
self.ratio = None # Desired self played games per training step ratio. Equivalent to a synchronous version, training can take much longer. Set it to None to disable it


def visit_softmax_temperature_fn(self, trained_steps):
