Change batch aggregation, fix value in replay buffer and prepare merge
werner-duvaud committed Mar 28, 2020
1 parent a38e2e8 commit f60f199
Showing 11 changed files with 275 additions and 77 deletions.
92 changes: 91 additions & 1 deletion .gitignore
@@ -1,3 +1,93 @@
# Editors
.vscode/
.idea/

# Mac/OSX
.DS_Store
__pycache__

# Windows
Thumbs.db

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Sphinx documentation
docs/_build/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json
7 changes: 6 additions & 1 deletion games/cartpole.py
@@ -26,7 +26,7 @@ def __init__(self):
self.num_simulations = 50 # Number of future moves self-simulated
self.discount = 0.997 # Chronological discount of the reward
self.temperature_threshold = 500 # Number of moves before dropping temperature to 0 (ie playing according to the max)
self.self_play_delay = 0 # Number of seconds to wait after each played game to adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0.5 # Number of seconds to wait after each played game to adjust the self play / training ratio to avoid over/underfitting

# Root prior exploration noise
self.root_dirichlet_alpha = 0.25
@@ -73,6 +73,11 @@ def __init__(self):
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay (See paper appendix Training)
self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
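
A note on the two new hyperparameters above: they follow the prioritized experience replay convention (Schaul et al., 2015), in which PER_alpha shapes the sampling distribution (0 recovers uniform sampling) and PER_beta sets the strength of the importance-sampling correction. The sketch below only illustrates that convention; per_pick is an invented name, and this is not the repository's code path (the actual sampling lives in replay_buffer.py further down in this commit).

import numpy

def per_pick(priorities, alpha=0.5, beta=1.0):
    # Sampling: P(i) proportional to p_i ** alpha (alpha = 0 gives the uniform case).
    priorities = numpy.asarray(priorities, dtype=float)
    probs = priorities ** alpha
    probs /= probs.sum()
    index = numpy.random.choice(len(probs), p=probs)
    # Bias correction: w_i = (N * P(i)) ** (-beta), normalized by the largest weight.
    weights = (len(probs) * probs) ** (-beta)
    return index, weights[index] / weights.max()

# Higher-priority entries are drawn more often but receive smaller weights.
index, weight = per_pick([0.1, 0.5, 2.0])
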
5 changes: 5 additions & 0 deletions games/connect4.py
@@ -73,6 +73,11 @@ def __init__(self):
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay (See paper appendix Training)
self.PER = False # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.1 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
5 changes: 5 additions & 0 deletions games/gomoku.py
@@ -74,6 +74,11 @@ def __init__(self):
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay (See paper appendix Training)
self.PER = False # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
6 changes: 3 additions & 3 deletions games/lunarlander.py
@@ -73,9 +73,9 @@ def __init__(self):
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay
self.PER = True
self.PER_alpha = 0.5
# Prioritized Replay (See paper appendix Training)
self.PER = False # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
9 changes: 7 additions & 2 deletions games/tictactoe.py
@@ -67,12 +67,17 @@ def __init__(self):
self.window_size = 3000 # Number of self-play games to keep in the replay buffer
self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value
self.training_delay = 0 # Number of seconds to wait after each training to adjust the self play / training ratio to avoid over/underfitting
self.value_loss_weight = 0.7 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay (See paper appendix Training)
self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
@@ -187,7 +192,7 @@ def print_action(self, action_number):
Returns:
String representing the action.
"""
return "Play column {}".format(action_number + 1)
return "Play column {}".format(action_number)


class TicTacToe:
2 changes: 1 addition & 1 deletion muzero.py
@@ -138,7 +138,7 @@ def train(self):
counter,
)
writer.add_scalar("2.Workers/4.Learning rate", infos["lr"], counter)
writer.add_scalar("3.Loss/1.Total loss", infos["total_loss"], counter)
writer.add_scalar("3.Loss/1.Total weighted loss", infos["total_loss"], counter)
writer.add_scalar("3.Loss/Value loss", infos["value_loss"], counter)
writer.add_scalar("3.Loss/Reward loss", infos["reward_loss"], counter)
writer.add_scalar("3.Loss/Policy loss", infos["policy_loss"], counter)
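
The scalar rename reflects that the logged total is now a weighted loss. The trainer itself is not part of this diff, so the following is only a plausible sketch of how such a total could be formed, assuming the per-sample PER weights returned by get_batch and the value_loss_weight setting from the game configs; the function name and signature are illustrative, not the repository's API.

import torch

def weighted_total_loss(
    value_loss: torch.Tensor,
    reward_loss: torch.Tensor,
    policy_loss: torch.Tensor,
    weights: torch.Tensor,
    value_loss_weight: float = 0.25,
) -> torch.Tensor:
    # value_loss, reward_loss, policy_loss: per-sample tensors of shape (batch,).
    # weights: PER importance-sampling weights of shape (batch,), max-normalized.
    per_sample = value_loss_weight * value_loss + reward_loss + policy_loss
    return (weights * per_sample).mean()
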
108 changes: 78 additions & 30 deletions replay_buffer.py
@@ -19,56 +19,87 @@ def save_game(self, game_history):
if len(self.buffer) > self.config.window_size:
self.buffer.pop(0)
self.game_priorities.pop(0)
game_history.priorities = numpy.ones(len(game_history.observation_history)) * self.max_recorded_game_priority
game_history.priorities = (
numpy.ones(len(game_history.observation_history))
* self.max_recorded_game_priority
)
self.buffer.append(game_history)
self.game_priorities.append(numpy.mean(game_history.priorities))

self.self_play_count += 1

def get_self_play_count(self):
return self.self_play_count

def get_batch(self):
index_batch, observation_batch, action_batch, reward_batch, value_batch, policy_batch = (
[],
[],
[],
[],
[],
[]
(
index_batch,
observation_batch,
action_batch,
reward_batch,
value_batch,
policy_batch,
weight_batch,
gradient_scale_batch,
) = ([], [], [], [], [], [], [], [])

total_samples = sum(
(len(game_history.priorities) for game_history in self.buffer)
)
total_samples = sum((len(game_history.priorities) for game_history in self.buffer))
weight_batch = []

for _ in range(self.config.batch_size):
game_index, game_history, game_prob = self.sample_game(self.buffer)
game_pos, pos_prob = self.sample_position(game_history)
index_batch.append([game_index, game_pos])
weight_batch.append((total_samples * game_prob * pos_prob) ** (-self.config.PER_beta))

values, rewards, policies, actions = self.make_target(
game_history, game_pos
)

index_batch.append([game_index, game_pos])
observation_batch.append(game_history.observation_history[game_pos])
action_batch.append(actions)
value_batch.append(values)
reward_batch.append(rewards)
policy_batch.append(policies)
weight_batch.append(
(total_samples * game_prob * pos_prob) ** (-self.config.PER_beta)
)
gradient_scale_batch.append(
[
min(
self.config.num_unroll_steps,
len(game_history.action_history) - game_pos,
)
]
* len(actions)
)

weight_batch = numpy.array(weight_batch) / max(weight_batch)

# observation_batch: batch, channels, height, width
# action_batch: batch, num_unroll_steps+1
# value_batch: batch, num_unroll_steps+1
# reward_batch: batch, num_unroll_steps+1
# policy_batch: batch, num_unroll_steps+1, len(action_space)
weight_batch = numpy.array(weight_batch) / max(weight_batch)
return index_batch, (weight_batch, observation_batch, action_batch, value_batch, reward_batch, policy_batch)
# weight_batch: batch
# gradient_scale_batch: batch, num_unroll_steps+1
return (
index_batch,
(
observation_batch,
action_batch,
value_batch,
reward_batch,
policy_batch,
weight_batch,
gradient_scale_batch,
),
)
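
Two things changed in the aggregation above: the importance weights moved from the front of the returned tuple to the slot after policy_batch, and a gradient_scale_batch entry was added, which presumably carries the number of real unrolled steps used for the paper's gradient scaling; any consumer of get_batch has to unpack the new order. The weight itself is the standard PER correction (total_samples * game_prob * pos_prob) ** (-PER_beta), later normalized by the largest weight in the batch. A small worked check of that formula with invented probabilities:

# Illustration only: the sample counts and probabilities below are made up.
total_samples, game_prob, pos_prob, per_beta = 1000, 0.01, 0.05, 1.0
weight = (total_samples * game_prob * pos_prob) ** (-per_beta)
assert abs(weight - 2.0) < 1e-9  # 1000 * 0.01 * 0.05 = 0.5, and 0.5 ** -1 = 2.0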

def sample_game(self, buffer):
"""
Sample game from buffer either uniformly or according to some priority.
See paper appendix Training.
"""
# TODO: sample with probability link to the highest difference between real and
# predicted value (See paper appendix Training)
game_probs = numpy.array(self.game_priorities) / sum(self.game_priorities)
game_index_candidates = numpy.arange(0, len(self.buffer), dtype=int)
game_index = numpy.random.choice(game_index_candidates, p=game_probs)
@@ -79,40 +79,56 @@ def sample_game(self, buffer):
def sample_position(self, game_history):
"""
Sample position from game either uniformly or according to some priority.
See paper appendix Training.
"""
# TODO: sample according to some priority
position_probs = numpy.array(game_history.priorities) / sum(game_history.priorities)
position_probs = numpy.array(game_history.priorities) / sum(
game_history.priorities
)
position_index_candidates = numpy.arange(0, len(position_probs), dtype=int)
position_index = numpy.random.choice(position_index_candidates, p=position_probs)
position_index = numpy.random.choice(
position_index_candidates, p=position_probs
)
position_prob = position_probs[position_index]

return position_index, position_prob

def update_priorities(self, priorities, index_info):

"""
Update game and position priorities with priorities calculated during the training.
See Distributed Prioritized Experience Replay https://arxiv.org/abs/1803.00933
"""
for i in range(len(index_info)):
game_index, game_pos = index_info[i]

# update position priorities
priority = priorities[i, :]
start_index = game_pos
end_index = min(game_pos + len(priority), len(self.buffer[game_index].priorities))
numpy.put(self.buffer[game_index].priorities, range(start_index, end_index), priority)
end_index = min(
game_pos + len(priority), len(self.buffer[game_index].priorities)
)
numpy.put(
self.buffer[game_index].priorities,
range(start_index, end_index),
priority,
)

# update game priorities
self.game_priorities[game_index] = numpy.max(self.buffer[game_index].priorities) # option: mean, sum, max
self.game_priorities[game_index] = numpy.max(
self.buffer[game_index].priorities
) # option: mean, sum, max

self.max_recorded_game_priority = numpy.max(self.game_priorities)
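
update_priorities expects one fresh priority per unrolled position, computed during training. How those priorities are produced is outside this diff; in the usual prioritized-replay setup they come from the absolute value error raised to PER_alpha. A sketch under that assumption, with an invented function name, not necessarily the repository's exact trainer code:

import numpy

def compute_priorities(predicted_values, target_values, per_alpha=0.5):
    # One priority per unrolled position, shaped (batch, num_unroll_steps + 1)
    # to match the row-wise indexing in update_priorities above.
    errors = numpy.abs(numpy.asarray(predicted_values) - numpy.asarray(target_values))
    return errors ** per_alpha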

def make_target(self, game_history, state_index):
"""
The value target is the discounted root value of the search tree td_steps into the
future, plus the discounted sum of all rewards until then.
Generate targets for every unroll steps.
"""
target_values, target_rewards, target_policies, actions = [], [], [], []
for current_index in range(
state_index, state_index + self.config.num_unroll_steps + 1
):
# The value target is the discounted root value of the search tree td_steps into the
# future, plus the discounted sum of all rewards until then.
bootstrap_index = current_index + self.config.td_steps
if bootstrap_index < len(game_history.root_values):
value = (
@@ -123,12 +170,12 @@ def make_target(self, game_history, state_index):
value = 0

for i, reward in enumerate(
game_history.reward_history[current_index:bootstrap_index]
game_history.reward_history[current_index + 1 : bootstrap_index + 1]
):
value += (
reward
if game_history.to_play_history[current_index]
== game_history.to_play_history[current_index + i]
== game_history.to_play_history[current_index + 1 + i]
else -reward
) * self.config.discount ** i

@@ -138,8 +185,9 @@ def make_target(self, game_history, state_index):
target_policies.append(game_history.child_visits[current_index])
actions.append(game_history.action_history[current_index])
elif current_index == len(game_history.root_values):
target_values.append(value)
target_values.append(0)
target_rewards.append(game_history.reward_history[current_index])
# Uniform policy
target_policies.append(
[
1 / len(game_history.child_visits[0])
@@ -151,7 +199,7 @@ def make_target(self, game_history, state_index):
# States past the end of games are treated as absorbing states
target_values.append(0)
target_rewards.append(0)
# Uniform policy to give the tensor a valid dimension
# Uniform policy
target_policies.append(
[
1 / len(game_history.child_visits[0])
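
The reward-slice fix in make_target shifts the summed rewards by one step, so the target now matches the MuZero definition: the discounted search value td_steps ahead plus the discounted rewards observed after the current position. Below is a compact single-player restatement of that target with invented names; the sign flip for the opponent's rewards in two-player games is handled by the to_play comparison in the loop above.

def n_step_value_target(root_values, rewards, index, td_steps, discount):
    # Bootstrap from the stored search value td_steps ahead, if the game is long enough.
    bootstrap = index + td_steps
    value = root_values[bootstrap] * discount ** td_steps if bootstrap < len(root_values) else 0
    # Add the rewards r_{index+1} .. r_{bootstrap}, matching the corrected slice above.
    for i, reward in enumerate(rewards[index + 1 : bootstrap + 1]):
        value += reward * discount ** i
    return value
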
Binary file modified results/tictactoe/experiment1/model.weights