Change batch aggregation, fix value in replay buffer and prepare merge
werner-duvaud committed Mar 28, 2020
1 parent a38e2e8 commit f60f199
Showing 11 changed files with 275 additions and 77 deletions.
92 changes: 91 additions & 1 deletion .gitignore
@@ -1,3 +1,93 @@
# Editors
.vscode/
.idea/

# Mac/OSX
.DS_Store
__pycache__

# Windows
Thumbs.db

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Sphinx documentation
docs/_build/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json
7 changes: 6 additions & 1 deletion games/cartpole.py
@@ -26,7 +26,7 @@ def __init__(self):
self.num_simulations = 50 # Number of future moves self-simulated
self.discount = 0.997 # Chronological discount of the reward
self.temperature_threshold = 500 # Number of moves before dropping temperature to 0 (ie playing according to the max)
self.self_play_delay = 0 # Number of seconds to wait after each played game to adjust the self play / training ratio to avoid over/underfitting
self.self_play_delay = 0.5 # Number of seconds to wait after each played game to adjust the self play / training ratio to avoid over/underfitting

# Root prior exploration noise
self.root_dirichlet_alpha = 0.25
@@ -73,6 +73,11 @@ def __init__(self):
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay (See paper appendix Training)
self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.05 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
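
A note on the two new hyperparameters above: they follow the prioritized experience replay convention (Schaul et al., 2015), in which PER_alpha shapes the sampling distribution (0 recovers uniform sampling) and PER_beta sets the strength of the importance-sampling correction. The sketch below only illustrates that convention; per_pick is an invented name, and this is not the repository's code path (the actual sampling lives in replay_buffer.py further down in this commit).

import numpy

def per_pick(priorities, alpha=0.5, beta=1.0):
    # Sampling: P(i) proportional to p_i ** alpha (alpha = 0 gives the uniform case).
    priorities = numpy.asarray(priorities, dtype=float)
    probs = priorities ** alpha
    probs /= probs.sum()
    index = numpy.random.choice(len(probs), p=probs)
    # Bias correction: w_i = (N * P(i)) ** (-beta), normalized by the largest weight.
    weights = (len(probs) * probs) ** (-beta)
    return index, weights[index] / weights.max()

# Higher-priority entries are drawn more often but receive smaller weights.
index, weight = per_pick([0.1, 0.5, 2.0])
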
5 changes: 5 additions & 0 deletions games/connect4.py
@@ -73,6 +73,11 @@ def __init__(self):
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay (See paper appendix Training)
self.PER = False # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.1 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
5 changes: 5 additions & 0 deletions games/gomoku.py
@@ -74,6 +74,11 @@ def __init__(self):
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay (See paper appendix Training)
self.PER = False # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 0.9 # Set it to 1 to use a constant learning rate
6 changes: 3 additions & 3 deletions games/lunarlander.py
@@ -73,9 +73,9 @@ def __init__(self):
self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay
self.PER = True
self.PER_alpha = 0.5
# Prioritized Replay (See paper appendix Training)
self.PER = False # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
9 changes: 7 additions & 2 deletions games/tictactoe.py
@@ -67,12 +67,17 @@ def __init__(self):
self.window_size = 3000 # Number of self-play games to keep in the replay buffer
self.td_steps = 20 # Number of steps in the future to take into account for calculating the target value
self.training_delay = 0 # Number of seconds to wait after each training to adjust the self play / training ratio to avoid over/underfitting
self.value_loss_weight = 0.7 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.value_loss_weight = 0.25 # Scale the value loss to avoid overfitting of the value function, paper recommends 0.25 (See paper appendix Reanalyze)
self.training_device = "cuda" if torch.cuda.is_available() else "cpu" # Train on GPU if available

self.weight_decay = 1e-4 # L2 weights regularization
self.momentum = 0.9

# Prioritized Replay (See paper appendix Training)
self.PER = True # Select in priority the elements in the replay buffer which are unexpected for the network
self.PER_alpha = 0.5 # How much prioritization is used, 0 corresponding to the uniform case, paper suggests 1
self.PER_beta = 1.0

# Exponential learning rate schedule
self.lr_init = 0.01 # Initial learning rate
self.lr_decay_rate = 1 # Set it to 1 to use a constant learning rate
@@ -187,7 +192,7 @@ def print_action(self, action_number):
Returns:
String representing the action.
"""
return "Play column {}".format(action_number + 1)
return "Play column {}".format(action_number)


class TicTacToe:
2 changes: 1 addition & 1 deletion muzero.py
@@ -138,7 +138,7 @@ def train(self):
counter,
)
writer.add_scalar("2.Workers/4.Learning rate", infos["lr"], counter)
writer.add_scalar("3.Loss/1.Total loss", infos["total_loss"], counter)
writer.add_scalar("3.Loss/1.Total weighted loss", infos["total_loss"], counter)
writer.add_scalar("3.Loss/Value loss", infos["value_loss"], counter)
writer.add_scalar("3.Loss/Reward loss", infos["reward_loss"], counter)
writer.add_scalar("3.Loss/Policy loss", infos["policy_loss"], counter)
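
The scalar rename reflects that the logged total is now a weighted loss. The trainer itself is not part of this diff, so the following is only a plausible sketch of how such a total could be formed, assuming the per-sample PER weights returned by get_batch and the value_loss_weight setting from the game configs; the function name and signature are illustrative, not the repository's API.

import torch

def weighted_total_loss(
    value_loss: torch.Tensor,
    reward_loss: torch.Tensor,
    policy_loss: torch.Tensor,
    weights: torch.Tensor,
    value_loss_weight: float = 0.25,
) -> torch.Tensor:
    # value_loss, reward_loss, policy_loss: per-sample tensors of shape (batch,).
    # weights: PER importance-sampling weights of shape (batch,), max-normalized.
    per_sample = value_loss_weight * value_loss + reward_loss + policy_loss
    return (weights * per_sample).mean()
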
108 changes: 78 additions & 30 deletions replay_buffer.py
@@ -19,56 +19,87 @@ def save_game(self, game_history):
if len(self.buffer) > self.config.window_size:
self.buffer.pop(0)
self.game_priorities.pop(0)
game_history.priorities = numpy.ones(len(game_history.observation_history)) * self.max_recorded_game_priority
game_history.priorities = (
numpy.ones(len(game_history.observation_history))
* self.max_recorded_game_priority
)
self.buffer.append(game_history)
self.game_priorities.append(numpy.mean(game_history.priorities))

self.self_play_count += 1

def get_self_play_count(self):
return self.self_play_count

def get_batch(self):
index_batch, observation_batch, action_batch, reward_batch, value_batch, policy_batch = (
[],
[],
[],
[],
[],
[]
(
index_batch,
observation_batch,
action_batch,
reward_batch,
value_batch,
policy_batch,
weight_batch,
gradient_scale_batch,
) = ([], [], [], [], [], [], [], [])

total_samples = sum(
(len(game_history.priorities) for game_history in self.buffer)
)
total_samples = sum((len(game_history.priorities) for game_history in self.buffer))
weight_batch = []

for _ in range(self.config.batch_size):
game_index, game_history, game_prob = self.sample_game(self.buffer)
game_pos, pos_prob = self.sample_position(game_history)
index_batch.append([game_index, game_pos])
weight_batch.append((total_samples * game_prob * pos_prob) ** (-self.config.PER_beta))

values, rewards, policies, actions = self.make_target(
game_history, game_pos
)

index_batch.append([game_index, game_pos])
observation_batch.append(game_history.observation_history[game_pos])
action_batch.append(actions)
value_batch.append(values)
reward_batch.append(rewards)
policy_batch.append(policies)
weight_batch.append(
(total_samples * game_prob * pos_prob) ** (-self.config.PER_beta)
)
gradient_scale_batch.append(
[
min(
self.config.num_unroll_steps,
len(game_history.action_history) - game_pos,
)
]
* len(actions)
)

weight_batch = numpy.array(weight_batch) / max(weight_batch)

# observation_batch: batch, channels, height, width
# action_batch: batch, num_unroll_steps+1
# value_batch: batch, num_unroll_steps+1
# reward_batch: batch, num_unroll_steps+1
# policy_batch: batch, num_unroll_steps+1, len(action_space)
weight_batch = numpy.array(weight_batch) / max(weight_batch)
return index_batch, (weight_batch, observation_batch, action_batch, value_batch, reward_batch, policy_batch)
# weight_batch: batch
# gradient_scale_batch: batch, num_unroll_steps+1
return (
index_batch,
(
observation_batch,
action_batch,
value_batch,
reward_batch,
policy_batch,
weight_batch,
gradient_scale_batch,
),
)
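
Two things changed in the aggregation above: the importance weights moved from the front of the returned tuple to the slot after policy_batch, and a gradient_scale_batch entry was added, which presumably carries the number of real unrolled steps used for the paper's gradient scaling; any consumer of get_batch has to unpack the new order. The weight itself is the standard PER correction (total_samples * game_prob * pos_prob) ** (-PER_beta), later normalized by the largest weight in the batch. A small worked check of that formula with invented probabilities:

# Illustration only: the sample counts and probabilities below are made up.
total_samples, game_prob, pos_prob, per_beta = 1000, 0.01, 0.05, 1.0
weight = (total_samples * game_prob * pos_prob) ** (-per_beta)
assert abs(weight - 2.0) < 1e-9  # 1000 * 0.01 * 0.05 = 0.5, and 0.5 ** -1 = 2.0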

def sample_game(self, buffer):
"""
Sample game from buffer either uniformly or according to some priority.
See paper appendix Training.
"""
# TODO: sample with probability link to the highest difference between real and
# predicted value (See paper appendix Training)
game_probs = numpy.array(self.game_priorities) / sum(self.game_priorities)
game_index_candidates = numpy.arange(0, len(self.buffer), dtype=int)
game_index = numpy.random.choice(game_index_candidates, p=game_probs)
@@ -79,40 +79,56 @@ def sample_game(self, buffer):
def sample_position(self, game_history):
"""
Sample position from game either uniformly or according to some priority.
See paper appendix Training.
"""
# TODO: sample according to some priority
position_probs = numpy.array(game_history.priorities) / sum(game_history.priorities)
position_probs = numpy.array(game_history.priorities) / sum(
game_history.priorities
)
position_index_candidates = numpy.arange(0, len(position_probs), dtype=int)
position_index = numpy.random.choice(position_index_candidates, p=position_probs)
position_index = numpy.random.choice(
position_index_candidates, p=position_probs
)
position_prob = position_probs[position_index]

return position_index, position_prob

def update_priorities(self, priorities, index_info):

"""
Update game and position priorities with priorities calculated during the training.
See Distributed Prioritized Experience Replay https://arxiv.org/abs/1803.00933
"""
for i in range(len(index_info)):
game_index, game_pos = index_info[i]

# update position priorities
priority = priorities[i, :]
start_index = game_pos
end_index = min(game_pos + len(priority), len(self.buffer[game_index].priorities))
numpy.put(self.buffer[game_index].priorities, range(start_index, end_index), priority)
end_index = min(
game_pos + len(priority), len(self.buffer[game_index].priorities)
)
numpy.put(
self.buffer[game_index].priorities,
range(start_index, end_index),
priority,
)

# update game priorities
self.game_priorities[game_index] = numpy.max(self.buffer[game_index].priorities) # option: mean, sum, max
self.game_priorities[game_index] = numpy.max(
self.buffer[game_index].priorities
) # option: mean, sum, max

self.max_recorded_game_priority = numpy.max(self.game_priorities)
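
update_priorities expects one fresh priority per unrolled position, computed during training. How those priorities are produced is outside this diff; in the usual prioritized-replay setup they come from the absolute value error raised to PER_alpha. A sketch under that assumption, with an invented function name, not necessarily the repository's exact trainer code:

import numpy

def compute_priorities(predicted_values, target_values, per_alpha=0.5):
    # One priority per unrolled position, shaped (batch, num_unroll_steps + 1)
    # to match the row-wise indexing in update_priorities above.
    errors = numpy.abs(numpy.asarray(predicted_values) - numpy.asarray(target_values))
    return errors ** per_alpha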

def make_target(self, game_history, state_index):
"""
The value target is the discounted root value of the search tree td_steps into the
future, plus the discounted sum of all rewards until then.
Generate targets for every unroll steps.
"""
target_values, target_rewards, target_policies, actions = [], [], [], []
for current_index in range(
state_index, state_index + self.config.num_unroll_steps + 1
):
# The value target is the discounted root value of the search tree td_steps into the
# future, plus the discounted sum of all rewards until then.
bootstrap_index = current_index + self.config.td_steps
if bootstrap_index < len(game_history.root_values):
value = (
@@ -123,12 +170,12 @@ def make_target(self, game_history, state_index):
value = 0

for i, reward in enumerate(
game_history.reward_history[current_index:bootstrap_index]
game_history.reward_history[current_index + 1 : bootstrap_index + 1]
):
value += (
reward
if game_history.to_play_history[current_index]
== game_history.to_play_history[current_index + i]
== game_history.to_play_history[current_index + 1 + i]
else -reward
) * self.config.discount ** i

@@ -138,8 +185,9 @@ def make_target(self, game_history, state_index):
target_policies.append(game_history.child_visits[current_index])
actions.append(game_history.action_history[current_index])
elif current_index == len(game_history.root_values):
target_values.append(value)
target_values.append(0)
target_rewards.append(game_history.reward_history[current_index])
# Uniform policy
target_policies.append(
[
1 / len(game_history.child_visits[0])
@@ -151,7 +199,7 @@ def make_target(self, game_history, state_index):
# States past the end of games are treated as absorbing states
target_values.append(0)
target_rewards.append(0)
# Uniform policy to give the tensor a valid dimension
# Uniform policy
target_policies.append(
[
1 / len(game_history.child_visits[0])
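
The reward-slice fix in make_target shifts the summed rewards by one step, so the target now matches the MuZero definition: the discounted search value td_steps ahead plus the discounted rewards observed after the current position. Below is a compact single-player restatement of that target with invented names; the sign flip for the opponent's rewards in two-player games is handled by the to_play comparison in the loop above.

def n_step_value_target(root_values, rewards, index, td_steps, discount):
    # Bootstrap from the stored search value td_steps ahead, if the game is long enough.
    bootstrap = index + td_steps
    value = root_values[bootstrap] * discount ** td_steps if bootstrap < len(root_values) else 0
    # Add the rewards r_{index+1} .. r_{bootstrap}, matching the corrected slice above.
    for i, reward in enumerate(rewards[index + 1 : bootstrap + 1]):
        value += reward * discount ** i
    return value
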
Binary file modified results/tictactoe/experiment1/model.weights