Commit
Merge pull request google-deepmind#1046 from axelbr:master
PiperOrigin-RevId: 527941276 Change-Id: I0b3bde390aed66dffabef7962994c27ddb2f2421
Showing 9 changed files with 1,832 additions and 53 deletions.
```diff
@@ -59,3 +59,4 @@ open_spiel/cmake-build-debug/
 Package.resolved
 # Visual Studio generated files
 open_spiel/.vs
+/.env
```
Large diffs are not rendered by default.
@@ -0,0 +1,172 @@

```python
"""This module implements a generic environment for iterated normal form games. | ||
It does so wuth automatic vectorization. Along with the environment, it also | ||
provides pre-defined factory functions for common games such as the iterated | ||
prisoners dilemma and the iterated matching pennies. | ||
""" | ||
|
||
import numpy as np
from pyspiel import PlayerId

from open_spiel.python.rl_environment import Environment
from open_spiel.python.rl_environment import StepType
from open_spiel.python.rl_environment import TimeStep


class IteratedMatrixGame(Environment):
  """Environment for iterated normal form games.

  Supports automatic vectorization.
  """

  def __init__(
      self,
      payoff_matrix: np.ndarray,
      iterations: int,
      batch_size=1,
      include_remaining_iterations=True,
  ):
    # pylint: disable=super-init-not-called
    self._payoff_matrix = np.array(payoff_matrix, dtype=np.float32)
    self._iterations = iterations
    self._num_players = payoff_matrix.ndim - 1
    self._batch_size = batch_size
    self._include_remaining_iterations = include_remaining_iterations
    self._t = 0
    self._actions = np.arange(
        np.prod(self.action_spec()['num_actions'])
    ).reshape(*[payoff_matrix.shape[p] for p in range(self._num_players)])
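    # With two players and two actions each, self._actions is [[0, 1], [2, 3]]:
    # a lookup table from the players' individual actions to a flat
    # joint-action id, used by step() to build one-hot observations.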

  def one_hot(self, x, n):
    return np.eye(n)[x]

  @property
  def num_players(self):
    return self._num_players

  def observation_spec(self):
    info_state_spec, legal_actions_spec = [], []
    for i in range(self._num_players):
      # Info state length: one slot per joint action, plus one slot that
      # marks the initial (pre-action) state.
      num_actions = np.prod(self._payoff_matrix.shape[:-1]) + 1
      if self._include_remaining_iterations:
        num_actions += 1
      info_state_spec.append([num_actions])
      legal_actions_spec.append(self._payoff_matrix.shape[i])
    return {
        'info_state': tuple(info_state_spec),
        'legal_actions': tuple(legal_actions_spec),
        'current_player': (),
    }

  def action_spec(self):
    num_actions, mins, maxs = [], [], []
    for i in range(self._num_players):
      num_actions.append(self._payoff_matrix.shape[i])
      mins.append(0)
      maxs.append(self._payoff_matrix.shape[i] - 1)

    return {
        'num_actions': tuple(num_actions),
        'min': tuple(mins),
        'max': tuple(maxs),
        'dtype': int,
    }
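
  # For the 2x2 games below, action_spec()['num_actions'] is (2, 2), and
  # observation_spec() gives each player an info_state of length
  # 2 * 2 + 1 = 5 (or 6 when remaining iterations are included).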

  def step(self, actions: np.ndarray):
    if actions.ndim == 1:
      actions = actions[None, :]
    payoffs = self._payoff_matrix[tuple(actions.T)]
    s1 = self.one_hot(
        self._actions[tuple(actions.T)] + 1, n=np.max(self._actions) + 2
    )
    s2 = self.one_hot(
        self._actions[tuple(actions[..., ::-1].T)] + 1,
        n=np.max(self._actions) + 2,
    )
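    # The +1 shift reserves one-hot index 0 for the initial state produced by
    # reset(); s2 indexes the joint action with the action order reversed, so
    # the second player sees its own action first.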
    rewards = [
        np.squeeze(p)
        for p in np.split(
            payoffs, indices_or_sections=self._num_players, axis=1
        )
    ]
    discounts = [np.ones_like(r) for r in rewards]
    if self._t == self._iterations - 1:
      step_type = StepType.LAST
    else:
      step_type = StepType.MID
    self._t += 1
    remaining_iters = float(self._iterations - self._t) / self._iterations

    info_state = [s1, s2]
    if self._include_remaining_iterations:
      # Append the normalized remaining-iterations feature to each player's
      # observation.
      info_state = [
          np.concatenate(
              [s, np.full((self._batch_size, 1), fill_value=remaining_iters)],
              axis=-1,
          )
          for s in info_state
      ]

    legal_actions = self._get_legal_actions()
    return TimeStep(
        observations={
            'info_state': info_state,
            'legal_actions': legal_actions,
            'batch_size': actions.shape[0],
            'current_player': PlayerId.SIMULTANEOUS,
        },
        rewards=rewards,
        discounts=discounts,
        step_type=step_type,
    )

  def _get_legal_actions(self):
    legal_actions = []
    for p in range(self.num_players):
      actions = np.arange(self.action_spec()['num_actions'][p])
      legal_actions.append([actions] * self._batch_size)
    return np.array(legal_actions)

  def reset(self):
    self._t = 0
    info_state = np.zeros((
        self.num_players,
        self._batch_size,
        *self.observation_spec()['info_state'][0],
    ))
    info_state[..., 0] = 1.0
    if self._include_remaining_iterations:
      info_state[..., -1] = 1.0
    rewards = np.squeeze(np.zeros((self.num_players, self._batch_size)))
    discounts = np.squeeze(np.ones((self.num_players, self._batch_size)))
    return TimeStep(
        observations={
            'info_state': [
                np.squeeze(s).astype(np.float32) for s in info_state
            ],
            'legal_actions': self._get_legal_actions(),
            'batch_size': self._batch_size,
            'current_player': PlayerId.SIMULTANEOUS,
        },
        rewards=[np.squeeze(a).astype(np.float32) for a in rewards],
        discounts=[np.squeeze(a).astype(np.float32) for a in discounts],
        step_type=StepType.FIRST,
    )


def IteratedPrisonersDilemma(iterations: int, batch_size=1):
  # payoff_matrix[a0, a1] holds the pair (payoff to player 0, payoff to
  # player 1); e.g. mutual defection (1, 1) yields (-2, -2).
  return IteratedMatrixGame(
      payoff_matrix=np.array([[[-1, -1], [-3, 0]], [[0, -3], [-2, -2]]]),
      iterations=iterations,
      batch_size=batch_size,
      include_remaining_iterations=False,
  )


def IteratedMatchingPennies(iterations: int, batch_size=1):
  return IteratedMatrixGame(
      payoff_matrix=np.array([[[1, -1], [-1, 1]], [[-1, 1], [1, -1]]]),
      iterations=iterations,
      batch_size=batch_size,
      include_remaining_iterations=False,
  )
```
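
A minimal usage sketch of the new environment. The import path below is an assumption (this page does not show where the file lands in the tree); adjust it to the module's actual location.

```python
import numpy as np

# Assumed location of the new module; not confirmed by this diff.
from open_spiel.python.environments.iterated_matrix_game import (
    IteratedPrisonersDilemma,
)

env = IteratedPrisonersDilemma(iterations=5, batch_size=1)
timestep = env.reset()
while not timestep.last():
  # Both players defect (action 1). An unbatched call takes a [num_players]
  # action array; batched calls take [batch, num_players].
  timestep = env.step(np.array([1, 1]))
  print(timestep.rewards)  # Mutual defection: -2.0 for each player.
```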