
Commit

Merge branch 'deepmind:master' into master
axelbr authored Apr 27, 2023
2 parents 54316c8 + 4c96de2 commit 7abb175
Showing 30 changed files with 9,208 additions and 565 deletions.
8 changes: 5 additions & 3 deletions docs/developer_guide.md
@@ -73,9 +73,11 @@ ideal to first be aware of the general API (see `spiel.h`).
be clear from the game you copied from. If not, each API function that is
overridden will be fully documented in superclasses in `spiel.h`.
8. Run your code through a linter so it conforms to Google's
[style guides](https://google.github.io/styleguide/). For C++ and Python
use [cpplint](https://pypi.org/project/cpplint/). There is also
[YAPF](https://github.com/google/yapf/) for Python as well.
[style guides](https://google.github.io/styleguide/). For C++
use [cpplint](https://pypi.org/project/cpplint/). For Python, use
[pylint](https://pypi.org/project/pylint/) with the
[pylintrc from the Google style guide](https://google.github.io/styleguide/pyguide.html).
There is also [YAPF](https://github.com/google/yapf/) for Python.
9. Once done, rebuild and rerun the tests to ensure everything passes
(including your new game’s test!).
10. Add a playthrough file to catch regressions:
10 changes: 8 additions & 2 deletions docs/library.md
@@ -16,6 +16,12 @@ a shared library once, and then load it dynamically at runtime. This page walks
through how to do this assuming a bash shell on Linux, but is very similar on
MacOS or for other shells.

## Install Dependencies

The dependencies of OpenSpiel need to be installed before it can be used as a
library. On MacOS and Debian/Ubuntu Linux, this is often just a matter of running
`./install.sh`. Please see the [installation from source instructions](https://github.com/deepmind/open_spiel/blob/master/docs/install.md#installation-from-source) for more details.

## Compiling OpenSpiel as a Shared Library

To build OpenSpiel as a shared library, simply run:
@@ -49,8 +55,8 @@ do it every time you load the library. Of course, if you are already using
```
cd ../open_spiel/examples
clang++ -I${HOME}/open_spiel -I${HOME}/open_spiel/open_spiel/abseil-cpp \
-L${HOME}/open_spiel/build -lopen_spiel -std=c++17 \
-o shared_library_example shared_library_example.cc
-std=c++17 -o shared_library_example shared_library_example.cc \
-L${HOME}/open_spiel/build -lopen_spiel
```

The first two flags are the include directory paths and the third is the link
8 changes: 8 additions & 0 deletions open_spiel/algorithms/CMakeLists.txt
@@ -172,6 +172,14 @@ add_executable(tabular_exploitability_test tabular_exploitability_test.cc
$<TARGET_OBJECTS:algorithms> ${OPEN_SPIEL_OBJECTS})
add_test(tabular_exploitability_test tabular_exploitability_test)

add_executable(tabular_sarsa_test tabular_sarsa_test.cc
$<TARGET_OBJECTS:algorithms> ${OPEN_SPIEL_OBJECTS})
add_test(tabular_sarsa_test tabular_sarsa_test)

add_executable(tabular_q_learning_test tabular_q_learning_test.cc
$<TARGET_OBJECTS:algorithms> ${OPEN_SPIEL_OBJECTS})
add_test(tabular_q_learning_test tabular_q_learning_test)

add_executable(tensor_game_utils_test tensor_game_utils_test.cc
$<TARGET_OBJECTS:algorithms> ${OPEN_SPIEL_OBJECTS})
add_test(tensor_game_utils_test tensor_game_utils_test)
93 changes: 93 additions & 0 deletions open_spiel/algorithms/expected_returns.cc
@@ -218,6 +218,81 @@ std::vector<double> ExpectedReturnsImpl(
SPIEL_CHECK_EQ(values.size(), state.NumPlayers());
return values;
}

std::vector<double> ExpectedReturnsOfDeterministicPoliciesFromSeedsImpl(
const State& state,
const std::vector<int>& policy_seeds,
const std::vector<const Policy*>& policies) {
if (state.IsTerminal()) {
return state.Rewards();
}
const int num_players = state.NumPlayers();
std::vector<double> values(num_players, 0.0);
if (state.IsSimultaneousNode()) {
SpielFatalError("Simultaneous not implemented.");
} else if (state.IsChanceNode()) {
ActionsAndProbs actions_and_probs = state.ChanceOutcomes();
for (const auto& action_and_prob : actions_and_probs) {
if (action_and_prob.second <= 0.0) continue;
std::unique_ptr<const State> child = state.Child(action_and_prob.first);
const std::vector<double> child_values = (
ExpectedReturnsOfDeterministicPoliciesFromSeedsImpl(
*child, policy_seeds, policies));
for (auto p = Player{0}; p < num_players; ++p) {
values[p] += action_and_prob.second * child_values[p];
}
}
} else {
// Get information state string.
std::string info_state_string = state.InformationStateString();
const int player = state.CurrentPlayer();

// Search for policy in policies.
ActionsAndProbs actions_and_probs = {};
for (const auto& policy : policies) {
actions_and_probs = policy->GetStatePolicy(state);
if (!actions_and_probs.empty()) {
break;
}
}
if (!actions_and_probs.empty()) {
for (const auto& action_and_prob : actions_and_probs) {
if (action_and_prob.second <= 0.0) continue;
std::unique_ptr<const State> child = state.Child(action_and_prob.first);
const std::vector<double> child_values = (
ExpectedReturnsOfDeterministicPoliciesFromSeedsImpl(
*child, policy_seeds, policies));
for (auto p = Player{0}; p < num_players; ++p) {
values[p] += action_and_prob.second * child_values[p];
}
}
return values;
}

// Determine the state seed from the policy seed.
auto state_seed = std::hash<std::string>{}(info_state_string);
state_seed += policy_seeds[player];
state_seed += state.MoveNumber() * num_players;
state_seed += player;
std::mt19937 gen(state_seed);

const auto legal_actions = state.LegalActions();
std::uniform_int_distribution<int> dist(0, legal_actions.size() - 1);
const int sampled_action_index = dist(gen);
const Action action = legal_actions[sampled_action_index];

SPIEL_CHECK_GE(action, 0);
std::unique_ptr<const State> child = state.Child(action);
std::vector<double> child_values = (
ExpectedReturnsOfDeterministicPoliciesFromSeedsImpl(
*child, policy_seeds, policies));
for (auto p = Player{0}; p < num_players; ++p) {
values[p] += child_values[p];
}
}
SPIEL_CHECK_EQ(values.size(), state.NumPlayers());
return values;
}
} // namespace

std::vector<double> ExpectedReturns(const State& state,
@@ -267,5 +342,23 @@ std::vector<double> ExpectedReturns(const State& state,
}
}


std::vector<double> ExpectedReturnsOfDeterministicPoliciesFromSeeds(
const State& state, const std::vector<int>& policy_seeds) {
const std::vector<const Policy*>& policies = {};
SPIEL_CHECK_EQ(policy_seeds.size(), state.NumPlayers());
return ExpectedReturnsOfDeterministicPoliciesFromSeedsImpl(
state, policy_seeds, policies);
}

std::vector<double> ExpectedReturnsOfDeterministicPoliciesFromSeeds(
const State& state, const std::vector<int>& policy_seeds,
const std::vector<const Policy*>& policies) {
SPIEL_CHECK_EQ(policy_seeds.size(), state.NumPlayers());
return ExpectedReturnsOfDeterministicPoliciesFromSeedsImpl(
state, policy_seeds, policies);
}


} // namespace algorithms
} // namespace open_spiel
25 changes: 25 additions & 0 deletions open_spiel/algorithms/expected_returns.h
@@ -49,6 +49,31 @@ std::vector<double> ExpectedReturns(const State& state,
bool use_infostate_get_policy = true,
float prob_cut_threshold = 0.0);

// Computes the (undiscounted) expected returns from random deterministic
// policies, each specified by a seed. There should be one policy seed per
// player. Optionally, any number of policies can be provided; these override
// the seeded deterministic policies at any state where they define a policy.
//
// A deterministic policy is one that places all probability mass on a single
// action at each information state. We randomly generate a deterministic
// policy from a seed as follows:
// * Specify a policy seed for each player.
// * For each information state visited:
// - Calculate an integer hash of the information state string.
// - Add the move number.
// - Add the global seed of the corresponding player.
// - This results in a new seed per information state.
// - Using this seed, sample an action from a uniform integer distribution.
//
// This means that an entire policy can be represented cheaply with a single
// integer and allows computing expected returns of games whose tabular policies
// may not fit in memory.
std::vector<double> ExpectedReturnsOfDeterministicPoliciesFromSeeds(
const State& state, const std::vector<int> & policy_seeds);
std::vector<double> ExpectedReturnsOfDeterministicPoliciesFromSeeds(
const State& state, const std::vector<int> & policy_seeds,
const std::vector<const Policy*>& policies);

} // namespace algorithms
} // namespace open_spiel
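
For illustration, here is a minimal usage sketch of the seeded-policy evaluation added above. The game name, seeds, and `main` scaffolding are assumptions for the example, not part of this commit:

```
// Hypothetical example: evaluate one seeded deterministic policy per player.
#include <iostream>
#include <memory>
#include <vector>

#include "open_spiel/algorithms/expected_returns.h"
#include "open_spiel/spiel.h"

int main() {
  std::shared_ptr<const open_spiel::Game> game =
      open_spiel::LoadGame("kuhn_poker");
  std::unique_ptr<open_spiel::State> state = game->NewInitialState();

  // One integer seed per player; each seed implicitly defines that player's
  // action at every information state.
  std::vector<int> policy_seeds = {1234, 5678};
  std::vector<double> returns =
      open_spiel::algorithms::ExpectedReturnsOfDeterministicPoliciesFromSeeds(
          *state, policy_seeds);

  for (double r : returns) std::cout << r << " ";
  std::cout << std::endl;
  return 0;
}
```

Because each policy is just an integer, sweeping over seeds is a cheap way to sample deterministic policies in games whose tabular policies would not fit in memory.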

50 changes: 38 additions & 12 deletions open_spiel/algorithms/tabular_q_learning.cc
@@ -32,11 +32,12 @@ Action TabularQLearningSolver::GetBestAction(const State& state,
double min_utility) {
vector<Action> legal_actions = state.LegalActions();
SPIEL_CHECK_GT(legal_actions.size(), 0);
Action best_action = legal_actions[0];
const auto state_str = state.ToString();

Action best_action = legal_actions[0];
double value = min_utility;
for (const Action& action : legal_actions) {
double q_val = values_[{state.ToString(), action}];
double q_val = values_[{state_str, action}];
if (q_val >= value) {
value = q_val;
best_action = action;
@@ -54,19 +55,21 @@ double TabularQLearningSolver::GetBestActionValue(const State& state,
return values_[{state.ToString(), GetBestAction(state, min_utility)}];
}

Action TabularQLearningSolver::SampleActionFromEpsilonGreedyPolicy(
std::pair<Action, bool>
TabularQLearningSolver::SampleActionFromEpsilonGreedyPolicy(
const State& state, double min_utility) {
vector<Action> legal_actions = state.LegalActions();
if (legal_actions.empty()) {
return kInvalidAction;
return {kInvalidAction, false};
}

if (absl::Uniform(rng_, 0.0, 1.0) < epsilon_) {
// Choose a random action
return legal_actions[absl::Uniform<int>(rng_, 0, legal_actions.size())];
return {legal_actions[absl::Uniform<int>(rng_, 0, legal_actions.size())],
true};
}
// Choose the best action
return GetBestAction(state, min_utility);
return {GetBestAction(state, min_utility), false};
}

void TabularQLearningSolver::SampleUntilNextStateOrTerminal(State* state) {
@@ -84,8 +87,8 @@ TabularQLearningSolver::TabularQLearningSolver(std::shared_ptr<const Game> game)
learning_rate_(kDefaultLearningRate),
discount_factor_(kDefaultDiscountFactor),
lambda_(kDefaultLambda) {
// Only support lambda=0 for now.
SPIEL_CHECK_EQ(lambda_, 0);
SPIEL_CHECK_LE(lambda_, 1);
SPIEL_CHECK_GE(lambda_, 0);

// Currently only supports 1-player or 2-player zero sum games
SPIEL_CHECK_TRUE(game_->NumPlayers() == 1 || game_->NumPlayers() == 2);
@@ -109,8 +112,8 @@ TabularQLearningSolver::TabularQLearningSolver(
learning_rate_(learning_rate),
discount_factor_(discount_factor),
lambda_(lambda) {
// Only support lambda=0 for now.
SPIEL_CHECK_EQ(lambda_, 0);
SPIEL_CHECK_LE(lambda_, 1);
SPIEL_CHECK_GE(lambda_, 0);

// Currently only supports 1-player or 2-player zero sum games
SPIEL_CHECK_TRUE(game_->NumPlayers() == 1 || game_->NumPlayers() == 2);
@@ -140,7 +143,7 @@ void TabularQLearningSolver::RunIteration() {
const Player player = curr_state->CurrentPlayer();

// Sample action from the state using an epsilon-greedy policy
Action curr_action =
auto [curr_action, chosen_uniformly] =
SampleActionFromEpsilonGreedyPolicy(*curr_state, min_utility);

std::unique_ptr<State> next_state = curr_state->Child(curr_action);
@@ -158,7 +161,30 @@ double new_q_value = reward + discount_factor_ * next_q_value;
double new_q_value = reward + discount_factor_ * next_q_value;

double prev_q_val = values_[{key, curr_action}];
values_[{key, curr_action}] += learning_rate_ * (new_q_value - prev_q_val);
if (lambda_ == 0) {
// If lambda_ is zero, run plain Q-learning; there is no need to update
// eligibility traces.
values_[{key, curr_action}] +=
learning_rate_ * (new_q_value - prev_q_val);
} else {
double lambda =
player != next_state->CurrentPlayer() ? -lambda_ : lambda_;
eligibility_traces_[{key, curr_action}] += 1;

for (const auto& q_cell : values_) {
std::string state = q_cell.first.first;
Action action = q_cell.first.second;

values_[{state, action}] += learning_rate_ *
(new_q_value - prev_q_val) *
eligibility_traces_[{state, action}];
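// Watkins's Q(lambda): an exploratory (uniformly sampled) action cuts the
// trace to zero; otherwise the trace decays by discount_factor_ * lambda.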
if (chosen_uniformly) {
eligibility_traces_[{state, action}] = 0;
} else {
eligibility_traces_[{state, action}] *= discount_factor_ * lambda;
}
}
}

curr_state = std::move(next_state);
}
19 changes: 15 additions & 4 deletions open_spiel/algorithms/tabular_q_learning.h
@@ -34,7 +34,14 @@ namespace algorithms {
//
// Based on the implementation in Sutton and Barto, Intro to RL. Second Edition,
// 2018. Section 6.5.
// Note: current implementation only supports full bootstrapping (lambda = 0).
//
// Includes an implementation of Watkins’s Q(lambda), described in
// Sutton and Barto, Intro to RL, Second Edition, 2018, Section 12.10.
// (E.g. https://www.andrew.cmu.edu/course/10-703/textbook/BartoSutton.pdf)
// Eligibility traces are implemented with the "accumulate" method
// (+1 at each visit) rather than the "replace" method (which resets the
// trace to 1 instead of summing). The lambda_ parameter determines the
// degree of bootstrapping.

class TabularQLearningSolver {
static inline constexpr double kDefaultDepthLimit = -1;
@@ -63,9 +70,11 @@ class TabularQLearningSolver {
double GetBestActionValue(const State& state, double min_utility);

// Given a player and a state, gets the action, sampled from an epsilon-greedy
// policy
Action SampleActionFromEpsilonGreedyPolicy(const State& state,
double min_utility);
// policy. Returns <action, chosen_uniformly> where the second element
// indicates whether the action was chosen uniformly at random (which occurs
// with probability epsilon).
std::pair<Action, bool> SampleActionFromEpsilonGreedyPolicy(
const State& state, double min_utility);

// Moves a chance node to the next decision/terminal node by sampling from
// the legal actions repeatedly
@@ -79,6 +88,8 @@
double lambda_;
std::mt19937 rng_;
absl::flat_hash_map<std::pair<std::string, Action>, double> values_;
absl::flat_hash_map<std::pair<std::string, Action>, double>
eligibility_traces_;
};

} // namespace algorithms
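
For illustration, a rough usage sketch of the solver with these changes. The game name and iteration count are arbitrary choices for the example; only the single-argument constructor shown in this diff is used:

```
// Hypothetical example: train a tabular Q-learning agent via self-play.
#include <memory>

#include "open_spiel/algorithms/tabular_q_learning.h"
#include "open_spiel/spiel.h"

int main() {
  std::shared_ptr<const open_spiel::Game> game =
      open_spiel::LoadGame("tic_tac_toe");

  // The single-argument constructor uses default hyperparameters (including
  // the default lambda); the full constructor also takes the learning rate,
  // discount factor, and a non-zero lambda for Watkins's Q(lambda).
  open_spiel::algorithms::TabularQLearningSolver solver(game);
  for (int i = 0; i < 10000; ++i) {
    solver.RunIteration();
  }
  return 0;
}
```

With a non-zero lambda, each `RunIteration` call also updates the `eligibility_traces_` table introduced in this commit, using the accumulate-and-decay rule shown in `tabular_q_learning.cc`.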