Miscellaneous fixes for spring20 (yandexdataschool#392)
* [Week 1: CEM] Do not give away the answer for initialize_policy()

* [Week 1: CEM] Cosmetic

* [Week 3: Q-learning] Do 10000 training iterations by default

* [Week 3: Q-learning] Cosmetic

* [Week 5] Cosmetic

* [Week 6: REINFORCE] Cleanup JSON

* [Week 3] Moar cosmetic

* [Week 7: practice] Fix yandexdataschool#256

Co-authored-by: Michael Diskin <yhn1124@gmail.com>
dniku and yhn112 authored Apr 27, 2020
1 parent 8b9a76a commit 4a100d3
Showing 5 changed files with 53 additions and 60 deletions.
70 changes: 37 additions & 33 deletions week01_intro/crossentropy_method.ipynb
@@ -18,7 +18,6 @@
"import sys, os\n",
"if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n",
" !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n",
"\n",
" !touch .setup_complete\n",
"\n",
"# This code creates a virtual display to draw game images on.\n",
@@ -66,7 +65,7 @@
"\n",
"Since we still use integer state and action representations, you can use a 2-dimensional array to represent the policy.\n",
"\n",
"Please initialize policy __uniformly__, that is, probabililities of all actions should be equal.\n"
"Please initialize the policy __uniformly__, that is, probabililities of all actions should be equal."
]
},
{
@@ -75,7 +74,12 @@
"metadata": {},
"outputs": [],
"source": [
"policy = <YOUR CODE: create an array to store action probabilities>"
"def initialize_policy(n_states, n_actions):\n",
" <YOUR CODE: create an array to store action probabilities>\n",
" \n",
" return policy\n",
"\n",
"policy = initialize_policy(n_states, n_actions)"
]
},
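For reference, a minimal sketch of how the placeholder above might be filled, assuming `numpy` is imported as `np` elsewhere in the notebook (one possible solution, not the course's reference answer):

    def initialize_policy(n_states, n_actions):
        # Uniform policy: every action is equally likely in every state,
        # so each row of the (n_states, n_actions) matrix sums to 1.
        policy = np.full((n_states, n_actions), 1.0 / n_actions)
        return policy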
{
@@ -116,19 +120,21 @@
" s = env.reset()\n",
"\n",
" for t in range(t_max):\n",
"\n",
" a = <YOUR CODE: sample action from policy (hint: use np.random.choice)>\n",
" # Hint: you can use np.random.choice for sampling action\n",
" # https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html\n",
" a = <YOUR CODE: sample action from policy>\n",
"\n",
" new_s, r, done, info = env.step(a)\n",
"\n",
" # Record state, action and add up reward to states,actions and total_reward accordingly.\n",
" # Record information we just got from the environment.\n",
" states.append(s)\n",
" actions.append(a)\n",
" total_reward += r\n",
"\n",
" s = new_s\n",
" if done:\n",
" break\n",
"\n",
" return states, actions, total_reward"
]
},
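The sampling placeholder above could plausibly be filled like this (a sketch, assuming `policy` is the (n_states, n_actions) array from the previous step and `np` is numpy):

    # Draw one action index with probabilities given by the policy row for state s.
    a = np.random.choice(n_actions, p=policy[s])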
@@ -222,27 +228,24 @@
" 5, # game3\n",
"]\n",
"\n",
"test_result_0 = select_elites(\n",
" states_batch, actions_batch, rewards_batch, percentile=0)\n",
"test_result_30 = select_elites(\n",
" states_batch, actions_batch, rewards_batch, percentile=30)\n",
"test_result_90 = select_elites(\n",
" states_batch, actions_batch, rewards_batch, percentile=90)\n",
"test_result_100 = select_elites(\n",
" states_batch, actions_batch, rewards_batch, percentile=100)\n",
"test_result_0 = select_elites(states_batch, actions_batch, rewards_batch, percentile=0)\n",
"test_result_30 = select_elites(states_batch, actions_batch, rewards_batch, percentile=30)\n",
"test_result_90 = select_elites(states_batch, actions_batch, rewards_batch, percentile=90)\n",
"test_result_100 = select_elites(states_batch, actions_batch, rewards_batch, percentile=100)\n",
"\n",
"assert np.all(test_result_0[0] == [1, 2, 3, 4, 2, 0, 2, 3, 1]) \\\n",
" and np.all(test_result_0[1] == [0, 2, 4, 3, 2, 0, 1, 3, 3]),\\\n",
" and np.all(test_result_0[1] == [0, 2, 4, 3, 2, 0, 1, 3, 3]), \\\n",
" \"For percentile 0 you should return all states and actions in chronological order\"\n",
"assert np.all(test_result_30[0] == [4, 2, 0, 2, 3, 1]) and \\\n",
" np.all(test_result_30[1] == [3, 2, 0, 1, 3, 3]),\\\n",
" np.all(test_result_30[1] == [3, 2, 0, 1, 3, 3]), \\\n",
" \"For percentile 30 you should only select states/actions from two first\"\n",
"assert np.all(test_result_90[0] == [3, 1]) and \\\n",
" np.all(test_result_90[1] == [3, 3]),\\\n",
" np.all(test_result_90[1] == [3, 3]), \\\n",
" \"For percentile 90 you should only select states/actions from one game\"\n",
"assert np.all(test_result_100[0] == [3, 1]) and\\\n",
" np.all(test_result_100[1] == [3, 3]),\\\n",
" np.all(test_result_100[1] == [3, 3]), \\\n",
" \"Please make sure you use >=, not >. Also double-check how you compute percentile.\"\n",
"\n",
"print(\"Ok!\")"
]
},
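A select_elites sketch that would satisfy the assertions above, using `np.percentile` and a `>=` comparison (an assumed implementation, not the official solution):

    def select_elites(states_batch, actions_batch, rewards_batch, percentile):
        # Sessions whose total reward is >= the percentile threshold are elite.
        reward_threshold = np.percentile(rewards_batch, percentile)

        elite_states, elite_actions = [], []
        for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
            if reward >= reward_threshold:
                elite_states.extend(states)
                elite_actions.extend(actions)

        return elite_states, elite_actions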
@@ -286,18 +289,20 @@
"\n",
"new_policy = update_policy(elite_states, elite_actions)\n",
"\n",
"assert np.isfinite(new_policy).all(\n",
"), \"Your new policy contains NaNs or +-inf. Make sure you don't divide by zero.\"\n",
"assert np.all(\n",
" new_policy >= 0), \"Your new policy can't have negative action probabilities\"\n",
"assert np.allclose(new_policy.sum(\n",
" axis=-1), 1), \"Your new policy should be a valid probability distribution over actions\"\n",
"assert np.isfinite(new_policy).all(), \\\n",
" \"Your new policy contains NaNs or +-inf. Make sure you don't divide by zero.\"\n",
"assert np.all(new_policy >= 0), \\\n",
" \"Your new policy can't have negative action probabilities\"\n",
"assert np.allclose(new_policy.sum(axis=-1), 1), \\\n",
" \"Your new policy should be a valid probability distribution over actions\"\n",
"\n",
"reference_answer = np.array([\n",
" [1., 0., 0., 0., 0.],\n",
" [0.5, 0., 0., 0.5, 0.],\n",
" [0., 0.33333333, 0.66666667, 0., 0.],\n",
" [0., 0., 0., 0.5, 0.5]])\n",
"assert np.allclose(new_policy[:4, :5], reference_answer)\n",
"\n",
"print(\"Ok!\")"
]
},
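One way update_policy could look, consistent with the assertions and the reference answer above (a sketch that assumes the globals `n_states` and `n_actions`; unvisited states fall back to a uniform distribution so nothing divides by zero):

    def update_policy(elite_states, elite_actions):
        new_policy = np.zeros([n_states, n_actions])

        # Count how many times each action was taken in each elite state.
        for state, action in zip(elite_states, elite_actions):
            new_policy[state, action] += 1

        # Turn counts into probabilities; states never visited by the elites
        # get a uniform row instead of 0/0.
        for state in range(n_states):
            row_sum = new_policy[state].sum()
            if row_sum == 0:
                new_policy[state] = 1.0 / n_actions
            else:
                new_policy[state] /= row_sum

        return new_policy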
@@ -352,7 +357,7 @@
"outputs": [],
"source": [
"# reset policy just in case\n",
"policy = np.ones([n_states, n_actions]) / n_actions"
"policy = initialize_policy(n_states, n_actions)"
]
},
{
Expand All @@ -361,23 +366,22 @@
"metadata": {},
"outputs": [],
"source": [
"n_sessions = 250 # sample this many sessions\n",
"percentile = 50 # take this percent of session with highest rewards\n",
"learning_rate = 0.5 # add this thing to all counts for stability\n",
"n_sessions = 250 # sample this many sessions\n",
"percentile = 50 # take this percent of session with highest rewards\n",
"learning_rate = 0.5 # how quickly the policy is updated, on a scale from 0 to 1\n",
"\n",
"log = []\n",
"\n",
"for i in range(100):\n",
"\n",
" %time sessions = [ <YOUR CODE: generate a list of n_sessions new sessions> ]\n",
"\n",
" states_batch, actions_batch, rewards_batch = zip(*sessions)\n",
"\n",
" elite_states, elite_actions = <YOUR CODE: select elite states/actions>\n",
" elite_states, elite_actions = <YOUR CODE: select elite states & actions>\n",
"\n",
" new_policy = <YOUR CODE: compute new policy>\n",
"\n",
" policy = learning_rate*new_policy + (1-learning_rate)*policy\n",
" policy = learning_rate * new_policy + (1 - learning_rate) * policy\n",
"\n",
" # display results on chart\n",
" show_progress(rewards_batch, log, percentile)"
@@ -389,11 +393,11 @@
"source": [
"### Reflecting on results\n",
"\n",
"You may have noticed that the taxi problem quickly converges from <-1000 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n",
"You may have noticed that the taxi problem quickly converges from less than -1000 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n",
"\n",
"In case CEM failed to learn how to win from one distinct starting point, it will simply discard it because no sessions from that starting point will make it into the \"elites\".\n",
"\n",
"To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate strategy (theoretically correct way). You can first sample an action for every possible state and then evaluate this choice of actions by running _several_ games and averaging rewards."
"To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate strategy (theoretically correct way). For each starting state, you can sample an action randomly, and then evaluate this action by running _several_ games starting from it and averaging the total reward. Choosing elite sessions with this kind of sampling (where each session's reward is counted as the average of the rewards of all sessions with the same starting state and action) should improve the performance of your policy."
]
},
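One way to implement that suggestion, sketched under the assumption that `sessions` is the list of (states, actions, total_reward) tuples produced above: group sessions by their initial state and first action, then score each session by the group's average reward before selecting elites.

    from collections import defaultdict

    # Average rewards over sessions that share the same (initial state, first action),
    # so a single lucky rollout cannot dominate elite selection.
    grouped_rewards = defaultdict(list)
    for states, actions, reward in sessions:
        grouped_rewards[(states[0], actions[0])].append(reward)

    smoothed_rewards = [
        np.mean(grouped_rewards[(states[0], actions[0])])
        for states, actions, _ in sessions
    ]
    # Pass smoothed_rewards instead of rewards_batch to select_elites.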
{
23 changes: 12 additions & 11 deletions week03_model_free/seminar_qlearning.ipynb
@@ -271,12 +271,12 @@
"source": [
"# Binarized state spaces\n",
"\n",
"Use agent to train efficiently on CartPole-v0.\n",
"Use agent to train efficiently on `CartPole-v0`.\n",
"This environment has a continuous set of possible states, so you will have to group them into bins somehow.\n",
"\n",
"The simplest way is to use `round(x,n_digits)` (or numpy round) to round real number to a given amount of digits.\n",
"The simplest way is to use `round(x,n_digits)` (or `np.round`) to round a real number to a given amount of digits.\n",
"\n",
"The tricky part is to get the n_digits right for each state to train effectively.\n",
"The tricky part is to get the `n_digits` right for each state to train effectively.\n",
"\n",
"Note that you don't need to convert state to integers, but to __tuples__ of any kind of values."
]
@@ -346,9 +346,9 @@
"\n",
" def observation(self, state):\n",
"\n",
" # state = <round state to some amount digits.>\n",
" # hint: you can do that with round(x,n_digits)\n",
" # you will need to pick a different n_digits for each dimension\n",
" # you may pick a different n_digits for each dimension\n",
" state = <YOUR CODE: round state to some amount digits>\n",
"\n",
" return tuple(state)"
]
"metadata": {},
"outputs": [],
"source": [
"env = Binarizer(gym.make(\"CartPole-v0\"))"
"env = Binarizer(gym.make(\"CartPole-v0\").env)"
]
},
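A hedged sketch of the observation method above; the per-dimension digit counts are illustrative starting points to tune, not reference values:

    def observation(self, state):
        # CartPole's state is (position, velocity, angle, angular velocity);
        # coarser rounding for the first two and finer for the angles is one plausible choice.
        n_digits = (0, 1, 1, 2)
        state = [round(x, d) for x, d in zip(state, n_digits)]
        return tuple(state)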
{
@@ -394,11 +394,11 @@
"\n",
"Now let's train a policy that uses binarized state space.\n",
"\n",
"__Tips:__ \n",
"__Tips:__\n",
"* If your binarization is too coarse, your agent may fail to find optimal policy. In that case, change binarization. \n",
"* If your binarization is too fine-grained, your agent will take much longer than 1000 steps to converge. You can either increase number of iterations and decrease epsilon decay or change binarization.\n",
"* Having 10^3 ~ 10^4 distinct states is recommended (`len(QLearningAgent._qvalues)`), but not required.\n",
"* A reasonable agent should get to an average reward of >=50."
"* Having $10^3$–$10^4$ distinct states is recommended (`len(QLearningAgent._qvalues)`), but not required.\n",
"* A reasonable agent should get to an average reward of at least 50."
]
},
{
"outputs": [],
"source": [
"rewards = []\n",
"for i in range(1000):\n",
"for i in range(10000):\n",
" rewards.append(play_and_train(env, agent))\n",
"\n",
" # OPTIONAL YOUR CODE: adjust epsilon\n",
" # OPTIONAL: <YOUR CODE: adjust epsilon>\n",
"\n",
" if i % 100 == 0:\n",
" clear_output(True)\n",
" print('eps =', agent.epsilon, 'mean reward =', np.mean(rewards[-10:]))\n",
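For the optional epsilon adjustment in the loop above, a common hedged choice is a slow multiplicative decay (the 0.99 factor is an illustrative value, and `agent.epsilon` is assumed to be the agent's exploration rate):

    # Shrink exploration a little after every episode.
    agent.epsilon *= 0.99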
2 changes: 1 addition & 1 deletion week05_explore/week5.ipynb
@@ -41,7 +41,7 @@
"\n",
"import pandas\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
15 changes: 1 addition & 14 deletions week06_policy_based/reinforce_pytorch.ipynb
@@ -395,22 +395,9 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
3 changes: 2 additions & 1 deletion week07_seq2seq/practice_tf.ipynb
@@ -8,7 +8,8 @@
"source": [
"import sys, os\n",
"if 'google.colab' in sys.modules:\n",
" %tensorflow_version 1.x\n",
" # https://github.com/yandexdataschool/Practical_RL/issues/256\n",
" !pip install tensorflow-gpu==1.13.1\n",
" \n",
" if not os.path.exists('.setup_complete'):\n",
" !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n",
