Miscellaneous fixes for spring20 (yandexdataschool#392)
* [Week 1: CEM] Do not give away the answer for initialize_policy()

* [Week 1: CEM] Cosmetic

* [Week 3: Q-learning] Do 10000 training iterations by default

* [Week 3: Q-learning] Cosmetic

* [Week 5] Cosmetic

* [Week 6: REINFORCE] Cleanup JSON

* [Week 3] Moar cosmetic

* [Week 7: practice] Fix yandexdataschool#256

Co-authored-by: Michael Diskin <yhn1124@gmail.com>
dniku and yhn112 authored Apr 27, 2020
1 parent 8b9a76a commit 4a100d3
Showing 5 changed files with 53 additions and 60 deletions.
70 changes: 37 additions & 33 deletions week01_intro/crossentropy_method.ipynb
@@ -18,7 +18,6 @@
"import sys, os\n",
"if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n",
" !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n",
"\n",
" !touch .setup_complete\n",
"\n",
"# This code creates a virtual display to draw game images on.\n",
@@ -66,7 +65,7 @@
"\n",
"Since we still use integer state and action representations, you can use a 2-dimensional array to represent the policy.\n",
"\n",
"Please initialize policy __uniformly__, that is, probabililities of all actions should be equal.\n"
"Please initialize the policy __uniformly__, that is, probabililities of all actions should be equal."
]
},
{
@@ -75,7 +74,12 @@
"metadata": {},
"outputs": [],
"source": [
"policy = <YOUR CODE: create an array to store action probabilities>"
"def initialize_policy(n_states, n_actions):\n",
" <YOUR CODE: create an array to store action probabilities>\n",
" \n",
" return policy\n",
"\n",
"policy = initialize_policy(n_states, n_actions)"
]
},
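For reference, a minimal sketch of how the placeholder above might be filled, assuming `numpy` is imported as `np` elsewhere in the notebook (one possible solution, not the course's reference answer):

    def initialize_policy(n_states, n_actions):
        # Uniform policy: every action is equally likely in every state,
        # so each row of the (n_states, n_actions) matrix sums to 1.
        policy = np.full((n_states, n_actions), 1.0 / n_actions)
        return policy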
{
@@ -116,19 +120,21 @@
" s = env.reset()\n",
"\n",
" for t in range(t_max):\n",
"\n",
" a = <YOUR CODE: sample action from policy (hint: use np.random.choice)>\n",
" # Hint: you can use np.random.choice for sampling action\n",
" # https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html\n",
" a = <YOUR CODE: sample action from policy>\n",
"\n",
" new_s, r, done, info = env.step(a)\n",
"\n",
" # Record state, action and add up reward to states,actions and total_reward accordingly.\n",
" # Record information we just got from the environment.\n",
" states.append(s)\n",
" actions.append(a)\n",
" total_reward += r\n",
"\n",
" s = new_s\n",
" if done:\n",
" break\n",
"\n",
" return states, actions, total_reward"
]
},
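The sampling placeholder above could plausibly be filled like this (a sketch, assuming `policy` is the (n_states, n_actions) array from the previous step and `np` is numpy):

    # Draw one action index with probabilities given by the policy row for state s.
    a = np.random.choice(n_actions, p=policy[s])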
@@ -222,27 +228,24 @@
" 5, # game3\n",
"]\n",
"\n",
"test_result_0 = select_elites(\n",
" states_batch, actions_batch, rewards_batch, percentile=0)\n",
"test_result_30 = select_elites(\n",
" states_batch, actions_batch, rewards_batch, percentile=30)\n",
"test_result_90 = select_elites(\n",
" states_batch, actions_batch, rewards_batch, percentile=90)\n",
"test_result_100 = select_elites(\n",
" states_batch, actions_batch, rewards_batch, percentile=100)\n",
"test_result_0 = select_elites(states_batch, actions_batch, rewards_batch, percentile=0)\n",
"test_result_30 = select_elites(states_batch, actions_batch, rewards_batch, percentile=30)\n",
"test_result_90 = select_elites(states_batch, actions_batch, rewards_batch, percentile=90)\n",
"test_result_100 = select_elites(states_batch, actions_batch, rewards_batch, percentile=100)\n",
"\n",
"assert np.all(test_result_0[0] == [1, 2, 3, 4, 2, 0, 2, 3, 1]) \\\n",
" and np.all(test_result_0[1] == [0, 2, 4, 3, 2, 0, 1, 3, 3]),\\\n",
" and np.all(test_result_0[1] == [0, 2, 4, 3, 2, 0, 1, 3, 3]), \\\n",
" \"For percentile 0 you should return all states and actions in chronological order\"\n",
"assert np.all(test_result_30[0] == [4, 2, 0, 2, 3, 1]) and \\\n",
" np.all(test_result_30[1] == [3, 2, 0, 1, 3, 3]),\\\n",
" np.all(test_result_30[1] == [3, 2, 0, 1, 3, 3]), \\\n",
" \"For percentile 30 you should only select states/actions from two first\"\n",
"assert np.all(test_result_90[0] == [3, 1]) and \\\n",
" np.all(test_result_90[1] == [3, 3]),\\\n",
" np.all(test_result_90[1] == [3, 3]), \\\n",
" \"For percentile 90 you should only select states/actions from one game\"\n",
"assert np.all(test_result_100[0] == [3, 1]) and\\\n",
" np.all(test_result_100[1] == [3, 3]),\\\n",
" np.all(test_result_100[1] == [3, 3]), \\\n",
" \"Please make sure you use >=, not >. Also double-check how you compute percentile.\"\n",
"\n",
"print(\"Ok!\")"
]
},
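A select_elites sketch that would satisfy the assertions above, using `np.percentile` and a `>=` comparison (an assumed implementation, not the official solution):

    def select_elites(states_batch, actions_batch, rewards_batch, percentile):
        # Sessions whose total reward is >= the percentile threshold are elite.
        reward_threshold = np.percentile(rewards_batch, percentile)

        elite_states, elite_actions = [], []
        for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
            if reward >= reward_threshold:
                elite_states.extend(states)
                elite_actions.extend(actions)

        return elite_states, elite_actions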
@@ -286,18 +289,20 @@
"\n",
"new_policy = update_policy(elite_states, elite_actions)\n",
"\n",
"assert np.isfinite(new_policy).all(\n",
"), \"Your new policy contains NaNs or +-inf. Make sure you don't divide by zero.\"\n",
"assert np.all(\n",
" new_policy >= 0), \"Your new policy can't have negative action probabilities\"\n",
"assert np.allclose(new_policy.sum(\n",
" axis=-1), 1), \"Your new policy should be a valid probability distribution over actions\"\n",
"assert np.isfinite(new_policy).all(), \\\n",
" \"Your new policy contains NaNs or +-inf. Make sure you don't divide by zero.\"\n",
"assert np.all(new_policy >= 0), \\\n",
" \"Your new policy can't have negative action probabilities\"\n",
"assert np.allclose(new_policy.sum(axis=-1), 1), \\\n",
" \"Your new policy should be a valid probability distribution over actions\"\n",
"\n",
"reference_answer = np.array([\n",
" [1., 0., 0., 0., 0.],\n",
" [0.5, 0., 0., 0.5, 0.],\n",
" [0., 0.33333333, 0.66666667, 0., 0.],\n",
" [0., 0., 0., 0.5, 0.5]])\n",
"assert np.allclose(new_policy[:4, :5], reference_answer)\n",
"\n",
"print(\"Ok!\")"
]
},
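One way update_policy could look, consistent with the assertions and the reference answer above (a sketch that assumes the globals `n_states` and `n_actions`; unvisited states fall back to a uniform distribution so nothing divides by zero):

    def update_policy(elite_states, elite_actions):
        new_policy = np.zeros([n_states, n_actions])

        # Count how many times each action was taken in each elite state.
        for state, action in zip(elite_states, elite_actions):
            new_policy[state, action] += 1

        # Turn counts into probabilities; states never visited by the elites
        # get a uniform row instead of 0/0.
        for state in range(n_states):
            row_sum = new_policy[state].sum()
            if row_sum == 0:
                new_policy[state] = 1.0 / n_actions
            else:
                new_policy[state] /= row_sum

        return new_policy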
@@ -352,7 +357,7 @@
"outputs": [],
"source": [
"# reset policy just in case\n",
"policy = np.ones([n_states, n_actions]) / n_actions"
"policy = initialize_policy(n_states, n_actions)"
]
},
{
Expand All @@ -361,23 +366,22 @@
"metadata": {},
"outputs": [],
"source": [
"n_sessions = 250 # sample this many sessions\n",
"percentile = 50 # take this percent of session with highest rewards\n",
"learning_rate = 0.5 # add this thing to all counts for stability\n",
"n_sessions = 250 # sample this many sessions\n",
"percentile = 50 # take this percent of session with highest rewards\n",
"learning_rate = 0.5 # how quickly the policy is updated, on a scale from 0 to 1\n",
"\n",
"log = []\n",
"\n",
"for i in range(100):\n",
"\n",
" %time sessions = [ <YOUR CODE: generate a list of n_sessions new sessions> ]\n",
"\n",
" states_batch, actions_batch, rewards_batch = zip(*sessions)\n",
"\n",
" elite_states, elite_actions = <YOUR CODE: select elite states/actions>\n",
" elite_states, elite_actions = <YOUR CODE: select elite states & actions>\n",
"\n",
" new_policy = <YOUR CODE: compute new policy>\n",
"\n",
" policy = learning_rate*new_policy + (1-learning_rate)*policy\n",
" policy = learning_rate * new_policy + (1 - learning_rate) * policy\n",
"\n",
" # display results on chart\n",
" show_progress(rewards_batch, log, percentile)"
@@ -389,11 +393,11 @@
"source": [
"### Reflecting on results\n",
"\n",
"You may have noticed that the taxi problem quickly converges from <-1000 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n",
"You may have noticed that the taxi problem quickly converges from less than -1000 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n",
"\n",
"In case CEM failed to learn how to win from one distinct starting point, it will simply discard it because no sessions from that starting point will make it into the \"elites\".\n",
"\n",
"To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate strategy (theoretically correct way). You can first sample an action for every possible state and then evaluate this choice of actions by running _several_ games and averaging rewards."
"To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate strategy (theoretically correct way). For each starting state, you can sample an action randomly, and then evaluate this action by running _several_ games starting from it and averaging the total reward. Choosing elite sessions with this kind of sampling (where each session's reward is counted as the average of the rewards of all sessions with the same starting state and action) should improve the performance of your policy."
]
},
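One way to implement that suggestion, sketched under the assumption that `sessions` is the list of (states, actions, total_reward) tuples produced above: group sessions by their initial state and first action, then score each session by the group's average reward before selecting elites.

    from collections import defaultdict

    # Average rewards over sessions that share the same (initial state, first action),
    # so a single lucky rollout cannot dominate elite selection.
    grouped_rewards = defaultdict(list)
    for states, actions, reward in sessions:
        grouped_rewards[(states[0], actions[0])].append(reward)

    smoothed_rewards = [
        np.mean(grouped_rewards[(states[0], actions[0])])
        for states, actions, _ in sessions
    ]
    # Pass smoothed_rewards instead of rewards_batch to select_elites.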
{
23 changes: 12 additions & 11 deletions week03_model_free/seminar_qlearning.ipynb
@@ -271,12 +271,12 @@
"source": [
"# Binarized state spaces\n",
"\n",
"Use agent to train efficiently on CartPole-v0.\n",
"Use agent to train efficiently on `CartPole-v0`.\n",
"This environment has a continuous set of possible states, so you will have to group them into bins somehow.\n",
"\n",
"The simplest way is to use `round(x,n_digits)` (or numpy round) to round real number to a given amount of digits.\n",
"The simplest way is to use `round(x,n_digits)` (or `np.round`) to round a real number to a given amount of digits.\n",
"\n",
"The tricky part is to get the n_digits right for each state to train effectively.\n",
"The tricky part is to get the `n_digits` right for each state to train effectively.\n",
"\n",
"Note that you don't need to convert state to integers, but to __tuples__ of any kind of values."
]
@@ -346,9 +346,9 @@
"\n",
" def observation(self, state):\n",
"\n",
" # state = <round state to some amount digits.>\n",
" # hint: you can do that with round(x,n_digits)\n",
" # you will need to pick a different n_digits for each dimension\n",
" # you may pick a different n_digits for each dimension\n",
" state = <YOUR CODE: round state to some amount digits>\n",
"\n",
" return tuple(state)"
]
"metadata": {},
"outputs": [],
"source": [
"env = Binarizer(gym.make(\"CartPole-v0\"))"
"env = Binarizer(gym.make(\"CartPole-v0\").env)"
]
},
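A hedged sketch of the observation method above; the per-dimension digit counts are illustrative starting points to tune, not reference values:

    def observation(self, state):
        # CartPole's state is (position, velocity, angle, angular velocity);
        # coarser rounding for the first two and finer for the angles is one plausible choice.
        n_digits = (0, 1, 1, 2)
        state = [round(x, d) for x, d in zip(state, n_digits)]
        return tuple(state)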
{
@@ -394,11 +394,11 @@
"\n",
"Now let's train a policy that uses binarized state space.\n",
"\n",
"__Tips:__ \n",
"__Tips:__\n",
"* If your binarization is too coarse, your agent may fail to find optimal policy. In that case, change binarization. \n",
"* If your binarization is too fine-grained, your agent will take much longer than 1000 steps to converge. You can either increase number of iterations and decrease epsilon decay or change binarization.\n",
"* Having 10^3 ~ 10^4 distinct states is recommended (`len(QLearningAgent._qvalues)`), but not required.\n",
"* A reasonable agent should get to an average reward of >=50."
"* Having $10^3$–$10^4$ distinct states is recommended (`len(QLearningAgent._qvalues)`), but not required.\n",
"* A reasonable agent should get to an average reward of at least 50."
]
},
{
"outputs": [],
"source": [
"rewards = []\n",
"for i in range(1000):\n",
"for i in range(10000):\n",
" rewards.append(play_and_train(env, agent))\n",
"\n",
" # OPTIONAL YOUR CODE: adjust epsilon\n",
" # OPTIONAL: <YOUR CODE: adjust epsilon>\n",
"\n",
" if i % 100 == 0:\n",
" clear_output(True)\n",
" print('eps =', agent.epsilon, 'mean reward =', np.mean(rewards[-10:]))\n",
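For the optional epsilon adjustment in the loop above, a common hedged choice is a slow multiplicative decay (the 0.99 factor is an illustrative value, and `agent.epsilon` is assumed to be the agent's exploration rate):

    # Shrink exploration a little after every episode.
    agent.epsilon *= 0.99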
2 changes: 1 addition & 1 deletion week05_explore/week5.ipynb
@@ -41,7 +41,7 @@
"\n",
"import pandas\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
15 changes: 1 addition & 14 deletions week06_policy_based/reinforce_pytorch.ipynb
@@ -395,22 +395,9 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
3 changes: 2 additions & 1 deletion week07_seq2seq/practice_tf.ipynb
@@ -8,7 +8,8 @@
"source": [
"import sys, os\n",
"if 'google.colab' in sys.modules:\n",
" %tensorflow_version 1.x\n",
" # https://github.com/yandexdataschool/Practical_RL/issues/256\n",
" !pip install tensorflow-gpu==1.13.1\n",
" \n",
" if not os.path.exists('.setup_complete'):\n",
" !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n",
