'''Temporal-Difference (TD) methods for working with Markov Reward
Processes and Markov Decision Processes.
'''
import itertools
from typing import Callable, Iterable, Iterator, TypeVar, Tuple

from rl.function_approx import FunctionApprox

import rl.markov_process as mp
import rl.markov_decision_process as mdp

S = TypeVar('S')


def evaluate_mrp(
        transitions: Iterable[mp.TransitionStep[S]],
        approx_0: FunctionApprox[S],
        γ: float,
) -> Iterator[FunctionApprox[S]]:
    '''Evaluate an MRP with TD(0) using the given sequence of
    transitions.

    Each value this function yields represents the approximated value
    function for the MRP after an additional transition.

    Arguments:
      transitions -- a sequence of transitions from an MRP which don't
                     have to be in order or from the same simulation
      approx_0 -- initial approximation of the value function
      γ -- discount rate (0 < γ ≤ 1)

    '''
    def step(v, transition):
        # TD(0) update: move the estimate for transition.state towards the
        # bootstrapped target R + γ·V(S').
        return v.update([(transition.state,
                          transition.reward + γ * v(transition.next_state))])

    return itertools.accumulate(transitions, step, initial=approx_0)
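

# Usage sketch (illustrative): evaluating a tiny two-state MRP from replayed
# transitions. This assumes rl.function_approx.Tabular exists, can be
# constructed with no arguments, returns 0.0 for states it has not seen yet,
# and that mp.TransitionStep can be built directly from its state /
# next_state / reward fields -- adjust to the FunctionApprox and transition
# types you actually use.
def _demo_evaluate_mrp() -> None:
    from rl.function_approx import Tabular  # assumed tabular FunctionApprox

    # Hand-built experience: the transitions need not be ordered or come from
    # a single simulation trace.
    transitions = [
        mp.TransitionStep(state='A', next_state='B', reward=1.0),
        mp.TransitionStep(state='B', next_state='A', reward=0.0),
    ] * 1000

    # evaluate_mrp yields one refined approximation per transition consumed;
    # keep only the last one here.
    *_, final_v = evaluate_mrp(transitions, approx_0=Tabular(), γ=0.9)
    print(final_v('A'), final_v('B'))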


A = TypeVar('A')


# TODO: More specific name (e.g. experience replay?)
def evaluate_mdp(
        transitions: Iterable[mdp.TransitionStep[S, A]],
        actions: Callable[[S], Iterable[A]],
        approx_0: FunctionApprox[Tuple[S, A]],
        γ: float
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''Evaluate an MDP's Q function from the given set of experiences,
    using a Q-learning-style update: each estimate is bootstrapped with
    the maximum Q value over the actions available in the next state.

    Arguments:
      transitions -- a sequence of state, action, reward, state (S, A, R, S')
      actions -- a function returning the possible actions for a given state
      approx_0 -- initial approximation of the Q function
      γ -- discount rate (0 < γ ≤ 1)

    Returns:
      an iterator of approximations of the Q function based on the
      transitions given as input

    '''
    def step(q, transition):
        # Greedy bootstrap: assumes actions(transition.next_state) is
        # non-empty, i.e. the next state is not terminal.
        next_reward = max(
            q((transition.next_state, a))
            for a in actions(transition.next_state)
        )

        return q.update([
            ((transition.state, transition.action),
             transition.reward + γ * next_reward)
        ])

    return itertools.accumulate(transitions, step, initial=approx_0)
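

# Usage sketch (illustrative): learning Q values for a one-state, two-action
# MDP from replayed (S, A, R, S') experiences. As in the sketch above,
# rl.function_approx.Tabular and direct construction of mdp.TransitionStep
# from its state / action / next_state / reward fields are assumptions about
# the surrounding library.
def _demo_evaluate_mdp() -> None:
    from rl.function_approx import Tabular  # assumed tabular FunctionApprox

    experiences = [
        mdp.TransitionStep(state='s', action='left', next_state='s',
                           reward=0.0),
        mdp.TransitionStep(state='s', action='right', next_state='s',
                           reward=1.0),
    ] * 1000

    # Keep only the final Q-function approximation; the greedy policy it
    # implies should prefer 'right' in state 's'.
    *_, final_q = evaluate_mdp(
        experiences,
        actions=lambda s: ['left', 'right'],
        approx_0=Tabular(),
        γ=0.9
    )
    print(final_q(('s', 'left')), final_q(('s', 'right')))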