returns.py (forked from TikhonJelvis/RL-book)
import itertools
import math
from typing import Iterable, Iterator, TypeVar, overload

import rl.markov_process as mp
import rl.markov_decision_process as mdp

S = TypeVar('S')
A = TypeVar('A')


@overload
def returns(
        trace: Iterable[mp.TransitionStep[S]],
        γ: float,
        tolerance: float
) -> Iterator[mp.ReturnStep[S]]:
    ...


@overload
def returns(
        trace: Iterable[mdp.TransitionStep[S, A]],
        γ: float,
        tolerance: float
) -> Iterator[mdp.ReturnStep[S, A]]:
    ...


def returns(trace, γ, tolerance):
    '''Given an iterator of transition steps, calculate the return of
    the first N states.

    Arguments:
      trace -- transition steps, each carrying an instantaneous reward
      γ -- the discount factor (0 < γ ≤ 1)
      tolerance -- a small value; we stop iterating once γᵏ ≤ tolerance

    '''
    trace = iter(trace)

    # With γ < 1, discounting shrinks rewards geometrically, so beyond
    # max_steps the contribution of further rewards falls below tolerance.
    max_steps = round(math.log(tolerance) / math.log(γ)) if γ < 1 else None
    if max_steps is not None:
        # Keep an extra max_steps of trailing transitions so that even the
        # last of the first max_steps returns is accurate to the tolerance.
        trace = itertools.islice(trace, max_steps * 2)

    *transitions, last_transition = list(trace)

    # Accumulate returns backwards from the end of the (truncated) trace:
    # each step's return is its reward plus γ times the next step's return.
    return_steps = itertools.accumulate(
        reversed(transitions),
        func=lambda next_step, curr: curr.add_return(γ, next_step.return_),
        initial=last_transition.add_return(γ, 0)
    )
    return_steps = reversed(list(return_steps))

    if max_steps is not None:
        return_steps = itertools.islice(return_steps, max_steps)

    return return_steps
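

# A minimal usage sketch, not part of the original module. It assumes the
# RL-book classes NonTerminal, Terminal, and TransitionStep (a dataclass
# with fields state, next_state, reward) are importable from
# rl.markov_process; adjust the imports if your checkout differs.
if __name__ == '__main__':
    from rl.markov_process import NonTerminal, Terminal, TransitionStep

    # A three-step episode with reward 1.0 on every transition.
    episode = [
        TransitionStep(NonTerminal(0), NonTerminal(1), 1.0),
        TransitionStep(NonTerminal(1), NonTerminal(2), 1.0),
        TransitionStep(NonTerminal(2), Terminal(3), 1.0),
    ]

    # With γ = 0.9 the returns should come out as
    # 1 + 0.9·(1 + 0.9·1) = 2.71, then 1.9, then 1.0 for the final step.
    for step in returns(episode, γ=0.9, tolerance=1e-6):
        print(step.state, step.return_)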