-
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtd.py
98 lines (69 loc) · 2.4 KB
/
td.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from environment import Easy21
import utils
import numpy as np
import dill as pickle
# Easy21 game environment; the training loop further down drives it through
# env.initGame() and env.step(p, d, a).
env = Easy21()
# Constant of the exploration schedule epsilon = N0 / (N0 + NS(p, d)).
N0 = 100
# Action indices; they address the last axis of the Q and NSA tables.
# NOTE(review): which index means "hit" and which "stick" is decided by the
# environment module -- confirm there.
actions = [0, 1]
def reset():
    """Build a blank learning state.

    Returns:
        A tuple ``(Q, NSA, wins)`` where ``Q`` (state-action values) and
        ``NSA`` (state-action visit counts) are zero-filled arrays of shape
        ``(22, 11, len(actions))`` and ``wins`` is the integer 0.
    """
    tableShape = (22, 11, len(actions))
    valueTable = np.zeros(tableShape)   # state-action value
    visitTable = np.zeros(tableShape)   # state-action counter
    return valueTable, visitTable, 0
# Module-level learning state read by NS, alpha, epsilon and epsilonGreedy.
Q, NSA, wins = reset()

# Reference action values that the MSE computation further down compares Q
# against. The file is opened in a `with` block so the handle is closed as
# soon as the table has been read.
# NOTE(security): unpickling executes arbitrary code embedded in the file --
# only load a Q.dill that comes from a trusted source.
with open('Q.dill', 'rb') as trueQFile:
    trueQ = pickle.load(trueQFile)
def NS(p, d):
    """Return how often state (p, d) has been visited, summed over all actions."""
    return NSA[p, d].sum()


def alpha(p, d, a):
    """Return the step size 1 / NSA[p, d, a] for one state-action pair.

    Only meaningful once the pair has been counted at least once; the
    training loop increments NSA before it asks for the step size.
    """
    visits = NSA[p, d, a]
    return 1/visits


def epsilon(p, d):
    """Return the exploration probability N0 / (N0 + NS(p, d)) for state (p, d)."""
    stateVisits = NS(p, d)
    return N0 / (N0 + stateVisits)
def epsilonGreedy(p, d):
    """Choose an action for state (p, d) with an epsilon-greedy rule.

    With probability ``epsilon(p, d)`` a uniformly random entry of
    ``actions`` is returned; otherwise the index of the largest
    ``Q[p, d, a]`` over ``actions`` is returned.
    """
    # A single uniform draw decides between exploring and exploiting.
    exploring = np.random.random() < epsilon(p, d)
    if exploring:
        return np.random.choice(actions)

    # exploit: pick the action with the highest current value estimate
    actionValues = [Q[p, d, a] for a in actions]
    return np.argmax(actionValues)
# Experiment set-up: episodes played per lambda and the lambdas swept
# (0.0, 0.1, ..., 1.0).
episodes = int(1e4)
lmds = list(np.arange(0,11)/10)

# mselambdas[li, episode] holds the MSE after every episode of run li;
# finalMSE[li] holds the MSE at the end of run li.
mselambdas = np.zeros((len(lmds), episodes))
finalMSE = np.zeros(len(lmds))

for li, lmd in enumerate(lmds):

    # Every lambda starts from zeroed value and counter tables.
    Q, NSA, wins = reset()

    for episode in range(episodes):

        terminated = False
        E = np.zeros((22, 11, len(actions)))  # Eligibility Trace

        p, d = env.initGame()

        # initial state and first action
        a = epsilonGreedy(p, d)

        # Distinct state-action pairs visited in this episode, i.e. the only
        # entries whose eligibility trace can be non-zero. A set is used
        # because a pair can be visited more than once per episode; with a
        # list the revisited pair would receive the TD update and the trace
        # decay several times within a single step.
        SA = set()

        # Sample Environment
        while not terminated:

            pPrime, dPrime, r, terminated = env.step(p, d, a)

            # TD error; no discount factor is applied. The next action is
            # chosen here so the same action is the one actually taken next.
            if not terminated:
                aPrime = epsilonGreedy(pPrime, dPrime)
                tdError = r + Q[pPrime, dPrime, aPrime] - Q[p, d, a]
            else:
                tdError = r - Q[p, d, a]

            E[p, d, a] += 1
            NSA[p, d, a] += 1
            # int() makes the key hashable and comparable whatever integer
            # type the environment / numpy returned for the indices.
            SA.add((int(p), int(d), int(a)))

            # Update every traced pair exactly once, then decay its trace.
            for (_p, _d, _a) in SA:
                Q[_p, _d, _a] += alpha(_p, _d, _a) * tdError * E[_p, _d, _a]
                E[_p, _d, _a] *= lmd

            if not terminated:
                p, d, a = pPrime, dPrime, aPrime

        # bookkeeping: r is the reward of the episode's final step
        if r == 1:
            wins += 1

        # NOTE(review): the divisor 21*10*2 is smaller than the 22*11*2 table
        # size -- presumably index 0 on the first two axes is unused; confirm.
        mse = np.sum(np.square(Q-trueQ)) / (21*10*2)
        mselambdas[li, episode] = mse

        if episode % 1000 == 0 or episode+1==episodes:
            print("Lambda=%.1f Episode %06d, MSE %5.3f, Wins %.3f"%(lmd, episode, mse, wins/(episode+1)))

    # Summary for this lambda once all of its episodes are done.
    finalMSE[li] = mse
    print("Lambda=%.1f Episode %06d, MSE %5.3f, Wins %.3f"%(lmd, episode, mse, wins/(episode+1)))
    print("--------")

# Hand the collected errors to the plotting helpers in utils.
utils.plotMseLambdas(finalMSE, lmds)
utils.plotMseEpisodesLambdas(mselambdas)