# train.py
import os
import random
from collections import deque

import numpy as np
from skimage import color, transform
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from keras.utils import plot_model

from MiniSnake import game
from config import num_of_cols, num_of_rows, num_of_hidden_layer_neurons, img_channels, num_of_actions, \
    batch_size, epsilon, observe, gamma, action_array, reward_on_eat, reward_in_env, death_reward, \
    timesteps_to_save_weights, exp_replay_memory

game_engine = game()
D = deque()  # experience replay memory
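# Roles of the config values as used in this script (the numeric defaults live in config.py):
#   num_of_cols, num_of_rows, img_channels  -> shape of the stacked grayscale input frames
#   observe                                 -> timesteps of random play before training starts
#   epsilon                                 -> probability of taking a random (exploratory) action
#   gamma                                   -> discount factor in the Q-learning backup
#   batch_size, exp_replay_memory           -> minibatch size and replay-memory capacity
#   timesteps_to_save_weights               -> checkpoint interval for weights.hdf5
#   death_reward                            -> target value assigned to terminal transitions
#   reward_on_eat, reward_in_env            -> presumably consumed inside MiniSnake; not used here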
# Q-network: two conv/pool blocks followed by a fully connected layer,
# outputting one Q-value per action.
model = Sequential()
# Convolve 32 filters of size 8x8 with stride 4
model.add(Conv2D(32, kernel_size=(8, 8), strides=(4, 4), activation='relu',
                 input_shape=(num_of_cols, num_of_rows, img_channels)))
model.add(MaxPooling2D(pool_size=(4, 4), strides=(2, 2), padding='same'))
model.add(Conv2D(64, kernel_size=(4, 4), strides=(2, 2), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding='same'))
model.add(Flatten())
model.add(Dense(num_of_hidden_layer_neurons, activation='relu'))
model.add(Dense(num_of_actions))
model.compile(loss='mse', optimizer='adam')
#plot_model(model, to_file='model.png')

# Resume from previously saved weights if a checkpoint exists
if os.path.exists("weights.hdf5"):
    model.load_weights("weights.hdf5")
# Obtain the starting state: take a no-op action and grab the first frame
r_0, s_t, _ = game_engine.play(0)
s_t = color.rgb2gray(s_t)
s_t = transform.resize(s_t, (num_of_cols, num_of_rows))
# Stack the same frame 4 times so the network always sees img_channels frames
s_t = np.stack((s_t, s_t, s_t, s_t), axis=2)
# Keras expects a leading batch dimension: 1 x num_of_cols x num_of_rows x 4
s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])
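# The 4-frame stack gives the otherwise static convnet a sense of motion (direction and
# speed of the snake); at the very start the same frame is simply repeated 4 times.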
t = 0
while True:
    explored = False
    loss = 0          # training loss accumulated this timestep
    Q_sa = 0          # Q-values predicted for the sampled next states
    action_index = 0  # index of the chosen action
    r_t = 0           # reward received this timestep

    # Select an action: random while observing, epsilon-greedy once training starts
    if t < observe:
        action_index = 1 if random.random() < 0.5 else 0
    else:
        if random.random() <= epsilon:
            action_index = random.randint(0, num_of_actions - 1)  # explore: random action
            explored = True
        else:
            q = model.predict(s_t)       # feed the stack of 4 frames, get Q-value predictions
            action_index = np.argmax(q)  # exploit: action with the highest predicted Q-value
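    # Note: epsilon is read from config and stays fixed for the whole run. A common DQN
    # variant anneals it towards a small floor instead, e.g. (sketch only, hypothetical
    # names, not part of this script):
    #   epsilon = max(final_epsilon, epsilon - (initial_epsilon - final_epsilon) / anneal_steps)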
    # Act in the environment: observe reward, next frame, and terminal flag
    r_t, s_t1, terminal = game_engine.play(action_array[action_index])

    # Preprocess the transitioned frame and stack it with the 3 most recent frames
    s_t1 = color.rgb2gray(s_t1)
    s_t1 = transform.resize(s_t1, (num_of_cols, num_of_rows))
    s_t1 = s_t1.reshape(1, s_t1.shape[0], s_t1.shape[1], 1)  # 1 x num_of_cols x num_of_rows x 1
    s_t1 = np.append(s_t1, s_t[:, :, :, :3], axis=3)         # newest frame first, drop the oldest

    # Store the transition in the experience replay memory, bounded by exp_replay_memory
    D.append((s_t, action_index, r_t, s_t1, terminal))
    if len(D) > exp_replay_memory:
        D.popleft()
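    # Training samples are drawn from this replay memory rather than from consecutive
    # frames, which breaks the strong temporal correlation between successive states
    # and makes the gradient updates behave more like i.i.d. supervised learning.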
    if t > observe:
        # Sample a random minibatch of transitions from the replay memory
        random_minibatch = random.sample(list(D), batch_size)
        inputs = np.zeros((batch_size, s_t.shape[1], s_t.shape[2], s_t.shape[3]))  # batch_size x cols x rows x 4
        targets = np.zeros((inputs.shape[0], num_of_actions))                      # batch_size x num_of_actions
        for i in range(0, len(random_minibatch)):
            state_t = random_minibatch[i][0]
            action_t = random_minibatch[i][1]
            reward_t = random_minibatch[i][2]
            state_t1 = random_minibatch[i][3]
            terminal = random_minibatch[i][4]
            # Fill the inputs/targets with the sampled states and the network's current predictions
            inputs[i:i + 1] = state_t
            targets[i] = model.predict(state_t)
            Q_sa = model.predict(state_t1)
            # Relabel the target for the action actually taken (Q-learning backup)
            if terminal:
                targets[i, action_t] = death_reward
            else:
                targets[i, action_t] = reward_t + gamma * np.max(Q_sa)
        # Train on the relabelled batch; keep the loss for logging
        loss += model.train_on_batch(inputs, targets)
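    # The relabelled targets above implement the one-step Q-learning backup
    #   Q(s_t, a_t) <- r_t + gamma * max_a' Q(s_t1, a')
    # with the target clamped to death_reward when the transition ended the episode.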
    # Advance to the next state and periodically checkpoint the weights
    s_t = s_t1
    t += 1
    if t % timesteps_to_save_weights == 0:
        model.save_weights('weights.hdf5', overwrite=True)
    print("Timestep: %d, Action: %d, Reward: %.2f, Q: %.2f, Loss: %.2f, Explored: %s"
          % (t, action_index, r_t, np.max(Q_sa), loss, explored))