diff --git a/src/model.py b/src/model.py
index dd35827..e11d0eb 100644
--- a/src/model.py
+++ b/src/model.py
@@ -37,18 +37,19 @@ def prepare_action(action_holder, num_of_action=NUM_OF_ACTION):
 
 def action_score_to_action(action_score, epoch, epsilon_start, epsilon_end, epsilon_end_epoch):
     action_to_take = tf.argmax(action_score, axis=1)
-    random_action = tf.cast(tf.random_uniform(shape=[None], minval=0.0, maxval=4.0)), tf.int32)
-    epsilon = tf.where(epoch < epsilon_end_epoch, (((epsilon_end - epsilon_start) / epsilon_end_epoch) * epoch + 1), epsilon_end)
+    random_action = tf.cast(tf.random_uniform(shape=[1], minval=0.0, maxval=4.0), tf.int64)
+    # Anneal epsilon linearly from epsilon_start down to epsilon_end, then hold.
+    epsilon_annealed = (((epsilon_end - epsilon_start) / epsilon_end_epoch) * epoch + epsilon_start)
+    epsilon = tf.where(epoch < epsilon_end_epoch, epsilon_annealed, epsilon_end)
 
-    return tf.where(tf.random_uniform(shape=[None]) > epsilon, random_action, action_to_take)
+    # Take a random action with probability epsilon, otherwise act greedily.
+    return tf.where(tf.random_uniform(shape=[1]) < epsilon, random_action, action_to_take)
 
 def q_predicted_reward(action_score):
     return tf.reduce_max(action_score, axis=1)
 
-def q_future_reward(action_score, action_holder, terminal_holder, discount):
-    action_one_hot = model.prepare_action(action_holder)
+def q_future_reward(action_score, action_holder, reward_holder, terminal_holder, discount):
+    action_one_hot = prepare_action(action_holder)
     q_predicted = tf.reduce_sum(action_score * action_one_hot, axis=1)
-    return reward_input + (1.0 - terminal_mask) * discount * q_target
+    return reward_holder + (1.0 - terminal_holder) * discount * q_predicted
 
 def loss(q_predicted_reward, q_truth_reward):
     return tf.losses.huber_loss(q_predicted_reward, q_truth_reward)
diff --git a/src/train.py b/src/train.py
index 14b6237..3acfb60 100644
--- a/src/train.py
+++ b/src/train.py
@@ -3,7 +3,7 @@
 import tensorflow as tf
 import numpy as np
 from collections import deque
-
+from operator import itemgetter
 
 tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/inception_train',
                            """Directory where to read training checkpoints.""")
@@ -17,13 +17,13 @@
-tf.app.flags.DEFINE_float("EPSILON_START", 1, "Starting value for probability of greediness.")
-tf.app.flags.DEFINE_float("EPSILON_END", 0.1, "Ending value for probability of greediness.")
+tf.app.flags.DEFINE_float("EPSILON_START", 1, "Starting probability of taking a random action.")
+tf.app.flags.DEFINE_float("EPSILON_END", 0.1, "Final probability of taking a random action.")
-tf.app.flags.DEFINE_float("EPSILON_END_EPOCH", 1000000, "Ending epoch to anneal epsilon.")
+tf.app.flags.DEFINE_float("EPSILON_END_EPOCH", 100, "Ending epoch to anneal epsilon.")
 tf.app.flags.DEFINE_float("DISCOUNT", 0.99, "Amount to discount future rewards.")
 tf.app.flags.DEFINE_integer("BURANOUT", 4, "Number of frames to play before training a batch.")
 tf.app.flags.DEFINE_integer("FRAME_PER_STATE", 4, "Number of frames of past history to model game state.")
 tf.app.flags.DEFINE_integer("ACTION_SPACE", 4, "Number of possible output actions.")
-tf.app.flags.DEFINE_integer("REPLAY_MEMORY_LENGTH", 500000, "Number of historical experiences to store.")
+tf.app.flags.DEFINE_integer("REPLAY_MEMORY_LENGTH", 50, "Number of historical experiences to store.")
 tf.app.flags.DEFINE_integer("MIN_REPLAY_MEMORY_LENGTH", 50000, "Minimum number of experiences to start training.")
 tf.app.flags.DEFINE_integer("BATCH_SIZE", 32, "Size of mini-batch.")
 tf.app.flags.DEFINE_integer("TARGET_NETWORK_UPDATE_FREQUENCY", 10000, "Rate at which to update the target network.")
@@ -35,44 +35,61 @@ def epsilon(epoch):
     return (((FLAGS.EPSILON_END - FLAGS.EPSILON_START) / FLAGS.EPSILON_END_EPOCH) * epoch + 1) if epoch < FLAGS.EPSILON_END_EPOCH else FLAGS.EPSILON_END
 
-def get_frame_buffer(frame, max_len=4, fill=True):
-    frame_buffer = deque(max_len=max_len)
+def get_frame_buffer(maxlen=4):
+    frame_buffer = deque(maxlen=maxlen)
 
     def append(frame):
-        return frame_buffer.append(frame)
+        frame_buffer.append(frame)
+        return frame_buffer
 
-    if fill:
-        for i in xrange(max_len):
-            append(frame)
+    return append
 
-    return frame_buffer, append
+def sample(history, batch_size):
+    # Draw a random mini-batch and split it into per-field columns.
+    history_size = len(history)
+    index = np.random.randint(0, history_size, batch_size)
+    sampled_memory = [history[i] for i in index]
+    return (
+        map(itemgetter(0), sampled_memory),
+        map(itemgetter(1), sampled_memory),
+        map(itemgetter(2), sampled_memory),
+        map(itemgetter(3), sampled_memory),
+        map(itemgetter(4), sampled_memory)
+    )
 
 def main(_):
-    input_images = tf.placeholder(tf.float32, shape=[None, 210, 160, 3])
-    action_holder = tf.placeholder(tf.int32, shape=[None])
-    reward_input = tf.placeholder(tf.int32, shape=[None])
-    terminal_holder = tf.placeholder(tf.float32, shape=[None])
-    epoch = tf.placeholde(tf.float32, shape=[1])
-
-    with tf.Graph().as_default() as g:
+    graph = tf.Graph()
+    with graph.as_default():
+        input_images = tf.placeholder_with_default(tf.zeros([1, 210, 160, 3], tf.float32), shape=[None, 210, 160, 3], name='input_images')
+        action_holder = tf.placeholder(tf.int32, shape=[None], name='action_holder')
+        reward_holder = tf.placeholder(tf.float32, shape=[None], name='reward_holder')
+        terminal_holder = tf.placeholder(tf.float32, shape=[None], name='terminal_holder')
+        epoch = tf.placeholder(tf.float32, shape=[1], name='epoch')
 
         # util for play
         input_state = model.frames_to_state(input_images)
-        input_states = tf.expand_dims(input_states, 0)
+        _input_states = tf.expand_dims(input_state, 0)
+
+        # Adding a zero-defaulted placeholder lets the same graph score either the
+        # single on-policy state or a fed batch of replay states.
+        input_states = (
+            _input_states +
+            tf.placeholder_with_default(tf.zeros_like(_input_states), shape=[None, 80, 80, 4], name='batch_states')
+        )
 
         action_score = model.q_function(input_states)
-        action_to_take = model.action_score_to_action(action_score)
+        epsilon_end = tf.constant([FLAGS.EPSILON_END])
+        action_to_take = model.action_score_to_action(action_score, epoch, epsilon_start=FLAGS.EPSILON_START, epsilon_end=epsilon_end, epsilon_end_epoch=FLAGS.EPSILON_END_EPOCH)
 
         # util for train
-        # the reason we expose future_reward is that we are using an old theta to calculate them
+        # q_future_reward is exposed so the targets can be computed with an old theta snapshot
         q_future_reward = model.q_future_reward(action_score,
                                                 action_holder,
+                                                reward_holder,
                                                 terminal_holder,
                                                 discount=FLAGS.DISCOUNT)
         q_predicted_reward = model.q_predicted_reward(action_score)
-        loss = model.loss(q_predicted_reward, q_truth_reward)
+        loss = model.loss(q_predicted_reward, q_future_reward)
 
         trainer = tf.train.RMSPropOptimizer(
             learning_rate=0.00025,
 
@@ -83,42 +100,49 @@ def main(_):
         theta = tf.trainable_variables()
 
-    with tf.Session() as sess, g.as_default():
+    with tf.Session(graph=graph) as sess:
         game_env = gym.make('BreakoutNoFrameskip-v4')
         observe = game_env.reset()
 
         sess.run(tf.global_variables_initializer())
 
-        frames, append_frame = get_frame_buffer(observe)
-
-        prev_observe_state, action = sess.run([input_state, action], {input_images: frames})
+        append_frame = get_frame_buffer()
+        for i in xrange(FLAGS.FRAME_PER_STATE):
+            frames = append_frame(observe)
+        prev_observe_state, action = sess.run([input_state, action_to_take], {input_images: frames, epoch: [0]})
 
         for episode in xrange(FLAGS.NUM_OF_EPISODE):
             history = []
             for step in xrange(FLAGS.REPLAY_MEMORY_LENGTH):
-                obs_frame, reward, finished_episode, info = game_env.step(next_action)
+                # action is a length-1 array; the env expects a scalar action
+                obs_frame, reward, finished_episode, info = game_env.step(action[0])
                 frames = append_frame(obs_frame)
-                observe_state, next_action = sess.run([input_state, action], {input_images: frames})
-                history.append([observe_state, reward, action, float(finished_episode), prev_observe_state])
+                observe_state, next_action = sess.run([input_state, action_to_take], {input_images: frames, epoch: [episode]})
+                # Store (state, reward, action, terminal, next_state) so the columns
+                # returned by sample() line up with the unpacking below.
+                history.append([prev_observe_state, reward, action[0], float(finished_episode), observe_state])
                 prev_observe_state = observe_state
                 action = next_action
 
-            for step in xrange(FLAGS.NUM_OF_STEP):
-                states, rewards, actions, terminals, next_states = sample(history, batch_size)
+            states, rewards, actions, terminals, next_states = sample(history, FLAGS.BATCH_SIZE)
 
-                if step % FLAGS.TARGET_NETWORK_UPDATE_FREQUENCY == 0:
-                    theta_data = sess.run(theta)
+            if episode % FLAGS.TARGET_NETWORK_UPDATE_FREQUENCY == 0:
+                theta_data = sess.run(theta)
 
             feed_dict = dict(zip(theta, theta_data))
-            feed_dict.update({input_state:states})
+            # Targets are computed on the next states, with the stale theta snapshot fed in.
+            feed_dict.update({
+                input_states: next_states,
+                terminal_holder: terminals,
+                action_holder: actions,
+                reward_holder: rewards,
+            })
 
             q_future_reward_data = sess.run(q_future_reward, feed_dict=feed_dict)
 
-            sess.run(model_update, feed_dict={q_future_reward: q_future_reward_data,
-                                              input_state: next_states,
-                                              action_holder: actions,
-                                              reward_input: reward,
-                                              terminal_holder: terminal})
+            # The update runs on the current states; feeding the precomputed targets
+            # means no gradient flows through the target-network pass.
+            sess.run(model_update, feed_dict={
+                q_future_reward: q_future_reward_data,
+                input_states: states,
+                terminal_holder: terminals,
+                action_holder: actions,
+                reward_holder: rewards,
+            })
 
 if __name__ == "__main__":
     tf.app.run()
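
A quick sanity check for the annealing schedule and the flipped comparison in action_score_to_action: the same epsilon-greedy logic in plain NumPy. This is a minimal sketch, assuming the flag defaults above and a 4-action space; epsilon_at and epsilon_greedy are illustrative helpers, not functions from the repo.

import numpy as np

EPSILON_START, EPSILON_END, EPSILON_END_EPOCH = 1.0, 0.1, 100.0

def epsilon_at(epoch):
    # Linear anneal from EPSILON_START down to EPSILON_END, then hold.
    if epoch < EPSILON_END_EPOCH:
        return ((EPSILON_END - EPSILON_START) / EPSILON_END_EPOCH) * epoch + EPSILON_START
    return EPSILON_END

def epsilon_greedy(action_score, epoch, n_actions=4):
    # Random action with probability epsilon, greedy argmax otherwise.
    if np.random.uniform() < epsilon_at(epoch):
        return np.random.randint(n_actions)
    return int(np.argmax(action_score))

assert abs(epsilon_at(0) - 1.0) < 1e-9    # fully random at the start
assert abs(epsilon_at(100) - 0.1) < 1e-9  # held at EPSILON_END afterwards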
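
Worth noting, though this diff does not change it: q_future_reward bootstraps on Q(s', a) for the replayed action, which is closer to a SARSA-style target than to the DQN target of Mnih et al., where the target uses the max over next-state actions and the prediction is Q(s, a) for the action taken. A minimal NumPy sketch of that canonical target, assuming q_next is a hypothetical array of stale-theta scores for the sampled next states:

import numpy as np

def dqn_td_target(rewards, terminals, q_next, discount=0.99):
    # r + discount * max_a' Q_target(s', a'), zeroing the bootstrap on terminals.
    return rewards + (1.0 - terminals) * discount * q_next.max(axis=1)

# Example: batch of 2 transitions, 4 actions.
rewards = np.array([1.0, 0.0])
terminals = np.array([0.0, 1.0])
q_next = np.array([[0.2, 0.5, 0.1, 0.3],
                   [0.9, 0.4, 0.0, 0.2]])
print(dqn_td_target(rewards, terminals, q_next))  # -> [1.495, 0.0]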