Commit

Ornstein-Uhlenbeck noise
dranaju committed May 4, 2020
1 parent d165b33 commit 1f4c908
Showing 3 changed files with 101 additions and 76 deletions.
132 changes: 78 additions & 54 deletions src/ddpg_stage_1.py
@@ -35,23 +35,31 @@ def hard_update(target,source):

#---Ornstein-Uhlenbeck Noise for action---#

class ActionNoise:
# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
self.action_dim = action_dim
self.mu = mu
self.theta = theta
self.sigma = sigma
self.X = np.ones(self.action_dim)*self.mu
class OUNoise(object):
def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.2, decay_period=10000000):
self.mu = mu
self.theta = theta
self.sigma = max_sigma
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.action_dim = action_space
self.reset()

def reset(self):
self.X = np.ones(self.action_dim)*self.mu
self.state = np.ones(self.action_dim) * self.mu

def evolve_state(self):
x = self.state
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
self.state = x + dx
return self.state

def sample(self):
dx = self.theta*(self.mu - self.X)
dx = dx + self.sigma*np.random.randn(len(self.X))
self.X = self.X + dx
return self.X
def get_noise(self, t=0):
ou_state = self.evolve_state()
decaying = float(float(t)/ self.decay_period)
self.sigma = max(self.sigma - (self.max_sigma - self.min_sigma) * min(1.0, decaying), self.min_sigma)
return ou_state
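
For reference, a minimal self-contained sketch of how this decaying-sigma OU process behaves over a short rollout. It mirrors the defaults above; the 10-step loop and the printout are purely illustrative and are not part of the repository.

import numpy as np

theta, mu = 0.15, 0.0
max_sigma, min_sigma, decay_period = 0.3, 0.2, 10000000
sigma = max_sigma
state = np.ones(2) * mu  # action_dim = 2 (linear and angular velocity)

for t in range(10):
    # One Euler step of the OU process, as in evolve_state():
    # dx = theta*(mu - x) + sigma*N(0, I)
    state = state + theta * (mu - state) + sigma * np.random.randn(2)
    # Linear decay of sigma toward min_sigma, as in get_noise()
    sigma = max(sigma - (max_sigma - min_sigma) * min(1.0, t / decay_period), min_sigma)
    print(t, state, sigma)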

#---Critic--#

@@ -69,22 +77,22 @@ def __init__(self, state_dim, action_dim):
self.action_dim = action_dim

self.fc1 = nn.Linear(state_dim, 250)
# self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())
self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())

self.fa1 = nn.Linear(action_dim, 250)
# self.fa1.weight.data = fanin_init(self.fa1.weight.data.size())
self.fa1.weight.data = fanin_init(self.fa1.weight.data.size())

self.fca1 = nn.Linear(500, 500)
# self.fca1.weight.data = fanin_init(self.fca1.weight.data.size())
self.fca1.weight.data = fanin_init(self.fca1.weight.data.size())

self.fca2 = nn.Linear(500, 1)
# self.fca2.weight.data.uniform_(-EPS, EPS)
self.fca2.weight.data.uniform_(-EPS, EPS)

def forward(self, state, action):
xs = mish(self.fc1(state))
xa = mish(self.fa1(action))
xs = torch.relu(self.fc1(state))
xa = torch.relu(self.fa1(action))
x = torch.cat((xs,xa), dim=1)
x = mish(self.fca1(x))
x = torch.relu(self.fca1(x))
vs = self.fca2(x)
return vs
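
The forward passes above switch between torch.relu and the repository's mish() helper, which is referenced in later hunk headers (@@ ... def mish(x):) but whose body is outside this diff. For reference only, the standard Mish activation is x * tanh(softplus(x)); a minimal sketch, which may differ in detail from the repo's own definition:

import torch
import torch.nn.functional as F

def mish(x):
    # Mish(x) = x * tanh(softplus(x)): a smooth, non-monotonic alternative to ReLU
    return x * torch.tanh(F.softplus(x))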

@@ -99,19 +107,19 @@ def __init__(self, state_dim, action_dim, action_limit_v, action_limit_w):
self.action_limit_w = action_limit_w

self.fa1 = nn.Linear(state_dim, 500)
# self.fa1.weight.data = fanin_init(self.fa1.weight.data.size())
self.fa1.weight.data = fanin_init(self.fa1.weight.data.size())

self.fa2 = nn.Linear(500, 500)
# self.fa2.weight.data = fanin_init(self.fa2.weight.data.size())
self.fa2.weight.data = fanin_init(self.fa2.weight.data.size())

self.fa3 = nn.Linear(500, action_dim)
# self.fa3.weight.data.uniform_(-EPS,EPS)
self.fa3.weight.data.uniform_(-EPS,EPS)

def forward(self, state):
x = mish(self.fa1(state))
x = mish(self.fa2(x))
x = torch.relu(self.fa1(state))
x = torch.relu(self.fa2(x))
action = self.fa3(x)
if state.shape == torch.Size([14]):
if state.shape <= torch.Size([self.state_dim]):
action[0] = torch.sigmoid(action[0])*self.action_limit_v
action[1] = torch.tanh(action[1])*self.action_limit_w
else:
@@ -151,7 +159,7 @@ def add(self, s, a, r, new_s):

#---Where the train is made---#

BATCH_SIZE = 128
BATCH_SIZE = 256
LEARNING_RATE = 0.001
GAMMA = 0.99
TAU = 0.001
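
TAU is the Polyak factor for the target-network updates; hard_update appears in the first hunk header of this file but its body is outside the diff. Below is a sketch of the usual DDPG-style helpers these hyperparameters assume; the repository's own versions may differ.

def hard_update(target, source):
    # Copy the source network's parameters into the target network verbatim
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

Typically hard_update initializes the target networks once and soft_update is called with TAU after each optimizer step.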
@@ -167,7 +175,6 @@ def __init__(self, state_dim, action_dim, action_limit_v, action_limit_w, ram):
#print('w',self.action_limit_w)
self.ram = ram
#self.iter = 0
self.noise = ActionNoise(self.action_dim)

self.actor = Actor(self.state_dim, self.action_dim, self.action_limit_v, self.action_limit_w)
self.target_actor = Actor(self.state_dim, self.action_dim, self.action_limit_v, self.action_limit_w)
@@ -184,7 +191,7 @@ def __init__(self, state_dim, action_dim, action_limit_v, action_limit_w, ram):

def get_exploitation_action(self,state):
state = torch.from_numpy(state)
action = self.target_actor.forward(state).detach()
action = self.actor.forward(state).detach()
#print('actionploi', action)
return action.data.numpy()

@@ -273,17 +280,17 @@ def mish(x):
MAX_EPISODES = 10001
MAX_STEPS = 500
MAX_BUFFER = 50000
# rewards_all_episodes = []
rewards_all_episodes = []

STATE_DIMENSION = 14
ACTION_DIMENSION = 2
ACTION_V_MAX = 0.22 # m/s
ACTION_W_MAX = 2 # rad/s
world = 'world_obst'
ACTION_W_MAX = 2. # rad/s
world = 'stage_2'

if is_training:
var_v = ACTION_V_MAX*.1
var_w = ACTION_W_MAX*2*.1/2
var_v = ACTION_V_MAX*.5
var_w = ACTION_W_MAX*2*.5
else:
var_v = ACTION_V_MAX*0.10
var_w = ACTION_W_MAX*0.10
@@ -293,40 +300,56 @@ def mish(x):
print('Action Max: ' + str(ACTION_V_MAX) + ' m/s and ' + str(ACTION_W_MAX) + ' rad/s')
ram = MemoryBuffer(MAX_BUFFER)
trainer = Trainer(STATE_DIMENSION, ACTION_DIMENSION, ACTION_V_MAX, ACTION_W_MAX, ram)
trainer.load_models(1560)
noise = OUNoise(ACTION_DIMENSION)
# trainer.load_models(140)
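
Only MemoryBuffer(MAX_BUFFER), ram.add(s, a, r, new_s) and ram.len are visible in this diff; the buffer class itself is not shown. A minimal sketch with that interface follows; the deque storage and the sample() method are assumptions for illustration, not the repository's code.

import random
from collections import deque

import numpy as np

class MemoryBufferSketch(object):
    def __init__(self, size):
        self.buffer = deque(maxlen=size)
        self.len = 0

    def add(self, s, a, r, new_s):
        # Store one transition; the oldest entries are evicted once maxlen is reached
        self.buffer.append((np.float32(s), np.float32(a), r, np.float32(new_s)))
        self.len = len(self.buffer)

    def sample(self, count):
        # Uniform random minibatch, returned as stacked float32 arrays
        batch = random.sample(self.buffer, min(count, self.len))
        s, a, r, new_s = map(np.float32, zip(*batch))
        return s, a, r, new_s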


if __name__ == '__main__':
rospy.init_node('ddpg_stage_1')
pub_result = rospy.Publisher('result', Float32, queue_size=5)
result = Float32()
env = Env()
env = Env(action_dim=ACTION_DIMENSION)
before_training = 1

past_action = np.array([0.,0.])
past_action = np.zeros(ACTION_DIMENSION)

for ep in range(MAX_EPISODES):
done = False
state = env.reset()
if is_training and ep%2 == 0 and ram.len >= before_training*MAX_STEPS:
if is_training and not ep%10 == 0 and ram.len >= before_training*MAX_STEPS:
print('---------------------------------')
print('Episode: ' + str(ep) + ' training')
print('---------------------------------')
else:
if ram.len >= before_training*MAX_STEPS:
print('---------------------------------')
print('Episode: ' + str(ep) + ' evaluating')
print('---------------------------------')
else:
print('---------------------------------')
print('Episode: ' + str(ep) + ' adding to memory')
print('---------------------------------')

rewards_current_episode = 0.

for step in range(MAX_STEPS):
state = np.float32(state)

if is_training and ep% 2 == 0 and ram.len >= before_training*MAX_STEPS:
if is_training and not ep%10 == 0 and ram.len >= before_training*MAX_STEPS:
action = trainer.get_exploration_action(state)
action[0] = np.clip(np.random.normal(action[0], var_v), 0., ACTION_V_MAX)
action[1] = np.clip(np.random.normal(action[1], var_w), -ACTION_W_MAX, ACTION_W_MAX)
# action[0] = np.clip(
# np.random.normal(action[0], var_v), 0., ACTION_V_MAX)
# action[0] = np.clip(np.clip(
# action[0] + np.random.uniform(-var_v, var_v), action[0] - var_v, action[0] + var_v), 0., ACTION_V_MAX)
# action[1] = np.clip(
# np.random.normal(action[1], var_w), -ACTION_W_MAX, ACTION_W_MAX)
N = copy.deepcopy(noise.get_noise(t=step))
N[0] = round(N[0],4)*ACTION_V_MAX/2
N[1] = round(N[1],4)*ACTION_W_MAX
action[0] = np.clip(action[0] + N[0], 0., ACTION_V_MAX)
action[1] = np.clip(action[1] + N[1], -ACTION_W_MAX, ACTION_W_MAX)
else:
action = trainer.get_exploitation_action(state)
action = trainer.get_exploration_action(state)

if not is_training:
action = trainer.get_exploitation_action(state)
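
A worked numeric example of the scale-and-clip step above, using ACTION_V_MAX = 0.22 and ACTION_W_MAX = 2.0 from earlier in the file; the actor output and the OU sample here are made-up values for illustration.

import numpy as np

ACTION_V_MAX, ACTION_W_MAX = 0.22, 2.0
action = np.array([0.15, -1.2])    # hypothetical raw actor output (v, w)
N = np.array([0.3127, -0.4581])    # hypothetical OU noise sample

N[0] = round(N[0], 4) * ACTION_V_MAX / 2    # 0.3127 * 0.11  =  0.0344
N[1] = round(N[1], 4) * ACTION_W_MAX        # -0.4581 * 2.0  = -0.9162
action[0] = np.clip(action[0] + N[0], 0., ACTION_V_MAX)             # 0.15 + 0.0344 = 0.1844
action[1] = np.clip(action[1] + N[1], -ACTION_W_MAX, ACTION_W_MAX)  # -2.1162 clipped to -2.0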
@@ -336,8 +359,8 @@ def mish(x):

rewards_current_episode += reward
next_state = np.float32(next_state)
if ep%2 == 0 or not ram.len >= before_training*MAX_STEPS:
if reward == 100.:
if not ep%10 == 0 or not ram.len >= before_training*MAX_STEPS:
if reward == 500.:
print('***\n-------- Maximum Reward ----------\n****')
for _ in range(3):
ram.add(state, action, reward, next_state)
@@ -346,22 +369,23 @@ def mish(x):
state = copy.deepcopy(next_state)


if ram.len >= before_training*MAX_STEPS and is_training and ep% 2 == 0:
var_v = max([var_v*0.99995, 0.1*ACTION_V_MAX])
var_w = max([var_w*0.99995, 0.1*ACTION_W_MAX])
if ram.len >= before_training*MAX_STEPS and is_training and not ep%10 == 0:
# var_v = max([var_v*0.99999, 0.005*ACTION_V_MAX])
# var_w = max([var_w*0.99999, 0.01*ACTION_W_MAX])
trainer.optimizer()

if done or step == MAX_STEPS-1:
print('reward per ep: ' + str(rewards_current_episode))
print('reward average per ep: ' + str(round(rewards_current_episode/step, 2)) + ' and break step: ' + str(step))
print('explore_v: ' + str(var_v) + ' and explore_w: ' + str(var_w))
print('*\nbreak step: ' + str(step) + '\n*')
# print('explore_v: ' + str(var_v) + ' and explore_w: ' + str(var_w))
print('sigma: ' + str(noise.sigma))
# rewards_all_episodes.append(rewards_current_episode)
if (ep)% 2 == 0:
if not ep%10 == 0:
pass
else:
if ram.len >= before_training*MAX_STEPS:
result = round(rewards_current_episode/step, 2)
pub_result.publish(result)
# if ram.len >= before_training*MAX_STEPS:
result = rewards_current_episode
pub_result.publish(result)
break
if ep%20 == 0:
trainer.save_models(ep)
23 changes: 12 additions & 11 deletions src/environment_stage_1.py
@@ -35,7 +35,7 @@
target_not_movable = False

class Env():
def __init__(self):
def __init__(self, action_dim=2):
self.goal_x = 0
self.goal_y = 0
self.heading = 0
@@ -50,6 +50,7 @@ def __init__(self):
self.respawn_goal = Respawn()
self.past_distance = 0.
self.stopped = 0
self.action_dim = action_dim
#Keys CTRL + c will stop script
rospy.on_shutdown(self.shutdown)

@@ -111,7 +112,7 @@ def getState(self, scan, past_action):

current_distance = round(math.hypot(self.goal_x - self.position.x, self.goal_y - self.position.y),2)
# current_distance = self.getGoalDistace()
if current_distance < 0.1:
if current_distance < 0.15:
self.get_goalbox = True

return scan_range + [heading, current_distance], done
@@ -124,15 +125,15 @@ def setReward(self, state, done):

distance_rate = (self.past_distance - current_distance)
if distance_rate > 0:
# reward = 200.*distance_rate
reward = 0.
reward = 200.*distance_rate
# reward = 0.

# if distance_rate == 0:
# reward = 0.

if distance_rate <= 0:
# reward = -8.
reward = 0.
reward = -8.
# reward = 0.

#angle_reward = math.pi - abs(heading)
#print('d', 500*distance_rate)
@@ -154,14 +155,14 @@ def setReward(self, state, done):

if done:
rospy.loginfo("Collision!!")
# reward = -550.
reward = -10.
reward = -550.
# reward = -10.
self.pub_cmd_vel.publish(Twist())

if self.get_goalbox:
rospy.loginfo("Goal!!")
# reward = 500.
reward = 100.
reward = 500.
# reward = 100.
self.pub_cmd_vel.publish(Twist())
if world:
self.goal_x, self.goal_y = self.respawn_goal.getPosition(True, delete=True, running=True)
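
Condensing the uncommented branches visible in this hunk, the reward schedule amounts to the sketch below. This reflects only the branches shown here; the full setReward has additional logic outside this diff.

def reward_sketch(distance_rate, collided, reached_goal):
    # Shaping term: positive when the robot closes distance to the goal, else a small penalty
    reward = 200. * distance_rate if distance_rate > 0 else -8.
    if collided:
        reward = -550.   # large terminal penalty on collision
    if reached_goal:
        reward = 500.    # large terminal bonus on reaching the goal box
    return reward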
@@ -217,6 +218,6 @@ def reset(self):
self.goal_x, self.goal_y = self.respawn_goal.getPosition(True, delete=True)

self.goal_distance = self.getGoalDistace()
state, _ = self.getState(data, [0.,0.])
state, _ = self.getState(data, [0]*self.action_dim)

return np.asarray(state)
22 changes: 11 additions & 11 deletions src/sac_stage_1.py
@@ -69,9 +69,9 @@ def __init__(self, state_dim, hidden_dim, init_w=3e-3):
self.linear3.bias.data.uniform_(-init_w, init_w)

def forward(self, state):
x = torch.relu(self.linear1(state))
x = torch.relu(self.linear2(x))
x = torch.relu(self.linear2_3(x))
x = mish(self.linear1(state))
x = mish(self.linear2(x))
x = mish(self.linear2_3(x))
x = self.linear3(x)
return x

@@ -96,9 +96,9 @@ def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):

def forward(self, state, action):
x = torch.cat([state, action], 1)
x = torch.relu(self.linear1(x))
x = torch.relu(self.linear2(x))
x = torch.relu(self.linear2_3(x))
x = mish(self.linear1(x))
x = mish(self.linear2(x))
x = mish(self.linear2_3(x))
x = self.linear3(x)
return x

@@ -127,8 +127,8 @@ def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_mi
self.log_std_linear.bias.data.uniform_(-init_w, init_w)

def forward(self, state):
x = torch.relu(self.linear1(state))
x = torch.relu(self.linear2(x))
x = mish(self.linear1(state))
x = mish(self.linear2(x))

mean = self.mean_linear(x)
log_std = self.log_std_linear(x)
@@ -296,12 +296,12 @@ def load_models(episode):
#****************************
is_training = True

load_models(240)
load_models(120)
hard_update(target_value_net, value_net)
max_episodes = 10001
max_steps = 500
rewards = []
batch_size = 128
batch_size = 256



@@ -354,7 +354,7 @@ def action_unnormalized(action, high, low):
rewards_current_episode += reward
next_state = np.float32(next_state)
if ep%2 == 0 or not len(replay_buffer) > before_training*batch_size:
if reward == 100.:
if reward == 500.:
print('***\n-------- Maximum Reward ----------\n****')
for _ in range(3):
replay_buffer.push(state, action, reward, next_state, done)
