Commit

Ornstein-Uhlenbeck noise
dranaju committed May 4, 2020
1 parent d165b33 commit 1f4c908
Showing 3 changed files with 101 additions and 76 deletions.
132 changes: 78 additions & 54 deletions src/ddpg_stage_1.py
@@ -35,23 +35,31 @@ def hard_update(target,source):

#---Ornstein-Uhlenbeck Noise for action---#

class ActionNoise:
# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
self.action_dim = action_dim
self.mu = mu
self.theta = theta
self.sigma = sigma
self.X = np.ones(self.action_dim)*self.mu
class OUNoise(object):
def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.2, decay_period=10000000):
self.mu = mu
self.theta = theta
self.sigma = max_sigma
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.action_dim = action_space
self.reset()

def reset(self):
self.X = np.ones(self.action_dim)*self.mu
self.state = np.ones(self.action_dim) * self.mu

def evolve_state(self):
x = self.state
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
self.state = x + dx
return self.state

def sample(self):
dx = self.theta*(self.mu - self.X)
dx = dx + self.sigma*np.random.randn(len(self.X))
self.X = self.X + dx
return self.X
def get_noise(self, t=0):
ou_state = self.evolve_state()
decaying = float(float(t)/ self.decay_period)
self.sigma = max(self.sigma - (self.max_sigma - self.min_sigma) * min(1.0, decaying), self.min_sigma)
return ou_state
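
For reference, a minimal self-contained sketch of how this decaying-sigma OU process behaves over a short rollout. It mirrors the defaults above; the 10-step loop and the printout are purely illustrative and are not part of the repository.

import numpy as np

theta, mu = 0.15, 0.0
max_sigma, min_sigma, decay_period = 0.3, 0.2, 10000000
sigma = max_sigma
state = np.ones(2) * mu  # action_dim = 2 (linear and angular velocity)

for t in range(10):
    # One Euler step of the OU process, as in evolve_state():
    # dx = theta*(mu - x) + sigma*N(0, I)
    state = state + theta * (mu - state) + sigma * np.random.randn(2)
    # Linear decay of sigma toward min_sigma, as in get_noise()
    sigma = max(sigma - (max_sigma - min_sigma) * min(1.0, t / decay_period), min_sigma)
    print(t, state, sigma)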

#---Critic--#

@@ -69,22 +77,22 @@ def __init__(self, state_dim, action_dim):
self.action_dim = action_dim

self.fc1 = nn.Linear(state_dim, 250)
# self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())
self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())

self.fa1 = nn.Linear(action_dim, 250)
# self.fa1.weight.data = fanin_init(self.fa1.weight.data.size())
self.fa1.weight.data = fanin_init(self.fa1.weight.data.size())

self.fca1 = nn.Linear(500, 500)
# self.fca1.weight.data = fanin_init(self.fca1.weight.data.size())
self.fca1.weight.data = fanin_init(self.fca1.weight.data.size())

self.fca2 = nn.Linear(500, 1)
# self.fca2.weight.data.uniform_(-EPS, EPS)
self.fca2.weight.data.uniform_(-EPS, EPS)

def forward(self, state, action):
xs = mish(self.fc1(state))
xa = mish(self.fa1(action))
xs = torch.relu(self.fc1(state))
xa = torch.relu(self.fa1(action))
x = torch.cat((xs,xa), dim=1)
x = mish(self.fca1(x))
x = torch.relu(self.fca1(x))
vs = self.fca2(x)
return vs
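
The forward passes above switch between torch.relu and the repository's mish() helper, which is referenced in later hunk headers (@@ ... def mish(x):) but whose body is outside this diff. For reference only, the standard Mish activation is x * tanh(softplus(x)); a minimal sketch, which may differ in detail from the repo's own definition:

import torch
import torch.nn.functional as F

def mish(x):
    # Mish(x) = x * tanh(softplus(x)): a smooth, non-monotonic alternative to ReLU
    return x * torch.tanh(F.softplus(x))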

@@ -99,19 +107,19 @@ def __init__(self, state_dim, action_dim, action_limit_v, action_limit_w):
self.action_limit_w = action_limit_w

self.fa1 = nn.Linear(state_dim, 500)
# self.fa1.weight.data = fanin_init(self.fa1.weight.data.size())
self.fa1.weight.data = fanin_init(self.fa1.weight.data.size())

self.fa2 = nn.Linear(500, 500)
# self.fa2.weight.data = fanin_init(self.fa2.weight.data.size())
self.fa2.weight.data = fanin_init(self.fa2.weight.data.size())

self.fa3 = nn.Linear(500, action_dim)
# self.fa3.weight.data.uniform_(-EPS,EPS)
self.fa3.weight.data.uniform_(-EPS,EPS)

def forward(self, state):
x = mish(self.fa1(state))
x = mish(self.fa2(x))
x = torch.relu(self.fa1(state))
x = torch.relu(self.fa2(x))
action = self.fa3(x)
if state.shape == torch.Size([14]):
if state.shape <= torch.Size([self.state_dim]):
action[0] = torch.sigmoid(action[0])*self.action_limit_v
action[1] = torch.tanh(action[1])*self.action_limit_w
else:
@@ -151,7 +159,7 @@ def add(self, s, a, r, new_s):

#---Where the train is made---#

BATCH_SIZE = 128
BATCH_SIZE = 256
LEARNING_RATE = 0.001
GAMMA = 0.99
TAU = 0.001
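
TAU is the Polyak factor for the target-network updates; hard_update appears in the first hunk header of this file but its body is outside the diff. Below is a sketch of the usual DDPG-style helpers these hyperparameters assume; the repository's own versions may differ.

def hard_update(target, source):
    # Copy the source network's parameters into the target network verbatim
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

Typically hard_update initializes the target networks once and soft_update is called with TAU after each optimizer step.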
@@ -167,7 +175,6 @@ def __init__(self, state_dim, action_dim, action_limit_v, action_limit_w, ram):
#print('w',self.action_limit_w)
self.ram = ram
#self.iter = 0
self.noise = ActionNoise(self.action_dim)

self.actor = Actor(self.state_dim, self.action_dim, self.action_limit_v, self.action_limit_w)
self.target_actor = Actor(self.state_dim, self.action_dim, self.action_limit_v, self.action_limit_w)
@@ -184,7 +191,7 @@ def __init__(self, state_dim, action_dim, action_limit_v, action_limit_w, ram):

def get_exploitation_action(self,state):
state = torch.from_numpy(state)
action = self.target_actor.forward(state).detach()
action = self.actor.forward(state).detach()
#print('actionploi', action)
return action.data.numpy()

@@ -273,17 +280,17 @@ def mish(x):
MAX_EPISODES = 10001
MAX_STEPS = 500
MAX_BUFFER = 50000
# rewards_all_episodes = []
rewards_all_episodes = []

STATE_DIMENSION = 14
ACTION_DIMENSION = 2
ACTION_V_MAX = 0.22 # m/s
ACTION_W_MAX = 2 # rad/s
world = 'world_obst'
ACTION_W_MAX = 2. # rad/s
world = 'stage_2'

if is_training:
var_v = ACTION_V_MAX*.1
var_w = ACTION_W_MAX*2*.1/2
var_v = ACTION_V_MAX*.5
var_w = ACTION_W_MAX*2*.5
else:
var_v = ACTION_V_MAX*0.10
var_w = ACTION_W_MAX*0.10
@@ -293,40 +300,56 @@ def mish(x):
print('Action Max: ' + str(ACTION_V_MAX) + ' m/s and ' + str(ACTION_W_MAX) + ' rad/s')
ram = MemoryBuffer(MAX_BUFFER)
trainer = Trainer(STATE_DIMENSION, ACTION_DIMENSION, ACTION_V_MAX, ACTION_W_MAX, ram)
trainer.load_models(1560)
noise = OUNoise(ACTION_DIMENSION)
# trainer.load_models(140)
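
Only MemoryBuffer(MAX_BUFFER), ram.add(s, a, r, new_s) and ram.len are visible in this diff; the buffer class itself is not shown. A minimal sketch with that interface follows; the deque storage and the sample() method are assumptions for illustration, not the repository's code.

import random
from collections import deque

import numpy as np

class MemoryBufferSketch(object):
    def __init__(self, size):
        self.buffer = deque(maxlen=size)
        self.len = 0

    def add(self, s, a, r, new_s):
        # Store one transition; the oldest entries are evicted once maxlen is reached
        self.buffer.append((np.float32(s), np.float32(a), r, np.float32(new_s)))
        self.len = len(self.buffer)

    def sample(self, count):
        # Uniform random minibatch, returned as stacked float32 arrays
        batch = random.sample(self.buffer, min(count, self.len))
        s, a, r, new_s = map(np.float32, zip(*batch))
        return s, a, r, new_s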


if __name__ == '__main__':
rospy.init_node('ddpg_stage_1')
pub_result = rospy.Publisher('result', Float32, queue_size=5)
result = Float32()
env = Env()
env = Env(action_dim=ACTION_DIMENSION)
before_training = 1

past_action = np.array([0.,0.])
past_action = np.zeros(ACTION_DIMENSION)

for ep in range(MAX_EPISODES):
done = False
state = env.reset()
if is_training and ep%2 == 0 and ram.len >= before_training*MAX_STEPS:
if is_training and not ep%10 == 0 and ram.len >= before_training*MAX_STEPS:
print('---------------------------------')
print('Episode: ' + str(ep) + ' training')
print('---------------------------------')
else:
if ram.len >= before_training*MAX_STEPS:
print('---------------------------------')
print('Episode: ' + str(ep) + ' evaluating')
print('---------------------------------')
else:
print('---------------------------------')
print('Episode: ' + str(ep) + ' adding to memory')
print('---------------------------------')

rewards_current_episode = 0.

for step in range(MAX_STEPS):
state = np.float32(state)

if is_training and ep% 2 == 0 and ram.len >= before_training*MAX_STEPS:
if is_training and not ep%10 == 0 and ram.len >= before_training*MAX_STEPS:
action = trainer.get_exploration_action(state)
action[0] = np.clip(np.random.normal(action[0], var_v), 0., ACTION_V_MAX)
action[1] = np.clip(np.random.normal(action[1], var_w), -ACTION_W_MAX, ACTION_W_MAX)
# action[0] = np.clip(
# np.random.normal(action[0], var_v), 0., ACTION_V_MAX)
# action[0] = np.clip(np.clip(
# action[0] + np.random.uniform(-var_v, var_v), action[0] - var_v, action[0] + var_v), 0., ACTION_V_MAX)
# action[1] = np.clip(
# np.random.normal(action[1], var_w), -ACTION_W_MAX, ACTION_W_MAX)
N = copy.deepcopy(noise.get_noise(t=step))
N[0] = round(N[0],4)*ACTION_V_MAX/2
N[1] = round(N[1],4)*ACTION_W_MAX
action[0] = np.clip(action[0] + N[0], 0., ACTION_V_MAX)
action[1] = np.clip(action[1] + N[1], -ACTION_W_MAX, ACTION_W_MAX)
else:
action = trainer.get_exploitation_action(state)
action = trainer.get_exploration_action(state)

if not is_training:
action = trainer.get_exploitation_action(state)
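
A worked numeric example of the scale-and-clip step above, using ACTION_V_MAX = 0.22 and ACTION_W_MAX = 2.0 from earlier in the file; the actor output and the OU sample here are made-up values for illustration.

import numpy as np

ACTION_V_MAX, ACTION_W_MAX = 0.22, 2.0
action = np.array([0.15, -1.2])    # hypothetical raw actor output (v, w)
N = np.array([0.3127, -0.4581])    # hypothetical OU noise sample

N[0] = round(N[0], 4) * ACTION_V_MAX / 2    # 0.3127 * 0.11  =  0.0344
N[1] = round(N[1], 4) * ACTION_W_MAX        # -0.4581 * 2.0  = -0.9162
action[0] = np.clip(action[0] + N[0], 0., ACTION_V_MAX)             # 0.15 + 0.0344 = 0.1844
action[1] = np.clip(action[1] + N[1], -ACTION_W_MAX, ACTION_W_MAX)  # -2.1162 clipped to -2.0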
@@ -336,8 +359,8 @@ def mish(x):

rewards_current_episode += reward
next_state = np.float32(next_state)
if ep%2 == 0 or not ram.len >= before_training*MAX_STEPS:
if reward == 100.:
if not ep%10 == 0 or not ram.len >= before_training*MAX_STEPS:
if reward == 500.:
print('***\n-------- Maximum Reward ----------\n****')
for _ in range(3):
ram.add(state, action, reward, next_state)
@@ -346,22 +369,23 @@ def mish(x):
state = copy.deepcopy(next_state)


if ram.len >= before_training*MAX_STEPS and is_training and ep% 2 == 0:
var_v = max([var_v*0.99995, 0.1*ACTION_V_MAX])
var_w = max([var_w*0.99995, 0.1*ACTION_W_MAX])
if ram.len >= before_training*MAX_STEPS and is_training and not ep%10 == 0:
# var_v = max([var_v*0.99999, 0.005*ACTION_V_MAX])
# var_w = max([var_w*0.99999, 0.01*ACTION_W_MAX])
trainer.optimizer()

if done or step == MAX_STEPS-1:
print('reward per ep: ' + str(rewards_current_episode))
print('reward average per ep: ' + str(round(rewards_current_episode/step, 2)) + ' and break step: ' + str(step))
print('explore_v: ' + str(var_v) + ' and explore_w: ' + str(var_w))
print('*\nbreak step: ' + str(step) + '\n*')
# print('explore_v: ' + str(var_v) + ' and explore_w: ' + str(var_w))
print('sigma: ' + str(noise.sigma))
# rewards_all_episodes.append(rewards_current_episode)
if (ep)% 2 == 0:
if not ep%10 == 0:
pass
else:
if ram.len >= before_training*MAX_STEPS:
result = round(rewards_current_episode/step, 2)
pub_result.publish(result)
# if ram.len >= before_training*MAX_STEPS:
result = rewards_current_episode
pub_result.publish(result)
break
if ep%20 == 0:
trainer.save_models(ep)
23 changes: 12 additions & 11 deletions src/environment_stage_1.py
@@ -35,7 +35,7 @@
target_not_movable = False

class Env():
def __init__(self):
def __init__(self, action_dim=2):
self.goal_x = 0
self.goal_y = 0
self.heading = 0
@@ -50,6 +50,7 @@ def __init__(self):
self.respawn_goal = Respawn()
self.past_distance = 0.
self.stopped = 0
self.action_dim = action_dim
#Keys CTRL + c will stop script
rospy.on_shutdown(self.shutdown)

@@ -111,7 +112,7 @@ def getState(self, scan, past_action):

current_distance = round(math.hypot(self.goal_x - self.position.x, self.goal_y - self.position.y),2)
# current_distance = self.getGoalDistace()
if current_distance < 0.1:
if current_distance < 0.15:
self.get_goalbox = True

return scan_range + [heading, current_distance], done
@@ -124,15 +125,15 @@ def setReward(self, state, done):

distance_rate = (self.past_distance - current_distance)
if distance_rate > 0:
# reward = 200.*distance_rate
reward = 0.
reward = 200.*distance_rate
# reward = 0.

# if distance_rate == 0:
# reward = 0.

if distance_rate <= 0:
# reward = -8.
reward = 0.
reward = -8.
# reward = 0.

#angle_reward = math.pi - abs(heading)
#print('d', 500*distance_rate)
@@ -154,14 +155,14 @@ def setReward(self, state, done):

if done:
rospy.loginfo("Collision!!")
# reward = -550.
reward = -10.
reward = -550.
# reward = -10.
self.pub_cmd_vel.publish(Twist())

if self.get_goalbox:
rospy.loginfo("Goal!!")
# reward = 500.
reward = 100.
reward = 500.
# reward = 100.
self.pub_cmd_vel.publish(Twist())
if world:
self.goal_x, self.goal_y = self.respawn_goal.getPosition(True, delete=True, running=True)
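
Condensing the uncommented branches visible in this hunk, the reward schedule amounts to the sketch below. This reflects only the branches shown here; the full setReward has additional logic outside this diff.

def reward_sketch(distance_rate, collided, reached_goal):
    # Shaping term: positive when the robot closes distance to the goal, else a small penalty
    reward = 200. * distance_rate if distance_rate > 0 else -8.
    if collided:
        reward = -550.   # large terminal penalty on collision
    if reached_goal:
        reward = 500.    # large terminal bonus on reaching the goal box
    return reward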
@@ -217,6 +218,6 @@ def reset(self):
self.goal_x, self.goal_y = self.respawn_goal.getPosition(True, delete=True)

self.goal_distance = self.getGoalDistace()
state, _ = self.getState(data, [0.,0.])
state, _ = self.getState(data, [0]*self.action_dim)

return np.asarray(state)
22 changes: 11 additions & 11 deletions src/sac_stage_1.py
@@ -69,9 +69,9 @@ def __init__(self, state_dim, hidden_dim, init_w=3e-3):
self.linear3.bias.data.uniform_(-init_w, init_w)

def forward(self, state):
x = torch.relu(self.linear1(state))
x = torch.relu(self.linear2(x))
x = torch.relu(self.linear2_3(x))
x = mish(self.linear1(state))
x = mish(self.linear2(x))
x = mish(self.linear2_3(x))
x = self.linear3(x)
return x

@@ -96,9 +96,9 @@ def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):

def forward(self, state, action):
x = torch.cat([state, action], 1)
x = torch.relu(self.linear1(x))
x = torch.relu(self.linear2(x))
x = torch.relu(self.linear2_3(x))
x = mish(self.linear1(x))
x = mish(self.linear2(x))
x = mish(self.linear2_3(x))
x = self.linear3(x)
return x

@@ -127,8 +127,8 @@ def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_mi
self.log_std_linear.bias.data.uniform_(-init_w, init_w)

def forward(self, state):
x = torch.relu(self.linear1(state))
x = torch.relu(self.linear2(x))
x = mish(self.linear1(state))
x = mish(self.linear2(x))

mean = self.mean_linear(x)
log_std = self.log_std_linear(x)
@@ -296,12 +296,12 @@ def load_models(episode):
#****************************
is_training = True

load_models(240)
load_models(120)
hard_update(target_value_net, value_net)
max_episodes = 10001
max_steps = 500
rewards = []
batch_size = 128
batch_size = 256



@@ -354,7 +354,7 @@ def action_unnormalized(action, high, low):
rewards_current_episode += reward
next_state = np.float32(next_state)
if ep%2 == 0 or not len(replay_buffer) > before_training*batch_size:
if reward == 100.:
if reward == 500.:
print('***\n-------- Maximum Reward ----------\n****')
for _ in range(3):
replay_buffer.push(state, action, reward, next_state, done)
