TaherFattahi committed
Commit a54010a · 0 Parent(s)

init: td3 robot nav
.gitattributes ADDED
@@ -0,0 +1,5 @@
animation/animation.gif filter=lfs diff=lfs merge=lfs -text
models/TD3/checkpoint/TD3_critic.pth filter=lfs diff=lfs merge=lfs -text
models/TD3/checkpoint/TD3_critic_target.pth filter=lfs diff=lfs merge=lfs -text
models/TD3/checkpoint/TD3_actor.pth filter=lfs diff=lfs merge=lfs -text
models/TD3/checkpoint/TD3_actor_target.pth filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
---
title: TD3 Robot Nav Irsim
emoji: 🦀
colorFrom: gray
colorTo: gray
sdk: gradio
sdk_version: 5.23.1
app_file: app.py
pinned: false
license: mit
short_description: 'TD3: Reinforcement Learning for Autonomous Robot Navigation'
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
animation/animation.gif ADDED
Git LFS Details
  • SHA256: 90e50ce7fc700c36322161bb737834502915dae51df1e46057681e4bbf7c31a4
  • Pointer size: 132 Bytes
  • Size of remote file: 2.56 MB
app.py ADDED
@@ -0,0 +1,223 @@
from models.TD3.TD3 import TD3

import torch
import numpy as np
from sim import SIM_ENV
import yaml
import gradio as gr
import os
from pathlib import Path
import matplotlib
import random

matplotlib.use('Agg')

def generate_random_points(num_scenarios=2):
    """Generate random robot poses and goals"""
    robot_poses = []
    robot_goals = []

    for _ in range(num_scenarios):
        # Random pose: x, y, orientation, velocity
        pose = [
            [random.uniform(1, 9)],  # x position
            [random.uniform(1, 9)],  # y position
            [random.uniform(0, 3.14)],  # orientation
            [0]  # initial velocity
        ]

        # Random goal: x, y, orientation
        goal = [
            [random.uniform(1, 9)],  # x position
            [random.uniform(1, 9)],  # y position
            [0]  # orientation
        ]

        robot_poses.append(pose)
        robot_goals.append(goal)

    return robot_poses, robot_goals

def get_predefined_scenarios():
    """Return predefined robot poses and goals"""
    robot_poses = [
        [[3], [4], [0], [0]],
        [[8], [1], [1], [0]],
        [[2], [6], [1], [0]],
        [[7], [1], [0], [0]],
        [[7], [6.5], [2], [0]],
        [[9], [9], [3], [0]],
        [[2], [9], [1], [0]],
        [[3], [6], [3], [0]],
        [[1], [7], [0], [0]],
        [[5], [7], [3], [0]]
    ]

    robot_goals = [
        [[8], [8], [0]],
        [[2], [9], [0]],
        [[7], [1], [0]],
        [[7.2], [9], [0]],
        [[1], [1], [0]],
        [[5], [1], [0]],
        [[7], [4], [0]],
        [[9], [4], [0]],
        [[1], [9], [0]],
        [[5], [1], [0]]
    ]

    return robot_poses, robot_goals

def run_simulation():
    """Run the simulation and return the path to the generated GIF"""
    action_dim = 2  # number of actions produced by the model
    max_action = 1  # maximum absolute value of output actions
    state_dim = 25  # number of input values in the neural network (vector length of state input)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )  # using cuda if it is available, cpu otherwise
    epoch = 0  # epoch number
    max_steps = 300  # maximum number of steps in single episode

    model = TD3(
        state_dim=state_dim,
        action_dim=action_dim,
        max_action=max_action,
        device=device,
        load_model=True,
    )  # instantiate a model

    sim = SIM_ENV(world_file="eval_world.yaml", save_ani=True)  # instantiate environment

    # Generate random evaluation points instead of loading from YAML
    # num_scenarios = random.randint(2, 5)  # Random number of scenarios between 2 and 5

    # Get predefined scenarios and randomly select one
    all_poses, all_goals = get_predefined_scenarios()
    scenario_index = random.randint(0, len(all_poses) - 1)

    robot_poses = [all_poses[scenario_index]]
    robot_goals = [all_goals[scenario_index]]

    print(f"Selected scenario {scenario_index+1} of {len(all_poses)}")
    total_reward = 0.0
    total_steps = 0
    col = 0
    goals = 0
    for idx in range(len(robot_poses)):
        count = 0
        latest_scan, distance, cos, sin, collision, goal, a, reward = sim.reset(
            robot_state=robot_poses[idx],
            robot_goal=robot_goals[idx],
            random_obstacles=False,
        )
        done = False
        while not done and count < max_steps:
            state, terminal = model.prepare_state(
                latest_scan, distance, cos, sin, collision, goal, a
            )
            action = model.get_action(np.array(state), False)
            a_in = [(action[0] + 1) / 4, action[1]]
            latest_scan, distance, cos, sin, collision, goal, a, reward = sim.step(
                lin_velocity=a_in[0], ang_velocity=a_in[1]
            )
            total_reward += reward
            total_steps += 1
            count += 1
            if collision:
                col += 1
            if goal:
                goals += 1
            done = collision or goal
    avg_step_reward = total_reward / total_steps
    avg_reward = total_reward / len(robot_poses)
    avg_col = col / len(robot_poses)
    avg_goal = goals / len(robot_poses)
    print(f"Total Reward: {total_reward}")
    print(f"Average Reward: {avg_reward}")
    print(f"Average Step Reward: {avg_step_reward}")
    print(f"Average Collision rate: {avg_col}")
    print(f"Average Goal rate: {avg_goal}")
    print("..............................................")
    model.writer.add_scalar("test/total_reward", total_reward, epoch)
    model.writer.add_scalar("test/avg_reward", avg_reward, epoch)
    model.writer.add_scalar("test/avg_step_reward", avg_step_reward, epoch)
    model.writer.add_scalar("test/avg_col", avg_col, epoch)
    model.writer.add_scalar("test/avg_goal", avg_goal, epoch)

    sim.env.end(ending_time=3)

    # Find the latest generated GIF file in the animation folder
    animation_dir = Path("animation")
    if animation_dir.exists():
        gif_files = list(animation_dir.glob("*.gif"))
        if gif_files:
            # Sort by creation time (newest first)
            latest_gif = max(gif_files, key=lambda x: x.stat().st_ctime)
            return str(latest_gif), {
                "Total Reward": f"{total_reward:.2f}",
                "Average Reward": f"{avg_reward:.2f}",
                "Average Step Reward": f"{avg_step_reward:.2f}",
                "Collision Rate": f"{avg_col:.2f}",
                "Goal Rate": f"{avg_goal:.2f}"
            }

    return None, {"Error": "No GIF file was generated"}

def get_default_data():
    """Return default animation and statistics data"""
    # Find any existing GIF in the animation folder
    animation_dir = Path("animation")
    default_gif = None

    if animation_dir.exists():
        gif_files = list(animation_dir.glob("*.gif"))
        if gif_files:
            # Get the most recent GIF
            default_gif = str(max(gif_files, key=lambda x: x.stat().st_ctime))

    # Default statistics
    default_stats = {
        "Total Reward": "99.12",
        "Average Reward": "99.12",
        "Average Step Reward": "1.40",
        "Collision Rate": "0.00",
        "Goal Rate": "1.00"
    }

    return default_gif, default_stats

def main(args=None):
    """Main function with Gradio interface"""
    # Get default data for initial display
    default_gif, default_stats = get_default_data()

    with gr.Blocks(title="Robot Navigation Simulation") as demo:
        with gr.Row():
            with gr.Column():
                run_button = gr.Button("Run Simulation", variant="primary")

        with gr.Row():
            with gr.Column():
                output_image = gr.Image(
                    type="filepath",
                    label="Simulation Animation",
                    value=default_gif  # Set default value
                )

            with gr.Column():
                output_stats = gr.JSON(
                    label="Simulation Statistics",
                    value=default_stats  # Set default value
                )

        run_button.click(
            fn=run_simulation,
            outputs=[output_image, output_stats]
        )

    demo.launch(share=False)


if __name__ == "__main__":
    main()
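run_simulation() above can also be exercised without the Gradio UI. The following is only a sketch of such a headless check; it assumes the checkpoints under models/TD3/checkpoint, eval_world.yaml, and the other files from this commit are present in the working directory.

# Headless sanity check for app.py (sketch; assumes the committed files are in place)
from app import run_simulation

gif_path, stats = run_simulation()
print("Animation written to:", gif_path)
for name, value in stats.items():
    print(f"{name}: {value}")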
eval_world.yaml ADDED
@@ -0,0 +1,50 @@
world:
  height: 10  # the height of the world
  width: 10   # the width of the world
  step_time: 0.3  # simulation step time in seconds
  sample_time: 0.3  # render and data extraction interval in seconds
  collision_mode: 'react'

robot:
  - kinematics: {name: 'diff'}
    shape: {name: 'circle', radius: 0.2}
    vel_min: [ 0, -1.0 ]
    vel_max: [ 1.0, 1.0 ]
    state: [3, 4, 0, 0]
    goal: [9, 9, 0]
    arrive_mode: position
    goal_threshold: 0.3

    sensors:
      - type: 'lidar2d'
        range_min: 0
        range_max: 7
        angle_range: 3.14
        number: 180
        noise: True
        std: 0.08
        angle_std: 0.1
        offset: [ 0, 0, 0 ]
        alpha: 0.3

    plot:
      show_trajectory: True

obstacle:
  - shape: { name: 'circle', radius: 1.0 }  # radius
    state: [ 5, 5, 0 ]
  - shape: { name: 'circle', radius: 0.5 }  # radius
    state: [ 7, 8, 0 ]
  - shape: { name: 'circle', radius: 1.4 }  # radius
    state: [ 3, 1, 0 ]
  - shape: {name: 'rectangle', length: 1.0, width: 1.2}  # length, width
    state: [8, 5, 1]
  - shape: { name: 'rectangle', length: 0.5, width: 2.1 }  # length, width
    state: [ 1, 8, 1.3 ]
  - shape: { name: 'rectangle', length: 1.5, width: 0.7 }  # length, width
    state: [ 6, 2, 0.5 ]

  - shape: { name: 'linestring', vertices: [ [ 0, 0 ], [ 10, 0 ], [ 10, 10 ], [ 0, 10 ],[ 0, 0 ] ] }  # vertices
    kinematics: {name: 'static'}
    state: [ 0, 0, 0 ]
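To verify that this world file loads on its own, the same irsim calls that sim.py and app.py use can be run directly. This is only a sketch, assuming the ir-sim package from requirements.txt is installed.

# Sketch: load the evaluation world with the same irsim calls used in sim.py/app.py
import numpy as np
import irsim

env = irsim.make("eval_world.yaml")
env.step(action_id=0, action=np.array([[0.0], [0.0]]))  # one zero-velocity step
env.render()
env.end(ending_time=3)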
models/TD3/TD3.py ADDED
@@ -0,0 +1,277 @@
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from numpy import inf
from torch.utils.tensorboard import SummaryWriter


class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor, self).__init__()

        self.layer_1 = nn.Linear(state_dim, 400)
        torch.nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="leaky_relu")
        self.layer_2 = nn.Linear(400, 300)
        torch.nn.init.kaiming_uniform_(self.layer_2.weight, nonlinearity="leaky_relu")
        self.layer_3 = nn.Linear(300, action_dim)
        self.tanh = nn.Tanh()

    def forward(self, s):
        s = F.leaky_relu(self.layer_1(s))
        s = F.leaky_relu(self.layer_2(s))
        a = self.tanh(self.layer_3(s))
        return a


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

        # Q1 network
        self.layer_1 = nn.Linear(state_dim, 400)
        torch.nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="leaky_relu")
        self.layer_2_s = nn.Linear(400, 300)
        torch.nn.init.kaiming_uniform_(self.layer_2_s.weight, nonlinearity="leaky_relu")
        self.layer_2_a = nn.Linear(action_dim, 300)
        torch.nn.init.kaiming_uniform_(self.layer_2_a.weight, nonlinearity="leaky_relu")
        self.layer_3 = nn.Linear(300, 1)
        torch.nn.init.kaiming_uniform_(self.layer_3.weight, nonlinearity="leaky_relu")

        # Q2 network
        self.layer_4 = nn.Linear(state_dim, 400)
        torch.nn.init.kaiming_uniform_(self.layer_4.weight, nonlinearity="leaky_relu")
        self.layer_5_s = nn.Linear(400, 300)
        torch.nn.init.kaiming_uniform_(self.layer_5_s.weight, nonlinearity="leaky_relu")
        self.layer_5_a = nn.Linear(action_dim, 300)
        torch.nn.init.kaiming_uniform_(self.layer_5_a.weight, nonlinearity="leaky_relu")
        self.layer_6 = nn.Linear(300, 1)
        torch.nn.init.kaiming_uniform_(self.layer_6.weight, nonlinearity="leaky_relu")

    def forward(self, s, a):
        # Q1: combine the state and action branches through the layer weights
        s1 = F.leaky_relu(self.layer_1(s))
        s11 = torch.mm(s1, self.layer_2_s.weight.data.t())
        s12 = torch.mm(a, self.layer_2_a.weight.data.t())
        s1 = F.leaky_relu(s11 + s12 + self.layer_2_a.bias.data)
        q1 = self.layer_3(s1)

        # Q2: same structure with the second set of layers
        s2 = F.leaky_relu(self.layer_4(s))
        s21 = torch.mm(s2, self.layer_5_s.weight.data.t())
        s22 = torch.mm(a, self.layer_5_a.weight.data.t())
        s2 = F.leaky_relu(s21 + s22 + self.layer_5_a.bias.data)
        q2 = self.layer_6(s2)
        return q1, q2


# TD3 network
class TD3(object):
    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        device,
        lr=1e-4,
        save_every=0,
        load_model=False,
        save_directory=Path("models/TD3/new/checkpoint"),
        model_name="TD3",
        load_directory=Path("models/TD3/checkpoint"),
    ):
        # Initialize the Actor network
        self.device = device
        self.actor = Actor(state_dim, action_dim).to(self.device)
        self.actor_target = Actor(state_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(params=self.actor.parameters(), lr=lr)

        # Initialize the Critic networks
        self.critic = Critic(state_dim, action_dim).to(self.device)
        self.critic_target = Critic(state_dim, action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(params=self.critic.parameters(), lr=lr)

        self.action_dim = action_dim
        self.max_action = max_action
        self.state_dim = state_dim
        self.writer = SummaryWriter()
        self.iter_count = 0
        if load_model:
            self.load(filename=model_name, directory=load_directory)
        self.save_every = save_every
        self.model_name = model_name
        self.save_directory = save_directory

    def get_action(self, obs, add_noise):
        if add_noise:
            return (
                self.act(obs) + np.random.normal(0, 0.2, size=self.action_dim)
            ).clip(-self.max_action, self.max_action)
        else:
            return self.act(obs)

    def act(self, state):
        # Function to get the action from the actor
        state = torch.Tensor(state).to(self.device)
        return self.actor(state).cpu().data.numpy().flatten()

    # training cycle
    def train(
        self,
        replay_buffer,
        iterations,
        batch_size,
        discount=0.99,
        tau=0.005,
        policy_noise=0.2,
        noise_clip=0.5,
        policy_freq=2,
    ):
        av_Q = 0
        max_Q = -inf
        av_loss = 0
        for it in range(iterations):
            # sample a batch from the replay buffer
            (
                batch_states,
                batch_actions,
                batch_rewards,
                batch_dones,
                batch_next_states,
            ) = replay_buffer.sample_batch(batch_size)
            state = torch.Tensor(batch_states).to(self.device)
            next_state = torch.Tensor(batch_next_states).to(self.device)
            action = torch.Tensor(batch_actions).to(self.device)
            reward = torch.Tensor(batch_rewards).to(self.device)
            done = torch.Tensor(batch_dones).to(self.device)

            # Obtain the estimated action from the next state by using the actor-target
            next_action = self.actor_target(next_state)

            # Add noise to the action
            noise = (
                torch.Tensor(batch_actions)
                .data.normal_(0, policy_noise)
                .to(self.device)
            )
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

            # Calculate the Q values from the critic-target network for the next state-action pair
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # Select the minimal Q value from the 2 calculated values
            target_Q = torch.min(target_Q1, target_Q2)
            av_Q += torch.mean(target_Q)
            max_Q = max(max_Q, torch.max(target_Q))
            # Calculate the final Q value from the target network parameters by using Bellman equation
            target_Q = reward + ((1 - done) * discount * target_Q).detach()

            # Get the Q values of the basis networks with the current parameters
            current_Q1, current_Q2 = self.critic(state, action)

            # Calculate the loss between the current Q value and the target Q value
            loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Perform the gradient descent
            self.critic_optimizer.zero_grad()
            loss.backward()
            self.critic_optimizer.step()

            if it % policy_freq == 0:
                # Maximize the actor output value by performing gradient descent on negative Q values
                # (essentially perform gradient ascent)
                actor_loss, _ = self.critic(state, self.actor(state))
                actor_loss = -actor_loss.mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Use soft update to update the actor-target network parameters by
                # infusing small amount of current parameters
                for param, target_param in zip(
                    self.actor.parameters(), self.actor_target.parameters()
                ):
                    target_param.data.copy_(
                        tau * param.data + (1 - tau) * target_param.data
                    )
                # Use soft update to update the critic-target network parameters by infusing
                # small amount of current parameters
                for param, target_param in zip(
                    self.critic.parameters(), self.critic_target.parameters()
                ):
                    target_param.data.copy_(
                        tau * param.data + (1 - tau) * target_param.data
                    )

            av_loss += loss
        self.iter_count += 1
        # Write new values for tensorboard
        self.writer.add_scalar("train/loss", av_loss / iterations, self.iter_count)
        self.writer.add_scalar("train/avg_Q", av_Q / iterations, self.iter_count)
        self.writer.add_scalar("train/max_Q", max_Q, self.iter_count)
        if self.save_every > 0 and self.iter_count % self.save_every == 0:
            self.save(filename=self.model_name, directory=self.save_directory)

    def save(self, filename, directory):
        Path(directory).mkdir(parents=True, exist_ok=True)
        torch.save(self.actor.state_dict(), "%s/%s_actor.pth" % (directory, filename))
        torch.save(
            self.actor_target.state_dict(),
            "%s/%s_actor_target.pth" % (directory, filename),
        )
        torch.save(self.critic.state_dict(), "%s/%s_critic.pth" % (directory, filename))
        torch.save(
            self.critic_target.state_dict(),
            "%s/%s_critic_target.pth" % (directory, filename),
        )

    def load(self, filename, directory):
        self.actor.load_state_dict(
            torch.load("%s/%s_actor.pth" % (directory, filename), map_location=self.device)
        )
        self.actor_target.load_state_dict(
            torch.load("%s/%s_actor_target.pth" % (directory, filename), map_location=self.device)
        )
        self.critic.load_state_dict(
            torch.load("%s/%s_critic.pth" % (directory, filename), map_location=self.device)
        )
        self.critic_target.load_state_dict(
            torch.load("%s/%s_critic_target.pth" % (directory, filename), map_location=self.device)
        )
        print(f"Loaded weights from: {directory}")

    def prepare_state(self, latest_scan, distance, cos, sin, collision, goal, action):
        # convert the data returned by the simulator into the state representation used by the model
        latest_scan = np.array(latest_scan)

        inf_mask = np.isinf(latest_scan)
        latest_scan[inf_mask] = 7.0

        max_bins = self.state_dim - 5
        bin_size = int(np.ceil(len(latest_scan) / max_bins))

        # Initialize the list to store the minimum values of each bin
        min_values = []

        # Loop through the data and create bins
        for i in range(0, len(latest_scan), bin_size):
            # Get the current bin
            bin = latest_scan[i : i + min(bin_size, len(latest_scan) - i)]
            # Find the minimum value in the current bin and append it to the min_values list
            min_values.append(min(bin) / 7)

        # Normalize to [0, 1] range
        distance /= 10
        lin_vel = action[0] * 2
        ang_vel = (action[1] + 1) / 2
        state = min_values + [distance, cos, sin] + [lin_vel, ang_vel]

        assert len(state) == self.state_dim
        terminal = 1 if collision or goal else 0

        return state, terminal
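The TD3 class above can be exercised in isolation with random transitions, which is a quick way to check tensor shapes without the simulator. A minimal sketch, assuming the ReplayBuffer from replay_buffer.py in this commit and the dimensions used in app.py and train.py (state_dim=25, action_dim=2):

# Sketch: run a few TD3 training iterations on random data (no simulator needed)
import numpy as np
import torch
from models.TD3.TD3 import TD3
from replay_buffer import ReplayBuffer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TD3(state_dim=25, action_dim=2, max_action=1, device=device, load_model=False)
buffer = ReplayBuffer(buffer_size=1000)

for _ in range(256):
    s = np.random.rand(25)
    a = np.random.uniform(-1, 1, size=2)
    s2 = np.random.rand(25)
    buffer.add(s, a, float(np.random.randn()), 0, s2)  # (state, action, reward, done, next_state)

model.train(replay_buffer=buffer, iterations=5, batch_size=64)
print(model.get_action(np.random.rand(25), add_noise=False))  # two values in [-1, 1]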
models/TD3/checkpoint/TD3_actor.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6846697cc1c4f16f2de1b329267ce8b001bc42e7f8558fa76a304b693c632063
size 527836
models/TD3/checkpoint/TD3_actor_target.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae3fc4e9ec3e4dc6b5a9c512ce5b6bdac56ab5a41ff526750e7cb6b58751eaf0
size 527906
models/TD3/checkpoint/TD3_critic.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b647c928e7a9ded7fe5f5715d7e5563d71ebbbc75be9e33d29e29b0c335f802e
size 1060450
models/TD3/checkpoint/TD3_critic_target.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4952b2658115973e4e141c6ea9f86f0cc551580db9fdd8657336daa0cc323d28
size 1060590
replay_buffer.py ADDED
@@ -0,0 +1,142 @@
import random
from collections import deque
import itertools

import numpy as np


class ReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=123):
        """
        The right side of the deque contains the most recent experiences
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch]).reshape(-1, 1)
        t_batch = np.array([_[3] for _ in batch]).reshape(-1, 1)
        s2_batch = np.array([_[4] for _ in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def return_buffer(self):
        s = np.array([_[0] for _ in self.buffer])
        a = np.array([_[1] for _ in self.buffer])
        r = np.array([_[2] for _ in self.buffer]).reshape(-1, 1)
        t = np.array([_[3] for _ in self.buffer]).reshape(-1, 1)
        s2 = np.array([_[4] for _ in self.buffer])

        return s, a, r, t, s2

    def clear(self):
        self.buffer.clear()
        self.count = 0


class RolloutReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=123, history_len=10):
        """
        The right side of the deque contains the most recent experiences
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)
        self.buffer.append([])
        self.history_len = history_len

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if t:
            self.count += 1
            self.buffer[-1].append(experience)
            self.buffer.append([])
        else:
            self.buffer[-1].append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(
                list(itertools.islice(self.buffer, 0, len(self.buffer) - 1)), self.count
            )
        else:
            batch = random.sample(
                list(itertools.islice(self.buffer, 0, len(self.buffer) - 1)), batch_size
            )

        idx = [random.randint(0, len(b) - 1) for b in batch]

        s_batch = []
        s2_batch = []
        for i in range(len(batch)):
            if idx[i] == len(batch[i]):
                s = batch[i]
                s2 = batch[i]
            else:
                s = batch[i][: idx[i] + 1]
                s2 = batch[i][: idx[i] + 1]
            s = [v[0] for v in s]
            s = s[::-1]

            s2 = [v[4] for v in s2]
            s2 = s2[::-1]

            if len(s) < self.history_len:
                missing = self.history_len - len(s)
                s += [s[-1]] * missing
                s2 += [s2[-1]] * missing
            else:
                s = s[: self.history_len]
                s2 = s2[: self.history_len]
            s = s[::-1]
            s_batch.append(s)
            s2 = s2[::-1]
            s2_batch.append(s2)

        a_batch = np.array([batch[i][idx[i]][1] for i in range(len(batch))])
        r_batch = np.array([batch[i][idx[i]][2] for i in range(len(batch))]).reshape(
            -1, 1
        )
        t_batch = np.array([batch[i][idx[i]][3] for i in range(len(batch))]).reshape(
            -1, 1
        )

        return np.array(s_batch), a_batch, r_batch, t_batch, np.array(s2_batch)

    def return_buffer(self):
        s = np.array([_[0] for _ in self.buffer])
        a = np.array([_[1] for _ in self.buffer])
        r = np.array([_[2] for _ in self.buffer]).reshape(-1, 1)
        t = np.array([_[3] for _ in self.buffer]).reshape(-1, 1)
        s2 = np.array([_[4] for _ in self.buffer])

        return s, a, r, t, s2

    def clear(self):
        self.buffer.clear()
        self.count = 0
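RolloutReplayBuffer is the less obvious of the two classes: it stores whole episodes and samples fixed-length state histories, padding short histories by repeating the oldest entry. A small sketch of the shapes it returns; the dimensions here are only illustrative.

# Sketch: shape check for RolloutReplayBuffer.sample_batch with random data
import numpy as np
from replay_buffer import RolloutReplayBuffer

rb = RolloutReplayBuffer(buffer_size=100, history_len=4)
for episode in range(3):
    for step in range(6):
        done = 1 if step == 5 else 0  # mark the last step of each episode as terminal
        rb.add(np.random.rand(25), np.random.rand(2), 0.0, done, np.random.rand(25))

s, a, r, t, s2 = rb.sample_batch(2)
print(s.shape, a.shape, r.shape, t.shape, s2.shape)  # (2, 4, 25) (2, 2) (2, 1) (2, 1) (2, 4, 25)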
requirements.txt ADDED
@@ -0,0 +1,5 @@
torch
tensorboard
numpy
gradio
ir-sim
robot_world.yaml ADDED
@@ -0,0 +1,51 @@
world:
  height: 10  # the height of the world
  width: 10   # the width of the world
  step_time: 0.3  # simulation step time in seconds
  sample_time: 0.3  # render and data extraction interval in seconds
  collision_mode: 'react'

robot:
  - kinematics: {name: 'diff'}
    shape: {name: 'circle', radius: 0.2}
    vel_min: [ 0, -1.0 ]
    vel_max: [ 1.0, 1.0 ]
    state: [2, 2, 0, 0]
    goal: [9, 9, 0]
    arrive_mode: position
    goal_threshold: 0.3

    sensors:
      - type: 'lidar2d'
        range_min: 0
        range_max: 7
        angle_range: 3.14
        number: 180
        noise: True
        std: 0.08
        angle_std: 0.1
        offset: [ 0, 0, 0 ]
        alpha: 0.3

    plot:
      show_trajectory: True

obstacle:
  - number: 5
    kinematics: {name: 'omni'}
    distribution: {name: 'random', range_low: [0, 0, -3.14], range_high: [10, 10, 3.14]}
    behavior: {name: 'rvo', wander: True, range_low: [0, 0, -3.14], range_high: [10, 10, 3.14], vxmax: 0.2, vymax: 0.2, factor: 1.0}
    vel_max: [0.2, 0.2]
    vel_min: [-0.2, -0.2]
    shape:
      - {name: 'circle', radius: 1.0, random_shape: True}
      - {name: 'polygon', random_shape: true, avg_radius_range: [0.5, 1.0], irregularity_range: [0, 0.4], spikeyness_range: [0, 0.4], num_vertices_range: [4, 6]}
  - shape: { name: 'rectangle', length: 1.0, width: 1.2 }  # length, width
    state: [ 8, 5, 1 ]
    kinematics: {name: 'static'}
  - shape: { name: 'rectangle', length: 0.5, width: 2.1 }  # length, width
    state: [ 1, 8, 1.3 ]
    kinematics: {name: 'static'}
  - shape: { name: 'linestring', vertices: [ [ 0, 0 ], [ 10, 0 ], [ 10, 10 ], [ 0, 10 ],[ 0, 0 ] ] }  # vertices
    kinematics: {name: 'static'}
    state: [ 0, 0, 0 ]
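This file is the default training world consumed by SIM_ENV (see sim.py below). As a sketch of how it is used, a reset with random_obstacles=True shuffles the movable obstacles the same way train.py does; this assumes ir-sim is installed and actually runs the simulator.

# Sketch: load the training world through SIM_ENV and take the initial reset step
from sim import SIM_ENV

sim = SIM_ENV(world_file="robot_world.yaml")
latest_scan, distance, cos, sin, collision, goal, action, reward = sim.reset(random_obstacles=True)
print(len(latest_scan), round(float(distance), 2), reward)  # 180 lidar ranges, distance to goal, step reward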
sim.py ADDED
@@ -0,0 +1,95 @@
import irsim
import numpy as np
import random

import shapely
from irsim.lib.handler.geometry_handler import GeometryFactory
from irsim.world import ObjectBase

class SIM_ENV:
    def __init__(self, world_file="robot_world.yaml", save_ani=False):
        self.env = irsim.make(world_file, save_ani=save_ani)
        robot_info = self.env.get_robot_info(0)
        self.robot_goal = robot_info.goal

    def step(self, lin_velocity=0.0, ang_velocity=0.1):
        self.env.step(action_id=0, action=np.array([[lin_velocity], [ang_velocity]]))
        self.env.render()

        scan = self.env.get_lidar_scan()
        latest_scan = scan["ranges"]

        robot_state = self.env.get_robot_state()
        goal_vector = [
            self.robot_goal[0].item() - robot_state[0].item(),
            self.robot_goal[1].item() - robot_state[1].item(),
        ]
        distance = np.linalg.norm(goal_vector)
        goal = self.env.robot.arrive
        pose_vector = [np.cos(robot_state[2]).item(), np.sin(robot_state[2]).item()]
        cos, sin = self.cossin(pose_vector, goal_vector)
        collision = self.env.robot.collision
        action = [lin_velocity, ang_velocity]
        reward = self.get_reward(goal, collision, action, latest_scan)

        return latest_scan, distance, cos, sin, collision, goal, action, reward

    def reset(self, robot_state=None, robot_goal=None, random_obstacles=True):
        if robot_state is None:
            robot_state = [[random.uniform(1, 9)], [random.uniform(1, 9)], [0], [0]]

        self.env.robot.set_state(
            state=np.array(robot_state),
            init=True,
        )

        if random_obstacles:
            self.env.random_obstacle_position(
                range_low=[0, 0, -3.14],
                range_high=[10, 10, 3.14],
                ids=[i + 1 for i in range(7)],
                non_overlapping=True,
            )

        if robot_goal is None:
            # sample goals until one does not overlap with any obstacle
            overlapping = True
            while overlapping:
                robot_goal = [[random.uniform(1, 9)], [random.uniform(1, 9)], [0]]
                shape = {"name": "circle", "radius": 0.4}
                state = [robot_goal[0], robot_goal[1], robot_goal[2]]
                gf = GeometryFactory.create_geometry(**shape)
                geometry = gf.step(np.c_[state])
                overlapping = any(
                    [
                        shapely.intersects(geometry, obj._geometry)
                        for obj in self.env.obstacle_list
                    ]
                )
        self.env.robot.set_goal(np.array(robot_goal), init=True)
        self.env.reset()
        self.robot_goal = self.env.robot.goal

        action = [0.0, 0.0]
        latest_scan, distance, cos, sin, _, _, action, reward = self.step(
            lin_velocity=action[0], ang_velocity=action[1]
        )
        return latest_scan, distance, cos, sin, False, False, action, reward

    @staticmethod
    def cossin(vec1, vec2):
        vec1 = vec1 / np.linalg.norm(vec1)
        vec2 = vec2 / np.linalg.norm(vec2)
        cos = np.dot(vec1, vec2)
        sin = vec1[0] * vec2[1] - vec1[1] * vec2[0]

        return cos, sin

    @staticmethod
    def get_reward(goal, collision, action, laser_scan):
        if goal:
            return 100.0
        elif collision:
            return -100.0
        else:
            r3 = lambda x: 1.35 - x if x < 1.35 else 0.0
            return action[0] - abs(action[1]) / 2 - r3(min(laser_scan)) / 2
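get_reward above is a static method, so it can be spot-checked on hand-made inputs without constructing the environment (importing sim still pulls in irsim, so the dependencies from requirements.txt must be installed). A few illustrative calls:

# Sketch: reward spot-checks for SIM_ENV.get_reward(goal, collision, action, laser_scan)
from sim import SIM_ENV

clear_scan = [3.0] * 180                                           # nothing within 1.35 m
print(SIM_ENV.get_reward(False, False, [0.5, 0.1], clear_scan))    # 0.5 - 0.05 = 0.45
print(SIM_ENV.get_reward(False, False, [0.5, 0.1], [1.0] * 180))   # adds proximity penalty: 0.275
print(SIM_ENV.get_reward(True, False, [0.0, 0.0], clear_scan))     # goal reached -> 100.0
print(SIM_ENV.get_reward(False, True, [0.0, 0.0], clear_scan))     # collision -> -100.0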
train.py ADDED
@@ -0,0 +1,142 @@
from models.TD3.TD3 import TD3

import torch
import numpy as np
from sim import SIM_ENV
from utils import get_buffer


def main(args=None):
    """Main training function"""
    action_dim = 2  # number of actions produced by the model
    max_action = 1  # maximum absolute value of output actions
    state_dim = 25  # number of input values in the neural network (vector length of state input)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )  # using cuda if it is available, cpu otherwise
    nr_eval_episodes = 10  # how many episodes to use to run evaluation
    max_epochs = 60  # max number of epochs
    epoch = 0  # starting epoch number
    episodes_per_epoch = 70  # how many episodes to run in single epoch
    episode = 0  # starting episode number
    train_every_n = 2  # train and update network parameters every n episodes
    training_iterations = 80  # how many batches to use for single training cycle
    batch_size = 64  # batch size for each training iteration
    max_steps = 300  # maximum number of steps in single episode
    steps = 0  # starting step number
    load_saved_buffer = False  # whether to load experiences from assets/data.yml
    pretrain = False  # whether to use the loaded experiences to pre-train the model (load_saved_buffer must be True)
    pretraining_iterations = (
        10  # number of training iterations to run during pre-training
    )
    save_every = 10  # save the model every n training cycles

    model = TD3(
        state_dim=state_dim,
        action_dim=action_dim,
        max_action=max_action,
        device=device,
        save_every=save_every,
        load_model=False,
    )  # instantiate a model

    sim = SIM_ENV()  # instantiate environment
    replay_buffer = get_buffer(
        model,
        sim,
        load_saved_buffer,
        pretrain,
        pretraining_iterations,
        training_iterations,
        batch_size,
    )

    latest_scan, distance, cos, sin, collision, goal, a, reward = sim.step(
        lin_velocity=0.0, ang_velocity=0.0
    )  # get the initial step state

    while epoch < max_epochs:  # train until max_epochs is reached
        state, terminal = model.prepare_state(
            latest_scan, distance, cos, sin, collision, goal, a
        )  # get a state representation from the data returned by the environment

        action = model.get_action(np.array(state), True)  # get an action from the model
        a_in = [
            (action[0] + 1) / 4,
            action[1],
        ]  # scale linear velocity to the [0, 0.5] m/s range

        latest_scan, distance, cos, sin, collision, goal, a, reward = sim.step(
            lin_velocity=a_in[0], ang_velocity=a_in[1]
        )  # get data from the environment
        next_state, terminal = model.prepare_state(
            latest_scan, distance, cos, sin, collision, goal, a
        )  # get a next state representation
        replay_buffer.add(
            state, action, reward, terminal, next_state
        )  # add experience to the replay buffer

        if (
            terminal or steps == max_steps
        ):  # reset environment if a terminal state was reached, or max_steps were taken
            latest_scan, distance, cos, sin, collision, goal, a, reward = sim.reset()
            episode += 1
            if episode % train_every_n == 0:
                model.train(
                    replay_buffer=replay_buffer,
                    iterations=training_iterations,
                    batch_size=batch_size,
                )  # train the model and update its parameters

            steps = 0
        else:
            steps += 1

        if (
            episode + 1
        ) % episodes_per_epoch == 0:  # if epoch is concluded, run evaluation
            episode = 0
            epoch += 1
            evaluate(model, epoch, sim, eval_episodes=nr_eval_episodes)


def evaluate(model, epoch, sim, eval_episodes=10):
    print("..............................................")
    print(f"Epoch {epoch}. Evaluating scenarios")
    avg_reward = 0.0
    col = 0
    goals = 0
    for _ in range(eval_episodes):
        count = 0
        latest_scan, distance, cos, sin, collision, goal, a, reward = sim.reset()
        done = False
        while not done and count < 501:
            state, terminal = model.prepare_state(
                latest_scan, distance, cos, sin, collision, goal, a
            )
            action = model.get_action(np.array(state), False)
            a_in = [(action[0] + 1) / 4, action[1]]
            latest_scan, distance, cos, sin, collision, goal, a, reward = sim.step(
                lin_velocity=a_in[0], ang_velocity=a_in[1]
            )
            avg_reward += reward
            count += 1
            if collision:
                col += 1
            if goal:
                goals += 1
            done = collision or goal
    avg_reward /= eval_episodes
    avg_col = col / eval_episodes
    avg_goal = goals / eval_episodes
    print(f"Average Reward: {avg_reward}")
    print(f"Average Collision rate: {avg_col}")
    print(f"Average Goal rate: {avg_goal}")
    print("..............................................")
    model.writer.add_scalar("eval/avg_reward", avg_reward, epoch)
    model.writer.add_scalar("eval/avg_col", avg_col, epoch)
    model.writer.add_scalar("eval/avg_goal", avg_goal, epoch)


if __name__ == "__main__":
    main()
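The action rescaling used in both train.py and app.py maps the actor's tanh output from [-1, 1] to a linear velocity in [0, 0.5] m/s while passing the angular velocity through unchanged; a tiny sketch of the mapping:

# Sketch: the (action[0] + 1) / 4 rescaling applied before sim.step
for raw in (-1.0, -0.5, 0.0, 0.5, 1.0):
    print(f"actor output {raw:+.1f} -> linear velocity {(raw + 1) / 4:.3f} m/s")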
utils.py ADDED
@@ -0,0 +1,123 @@
from typing import List
from tqdm import tqdm
import yaml

from replay_buffer import ReplayBuffer, RolloutReplayBuffer

class Pretraining:
    def __init__(
        self,
        file_names: List[str],
        model: object,
        replay_buffer: object,
        reward_function,
    ):
        self.file_names = file_names
        self.model = model
        self.replay_buffer = replay_buffer
        self.reward_function = reward_function

    def load_buffer(self):
        for file_name in self.file_names:
            print("Loading file: ", file_name)
            with open(file_name, "r") as file:
                samples = yaml.full_load(file)
            for i in tqdm(range(1, len(samples) - 1)):
                sample = samples[i]
                latest_scan = sample["latest_scan"]
                distance = sample["distance"]
                cos = sample["cos"]
                sin = sample["sin"]
                collision = sample["collision"]
                goal = sample["goal"]
                action = sample["action"]

                state, terminal = self.model.prepare_state(
                    latest_scan, distance, cos, sin, collision, goal, action
                )

                if terminal:
                    continue

                next_sample = samples[i + 1]
                next_latest_scan = next_sample["latest_scan"]
                next_distance = next_sample["distance"]
                next_cos = next_sample["cos"]
                next_sin = next_sample["sin"]
                next_collision = next_sample["collision"]
                next_goal = next_sample["goal"]
                next_action = next_sample["action"]
                next_state, next_terminal = self.model.prepare_state(
                    next_latest_scan,
                    next_distance,
                    next_cos,
                    next_sin,
                    next_collision,
                    next_goal,
                    next_action,
                )
                reward = self.reward_function(
                    next_goal, next_collision, action, next_latest_scan
                )
                self.replay_buffer.add(
                    state, action, reward, next_terminal, next_state
                )

        return self.replay_buffer

    def train(
        self,
        pretraining_iterations,
        replay_buffer,
        iterations,
        batch_size,
    ):
        print("Running Pretraining")
        for _ in tqdm(range(pretraining_iterations)):
            self.model.train(
                replay_buffer=replay_buffer,
                iterations=iterations,
                batch_size=batch_size,
            )
        print("Model Pretrained")


def get_buffer(
    model,
    sim,
    load_saved_buffer,
    pretrain,
    pretraining_iterations,
    training_iterations,
    batch_size,
    buffer_size=50000,
    random_seed=666,
    file_names=["assets/data.yml"],
    history_len=10,
):
    replay_buffer = ReplayBuffer(buffer_size=buffer_size, random_seed=random_seed)

    if pretrain:
        assert (
            load_saved_buffer
        ), "To pre-train model, load_saved_buffer must be set to True"

    if load_saved_buffer:
        pretraining = Pretraining(
            file_names=file_names,
            model=model,
            replay_buffer=replay_buffer,
            reward_function=sim.get_reward,
        )  # instantiate pre-training
        replay_buffer = (
            pretraining.load_buffer()
        )  # fill buffer with experiences from the data.yml file
        if pretrain:
            pretraining.train(
                pretraining_iterations=pretraining_iterations,
                replay_buffer=replay_buffer,
                iterations=training_iterations,
                batch_size=batch_size,
            )  # run pre-training

    return replay_buffer
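train.py only reaches the pre-training branch above through its load_saved_buffer and pretrain flags. A sketch of calling get_buffer with that path enabled; it assumes a recorded assets/data.yml file, which is not part of this commit.

# Sketch: pre-training path of get_buffer (requires assets/data.yml, not included here)
import torch
from models.TD3.TD3 import TD3
from sim import SIM_ENV
from utils import get_buffer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TD3(state_dim=25, action_dim=2, max_action=1, device=device)
sim = SIM_ENV()
replay_buffer = get_buffer(
    model,
    sim,
    load_saved_buffer=True,
    pretrain=True,
    pretraining_iterations=10,
    training_iterations=80,
    batch_size=64,
)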