Commit a54010a (0 parents)

init: td3 robot nav

Files changed:
- .gitattributes +5 -0
- README.md +14 -0
- animation/animation.gif +3 -0
- app.py +223 -0
- eval_world.yaml +50 -0
- models/TD3/TD3.py +277 -0
- models/TD3/checkpoint/TD3_actor.pth +3 -0
- models/TD3/checkpoint/TD3_actor_target.pth +3 -0
- models/TD3/checkpoint/TD3_critic.pth +3 -0
- models/TD3/checkpoint/TD3_critic_target.pth +3 -0
- replay_buffer.py +142 -0
- requirements.txt +5 -0
- robot_world.yaml +51 -0
- sim.py +95 -0
- train.py +142 -0
- utils.py +123 -0
.gitattributes
ADDED
@@ -0,0 +1,5 @@
animation/animation.gif filter=lfs diff=lfs merge=lfs -text
models/TD3/checkpoint/TD3_critic.pth filter=lfs diff=lfs merge=lfs -text
models/TD3/checkpoint/TD3_critic_target.pth filter=lfs diff=lfs merge=lfs -text
models/TD3/checkpoint/TD3_actor.pth filter=lfs diff=lfs merge=lfs -text
models/TD3/checkpoint/TD3_actor_target.pth filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: TD3 Robot Nav Irsim
emoji: 🦀
colorFrom: gray
colorTo: gray
sdk: gradio
sdk_version: 5.23.1
app_file: app.py
pinned: false
license: mit
short_description: 'TD3: Reinforcement Learning for Autonomous Robot Navigation'
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
animation/animation.gif
ADDED
Git LFS Details
app.py
ADDED
@@ -0,0 +1,223 @@
from models.TD3.TD3 import TD3

import torch
import numpy as np
from sim import SIM_ENV
import yaml
import gradio as gr
import os
from pathlib import Path
import matplotlib
import random

matplotlib.use('Agg')


def generate_random_points(num_scenarios=2):
    """Generate random robot poses and goals"""
    robot_poses = []
    robot_goals = []

    for _ in range(num_scenarios):
        # Random pose: x, y, orientation, velocity
        pose = [
            [random.uniform(1, 9)],     # x position
            [random.uniform(1, 9)],     # y position
            [random.uniform(0, 3.14)],  # orientation
            [0],                        # initial velocity
        ]

        # Random goal: x, y, orientation
        goal = [
            [random.uniform(1, 9)],  # x position
            [random.uniform(1, 9)],  # y position
            [0],                     # orientation
        ]

        robot_poses.append(pose)
        robot_goals.append(goal)

    return robot_poses, robot_goals


def get_predefined_scenarios():
    """Return predefined robot poses and goals"""
    robot_poses = [
        [[3], [4], [0], [0]],
        [[8], [1], [1], [0]],
        [[2], [6], [1], [0]],
        [[7], [1], [0], [0]],
        [[7], [6.5], [2], [0]],
        [[9], [9], [3], [0]],
        [[2], [9], [1], [0]],
        [[3], [6], [3], [0]],
        [[1], [7], [0], [0]],
        [[5], [7], [3], [0]],
    ]

    robot_goals = [
        [[8], [8], [0]],
        [[2], [9], [0]],
        [[7], [1], [0]],
        [[7.2], [9], [0]],
        [[1], [1], [0]],
        [[5], [1], [0]],
        [[7], [4], [0]],
        [[9], [4], [0]],
        [[1], [9], [0]],
        [[5], [1], [0]],
    ]

    return robot_poses, robot_goals


def run_simulation():
    """Run the simulation and return the path to the generated GIF"""
    action_dim = 2  # number of actions produced by the model
    max_action = 1  # maximum absolute value of output actions
    state_dim = 25  # number of input values in the neural network (vector length of state input)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )  # use CUDA if it is available, CPU otherwise
    epoch = 0  # epoch number
    max_steps = 300  # maximum number of steps in a single episode

    model = TD3(
        state_dim=state_dim,
        action_dim=action_dim,
        max_action=max_action,
        device=device,
        load_model=True,
    )  # instantiate a model

    sim = SIM_ENV(world_file="eval_world.yaml", save_ani=True)  # instantiate environment

    # Generate random evaluation points instead of loading from YAML
    # num_scenarios = random.randint(2, 5)  # random number of scenarios between 2 and 5

    # Get predefined scenarios and randomly select one
    all_poses, all_goals = get_predefined_scenarios()
    scenario_index = random.randint(0, len(all_poses) - 1)

    robot_poses = [all_poses[scenario_index]]
    robot_goals = [all_goals[scenario_index]]

    print(f"Selected scenario {scenario_index+1} of {len(all_poses)}")
    total_reward = 0.0
    total_steps = 0
    col = 0
    goals = 0
    for idx in range(len(robot_poses)):
        count = 0
        latest_scan, distance, cos, sin, collision, goal, a, reward = sim.reset(
            robot_state=robot_poses[idx],
            robot_goal=robot_goals[idx],
            random_obstacles=False,
        )
        done = False
        while not done and count < max_steps:
            state, terminal = model.prepare_state(
                latest_scan, distance, cos, sin, collision, goal, a
            )
            action = model.get_action(np.array(state), False)
            a_in = [(action[0] + 1) / 4, action[1]]
            latest_scan, distance, cos, sin, collision, goal, a, reward = sim.step(
                lin_velocity=a_in[0], ang_velocity=a_in[1]
            )
            total_reward += reward
            total_steps += 1
            count += 1
            if collision:
                col += 1
            if goal:
                goals += 1
            done = collision or goal
    avg_step_reward = total_reward / total_steps
    avg_reward = total_reward / len(robot_poses)
    avg_col = col / len(robot_poses)
    avg_goal = goals / len(robot_poses)
    print(f"Total Reward: {total_reward}")
    print(f"Average Reward: {avg_reward}")
    print(f"Average Step Reward: {avg_step_reward}")
    print(f"Average Collision rate: {avg_col}")
    print(f"Average Goal rate: {avg_goal}")
    print("..............................................")
    model.writer.add_scalar("test/total_reward", total_reward, epoch)
    model.writer.add_scalar("test/avg_reward", avg_reward, epoch)
    model.writer.add_scalar("test/avg_step_reward", avg_step_reward, epoch)
    model.writer.add_scalar("test/avg_col", avg_col, epoch)
    model.writer.add_scalar("test/avg_goal", avg_goal, epoch)

    sim.env.end(ending_time=3)

    # Find the latest generated GIF file in the animation folder
    animation_dir = Path("animation")
    if animation_dir.exists():
        gif_files = list(animation_dir.glob("*.gif"))
        if gif_files:
            # Sort by creation time (newest first)
            latest_gif = max(gif_files, key=lambda x: x.stat().st_ctime)
            return str(latest_gif), {
                "Total Reward": f"{total_reward:.2f}",
                "Average Reward": f"{avg_reward:.2f}",
                "Average Step Reward": f"{avg_step_reward:.2f}",
                "Collision Rate": f"{avg_col:.2f}",
                "Goal Rate": f"{avg_goal:.2f}",
            }

    return None, {"Error": "No GIF file was generated"}


def get_default_data():
    """Return default animation and statistics data"""
    # Find any existing GIF in the animation folder
    animation_dir = Path("animation")
    default_gif = None

    if animation_dir.exists():
        gif_files = list(animation_dir.glob("*.gif"))
        if gif_files:
            # Get the most recent GIF
            default_gif = str(max(gif_files, key=lambda x: x.stat().st_ctime))

    # Default statistics
    default_stats = {
        "Total Reward": "99.12",
        "Average Reward": "99.12",
        "Average Step Reward": "1.40",
        "Collision Rate": "0.00",
        "Goal Rate": "1.00",
    }

    return default_gif, default_stats


def main(args=None):
    """Main function with Gradio interface"""
    # Get default data for the initial display
    default_gif, default_stats = get_default_data()

    with gr.Blocks(title="Robot Navigation Simulation") as demo:
        with gr.Row():
            with gr.Column():
                run_button = gr.Button("Run Simulation", variant="primary")

        with gr.Row():
            with gr.Column():
                output_image = gr.Image(
                    type="filepath",
                    label="Simulation Animation",
                    value=default_gif,  # set default value
                )

            with gr.Column():
                output_stats = gr.JSON(
                    label="Simulation Statistics",
                    value=default_stats,  # set default value
                )

        run_button.click(
            fn=run_simulation,
            outputs=[output_image, output_stats],
        )

    demo.launch(share=False)


if __name__ == "__main__":
    main()
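A note on the action mapping in the evaluation loop above: the actor's tanh output lies in [-1, 1] per dimension, and `a_in = [(action[0] + 1) / 4, action[1]]` rescales the first component into a [0, 0.5] m/s linear velocity while passing the angular velocity through unchanged (train.py applies the same mapping). A minimal sketch of that mapping; the helper name is illustrative, not part of the repository:

# Minimal sketch of the action rescaling used in app.py and train.py.
def to_env_action(model_action):
    # model_action components are tanh outputs in [-1, 1]
    lin = (model_action[0] + 1) / 4  # linear velocity in [0, 0.5] m/s
    ang = model_action[1]            # angular velocity passed through
    return [lin, ang]

assert to_env_action([-1.0, 0.0])[0] == 0.0  # minimum linear velocity
assert to_env_action([1.0, 0.0])[0] == 0.5   # maximum linear velocity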
eval_world.yaml
ADDED
@@ -0,0 +1,50 @@
world:
  height: 10  # the height of the world
  width: 10  # the width of the world
  step_time: 0.3  # time (s) per simulation step
  sample_time: 0.3  # time (s) between render and data extraction steps
  collision_mode: 'react'

robot:
  - kinematics: {name: 'diff'}
    shape: {name: 'circle', radius: 0.2}
    vel_min: [ 0, -1.0 ]
    vel_max: [ 1.0, 1.0 ]
    state: [3, 4, 0, 0]
    goal: [9, 9, 0]
    arrive_mode: position
    goal_threshold: 0.3

    sensors:
      - type: 'lidar2d'
        range_min: 0
        range_max: 7
        angle_range: 3.14
        number: 180
        noise: True
        std: 0.08
        angle_std: 0.1
        offset: [ 0, 0, 0 ]
        alpha: 0.3

    plot:
      show_trajectory: True

obstacle:
  - shape: { name: 'circle', radius: 1.0 }  # radius
    state: [ 5, 5, 0 ]
  - shape: { name: 'circle', radius: 0.5 }  # radius
    state: [ 7, 8, 0 ]
  - shape: { name: 'circle', radius: 1.4 }  # radius
    state: [ 3, 1, 0 ]
  - shape: {name: 'rectangle', length: 1.0, width: 1.2}  # length, width
    state: [8, 5, 1]
  - shape: { name: 'rectangle', length: 0.5, width: 2.1 }  # length, width
    state: [ 1, 8, 1.3 ]
  - shape: { name: 'rectangle', length: 1.5, width: 0.7 }  # length, width
    state: [ 6, 2, 0.5 ]

  - shape: { name: 'linestring', vertices: [ [ 0, 0 ], [ 10, 0 ], [ 10, 10 ], [ 0, 10 ], [ 0, 0 ] ] }  # vertices
    kinematics: {name: 'static'}
    state: [ 0, 0, 0 ]
models/TD3/TD3.py
ADDED
@@ -0,0 +1,277 @@
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from numpy import inf
from torch.utils.tensorboard import SummaryWriter


class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor, self).__init__()

        self.layer_1 = nn.Linear(state_dim, 400)
        torch.nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="leaky_relu")
        self.layer_2 = nn.Linear(400, 300)
        torch.nn.init.kaiming_uniform_(self.layer_2.weight, nonlinearity="leaky_relu")
        self.layer_3 = nn.Linear(300, action_dim)
        self.tanh = nn.Tanh()

    def forward(self, s):
        s = F.leaky_relu(self.layer_1(s))
        s = F.leaky_relu(self.layer_2(s))
        a = self.tanh(self.layer_3(s))
        return a


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

        self.layer_1 = nn.Linear(state_dim, 400)
        torch.nn.init.kaiming_uniform_(self.layer_1.weight, nonlinearity="leaky_relu")
        self.layer_2_s = nn.Linear(400, 300)
        torch.nn.init.kaiming_uniform_(self.layer_2_s.weight, nonlinearity="leaky_relu")
        self.layer_2_a = nn.Linear(action_dim, 300)
        torch.nn.init.kaiming_uniform_(self.layer_2_a.weight, nonlinearity="leaky_relu")
        self.layer_3 = nn.Linear(300, 1)
        torch.nn.init.kaiming_uniform_(self.layer_3.weight, nonlinearity="leaky_relu")

        self.layer_4 = nn.Linear(state_dim, 400)
        torch.nn.init.kaiming_uniform_(self.layer_4.weight, nonlinearity="leaky_relu")
        self.layer_5_s = nn.Linear(400, 300)
        torch.nn.init.kaiming_uniform_(self.layer_5_s.weight, nonlinearity="leaky_relu")
        self.layer_5_a = nn.Linear(action_dim, 300)
        torch.nn.init.kaiming_uniform_(self.layer_5_a.weight, nonlinearity="leaky_relu")
        self.layer_6 = nn.Linear(300, 1)
        torch.nn.init.kaiming_uniform_(self.layer_6.weight, nonlinearity="leaky_relu")

    def forward(self, s, a):
        # Q1: combine the state and action branches manually, sharing the
        # action branch bias
        s1 = F.leaky_relu(self.layer_1(s))
        s11 = torch.mm(s1, self.layer_2_s.weight.data.t())
        s12 = torch.mm(a, self.layer_2_a.weight.data.t())
        s1 = F.leaky_relu(s11 + s12 + self.layer_2_a.bias.data)
        q1 = self.layer_3(s1)

        # Q2: same structure with independent weights
        s2 = F.leaky_relu(self.layer_4(s))
        s21 = torch.mm(s2, self.layer_5_s.weight.data.t())
        s22 = torch.mm(a, self.layer_5_a.weight.data.t())
        s2 = F.leaky_relu(s21 + s22 + self.layer_5_a.bias.data)
        q2 = self.layer_6(s2)
        return q1, q2


# TD3 network
class TD3(object):
    def __init__(
        self,
        state_dim,
        action_dim,
        max_action,
        device,
        lr=1e-4,
        save_every=0,
        load_model=False,
        save_directory=Path("models/TD3/new/checkpoint"),
        model_name="TD3",
        load_directory=Path("models/TD3/checkpoint"),
    ):
        # Initialize the Actor network
        self.device = device
        self.actor = Actor(state_dim, action_dim).to(self.device)
        self.actor_target = Actor(state_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(params=self.actor.parameters(), lr=lr)

        # Initialize the Critic networks
        self.critic = Critic(state_dim, action_dim).to(self.device)
        self.critic_target = Critic(state_dim, action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(params=self.critic.parameters(), lr=lr)

        self.action_dim = action_dim
        self.max_action = max_action
        self.state_dim = state_dim
        self.writer = SummaryWriter()
        self.iter_count = 0
        if load_model:
            self.load(filename=model_name, directory=load_directory)
        self.save_every = save_every
        self.model_name = model_name
        self.save_directory = save_directory

    def get_action(self, obs, add_noise):
        if add_noise:
            return (
                self.act(obs) + np.random.normal(0, 0.2, size=self.action_dim)
            ).clip(-self.max_action, self.max_action)
        else:
            return self.act(obs)

    def act(self, state):
        # Function to get the action from the actor
        state = torch.Tensor(state).to(self.device)
        return self.actor(state).cpu().data.numpy().flatten()

    # training cycle
    def train(
        self,
        replay_buffer,
        iterations,
        batch_size,
        discount=0.99,
        tau=0.005,
        policy_noise=0.2,
        noise_clip=0.5,
        policy_freq=2,
    ):
        av_Q = 0
        max_Q = -inf
        av_loss = 0
        for it in range(iterations):
            # Sample a batch from the replay buffer
            (
                batch_states,
                batch_actions,
                batch_rewards,
                batch_dones,
                batch_next_states,
            ) = replay_buffer.sample_batch(batch_size)
            state = torch.Tensor(batch_states).to(self.device)
            next_state = torch.Tensor(batch_next_states).to(self.device)
            action = torch.Tensor(batch_actions).to(self.device)
            reward = torch.Tensor(batch_rewards).to(self.device)
            done = torch.Tensor(batch_dones).to(self.device)

            # Obtain the estimated action for the next state from the actor-target
            next_action = self.actor_target(next_state)

            # Add noise to the action
            noise = (
                torch.Tensor(batch_actions)
                .data.normal_(0, policy_noise)
                .to(self.device)
            )
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

            # Calculate the Q values from the critic-target network for the next state-action pair
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # Select the minimal Q value from the 2 calculated values
            target_Q = torch.min(target_Q1, target_Q2)
            av_Q += torch.mean(target_Q)
            max_Q = max(max_Q, torch.max(target_Q))
            # Calculate the final Q value from the target network parameters by using the Bellman equation
            target_Q = reward + ((1 - done) * discount * target_Q).detach()

            # Get the Q values of the base networks with the current parameters
            current_Q1, current_Q2 = self.critic(state, action)

            # Calculate the loss between the current Q value and the target Q value
            loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Perform the gradient descent
            self.critic_optimizer.zero_grad()
            loss.backward()
            self.critic_optimizer.step()

            if it % policy_freq == 0:
                # Maximize the actor output value by performing gradient descent on negative Q values
                # (essentially performing gradient ascent)
                actor_grad, _ = self.critic(state, self.actor(state))
                actor_grad = -actor_grad.mean()
                self.actor_optimizer.zero_grad()
                actor_grad.backward()
                self.actor_optimizer.step()

                # Use a soft update to update the actor-target network parameters by
                # infusing a small amount of the current parameters
                for param, target_param in zip(
                    self.actor.parameters(), self.actor_target.parameters()
                ):
                    target_param.data.copy_(
                        tau * param.data + (1 - tau) * target_param.data
                    )
                # Use a soft update to update the critic-target network parameters by
                # infusing a small amount of the current parameters
                for param, target_param in zip(
                    self.critic.parameters(), self.critic_target.parameters()
                ):
                    target_param.data.copy_(
                        tau * param.data + (1 - tau) * target_param.data
                    )

            av_loss += loss
        self.iter_count += 1
        # Write new values for tensorboard
        self.writer.add_scalar("train/loss", av_loss / iterations, self.iter_count)
        self.writer.add_scalar("train/avg_Q", av_Q / iterations, self.iter_count)
        self.writer.add_scalar("train/max_Q", max_Q, self.iter_count)
        if self.save_every > 0 and self.iter_count % self.save_every == 0:
            self.save(filename=self.model_name, directory=self.save_directory)

    def save(self, filename, directory):
        Path(directory).mkdir(parents=True, exist_ok=True)
        torch.save(self.actor.state_dict(), "%s/%s_actor.pth" % (directory, filename))
        torch.save(
            self.actor_target.state_dict(),
            "%s/%s_actor_target.pth" % (directory, filename),
        )
        torch.save(self.critic.state_dict(), "%s/%s_critic.pth" % (directory, filename))
        torch.save(
            self.critic_target.state_dict(),
            "%s/%s_critic_target.pth" % (directory, filename),
        )

    def load(self, filename, directory):
        self.actor.load_state_dict(
            torch.load("%s/%s_actor.pth" % (directory, filename), map_location=self.device)
        )
        self.actor_target.load_state_dict(
            torch.load("%s/%s_actor_target.pth" % (directory, filename), map_location=self.device)
        )
        self.critic.load_state_dict(
            torch.load("%s/%s_critic.pth" % (directory, filename), map_location=self.device)
        )
        self.critic_target.load_state_dict(
            torch.load("%s/%s_critic_target.pth" % (directory, filename), map_location=self.device)
        )
        print(f"Loaded weights from: {directory}")

    def prepare_state(self, latest_scan, distance, cos, sin, collision, goal, action):
        # Convert the data returned from the environment into the state representation used for learning
        latest_scan = np.array(latest_scan)

        inf_mask = np.isinf(latest_scan)
        latest_scan[inf_mask] = 7.0

        max_bins = self.state_dim - 5
        bin_size = int(np.ceil(len(latest_scan) / max_bins))

        # Initialize the list to store the minimum values of each bin
        min_values = []

        # Loop through the data and create bins
        for i in range(0, len(latest_scan), bin_size):
            # Get the current bin
            bin = latest_scan[i : i + min(bin_size, len(latest_scan) - i)]
            # Find the minimum value in the current bin and append it to the min_values list
            min_values.append(min(bin) / 7)

        # Normalize to the [0, 1] range
        distance /= 10
        lin_vel = action[0] * 2
        ang_vel = (action[1] + 1) / 2
        state = min_values + [distance, cos, sin] + [lin_vel, ang_vel]

        assert len(state) == self.state_dim
        terminal = 1 if collision or goal else 0

        return state, terminal
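The 25-dimensional state built by prepare_state decomposes as 20 binned laser readings plus 5 scalars (distance, cos, sin, and the two rescaled action components): max_bins = state_dim - 5 = 20, and with the 180 lidar beams configured in the world YAML each bin covers ceil(180 / 20) = 9 readings. A small standalone check of that arithmetic:

import numpy as np

state_dim = 25
n_beams = 180                                 # lidar "number" in the world YAML
max_bins = state_dim - 5                      # 20 laser bins
bin_size = int(np.ceil(n_beams / max_bins))   # 9 beams per bin

scan = np.full(n_beams, 7.0)                  # dummy scan at maximum range
min_values = [min(scan[i : i + bin_size]) / 7 for i in range(0, n_beams, bin_size)]
assert len(min_values) + 5 == state_dim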
models/TD3/checkpoint/TD3_actor.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6846697cc1c4f16f2de1b329267ce8b001bc42e7f8558fa76a304b693c632063
size 527836
models/TD3/checkpoint/TD3_actor_target.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae3fc4e9ec3e4dc6b5a9c512ce5b6bdac56ab5a41ff526750e7cb6b58751eaf0
size 527906
models/TD3/checkpoint/TD3_critic.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b647c928e7a9ded7fe5f5715d7e5563d71ebbbc75be9e33d29e29b0c335f802e
size 1060450
models/TD3/checkpoint/TD3_critic_target.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4952b2658115973e4e141c6ea9f86f0cc551580db9fdd8657336daa0cc323d28
size 1060590
replay_buffer.py
ADDED
@@ -0,0 +1,142 @@
import random
from collections import deque
import itertools

import numpy as np


class ReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=123):
        """
        The right side of the deque contains the most recent experiences
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque()
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        s_batch = np.array([_[0] for _ in batch])
        a_batch = np.array([_[1] for _ in batch])
        r_batch = np.array([_[2] for _ in batch]).reshape(-1, 1)
        t_batch = np.array([_[3] for _ in batch]).reshape(-1, 1)
        s2_batch = np.array([_[4] for _ in batch])

        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def return_buffer(self):
        s = np.array([_[0] for _ in self.buffer])
        a = np.array([_[1] for _ in self.buffer])
        r = np.array([_[2] for _ in self.buffer]).reshape(-1, 1)
        t = np.array([_[3] for _ in self.buffer]).reshape(-1, 1)
        s2 = np.array([_[4] for _ in self.buffer])

        return s, a, r, t, s2

    def clear(self):
        self.buffer.clear()
        self.count = 0


class RolloutReplayBuffer(object):
    def __init__(self, buffer_size, random_seed=123, history_len=10):
        """
        The right side of the deque contains the most recent experiences
        """
        self.buffer_size = buffer_size
        self.count = 0
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)
        self.buffer.append([])
        self.history_len = history_len

    def add(self, s, a, r, t, s2):
        experience = (s, a, r, t, s2)
        if t:
            self.count += 1
            self.buffer[-1].append(experience)
            self.buffer.append([])
        else:
            self.buffer[-1].append(experience)

    def size(self):
        return self.count

    def sample_batch(self, batch_size):
        # Sample whole episodes; the last (possibly unfinished) one is excluded
        if self.count < batch_size:
            batch = random.sample(
                list(itertools.islice(self.buffer, 0, len(self.buffer) - 1)), self.count
            )
        else:
            batch = random.sample(
                list(itertools.islice(self.buffer, 0, len(self.buffer) - 1)), batch_size
            )

        # Pick a random transition from each sampled episode
        idx = [random.randint(0, len(b) - 1) for b in batch]

        s_batch = []
        s2_batch = []
        for i in range(len(batch)):
            if idx[i] == len(batch[i]):
                s = batch[i]
                s2 = batch[i]
            else:
                s = batch[i][: idx[i] + 1]
                s2 = batch[i][: idx[i] + 1]
            s = [v[0] for v in s]
            s = s[::-1]

            s2 = [v[4] for v in s2]
            s2 = s2[::-1]

            # Pad short histories by repeating the oldest entry, else truncate to history_len
            if len(s) < self.history_len:
                missing = self.history_len - len(s)
                s += [s[-1]] * missing
                s2 += [s2[-1]] * missing
            else:
                s = s[: self.history_len]
                s2 = s2[: self.history_len]
            s = s[::-1]
            s_batch.append(s)
            s2 = s2[::-1]
            s2_batch.append(s2)

        a_batch = np.array([batch[i][idx[i]][1] for i in range(len(batch))])
        r_batch = np.array([batch[i][idx[i]][2] for i in range(len(batch))]).reshape(
            -1, 1
        )
        t_batch = np.array([batch[i][idx[i]][3] for i in range(len(batch))]).reshape(
            -1, 1
        )

        return np.array(s_batch), a_batch, r_batch, t_batch, np.array(s2_batch)

    def return_buffer(self):
        s = np.array([_[0] for _ in self.buffer])
        a = np.array([_[1] for _ in self.buffer])
        r = np.array([_[2] for _ in self.buffer]).reshape(-1, 1)
        t = np.array([_[3] for _ in self.buffer]).reshape(-1, 1)
        s2 = np.array([_[4] for _ in self.buffer])

        return s, a, r, t, s2

    def clear(self):
        self.buffer.clear()
        self.count = 0
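A minimal usage sketch of ReplayBuffer with the transition layout used throughout this repo (state, action, reward, terminal flag, next state); the dummy values are illustrative only:

import numpy as np
from replay_buffer import ReplayBuffer

buffer = ReplayBuffer(buffer_size=50000, random_seed=666)
s = np.zeros(25)              # state vector (see prepare_state in TD3.py)
a = np.array([0.1, -0.2])     # action
buffer.add(s, a, 0.5, 0, s)   # (s, a, r, t, s2)
s_b, a_b, r_b, t_b, s2_b = buffer.sample_batch(1)
assert r_b.shape == (1, 1)    # rewards and terminals come back as column vectors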
requirements.txt
ADDED
@@ -0,0 +1,5 @@
torch
tensorboard
numpy
gradio
ir-sim
robot_world.yaml
ADDED
@@ -0,0 +1,51 @@
world:
  height: 10  # the height of the world
  width: 10  # the width of the world
  step_time: 0.3  # time (s) per simulation step
  sample_time: 0.3  # time (s) between render and data extraction steps
  collision_mode: 'react'

robot:
  - kinematics: {name: 'diff'}
    shape: {name: 'circle', radius: 0.2}
    vel_min: [ 0, -1.0 ]
    vel_max: [ 1.0, 1.0 ]
    state: [2, 2, 0, 0]
    goal: [9, 9, 0]
    arrive_mode: position
    goal_threshold: 0.3

    sensors:
      - type: 'lidar2d'
        range_min: 0
        range_max: 7
        angle_range: 3.14
        number: 180
        noise: True
        std: 0.08
        angle_std: 0.1
        offset: [ 0, 0, 0 ]
        alpha: 0.3

    plot:
      show_trajectory: True

obstacle:
  - number: 5
    kinematics: {name: 'omni'}
    distribution: {name: 'random', range_low: [0, 0, -3.14], range_high: [10, 10, 3.14]}
    behavior: {name: 'rvo', wander: True, range_low: [0, 0, -3.14], range_high: [10, 10, 3.14], vxmax: 0.2, vymax: 0.2, factor: 1.0}
    vel_max: [0.2, 0.2]
    vel_min: [-0.2, -0.2]
    shape:
      - {name: 'circle', radius: 1.0, random_shape: True}
      - {name: 'polygon', random_shape: true, avg_radius_range: [0.5, 1.0], irregularity_range: [0, 0.4], spikeyness_range: [0, 0.4], num_vertices_range: [4, 6]}
  - shape: { name: 'rectangle', length: 1.0, width: 1.2 }  # length, width
    state: [ 8, 5, 1 ]
    kinematics: {name: 'static'}
  - shape: { name: 'rectangle', length: 0.5, width: 2.1 }  # length, width
    state: [ 1, 8, 1.3 ]
    kinematics: {name: 'static'}
  - shape: { name: 'linestring', vertices: [ [ 0, 0 ], [ 10, 0 ], [ 10, 10 ], [ 0, 10 ], [ 0, 0 ] ] }  # vertices
    kinematics: {name: 'static'}
    state: [ 0, 0, 0 ]
sim.py
ADDED
@@ -0,0 +1,95 @@
import irsim
import numpy as np
import random

import shapely
from irsim.lib.handler.geometry_handler import GeometryFactory
from irsim.world import ObjectBase


class SIM_ENV:
    def __init__(self, world_file="robot_world.yaml", save_ani=False):
        self.env = irsim.make(world_file, save_ani=save_ani)
        robot_info = self.env.get_robot_info(0)
        self.robot_goal = robot_info.goal

    def step(self, lin_velocity=0.0, ang_velocity=0.1):
        self.env.step(action_id=0, action=np.array([[lin_velocity], [ang_velocity]]))
        self.env.render()

        scan = self.env.get_lidar_scan()
        latest_scan = scan["ranges"]

        robot_state = self.env.get_robot_state()
        goal_vector = [
            self.robot_goal[0].item() - robot_state[0].item(),
            self.robot_goal[1].item() - robot_state[1].item(),
        ]
        distance = np.linalg.norm(goal_vector)
        goal = self.env.robot.arrive
        pose_vector = [np.cos(robot_state[2]).item(), np.sin(robot_state[2]).item()]
        cos, sin = self.cossin(pose_vector, goal_vector)
        collision = self.env.robot.collision
        action = [lin_velocity, ang_velocity]
        reward = self.get_reward(goal, collision, action, latest_scan)

        return latest_scan, distance, cos, sin, collision, goal, action, reward

    def reset(self, robot_state=None, robot_goal=None, random_obstacles=True):
        if robot_state is None:
            robot_state = [[random.uniform(1, 9)], [random.uniform(1, 9)], [0], [0]]

        self.env.robot.set_state(
            state=np.array(robot_state),
            init=True,
        )

        if random_obstacles:
            self.env.random_obstacle_position(
                range_low=[0, 0, -3.14],
                range_high=[10, 10, 3.14],
                ids=[i + 1 for i in range(7)],
                non_overlapping=True,
            )

        if robot_goal is None:
            # Resample the goal until it does not intersect any obstacle
            intersects = True
            while intersects:
                robot_goal = [[random.uniform(1, 9)], [random.uniform(1, 9)], [0]]
                shape = {"name": "circle", "radius": 0.4}
                state = [robot_goal[0], robot_goal[1], robot_goal[2]]
                gf = GeometryFactory.create_geometry(**shape)
                geometry = gf.step(np.c_[state])
                intersects = any(
                    [
                        shapely.intersects(geometry, obj._geometry)
                        for obj in self.env.obstacle_list
                    ]
                )
        self.env.robot.set_goal(np.array(robot_goal), init=True)
        self.env.reset()
        self.robot_goal = self.env.robot.goal

        action = [0.0, 0.0]
        latest_scan, distance, cos, sin, _, _, action, reward = self.step(
            lin_velocity=action[0], ang_velocity=action[1]
        )
        return latest_scan, distance, cos, sin, False, False, action, reward

    @staticmethod
    def cossin(vec1, vec2):
        vec1 = vec1 / np.linalg.norm(vec1)
        vec2 = vec2 / np.linalg.norm(vec2)
        cos = np.dot(vec1, vec2)
        sin = vec1[0] * vec2[1] - vec1[1] * vec2[0]

        return cos, sin

    @staticmethod
    def get_reward(goal, collision, action, laser_scan):
        if goal:
            return 100.0
        elif collision:
            return -100.0
        else:
            # Reward forward motion; penalize turning and obstacle proximity
            r3 = lambda x: 1.35 - x if x < 1.35 else 0.0
            return action[0] - abs(action[1]) / 2 - r3(min(laser_scan)) / 2
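To make the shaped reward in get_reward concrete: outside of terminal states it rewards forward velocity, penalizes turning, and penalizes obstacle proximity below 1.35 m. A worked example with hypothetical values:

# Hypothetical step: 0.4 m/s forward, 0.5 rad/s turn, nearest obstacle at 1.0 m.
lin, ang, min_scan = 0.4, 0.5, 1.0
r3 = (1.35 - min_scan) if min_scan < 1.35 else 0.0  # proximity penalty term
reward = lin - abs(ang) / 2 - r3 / 2                # 0.4 - 0.25 - 0.175 = -0.025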
train.py
ADDED
@@ -0,0 +1,142 @@
from models.TD3.TD3 import TD3

import torch
import numpy as np
from sim import SIM_ENV
from utils import get_buffer


def main(args=None):
    """Main training function"""
    action_dim = 2  # number of actions produced by the model
    max_action = 1  # maximum absolute value of output actions
    state_dim = 25  # number of input values in the neural network (vector length of state input)
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )  # use CUDA if it is available, CPU otherwise
    nr_eval_episodes = 10  # how many episodes to use to run evaluation
    max_epochs = 60  # max number of epochs
    epoch = 0  # starting epoch number
    episodes_per_epoch = 70  # how many episodes to run in a single epoch
    episode = 0  # starting episode number
    train_every_n = 2  # train and update network parameters every n episodes
    training_iterations = 80  # how many batches to use for a single training cycle
    batch_size = 64  # batch size for each training iteration
    max_steps = 300  # maximum number of steps in a single episode
    steps = 0  # starting step number
    load_saved_buffer = False  # whether to load experiences from assets/data.yml
    pretrain = False  # whether to use the loaded experiences to pre-train the model (load_saved_buffer must be True)
    pretraining_iterations = (
        10  # number of training iterations to run during pre-training
    )
    save_every = 10  # save the model every n training cycles

    model = TD3(
        state_dim=state_dim,
        action_dim=action_dim,
        max_action=max_action,
        device=device,
        save_every=save_every,
        load_model=False,
    )  # instantiate a model

    sim = SIM_ENV()  # instantiate environment
    replay_buffer = get_buffer(
        model,
        sim,
        load_saved_buffer,
        pretrain,
        pretraining_iterations,
        training_iterations,
        batch_size,
    )

    latest_scan, distance, cos, sin, collision, goal, a, reward = sim.step(
        lin_velocity=0.0, ang_velocity=0.0
    )  # get the initial step state

    while epoch < max_epochs:  # train until max_epochs is reached
        state, terminal = model.prepare_state(
            latest_scan, distance, cos, sin, collision, goal, a
        )  # get a state representation from the data returned by the environment

        action = model.get_action(np.array(state), True)  # get an action from the model
        a_in = [
            (action[0] + 1) / 4,
            action[1],
        ]  # map linear velocity to the [0, 0.5] m/s range

        latest_scan, distance, cos, sin, collision, goal, a, reward = sim.step(
            lin_velocity=a_in[0], ang_velocity=a_in[1]
        )  # get data from the environment
        next_state, terminal = model.prepare_state(
            latest_scan, distance, cos, sin, collision, goal, a
        )  # get the next state representation
        replay_buffer.add(
            state, action, reward, terminal, next_state
        )  # add experience to the replay buffer

        if (
            terminal or steps == max_steps
        ):  # reset the environment if a terminal state was reached or max_steps were taken
            latest_scan, distance, cos, sin, collision, goal, a, reward = sim.reset()
            episode += 1
            if episode % train_every_n == 0:
                model.train(
                    replay_buffer=replay_buffer,
                    iterations=training_iterations,
                    batch_size=batch_size,
                )  # train the model and update its parameters

            steps = 0
        else:
            steps += 1

        if (
            episode + 1
        ) % episodes_per_epoch == 0:  # if the epoch is concluded, run evaluation
            episode = 0
            epoch += 1
            evaluate(model, epoch, sim, eval_episodes=nr_eval_episodes)


def evaluate(model, epoch, sim, eval_episodes=10):
    print("..............................................")
    print(f"Epoch {epoch}. Evaluating scenarios")
    avg_reward = 0.0
    col = 0
    goals = 0
    for _ in range(eval_episodes):
        count = 0
        latest_scan, distance, cos, sin, collision, goal, a, reward = sim.reset()
        done = False
        while not done and count < 501:
            state, terminal = model.prepare_state(
                latest_scan, distance, cos, sin, collision, goal, a
            )
            action = model.get_action(np.array(state), False)
            a_in = [(action[0] + 1) / 4, action[1]]
            latest_scan, distance, cos, sin, collision, goal, a, reward = sim.step(
                lin_velocity=a_in[0], ang_velocity=a_in[1]
            )
            avg_reward += reward
            count += 1
            if collision:
                col += 1
            if goal:
                goals += 1
            done = collision or goal
    avg_reward /= eval_episodes
    avg_col = col / eval_episodes
    avg_goal = goals / eval_episodes
    print(f"Average Reward: {avg_reward}")
    print(f"Average Collision rate: {avg_col}")
    print(f"Average Goal rate: {avg_goal}")
    print("..............................................")
    model.writer.add_scalar("eval/avg_reward", avg_reward, epoch)
    model.writer.add_scalar("eval/avg_col", avg_col, epoch)
    model.writer.add_scalar("eval/avg_goal", avg_goal, epoch)


if __name__ == "__main__":
    main()
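Training is launched by running train.py directly; progress is logged through the SummaryWriter created in TD3.__init__, which (assuming PyTorch's default) writes to a runs/ directory that TensorBoard can watch. A minimal sketch:

# Minimal sketch: run training and monitor the logged scalars.
# Assumes the default SummaryWriter() log directory ("runs/").
from train import main

if __name__ == "__main__":
    main()  # logs train/* and eval/* scalars via model.writer
# afterwards, inspect with: tensorboard --logdir runs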
utils.py
ADDED
@@ -0,0 +1,123 @@
from typing import List
from tqdm import tqdm
import yaml

from replay_buffer import ReplayBuffer, RolloutReplayBuffer


class Pretraining:
    def __init__(
        self,
        file_names: List[str],
        model: object,
        replay_buffer: object,
        reward_function,
    ):
        self.file_names = file_names
        self.model = model
        self.replay_buffer = replay_buffer
        self.reward_function = reward_function

    def load_buffer(self):
        for file_name in self.file_names:
            print("Loading file: ", file_name)
            with open(file_name, "r") as file:
                samples = yaml.full_load(file)
                for i in tqdm(range(1, len(samples) - 1)):
                    sample = samples[i]
                    latest_scan = sample["latest_scan"]
                    distance = sample["distance"]
                    cos = sample["cos"]
                    sin = sample["sin"]
                    collision = sample["collision"]
                    goal = sample["goal"]
                    action = sample["action"]

                    state, terminal = self.model.prepare_state(
                        latest_scan, distance, cos, sin, collision, goal, action
                    )

                    if terminal:
                        continue

                    next_sample = samples[i + 1]
                    next_latest_scan = next_sample["latest_scan"]
                    next_distance = next_sample["distance"]
                    next_cos = next_sample["cos"]
                    next_sin = next_sample["sin"]
                    next_collision = next_sample["collision"]
                    next_goal = next_sample["goal"]
                    next_action = next_sample["action"]
                    next_state, next_terminal = self.model.prepare_state(
                        next_latest_scan,
                        next_distance,
                        next_cos,
                        next_sin,
                        next_collision,
                        next_goal,
                        next_action,
                    )
                    reward = self.reward_function(
                        next_goal, next_collision, action, next_latest_scan
                    )
                    self.replay_buffer.add(
                        state, action, reward, next_terminal, next_state
                    )

        return self.replay_buffer

    def train(
        self,
        pretraining_iterations,
        replay_buffer,
        iterations,
        batch_size,
    ):
        print("Running Pretraining")
        for _ in tqdm(range(pretraining_iterations)):
            self.model.train(
                replay_buffer=replay_buffer,
                iterations=iterations,
                batch_size=batch_size,
            )
        print("Model Pretrained")


def get_buffer(
    model,
    sim,
    load_saved_buffer,
    pretrain,
    pretraining_iterations,
    training_iterations,
    batch_size,
    buffer_size=50000,
    random_seed=666,
    file_names=["assets/data.yml"],
    history_len=10,
):
    replay_buffer = ReplayBuffer(buffer_size=buffer_size, random_seed=random_seed)

    if pretrain:
        assert (
            load_saved_buffer
        ), "To pre-train the model, load_saved_buffer must be set to True"

    if load_saved_buffer:
        pretraining = Pretraining(
            file_names=file_names,
            model=model,
            replay_buffer=replay_buffer,
            reward_function=sim.get_reward,
        )  # instantiate pre-training
        replay_buffer = (
            pretraining.load_buffer()
        )  # fill the buffer with experiences from the data.yml file
        if pretrain:
            pretraining.train(
                pretraining_iterations=pretraining_iterations,
                replay_buffer=replay_buffer,
                iterations=training_iterations,
                batch_size=batch_size,
            )  # run pre-training

    return replay_buffer