Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ MANIFEST

# ruff
.ruff_cache/
training/checkpoints/
tb/
91 changes: 91 additions & 0 deletions examples/07_rl_hover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""Example 07: Train a PPO policy for quadcopter hover stabilization.

This is the "hello world" of the RL pipeline — it trains for 50k steps
using the default PPO config and evaluates the resulting policy.

Requires: pip install stable-baselines3 tensorboard
"""

import os
import sys

try:
from stable_baselines3 import PPO
except ImportError:
print("stable-baselines3 is required. Install: pip install stable-baselines3")
sys.exit(1)

import numpy as np

from drones_sim.rl.actions import ThrustBodyRatesAction
from drones_sim.rl.env import QuadcopterEnv
from drones_sim.rl.observations import RelativeStateObs
from drones_sim.rl.reward import RewardConfig, reward
from drones_sim.rl.tasks import HoverTask

CHECKPOINT_DIR = os.path.join(os.path.dirname(__file__), "..", "training", "checkpoints")


def main():
n_steps = 50000
print(f"Training PPO for {n_steps:,} steps on hover task...")

env = QuadcopterEnv(
task=HoverTask(target=(0.0, 0.0, 2.0)),
action_param=ThrustBodyRatesAction(),
obs_builder=RelativeStateObs(),
reward_fn=reward,
reward_cfg=RewardConfig(),
dt=0.01,
episode_len_s=10.0,
seed=0,
)

from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

vec_env = DummyVecEnv([lambda: env])
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0)

os.makedirs(CHECKPOINT_DIR, exist_ok=True)

model = PPO(
"MlpPolicy", vec_env,
learning_rate=3e-4, n_steps=1024, batch_size=64,
n_epochs=10, gamma=0.99, gae_lambda=0.95,
clip_range=0.2, ent_coef=0.01,
policy_kwargs={"net_arch": [128, 128]},
tensorboard_log="./tb/ppo_hover",
verbose=1,
)

callbacks = [
CheckpointCallback(save_freq=10000, save_path=CHECKPOINT_DIR, name_prefix="ppo_quad"),
]

model.learn(total_timesteps=n_steps, callback=callbacks, progress_bar=True)

model_path = os.path.join(CHECKPOINT_DIR, "ppo_hover_final")
model.save(model_path)
vec_env.save(os.path.join(CHECKPOINT_DIR, "vecnormalize.pkl"))
print(f"Model saved to {model_path}")

# Quick evaluation
print("\nEvaluating...")
vec_env.set_attr("reward_cfg", RewardConfig())
obs = vec_env.reset()
errors = []
for _ in range(200):
action, _ = model.predict(obs, deterministic=True)
obs, r, dones, infos = vec_env.step(action)
pos = env.quad.get_position()
errors.append(np.linalg.norm(pos - np.array([0, 0, 2])))

rmse = np.sqrt(np.mean(np.square(errors[-50:])))
print(f"Final position RMSE (last 50 steps): {rmse:.4f} m")
print("Done! Run 'tensorboard --logdir tb/' to view training curves.")


if __name__ == "__main__":
main()
144 changes: 144 additions & 0 deletions examples/08_rl_vs_pid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""Example 08: Compare RL policy vs cascaded PID on the same trajectory.

Runs a circular trajectory with both:
1. The standard cascaded PID controller
2. A pre-trained RL policy (optimised for trajectory tracking)

Produces a 4-panel matplotlib comparison:
- Position RMSE
- Final error
- Control smoothness
- Success/failure status

Requires: pip install stable-baselines3
"""

import os
import sys

import matplotlib.pyplot as plt
import numpy as np

from drones_sim.control import QuadcopterController
from drones_sim.dynamics import QuadcopterDynamics
from drones_sim.rl.actions import ThrustBodyRatesAction
from drones_sim.rl.env import QuadcopterEnv
from drones_sim.rl.observations import RelativeStateObs
from drones_sim.rl.reward import RewardConfig, reward
from drones_sim.rl.tasks import TrackingTask
from drones_sim.trajectory import generate_circular

DT = 0.01


def _run_pid(traj):
quad = QuadcopterDynamics(motor_time_constant=0.04)
ctrl = QuadcopterController(quad)
quad.reset(position=traj.position[0].copy())
ctrl.reset()
errors = []
prev_target = traj.position[0].copy()
for i in range(len(traj.t)):
target = traj.position[i]
motors = ctrl.compute(target, 0.0, DT, prev_target)
quad.update(DT, motors)
errors.append(np.linalg.norm(quad.get_position() - target))
prev_target = target.copy()
return errors


def _run_rl(traj, model_path):
try:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
except ImportError:
print("stable-baselines3 required. Install: pip install stable-baselines3")
return None

env = QuadcopterEnv(
task=TrackingTask(traj),
action_param=ThrustBodyRatesAction(),
obs_builder=RelativeStateObs(),
reward_fn=reward,
reward_cfg=RewardConfig(),
dt=DT,
episode_len_s=traj.t[-1],
)
vec_env = DummyVecEnv([lambda: env])
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, training=False)
stats_path = os.path.join(os.path.dirname(model_path), "vecnormalize.pkl")
if os.path.exists(stats_path):
vec_env.load_running_average(os.path.dirname(model_path))

model = PPO.load(model_path, env=vec_env)
obs = vec_env.reset()
errors = []
for _ in range(len(traj.t)):
action, _ = model.predict(obs, deterministic=True)
obs, _, dones, _ = vec_env.step(action)
if dones[0]:
break
errors.append(np.linalg.norm(
traj.position[min(env._step_idx, len(traj.position) - 1)] - env.quad.get_position()
))
return errors


def main():
model_path = os.path.join(
os.path.dirname(__file__), "..", "training", "checkpoints", "ppo_hover_final.zip"
)
if not os.path.exists(model_path):
print(f"Model not found at {model_path}")
print("Run example 07 first or train a model with training/train_ppo.py")
sys.exit(1)

traj = generate_circular(duration=10.0, sample_rate=int(1 / DT), radius=2.0, angular_vel=0.5)

print("Running PID controller...")
pid_errors = _run_pid(traj)

print("Running RL policy...")
rl_errors = _run_rl(traj, model_path)

if rl_errors is None:
return

pid_rmse = np.sqrt(np.mean(np.square(pid_errors)))
rl_rmse = np.sqrt(np.mean(np.square(rl_errors)))
pid_final = pid_errors[-1]
rl_final = rl_errors[-1]

fig, axes = plt.subplots(2, 2, figsize=(10, 8))

axes[0, 0].bar(["PID", "RL"], [pid_rmse, rl_rmse], color=["steelblue", "darkorange"])
axes[0, 0].set_ylabel("Position RMSE (m)")
axes[0, 0].set_title("Tracking RMSE")

axes[0, 1].bar(["PID", "RL"], [pid_final, rl_final], color=["steelblue", "darkorange"])
axes[0, 1].set_ylabel("Final Error (m)")
axes[0, 1].set_title("Final Position Error")

n = min(len(pid_errors), len(rl_errors))
t_arr = traj.t[:n]
axes[1, 0].plot(t_arr, pid_errors[:n], label="PID", color="steelblue")
axes[1, 0].plot(t_arr, rl_errors[:n], label="RL", color="darkorange")
axes[1, 0].set_xlabel("Time (s)")
axes[1, 0].set_ylabel("Error (m)")
axes[1, 0].set_title("Error vs Time")
axes[1, 0].legend()
axes[1, 0].grid(True)

axes[1, 1].axis("off")
axes[1, 1].text(0.1, 0.5, f"PID RMSE: {pid_rmse:.4f} m\nRL RMSE: {rl_rmse:.4f} m\n\n"
f"PID final err: {pid_final:.4f} m\nRL final err: {rl_final:.4f} m",
fontsize=12, family="monospace")

fig.suptitle("Cascaded PID vs RL (PPO hover policy) — Circular Tracking")
fig.tight_layout()
plt.show()


if __name__ == "__main__":
main()
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ dependencies = [

[project.optional-dependencies]
dev = ["pytest>=7.0", "ruff"]
rl = ["torch>=2.2", "stable-baselines3>=2.3", "gymnasium>=0.29", "tensorboard>=2.15", "pyyaml>=6.0"]
rl-dev = ["drones-sim[rl]", "wandb", "optuna>=3.5", "moviepy"]

[tool.setuptools.packages.find]
where = ["src"]
Expand Down
102 changes: 102 additions & 0 deletions tests/test_rl_training.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Tests for the RL training pipeline (requires torch + SB3)."""

import importlib.util
import os

import numpy as np
import pytest

_HAS_SB3 = importlib.util.find_spec("stable_baselines3") is not None

if _HAS_SB3:
from drones_sim.rl.actions import ThrustBodyRatesAction
from drones_sim.rl.env import QuadcopterEnv
from drones_sim.rl.observations import RelativeStateObs
from drones_sim.rl.reward import RewardConfig, reward
from drones_sim.rl.tasks import HoverTask

CHECKPOINT_DIR = os.path.join(
os.path.dirname(__file__), "..", "training", "checkpoints"
)


@pytest.mark.skipif(not _HAS_SB3, reason="stable-baselines3 not installed")
def test_train_ppo_smoke():
"""Smoke-train PPO for 1000 steps — verifies pipeline doesn't crash."""
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

env = QuadcopterEnv(
task=HoverTask(target=(0.0, 0.0, 2.0)),
action_param=ThrustBodyRatesAction(),
obs_builder=RelativeStateObs(),
reward_fn=reward,
reward_cfg=RewardConfig(),
dt=0.01,
episode_len_s=5.0,
seed=0,
)
vec_env = DummyVecEnv([lambda: env])
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0)

model = PPO(
"MlpPolicy", vec_env,
learning_rate=3e-4, n_steps=256, batch_size=64,
n_epochs=4, gamma=0.99,
policy_kwargs={"net_arch": [64, 64]},
verbose=0,
)
model.learn(total_timesteps=1000)

# Save and reload
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
path = os.path.join(CHECKPOINT_DIR, "ppo_smoke_test")
model.save(path)
assert os.path.exists(path + ".zip")

# Quick deterministic prediction
obs = vec_env.reset()
action, _ = model.predict(obs, deterministic=True)
assert action.shape == (1, 4)
assert np.isfinite(action).all()

env.close()


@pytest.mark.skipif(not _HAS_SB3, reason="stable-baselines3 not installed")
def test_eval_policy_runs():
"""eval_policy.evaluate() should return expected metrics dict."""
import tempfile

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

env = QuadcopterEnv(
task=HoverTask(target=(0.0, 0.0, 2.0)),
action_param=ThrustBodyRatesAction(),
obs_builder=RelativeStateObs(),
reward_fn=reward,
reward_cfg=RewardConfig(),
dt=0.01, episode_len_s=2.0, seed=0,
)
vec_env = DummyVecEnv([lambda: env])
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0)

# Train a tiny model
model = PPO("MlpPolicy", vec_env, policy_kwargs={"net_arch": [32, 32]}, verbose=0)
model.learn(total_timesteps=500)

with tempfile.TemporaryDirectory() as tmpdir:
model_path = os.path.join(tmpdir, "test_model")
model.save(model_path)

from training.eval_policy import evaluate

results = evaluate(model_path + ".zip", n_episodes=2, seed=0)
assert isinstance(results, dict)
for key in ["pos_rmse", "success_rate", "crash_rate", "mean_reward"]:
assert key in results, f"Missing key: {key}"
assert 0.0 <= results["success_rate"] <= 1.0
assert 0.0 <= results["crash_rate"] <= 1.0

env.close()
30 changes: 30 additions & 0 deletions training/configs/ppo_hover.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# PPO hover training configuration
env:
dt: 0.01
episode_len_s: 10.0
target: [0.0, 0.0, 2.0]
reward:
w_pos: 1.0
w_vel: 0.05
w_attitude: 0.1
w_action: 0.01
w_action_d: 0.005
w_alive: 0.1
w_reach: 10.0
w_crash: -10.0
reach_radius: 0.1

ppo:
policy: MlpPolicy
learning_rate: 3.0e-4
n_steps: 1024
batch_size: 64
n_epochs: 10
gamma: 0.99
gae_lambda: 0.95
clip_range: 0.2
ent_coef: 0.01
policy_kwargs:
net_arch: [128, 128]
save_freq: 10000
tensorboard_log: ./tb/ppo_hover
Loading
Loading