diff --git a/.gitignore b/.gitignore index 4265008..4ceeec8 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ MANIFEST # ruff .ruff_cache/ +training/checkpoints/ +tb/ diff --git a/examples/07_rl_hover.py b/examples/07_rl_hover.py new file mode 100644 index 0000000..e7d4089 --- /dev/null +++ b/examples/07_rl_hover.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Example 07: Train a PPO policy for quadcopter hover stabilization. + +This is the "hello world" of the RL pipeline — it trains for 50k steps +using the default PPO config and evaluates the resulting policy. + +Requires: pip install stable-baselines3 tensorboard +""" + +import os +import sys + +try: + from stable_baselines3 import PPO +except ImportError: + print("stable-baselines3 is required. Install: pip install stable-baselines3") + sys.exit(1) + +import numpy as np + +from drones_sim.rl.actions import ThrustBodyRatesAction +from drones_sim.rl.env import QuadcopterEnv +from drones_sim.rl.observations import RelativeStateObs +from drones_sim.rl.reward import RewardConfig, reward +from drones_sim.rl.tasks import HoverTask + +CHECKPOINT_DIR = os.path.join(os.path.dirname(__file__), "..", "training", "checkpoints") + + +def main(): + n_steps = 50000 + print(f"Training PPO for {n_steps:,} steps on hover task...") + + env = QuadcopterEnv( + task=HoverTask(target=(0.0, 0.0, 2.0)), + action_param=ThrustBodyRatesAction(), + obs_builder=RelativeStateObs(), + reward_fn=reward, + reward_cfg=RewardConfig(), + dt=0.01, + episode_len_s=10.0, + seed=0, + ) + + from stable_baselines3.common.callbacks import CheckpointCallback + from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize + + vec_env = DummyVecEnv([lambda: env]) + vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0) + + os.makedirs(CHECKPOINT_DIR, exist_ok=True) + + model = PPO( + "MlpPolicy", vec_env, + learning_rate=3e-4, n_steps=1024, batch_size=64, + n_epochs=10, gamma=0.99, gae_lambda=0.95, + clip_range=0.2, ent_coef=0.01, + policy_kwargs={"net_arch": [128, 128]}, + tensorboard_log="./tb/ppo_hover", + verbose=1, + ) + + callbacks = [ + CheckpointCallback(save_freq=10000, save_path=CHECKPOINT_DIR, name_prefix="ppo_quad"), + ] + + model.learn(total_timesteps=n_steps, callback=callbacks, progress_bar=True) + + model_path = os.path.join(CHECKPOINT_DIR, "ppo_hover_final") + model.save(model_path) + vec_env.save(os.path.join(CHECKPOINT_DIR, "vecnormalize.pkl")) + print(f"Model saved to {model_path}") + + # Quick evaluation + print("\nEvaluating...") + vec_env.set_attr("reward_cfg", RewardConfig()) + obs = vec_env.reset() + errors = [] + for _ in range(200): + action, _ = model.predict(obs, deterministic=True) + obs, r, dones, infos = vec_env.step(action) + pos = env.quad.get_position() + errors.append(np.linalg.norm(pos - np.array([0, 0, 2]))) + + rmse = np.sqrt(np.mean(np.square(errors[-50:]))) + print(f"Final position RMSE (last 50 steps): {rmse:.4f} m") + print("Done! Run 'tensorboard --logdir tb/' to view training curves.") + + +if __name__ == "__main__": + main() diff --git a/examples/08_rl_vs_pid.py b/examples/08_rl_vs_pid.py new file mode 100644 index 0000000..8c6b9c0 --- /dev/null +++ b/examples/08_rl_vs_pid.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +"""Example 08: Compare RL policy vs cascaded PID on the same trajectory. + +Runs a circular trajectory with both: + 1. The standard cascaded PID controller + 2. A pre-trained RL policy (optimised for trajectory tracking) + +Produces a 4-panel matplotlib comparison: + - Position RMSE + - Final error + - Control smoothness + - Success/failure status + +Requires: pip install stable-baselines3 +""" + +import os +import sys + +import matplotlib.pyplot as plt +import numpy as np + +from drones_sim.control import QuadcopterController +from drones_sim.dynamics import QuadcopterDynamics +from drones_sim.rl.actions import ThrustBodyRatesAction +from drones_sim.rl.env import QuadcopterEnv +from drones_sim.rl.observations import RelativeStateObs +from drones_sim.rl.reward import RewardConfig, reward +from drones_sim.rl.tasks import TrackingTask +from drones_sim.trajectory import generate_circular + +DT = 0.01 + + +def _run_pid(traj): + quad = QuadcopterDynamics(motor_time_constant=0.04) + ctrl = QuadcopterController(quad) + quad.reset(position=traj.position[0].copy()) + ctrl.reset() + errors = [] + prev_target = traj.position[0].copy() + for i in range(len(traj.t)): + target = traj.position[i] + motors = ctrl.compute(target, 0.0, DT, prev_target) + quad.update(DT, motors) + errors.append(np.linalg.norm(quad.get_position() - target)) + prev_target = target.copy() + return errors + + +def _run_rl(traj, model_path): + try: + from stable_baselines3 import PPO + from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize + except ImportError: + print("stable-baselines3 required. Install: pip install stable-baselines3") + return None + + env = QuadcopterEnv( + task=TrackingTask(traj), + action_param=ThrustBodyRatesAction(), + obs_builder=RelativeStateObs(), + reward_fn=reward, + reward_cfg=RewardConfig(), + dt=DT, + episode_len_s=traj.t[-1], + ) + vec_env = DummyVecEnv([lambda: env]) + vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, training=False) + stats_path = os.path.join(os.path.dirname(model_path), "vecnormalize.pkl") + if os.path.exists(stats_path): + vec_env.load_running_average(os.path.dirname(model_path)) + + model = PPO.load(model_path, env=vec_env) + obs = vec_env.reset() + errors = [] + for _ in range(len(traj.t)): + action, _ = model.predict(obs, deterministic=True) + obs, _, dones, _ = vec_env.step(action) + if dones[0]: + break + errors.append(np.linalg.norm( + traj.position[min(env._step_idx, len(traj.position) - 1)] - env.quad.get_position() + )) + return errors + + +def main(): + model_path = os.path.join( + os.path.dirname(__file__), "..", "training", "checkpoints", "ppo_hover_final.zip" + ) + if not os.path.exists(model_path): + print(f"Model not found at {model_path}") + print("Run example 07 first or train a model with training/train_ppo.py") + sys.exit(1) + + traj = generate_circular(duration=10.0, sample_rate=int(1 / DT), radius=2.0, angular_vel=0.5) + + print("Running PID controller...") + pid_errors = _run_pid(traj) + + print("Running RL policy...") + rl_errors = _run_rl(traj, model_path) + + if rl_errors is None: + return + + pid_rmse = np.sqrt(np.mean(np.square(pid_errors))) + rl_rmse = np.sqrt(np.mean(np.square(rl_errors))) + pid_final = pid_errors[-1] + rl_final = rl_errors[-1] + + fig, axes = plt.subplots(2, 2, figsize=(10, 8)) + + axes[0, 0].bar(["PID", "RL"], [pid_rmse, rl_rmse], color=["steelblue", "darkorange"]) + axes[0, 0].set_ylabel("Position RMSE (m)") + axes[0, 0].set_title("Tracking RMSE") + + axes[0, 1].bar(["PID", "RL"], [pid_final, rl_final], color=["steelblue", "darkorange"]) + axes[0, 1].set_ylabel("Final Error (m)") + axes[0, 1].set_title("Final Position Error") + + n = min(len(pid_errors), len(rl_errors)) + t_arr = traj.t[:n] + axes[1, 0].plot(t_arr, pid_errors[:n], label="PID", color="steelblue") + axes[1, 0].plot(t_arr, rl_errors[:n], label="RL", color="darkorange") + axes[1, 0].set_xlabel("Time (s)") + axes[1, 0].set_ylabel("Error (m)") + axes[1, 0].set_title("Error vs Time") + axes[1, 0].legend() + axes[1, 0].grid(True) + + axes[1, 1].axis("off") + axes[1, 1].text(0.1, 0.5, f"PID RMSE: {pid_rmse:.4f} m\nRL RMSE: {rl_rmse:.4f} m\n\n" + f"PID final err: {pid_final:.4f} m\nRL final err: {rl_final:.4f} m", + fontsize=12, family="monospace") + + fig.suptitle("Cascaded PID vs RL (PPO hover policy) — Circular Tracking") + fig.tight_layout() + plt.show() + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 10da453..e9a3365 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,8 @@ dependencies = [ [project.optional-dependencies] dev = ["pytest>=7.0", "ruff"] +rl = ["torch>=2.2", "stable-baselines3>=2.3", "gymnasium>=0.29", "tensorboard>=2.15", "pyyaml>=6.0"] +rl-dev = ["drones-sim[rl]", "wandb", "optuna>=3.5", "moviepy"] [tool.setuptools.packages.find] where = ["src"] diff --git a/tests/test_rl_training.py b/tests/test_rl_training.py new file mode 100644 index 0000000..1bd1f5f --- /dev/null +++ b/tests/test_rl_training.py @@ -0,0 +1,102 @@ +"""Tests for the RL training pipeline (requires torch + SB3).""" + +import importlib.util +import os + +import numpy as np +import pytest + +_HAS_SB3 = importlib.util.find_spec("stable_baselines3") is not None + +if _HAS_SB3: + from drones_sim.rl.actions import ThrustBodyRatesAction + from drones_sim.rl.env import QuadcopterEnv + from drones_sim.rl.observations import RelativeStateObs + from drones_sim.rl.reward import RewardConfig, reward + from drones_sim.rl.tasks import HoverTask + + CHECKPOINT_DIR = os.path.join( + os.path.dirname(__file__), "..", "training", "checkpoints" + ) + + +@pytest.mark.skipif(not _HAS_SB3, reason="stable-baselines3 not installed") +def test_train_ppo_smoke(): + """Smoke-train PPO for 1000 steps — verifies pipeline doesn't crash.""" + from stable_baselines3 import PPO + from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize + + env = QuadcopterEnv( + task=HoverTask(target=(0.0, 0.0, 2.0)), + action_param=ThrustBodyRatesAction(), + obs_builder=RelativeStateObs(), + reward_fn=reward, + reward_cfg=RewardConfig(), + dt=0.01, + episode_len_s=5.0, + seed=0, + ) + vec_env = DummyVecEnv([lambda: env]) + vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0) + + model = PPO( + "MlpPolicy", vec_env, + learning_rate=3e-4, n_steps=256, batch_size=64, + n_epochs=4, gamma=0.99, + policy_kwargs={"net_arch": [64, 64]}, + verbose=0, + ) + model.learn(total_timesteps=1000) + + # Save and reload + os.makedirs(CHECKPOINT_DIR, exist_ok=True) + path = os.path.join(CHECKPOINT_DIR, "ppo_smoke_test") + model.save(path) + assert os.path.exists(path + ".zip") + + # Quick deterministic prediction + obs = vec_env.reset() + action, _ = model.predict(obs, deterministic=True) + assert action.shape == (1, 4) + assert np.isfinite(action).all() + + env.close() + + +@pytest.mark.skipif(not _HAS_SB3, reason="stable-baselines3 not installed") +def test_eval_policy_runs(): + """eval_policy.evaluate() should return expected metrics dict.""" + import tempfile + + from stable_baselines3 import PPO + from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize + + env = QuadcopterEnv( + task=HoverTask(target=(0.0, 0.0, 2.0)), + action_param=ThrustBodyRatesAction(), + obs_builder=RelativeStateObs(), + reward_fn=reward, + reward_cfg=RewardConfig(), + dt=0.01, episode_len_s=2.0, seed=0, + ) + vec_env = DummyVecEnv([lambda: env]) + vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0) + + # Train a tiny model + model = PPO("MlpPolicy", vec_env, policy_kwargs={"net_arch": [32, 32]}, verbose=0) + model.learn(total_timesteps=500) + + with tempfile.TemporaryDirectory() as tmpdir: + model_path = os.path.join(tmpdir, "test_model") + model.save(model_path) + + from training.eval_policy import evaluate + + results = evaluate(model_path + ".zip", n_episodes=2, seed=0) + assert isinstance(results, dict) + for key in ["pos_rmse", "success_rate", "crash_rate", "mean_reward"]: + assert key in results, f"Missing key: {key}" + assert 0.0 <= results["success_rate"] <= 1.0 + assert 0.0 <= results["crash_rate"] <= 1.0 + + env.close() diff --git a/training/configs/ppo_hover.yaml b/training/configs/ppo_hover.yaml new file mode 100644 index 0000000..3c23013 --- /dev/null +++ b/training/configs/ppo_hover.yaml @@ -0,0 +1,30 @@ +# PPO hover training configuration +env: + dt: 0.01 + episode_len_s: 10.0 + target: [0.0, 0.0, 2.0] + reward: + w_pos: 1.0 + w_vel: 0.05 + w_attitude: 0.1 + w_action: 0.01 + w_action_d: 0.005 + w_alive: 0.1 + w_reach: 10.0 + w_crash: -10.0 + reach_radius: 0.1 + +ppo: + policy: MlpPolicy + learning_rate: 3.0e-4 + n_steps: 1024 + batch_size: 64 + n_epochs: 10 + gamma: 0.99 + gae_lambda: 0.95 + clip_range: 0.2 + ent_coef: 0.01 + policy_kwargs: + net_arch: [128, 128] + save_freq: 10000 + tensorboard_log: ./tb/ppo_hover diff --git a/training/eval_policy.py b/training/eval_policy.py new file mode 100644 index 0000000..a9bceeb --- /dev/null +++ b/training/eval_policy.py @@ -0,0 +1,110 @@ +"""Evaluate a trained RL policy and return aggregate metrics. + +Usage:: + + python -m training.eval_policy --path checkpoints/final.zip --episodes 20 +""" + +from __future__ import annotations + +import argparse + +import numpy as np + +from drones_sim.rl.actions import ThrustBodyRatesAction +from drones_sim.rl.env import QuadcopterEnv +from drones_sim.rl.observations import RelativeStateObs +from drones_sim.rl.reward import RewardConfig, reward +from drones_sim.rl.tasks import HoverTask + + +def evaluate( + model_path: str, + n_episodes: int = 10, + deterministic: bool = True, + seed: int = 0, +) -> dict: + """Load a checkpoint and evaluate over *n_episodes*. + + Returns a dict with keys: pos_rmse, success_rate, crash_rate, mean_reward. + """ + from stable_baselines3 import PPO + from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize + + env = QuadcopterEnv( + task=HoverTask(target=(0.0, 0.0, 2.0)), + action_param=ThrustBodyRatesAction(), + obs_builder=RelativeStateObs(), + reward_fn=reward, + reward_cfg=RewardConfig(), + dt=0.01, episode_len_s=10.0, + ) + vec_env = DummyVecEnv([lambda: env]) + vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, training=False) + + import os + + stats_path = os.path.join(os.path.dirname(model_path), "vecnormalize.pkl") + if os.path.exists(stats_path): + # SB3 VecNormalize saves stats as a .pkl; load with the same class + _vn = VecNormalize.load(stats_path, vec_env) + + model = PPO.load(model_path, env=vec_env) + + errors = [] + successes = [] + crashes = [] + total_reward = 0.0 + + for ep in range(n_episodes): + obs = vec_env.reset() + ep_reward = 0.0 + crashed = False + done = False + while not done: + action, _ = model.predict(obs, deterministic=deterministic) + obs, r, dones, infos = vec_env.step(action) + ep_reward += float(r[0]) + if dones[0]: + crashed = True + done = True + # Also check env state + pos_err = float(np.linalg.norm( + env.task.target_pos(env.quad) - env.quad.get_position() + )) + errors.append(pos_err) + if env._step_idx >= env.max_steps: + done = True + if crashed: + break + + total_reward += ep_reward + # Success: no crash and final error < 0.2 m + final_err = float(np.linalg.norm( + env.task.target_pos(env.quad) - env.quad.get_position() + )) + successes.append(not crashed and final_err < 0.2) + crashes.append(crashed) + + return { + "pos_rmse": float(np.sqrt(np.mean(np.square(errors)))), + "success_rate": float(np.mean(successes)), + "crash_rate": float(np.mean(crashes)), + "mean_reward": total_reward / n_episodes, + } + + +def main() -> None: + parser = argparse.ArgumentParser(description="Evaluate RL policy") + parser.add_argument("--path", required=True, help="Path to model .zip file") + parser.add_argument("--episodes", type=int, default=20) + parser.add_argument("--seed", type=int, default=0) + args = parser.parse_args() + + results = evaluate(args.path, n_episodes=args.episodes, seed=args.seed) + for k, v in results.items(): + print(f"{k:>15s}: {v:.4f}") + + +if __name__ == "__main__": + main() diff --git a/training/train_ppo.py b/training/train_ppo.py new file mode 100644 index 0000000..74ed45e --- /dev/null +++ b/training/train_ppo.py @@ -0,0 +1,101 @@ +"""SB3 PPO training entry point for quadcopter RL. + +Usage:: + + python -m training.train_ppo --config configs/ppo_hover.yaml --timesteps 50000 +""" + +from __future__ import annotations + +import argparse +import os + +import yaml + +from drones_sim.rl.actions import ThrustBodyRatesAction +from drones_sim.rl.env import QuadcopterEnv +from drones_sim.rl.observations import RelativeStateObs +from drones_sim.rl.reward import RewardConfig, reward +from drones_sim.rl.tasks import HoverTask + +_CHECKPOINT_DIR = os.path.join(os.path.dirname(__file__), "checkpoints") + + +def _make_env(rank: int, cfg: dict, seed: int) -> QuadcopterEnv: + task = HoverTask(target=tuple(cfg.get("target", [0.0, 0.0, 2.0]))) + r_cfg = RewardConfig(**cfg.get("reward", {})) + return QuadcopterEnv( + task=task, + action_param=ThrustBodyRatesAction(), + obs_builder=RelativeStateObs(), + reward_fn=reward, + reward_cfg=r_cfg, + dt=cfg.get("dt", 0.01), + episode_len_s=cfg.get("episode_len_s", 10.0), + seed=seed + rank, + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Train PPO on quadcopter env") + parser.add_argument("--config", required=True, help="YAML config file") + parser.add_argument("--timesteps", type=int, default=50000) + parser.add_argument("--n-envs", type=int, default=4) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--render", action="store_true", help="Render with viser") + args = parser.parse_args() + + with open(args.config) as f: + cfg = yaml.safe_load(f) + + ppo_cfg = cfg.get("ppo", {}) + env_cfg = cfg.get("env", {}) + + from stable_baselines3 import PPO + from stable_baselines3.common.callbacks import CheckpointCallback + from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize + + print(f"Creating {args.n_envs} vectorised environments...") + venv = SubprocVecEnv([ + lambda i=i: _make_env(i, env_cfg, args.seed) for i in range(args.n_envs) + ]) + venv = VecNormalize(venv, norm_obs=True, norm_reward=False, clip_obs=10.0) + + os.makedirs(_CHECKPOINT_DIR, exist_ok=True) + + model = PPO( + policy=ppo_cfg.get("policy", "MlpPolicy"), + env=venv, + learning_rate=ppo_cfg.get("learning_rate", 3e-4), + n_steps=ppo_cfg.get("n_steps", 1024), + batch_size=ppo_cfg.get("batch_size", 64), + n_epochs=ppo_cfg.get("n_epochs", 10), + gamma=ppo_cfg.get("gamma", 0.99), + gae_lambda=ppo_cfg.get("gae_lambda", 0.95), + clip_range=ppo_cfg.get("clip_range", 0.2), + ent_coef=ppo_cfg.get("ent_coef", 0.01), + policy_kwargs=ppo_cfg.get("policy_kwargs", {"net_arch": [128, 128]}), + tensorboard_log=ppo_cfg.get("tensorboard_log", "./tb/ppo_hover"), + verbose=1, + ) + + callbacks = [ + CheckpointCallback( + save_freq=ppo_cfg.get("save_freq", 10000), + save_path=_CHECKPOINT_DIR, + name_prefix="ppo_quad", + ), + ] + + print(f"Training for {args.timesteps:,} timesteps...") + model.learn(total_timesteps=args.timesteps, callback=callbacks, progress_bar=True) + + model_path = os.path.join(_CHECKPOINT_DIR, "final") + model.save(model_path) + venv.save(os.path.join(_CHECKPOINT_DIR, "vecnormalize.pkl")) + print(f"Saved model to {model_path}") + print(f"Saved VecNormalize stats to {_CHECKPOINT_DIR}/vecnormalize.pkl") + + +if __name__ == "__main__": + main()