thanhndv212 · thanhndv212 · Jun 21, 2026 · Jun 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -49,3 +49,5 @@ MANIFEST
 
 # ruff
 .ruff_cache/
+training/checkpoints/
+tb/
diff --git a/examples/07_rl_hover.py b/examples/07_rl_hover.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""Example 07: Train a PPO policy for quadcopter hover stabilization.
+
+This is the "hello world" of the RL pipeline — it trains for 50k steps
+using the default PPO config and evaluates the resulting policy.
+
+Requires: pip install stable-baselines3 tensorboard
+"""
+
+import os
+import sys
+
+try:
+    from stable_baselines3 import PPO
+except ImportError:
+    print("stable-baselines3 is required.  Install: pip install stable-baselines3")
+    sys.exit(1)
+
+import numpy as np
+
+from drones_sim.rl.actions import ThrustBodyRatesAction
+from drones_sim.rl.env import QuadcopterEnv
+from drones_sim.rl.observations import RelativeStateObs
+from drones_sim.rl.reward import RewardConfig, reward
+from drones_sim.rl.tasks import HoverTask
+
+CHECKPOINT_DIR = os.path.join(os.path.dirname(__file__), "..", "training", "checkpoints")
+
+
+def main():
+    n_steps = 50000
+    print(f"Training PPO for {n_steps:,} steps on hover task...")
+
+    env = QuadcopterEnv(
+        task=HoverTask(target=(0.0, 0.0, 2.0)),
+        action_param=ThrustBodyRatesAction(),
+        obs_builder=RelativeStateObs(),
+        reward_fn=reward,
+        reward_cfg=RewardConfig(),
+        dt=0.01,
+        episode_len_s=10.0,
+        seed=0,
+    )
+
+    from stable_baselines3.common.callbacks import CheckpointCallback
+    from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
+
+    vec_env = DummyVecEnv([lambda: env])
+    vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0)
+
+    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+
+    model = PPO(
+        "MlpPolicy", vec_env,
+        learning_rate=3e-4, n_steps=1024, batch_size=64,
+        n_epochs=10, gamma=0.99, gae_lambda=0.95,
+        clip_range=0.2, ent_coef=0.01,
+        policy_kwargs={"net_arch": [128, 128]},
+        tensorboard_log="./tb/ppo_hover",
+        verbose=1,
+    )
+
+    callbacks = [
+        CheckpointCallback(save_freq=10000, save_path=CHECKPOINT_DIR, name_prefix="ppo_quad"),
+    ]
+
+    model.learn(total_timesteps=n_steps, callback=callbacks, progress_bar=True)
+
+    model_path = os.path.join(CHECKPOINT_DIR, "ppo_hover_final")
+    model.save(model_path)
+    vec_env.save(os.path.join(CHECKPOINT_DIR, "vecnormalize.pkl"))
+    print(f"Model saved to {model_path}")
+
+    # Quick evaluation
+    print("\nEvaluating...")
+    vec_env.set_attr("reward_cfg", RewardConfig())
+    obs = vec_env.reset()
+    errors = []
+    for _ in range(200):
+        action, _ = model.predict(obs, deterministic=True)
+        obs, r, dones, infos = vec_env.step(action)
+        pos = env.quad.get_position()
+        errors.append(np.linalg.norm(pos - np.array([0, 0, 2])))
+
+    rmse = np.sqrt(np.mean(np.square(errors[-50:])))
+    print(f"Final position RMSE (last 50 steps): {rmse:.4f} m")
+    print("Done! Run 'tensorboard --logdir tb/' to view training curves.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/08_rl_vs_pid.py b/examples/08_rl_vs_pid.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""Example 08: Compare RL policy vs cascaded PID on the same trajectory.
+
+Runs a circular trajectory with both:
+  1. The standard cascaded PID controller
+  2. A pre-trained RL policy (optimised for trajectory tracking)
+
+Produces a 4-panel matplotlib comparison:
+  - Position RMSE
+  - Final error
+  - Control smoothness
+  - Success/failure status
+
+Requires: pip install stable-baselines3
+"""
+
+import os
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from drones_sim.control import QuadcopterController
+from drones_sim.dynamics import QuadcopterDynamics
+from drones_sim.rl.actions import ThrustBodyRatesAction
+from drones_sim.rl.env import QuadcopterEnv
+from drones_sim.rl.observations import RelativeStateObs
+from drones_sim.rl.reward import RewardConfig, reward
+from drones_sim.rl.tasks import TrackingTask
+from drones_sim.trajectory import generate_circular
+
+DT = 0.01
+
+
+def _run_pid(traj):
+    quad = QuadcopterDynamics(motor_time_constant=0.04)
+    ctrl = QuadcopterController(quad)
+    quad.reset(position=traj.position[0].copy())
+    ctrl.reset()
+    errors = []
+    prev_target = traj.position[0].copy()
+    for i in range(len(traj.t)):
+        target = traj.position[i]
+        motors = ctrl.compute(target, 0.0, DT, prev_target)
+        quad.update(DT, motors)
+        errors.append(np.linalg.norm(quad.get_position() - target))
+        prev_target = target.copy()
+    return errors
+
+
+def _run_rl(traj, model_path):
+    try:
+        from stable_baselines3 import PPO
+        from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
+    except ImportError:
+        print("stable-baselines3 required.  Install: pip install stable-baselines3")
+        return None
+
+    env = QuadcopterEnv(
+        task=TrackingTask(traj),
+        action_param=ThrustBodyRatesAction(),
+        obs_builder=RelativeStateObs(),
+        reward_fn=reward,
+        reward_cfg=RewardConfig(),
+        dt=DT,
+        episode_len_s=traj.t[-1],
+    )
+    vec_env = DummyVecEnv([lambda: env])
+    vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, training=False)
+    stats_path = os.path.join(os.path.dirname(model_path), "vecnormalize.pkl")
+    if os.path.exists(stats_path):
+        vec_env.load_running_average(os.path.dirname(model_path))
+
+    model = PPO.load(model_path, env=vec_env)
+    obs = vec_env.reset()
+    errors = []
+    for _ in range(len(traj.t)):
+        action, _ = model.predict(obs, deterministic=True)
+        obs, _, dones, _ = vec_env.step(action)
+        if dones[0]:
+            break
+        errors.append(np.linalg.norm(
+            traj.position[min(env._step_idx, len(traj.position) - 1)] - env.quad.get_position()
+        ))
+    return errors
+
+
+def main():
+    model_path = os.path.join(
+        os.path.dirname(__file__), "..", "training", "checkpoints", "ppo_hover_final.zip"
+    )
+    if not os.path.exists(model_path):
+        print(f"Model not found at {model_path}")
+        print("Run example 07 first or train a model with training/train_ppo.py")
+        sys.exit(1)
+
+    traj = generate_circular(duration=10.0, sample_rate=int(1 / DT), radius=2.0, angular_vel=0.5)
+
+    print("Running PID controller...")
+    pid_errors = _run_pid(traj)
+
+    print("Running RL policy...")
+    rl_errors = _run_rl(traj, model_path)
+
+    if rl_errors is None:
+        return
+
+    pid_rmse = np.sqrt(np.mean(np.square(pid_errors)))
+    rl_rmse = np.sqrt(np.mean(np.square(rl_errors)))
+    pid_final = pid_errors[-1]
+    rl_final = rl_errors[-1]
+
+    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
+
+    axes[0, 0].bar(["PID", "RL"], [pid_rmse, rl_rmse], color=["steelblue", "darkorange"])
+    axes[0, 0].set_ylabel("Position RMSE (m)")
+    axes[0, 0].set_title("Tracking RMSE")
+
+    axes[0, 1].bar(["PID", "RL"], [pid_final, rl_final], color=["steelblue", "darkorange"])
+    axes[0, 1].set_ylabel("Final Error (m)")
+    axes[0, 1].set_title("Final Position Error")
+
+    n = min(len(pid_errors), len(rl_errors))
+    t_arr = traj.t[:n]
+    axes[1, 0].plot(t_arr, pid_errors[:n], label="PID", color="steelblue")
+    axes[1, 0].plot(t_arr, rl_errors[:n], label="RL", color="darkorange")
+    axes[1, 0].set_xlabel("Time (s)")
+    axes[1, 0].set_ylabel("Error (m)")
+    axes[1, 0].set_title("Error vs Time")
+    axes[1, 0].legend()
+    axes[1, 0].grid(True)
+
+    axes[1, 1].axis("off")
+    axes[1, 1].text(0.1, 0.5, f"PID RMSE: {pid_rmse:.4f} m\nRL  RMSE: {rl_rmse:.4f} m\n\n"
+                    f"PID final err: {pid_final:.4f} m\nRL  final err: {rl_final:.4f} m",
+                    fontsize=12, family="monospace")
+
+    fig.suptitle("Cascaded PID vs RL (PPO hover policy) — Circular Tracking")
+    fig.tight_layout()
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,8 @@ dependencies = [
 
 [project.optional-dependencies]
 dev = ["pytest>=7.0", "ruff"]
+rl  = ["torch>=2.2", "stable-baselines3>=2.3", "gymnasium>=0.29", "tensorboard>=2.15", "pyyaml>=6.0"]
+rl-dev = ["drones-sim[rl]", "wandb", "optuna>=3.5", "moviepy"]
 
 [tool.setuptools.packages.find]
 where = ["src"]

diff --git a/tests/test_rl_training.py b/tests/test_rl_training.py
@@ -0,0 +1,102 @@
+"""Tests for the RL training pipeline (requires torch + SB3)."""
+
+import importlib.util
+import os
+
+import numpy as np
+import pytest
+
+_HAS_SB3 = importlib.util.find_spec("stable_baselines3") is not None
+
+if _HAS_SB3:
+    from drones_sim.rl.actions import ThrustBodyRatesAction
+    from drones_sim.rl.env import QuadcopterEnv
+    from drones_sim.rl.observations import RelativeStateObs
+    from drones_sim.rl.reward import RewardConfig, reward
+    from drones_sim.rl.tasks import HoverTask
+
+    CHECKPOINT_DIR = os.path.join(
+        os.path.dirname(__file__), "..", "training", "checkpoints"
+    )
+
+
+@pytest.mark.skipif(not _HAS_SB3, reason="stable-baselines3 not installed")
+def test_train_ppo_smoke():
+    """Smoke-train PPO for 1000 steps — verifies pipeline doesn't crash."""
+    from stable_baselines3 import PPO
+    from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
+
+    env = QuadcopterEnv(
+        task=HoverTask(target=(0.0, 0.0, 2.0)),
+        action_param=ThrustBodyRatesAction(),
+        obs_builder=RelativeStateObs(),
+        reward_fn=reward,
+        reward_cfg=RewardConfig(),
+        dt=0.01,
+        episode_len_s=5.0,
+        seed=0,
+    )
+    vec_env = DummyVecEnv([lambda: env])
+    vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0)
+
+    model = PPO(
+        "MlpPolicy", vec_env,
+        learning_rate=3e-4, n_steps=256, batch_size=64,
+        n_epochs=4, gamma=0.99,
+        policy_kwargs={"net_arch": [64, 64]},
+        verbose=0,
+    )
+    model.learn(total_timesteps=1000)
+
+    # Save and reload
+    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
+    path = os.path.join(CHECKPOINT_DIR, "ppo_smoke_test")
+    model.save(path)
+    assert os.path.exists(path + ".zip")
+
+    # Quick deterministic prediction
+    obs = vec_env.reset()
+    action, _ = model.predict(obs, deterministic=True)
+    assert action.shape == (1, 4)
+    assert np.isfinite(action).all()
+
+    env.close()
+
+
+@pytest.mark.skipif(not _HAS_SB3, reason="stable-baselines3 not installed")
+def test_eval_policy_runs():
+    """eval_policy.evaluate() should return expected metrics dict."""
+    import tempfile
+
+    from stable_baselines3 import PPO
+    from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
+
+    env = QuadcopterEnv(
+        task=HoverTask(target=(0.0, 0.0, 2.0)),
+        action_param=ThrustBodyRatesAction(),
+        obs_builder=RelativeStateObs(),
+        reward_fn=reward,
+        reward_cfg=RewardConfig(),
+        dt=0.01, episode_len_s=2.0, seed=0,
+    )
+    vec_env = DummyVecEnv([lambda: env])
+    vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=False, clip_obs=10.0)
+
+    # Train a tiny model
+    model = PPO("MlpPolicy", vec_env, policy_kwargs={"net_arch": [32, 32]}, verbose=0)
+    model.learn(total_timesteps=500)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        model_path = os.path.join(tmpdir, "test_model")
+        model.save(model_path)
+
+        from training.eval_policy import evaluate
+
+        results = evaluate(model_path + ".zip", n_episodes=2, seed=0)
+        assert isinstance(results, dict)
+        for key in ["pos_rmse", "success_rate", "crash_rate", "mean_reward"]:
+            assert key in results, f"Missing key: {key}"
+        assert 0.0 <= results["success_rate"] <= 1.0
+        assert 0.0 <= results["crash_rate"] <= 1.0
+
+    env.close()
diff --git a/training/configs/ppo_hover.yaml b/training/configs/ppo_hover.yaml
@@ -0,0 +1,30 @@
+# PPO hover training configuration
+env:
+  dt: 0.01
+  episode_len_s: 10.0
+  target: [0.0, 0.0, 2.0]
+  reward:
+    w_pos: 1.0
+    w_vel: 0.05
+    w_attitude: 0.1
+    w_action: 0.01
+    w_action_d: 0.005
+    w_alive: 0.1
+    w_reach: 10.0
+    w_crash: -10.0
+    reach_radius: 0.1
+
+ppo:
+  policy: MlpPolicy
+  learning_rate: 3.0e-4
+  n_steps: 1024
+  batch_size: 64
+  n_epochs: 10
+  gamma: 0.99
+  gae_lambda: 0.95
+  clip_range: 0.2
+  ent_coef: 0.01
+  policy_kwargs:
+    net_arch: [128, 128]
+  save_freq: 10000
+  tensorboard_log: ./tb/ppo_hover
-Original file line number
+Diff line change
@@ Expand Up / @@ -49,3 +49,5 @@ MANIFEST @@
     # ruff
     .ruff_cache/
+    training/checkpoints/
+    tb/