diff --git a/pufferlib/ocean/torch.py b/pufferlib/ocean/torch.py index c0e4f06cb7..93250b4cc5 100644 --- a/pufferlib/ocean/torch.py +++ b/pufferlib/ocean/torch.py @@ -54,6 +54,8 @@ def __init__( # Road features size (lanes + boundaries) self.obs_slots_lane_kept = env.obs_slots_lane_kept self.obs_slots_boundary_kept = env.obs_slots_boundary_kept + self.obs_slots_lane_n = env.obs_slots_lane_n + self.obs_slots_boundary_n = env.obs_slots_boundary_n self.road_features_count = env.road_features # Traffic control size self.obs_slots_traffic_controls_n = env.obs_slots_traffic_controls_n @@ -116,9 +118,18 @@ def __init__( def forward(self, observations, ego_dim): # Extract and slice observations from the flat buffer + + if self.training: + obs_slots_lane_kept = self.obs_slots_lane_kept + obs_slots_boundary_kept = self.obs_slots_boundary_kept + else: + # During evaluation, enforce zero dropout (also in pufferlib/ocean/benchmark/manager.py) + obs_slots_lane_kept = self.obs_slots_lane_n + obs_slots_boundary_kept = self.obs_slots_boundary_n + partner_dim = self.obs_slots_partners_n * self.partner_features_count - lane_dim = self.obs_slots_lane_kept * self.road_features_count - boundary_dim = self.obs_slots_boundary_kept * self.road_features_count + lane_dim = obs_slots_lane_kept * self.road_features_count + boundary_dim = obs_slots_boundary_kept * self.road_features_count traffic_control_dim = self.obs_slots_traffic_controls_n * self.traffic_control_features_count slide_idx = ego_dim @@ -144,12 +155,12 @@ def forward(self, observations, ego_dim): feature_list = [ego_features] # Encode Lanes and Boundaries separately - if self.obs_slots_lane_kept > 0: - lane_objects = lane_observations.view(-1, self.obs_slots_lane_kept, self.road_features_count) + if obs_slots_lane_kept > 0: + lane_objects = lane_observations.view(-1, obs_slots_lane_kept, self.road_features_count) lane_features = self.lane_encoder(lane_objects).max(dim=1).values feature_list.append(lane_features) - if 
self.obs_slots_boundary_kept > 0: - boundary_objects = boundary_observations.view(-1, self.obs_slots_boundary_kept, self.road_features_count) + if obs_slots_boundary_kept > 0: + boundary_objects = boundary_observations.view(-1, obs_slots_boundary_kept, self.road_features_count) boundary_features = self.boundary_encoder(boundary_objects).max(dim=1).values feature_list.append(boundary_features) @@ -192,9 +203,15 @@ def forward(self, observations, ego_dim): return self.backbone(concat_features) def pool_slot_counts(self, observations, ego_dim): + if self.training: + obs_slots_lane_kept = self.obs_slots_lane_kept + obs_slots_boundary_kept = self.obs_slots_boundary_kept + else: + obs_slots_lane_kept = self.obs_slots_lane_n + obs_slots_boundary_kept = self.obs_slots_boundary_n partner_dim = self.obs_slots_partners_n * self.partner_features_count - lane_dim = self.obs_slots_lane_kept * self.road_features_count - boundary_dim = self.obs_slots_boundary_kept * self.road_features_count + lane_dim = obs_slots_lane_kept * self.road_features_count + boundary_dim = obs_slots_boundary_kept * self.road_features_count traffic_control_dim = self.obs_slots_traffic_controls_n * self.traffic_control_features_count slide_idx = ego_dim + self.conditioning_dim @@ -207,18 +224,18 @@ def pool_slot_counts(self, observations, ego_dim): traffic_control_observations = observations[:, slide_idx : slide_idx + traffic_control_dim] counts = {} - if self.obs_slots_lane_kept > 0: - lane_objects = lane_observations.view(-1, self.obs_slots_lane_kept, self.road_features_count) + if obs_slots_lane_kept > 0: + lane_objects = lane_observations.view(-1, obs_slots_lane_kept, self.road_features_count) lane_winners = self.lane_encoder(lane_objects).max(dim=1).indices lane_counts = torch.zeros( - observations.shape[0], self.obs_slots_lane_kept, device=observations.device, dtype=torch.int64 + observations.shape[0], obs_slots_lane_kept, device=observations.device, dtype=torch.int64 ) counts["pool_lane"] = 
lane_counts.scatter_add(1, lane_winners, torch.ones_like(lane_winners)) - if self.obs_slots_boundary_kept > 0: - boundary_objects = boundary_observations.view(-1, self.obs_slots_boundary_kept, self.road_features_count) + if obs_slots_boundary_kept > 0: + boundary_objects = boundary_observations.view(-1, obs_slots_boundary_kept, self.road_features_count) boundary_winners = self.boundary_encoder(boundary_objects).max(dim=1).indices boundary_counts = torch.zeros( - observations.shape[0], self.obs_slots_boundary_kept, device=observations.device, dtype=torch.int64 + observations.shape[0], obs_slots_boundary_kept, device=observations.device, dtype=torch.int64 ) counts["pool_boundary"] = boundary_counts.scatter_add( 1, boundary_winners, torch.ones_like(boundary_winners) diff --git a/scripts/cluster_configs/nightly_best_launch.yaml b/scripts/cluster_configs/nightly_best_launch.yaml new file mode 100644 index 0000000000..85d2e5293e --- /dev/null +++ b/scripts/cluster_configs/nightly_best_launch.yaml @@ -0,0 +1,132 @@ +# Multi-agent "best launch" nightly training program config. +# Derived from the oignons2 (emerge/temp_training) configuration at: +# weights/oignons2/config.yaml +# Adapted to NYU Greene cluster paths and resource shape. Multi-agent gigaflow +# training over the 8 local CARLA maps with the oignons2 policy architecture, +# reward shaping (conditioning + randomization on), and partner-blindness / +# phantom-braking perturbations enabled. Keys here override +# pufferlib/config/ocean/drive.ini. +# +# Launch via scripts/launch_nightly_best.sh (3 seeds, date-stamped). 
+ +# Environment — multi-agent gigaflow over all 8 local CARLA towns +env.simulation_mode: gigaflow +env.map_dir: pufferlib/resources/drive/binaries/carla +env.num_maps: 8 +env.num_agents: 720000 +env.min_agents_per_env: 1 +env.max_agents_per_env: 150 +env.use_map_cache: 1 +env.scenario_length: 1200 +# 0 disables periodic scenario resampling — every sub-env keeps the same map +# for the full run instead of swapping every 38400 steps. +env.resample_frequency: 0 +env.termination_mode: 1 +env.inactive_agent_threshold: 0.4 +env.dynamics_model: jerk +env.target_type: static +env.spawn_initial_speed: 0.0 +env.dt: 0.3 +env.traffic_light_behavior: 1 +env.collision_behavior: 1 +env.offroad_behavior: 1 + +# Goal setup — three sequential waypoints, route-based placement [20, 60m] +env.num_target_waypoints: 3 +env.min_waypoint_spacing: 20.0 +env.max_waypoint_spacing: 60.0 +env.goal_radius: 2.0 +env.goal_speed: 3.0 + +# Observation shaping (matches oignons2) +env.obs_slots_lane_n: 80 +env.obs_slots_boundary_n: 80 +env.obs_slots_partners_n: 16 +env.obs_slots_traffic_controls_n: 4 +env.obs_range_partner_m: 200.0 +env.obs_range_road_front_m: 200.0 +env.obs_range_road_behind_m: 40.0 +env.obs_range_road_side_m: 50.0 +env.obs_range_traffic_control_m: 100.0 +env.obs_norm_xy_offset_m: 200.0 +env.obs_norm_goal_offset_m: 200.0 +env.obs_norm_road_seg_length_m: 10.0 +env.obs_norm_road_seg_width_m: 5.0 +env.obs_norm_veh_length_m: 15.0 +env.obs_norm_veh_width_m: 10.0 +env.obs_dropout_lane: 0.5 +env.obs_dropout_boundary: 0.4 + +# Perturbations (on during training; eval's clean macro zeros these) +env.partner_blindness_prob: 0.03 +env.partner_blindness_trigger_prob: 0.05 +env.phantom_braking_prob: 0.02 +env.phantom_braking_trigger_prob: 0.02 +env.phantom_braking_duration: 10 + +# Reward shaping (oignons2 weights + conditioning/randomization on) +env.reward_conditioning: true +env.reward_randomization: true +env.reward_goal: 1.0 +env.reward_collision: 1.5 +env.reward_offroad: 1.5 
+env.reward_stop_line: 1.0 +env.reward_comfort: 0.05 +env.reward_lane_align: 0.025 +env.reward_vel_align: 1.0 +env.reward_lane_center: 0.005 +env.reward_velocity: 0.0025 +env.reward_reverse: 0.005 +env.reward_timestep: 2.5e-05 +env.reward_overspeed: 0.05 + +# Policy — 3x1024 backbone, split actor/critic, gigaflow encoder +policy.input_size: 256 +policy.backbone_hidden_size: 1024 +policy.backbone_num_layers: 3 +policy.actor_hidden_size: 1024 +policy.actor_num_layers: 0 +policy.critic_hidden_size: 1024 +policy.critic_num_layers: 0 +policy.split_network: true +policy.encoder_gigaflow: true +policy.dropout: 0.0 + +# Training — 10B steps, large minibatch, compiled bfloat16 +train.total_timesteps: 10_000_000_000 +train.learning_rate: 0.0005 +train.minibatch_size: 153600 +train.max_minibatch_size: 153600 +train.update_epochs: 3 +train.bptt_horizon: 128 +train.compile: true +train.precision: bfloat16 +train.normalize_rewards: false +train.checkpoint_interval: 500 +train.optimizer: adamw + +# Eval — keep validation_gigaflow (CARLA sweep) inline, disable everything else +# (validation_replay needs nuPlan bins; behaviors_* need labelled scene +# categories not used in this nightly). Interval 250 keeps eval cost ~5% of +# wall-clock instead of ~85%. 
+eval.validation_defaults.interval: 250 +eval.validation_replay.enabled: 0 +eval.validation_gigaflow.render_backend: egl +eval.behaviors_full_dir.enabled: 0 +eval.behaviors_hard_stop.enabled: 0 +eval.behaviors_highway_straight.enabled: 0 +eval.behaviors_lane_change.enabled: 0 +eval.behaviors_merge.enabled: 0 +eval.behaviors_parked_cars.enabled: 0 +eval.behaviors_roundabout.enabled: 0 +eval.behaviors_stopped_traffic.enabled: 0 +eval.behaviors_traffic_light_green.enabled: 0 +eval.behaviors_traffic_light_stop.enabled: 0 +eval.behaviors_unprotected_left.enabled: 0 +eval.behaviors_unprotected_right.enabled: 0 + +# W&B — project nightly-multi-agent; group has no space (submit_cluster.py +# joins the inner command without quoting arg values). +wandb: True +wandb_project: nightly-multi-agent +wandb_group: Nightly_MultiAgent diff --git a/scripts/launch_nightly_best.sh b/scripts/launch_nightly_best.sh new file mode 100755 index 0000000000..f96ce3a78e --- /dev/null +++ b/scripts/launch_nightly_best.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Launch multi-agent "best launch" nightly training on the cluster via +# submit_cluster.py. Derived from oignons2 (emerge/temp_training); see +# scripts/cluster_configs/nightly_best_launch.yaml for the config. +# Three seeds per launch, date-stamped wandb run names. 
+# +# Run on the login node (sources the venv and submits from there): +# ./scripts/launch_nightly_best.sh +# +# Overridable via the environment: +# PROGRAM_CONFIG default: scripts/cluster_configs/nightly_best_launch.yaml +# SEEDS colon-separated seed list; one submission per seed, each passed as --args train.seed=N (default 0:1:2) +# ACCOUNT/PARTITION/TIME/MEM SLURM overrides +# PREFIX run-name prefix (default YYYY-MM-DD_multi_agent, using today's date) +set -euo pipefail + +PROGRAM_CONFIG="${PROGRAM_CONFIG:-scripts/cluster_configs/nightly_best_launch.yaml}" +COMPUTE_CONFIG="${COMPUTE_CONFIG:-scripts/cluster_configs/nyu_greene.yaml}" +ACCOUNT="${ACCOUNT:-torch_pr_924_tandon_advanced}" +PARTITION="${PARTITION:-h200_tandon}" +TIME="${TIME:-1800}" +MEM="${MEM:-192gb}" +SEEDS="${SEEDS:-0:1:2}" +PREFIX="${PREFIX:-$(date +%Y-%m-%d)_multi_agent}" +DATE_STAMP="$(date +%Y-%m-%d)" + +source "/scratch/$USER/venvs/pufferdrive/bin/activate" + +# One submission per seed so we can pass a per-seed run_name (wandb display +# name like 2026-06-01_seed0) +IFS=':' read -ra SEED_LIST <<< "$SEEDS" +for SEED in "${SEED_LIST[@]}"; do + python scripts/submit_cluster.py \ + --save_dir "/scratch/$USER/runs" \ + --prefix "$PREFIX" \ + --compute_config "$COMPUTE_CONFIG" \ + --program_config "$PROGRAM_CONFIG" \ + --container --heartbeat \ + --account "$ACCOUNT" --partition "$PARTITION" --time "$TIME" --mem "$MEM" \ + --args "train.seed=$SEED" "run_name=${DATE_STAMP}_seed${SEED}" +done diff --git a/scripts/run_nightly_best_local.sh b/scripts/run_nightly_best_local.sh new file mode 100755 index 0000000000..6ed699cbee --- /dev/null +++ b/scripts/run_nightly_best_local.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# Local (non-cluster) launch of the nightly_best_launch training. +# Direct `puffer train` equivalent of scripts/launch_nightly_best.sh + +# scripts/cluster_configs/nightly_best_launch.yaml, with no SLURM/submit_cluster. +# +# Activate your puffer env first (per CLAUDE.md) so `puffer`/torchrun are on PATH. 
+# Boolean flags must be Python literals (True/False), NOT yaml-style true/false, +# or the C binding rejects them (e.g. "Failed to unpack keyword X as int"). +# batch_size is auto = num_agents * bptt_horizon, so the on-GPU obs buffer scales +# with NUM_AGENTS; the 720000 default targets H200-class VRAM. Pass a smaller +# NUM_AGENTS (e.g. 2048) on smaller cards, or add --train.cpu-offload True. +# +# Eval rendering is disabled (validation_gigaflow.render=False): the headless EGL +# render path is compile-gated (drive.h DRIVE_HAS_EGL) and isn't +# built here, so rendering would fall back to Xvfb/software. The gigaflow eval +# still runs and logs metrics; only the video frames are skipped. +# +# NUM_GPUS > 1 launches DDP via torchrun. Under DDP num_agents is PER-RANK +# (pufferl.py divides total_timesteps by world size but leaves num_agents as-is), +# so NUM_AGENTS=2048 on 4 GPUs means 2048 agents/GPU = 8192 effective. Only rank 0 +# runs eval. +# +# Usage (the script cds to the repo root itself): ./scripts/run_nightly_best_local.sh [NUM_GPUS] [NUM_AGENTS] [extra puffer args...] +set -euo pipefail + +# Run from the repo root regardless of where this script is invoked from, so +# relative config paths (e.g. env.map_dir = pufferlib/resources/...) resolve. +cd "$(dirname "$(readlink -f "$0")")/.." + +NUM_GPUS="${1:-1}" +NUM_AGENTS="${2:-720000}" +RUN_TAG="${RUN_TAG:-$(date +%Y-%m-%d)_local_${NUM_GPUS}gpu}" + +# pufferl divides train.total_timesteps by world size under DDP (LOCAL_RANK set), +# so each rank targets total/NUM_GPUS. Scale the total by NUM_GPUS to hold the +# PER-RANK budget at 10B regardless of GPU count (4 GPUs -> 40B total -> 10B/rank; +# 1 GPU -> 10B, no division). Aggregate env-steps = 10B * NUM_GPUS. 
+PER_RANK_TIMESTEPS=10000000000 +TOTAL_TIMESTEPS=$(( PER_RANK_TIMESTEPS * NUM_GPUS )) + +ARGS=( + puffer_drive + --env.simulation-mode gigaflow + --env.map-dir pufferlib/resources/drive/binaries/carla + --env.num-maps 8 + --env.num-agents "$NUM_AGENTS" + --env.min-agents-per-env 1 + --env.max-agents-per-env 150 + --env.use-map-cache 1 + --env.scenario-length 1200 + --env.resample-frequency 0 + --env.termination-mode 1 + --env.inactive-agent-threshold 0.4 + --env.dynamics-model jerk + --env.target-type static + --env.spawn-initial-speed 0.0 + --env.dt 0.3 + --env.traffic-light-behavior 1 + --env.collision-behavior 1 + --env.offroad-behavior 1 + --env.num-target-waypoints 3 + --env.min-waypoint-spacing 20.0 + --env.max-waypoint-spacing 60.0 + --env.goal-radius 2.0 + --env.goal-speed 3.0 + --env.obs-slots-lane-n 80 + --env.obs-slots-boundary-n 80 + --env.obs-slots-partners-n 16 + --env.obs-slots-traffic-controls-n 4 + --env.obs-range-partner-m 200.0 + --env.obs-range-road-front-m 200.0 + --env.obs-range-road-behind-m 40.0 + --env.obs-range-road-side-m 50.0 + --env.obs-range-traffic-control-m 100.0 + --env.obs-norm-xy-offset-m 200.0 + --env.obs-norm-goal-offset-m 200.0 + --env.obs-norm-road-seg-length-m 10.0 + --env.obs-norm-road-seg-width-m 5.0 + --env.obs-norm-veh-length-m 15.0 + --env.obs-norm-veh-width-m 10.0 + --env.obs-dropout-lane 0.5 + --env.obs-dropout-boundary 0.4 + --env.partner-blindness-prob 0.03 + --env.partner-blindness-trigger-prob 0.05 + --env.phantom-braking-prob 0.02 + --env.phantom-braking-trigger-prob 0.02 + --env.phantom-braking-duration 10 + --env.reward-conditioning True + --env.reward-randomization True + --env.reward-goal 1.0 + --env.reward-collision 1.5 + --env.reward-offroad 1.5 + --env.reward-stop-line 1.0 + --env.reward-comfort 0.05 + --env.reward-lane-align 0.025 + --env.reward-vel-align 1.0 + --env.reward-lane-center 0.005 + --env.reward-velocity 0.0025 + --env.reward-reverse 0.005 + --env.reward-timestep 2.5e-05 + --env.reward-overspeed 
0.05 + --policy.input-size 256 + --policy.backbone-hidden-size 1024 + --policy.backbone-num-layers 3 + --policy.actor-hidden-size 1024 + --policy.actor-num-layers 0 + --policy.critic-hidden-size 1024 + --policy.critic-num-layers 0 + --policy.split-network True + --policy.encoder-gigaflow True + --policy.dropout 0.0 + --train.total-timesteps "$TOTAL_TIMESTEPS" + --train.learning-rate 0.0005 + --train.minibatch-size 153600 + --train.max-minibatch-size 153600 + --train.update-epochs 3 + --train.bptt-horizon 128 + --train.compile True + --train.precision bfloat16 + --train.normalize-rewards False + --train.checkpoint-interval 500 + --train.optimizer adamw + --train.seed 0 + --eval.validation-defaults.interval 250 + --eval.validation-replay.enabled 0 + --eval.validation-gigaflow.render False + --eval.behaviors-full-dir.enabled 0 + --eval.behaviors-hard-stop.enabled 0 + --eval.behaviors-highway-straight.enabled 0 + --eval.behaviors-lane-change.enabled 0 + --eval.behaviors-merge.enabled 0 + --eval.behaviors-parked-cars.enabled 0 + --eval.behaviors-roundabout.enabled 0 + --eval.behaviors-stopped-traffic.enabled 0 + --eval.behaviors-traffic-light-green.enabled 0 + --eval.behaviors-traffic-light-stop.enabled 0 + --eval.behaviors-unprotected-left.enabled 0 + --eval.behaviors-unprotected-right.enabled 0 + --wandb + --wandb-project nightly-multi-agent + --wandb-group Nightly_MultiAgent + --tag "$RUN_TAG" +) + +# Args after NUM_GPUS/NUM_AGENTS pass straight through to override any config key +# (argparse takes the last value), e.g. `... 4 2048 --env.obs-slots-boundary-n 30`. +if [ "$NUM_GPUS" -gt 1 ]; then + exec torchrun --standalone --nnodes=1 --nproc-per-node="$NUM_GPUS" \ + -m pufferlib.pufferl train "${ARGS[@]}" "${@:3}" +else + exec env CUDA_VISIBLE_DEVICES=0 puffer train "${ARGS[@]}" "${@:3}" +fi