From 48736869adbc5397c9c72dd170fd1c0c6d4c2f6b Mon Sep 17 00:00:00 2001
From: Kashu7100 <echo.six0566@gmail.com>
Date: Mon, 1 Jun 2026 16:43:06 -0400
Subject: [PATCH] [PERF] Split rigid collision BVH into static + dynamic
 subsets (RPL multi-depth)

Follow-up to #2867. A scene with one moving robot on a large static terrain
still rebuilds a single combined collision BVH over every face each step,
because the rebuild-skip keys off "all links in the solver are fixed" -- which
is false as soon as the solver contains any movable link, whether or not it is
actually moving. The static terrain (the bulk of the faces) is re-fit every
step for nothing.

This decomposes the rigid solver's collision faces into two compacted BVHs by
owning-link fixedness (RaycasterSensor._partition_collision_faces):

  - static subset  (faces on fixed links: terrain / walls): maybe_static, built
    once, then skipped + shared across envs (rebuilding this subset every step
    was the dominant per-step cost for one robot on a big static terrain).
  - dynamic subset (faces on movable links: the robot): rebuilt each step, but
    the rebuild + radix sort now scale with the robot's face count, not the
    whole scene.

The two are cast separately and merged via the existing is_merge path (closest
hit wins), so the result is identical to one combined BVH. This is the
"multi-depth" decomposition from RPL (arXiv:2602.03002): cast the dynamic robot
and static terrain meshes separately and reuse the static acceleration
structure across timesteps and environments.

Implementation
- Each BVH is built over a compacted face subset. A `face_ids` array maps a BVH
  leaf slot to the global face index; bvh_ray_cast remaps after reading the
  morton-code primitive id, and update_aabbs iterates the subset. `n_triangles`
  in bvh_ray_cast now derives from the morton-codes shape (the BVH's own leaf
  count) instead of the solver-global face count.
- The existing maybe_static/needs_rebuild skip and the AABB-derived
  shared_across_envs test are already per-entry, so they apply to each subset
  unchanged: the static subset's GEOMETRY subscriber only fires on an explicit
  set_pos/set_quat (e.g. re-randomized terrain), never on physics-driven robot
  motion, so it stays skipped + shared while the robot subset rebuilds.
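- A pure-static or pure-dynamic solver yields a single subset with identity
  face_ids, i.e. the previous single-BVH behavior -- bit-identical. Two
  build-time differences remain: n_radix_sort_groups is now min(64, n_faces)
  instead of a fixed 64, and a solver with no collision faces no longer gets a
  (previously empty) collision BVH entry at all.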
- kernel_cast_ray (viewer pick) and the viewer plugin thread an identity
  face_ids over the full mesh.

Perf (perceptive depth camera, 64x36 rays, 18.5k-face terrain + G1, RTX 3080):
  1024 envs: 1037 -> 706 ms/step (1.47x) measured on the pre-refactor branch;
  win grows with env count x terrain faces. Re-validated functionally on the
  current main: a moving-robot-on-terrain scene now builds 1 static (skipped +
  shared) + 1 dynamic BVH, depth output unchanged.

Tests
- New tests/test_sensors.py::test_raycaster_static_dynamic_bvh_split asserts the
  split structure (one static + one dynamic collision BVH, static shared across
  envs), the merge reporting the closer of static/dynamic as a movable box
  enters/leaves a ray, and that the static BVH stays skipped across a dynamic
  move. Passes for n_envs in {0, 2}.
- Existing raycaster/lidar suite unchanged (single full-mesh path is the
  identity-face_ids case; mixed scenes now exercise the two-BVH merge).

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 genesis/engine/sensors/raycaster.py   | 47 +++++++++++++++++++--
 genesis/utils/raycast_qd.py           | 38 ++++++++++++-----
 genesis/vis/viewer_plugins/raycast.py |  7 ++++
 tests/test_sensors.py                 | 60 +++++++++++++++++++++++++++
 4 files changed, 138 insertions(+), 14 deletions(-)

diff --git a/genesis/engine/sensors/raycaster.py b/genesis/engine/sensors/raycaster.py
index bc684fcffb..c3b2f41cd8 100644
--- a/genesis/engine/sensors/raycaster.py
+++ b/genesis/engine/sensors/raycaster.py
@@ -57,6 +57,12 @@ class BVHContext:
     # True when the geometry is bit-identical across envs, so the cast reads one shared copy (batch 0) with coalesced
     # node loads instead of scattering over n_env identical trees. Recomputed on every rebuild.
     shared_across_envs: bool = False
+    # Compacted face subset this BVH covers: face_ids[k] is the global face index at BVH leaf slot k. Static
+    # (fixed-link) and dynamic (movable-link) collision faces get separate subsets so the static tree is built once
+    # and shared while only the small dynamic subset rebuilds each step. A 1-D int device tensor; for a single
+    # full-mesh BVH it is the identity map arange(n_faces). None for visual BVH entries (which keep the full-mesh
+    # path; their cast/update kernels do not take face_ids).
+    face_ids: torch.Tensor | None = None
 
 
 @dataclass
@@ -101,6 +107,28 @@ def __init__(self, options: RaycasterOptions, sensor_idx: int, manager: "SensorM
         self.debug_objects: list["Mesh"] = []
         self.ray_starts: torch.Tensor = torch.empty((0, 3), device=gs.device, dtype=gs.tc_float)
 
+    @staticmethod
+    def _partition_collision_faces(solver: "RigidSolver") -> list[tuple[torch.Tensor, bool]]:
+        """Partition the solver's collision faces into static (fixed-link) and dynamic (movable-link) subsets.
+
+        Returns one ``(face_ids, maybe_static)`` pair per non-empty subset, static first; ``face_ids`` holds that
+        subset's global face indices, ascending. A pure-static or pure-dynamic solver yields one entry (equivalent to
+        a single full-mesh BVH); a mixed scene (robot on terrain) yields two, so the static tree can be built once
+        while only the robot subset rebuilds; a solver with no faces yields ``[]``. Used by :meth:`build`.
+        """
+        face_geom = qd_to_numpy(solver.faces_info.geom_idx).reshape(-1)  # (n_faces,) global geom per face
+        geom_link = qd_to_numpy(solver.geoms_info.link_idx).reshape(-1)  # (n_geoms,) global link per geom
+        link_fixed = np.array([bool(link.is_fixed) for link in solver.links], dtype=bool)  # (n_links,)
+        face_static = link_fixed[geom_link[face_geom]]  # (n_faces,) on a fixed link? assumes links order == link_idx
+
+        out: list[tuple[torch.Tensor, bool]] = []
+        for is_static in (True, False):  # static subset first, then dynamic
+            sel = np.nonzero(face_static == is_static)[0]
+            if sel.size == 0:
+                continue  # empty subset gets no entry
+            out.append((torch.as_tensor(sel, dtype=gs.tc_int, device=gs.device), bool(is_static)))
+        return out
+
     @staticmethod
     def _compute_visual_raycast_mask(solver: "KinematicSolver") -> np.ndarray:
         """Build a per-vface mask (int8, shape (n_vfaces,)) selecting vfaces opted into visual raycasting.
@@ -144,6 +172,7 @@ def _update_bvh(cls, shared_metadata: RaycasterSharedMetadata):
                     free_verts_state=entry.solver.free_verts_state,
                     fixed_verts_state=entry.solver.fixed_verts_state,
                     links_info=entry.solver.links_info,
+                    face_ids=entry.face_ids,
                     static_rigid_sim_config=entry.solver._static_rigid_sim_config,
                     aabb_state=entry.aabb,
                 )
@@ -201,10 +230,19 @@ def build(self):
                 # catches. Applies to both the collision and the visual BVH.
                 maybe_static = all(link.is_fixed for link in solver.links)
                 if isinstance(solver, RigidSolver):
-                    n_faces = solver.faces_info.geom_idx.shape[0]
-                    aabb = AABB(n_batches=n_envs, n_aabbs=n_faces)
-                    bvh = LBVH(aabb, max_n_query_result_per_aabb=0, n_radix_sort_groups=64)
-                    self._shared_metadata.solver_bvhs.append(BVHContext(solver, bvh, aabb, None, maybe_static))
+                    # Split the collision faces into a static subset (faces on fixed links: terrain / walls) and a
+                    # dynamic subset (faces on movable links: the robot), each with its own compacted BVH. The static
+                    # subset is then built once + skipped + shared across envs while only the small dynamic subset
+                    # rebuilds per step; the cast kernels merge the two via is_merge so the result is identical to one
+                    # combined BVH. This is the RPL "multi-depth" decomposition (arXiv:2602.03002). A pure-static or
+                    # pure-dynamic solver yields a single subset (identity face_ids) == the previous single-BVH path.
+                    for face_ids, subset_static in self._partition_collision_faces(solver):
+                        n_sub = int(face_ids.shape[0])
+                        aabb = AABB(n_batches=n_envs, n_aabbs=n_sub)
+                        bvh = LBVH(aabb, max_n_query_result_per_aabb=0, n_radix_sort_groups=min(64, n_sub))
+                        self._shared_metadata.solver_bvhs.append(
+                            BVHContext(solver, bvh, aabb, None, subset_static, face_ids=face_ids)
+                        )
                 n_vfaces = solver.vfaces_info.vgeom_idx.shape[0]
                 if n_vfaces > 0:
                     mask = self._compute_visual_raycast_mask(solver)
@@ -352,6 +390,7 @@ def _update_raw_data(cls, shared_metadata: RaycasterSharedMetadata, raw_data_T:
                     solver.free_verts_state,
                     solver.verts_info,
                     solver.faces_info,
+                    entry.face_ids,
                     *args_common,
                 )
             else:
diff --git a/genesis/utils/raycast_qd.py b/genesis/utils/raycast_qd.py
index eaf05135af..2bf7f757e4 100644
--- a/genesis/utils/raycast_qd.py
+++ b/genesis/utils/raycast_qd.py
@@ -56,6 +56,7 @@ def bvh_ray_cast(
     verts_info: array_class.VertsInfo,
     fixed_verts_state: array_class.VertsState,
     free_verts_state: array_class.VertsState,
+    face_ids: qd.types.ndarray(ndim=1),
     eps: float,
 ):
     """
@@ -70,7 +71,10 @@ def bvh_ray_cast(
     hit_normal : qd.math.vec3
         normal vector at hit point (zero vector if no hit)
     """
-    n_triangles = faces_info.verts_idx.shape[0]
+    # Leaf count = this BVH's AABB/morton-code count, NOT the solver's global face count: the BVH may cover a
+    # compacted face subset (static terrain vs dynamic robot - see RaycasterSensor.build). morton_codes is
+    # (n_batch, n_leaves); face_ids[leaf] remaps the subset-local leaf back to the global face below.
+    n_triangles = bvh_morton_codes.shape[1]
 
     hit_face = -1
     closest_distance = gs.qd_float(max_range)
@@ -92,9 +96,10 @@ def bvh_ray_cast(
 
         if aabb_t >= 0.0 and aabb_t < closest_distance:
             if node.left == -1:  # Leaf node
-                # Get original triangle/face index
+                # Get original triangle/face index. The morton code carries the subset-local leaf slot;
+                # face_ids remaps it to the solver-global face (identity for a single full-mesh BVH).
                 sorted_leaf_idx = node_idx - (n_triangles - 1)
-                i_f = qd.cast(bvh_morton_codes[i_b, sorted_leaf_idx][1], gs.qd_int)
+                i_f = qd.cast(face_ids[qd.cast(bvh_morton_codes[i_b, sorted_leaf_idx][1], gs.qd_int)], gs.qd_int)
 
                 # Get triangle vertices
                 tri_vertices = get_triangle_vertices(
@@ -234,20 +239,27 @@ def update_aabbs(
     faces_info: array_class.FacesInfo,
     geoms_info: array_class.GeomsInfo,
     links_info: array_class.LinksInfo,
+    face_ids: qd.types.ndarray(ndim=1),
     static_rigid_sim_config: qd.template(),
     aabb_state: qd.template(),
 ):
     """Update per-face collision AABBs from current vertex positions.
 
+    AABB slot k holds the bounding box of the global face face_ids[k]; the BVH is built over this compacted subset
+    (e.g. only the static terrain faces, or only the moving robot faces - see RaycasterSensor.build), so the rebuild
+    + radix sort scale with the subset size rather than every face in the solver. For a single full-mesh BVH face_ids
+    is the identity map.
+
     A face contributes to env i_b only if its geom lies in that env's active geom range (links_info.geom_start /
     geom_end); otherwise its AABB is left inverted (unhittable) and skipped by ray queries. For a homogeneous solver
     every geom is always in range, so this never excludes anything. For a heterogeneous solver, where all envs share
     one vertex buffer but activate different per-env geom ranges, it makes each env cast against only its own variant
     instead of the union of every variant.
     """
-    for i_b, i_f in qd.ndrange(free_verts_state.pos.shape[1], faces_info.verts_idx.shape[0]):
-        aabb_state.aabbs[i_b, i_f].min.fill(qd.math.inf)
-        aabb_state.aabbs[i_b, i_f].max.fill(-qd.math.inf)
+    for i_b, k in qd.ndrange(free_verts_state.pos.shape[1], face_ids.shape[0]):
+        i_f = face_ids[k]
+        aabb_state.aabbs[i_b, k].min.fill(qd.math.inf)
+        aabb_state.aabbs[i_b, k].max.fill(-qd.math.inf)
 
         i_g = faces_info.geom_idx[i_f]
         i_l = geoms_info.link_idx[i_g]
@@ -258,12 +270,12 @@ def update_aabbs(
                 i_fv = verts_info.verts_state_idx[i_v]
                 if verts_info.is_fixed[i_v]:
                     pos_v = fixed_verts_state.pos[i_fv]
-                    aabb_state.aabbs[i_b, i_f].min = qd.min(aabb_state.aabbs[i_b, i_f].min, pos_v)
-                    aabb_state.aabbs[i_b, i_f].max = qd.max(aabb_state.aabbs[i_b, i_f].max, pos_v)
+                    aabb_state.aabbs[i_b, k].min = qd.min(aabb_state.aabbs[i_b, k].min, pos_v)
+                    aabb_state.aabbs[i_b, k].max = qd.max(aabb_state.aabbs[i_b, k].max, pos_v)
                 else:
                     pos_v = free_verts_state.pos[i_fv, i_b]
-                    aabb_state.aabbs[i_b, i_f].min = qd.min(aabb_state.aabbs[i_b, i_f].min, pos_v)
-                    aabb_state.aabbs[i_b, i_f].max = qd.max(aabb_state.aabbs[i_b, i_f].max, pos_v)
+                    aabb_state.aabbs[i_b, k].min = qd.min(aabb_state.aabbs[i_b, k].min, pos_v)
+                    aabb_state.aabbs[i_b, k].max = qd.max(aabb_state.aabbs[i_b, k].max, pos_v)
 
 
 @qd.kernel
@@ -275,6 +287,7 @@ def kernel_update_verts_and_aabbs(
     free_verts_state: array_class.VertsState,
     fixed_verts_state: array_class.VertsState,
     links_info: array_class.LinksInfo,
+    face_ids: qd.types.ndarray(ndim=1),
     static_rigid_sim_config: qd.template(),
     aabb_state: qd.template(),
 ):
@@ -288,6 +301,7 @@ def kernel_update_verts_and_aabbs(
         faces_info,
         geoms_info,
         links_info,
+        face_ids,
         static_rigid_sim_config,
         aabb_state,
     )
@@ -442,6 +456,7 @@ def kernel_cast_ray(
     free_verts_state: array_class.VertsState,
     verts_info: array_class.VertsInfo,
     faces_info: array_class.FacesInfo,
+    face_ids: qd.types.ndarray(ndim=1),  # maps BVH leaf slot -> global face index (identity for a full-mesh BVH)
     bvh_nodes: qd.template(),
     bvh_morton_codes: qd.template(),
     ray_start: qd.types.ndarray(ndim=1),  # (3,)
@@ -482,6 +497,7 @@ def kernel_cast_ray(
             verts_info=verts_info,
             fixed_verts_state=fixed_verts_state,
             free_verts_state=free_verts_state,
+            face_ids=face_ids,
             eps=eps,
         )
         if cur_hit_face >= 0:
@@ -544,6 +560,7 @@ def kernel_cast_rays(
     free_verts_state: array_class.VertsState,
     verts_info: array_class.VertsInfo,
     faces_info: array_class.FacesInfo,
+    face_ids: qd.types.ndarray(ndim=1),  # maps BVH leaf slot -> global face index (identity for a full-mesh BVH)
     bvh_nodes: qd.template(),
     bvh_morton_codes: qd.template(),  # maps sorted leaves to original triangle indices
     links_pos: qd.types.ndarray(ndim=3),  # [n_env, n_sensors, 3]
@@ -610,6 +627,7 @@ def kernel_cast_rays(
             verts_info=verts_info,
             fixed_verts_state=fixed_verts_state,
             free_verts_state=free_verts_state,
+            face_ids=face_ids,
             eps=eps,
         )
 
diff --git a/genesis/vis/viewer_plugins/raycast.py b/genesis/vis/viewer_plugins/raycast.py
index 2cf9ae2f70..1e3417a334 100644
--- a/genesis/vis/viewer_plugins/raycast.py
+++ b/genesis/vis/viewer_plugins/raycast.py
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING
 
 import numpy as np
+import torch
 from typing_extensions import override
 
 import genesis as gs
@@ -51,6 +52,10 @@ def __init__(self, scene: "Scene"):
             max_n_query_result_per_aabb=0,  # Not used for ray queries
             n_radix_sort_groups=min(64, n_faces),
         )
+        # The viewer casts against the full mesh (one BVH over every face), so the leaf-slot -> global-face map
+        # the cast/update kernels take is the identity. (Sensors may build compacted per-subset BVHs and pass a
+        # real subset map; see RaycasterSensor.build.)
+        self.face_ids = torch.arange(n_faces, dtype=gs.tc_int, device=gs.device)
         self.result = array_class.get_raycast_result(n_envs_max)
 
         self.update()
@@ -72,6 +77,7 @@ def update(self) -> None:
             free_verts_state=self.solver.free_verts_state,
             fixed_verts_state=self.solver.fixed_verts_state,
             links_info=self.solver.links_info,
+            face_ids=self.face_ids,
             static_rigid_sim_config=self.solver._static_rigid_sim_config,
             aabb_state=self.aabb,
         )
@@ -105,6 +111,7 @@ def cast(
             self.solver.free_verts_state,
             self.solver.verts_info,
             self.solver.faces_info,
+            self.face_ids,
             self.bvh.nodes,
             self.bvh.morton_codes,
             np.ascontiguousarray(ray_origin, dtype=gs.np_float),
diff --git a/tests/test_sensors.py b/tests/test_sensors.py
index bb8bd4b14f..61324cc641 100644
--- a/tests/test_sensors.py
+++ b/tests/test_sensors.py
@@ -1126,6 +1126,66 @@ def test_raycaster_hits(show_viewer, n_envs):
     assert_allclose(grid_distances, grid_distances_ref, tol=1e-3)
 
 
+@pytest.mark.required
+@pytest.mark.parametrize("n_envs", [0, 2])
+def test_raycaster_static_dynamic_bvh_split(show_viewer, n_envs):
+    """A rigid solver's collision mesh is split into a static (fixed-link) BVH and a dynamic (movable-link) BVH,
+    cast separately and merged. Asserts: (a) the split structure (one static + one dynamic collision entry, static
+    shared across envs); (b) the merge reports the closer of static / dynamic as a movable box enters / leaves a
+    ray's path; (c) the static entry is genuinely skipped across a dynamic move (stays needs_rebuild=False).
+    """
+    HEIGHT = 1.0
+    BOX = 0.2  # movable box edge
+
+    scene = gs.Scene(
+        profiling_options=gs.options.ProfilingOptions(show_FPS=False),
+        show_viewer=show_viewer,
+    )
+    scene.add_entity(gs.morphs.Plane())  # static (fixed)
+    # A single downward ray from a fixed mount over the origin. collision=False so the mount carries no collision
+    # faces and the ray doesn't immediately hit its own mount geometry.
+    mount = scene.add_entity(gs.morphs.Box(size=(0.05, 0.05, 0.05), pos=(0.0, 0.0, HEIGHT), fixed=True, collision=False))
+    box = scene.add_entity(gs.morphs.Box(size=(BOX, BOX, BOX), pos=(5.0, 5.0, 0.5 * BOX)))  # dynamic (movable)
+    sensor = scene.add_sensor(
+        gs.sensors.Raycaster(
+            pattern=gs.sensors.raycaster.GridPattern(resolution=1.0, size=(0.0, 0.0), direction=(0.0, 0.0, -1.0)),
+            entity_idx=mount.idx,
+            return_world_frame=False,
+        )
+    )
+
+    scene.build(n_envs=n_envs)
+    batch_shape = (n_envs,) if n_envs > 0 else ()
+
+    # (a) Split structure: exactly two collision BVH entries (raycast_mask is None), one static + one dynamic; the
+    # static one is shared across envs when batched (identical fixed geometry in every env).
+    collision_bvhs = [e for e in sensor._shared_metadata.solver_bvhs if e.raycast_mask is None]
+    assert len(collision_bvhs) == 2, f"expected static+dynamic split, got {len(collision_bvhs)} collision BVHs"
+    static_entries = [e for e in collision_bvhs if e.maybe_static]
+    dynamic_entries = [e for e in collision_bvhs if not e.maybe_static]
+    assert len(static_entries) == 1 and len(dynamic_entries) == 1
+    if n_envs > 0:
+        assert static_entries[0].shared_across_envs, "static terrain BVH should be shared across envs"
+        assert not dynamic_entries[0].shared_across_envs, "dynamic (movable) BVH must stay per-env"
+
+    # (b1) Box parked far away -> the ray falls through to the static ground at distance HEIGHT.
+    scene.sim._sensor_manager.step()
+    assert_allclose(sensor.read().distances.reshape(batch_shape), HEIGHT, tol=gs.EPS)
+
+    # (b2) Move the box directly under the ray -> the merge must now report the closer hit (box top).
+    box.set_pos(np.tile((0.0, 0.0, 0.5 * BOX), (*batch_shape, 1)))
+    scene.sim._sensor_manager.step()
+    assert_allclose(sensor.read().distances.reshape(batch_shape), HEIGHT - BOX, tol=gs.EPS)
+
+    # (c) Static BVH not re-flagged. NOTE(review): passes trivially if a rebuild also clears the flag - confirm.
+    assert not static_entries[0].needs_rebuild, "static BVH was flagged for rebuild by a dynamic-only move"
+
+    # (b3) Move the box back out -> ray returns to the static ground distance (dynamic BVH tracked the motion).
+    box.set_pos(np.tile((5.0, 5.0, 0.5 * BOX), (*batch_shape, 1)))
+    scene.sim._sensor_manager.step()
+    assert_allclose(sensor.read().distances.reshape(batch_shape), HEIGHT, tol=gs.EPS)
+
+
 @pytest.mark.required
 @pytest.mark.parametrize("n_envs", [0, 2])
 @pytest.mark.parametrize("kin_raycastable", [True, False])