Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5893,6 +5893,7 @@ dependencies = [
"memory_range",
"mesh",
"pal_async",
"pal_event",
"parking_lot",
"pci_bus",
"pci_core",
Expand Down
4 changes: 2 additions & 2 deletions Guide/src/reference/devices/firmware/linux_direct.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ OpenVMM synthesizes a minimal set of EFI structures in guest memory:
2. **EFI Memory Map** — describes the EFI metadata region, ACPI tables, and
conventional RAM.
3. **ACPI Tables** — FADT (with `HW_REDUCED_ACPI`), MADT (GIC distributor, GICv3
redistributors or GICv2 CPU interfaces, optional v2m MSI frame), GTDT
redistributors or GICv2 CPU interfaces, GICv3 ITS or v2m MSI frame), GTDT
(virtual timer), DSDT (VMBus, serial UARTs), and optionally MCFG/SSDT for
PCIe.
PCIe and IORT for PCIe interrupt routing via the ITS.

A **stub device tree** is then built. Unlike a full device tree, it contains
no hardware nodes — no CPUs, GIC, timer, or devices. Its only purpose is a
Expand Down
20 changes: 20 additions & 0 deletions Guide/src/reference/emulated/pcie/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,26 @@ hotplug, PME, AER, and other PCIe features rather than
ACPI-based fallbacks. Linux assumes native control regardless,
but Windows requires `_OSC` to enable native hotplug.

### MSI Interrupt Routing (aarch64)

On aarch64, PCIe MSI/MSI-X interrupts are routed through either
a GICv3 ITS or a GICv2m MSI frame, depending on the platform:

- **GICv3 ITS** (default on KVM with GICv3): The VMM creates a
KVM in-kernel ITS device. Each PCIe device gets a 32-bit
device ID composed as `(segment << 16) | BDF`, injected
transparently by per-device wrappers in the interrupt path.
ACPI boots emit an IORT with an ITS Group node and
per-root-complex ID mappings. The device tree includes an
`its` child node under the GIC with the `msi-controller` property.

- **GICv2m**: MSI writes map to a fixed pool of 64 SPIs via
a v2m doorbell register. The MADT includes a GICv2m MSI
frame entry.

The MSI controller can be overridden with the `--gic-msi`
CLI option (`auto`, `its`, or `v2m`).

### Implementation notes

```admonish note title="No Command Completed support"
Expand Down
4 changes: 2 additions & 2 deletions openhcl/bootloader_fdt_parser/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ fn parse_gic(node: &Node<'_>) -> anyhow::Result<Aarch64PlatformConfig> {
gic_version: vm_topology::processor::aarch64::GicVersion::V3 {
redistributors_base: reg[2],
},
gic_v2m: None,
gic_msi: vm_topology::processor::aarch64::GicMsiController::None,
pmu_gsiv: None,
// TODO: parse from the DT timer node instead of hardcoding.
virt_timer_ppi: 20,
Expand Down Expand Up @@ -1078,7 +1078,7 @@ mod tests {
gic_version: vm_topology::processor::aarch64::GicVersion::V3 {
redistributors_base: 0x20000,
},
gic_v2m: None,
gic_msi: vm_topology::processor::aarch64::GicMsiController::None,
pmu_gsiv: Some(0x17),
virt_timer_ppi: 20,
gic_nr_irqs: 992,
Expand Down
2 changes: 1 addition & 1 deletion openhcl/virt_mshv_vtl/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1314,7 +1314,7 @@ struct UhInterruptTarget {
}

impl pci_core::msi::SignalMsi for UhInterruptTarget {
fn signal_msi(&self, _rid: u32, address: u64, data: u32) {
fn signal_msi(&self, _devid: Option<u32>, address: u64, data: u32) {
self.partition
.request_msi(self.vtl, MsiRequest { address, data });
}
Expand Down
205 changes: 188 additions & 17 deletions openvmm/openvmm_core/src/worker/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ impl ExtractTopologyConfig for ProcessorTopology<Aarch64Topology> {
Some(gsiv) => PmuGsivConfig::Gsiv(gsiv),
None => PmuGsivConfig::Disabled,
},
gic_msi: Default::default(),
})),
}
}
Expand All @@ -504,18 +505,16 @@ impl BuildTopology<Aarch64Topology> for ProcessorTopologyConfig {
platform_info: &virt::PlatformInfo,
) -> anyhow::Result<ProcessorTopology<Aarch64Topology>> {
use vm_topology::processor::aarch64::Aarch64PlatformConfig;
use vm_topology::processor::aarch64::GicItsInfo;
use vm_topology::processor::aarch64::GicMsiController;
use vm_topology::processor::aarch64::GicV2mInfo;

let arch = match &self.arch {
None => Default::default(),
Some(ArchTopologyConfig::Aarch64(arch)) => arch.clone(),
_ => anyhow::bail!("invalid architecture config"),
};
let gic_v2m = Some(GicV2mInfo {
frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
spi_base: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_BASE,
spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT,
});

let pmu_gsiv = match arch.pmu_gsiv {
PmuGsivConfig::Disabled => None,
PmuGsivConfig::Gsiv(gsiv) => Some(gsiv),
Expand Down Expand Up @@ -585,10 +584,39 @@ impl BuildTopology<Aarch64Topology> for ProcessorTopologyConfig {
}
};

// Use the ITS for MSI delivery when the backend supports it
// (KVM with GICv3). Otherwise fall back to GICv2m (SPI-based MSIs).
use openvmm_defs::config::GicMsiConfig;
let is_gicv2 = matches!(gic_version, GicVersion::V2 { .. });
let use_its = match arch.gic_msi {
GicMsiConfig::Auto => platform_info.supports_its && !is_gicv2,
GicMsiConfig::Its => {
if is_gicv2 {
anyhow::bail!("ITS is incompatible with GICv2");
}
if !platform_info.supports_its {
anyhow::bail!("ITS requested but the hypervisor does not support it");
}
true
}
GicMsiConfig::V2m => false,
};
let gic_msi = if use_its {
GicMsiController::Its(GicItsInfo {
its_base: openvmm_defs::config::DEFAULT_GIC_ITS_BASE,
})
} else {
GicMsiController::V2m(GicV2mInfo {
frame_base: openvmm_defs::config::DEFAULT_GIC_V2M_MSI_FRAME_BASE,
spi_base: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_BASE,
spi_count: openvmm_defs::config::DEFAULT_GIC_V2M_SPI_COUNT,
})
};

let platform = Aarch64PlatformConfig {
gic_distributor_base,
gic_version,
gic_v2m,
gic_msi,
pmu_gsiv,
virt_timer_ppi: openvmm_defs::config::DEFAULT_VIRT_TIMER_PPI,
gic_nr_irqs: openvmm_defs::config::DEFAULT_GIC_NR_IRQS,
Expand Down Expand Up @@ -1810,8 +1838,46 @@ impl InitializedVm {
(pcie_host_bridges, pcie_root_complexes)
};

// Build a port-name→(segment, bus_range) map covering all ports in
// the PCIe topology (root complex ports and switch downstream ports).
// The segment is used for ITS device ID composition; the bus_range is
// a shared atomic that the config space emulator updates when the
// guest programs secondary/subordinate bus numbers.
/// Per-port routing information, stored as the value type of the
/// port-name-keyed map built immediately below.
struct PortInfo {
/// PCI segment the port belongs to; used for ITS device ID composition.
segment: u16,
/// Bus range assigned to the port. Per the comment above, this is shared
/// with the config space emulator, which updates it when the guest programs
/// the secondary/subordinate bus numbers.
bus_range: pcie::bus_range::AssignedBusRange,
}
let mut port_info: std::collections::HashMap<Arc<str>, PortInfo> =
std::collections::HashMap::new();
for (hb, rc) in pcie_host_bridges.iter().zip(pcie_root_complexes.iter()) {
for p in rc.lock().downstream_ports() {
if let Some(_existing) = port_info.insert(
p.name.clone(),
PortInfo {
segment: hb.segment,
bus_range: p.bus_range,
},
) {
anyhow::bail!("duplicate PCIe port name '{}'", p.name);
}
}
}

for switch in cfg.pcie_switches {
let device_name = format!("pcie-switch:{}", switch.name);

// Inherit the segment from the switch's parent port.
let parent_segment = port_info
.get(switch.parent_port.as_str())
.ok_or_else(|| {
anyhow::anyhow!(
"switch '{}' parent port '{}' not found in any root complex",
switch.name,
switch.parent_port
Comment thread
jstarks marked this conversation as resolved.
)
})?
.segment;

let switch_device = chipset_builder
.arc_mutex_device(device_name)
.on_pcie_port(vmotherboard::BusId::new(&switch.parent_port))
Expand All @@ -1824,6 +1890,20 @@ impl InitializedVm {
GenericPcieSwitch::new(definition)
})?;

// Query the switch's actual downstream port names instead of
// reconstructing them from the naming convention.
for p in switch_device.lock().downstream_ports() {
if let Some(_existing) = port_info.insert(
p.name.clone(),
PortInfo {
segment: parent_segment,
bus_range: p.bus_range,
},
) {
anyhow::bail!("duplicate PCIe port name '{}'", p.name);
}
}

let bus_id = vmotherboard::BusId::new(&switch.name);
chipset_builder.register_weak_mutex_pcie_enumerator(bus_id, Box::new(switch_device));
}
Expand All @@ -1846,26 +1926,77 @@ impl InitializedVm {
Some(handle)
};

// Determine whether ITS wrappers are needed for PCIe MSI delivery.
// Only aarch64 VMs configured with a GICv3 ITS need device ID
// injection; all other configurations pass through directly.
#[cfg(guest_arch = "aarch64")]
let use_its = matches!(
processor_topology.gic_msi(),
vm_topology::processor::aarch64::GicMsiController::Its(_)
);
#[cfg(not(guest_arch = "aarch64"))]
let use_its = false;

// Resolve PCIe devices concurrently.
//
// Each port's ConfigSpaceType1Emulator owns an AssignedBusRange
// (Arc<AtomicU16>). We clone it into the ITS wrappers so that when
// the guest programs bus numbers, the emulator writes the new values
// and the ITS wrapper reads them at interrupt delivery time.
try_join_all(cfg.pcie_devices.into_iter().map(|dev_cfg| {
let chipset_builder = &chipset_builder;
let driver_source = &driver_source;
let resolver = &resolver;
let gm = &gm;
let partition = &partition;
let mapper = &mapper;
let port_info = &port_info;
async move {
let port_name: Arc<str> = dev_cfg.port_name.into();
let pi = port_info.get(&port_name).ok_or_else(|| {
anyhow::anyhow!(
"device port '{}' not found in any root complex or switch",
port_name
)
})?;

// When ITS is active, wrap the partition's SignalMsi
// and IrqFd to inject the device identity. Otherwise
// pass through directly.
let signal_msi = partition.as_signal_msi(Vtl::Vtl0).map(|s| {
if use_its {
Arc::new(pcie::its::ItsSignalMsi::new(
s,
pi.bus_range.clone(),
pi.segment,
)) as Arc<dyn pci_core::msi::SignalMsi>
} else {
s
}
});
let irqfd = partition.irqfd().map(|fd| {
if use_its {
Arc::new(pcie::its::ItsIrqFd::new(
fd,
pi.bus_range.clone(),
pi.segment,
)) as Arc<dyn vmcore::irqfd::IrqFd>
} else {
fd
}
});
Comment on lines +1966 to +1987
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks accurate that switch interrupts (e.g., hotplug) won't work, but this was never wired up correctly in the first place for non-ITS (a SignalMsi is not passed to the GenericPcieSwitch during construction).


vmm_core::device_builder::build_pcie_device(
chipset_builder,
dev_cfg.port_name.into(),
port_name.clone(),
driver_source,
resolver,
gm,
dev_cfg.resource,
partition.clone().into_doorbell_registration(Vtl::Vtl0),
Some(mapper),
partition.as_signal_msi(Vtl::Vtl0),
partition.irqfd(),
signal_msi,
irqfd,
)
.await
}
Expand Down Expand Up @@ -2872,19 +3003,59 @@ impl LoadedVm {
}
VmRpc::AddPcieDevice(rpc) => {
rpc.handle_failable(async |(port_name, resource)| {
// Validate the port exists before creating the device
// to avoid leaking a DynamicDeviceUnit on error.
let rc = self.inner.pcie_root_complexes.iter()
.find(|rc| {
rc.lock().downstream_ports().iter().any(|(_, name)| name.as_ref() == port_name.as_str())
// Find the root complex and its index for the named port.
let (rc_idx, rc) = self.inner.pcie_root_complexes.iter()
.enumerate()
.find(|(_, rc)| {
rc.lock().downstream_ports().iter().any(|p| p.name.as_ref() == port_name.as_str())
})
.ok_or_else(|| anyhow::anyhow!("port '{}' not found in any root complex", port_name))?;

let msi_conn = match self.inner.partition.irqfd() {
#[cfg(guest_arch = "aarch64")]
let use_its = matches!(
self.inner.processor_topology.gic_msi(),
vm_topology::processor::aarch64::GicMsiController::Its(_)
);
#[cfg(not(guest_arch = "aarch64"))]
let use_its = false;

// Get the bus_range from the port's config space emulator.
let bus_range = rc.lock()
.downstream_ports()
.into_iter()
.find(|p| p.name.as_ref() == port_name.as_str())
.expect("port was just found above")
.bus_range;

let signal_msi = self.inner.partition.as_signal_msi(Vtl::Vtl0).map(|s| {
if use_its {
let segment = self.inner.pcie_host_bridges[rc_idx].segment;
Arc::new(pcie::its::ItsSignalMsi::new(
s,
bus_range.clone(),
segment,
)) as Arc<dyn pci_core::msi::SignalMsi>
} else {
s
}
});
let irqfd = self.inner.partition.irqfd().map(|fd| {
if use_its {
let segment = self.inner.pcie_host_bridges[rc_idx].segment;
Arc::new(pcie::its::ItsIrqFd::new(
fd,
bus_range.clone(),
segment,
)) as Arc<dyn vmcore::irqfd::IrqFd>
} else {
fd
}
});

let msi_conn = match irqfd {
Some(fd) => pci_core::msi::MsiConnection::with_irqfd(fd),
None => pci_core::msi::MsiConnection::new(),
};
let signal_msi = self.inner.partition.as_signal_msi(Vtl::Vtl0);

let (unit, device) = self.inner.chipset_devices.add_dyn_device(
&self.inner.driver_source,
Expand Down Expand Up @@ -2957,7 +3128,7 @@ impl LoadedVm {
// Find the root complex containing the target port
let rc = self.inner.pcie_root_complexes.iter()
.find(|rc| {
rc.lock().downstream_ports().iter().any(|(_, name)| name.as_ref() == port_name.as_str())
rc.lock().downstream_ports().iter().any(|p| p.name.as_ref() == port_name.as_str())
})
.ok_or_else(|| anyhow::anyhow!("port '{}' not found in any root complex", port_name))?;

Expand Down
Loading
Loading