Commit 951bc76fb6 for qemu.org

commit 951bc76fb669eab96cc60e38a50097ad4435163e
Author: Shameer Kolothum <skolothumtho@nvidia.com>
Date:   Fri Aug 29 09:25:28 2025 +0100

    hw/pci: Introduce pci_setup_iommu_per_bus() for per-bus IOMMU ops retrieval

    Currently, pci_setup_iommu() registers IOMMU ops for a given PCIBus.
    However, when retrieving IOMMU ops for a device using
    pci_device_get_iommu_bus_devfn(), the function checks the parent_dev
    and fetches IOMMU ops from the parent device, even if the current
    bus does not have any associated IOMMU ops.

    This behavior works for now because QEMU's IOMMU implementations are
    globally scoped, and host bridges rely on the bypass_iommu property
    to skip IOMMU translation when needed.

    However, this model will break with the soon to be introduced
    arm-smmuv3 device, which allows users to associate the IOMMU
    with a specific PCIe root complex (e.g., the default pcie.0
    or a pxb-pcie root complex).

    For example, consider the following setup with multiple root
    complexes:

    -device arm-smmuv3,primary-bus=pcie.0,id=smmuv3.0 \
    ...
    -device pxb-pcie,id=pcie.1,bus_nr=8,bus=pcie.0 \
    -device pcie-root-port,id=pcie.port1,bus=pcie.1 \
    -device virtio-net-pci,bus=pcie.port1

    In QEMU, pxb-pcie acts as a special root complex whose parent is
    effectively the default root complex (pcie.0). Hence, though pcie.1
    has no associated SMMUv3 as per above, pci_device_get_iommu_bus_devfn()
    will incorrectly return the IOMMU ops from pcie.0 due to the fallback
    via parent_dev.

    To fix this, introduce a new helper pci_setup_iommu_per_bus() that
    explicitly sets the new iommu_per_bus field in the PCIBus structure.
    This helper will be used in a subsequent patch that adds support for
    the new arm-smmuv3 device.

    Update pci_device_get_iommu_bus_devfn() to use iommu_per_bus when
    determining the correct IOMMU ops, ensuring accurate behavior for
    per-bus IOMMUs.

    Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
    Reviewed-by: Eric Auger <eric.auger@redhat.com>
    Tested-by: Nathan Chen <nathanc@nvidia.com>
    Tested-by: Eric Auger <eric.auger@redhat.com>
    Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
    Tested-by: Nicolin Chen <nicolinc@nvidia.com>
    Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
    Signed-off-by: Shameer Kolothum <skolothumtho@nvidia.com>
    Reviewed-by: Donald Dutile <ddutile@redhat.com>
    Message-id: 20250829082543.7680-7-skolothumtho@nvidia.com
    Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 297196b242..c3df9d6656 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2912,6 +2912,19 @@ static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
             }
         }

+        /*
+         * When multiple PCI Express Root Buses are defined using pxb-pcie,
+         * the IOMMU configuration may be specific to each root bus. However,
+         * pxb-pcie acts as a special root complex whose parent is effectively
+         * the default root complex (pcie.0). Ensure that we retrieve the
+         * correct IOMMU ops (if any) in such cases.
+         */
+        if (pci_bus_is_express(iommu_bus) && pci_bus_is_root(iommu_bus)) {
+            if (parent_bus->iommu_per_bus) {
+                break;
+            }
+        }
+
         iommu_bus = parent_bus;
     }

@@ -3172,6 +3185,24 @@ void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
     bus->iommu_opaque = opaque;
 }

+/*
+ * Call pci_setup_iommu() with @bus, @ops and @opaque, then set
+ * bus->iommu_per_bus to true to mark the IOMMU as specific to this bus.
+ * Intended for IOMMU implementations tied to a specific PCIe root complex.
+ *
+ * In QEMU, pxb-pcie behaves as a special root complex whose parent is
+ * effectively the default root complex (pcie.0). The iommu_per_bus flag
+ * is checked in pci_device_get_iommu_bus_devfn() to ensure the correct
+ * IOMMU ops are returned, avoiding the use of the parent's IOMMU when
+ * it's not appropriate.
+ */
+void pci_setup_iommu_per_bus(PCIBus *bus, const PCIIOMMUOps *ops,
+                             void *opaque)
+{
+    pci_setup_iommu(bus, ops, opaque);
+    bus->iommu_per_bus = true;
+}
+
 static void pci_dev_get_w64(PCIBus *b, PCIDevice *dev, void *opaque)
 {
     Range *range = opaque;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 6b7d3ac8a3..6bccb25ac2 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -773,6 +773,8 @@ int pci_iommu_unregister_iotlb_notifier(PCIDevice *dev, uint32_t pasid,
  */
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque);

+void pci_setup_iommu_per_bus(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque);
+
 pcibus_t pci_bar_address(PCIDevice *d,
                          int reg, uint8_t type, pcibus_t size);

diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
index 2261312546..c738446788 100644
--- a/include/hw/pci/pci_bus.h
+++ b/include/hw/pci/pci_bus.h
@@ -35,6 +35,7 @@ struct PCIBus {
     enum PCIBusFlags flags;
     const PCIIOMMUOps *iommu_ops;
     void *iommu_opaque;
+    bool iommu_per_bus;
     uint8_t devfn_min;
     uint32_t slot_reserved_mask;
     pci_set_irq_fn set_irq;