qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v3 6/7] pcie_sriov: Reuse SR-IOV VF device instances


From: Michael S. Tsirkin
Subject: Re: [PATCH v3 6/7] pcie_sriov: Reuse SR-IOV VF device instances
Date: Tue, 13 Feb 2024 06:01:56 -0500

On Mon, Feb 12, 2024 at 07:20:34PM +0900, Akihiko Odaki wrote:
> Disable SR-IOV VF devices by reusing code to power down PCI devices
> instead of removing them when the guest requests to disable VFs. This
> allows to realize devices and report VF realization errors at PF
> realization time.
> 
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>

It is simpler for sure, but I am worried that all of these
unused VFs will consume lots of resources even if never
enabled. Thoughts?


> ---
>  docs/pcie_sriov.txt         |   8 ++--
>  include/hw/pci/pci.h        |   2 +-
>  include/hw/pci/pci_device.h |   2 +-
>  include/hw/pci/pcie_sriov.h |   6 +--
>  hw/net/igb.c                |  13 ++++--
>  hw/nvme/ctrl.c              |  24 +++++++----
>  hw/pci/pci.c                |  18 ++++----
>  hw/pci/pci_host.c           |   4 +-
>  hw/pci/pcie.c               |   4 +-
>  hw/pci/pcie_sriov.c         | 100 
> ++++++++++++++++++++------------------------
>  10 files changed, 97 insertions(+), 84 deletions(-)
> 
> diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt
> index a47aad0bfab0..ab2142807f79 100644
> --- a/docs/pcie_sriov.txt
> +++ b/docs/pcie_sriov.txt
> @@ -52,9 +52,11 @@ setting up a BAR for a VF.
>        ...
>  
>        /* Add and initialize the SR/IOV capability */
> -      pcie_sriov_pf_init(d, 0x200, "your_virtual_dev",
> -                       vf_devid, initial_vfs, total_vfs,
> -                       fun_offset, stride);
> +      if (!pcie_sriov_pf_init(d, 0x200, "your_virtual_dev",
> +                              vf_devid, initial_vfs, total_vfs,
> +                              fun_offset, stride, errp)) {
> +         return;
> +      }
>  
>        /* Set up individual VF BARs (parameters as for normal BARs) */
>        pcie_sriov_pf_init_vf_bar( ... )
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index fa6313aabc43..fae83b9b723c 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -643,6 +643,6 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev)
>  }
>  
>  MSIMessage pci_get_msi_message(PCIDevice *dev, int vector);
> -void pci_set_power(PCIDevice *pci_dev, bool state);
> +void pci_set_enabled(PCIDevice *pci_dev, bool state);
>  
>  #endif
> diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> index 7564e9536dbd..153e13eaef99 100644
> --- a/include/hw/pci/pci_device.h
> +++ b/include/hw/pci/pci_device.h
> @@ -56,7 +56,7 @@ typedef struct PCIReqIDCache PCIReqIDCache;
>  struct PCIDevice {
>      DeviceState qdev;
>      bool partially_hotplugged;
> -    bool has_power;
> +    bool is_enabled;
>  
>      /* PCI config space */
>      uint8_t *config;
> diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
> index 095fb0c9edf9..d9a39daccac4 100644
> --- a/include/hw/pci/pcie_sriov.h
> +++ b/include/hw/pci/pcie_sriov.h
> @@ -18,7 +18,6 @@
>  struct PCIESriovPF {
>      uint16_t num_vfs;   /* Number of virtual functions created */
>      uint8_t vf_bar_type[PCI_NUM_REGIONS];   /* Store type for each VF bar */
> -    const char *vfname; /* Reference to the device type used for the VFs */
>      PCIDevice **vf;     /* Pointer to an array of num_vfs VF devices */
>  };
>  
> @@ -27,10 +26,11 @@ struct PCIESriovVF {
>      uint16_t vf_number; /* Logical VF number of this function */
>  };
>  
> -void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
> +bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
>                          const char *vfname, uint16_t vf_dev_id,
>                          uint16_t init_vfs, uint16_t total_vfs,
> -                        uint16_t vf_offset, uint16_t vf_stride);
> +                        uint16_t vf_offset, uint16_t vf_stride,
> +                        Error **errp);
>  void pcie_sriov_pf_exit(PCIDevice *dev);
>  
>  /* Set up a VF bar in the SR/IOV bar area */
> diff --git a/hw/net/igb.c b/hw/net/igb.c
> index 0b5c31a58bba..1079a33d4000 100644
> --- a/hw/net/igb.c
> +++ b/hw/net/igb.c
> @@ -447,9 +447,16 @@ static void igb_pci_realize(PCIDevice *pci_dev, Error 
> **errp)
>  
>      pcie_ari_init(pci_dev, 0x150);
>  
> -    pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, TYPE_IGBVF,
> -        IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS,
> -        IGB_VF_OFFSET, IGB_VF_STRIDE);
> +    if (!pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET,
> +                            TYPE_IGBVF, IGB_82576_VF_DEV_ID,
> +                            IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS,
> +                            IGB_VF_OFFSET, IGB_VF_STRIDE,
> +                            errp)) {
> +        pcie_cap_exit(pci_dev);
> +        igb_cleanup_msix(s);
> +        msi_uninit(pci_dev);
> +        return;
> +    }
>  
>      pcie_sriov_pf_init_vf_bar(pci_dev, IGBVF_MMIO_BAR_IDX,
>          PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH,
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index f026245d1e9e..f8df622fe590 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -8031,7 +8031,8 @@ static uint64_t nvme_bar_size(unsigned total_queues, 
> unsigned total_irqs,
>      return bar_size;
>  }
>  
> -static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
> +static bool nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
> +                            Error **errp)
>  {
>      uint16_t vf_dev_id = n->params.use_intel_id ?
>                           PCI_DEVICE_ID_INTEL_NVME : 
> PCI_DEVICE_ID_REDHAT_NVME;
> @@ -8040,12 +8041,17 @@ static void nvme_init_sriov(NvmeCtrl *n, PCIDevice 
> *pci_dev, uint16_t offset)
>                                        le16_to_cpu(cap->vifrsm),
>                                        NULL, NULL);
>  
> -    pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
> -                       n->params.sriov_max_vfs, n->params.sriov_max_vfs,
> -                       NVME_VF_OFFSET, NVME_VF_STRIDE);
> +    if (!pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
> +                            n->params.sriov_max_vfs, n->params.sriov_max_vfs,
> +                            NVME_VF_OFFSET, NVME_VF_STRIDE,
> +                            errp)) {
> +        return false;
> +    }
>  
>      pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
>                                PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
> +
> +    return true;
>  }
>  
>  static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
> @@ -8124,6 +8130,12 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice 
> *pci_dev, Error **errp)
>          return false;
>      }
>  
> +    if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs &&
> +        !nvme_init_sriov(n, pci_dev, 0x120, errp)) {
> +        msix_uninit(pci_dev, &n->bar0, &n->bar0);
> +        return false;
> +    }
> +
>      nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
>  
>      if (n->params.cmb_size_mb) {
> @@ -8134,10 +8146,6 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice 
> *pci_dev, Error **errp)
>          nvme_init_pmr(n, pci_dev);
>      }
>  
> -    if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
> -        nvme_init_sriov(n, pci_dev, 0x120);
> -    }
> -
>      return true;
>  }
>  
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index d08548d8ffe9..727d812f419f 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -1522,7 +1522,7 @@ static void pci_update_mappings(PCIDevice *d)
>              continue;
>  
>          new_addr = pci_bar_address(d, i, r->type, r->size);
> -        if (!d->has_power) {
> +        if (!d->is_enabled) {
>              new_addr = PCI_BAR_UNMAPPED;
>          }
>  
> @@ -1610,7 +1610,7 @@ void pci_default_write_config(PCIDevice *d, uint32_t 
> addr, uint32_t val_in, int
>          pci_update_irq_disabled(d, was_irq_disabled);
>          memory_region_set_enabled(&d->bus_master_enable_region,
>                                    (pci_get_word(d->config + PCI_COMMAND)
> -                                   & PCI_COMMAND_MASTER) && d->has_power);
> +                                   & PCI_COMMAND_MASTER) && d->is_enabled);
>      }
>  
>      msi_write_config(d, addr, val_in, l);
> @@ -2179,7 +2179,9 @@ static void pci_qdev_realize(DeviceState *qdev, Error 
> **errp)
>          return;
>      }
>  
> -    pci_set_power(pci_dev, true);
> +    if (!pci_is_vf(pci_dev)) {
> +        pci_set_enabled(pci_dev, true);
> +    }
>  
>      pci_dev->msi_trigger = pci_msi_trigger;
>  }
> @@ -2835,18 +2837,18 @@ MSIMessage pci_get_msi_message(PCIDevice *dev, int 
> vector)
>      return msg;
>  }
>  
> -void pci_set_power(PCIDevice *d, bool state)
> +void pci_set_enabled(PCIDevice *d, bool state)
>  {
> -    if (d->has_power == state) {
> +    if (d->is_enabled == state) {
>          return;
>      }
>  
> -    d->has_power = state;
> +    d->is_enabled = state;
>      pci_update_mappings(d);
>      memory_region_set_enabled(&d->bus_master_enable_region,
>                                (pci_get_word(d->config + PCI_COMMAND)
> -                               & PCI_COMMAND_MASTER) && d->has_power);
> -    if (!d->has_power) {
> +                               & PCI_COMMAND_MASTER) && d->is_enabled);
> +    if (d->qdev.realized) {
>          pci_device_reset(d);
>      }
>  }
> diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c
> index dfe6fe618401..d7e13d72ce07 100644
> --- a/hw/pci/pci_host.c
> +++ b/hw/pci/pci_host.c
> @@ -86,7 +86,7 @@ void pci_host_config_write_common(PCIDevice *pci_dev, 
> uint32_t addr,
>       * allowing direct removal of unexposed functions.
>       */
>      if ((pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) ||
> -        !pci_dev->has_power || is_pci_dev_ejected(pci_dev)) {
> +        !pci_dev->is_enabled || is_pci_dev_ejected(pci_dev)) {
>          return;
>      }
>  
> @@ -111,7 +111,7 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, 
> uint32_t addr,
>       * allowing direct removal of unexposed functions.
>       */
>      if ((pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) ||
> -        !pci_dev->has_power || is_pci_dev_ejected(pci_dev)) {
> +        !pci_dev->is_enabled || is_pci_dev_ejected(pci_dev)) {
>          return ~0x0;
>      }
>  
> diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> index 6db0cf69cd8a..f34c157e1fd3 100644
> --- a/hw/pci/pcie.c
> +++ b/hw/pci/pcie.c
> @@ -394,7 +394,9 @@ static void pcie_set_power_device(PCIBus *bus, PCIDevice 
> *dev, void *opaque)
>  {
>      bool *power = opaque;
>  
> -    pci_set_power(dev, *power);
> +    if (!pci_is_vf(dev)) {
> +        pci_set_enabled(dev, *power);
> +    }
>  }
>  
>  static void pcie_cap_update_power(PCIDevice *hotplug_dev)
> diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
> index da209b7f47fd..9ba34cf8f8ed 100644
> --- a/hw/pci/pcie_sriov.c
> +++ b/hw/pci/pcie_sriov.c
> @@ -20,15 +20,29 @@
>  #include "qapi/error.h"
>  #include "trace.h"
>  
> -static PCIDevice *register_vf(PCIDevice *pf, int devfn,
> -                              const char *name, uint16_t vf_num);
> -static void unregister_vfs(PCIDevice *dev);
> +static void unrealize_vfs(PCIDevice *dev, uint16_t total_vfs)
> +{
> +    for (uint16_t i = 0; i < total_vfs; i++) {
> +        Error *err = NULL;
> +        PCIDevice *vf = dev->exp.sriov_pf.vf[i];
> +        if (!object_property_set_bool(OBJECT(vf), "realized", false, &err)) {
> +            error_reportf_err(err, "Failed to unplug: ");
> +        }
> +        object_unparent(OBJECT(vf));
> +        object_unref(OBJECT(vf));
> +    }
> +    g_free(dev->exp.sriov_pf.vf);
> +    dev->exp.sriov_pf.vf = NULL;
> +}
>  
> -void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
> +bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
>                          const char *vfname, uint16_t vf_dev_id,
>                          uint16_t init_vfs, uint16_t total_vfs,
> -                        uint16_t vf_offset, uint16_t vf_stride)
> +                        uint16_t vf_offset, uint16_t vf_stride,
> +                        Error **errp)
>  {
> +    BusState *bus = qdev_get_parent_bus(&dev->qdev);
> +    int32_t devfn = dev->devfn + vf_offset;
>      uint8_t *cfg = dev->config + offset;
>      uint8_t *wmask;
>  
> @@ -36,7 +50,6 @@ void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
>                          offset, PCI_EXT_CAP_SRIOV_SIZEOF);
>      dev->exp.sriov_cap = offset;
>      dev->exp.sriov_pf.num_vfs = 0;
> -    dev->exp.sriov_pf.vfname = g_strdup(vfname);
>      dev->exp.sriov_pf.vf = NULL;
>  
>      pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset);
> @@ -69,13 +82,35 @@ void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
>      pci_set_word(wmask + PCI_SRIOV_SYS_PGSIZE, 0x553);
>  
>      qdev_prop_set_bit(&dev->qdev, "multifunction", true);
> +
> +    dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs);
> +
> +    for (uint16_t i = 0; i < total_vfs; i++) {
> +        PCIDevice *vf = pci_new(devfn, vfname);
> +        vf->exp.sriov_vf.pf = dev;
> +        vf->exp.sriov_vf.vf_number = i;
> +
> +        if (!qdev_realize(&vf->qdev, bus, errp)) {
> +            unrealize_vfs(dev, i);
> +            return false;
> +        }
> +
> +        /* set vid/did according to sr/iov spec - they are not used */
> +        pci_config_set_vendor_id(vf->config, 0xffff);
> +        pci_config_set_device_id(vf->config, 0xffff);
> +
> +        dev->exp.sriov_pf.vf[i] = vf;
> +        devfn += vf_stride;
> +    }
> +
> +    return true;
>  }
>  
>  void pcie_sriov_pf_exit(PCIDevice *dev)
>  {
> -    unregister_vfs(dev);
> -    g_free((char *)dev->exp.sriov_pf.vfname);
> -    dev->exp.sriov_pf.vfname = NULL;
> +    uint8_t *cfg = dev->config + dev->exp.sriov_cap;
> +
> +    unrealize_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF));
>  }
>  
>  void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
> @@ -141,38 +176,11 @@ void pcie_sriov_vf_register_bar(PCIDevice *dev, int 
> region_num,
>      }
>  }
>  
> -static PCIDevice *register_vf(PCIDevice *pf, int devfn, const char *name,
> -                              uint16_t vf_num)
> -{
> -    PCIDevice *dev = pci_new(devfn, name);
> -    dev->exp.sriov_vf.pf = pf;
> -    dev->exp.sriov_vf.vf_number = vf_num;
> -    PCIBus *bus = pci_get_bus(pf);
> -    Error *local_err = NULL;
> -
> -    qdev_realize(&dev->qdev, &bus->qbus, &local_err);
> -    if (local_err) {
> -        error_report_err(local_err);
> -        return NULL;
> -    }
> -
> -    /* set vid/did according to sr/iov spec - they are not used */
> -    pci_config_set_vendor_id(dev->config, 0xffff);
> -    pci_config_set_device_id(dev->config, 0xffff);
> -
> -    return dev;
> -}
> -
>  static void register_vfs(PCIDevice *dev)
>  {
>      uint16_t num_vfs;
>      uint16_t i;
>      uint16_t sriov_cap = dev->exp.sriov_cap;
> -    uint16_t vf_offset =
> -        pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_OFFSET);
> -    uint16_t vf_stride =
> -        pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_STRIDE);
> -    int32_t devfn = dev->devfn + vf_offset;
>  
>      assert(sriov_cap > 0);
>      num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
> @@ -180,18 +188,10 @@ static void register_vfs(PCIDevice *dev)
>          return;
>      }
>  
> -    dev->exp.sriov_pf.vf = g_new(PCIDevice *, num_vfs);
> -
>      trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
>                               PCI_FUNC(dev->devfn), num_vfs);
>      for (i = 0; i < num_vfs; i++) {
> -        dev->exp.sriov_pf.vf[i] = register_vf(dev, devfn,
> -                                              dev->exp.sriov_pf.vfname, i);
> -        if (!dev->exp.sriov_pf.vf[i]) {
> -            num_vfs = i;
> -            break;
> -        }
> -        devfn += vf_stride;
> +        pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
>      }
>      dev->exp.sriov_pf.num_vfs = num_vfs;
>  }
> @@ -204,16 +204,8 @@ static void unregister_vfs(PCIDevice *dev)
>      trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
>                                 PCI_FUNC(dev->devfn), num_vfs);
>      for (i = 0; i < num_vfs; i++) {
> -        Error *err = NULL;
> -        PCIDevice *vf = dev->exp.sriov_pf.vf[i];
> -        if (!object_property_set_bool(OBJECT(vf), "realized", false, &err)) {
> -            error_reportf_err(err, "Failed to unplug: ");
> -        }
> -        object_unparent(OBJECT(vf));
> -        object_unref(OBJECT(vf));
> +        pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
>      }
> -    g_free(dev->exp.sriov_pf.vf);
> -    dev->exp.sriov_pf.vf = NULL;
>      dev->exp.sriov_pf.num_vfs = 0;
>      pci_set_word(dev->config + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0);
>  }
> 
> -- 
> 2.43.0




reply via email to

[Prev in Thread] Current Thread [Next in Thread]