qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH V1 30/32] vfio-pci: save and restore


From: Steve Sistare
Subject: [PATCH V1 30/32] vfio-pci: save and restore
Date: Thu, 30 Jul 2020 08:14:34 -0700

Enable vfio-pci devices to be saved and restored across an exec restart
of qemu.

At vfio creation time, save the value of vfio container, group, and device
descriptors in the environment.

In cprsave, save the msi message area as part of vfio-pci vmstate, and
clear the close-on-exec flag for the vfio descriptors.  The flag is not
cleared earlier because the descriptors should not persist across misc
fork and exec calls that may be performed during normal operation.

On qemu restart, vfio_realize() finds the descriptor env vars, uses
the descriptors, and notes that the device is being reused.  Device and
iommu state is already configured, so operations in vfio_realize that
would modify the configuration are skipped for a reused device, including
vfio ioctl's and writes to PCI configuration space.  The result is that
vfio_realize constructs qemu data structures that reflect the current
state of the device.  However, the reconstruction is not complete until
cprload is called, and vfio_pci_post_load uses the msi data to rebuild
interrupt structures and attach the interrupts to the new KVM instance.
Lastly, vfio device reset is suppressed when the VM is started.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 hw/pci/pci.c                  |  4 ++
 hw/vfio/common.c              | 99 ++++++++++++++++++++++++++++++++++---------
 hw/vfio/pci.c                 | 79 ++++++++++++++++++++++++++++++++--
 hw/vfio/platform.c            |  2 +-
 include/hw/pci/pci.h          |  1 +
 include/hw/vfio/vfio-common.h |  4 +-
 migration/savevm.c            |  2 +-
 7 files changed, 163 insertions(+), 28 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 7343e00..c2e1509 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -291,6 +291,10 @@ static void pci_do_device_reset(PCIDevice *dev)
 {
     int r;
 
+    if (dev->reused) {
+        return;
+    }
+
     pci_device_deassert_intx(dev);
     assert(dev->irq_state == 0);
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 3335714..a51a093 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -37,6 +37,7 @@
 #include "sysemu/reset.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "qemu/cutils.h"
 
 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -299,6 +300,10 @@ static int vfio_dma_unmap(VFIOContainer *container,
         .size = size,
     };
 
+    if (container->reused) {
+        return 0;
+    }
+
     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
         /*
          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
@@ -336,6 +341,10 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
         .size = size,
     };
 
+    if (container->reused) {
+        return 0;
+    }
+
     if (!readonly) {
         map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
     }
@@ -1179,25 +1188,27 @@ static int vfio_init_container(VFIOContainer 
*container, int group_fd,
         return iommu_type;
     }
 
-    ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
-    if (ret) {
-        error_setg_errno(errp, errno, "Failed to set group container");
-        return -errno;
-    }
+    if (!container->reused) {
+        ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
+        if (ret) {
+            error_setg_errno(errp, errno, "Failed to set group container");
+            return -errno;
+        }
 
-    while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
-        if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
-            /*
-             * On sPAPR, despite the IOMMU subdriver always advertises v1 and
-             * v2, the running platform may not support v2 and there is no
-             * way to guess it until an IOMMU group gets added to the 
container.
-             * So in case it fails with v2, try v1 as a fallback.
-             */
-            iommu_type = VFIO_SPAPR_TCE_IOMMU;
-            continue;
+        while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
+            if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+                /*
+                 * On sPAPR, despite the IOMMU subdriver always advertises v1
+                 * and v2, the running platform may not support v2 and there is
+                 * no way to guess it until an IOMMU group gets added to the
+                 * container. So in case it fails with v2, try v1 as a 
fallback.
+                 */
+                iommu_type = VFIO_SPAPR_TCE_IOMMU;
+                continue;
+            }
+            error_setg_errno(errp, errno, "Failed to set iommu for container");
+            return -errno;
         }
-        error_setg_errno(errp, errno, "Failed to set iommu for container");
-        return -errno;
     }
 
     container->iommu_type = iommu_type;
@@ -1210,6 +1221,8 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
     VFIOContainer *container;
     int ret, fd;
     VFIOAddressSpace *space;
+    char name[40];
+    bool reused;
 
     space = vfio_get_address_space(as);
 
@@ -1254,7 +1267,13 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
         }
     }
 
-    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
+    snprintf(name, sizeof(name), "vfio_container_%d", group->groupid);
+    fd = getenv_fd(name);
+    reused = (fd >= 0);
+    if (fd < 0) {
+        fd = qemu_open("/dev/vfio/vfio", O_RDWR);
+    }
+
     if (fd < 0) {
         error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
         ret = -errno;
@@ -1272,6 +1291,8 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
     container = g_malloc0(sizeof(*container));
     container->space = space;
     container->fd = fd;
+    container->cid = group->groupid;
+    container->reused = reused;
     container->error = NULL;
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
@@ -1395,6 +1416,10 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
 
     container->initialized = true;
 
+    if (!reused) {
+        setenv_fd(name, fd);
+    }
+
     return 0;
 listener_release_exit:
     QLIST_REMOVE(group, container_next);
@@ -1418,6 +1443,7 @@ put_space_exit:
 static void vfio_disconnect_container(VFIOGroup *group)
 {
     VFIOContainer *container = group->container;
+    char name[40];
 
     QLIST_REMOVE(group, container_next);
     group->container = NULL;
@@ -1450,6 +1476,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
         }
 
         trace_vfio_disconnect_container(container->fd);
+        snprintf(name, sizeof(name), "vfio_container_%d", container->cid);
+        unsetenv_fd(name);
         close(container->fd);
         g_free(container);
 
@@ -1462,6 +1490,7 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, 
Error **errp)
     VFIOGroup *group;
     char path[32];
     struct vfio_group_status status = { .argsz = sizeof(status) };
+    bool reused;
 
     QLIST_FOREACH(group, &vfio_group_list, next) {
         if (group->groupid == groupid) {
@@ -1479,7 +1508,13 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, 
Error **errp)
     group = g_malloc0(sizeof(*group));
 
     snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
-    group->fd = qemu_open(path, O_RDWR);
+
+    group->fd = getenv_fd(path);
+    reused = (group->fd >= 0);
+    if (group->fd < 0) {
+        group->fd = qemu_open(path, O_RDWR);
+    }
+
     if (group->fd < 0) {
         error_setg_errno(errp, errno, "failed to open %s", path);
         goto free_group_exit;
@@ -1513,6 +1548,10 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, 
Error **errp)
 
     QLIST_INSERT_HEAD(&vfio_group_list, group, next);
 
+    if (!reused) {
+        setenv_fd(path, group->fd);
+    }
+
     return group;
 
 close_fd_exit:
@@ -1526,6 +1565,8 @@ free_group_exit:
 
 void vfio_put_group(VFIOGroup *group)
 {
+    char path[32];
+
     if (!group || !QLIST_EMPTY(&group->device_list)) {
         return;
     }
@@ -1537,6 +1578,8 @@ void vfio_put_group(VFIOGroup *group)
     vfio_disconnect_container(group);
     QLIST_REMOVE(group, next);
     trace_vfio_put_group(group->fd);
+    snprintf(path, sizeof(path), "/dev/vfio/%d", group->groupid);
+    unsetenv_fd(path);
     close(group->fd);
     g_free(group);
 
@@ -1546,12 +1589,18 @@ void vfio_put_group(VFIOGroup *group)
 }
 
 int vfio_get_device(VFIOGroup *group, const char *name,
-                    VFIODevice *vbasedev, Error **errp)
+                    VFIODevice *vbasedev, bool *reusedp, Error **errp)
 {
     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
     int ret, fd;
+    bool reused;
+
+    fd = getenv_fd(name);
+    reused = (fd >= 0);
+    if (fd < 0) {
+        fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+    }
 
-    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
     if (fd < 0) {
         error_setg_errno(errp, errno, "error getting device from group %d",
                          group->groupid);
@@ -1601,6 +1650,13 @@ int vfio_get_device(VFIOGroup *group, const char *name,
                           dev_info.num_irqs);
 
     vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
+
+    if (!reused) {
+        setenv_fd(name, fd);
+    }
+    if (reusedp) {
+        *reusedp = reused;
+    }
     return 0;
 }
 
@@ -1612,6 +1668,7 @@ void vfio_put_base_device(VFIODevice *vbasedev)
     QLIST_REMOVE(vbasedev, next);
     vbasedev->group = NULL;
     trace_vfio_put_base_device(vbasedev->fd);
+    unsetenv_fd(vbasedev->name);
     close(vbasedev->fd);
 }
 
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 2e561c0..5743807 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -49,6 +49,7 @@
 
 static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
 static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static const VMStateDescription vfio_pci_vmstate;
 
 /*
  * Disabling BAR mmaping can be slow, but toggling it around INTx can
@@ -1585,6 +1586,14 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, 
bool enabled)
     }
 }
 
+static void vfio_config_sync(VFIOPCIDevice *vdev, uint32_t offset, size_t len)
+{
+    if (pread(vdev->vbasedev.fd, vdev->pdev.config + offset, len,
+          vdev->config_offset + offset) != len) {
+        error_report("vfio_config_sync pread failed");
+    }
+}
+
 static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
@@ -1626,6 +1635,7 @@ static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
 {
     VFIOBAR *bar = &vdev->bars[nr];
     char *name;
+    PCIDevice *pdev = &vdev->pdev;
 
     if (!bar->size) {
         return;
@@ -1646,6 +1656,9 @@ static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
     }
 
     pci_register_bar(&vdev->pdev, nr, bar->type, bar->mr);
+    if (pdev->reused) {
+        vfio_config_sync(vdev, pci_bar(pdev, nr), 8);
+    }
 }
 
 static void vfio_bars_register(VFIOPCIDevice *vdev)
@@ -2805,7 +2818,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         goto error;
     }
 
-    ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
+    ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev,
+                          &pdev->reused, errp);
     if (ret) {
         vfio_put_group(group);
         goto error;
@@ -2972,9 +2986,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
                                              vfio_intx_routing_notifier);
         vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
-        ret = vfio_intx_enable(vdev, errp);
-        if (ret) {
-            goto out_deregister;
+        if (!pdev->reused) {
+            ret = vfio_intx_enable(vdev, errp);
+            if (ret) {
+                goto out_deregister;
+            }
         }
     }
 
@@ -3017,6 +3033,11 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
     vfio_register_req_notifier(vdev);
     vfio_setup_resetfn_quirk(vdev);
 
+    vfio_config_sync(vdev, pdev->msix_cap + PCI_MSIX_FLAGS, 2);
+    if (pdev->reused) {
+        pci_update_mappings(pdev);
+    }
+
     return;
 
 out_deregister:
@@ -3080,6 +3101,10 @@ static void vfio_pci_reset(DeviceState *dev)
 {
     VFIOPCIDevice *vdev = PCI_VFIO(dev);
 
+    if (vdev->pdev.reused) {
+        return;
+    }
+
     trace_vfio_pci_reset(vdev->vbasedev.name);
 
     vfio_pci_pre_reset(vdev);
@@ -3182,6 +3207,51 @@ static Property vfio_pci_dev_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
+static int vfio_pci_post_load(void *opaque, int version_id)
+{
+    int vector;
+    MSIMessage msg;
+    Error *err = 0;
+    VFIOPCIDevice *vdev = opaque;
+    PCIDevice *pdev = &vdev->pdev;
+
+    if (msix_enabled(pdev)) {
+        vfio_msix_enable(vdev);
+        pdev->msix_function_masked = false;
+
+        for (vector = 0; vector < vdev->pdev.msix_entries_nr; vector++) {
+            if (!msix_is_masked(pdev, vector)) {
+                msg = msix_get_message(pdev, vector);
+                vfio_msix_vector_use(pdev, vector, msg);
+            }
+        }
+
+    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
+        vfio_intx_enable(vdev, &err);
+        if (err) {
+            error_report_err(err);
+        }
+    }
+
+    vdev->vbasedev.group->container->reused = false;
+    vdev->pdev.reused = false;
+
+    return 0;
+}
+
+static const VMStateDescription vfio_pci_vmstate = {
+    .name = "vfio-pci",
+    .unmigratable = 1,
+    .mode_mask = VMS_RESTART,
+    .version_id = 0,
+    .minimum_version_id = 0,
+    .post_load = vfio_pci_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_MSIX(pdev, VFIOPCIDevice),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
@@ -3189,6 +3259,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, 
void *data)
 
     dc->reset = vfio_pci_reset;
     device_class_set_props(dc, vfio_pci_dev_properties);
+    dc->vmsd = &vfio_pci_vmstate;
     dc->desc = "VFIO-based PCI device assignment";
     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
     pdc->realize = vfio_realize;
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index ac2cefc..e6e1a5d 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -592,7 +592,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, 
Error **errp)
             return -EBUSY;
         }
     }
-    ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
+    ret = vfio_get_device(group, vbasedev->name, vbasedev, 0, errp);
     if (ret) {
         vfio_put_group(group);
         return ret;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index bd07c86..c926a24 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -358,6 +358,7 @@ struct PCIDevice {
 
     /* ID of standby device in net_failover pair */
     char *failover_pair_id;
+    bool reused;
 };
 
 void pci_register_bar(PCIDevice *pci_dev, int region_num,
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index c78f3ff..4e2a332 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -73,6 +73,8 @@ typedef struct VFIOContainer {
     unsigned iommu_type;
     Error *error;
     bool initialized;
+    bool reused;
+    int cid;
     unsigned long pgsizes;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
@@ -177,7 +179,7 @@ void vfio_reset_handler(void *opaque);
 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
 void vfio_put_group(VFIOGroup *group);
 int vfio_get_device(VFIOGroup *group, const char *name,
-                    VFIODevice *vbasedev, Error **errp);
+                    VFIODevice *vbasedev, bool *reused, Error **errp);
 
 extern const MemoryRegionOps vfio_region_ops;
 typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
diff --git a/migration/savevm.c b/migration/savevm.c
index 881dc13..2606cf0 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1568,7 +1568,7 @@ static int qemu_savevm_state(QEMUFile *f, VMStateMode 
mode, Error **errp)
         return -EINVAL;
     }
 
-    if (migrate_use_block()) {
+    if ((mode & (VMS_SNAPSHOT | VMS_MIGRATE)) && migrate_use_block()) {
         error_setg(errp, "Block migration and snapshots are incompatible");
         return -EINVAL;
     }
-- 
1.8.3.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]