[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v4 13/14] vfio-user: register handlers to facilitate migration
From: |
Jagannathan Raman |
Subject: |
[PATCH v4 13/14] vfio-user: register handlers to facilitate migration |
Date: |
Wed, 15 Dec 2021 10:35:37 -0500 |
Store and load the device's state during migration. Use libvfio-user's
handlers for this purpose.
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
include/migration/vmstate.h | 2 +
migration/savevm.h | 2 +
hw/remote/vfio-user-obj.c | 323 ++++++++++++++++++++++++++++++++++++
migration/savevm.c | 73 ++++++++
migration/vmstate.c | 19 +++
5 files changed, 419 insertions(+)
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 017c03675c..68bea576ea 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -1165,6 +1165,8 @@ extern const VMStateInfo vmstate_info_qlist;
#define VMSTATE_END_OF_LIST() \
{}
+uint64_t vmstate_vmsd_size(PCIDevice *pci_dev);
+
int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd,
void *opaque, int version_id);
int vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd,
diff --git a/migration/savevm.h b/migration/savevm.h
index 6461342cb4..8007064ff2 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -67,5 +67,7 @@ int qemu_loadvm_state_main(QEMUFile *f,
MigrationIncomingState *mis);
int qemu_load_device_state(QEMUFile *f);
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy, bool inactivate_disks);
+int qemu_remote_savevm(QEMUFile *f, DeviceState *dev);
+int qemu_remote_loadvm(QEMUFile *f);
#endif
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index 2b28d465d5..cc0b82445b 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -53,6 +53,11 @@
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/remote/iohub.h"
+#include "migration/qemu-file.h"
+#include "migration/savevm.h"
+#include "migration/vmstate.h"
+#include "migration/global_state.h"
+#include "block/block.h"
#define TYPE_VFU_OBJECT "x-vfio-user-server"
OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
@@ -82,6 +87,35 @@ struct VfuObject {
PCIDevice *pci_dev;
int vfu_poll_fd;
+
+ /*
+ * vfu_mig_buf holds the migration data. In the remote server, this
+ * buffer replaces the role of an IO channel which links the source
+ * and the destination.
+ *
+ * Whenever the client QEMU process initiates migration, the remote
+ * server gets notified via libvfio-user callbacks. The remote server
+ * sets up a QEMUFile object using this buffer as backend. The remote
+ * server passes this object to its migration subsystem, which slurps
+ * the VMSD of the device ('devid' above) referenced by this object
+ * and stores the VMSD in this buffer.
+ *
+ * The client subsequently asks the remote server for any data that
+ * needs to be moved over to the destination via libvfio-user
+ * library's vfu_migration_callbacks_t callbacks. The remote hands
+ * over this buffer as data at this time.
+ *
+ * A reverse of this process happens at the destination.
+ */
+ uint8_t *vfu_mig_buf;
+
+ uint64_t vfu_mig_buf_size;
+
+ uint64_t vfu_mig_buf_pending;
+
+ QEMUFile *vfu_mig_file;
+
+ vfu_migr_state_t vfu_state;
};
static GHashTable *vfu_object_dev_table;
@@ -125,6 +159,272 @@ static void vfu_object_set_device(Object *obj, const char
*str, Error **errp)
vfu_object_init_ctx(o, errp);
}
+/**
+ * Migration helper functions
+ *
+ * vfu_mig_buf_read & vfu_mig_buf_write are used by QEMU's migration
+ * subsystem - qemu_remote_loadvm & qemu_remote_savevm. loadvm/savevm
+ * call these functions via QEMUFileOps to load/save the VMSD of a
+ * device into vfu_mig_buf
+ *
+ */
+/**
+ * vfu_mig_buf_read - QEMUFileOps.get_buffer handler backed by vfu_mig_buf
+ *
+ * @opaque: the VfuObject that owns the migration buffer
+ * @buf: destination supplied by the caller
+ * @pos: byte offset into vfu_mig_buf to start reading from
+ * @size: maximum number of bytes to copy
+ * @errp: unused
+ *
+ * Returns the number of bytes copied into @buf. Returns 0 when @pos is
+ * negative or at/after the end of the buffered data.
+ */
+static ssize_t vfu_mig_buf_read(void *opaque, uint8_t *buf, int64_t pos,
+                                size_t size, Error **errp)
+{
+    VfuObject *o = opaque;
+    uint64_t avail;
+
+    /*
+     * Nothing to read: return before touching vfu_mig_buf, which may be
+     * NULL, and before forming a pointer outside of the buffer.
+     */
+    if (pos < 0 || (uint64_t)pos >= o->vfu_mig_buf_size) {
+        return 0;
+    }
+
+    /* Clamp against the remaining bytes so that pos + size cannot wrap */
+    avail = o->vfu_mig_buf_size - (uint64_t)pos;
+    if (size > avail) {
+        size = avail;
+    }
+
+    memcpy(buf, o->vfu_mig_buf + pos, size);
+
+    return size;
+}
+
+/**
+ * vfu_mig_buf_write - QEMUFileOps.writev_buffer handler backed by
+ * vfu_mig_buf
+ *
+ * @opaque: the VfuObject that owns the migration buffer
+ * @iov: scatter/gather list holding the data to store
+ * @iovcnt: number of elements in @iov
+ * @pos: byte offset in vfu_mig_buf at which the data is stored
+ * @errp: set when @pos is invalid
+ *
+ * The data is stored at @pos. The buffer is grown when the write ends
+ * past the current end, and vfu_mig_buf_pending is increased by the
+ * amount the buffer grew (equal to the bytes written for the usual
+ * append-at-end case).
+ *
+ * Returns the number of bytes stored, or -EINVAL if @pos is negative or
+ * @pos plus the data length overflows.
+ */
+static ssize_t vfu_mig_buf_write(void *opaque, struct iovec *iov, int iovcnt,
+                                 int64_t pos, Error **errp)
+{
+    VfuObject *o = opaque;
+    size_t total = iov_size(iov, iovcnt);
+    uint64_t cur, end;
+    int i;
+
+    if (pos < 0 || total > UINT64_MAX - (uint64_t)pos) {
+        error_setg(errp, "vfu: invalid migration buffer offset %" PRId64,
+                   pos);
+        return -EINVAL;
+    }
+
+    cur = pos;
+    end = cur + total;
+
+    if (end > o->vfu_mig_buf_size) {
+        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
+
+        /* Don't expose uninitialized memory if the write leaves a gap */
+        if (cur > o->vfu_mig_buf_size) {
+            memset(o->vfu_mig_buf + o->vfu_mig_buf_size, 0,
+                   cur - o->vfu_mig_buf_size);
+        }
+
+        o->vfu_mig_buf_pending += end - o->vfu_mig_buf_size;
+        o->vfu_mig_buf_size = end;
+    }
+
+    /*
+     * Copy to the requested position. The buffer is guaranteed to hold
+     * [pos, end) at this point, so the copy can't run past the allocation.
+     */
+    for (i = 0; i < iovcnt; i++) {
+        memcpy(o->vfu_mig_buf + cur, iov[i].iov_base, iov[i].iov_len);
+        cur += iov[i].iov_len;
+    }
+
+    return total;
+}
+
+/*
+ * QEMUFileOps.shut_down handler: release the migration buffer and reset
+ * the size/pending bookkeeping. The @rd, @wr and @errp arguments are not
+ * used. Always returns 0.
+ */
+static int vfu_mig_buf_shutdown(void *opaque, bool rd, bool wr, Error **errp)
+{
+    VfuObject *vfu = opaque;
+
+    g_free(vfu->vfu_mig_buf);
+    vfu->vfu_mig_buf = NULL;
+
+    vfu->vfu_mig_buf_size = 0;
+    vfu->vfu_mig_buf_pending = 0;
+
+    return 0;
+}
+
+/* QEMUFile backend used when saving: data is written into vfu_mig_buf */
+static const QEMUFileOps vfu_mig_fops_save = {
+    .shut_down     = vfu_mig_buf_shutdown,
+    .writev_buffer = vfu_mig_buf_write,
+};
+
+/* QEMUFile backend used when loading: data is read out of vfu_mig_buf */
+static const QEMUFileOps vfu_mig_fops_load = {
+    .shut_down  = vfu_mig_buf_shutdown,
+    .get_buffer = vfu_mig_buf_read,
+};
+
+/**
+ * handlers for vfu_migration_callbacks_t
+ *
+ * The libvfio-user library accesses these handlers to drive the migration
+ * at the remote end, and also to transport the data stored in vfu_mig_buf
+ *
+ */
+/*
+ * Handle the transition to VFU_MIGR_STATE_STOP_AND_COPY: save the state
+ * of the device into vfu_mig_buf through a QEMUFile that uses
+ * vfu_mig_fops_save as backend.
+ *
+ * On failure the error is reported, and the QEMUFile is shut down (which
+ * drops the buffer) and closed so that it isn't leaked.
+ */
+static void vfu_mig_state_stop_and_copy(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    int ret;
+
+    if (!o->vfu_mig_file) {
+        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_save, false);
+    }
+
+    ret = qemu_remote_savevm(o->vfu_mig_file, DEVICE(o->pci_dev));
+    if (ret) {
+        error_report("vfu: failed to save device state (%d)", ret);
+        qemu_file_shutdown(o->vfu_mig_file);
+        /* shutdown alone doesn't free the QEMUFile */
+        qemu_fclose(o->vfu_mig_file);
+        o->vfu_mig_file = NULL;
+        return;
+    }
+
+    qemu_fflush(o->vfu_mig_file);
+}
+
+/*
+ * Handle the transition to VFU_MIGR_STATE_RUNNING: load the device state
+ * stored in vfu_mig_buf, release the QEMUFile and the buffer, and - once
+ * this has happened for as many devices as the class's nr_devs -
+ * invalidate the block layer caches and start the VM.
+ *
+ * A failure to load the device state aborts the process via error_abort.
+ */
+static void vfu_mig_state_running(vfu_ctx_t *vfu_ctx)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(OBJECT(o));
+    static int migrated_devs;
+    Error *local_err = NULL;
+    int ret;
+
+    /**
+     * TODO: move to VFU_MIGR_STATE_RESUME handler. Presently, the
+     * VMSD data from source is not available at RESUME state.
+     * Working on a fix for this.
+     */
+    if (!o->vfu_mig_file) {
+        o->vfu_mig_file = qemu_fopen_ops(o, &vfu_mig_fops_load, false);
+    }
+
+    ret = qemu_remote_loadvm(o->vfu_mig_file);
+    if (ret) {
+        /* error_abort: reports the message and aborts */
+        error_setg(&error_abort, "vfu: failed to restore device state");
+        return;
+    }
+
+    /*
+     * Shutting down drops vfu_mig_buf; closing frees the QEMUFile itself,
+     * which would otherwise be leaked when the pointer is cleared.
+     */
+    qemu_file_shutdown(o->vfu_mig_file);
+    qemu_fclose(o->vfu_mig_file);
+    o->vfu_mig_file = NULL;
+
+    /* VFU_MIGR_STATE_RUNNING begins here */
+    if (++migrated_devs == k->nr_devs) {
+        bdrv_invalidate_cache_all(&local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return;
+        }
+
+        vm_start();
+    }
+}
+
+/*
+ * Handle the transition to VFU_MIGR_STATE_STOP. The VM is stopped and all
+ * block devices are inactivated once the number of calls reaches the
+ * class's nr_devs.
+ */
+static void vfu_mig_state_stop(vfu_ctx_t *vfu_ctx)
+{
+    static int stopped_devs;
+    VfuObject *vfu = vfu_get_private(vfu_ctx);
+    VfuObjectClass *klass = VFU_OBJECT_GET_CLASS(OBJECT(vfu));
+
+    stopped_devs++;
+    if (stopped_devs != klass->nr_devs) {
+        return;
+    }
+
+    /*
+     * Note: bdrv_inactivate_all() is not the best approach.
+     *
+     * Ideally the block devices (if any) that are indirectly linked to
+     * each migrated device (for example via a scsi-hd device) would be
+     * identified and inactivated individually. That is essential when
+     * the server runs in a storage daemon mode with devices from
+     * different VMs.
+     *
+     * That capability doesn't exist yet, so all devices are inactivated
+     * together once the migration has completed.
+     */
+    vm_stop(RUN_STATE_PAUSED);
+    bdrv_inactivate_all();
+}
+
+/*
+ * libvfio-user 'transition' callback: run the handler for the requested
+ * migration state and record it in vfu_state. A request for the state the
+ * object is already in is ignored. Always returns 0.
+ */
+static int vfu_mig_transition(vfu_ctx_t *vfu_ctx, vfu_migr_state_t state)
+{
+    VfuObject *vfu = vfu_get_private(vfu_ctx);
+
+    if (state == vfu->vfu_state) {
+        return 0;
+    }
+
+    switch (state) {
+    case VFU_MIGR_STATE_RESUME:
+    case VFU_MIGR_STATE_PRE_COPY:
+        /* nothing to do for these states */
+        break;
+    case VFU_MIGR_STATE_STOP:
+        vfu_mig_state_stop(vfu_ctx);
+        break;
+    case VFU_MIGR_STATE_STOP_AND_COPY:
+        vfu_mig_state_stop_and_copy(vfu_ctx);
+        break;
+    case VFU_MIGR_STATE_RUNNING:
+        /* the device state is only loaded while the VM isn't running */
+        if (runstate_is_running()) {
+            break;
+        }
+        vfu_mig_state_running(vfu_ctx);
+        break;
+    default:
+        warn_report("vfu: Unknown migration state %d", state);
+        break;
+    }
+
+    vfu->vfu_state = state;
+
+    return 0;
+}
+
+/*
+ * libvfio-user 'get_pending_bytes' callback: report the value of
+ * vfu_mig_buf_pending.
+ */
+static uint64_t vfu_mig_get_pending_bytes(vfu_ctx_t *vfu_ctx)
+{
+    const VfuObject *vfu = vfu_get_private(vfu_ctx);
+    uint64_t pending = vfu->vfu_mig_buf_pending;
+
+    return pending;
+}
+
+/*
+ * libvfio-user 'prepare_data' callback: report the whole of vfu_mig_buf,
+ * i.e. offset 0 and a size of vfu_mig_buf_size. Either output pointer may
+ * be NULL, in which case it is skipped. Always returns 0.
+ */
+static int vfu_mig_prepare_data(vfu_ctx_t *vfu_ctx, uint64_t *offset,
+                                uint64_t *size)
+{
+    VfuObject *vfu = vfu_get_private(vfu_ctx);
+
+    if (size != NULL) {
+        *size = vfu->vfu_mig_buf_size;
+    }
+
+    if (offset != NULL) {
+        *offset = 0;
+    }
+
+    return 0;
+}
+
+/*
+ * libvfio-user 'read_data' callback: copy up to @size bytes of
+ * vfu_mig_buf, starting at @offset, into @buf.
+ *
+ * A request that extends past the end of the buffer is truncated (with a
+ * warning). vfu_mig_buf_pending is reduced by the number of bytes copied,
+ * saturating at zero so that re-reading data can't make it wrap around.
+ *
+ * Returns the number of bytes copied, or -1 with errno set to EINVAL if
+ * @offset lies beyond the end of the buffer.
+ */
+static ssize_t vfu_mig_read_data(vfu_ctx_t *vfu_ctx, void *buf,
+                                 uint64_t size, uint64_t offset)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint64_t avail;
+
+    if (offset > o->vfu_mig_buf_size) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    /* Compare against the remaining bytes: offset + size could wrap */
+    avail = o->vfu_mig_buf_size - offset;
+    if (size > avail) {
+        warn_report("vfu: buffer overflow - check pending_bytes");
+        size = avail;
+    }
+
+    /* vfu_mig_buf may be NULL when there is nothing to copy */
+    if (size) {
+        memcpy(buf, o->vfu_mig_buf + offset, size);
+    }
+
+    if (size > o->vfu_mig_buf_pending) {
+        o->vfu_mig_buf_pending = 0;
+    } else {
+        o->vfu_mig_buf_pending -= size;
+    }
+
+    return size;
+}
+
+/*
+ * libvfio-user 'write_data' callback: store @size bytes from @data into
+ * vfu_mig_buf at @offset, growing the buffer as needed.
+ *
+ * @offset and @size come from the client, so their sum is checked for
+ * overflow before it is used as the new buffer size. If the write starts
+ * past the current end of the buffer, the gap is zero-filled.
+ *
+ * Returns the number of bytes stored, or -1 with errno set to EINVAL if
+ * @offset + @size overflows.
+ */
+static ssize_t vfu_mig_write_data(vfu_ctx_t *vfu_ctx, void *data,
+                                  uint64_t size, uint64_t offset)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint64_t end;
+
+    if (size > UINT64_MAX - offset) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    end = offset + size;
+
+    if (end > o->vfu_mig_buf_size) {
+        o->vfu_mig_buf = g_realloc(o->vfu_mig_buf, end);
+
+        /* Don't leave uninitialized bytes between the old end and offset */
+        if (offset > o->vfu_mig_buf_size) {
+            memset(o->vfu_mig_buf + o->vfu_mig_buf_size, 0,
+                   offset - o->vfu_mig_buf_size);
+        }
+
+        o->vfu_mig_buf_size = end;
+    }
+
+    /* vfu_mig_buf may be NULL when there is nothing to copy */
+    if (size) {
+        memcpy(o->vfu_mig_buf + offset, data, size);
+    }
+
+    return size;
+}
+
+/*
+ * libvfio-user 'data_written' callback. Nothing is done with @count here;
+ * the function always returns 0.
+ */
+static int vfu_mig_data_written(vfu_ctx_t *vfu_ctx, uint64_t count)
+{
+ return 0;
+}
+
+/* Migration callbacks registered with libvfio-user for this object */
+static const vfu_migration_callbacks_t vfu_mig_cbs = {
+    .version = VFU_MIGR_CALLBACKS_VERS,
+
+    /* state changes */
+    .transition = vfu_mig_transition,
+
+    /* source side: hand the saved data to the client */
+    .get_pending_bytes = vfu_mig_get_pending_bytes,
+    .prepare_data = vfu_mig_prepare_data,
+    .read_data = vfu_mig_read_data,
+
+    /* destination side: receive the data from the client */
+    .write_data = vfu_mig_write_data,
+    .data_written = vfu_mig_data_written,
+};
+
static void vfu_object_ctx_run(void *opaque)
{
VfuObject *o = opaque;
@@ -425,6 +725,7 @@ static void vfu_object_init_ctx(VfuObject *o, Error **errp)
ERRP_GUARD();
DeviceState *dev = NULL;
vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
+ uint64_t migr_regs_size, migr_size;
int ret;
if (o->vfu_ctx || !o->socket || !o->device ||
@@ -497,6 +798,26 @@ static void vfu_object_init_ctx(VfuObject *o, Error **errp)
goto fail;
}
+ migr_regs_size = vfu_get_migr_register_area_size();
+ migr_size = migr_regs_size + vmstate_vmsd_size(o->pci_dev);
+
+ ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_MIGR_REGION_IDX,
+ migr_size, NULL,
+ VFU_REGION_FLAG_RW, NULL, 0, -1, 0);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to register migration BAR %s- %s",
+ o->device, strerror(errno));
+ goto fail;
+ }
+
+ ret = vfu_setup_device_migration_callbacks(o->vfu_ctx, &vfu_mig_cbs,
+ migr_regs_size);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to setup migration %s- %s",
+ o->device, strerror(errno));
+ goto fail;
+ }
+
ret = vfu_realize_ctx(o->vfu_ctx);
if (ret < 0) {
error_setg(errp, "vfu: Failed to realize device %s- %s",
@@ -542,6 +863,8 @@ static void vfu_object_init(Object *obj)
}
o->vfu_poll_fd = -1;
+
+ o->vfu_state = VFU_MIGR_STATE_STOP;
}
static void vfu_object_finalize(Object *obj)
diff --git a/migration/savevm.c b/migration/savevm.c
index d59e976d50..69b7ea8b09 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1605,6 +1605,49 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
return ret;
}
+/*
+ * Look up the SaveStateEntry whose opaque pointer is @dev in
+ * savevm_state.handlers. Returns NULL if there is no such entry.
+ */
+static SaveStateEntry *find_se_from_dev(DeviceState *dev)
+{
+    SaveStateEntry *cur;
+    SaveStateEntry *match = NULL;
+
+    QTAILQ_FOREACH(cur, &savevm_state.handlers, entry) {
+        if (cur->opaque == dev) {
+            match = cur;
+            break;
+        }
+    }
+
+    return match;
+}
+
+/*
+ * Save the state of a single device, @dev, into @f as one
+ * QEMU_VM_SECTION_FULL section followed by QEMU_VM_EOF.
+ *
+ * Returns -ENODEV if @dev has no SaveStateEntry, the error from
+ * vmstate_save() if saving fails (the error is also set on @f), and 0
+ * otherwise - including when the entry has no VMSD or doesn't need to be
+ * saved, in which case nothing is written.
+ */
+int qemu_remote_savevm(QEMUFile *f, DeviceState *dev)
+{
+    SaveStateEntry *entry = find_se_from_dev(dev);
+    int err;
+
+    if (entry == NULL) {
+        return -ENODEV;
+    }
+
+    if (entry->vmsd == NULL) {
+        return 0;
+    }
+
+    if (!vmstate_save_needed(entry->vmsd, entry->opaque)) {
+        return 0;
+    }
+
+    save_section_header(f, entry, QEMU_VM_SECTION_FULL);
+
+    err = vmstate_save(f, entry, NULL);
+    if (err != 0) {
+        qemu_file_set_error(f, err);
+        return err;
+    }
+
+    save_section_footer(f, entry);
+
+    qemu_put_byte(f, QEMU_VM_EOF);
+    qemu_fflush(f);
+
+    return 0;
+}
+
void qemu_savevm_live_state(QEMUFile *f)
{
/* save QEMU_VM_SECTION_END section */
@@ -2445,6 +2488,36 @@ qemu_loadvm_section_start_full(QEMUFile *f,
MigrationIncomingState *mis)
return 0;
}
+/*
+ * Load device state from @f. The stream is expected to consist of
+ * QEMU_VM_SECTION_FULL sections terminated by QEMU_VM_EOF.
+ *
+ * Returns 0 once QEMU_VM_EOF is reached, the QEMUFile error if reading
+ * fails, the error from qemu_loadvm_section_start_full() if a section
+ * can't be loaded, or -EINVAL for any other section type.
+ */
+int qemu_remote_loadvm(QEMUFile *f)
+{
+    uint8_t section_type;
+    int ret;
+
+    while (true) {
+        section_type = qemu_get_byte(f);
+
+        ret = qemu_file_get_error(f);
+        if (ret) {
+            return ret;
+        }
+
+        switch (section_type) {
+        case QEMU_VM_SECTION_FULL:
+            ret = qemu_loadvm_section_start_full(f, NULL);
+            if (ret < 0) {
+                /*
+                 * Return rather than break: a break would only leave the
+                 * switch, and the loop would carry on parsing the stream
+                 * after a failed section.
+                 */
+                return ret;
+            }
+            break;
+        case QEMU_VM_EOF:
+            return 0;
+        default:
+            return -EINVAL;
+        }
+    }
+}
+
static int
qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
{
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 05f87cdddc..83f8562792 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -63,6 +63,25 @@ static int vmstate_size(void *opaque, const VMStateField
*field)
return size;
}
+/*
+ * Sum vmstate_size() over the top-level fields of the VMSD of
+ * @pci_dev's device class. Returns 0 if the class has no VMSD.
+ */
+uint64_t vmstate_vmsd_size(PCIDevice *pci_dev)
+{
+    DeviceClass *klass = DEVICE_GET_CLASS(DEVICE(pci_dev));
+    const VMStateField *fld;
+    uint64_t total = 0;
+
+    if (klass->vmsd == NULL) {
+        return 0;
+    }
+
+    /* the field list ends at the first entry without a name */
+    for (fld = klass->vmsd->fields; fld != NULL && fld->name != NULL; fld++) {
+        total += vmstate_size(pci_dev, fld);
+    }
+
+    return total;
+}
+
static void vmstate_handle_alloc(void *ptr, const VMStateField *field,
void *opaque)
{
--
2.20.1
- [PATCH v4 10/14] vfio-user: handle PCI BAR accesses, (continued)
[PATCH v4 12/14] vfio-user: handle device interrupts, Jagannathan Raman, 2021/12/15
[PATCH v4 13/14] vfio-user: register handlers to facilitate migration,
Jagannathan Raman <=
[PATCH v4 14/14] vfio-user: avocado tests for vfio-user, Jagannathan Raman, 2021/12/15
[PATCH v4 04/14] vfio-user: define vfio-user-server object, Jagannathan Raman, 2021/12/15