From: Hyman Huang
Subject: Re: [PATCH v3 7/7] migration/dirtyrate: implement dirty-ring dirtyrate calculation
Date: Fri, 11 Jun 2021 22:05:22 +0800
User-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Thunderbird/78.11.0



On 2021/6/8 2:36, Peter Xu wrote:
On Mon, Jun 07, 2021 at 09:15:20AM +0800, huangy81@chinatelecom.cn wrote:
From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>

Use the dirty ring feature to implement dirty rate calculation.
To enable it, set the vcpu option to true in calc-dirty-rate.

Add per_vcpu as a mandatory option of calc_dirty_rate to calculate
the dirty rate per vCPU, and use the HMP command:
(qemu) calc_dirty_rate 1 on

Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
---
  hmp-commands.hx        |   7 +-
  migration/dirtyrate.c  | 226 ++++++++++++++++++++++++++++++++++++++---
  migration/trace-events |   5 +
  3 files changed, 220 insertions(+), 18 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 84dcc3aae6..cc24ab2ab1 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1736,8 +1736,9 @@ ERST
{
          .name       = "calc_dirty_rate",
-        .args_type  = "second:l,sample_pages_per_GB:l?",
-        .params     = "second [sample_pages_per_GB]",
-        .help       = "start a round of guest dirty rate measurement",
+        .args_type  = "second:l,per_vcpu:b,sample_pages_per_GB:l?",

How about "dirty-ring:-r"?  Then it's: "(qemu) calc_dirty_rate -r 10".  It can
still be a bool in HMP even if it's a "*mode" in qmp.  We can further make "-l"
for dirty logging (if we want that at last) and make two flags exclusive.
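
A rough sketch of what the hx entry could become (untested; the flag name is just
an example):

            .name       = "calc_dirty_rate",
            .args_type  = "second:l,dirty_ring:-r,sample_pages_per_GB:l?",
            .params     = "second [-r] [sample_pages_per_GB]",
            .help       = "start a round of guest dirty rate measurement "
                          "(use -r to calculate per vcpu with the dirty ring)",
            .cmd        = hmp_calc_dirty_rate,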

+        .params     = "second on|off [sample_pages_per_GB]",
+        .help       = "start a round of guest dirty rate measurement, "
+                      "calculate for vcpu use on|off",
          .cmd        = hmp_calc_dirty_rate,
      },
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index 055145c24c..e432118f49 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -16,6 +16,9 @@
  #include "cpu.h"
  #include "exec/ramblock.h"
  #include "qemu/rcu_queue.h"
+#include "qemu/main-loop.h"
+#include "sysemu/kvm.h"
+#include "sysemu/runstate.h"
  #include "qapi/qapi-commands-migration.h"
  #include "ram.h"
  #include "trace.h"
@@ -23,9 +26,38 @@
  #include "monitor/hmp.h"
  #include "monitor/monitor.h"
  #include "qapi/qmp/qdict.h"
+#include "exec/memory.h"
+
+typedef enum {
+    CALC_NONE = 0,
+    CALC_DIRTY_RING,
+    CALC_SAMPLE_PAGES,
+} CalcMethod;
+
+typedef struct DirtyPageRecord {
+    int64_t start_pages;
+    int64_t end_pages;

Why not uint64_t?  Note that we can also detect overflows using end_pages <
start_pages when needed, but imho we don't even need to worry about it - since
even if overflowed, "end - start" will still generate the right value..
Yeah, it's more efficient. I'll change the type in the next version.
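
Just to double-check the wrap-around point, a tiny standalone example with made-up
values:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t start_pages = UINT64_MAX - 5; /* counter just before wrapping */
        uint64_t end_pages = 10;               /* counter after 16 increments  */

        /* unsigned subtraction is modulo 2^64, so the delta is still right */
        assert(end_pages - start_pages == 16);
        return 0;
    }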

+} DirtyPageRecord;
+
+static DirtyPageRecord *dirty_pages;
static int CalculatingState = DIRTY_RATE_STATUS_UNSTARTED;
  static struct DirtyRateStat DirtyStat;
+static CalcMethod last_method = CALC_NONE;

How about simply naming it "dirty_rate_method", as it's "current", not "last"?
Yeah!

+bool register_powerdown_callback = false;
+
+static void dirtyrate_powerdown_req(Notifier *n, void *opaque)
+{
+    if (last_method == CALC_DIRTY_RING) {
+        g_free(DirtyStat.method.vcpu.rates);
+        DirtyStat.method.vcpu.rates = NULL;
+    }
+    trace_dirtyrate_powerdown_callback();
+}

In the cover letter, you did mention this as "add memory free callback to
prevent memory leaking" but I didn't really follow..

If the VM quits, QEMU quits too, and things get freed anyway (by the OS)?
OK, I added this callback just to make sure QEMU frees the struct explicitly. But it can't cover many scenarios anyway, e.g. when the QEMU process is killed the callback never runs,
so letting the OS handle the freeing is the wiser choice.

+
+static Notifier dirtyrate_powerdown_notifier = {
+    .notify = dirtyrate_powerdown_req
+};
static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
  {
@@ -72,6 +104,7 @@ static struct DirtyRateInfo *query_dirty_rate_info(void)
  {
      int64_t dirty_rate = DirtyStat.dirty_rate;
      struct DirtyRateInfo *info = g_malloc0(sizeof(DirtyRateInfo));
+    DirtyRateVcpuList *head = NULL, **tail = &head;
if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURED) {
          info->has_dirty_rate = true;
@@ -81,7 +114,22 @@ static struct DirtyRateInfo *query_dirty_rate_info(void)
      info->status = CalculatingState;
      info->start_time = DirtyStat.start_time;
      info->calc_time = DirtyStat.calc_time;
-    info->sample_pages = DirtyStat.sample_pages;
+
+    if (last_method == CALC_DIRTY_RING) {
+        int i = 0;
+        info->per_vcpu = true;
+        info->has_vcpu_dirty_rate = true;
+        for (i = 0; i < DirtyStat.method.vcpu.nvcpu; i++) {

I would also suggest not using "vcpu" as the field name; maybe "dirty_ring" too?

And I think we can omit the "method" level as well, the compiler can figure it out
(same for the other places)?  Then it can be written as DirtyStat.dirty_ring.nvcpu.
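
For instance, with an anonymous union (a rough sketch; the VcpuStat/page_sampling
names and exact types are approximations, the fields are the ones used in this
series):

    typedef struct SampleVMStat {
        uint64_t total_dirty_samples;
        uint64_t total_sample_count;
        uint64_t total_block_mem_MB;
    } SampleVMStat;

    typedef struct VcpuStat {
        int nvcpu;                /* number of vCPUs measured      */
        DirtyRateVcpu *rates;     /* array of per-vCPU dirty rates */
    } VcpuStat;

    struct DirtyRateStat {
        int64_t dirty_rate;
        int64_t start_time;
        int64_t calc_time;
        int64_t sample_pages;
        union {                   /* anonymous union drops "method." */
            SampleVMStat page_sampling;
            VcpuStat dirty_ring;
        };
    };

so call sites can read DirtyStat.dirty_ring.nvcpu directly.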

+            DirtyRateVcpu *rate = g_malloc0(sizeof(DirtyRateVcpu));
+            rate->id = DirtyStat.method.vcpu.rates[i].id;
+            rate->dirty_rate = DirtyStat.method.vcpu.rates[i].dirty_rate;
+            QAPI_LIST_APPEND(tail, rate);
+        }
+        info->vcpu_dirty_rate = head;
+    } else {
+        info->has_sample_pages = true;
+        info->sample_pages = DirtyStat.sample_pages;
+    }
trace_query_dirty_rate_info(DirtyRateStatus_str(CalculatingState));

@@ -94,15 +142,37 @@ static void init_dirtyrate_stat(int64_t start_time,
      DirtyStat.dirty_rate = -1;
      DirtyStat.start_time = start_time;
      DirtyStat.calc_time = config.sample_period_seconds;
-    DirtyStat.sample_pages = config.sample_pages_per_gigabytes;
-
-    if (config.per_vcpu) {
-        DirtyStat.method.vcpu.nvcpu = -1;
-        DirtyStat.method.vcpu.rates = NULL;
-    } else {
-        DirtyStat.method.vm.total_dirty_samples = 0;
-        DirtyStat.method.vm.total_sample_count = 0;
-        DirtyStat.method.vm.total_block_mem_MB = 0;
+    DirtyStat.sample_pages =
+        config.per_vcpu ? -1 : config.sample_pages_per_gigabytes;
+
+    if (unlikely(!register_powerdown_callback)) {
+        qemu_register_powerdown_notifier(&dirtyrate_powerdown_notifier);
+        register_powerdown_callback = true;
+    }
+
+    switch (last_method) {
+    case CALC_NONE:
+    case CALC_SAMPLE_PAGES:
+        if (config.per_vcpu) {
+            DirtyStat.method.vcpu.nvcpu = -1;
+            DirtyStat.method.vcpu.rates = NULL;
+        } else {
+            DirtyStat.method.vm.total_dirty_samples = 0;
+            DirtyStat.method.vm.total_sample_count = 0;
+            DirtyStat.method.vm.total_block_mem_MB = 0;
+        }
+        break;
+    case CALC_DIRTY_RING:
+        if (!config.per_vcpu) {
+            g_free(DirtyStat.method.vcpu.rates);
+            DirtyStat.method.vcpu.rates = NULL;
+            DirtyStat.method.vm.total_dirty_samples = 0;
+            DirtyStat.method.vm.total_sample_count = 0;
+            DirtyStat.method.vm.total_block_mem_MB = 0;
+        }

I'm a bit confused; why is it CALC_DIRTY_RING but per_vcpu not set?  Why do we need
to care about "last_method" at all?..
These two methods share the union: the sample method only uses the SampleVMStat fields, while the dirty-ring method has to allocate the DirtyRateVcpu rates array on every calc_dirty_rate call in case vCPUs were added. Once rates holds dirty rate values it cannot be freed until the next calc_dirty_rate run, because query-dirty-rate may still access it, so it can only be freed right before a calc_dirty_rate run that uses the sample method.

+        break;
+    default:

We can abort() here.
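
i.e. (sketch):

    default:
        abort();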

+        break;
      }
  }
@@ -316,7 +386,7 @@ find_block_matched(RAMBlock *block, int count,
  }
static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
-                                  int block_count)
+                                   int block_count)
  {
      struct RamblockDirtyInfo *block_dinfo = NULL;
      RAMBlock *block = NULL;
@@ -340,14 +410,125 @@ static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
      return true;
  }
-static void calculate_dirtyrate(struct DirtyRateConfig config)
+static void record_dirtypages(CPUState *cpu, bool start)
+{
+    if (start) {
+        dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
+    } else {
+        dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
+    }
+}
+
+static void dirtyrate_global_dirty_log_start(void)
+{
+    /* dirty logging is enabled already */
+    if (global_dirty_log) {
+        return;
+    }

If it's a bitmask already, then we'd want to drop this..

+
+    qemu_mutex_lock_iothread();
+    memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
+    qemu_mutex_unlock_iothread();
+    trace_dirtyrate_dirty_log_start();

How about moving this trace into memory_global_dirty_log_start() of the other
patch, and dumping the bitmask there?
It's a good idea.

+}
+
+static void dirtyrate_global_dirty_log_stop(void)
+{
+    /* migration is in process, do not stop dirty logging,
+     * just clear the GLOBAL_DIRTY_DIRTY_RATE bit */
+    if (global_dirty_log & GLOBAL_DIRTY_MIGRATION) {
+        global_dirty_log &= ~(GLOBAL_DIRTY_DIRTY_RATE);
+        return;
+    }

IIUC we don't need this either..

memory_global_dirty_log_start|stop() will make sure everything works already; we
should only use these APIs and stop caring about migration here at all.

Or did I miss something?
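
i.e., assuming the flags-aware memory_global_dirty_log_start|stop() from the
earlier patch already copes with multiple users on its own, the helper could
shrink to just (sketch):

    static void dirtyrate_global_dirty_log_stop(void)
    {
        qemu_mutex_lock_iothread();
        memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
        qemu_mutex_unlock_iothread();
    }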

+
+    qemu_mutex_lock_iothread();
+    memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
+    qemu_mutex_unlock_iothread();
+    trace_dirtyrate_dirty_log_stop();

Same question here; maybe better to move it into memory_global_dirty_log_stop()?
We can make it trace_global_dirty_changed(bitmask) and call it at both start and stop.
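
Something along these lines (sketch; the trace name and where the trace-events
entry lives are just guesses):

    /* in memory_global_dirty_log_start()/stop(), after updating the mask */
    trace_global_dirty_changed(global_dirty_log);

    # in the corresponding trace-events file
    global_dirty_changed(unsigned int bitmask) "bitmask: 0x%x"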

+}
+
+static int64_t do_calculate_dirtyrate_vcpu(int idx)
+{
+    uint64_t memory_size_MB;
+    int64_t time_s;
+    uint64_t start_pages = dirty_pages[idx].start_pages;
+    uint64_t end_pages = dirty_pages[idx].end_pages;
+    uint64_t dirty_pages = 0;
+
+    /* uint64_t over the INT64_MAX */
+    if (unlikely(end_pages < start_pages)) {
+        dirty_pages = INT64_MAX - start_pages + end_pages + 1;
+    } else {
+        dirty_pages = end_pages - start_pages;
+    }

As mentioned above, IMHO this would be enough:

            dirty_pages = end_pages - start_pages;

even if rare overflowed happened.
Yeah, I'll drop the old algorithm.

+
+    memory_size_MB = (dirty_pages * TARGET_PAGE_SIZE) >> 20;
+    time_s = DirtyStat.calc_time;
+
+    trace_dirtyrate_do_calculate_vcpu(idx, dirty_pages, time_s);
+
+    return memory_size_MB / time_s;
+}
+
+static void calculate_dirtyrate_vcpu(struct DirtyRateConfig config)
+{
+    CPUState *cpu;
+    int64_t msec = 0;
+    int64_t start_time;
+    uint64_t dirtyrate = 0;
+    uint64_t dirtyrate_sum = 0;
+    int nvcpu = 0;
+    int i = 0;
+
+    CPU_FOREACH(cpu) {
+        nvcpu++;
+    }
+
+    dirty_pages = g_malloc0(sizeof(*dirty_pages) * nvcpu);
+
+    dirtyrate_global_dirty_log_start();
+
+    CPU_FOREACH(cpu) {
+        record_dirtypages(cpu, true);
+    }
+
+    DirtyStat.method.vcpu.nvcpu = nvcpu;
+    if (last_method != CALC_DIRTY_RING) {
+        DirtyStat.method.vcpu.rates =
+            g_malloc0(sizeof(DirtyRateVcpu) * nvcpu);
+    }

I don't see a strong need to optimize malloc() for continuous dirty rate
measurements.  Can we simply malloc() for every measurement we need?

If we really want this, it would be nice to make it a follow up patch, but we'd
better justify why it helps...

Btw, I think it's better the malloc()s happen before measuring starts, e.g.:

   cpu_foreach { nvcpu++ }
   rates = malloc(...)
   dirty_pages = malloc(...)

   global_dirty_log_start(DIRTY_RATE)
   cpu_foreach { record_dirtypages() }
   ...

OK, I'll adjust it in the next version.
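
Roughly like this (sketch based on the code above, allocating rates unconditionally
as suggested):

    static void calculate_dirtyrate_vcpu(struct DirtyRateConfig config)
    {
        CPUState *cpu;
        int nvcpu = 0;

        CPU_FOREACH(cpu) {
            nvcpu++;
        }

        /* do all allocations before the measurement window opens */
        DirtyStat.method.vcpu.nvcpu = nvcpu;
        DirtyStat.method.vcpu.rates = g_malloc0(sizeof(DirtyRateVcpu) * nvcpu);
        dirty_pages = g_malloc0(sizeof(*dirty_pages) * nvcpu);

        dirtyrate_global_dirty_log_start();
        CPU_FOREACH(cpu) {
            record_dirtypages(cpu, true);
        }

        /* ... wait for calc_time, record end pages, stop logging ... */
    }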
+
+    start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    DirtyStat.start_time = start_time / 1000;
+
+    msec = config.sample_period_seconds * 1000;
+    msec = set_sample_page_period(msec, start_time);
+    DirtyStat.calc_time = msec / 1000;
+
+    CPU_FOREACH(cpu) {
+        record_dirtypages(cpu, false);
+    }
+
+    dirtyrate_global_dirty_log_stop();
+
+    for (i = 0; i < DirtyStat.method.vcpu.nvcpu; i++) {
+        dirtyrate = do_calculate_dirtyrate_vcpu(i);
+        DirtyStat.method.vcpu.rates[i].id = i;
+        DirtyStat.method.vcpu.rates[i].dirty_rate = dirtyrate;
+        dirtyrate_sum += dirtyrate;
+    }
+
+    DirtyStat.dirty_rate = dirtyrate_sum / DirtyStat.method.vcpu.nvcpu;
+    g_free(dirty_pages);
+}
+
+static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
  {
      struct RamblockDirtyInfo *block_dinfo = NULL;
      int block_count = 0;
      int64_t msec = 0;
      int64_t initial_time;
-    rcu_register_thread();

Better to make this a separate patch.

Thanks,

      rcu_read_lock();
      initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
      if (!record_ramblock_hash_info(&block_dinfo, config, &block_count)) {
@@ -364,13 +545,24 @@ static void calculate_dirtyrate(struct DirtyRateConfig config)
      if (!compare_page_hash_info(block_dinfo, block_count)) {
          goto out;
      }
-
      update_dirtyrate(msec);
out:
      rcu_read_unlock();
      free_ramblock_dirty_info(block_dinfo, block_count);
-    rcu_unregister_thread();
+}
+
+static void calculate_dirtyrate(struct DirtyRateConfig config)
+{
+    if (config.per_vcpu) {
+        calculate_dirtyrate_vcpu(config);
+        last_method = CALC_DIRTY_RING;
+    } else {
+        calculate_dirtyrate_sample_vm(config);
+        last_method = CALC_SAMPLE_PAGES;
+    }
+
+    trace_dirtyrate_calculate(DirtyStat.dirty_rate);
  }
void *get_dirtyrate_thread(void *arg)
@@ -379,6 +571,8 @@ void *get_dirtyrate_thread(void *arg)
      int ret;
      int64_t start_time;
+    rcu_register_thread();
+
      ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_UNSTARTED,
                                DIRTY_RATE_STATUS_MEASURING);
      if (ret == -1) {
@@ -396,6 +590,8 @@ void *get_dirtyrate_thread(void *arg)
      if (ret == -1) {
          error_report("change dirtyrate state failed.");
      }
+
+    rcu_unregister_thread();
      return NULL;
  }
diff --git a/migration/trace-events b/migration/trace-events
index 860c4f4025..4c5a658665 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -330,6 +330,11 @@ get_ramblock_vfn_hash(const char *idstr, uint64_t vfn, uint32_t crc) "ramblock n
  calc_page_dirty_rate(const char *idstr, uint32_t new_crc, uint32_t old_crc) "ramblock name: %s, new crc: %" PRIu32 ", old crc: %" PRIu32
  skip_sample_ramblock(const char *idstr, uint64_t ramblock_size) "ramblock name: %s, ramblock size: %" PRIu64
  find_page_matched(const char *idstr) "ramblock %s addr or size changed"
+dirtyrate_calculate(int64_t dirtyrate) "dirty rate: %" PRIi64
+dirtyrate_do_calculate_vcpu(int idx, uint64_t pages, int64_t seconds) "vcpu[%d]: dirty %"PRIu64 " pages in %"PRIi64 " seconds"
+dirtyrate_powerdown_callback(void) ""
+dirtyrate_dirty_log_start(void) ""
+dirtyrate_dirty_log_stop(void) ""
# block.c
  migration_block_init_shared(const char *blk_device_name) "Start migration for %s with shared base image"
--
2.18.2



--
Best regards

Hyman Huang(黄勇)


