From: Alex Bennée
Subject: Re: [Qemu-devel] [PATCH untested for-4.2] memory: fix race between TCG and accesses to dirty bitmap
Date: Thu, 08 Aug 2019 10:27:26 +0100
User-agent: mu4e 1.3.4; emacs 27.0.50
Paolo Bonzini <address@hidden> writes:
> The race is as follows:
>
>    vCPU thread                  reader thread
>    -----------------------      -----------------------
>    TLB check -> slow path
>      notdirty_mem_write
>        write to RAM
>        set dirty flag
>                                 clear dirty flag
>    TLB check -> fast path
>                                 read memory
>      write to RAM
>
> and the second write is missed by the reader.
>
> Fortunately, in order to fix it, no change is required to the
> vCPU thread. However, the reader thread must delay the read until after
> the vCPU thread has finished the write. This can be approximated
> conservatively by run_on_cpu, which waits for the end of the current
> translation block.
>
> A similar technique is used by KVM, which has to do a synchronous TLB
> flush after doing a test-and-clear of the dirty-page flags.
>
> Reported-by: Dr. David Alan Gilbert <address@hidden>
> Signed-off-by: Paolo Bonzini <address@hidden>
Reviewed-by: Alex Bennée <address@hidden>
Tested-by: Alex Bennée <address@hidden>
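
For anyone reading along: the reader-side contract this establishes is
easiest to see as a sequence. A rough sketch of my reading (not part of
the patch; the page-copy step is pseudocode, and I am assuming a reader
whose sync step also clears the dirty bits, per the doc comment below):

    /* Reader thread (e.g. migration): */
    memory_global_dirty_log_sync();        /* sync + clear dirty bits   */
    memory_global_after_dirty_log_sync();  /* drain each vCPU out of its
                                            * current TB, so any write
                                            * older than the clear has
                                            * reached RAM               */
    /* ... only now read the pages that were marked dirty ... */

If the read happened before the drain, a vCPU could still be mid-TB,
writing through a TLB entry it checked before the clear - exactly the
missed write in the diagram above.
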
> ---
> I tested this some time ago, and enough has changed that I don't
> really trust those old results. Nevertheless, I am throwing out
> the patch so that it is not forgotten.
>
> exec.c | 31 +++++++++++++++++++++++++++++++
> include/exec/memory.h | 12 ++++++++++++
> memory.c | 10 +++++++++-
> migration/ram.c | 1 +
> 4 files changed, 53 insertions(+), 1 deletion(-)
>
> diff --git a/exec.c b/exec.c
> index 3e78de3b8f..ae68f72da4 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -198,6 +198,7 @@ typedef struct subpage_t {
>
> static void io_mem_init(void);
> static void memory_map_init(void);
> +static void tcg_log_global_after_sync(MemoryListener *listener);
> static void tcg_commit(MemoryListener *listener);
>
> static MemoryRegion io_mem_watch;
> @@ -906,6 +907,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx,
> newas->cpu = cpu;
> newas->as = as;
> if (tcg_enabled()) {
> + newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
> newas->tcg_as_listener.commit = tcg_commit;
> memory_listener_register(&newas->tcg_as_listener, as);
> }
> @@ -3143,6 +3145,35 @@ void address_space_dispatch_free(AddressSpaceDispatch *d)
> g_free(d);
> }
>
> +static void do_nothing(CPUState *cpu, run_on_cpu_data d)
> +{
> +}
> +
> +static void tcg_log_global_after_sync(MemoryListener *listener)
> +{
> + CPUAddressSpace *cpuas;
> +
> + /* Wait for the CPU to end the current TB. This avoids the following
> + * incorrect race:
> + *
> + *      vCPU                    migration
> + *      ----------------------  -------------------------
> + *      TLB check -> slow path
> + *        notdirty_mem_write
> + *          write to RAM
> + *          mark dirty
> + *                              clear dirty flag
> + *      TLB check -> fast path
> + *                              read memory
> + *        write to RAM
> + *
> + * by pushing the migration thread's memory read after the vCPU thread
> + * has written the memory.
> + */
> + cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
> + run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
> +}
> +
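
For other reviewers: this drains *every* vCPU, not just one, because each
vCPU registers its own tcg_as_listener in cpu_address_space_init() (see
the first hunk). The fan-out, as I read it:

    memory_global_after_dirty_log_sync()               /* memory.c, below */
      -> MEMORY_LISTENER_CALL_GLOBAL(log_global_after_sync, Forward)
           -> tcg_log_global_after_sync(listener)      /* once per vCPU's
                                                        * address space  */
                -> run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL)

and run_on_cpu() does not return until the vCPU has left its current TB
and executed the (empty) work item.
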
> static void tcg_commit(MemoryListener *listener)
> {
> CPUAddressSpace *cpuas;
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index bb0961ddb9..b6bcf31b0a 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -419,6 +419,7 @@ struct MemoryListener {
> void (*log_clear)(MemoryListener *listener, MemoryRegionSection *section);
> void (*log_global_start)(MemoryListener *listener);
> void (*log_global_stop)(MemoryListener *listener);
> + void (*log_global_after_sync)(MemoryListener *listener);
> void (*eventfd_add)(MemoryListener *listener, MemoryRegionSection *section,
> bool match_data, uint64_t data, EventNotifier *e);
> void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection *section,
> @@ -1681,6 +1682,17 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr,
> */
> void memory_global_dirty_log_sync(void);
>
> +/**
> + * memory_global_after_dirty_log_sync: synchronize the dirty log for all memory
> + *
> + * Synchronizes the vCPUs with a thread that is reading the dirty bitmap.
> + * This function must be called after the dirty log bitmap is cleared, and
> + * before dirty guest memory pages are read. If you are using
> + * #DirtyBitmapSnapshot, memory_region_snapshot_and_clear_dirty() takes
> + * care of doing this.
> + */
> +void memory_global_after_dirty_log_sync(void);
> +
> /**
> * memory_region_transaction_begin: Start a transaction.
> *
> diff --git a/memory.c b/memory.c
> index e42d63a3a0..edd0c13c38 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -2127,9 +2127,12 @@ DirtyBitmapSnapshot *memory_region_snapshot_and_clear_dirty(MemoryRegion *mr,
> hwaddr size,
> unsigned client)
> {
> + DirtyBitmapSnapshot *snapshot;
> assert(mr->ram_block);
> memory_region_sync_dirty_bitmap(mr);
> - return cpu_physical_memory_snapshot_and_clear_dirty(mr, addr, size, client);
> + snapshot = cpu_physical_memory_snapshot_and_clear_dirty(mr, addr, size, client);
> + memory_global_after_dirty_log_sync();
> + return snapshot;
> }
>
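
Nice property of folding the barrier in here: #DirtyBitmapSnapshot users
(the display code, for instance) now get the ordering for free. A
hypothetical caller, just to illustrate - mr and size are placeholders,
the calls themselves are the existing API:

    DirtyBitmapSnapshot *snap;

    snap = memory_region_snapshot_and_clear_dirty(mr, 0, size,
                                                  DIRTY_MEMORY_VGA);
    if (memory_region_snapshot_get_dirty(mr, snap, 0, size)) {
        /* safe: any write older than the clear is already in RAM */
        /* ... redraw from guest memory ... */
    }
    g_free(snap);
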
> bool memory_region_snapshot_get_dirty(MemoryRegion *mr, DirtyBitmapSnapshot *snap,
> @@ -2620,6 +2623,11 @@ void memory_global_dirty_log_sync(void)
> memory_region_sync_dirty_bitmap(NULL);
> }
>
> +void memory_global_after_dirty_log_sync(void)
> +{
> + MEMORY_LISTENER_CALL_GLOBAL(log_global_after_sync, Forward);
> +}
> +
> static VMChangeStateEntry *vmstate_change;
>
> void memory_global_dirty_log_start(void)
> diff --git a/migration/ram.c b/migration/ram.c
> index 2b0774c2bf..b9d6a3921d 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1801,6 +1801,7 @@ static void migration_bitmap_sync(RAMState *rs)
> rcu_read_unlock();
> qemu_mutex_unlock(&rs->bitmap_mutex);
>
> + memory_global_after_dirty_log_sync();
> trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
>
> end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
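
And here the call lands after the sync and clear (done under bitmap_mutex
above) but before the migration thread goes back to reading pages, which
matches the contract in the new doc comment. Schematically, as I read the
surrounding code:

    migration_bitmap_sync():
        qemu_mutex_lock(&rs->bitmap_mutex);
        /* sync and clear the dirty bitmap */
        qemu_mutex_unlock(&rs->bitmap_mutex);
        memory_global_after_dirty_log_sync();   /* new: drain vCPUs */
        /* dirty pages are read later, from the migration thread loop */
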
--
Alex Bennée