qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC] migration: introduce failed-unrecovarable status


From: Vladimir Sementsov-Ogievskiy
Subject: Re: [RFC] migration: introduce failed-unrecovarable status
Date: Mon, 20 Jan 2020 09:11:46 +0000

ping

18.12.2019 15:55, Vladimir Sementsov-Ogievskiy wrote:
> We should not start source vm automatically, if the error occured after
> target accessed disks, or if we failed to invalidate nodes.
> 
> Also, fix, that we need invalidate even if bdrv_inactivate_all()
> failed, as in this case it still may successfully inactivate some of
> the nodes.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <address@hidden>
> ---
> 
> Hi all!
> 
> It's an investigation on top of old thread
> https://lists.gnu.org/archive/html/qemu-devel/2018-06/msg02355.html
> 
> Either I'm missing something, or we need this patch. It's a draft, may
> be need to split it into 2-3 small patches. Still I'd like to get
> general approval at first, may be I'm doing something wrong.
> 
> Also, there may be other migration failure cases like this.
> 
>   qapi/migration.json   |  7 +++++--
>   migration/migration.c | 36 ++++++++++++++++++++++++------------
>   2 files changed, 29 insertions(+), 14 deletions(-)
> 
> diff --git a/qapi/migration.json b/qapi/migration.json
> index b7348d0c8b..90fa625cbb 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -125,6 +125,9 @@
>   #
>   # @failed: some error occurred during migration process.
>   #
> +# @failed-unrecoverable: postcopy failed after no return point, when disks 
> may
> +#                        already be accessed by target Qemu process. (since 
> 5.0)
> +#
>   # @colo: VM is in the process of fault tolerance, VM can not get into this
>   #        state unless colo capability is enabled for migration. (since 2.8)
>   #
> @@ -142,8 +145,8 @@
>   { 'enum': 'MigrationStatus',
>     'data': [ 'none', 'setup', 'cancelling', 'cancelled',
>               'active', 'postcopy-active', 'postcopy-paused',
> -            'postcopy-recover', 'completed', 'failed', 'colo',
> -            'pre-switchover', 'device', 'wait-unplug' ] }
> +            'postcopy-recover', 'completed', 'failed', 
> 'failed-unrecoverable',
> +            'colo', 'pre-switchover', 'device', 'wait-unplug' ] }
>   
>   ##
>   # @MigrationInfo:
> diff --git a/migration/migration.c b/migration/migration.c
> index 354ad072fa..00684fdef8 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2576,7 +2576,14 @@ static int postcopy_start(MigrationState *ms)
>       QEMUFile *fb;
>       int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>       int64_t bandwidth = migrate_max_postcopy_bandwidth();
> -    bool restart_block = false;
> +
> +    /*
> +     * recoverable_failure
> +     * A failure happened early enough that we know the destination hasn't
> +     * accessed block devices, so we're safe to recover.
> +     */
> +    bool recoverable_failure = true;
> +    bool inactivated = false;
>       int cur_state = MIGRATION_STATUS_ACTIVE;
>       if (!migrate_pause_before_switchover()) {
>           migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
> @@ -2600,11 +2607,11 @@ static int postcopy_start(MigrationState *ms)
>           goto fail;
>       }
>   
> +    inactivated = true;
>       ret = bdrv_inactivate_all();
>       if (ret < 0) {
>           goto fail;
>       }
> -    restart_block = true;
>   
>       /*
>        * Cause any non-postcopiable, but iterative devices to
> @@ -2682,7 +2689,7 @@ static int postcopy_start(MigrationState *ms)
>           goto fail_closefb;
>       }
>   
> -    restart_block = false;
> +    recoverable_failure = false;
>   
>       /* Now send that blob */
>       if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, 
> bioc->usage)) {
> @@ -2716,26 +2723,28 @@ static int postcopy_start(MigrationState *ms)
>       ret = qemu_file_get_error(ms->to_dst_file);
>       if (ret) {
>           error_report("postcopy_start: Migration stream errored");
> -        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> -                              MIGRATION_STATUS_FAILED);
> +        goto fail;
>       }
>   
> -    return ret;
> +    return 0;
>   
>   fail_closefb:
>       qemu_fclose(fb);
>   fail:
>       migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> -                          MIGRATION_STATUS_FAILED);
> -    if (restart_block) {
> -        /* A failure happened early enough that we know the destination 
> hasn't
> -         * accessed block devices, so we're safe to recover.
> -         */
> +                      recoverable_failure ? MIGRATION_STATUS_FAILED :
> +                      MIGRATION_STATUS_FAILED_UNRECOVERABLE);
> +    if (recoverable_failure && inactivated) {
>           Error *local_err = NULL;
>   
>           bdrv_invalidate_cache_all(&local_err);
>           if (local_err) {
>               error_report_err(local_err);
> +            /*
> +             * We failed to invalidate, so we must not start vm 
> automatically.
> +             * User may retry invalidation and start by cont qmp command.
> +             */
> +            ms->vm_was_running = false;
>           }
>       }
>       qemu_mutex_unlock_iothread();
> @@ -3194,9 +3203,12 @@ static void migration_iteration_finish(MigrationState 
> *s)
>           s->vm_was_running = true;
>           /* Fallthrough */
>       case MIGRATION_STATUS_FAILED:
> +    case MIGRATION_STATUS_FAILED_UNRECOVERABLE:
>       case MIGRATION_STATUS_CANCELLED:
>       case MIGRATION_STATUS_CANCELLING:
> -        if (s->vm_was_running) {
> +        if (s->vm_was_running &&
> +            s->state != MIGRATION_STATUS_FAILED_UNRECOVERABLE)
> +        {
>               vm_start();
>           } else {
>               if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
> 


-- 
Best regards,
Vladimir

reply via email to

[Prev in Thread] Current Thread [Next in Thread]