qemu-block
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v3 6/6] block/qcow2: use seqcache for compressed writes


From: Max Reitz
Subject: Re: [PATCH v3 6/6] block/qcow2: use seqcache for compressed writes
Date: Fri, 12 Mar 2021 19:15:00 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Thunderbird/78.8.0

On 05.03.21 18:35, Vladimir Sementsov-Ogievskiy wrote:
Compressed writes are unaligned to 512, which works very slow in
O_DIRECT mode. Let's use the cache.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
  block/coroutines.h     |   3 +
  block/qcow2.h          |   4 ++
  block/qcow2-refcount.c |  10 +++
  block/qcow2.c          | 158 ++++++++++++++++++++++++++++++++++++++---
  4 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/block/coroutines.h b/block/coroutines.h
index 4cfb4946e6..cb8e05830e 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -66,4 +66,7 @@ int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
  int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
                                          QEMUIOVector *qiov, int64_t pos);
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs);
+int generated_co_wrapper qcow2_flush_compressed_cache(BlockDriverState *bs);
+
  #endif /* BLOCK_COROUTINES_INT_H */
diff --git a/block/qcow2.h b/block/qcow2.h
index e18d8dfbae..8b8fed0950 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -28,6 +28,7 @@
  #include "crypto/block.h"
  #include "qemu/coroutine.h"
  #include "qemu/units.h"
+#include "qemu/seqcache.h"
  #include "block/block_int.h"
//#define DEBUG_ALLOC
@@ -422,6 +423,9 @@ typedef struct BDRVQcow2State {
      Qcow2CompressionType compression_type;
GHashTable *inflight_writes_counters;
+
+    SeqCache *compressed_cache;
+    int compressed_flush_ret;
  } BDRVQcow2State;
typedef struct Qcow2COWRegion {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 464d133368..218917308e 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@
  #include "qemu/bswap.h"
  #include "qemu/cutils.h"
  #include "trace.h"
+#include "block/coroutines.h"
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
                                      uint64_t max);
@@ -1040,6 +1041,10 @@ static int QEMU_WARN_UNUSED_RESULT 
update_refcount(BlockDriverState *bs,
                  qcow2_cache_discard(s->l2_table_cache, table);
              }
+ if (s->compressed_cache) {
+                seqcache_discard_cluster(s->compressed_cache, cluster_offset);
+            }
+
              if (s->discard_passthrough[type]) {
                  update_refcount_discard(bs, cluster_offset, s->cluster_size);
              }
@@ -1349,6 +1354,11 @@ int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
      BDRVQcow2State *s = bs->opaque;
      int ret;
+ ret = qcow2_flush_compressed_cache(bs);
+    if (ret < 0) {
+        return ret;
+    }
+

I wonder a bit why this function doesn’t work on a best-effort basis, but that isn’t your problem.

      ret = qcow2_cache_write(bs, s->l2_table_cache);
      if (ret < 0) {
          return ret;
diff --git a/block/qcow2.c b/block/qcow2.c
index 6ee94421e0..5f3713cd6f 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -42,6 +42,7 @@
  #include "qapi/qapi-visit-block-core.h"
  #include "crypto.h"
  #include "block/aio_task.h"
+#include "block/coroutines.h"
/*
    Differences with QCOW:
@@ -1834,6 +1835,10 @@ static int coroutine_fn qcow2_do_open(BlockDriverState 
*bs, QDict *options,
      s->inflight_writes_counters =
          g_hash_table_new_full(g_int64_hash, g_int64_equal, g_free, g_free);
+ if (!has_data_file(bs) && (bs->file->bs->open_flags & BDRV_O_NOCACHE)) {

Looks a bit like a layering violation, but I have no better proposal and you gave your reasoning, so, OK.

I wonder whether this should be re-checked during reopen, though.

+        s->compressed_cache = seqcache_new(s->cluster_size);
+    }
+
      return ret;
fail:
@@ -2653,6 +2658,91 @@ fail_nometa:
      return ret;
  }
+/*
+ * Flush one cluster of compressed cache if exists. If @unfinished is true and
+ * there is no finished cluster to flush, flush the unfinished one. Otherwise
+ * flush only finished clusters.
+ *
+ * Return 0 if nothing to flush, 1 if one cluster successfully flushed and <0 
on
+ * error.
+ */
+static int coroutine_fn
+qcow2_co_compressed_flush_one(BlockDriverState *bs, bool unfinished)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int ret;
+    int64_t align = 1;
+    int64_t offset, bytes;
+    uint8_t *buf;
+
+    if (!s->compressed_cache) {
+        return 0;
+    }
+
+    if (!seqcache_get_next_flush(s->compressed_cache, &offset, &bytes, &buf,
+                                 &unfinished))
+    {
+        return 0;
+    }
+
+    qcow2_inflight_writes_inc(bs, offset, bytes);
+
+    /*
+     * Don't try to align-up unfinished cached cluster, parallel write to it is
+     * possible! For finished host clusters data in the tail of the cluster 
will
+     * be never used. So, take some good alignment for speed.
+     *
+     * Note also, that seqcache guarantees that allocated size of @buf is 
enough
+     * to fill the cluster up to the end, so we are safe to align up with
+     * align <= cluster_size.
+     */
+    if (!unfinished) {
+        align = MIN(s->cluster_size,
+                    MAX(s->data_file->bs->bl.request_alignment,
+                        bdrv_opt_mem_align(bs)));
+    }
+

I’d move the pre-write overlap check here, because its purpose is to prevent writes to metadata structures as they are happening, not as they are queued into a cache. (I’d say.)

+    BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+    ret = bdrv_co_pwrite(s->data_file, offset,
+                         QEMU_ALIGN_UP(offset + bytes, align) - offset,

I remember you said you wanted to describe more of the properties of the buffer returned by seqcache_get_next_flush(). That would be nice here, because intuitively one would assume that the buffer is @bytes bytes long, which aligning here will exceed. (It’s fine, but it would be nicer if there was a comment that assured that it’s fine.)

+                         buf, 0);
+    if (ret < 0 && s->compressed_flush_ret == 0) {
+        /*
+         * The data that was "written" earlier is lost. We don't want to
+         * care with storing it somehow to retry flushing later.

Yeah, there is little point in trying something like writing it again and again in the hope that maybe at some point it’ll work.

It is a shame that the error isn’t returned by the original compressed write, but what can you do.

                                                                  Also, how much
+         * data will we have to store, if not drop it after failed flush?
+         * After this point qcow2_co_flush_compressed_cache() (and
+         * qcow2_flush_compressed_cache()) never return success.
+         */
+        s->compressed_flush_ret = ret;
+    }
+
+    seqcache_discard_cluster(s->compressed_cache, offset);
+
+    qcow2_inflight_writes_dec(bs, offset, bytes);
+
+    return ret < 0 ? ret : 1;
+}
+
+int coroutine_fn qcow2_co_flush_compressed_cache(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+
+    if (s->compressed_cache) {
+        uint64_t nb_clusters = seqcache_nb_clusters(s->compressed_cache);
+
+        /*
+         * If parallel writes are possible we don't want to loop forever. So
+         * flush only clusters available at start of flush operation.
+         */
+        while (nb_clusters--) {
+            qcow2_co_compressed_flush_one(bs, true);
+        }
+    }
+
+    return s->compressed_flush_ret;
+}
+
  static int qcow2_inactivate(BlockDriverState *bs)
  {
      BDRVQcow2State *s = bs->opaque;
@@ -2667,6 +2757,13 @@ static int qcow2_inactivate(BlockDriverState *bs)
                            bdrv_get_device_or_node_name(bs));
      }
+ ret = qcow2_flush_compressed_cache(bs);
+    if (ret) {
+        result = ret;
+        error_report("Failed to flush the compressed write cache: %s",
+                     strerror(-ret));
+    }
+
      ret = qcow2_cache_flush(bs, s->l2_table_cache);
      if (ret) {
          result = ret;
@@ -2699,6 +2796,12 @@ static void qcow2_close(BlockDriverState *bs)
          qcow2_inactivate(bs);
      }
+ /*
+     * Cache should be flushed in qcow2_inactivate() and should be empty in
+     * inactive mode. So we are safe to free it.
+     */
+    seqcache_free(s->compressed_cache);
+
      cache_clean_timer_del(bs);
      qcow2_cache_destroy(s->l2_table_cache);
      qcow2_cache_destroy(s->refcount_block_cache);
@@ -4558,18 +4661,42 @@ qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
          goto fail;
      }
- qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
+    if (s->compressed_cache) {

Why is this conditional?

+        /*
+         * It's important to do seqcache_write() in the same critical section
+         * (by s->lock) as qcow2_alloc_compressed_cluster_offset(), so that the
+         * cache is filled sequentially.
+         */

Yes.

+        seqcache_write(s->compressed_cache, cluster_offset, out_len, out_buf);
- qemu_co_mutex_unlock(&s->lock);
+        qemu_co_mutex_unlock(&s->lock);
- BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
-    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
+        ret = qcow2_co_compressed_flush_one(bs, false);

The qcow2 doc says a compressed cluster can span multiple host clusters. I don’t know whether that can happen with this driver, but if it does, wouldn’t that mean we’d need to flush two clusters here? Oh, no, never mind. Only the first one would be finished and thus flushed, not the second one.

I could have now removed the above paragraph, but it made me think, so I kept it:

Hm. Actually, if we unconditionally flush here, doesn’t that mean that we’ll never have a finished cluster in the cache for longer than the span between the seqcache_write() and this qcow2_co_compressed_flush_one()? I.e., the qcow2_co_flush_compressed_cache() is supposed to never flush any finished cluster, but only the currently active unfinished cluster (if there is one), right?

+        if (ret < 0) {
+            /*
+             * if ret < 0 it probably not this request which failed, but some
+             * previous one that was cached. Moreover, when we write to the
+             * cache we probably may always report success, because
+             * seqcache_write() never fails. Still, it's better to inform
+             * compressed backup now then wait until final flush.
+             */

Yup.

+            goto fail;
+        }
+    } else {
+        qcow2_inflight_writes_inc(bs, cluster_offset, out_len);
- qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+        qemu_co_mutex_unlock(&s->lock);
- if (ret < 0) {
-        goto fail;
+        BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
+        ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 
0);
+
+        qcow2_inflight_writes_dec(bs, cluster_offset, out_len);
+
+        if (ret < 0) {
+            goto fail;
+        }
      }
+
  success:
      ret = 0;
  fail:
@@ -4681,10 +4808,19 @@ qcow2_co_preadv_compressed(BlockDriverState *bs,
out_buf = qemu_blockalign(bs, s->cluster_size); - BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
-    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
-    if (ret < 0) {
-        goto fail;
+    /*
+     * seqcache_read may return less bytes than csize, as csize may exceed
+     * actual compressed data size. So we are OK if seqcache_read returns
+     * something > 0.

I was about to ask what happens when a compressed cluster spans two host clusters (I could have imagined that in theory the second one could have been discarded, but not the first one, so reading from the cache would really be short -- we would have needed to check that we only fell short in the range of 512 bytes, not more).

But then I realized that in this version of the series, all finished clusters are immediately discarded and only the current unfinished one is kept. Does it even make sense to try seqcache_read() here, then?

+     */
+    if (!s->compressed_cache ||

(As for writing, I don’t think this can ever occur.)

Max

+        seqcache_read(s->compressed_cache, coffset, csize, buf) <= 0)
+    {
+        BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
+        ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
+        if (ret < 0) {
+            goto fail;
+        }
      }
if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {





reply via email to

[Prev in Thread] Current Thread [Next in Thread]