block: Convert bdrv_get_block_status_above() to bytes

[qemu.git] / block / qcow2.c
diff --git a/block/qcow2.c b/block/qcow2.c

index 0506c7eb04f765946c684a0fe987b368cb813209..29d0a5095544c319b0697878b8d680063945f331 100644 (file)
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -30,7 +30,6 @@
  #include "qemu/error-report.h"
  #include "qapi/qmp/qerror.h"
  #include "qapi/qmp/qbool.h"
-#include "qapi/util.h"
  #include "qapi/qmp/types.h"
  #include "qapi-event.h"
  #include "trace.h"
@@ -302,10 +301,11 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
              }
  
              if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
-                error_report("WARNING: a program lacking bitmap support "
-                             "modified this file, so all bitmaps are now "
-                             "considered inconsistent. Some clusters may be "
-                             "leaked, run 'qemu-img check -r' on the image "
+                warn_report("a program lacking bitmap support "
+                            "modified this file, so all bitmaps are now "
+                            "considered inconsistent");
+                error_printf("Some clusters may be leaked, "
+                             "run 'qemu-img check -r' on the image "
                               "file to fix.");
                  if (need_update_header != NULL) {
                      /* Updating is needed to drop invalid bitmap extension. */
@@ -1360,16 +1360,6 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
          goto fail;
      }
  
-    s->cluster_cache = g_malloc(s->cluster_size);
-    /* one more sector for decompressed data alignment */
-    s->cluster_data = qemu_try_blockalign(bs->file->bs, QCOW_MAX_CRYPT_CLUSTERS
-                                                    * s->cluster_size + 512);
-    if (s->cluster_data == NULL) {
-        error_setg(errp, "Could not allocate temporary cluster buffer");
-        ret = -ENOMEM;
-        goto fail;
-    }
-
      s->cluster_cache_offset = -1;
      s->flags = flags;
  
@@ -1507,8 +1497,6 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
      if (s->refcount_block_cache) {
          qcow2_cache_destroy(bs, s->refcount_block_cache);
      }
-    g_free(s->cluster_cache);
-    qemu_vfree(s->cluster_data);
      qcrypto_block_free(s->crypto);
      qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
      return ret;
@@ -1820,15 +1808,13 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                  assert(s->crypto);
                  assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
                  assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-                Error *err = NULL;
                  if (qcrypto_block_decrypt(s->crypto,
                                            (s->crypt_physical_offset ?
                                             cluster_offset + offset_in_cluster :
-                                           offset) >> BDRV_SECTOR_BITS,
+                                           offset),
                                            cluster_data,
                                            cur_bytes,
-                                          &err) < 0) {
-                    error_free(err);
+                                          NULL) < 0) {
                      ret = -EIO;
                      goto fail;
                  }
@@ -1942,7 +1928,6 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
          qemu_iovec_concat(&hd_qiov, qiov, bytes_done, cur_bytes);
  
          if (bs->encrypted) {
-            Error *err = NULL;
              assert(s->crypto);
              if (!cluster_data) {
                  cluster_data = qemu_try_blockalign(bs->file->bs,
@@ -1961,10 +1946,9 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
              if (qcrypto_block_encrypt(s->crypto,
                                        (s->crypt_physical_offset ?
                                         cluster_offset + offset_in_cluster :
-                                       offset) >> BDRV_SECTOR_BITS,
+                                       offset),
                                        cluster_data,
-                                      cur_bytes, &err) < 0) {
-                error_free(err);
+                                      cur_bytes, NULL) < 0) {
                  ret = -EIO;
                  goto fail;
              }
@@ -2025,8 +2009,6 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
      ret = 0;
  
  fail:
-    qemu_co_mutex_unlock(&s->lock);
-
      while (l2meta != NULL) {
          QCowL2Meta *next;
  
@@ -2040,6 +2022,8 @@ fail:
          l2meta = next;
      }
  
+    qemu_co_mutex_unlock(&s->lock);
+
      qemu_iovec_destroy(&hd_qiov);
      qemu_vfree(cluster_data);
      trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
@@ -2053,6 +2037,14 @@ static int qcow2_inactivate(BlockDriverState *bs)
      int ret, result = 0;
      Error *local_err = NULL;
  
+    qcow2_store_persistent_dirty_bitmaps(bs, &local_err);
+    if (local_err != NULL) {
+        result = -EINVAL;
+        error_report_err(local_err);
+        error_report("Persistent bitmaps are lost for node '%s'",
+                     bdrv_get_device_or_node_name(bs));
+    }
+
      ret = qcow2_cache_flush(bs, s->l2_table_cache);
      if (ret) {
          result = ret;
@@ -2067,14 +2059,6 @@ static int qcow2_inactivate(BlockDriverState *bs)
                       strerror(-ret));
      }
  
-    qcow2_store_persistent_dirty_bitmaps(bs, &local_err);
-    if (local_err != NULL) {
-        result = -EINVAL;
-        error_report_err(local_err);
-        error_report("Persistent bitmaps are lost for node '%s'",
-                     bdrv_get_device_or_node_name(bs));
-    }
-
      if (result == 0) {
          qcow2_mark_clean(bs);
      }
@@ -2476,24 +2460,36 @@ static int qcow2_set_up_encryption(BlockDriverState *bs, const char *encryptfmt,
  }
  
  
-static int preallocate(BlockDriverState *bs)
+/**
+ * Preallocates metadata structures for data clusters between @offset (in the
+ * guest disk) and @new_length (which is thus generally the new guest disk
+ * size).
+ *
+ * Returns: 0 on success, -errno on failure.
+ */
+static int preallocate(BlockDriverState *bs,
+                       uint64_t offset, uint64_t new_length)
  {
+    BDRVQcow2State *s = bs->opaque;
      uint64_t bytes;
-    uint64_t offset;
      uint64_t host_offset = 0;
      unsigned int cur_bytes;
      int ret;
      QCowL2Meta *meta;
  
-    bytes = bdrv_getlength(bs);
-    offset = 0;
+    if (qemu_in_coroutine()) {
+        qemu_co_mutex_lock(&s->lock);
+    }
+
+    assert(offset <= new_length);
+    bytes = new_length - offset;
  
      while (bytes) {
          cur_bytes = MIN(bytes, INT_MAX);
          ret = qcow2_alloc_cluster_offset(bs, offset, &cur_bytes,
                                           &host_offset, &meta);
          if (ret < 0) {
-            return ret;
+            goto done;
          }
  
          while (meta) {
@@ -2503,7 +2499,7 @@ static int preallocate(BlockDriverState *bs)
              if (ret < 0) {
                  qcow2_free_any_clusters(bs, meta->alloc_offset,
                                          meta->nb_clusters, QCOW2_DISCARD_NEVER);
-                return ret;
+                goto done;
              }
  
              /* There are no dependent requests, but we need to remove our
@@ -2530,11 +2526,66 @@ static int preallocate(BlockDriverState *bs)
          ret = bdrv_pwrite(bs->file, (host_offset + cur_bytes) - 1,
                            &data, 1);
          if (ret < 0) {
-            return ret;
+            goto done;
          }
      }
  
-    return 0;
+    ret = 0;
+
+done:
+    if (qemu_in_coroutine()) {
+        qemu_co_mutex_unlock(&s->lock);
+    }
+    return ret;
+}
+
+/* qcow2_refcount_metadata_size:
+ * @clusters: number of clusters to refcount (including data and L1/L2 tables)
+ * @cluster_size: size of a cluster, in bytes
+ * @refcount_order: refcount bits power-of-2 exponent
+ * @generous_increase: allow for the refcount table to be 1.5x as large as it
+ *                     needs to be
+ * @refblock_count: if non-NULL, set to the number of refcount block clusters
+ * Returns: Number of bytes required for refcount blocks and table metadata.
+ */
+int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
+                                     int refcount_order, bool generous_increase,
+                                     uint64_t *refblock_count)
+{
+    /*
+     * Every host cluster is reference-counted, including metadata (even
+     * refcount metadata is recursively included).
+     *
+     * An accurate formula for the size of refcount metadata size is difficult
+     * to derive.  An easier method of calculation is finding the fixed point
+     * where no further refcount blocks or table clusters are required to
+     * reference count every cluster.
+     */
+    int64_t blocks_per_table_cluster = cluster_size / sizeof(uint64_t);
+    int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
+    int64_t table = 0;  /* number of refcount table clusters */
+    int64_t blocks = 0; /* number of refcount block clusters */
+    int64_t last;
+    int64_t n = 0;
+
+    do {
+        last = n; /* previous total, used to detect the fixed point */
+        blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block);
+        table = DIV_ROUND_UP(blocks, blocks_per_table_cluster);
+        n = clusters + blocks + table;
+
+        if (n == last && generous_increase) {
+            clusters += DIV_ROUND_UP(table, 2); /* pad by half the table size */
+            n = 0; /* force another loop */
+            generous_increase = false; /* apply the padding only once */
+        }
+    } while (n != last);
+
+    if (refblock_count) {
+        *refblock_count = blocks;
+    }
+
+    return (blocks + table) * cluster_size;
  
  /**
@@ -2550,22 +2601,9 @@ static int64_t qcow2_calc_prealloc_size(int64_t total_size,
                                          size_t cluster_size,
                                          int refcount_order)
  {
-    /* Note: The following calculation does not need to be exact; if it is a
-     * bit off, either some bytes will be "leaked" (which is fine) or we
-     * will need to increase the file size by some bytes (which is fine,
-     * too, as long as the bulk is allocated here). Therefore, using
-     * floating point arithmetic is fine. */
      int64_t meta_size = 0;
-    uint64_t nreftablee, nrefblocke, nl1e, nl2e, refblock_count;
+    uint64_t nl1e, nl2e;
      int64_t aligned_total_size = align_offset(total_size, cluster_size);
-    int cluster_bits = ctz32(cluster_size);
-    int refblock_bits, refblock_size;
-    /* refcount entry size in bytes */
-    double rces = (1 << refcount_order) / 8.;
-
-    /* see qcow2_open() */
-    refblock_bits = cluster_bits - (refcount_order - 3);
-    refblock_size = 1 << refblock_bits;
  
      /* header: 1 cluster */
      meta_size += cluster_size;
@@ -2580,56 +2618,82 @@ static int64_t qcow2_calc_prealloc_size(int64_t total_size,
      nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t));
      meta_size += nl1e * sizeof(uint64_t);
  
-    /* total size of refcount blocks
-     *
-     * note: every host cluster is reference-counted, including metadata
-     * (even refcount blocks are recursively included).
-     * Let:
-     *   a = total_size (this is the guest disk size)
-     *   m = meta size not including refcount blocks and refcount tables
-     *   c = cluster size
-     *   y1 = number of refcount blocks entries
-     *   y2 = meta size including everything
-     *   rces = refcount entry size in bytes
-     * then,
-     *   y1 = (y2 + a)/c
-     *   y2 = y1 * rces + y1 * rces * sizeof(u64) / c + m
-     * we can get y1:
-     *   y1 = (a + m) / (c - rces - rces * sizeof(u64) / c)
-     */
-    nrefblocke = (aligned_total_size + meta_size + cluster_size)
-        / (cluster_size - rces - rces * sizeof(uint64_t)
-                / cluster_size);
-    refblock_count = DIV_ROUND_UP(nrefblocke, refblock_size);
-    meta_size += refblock_count * cluster_size;
-
-    /* total size of refcount tables */
-    nreftablee = align_offset(refblock_count,
-                              cluster_size / sizeof(uint64_t));
-    meta_size += nreftablee * sizeof(uint64_t);
+    /* total size of refcount table and blocks */
+    meta_size += qcow2_refcount_metadata_size(
+            (meta_size + aligned_total_size) / cluster_size,
+            cluster_size, refcount_order, false, NULL);
  
      return meta_size + aligned_total_size;
  }
  
-static int qcow2_create2(const char *filename, int64_t total_size,
-                         const char *backing_file, const char *backing_format,
-                         int flags, size_t cluster_size, PreallocMode prealloc,
-                         QemuOpts *opts, int version, int refcount_order,
-                         const char *encryptfmt, Error **errp)
+static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, Error **errp)
  {
+    size_t cluster_size;
      int cluster_bits;
-    QDict *options;
  
-    /* Calculate cluster_bits */
+    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
+                                         DEFAULT_CLUSTER_SIZE);
      cluster_bits = ctz32(cluster_size);
      if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
          (1 << cluster_bits) != cluster_size)
      {
          error_setg(errp, "Cluster size must be a power of two between %d and "
                     "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
-        return -EINVAL;
+        return 0; /* failure: *errp has been set just above */
+    }
+    return cluster_size; /* passed the power-of-two and range checks */
+}
+
+static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
+{
+    char *buf;
+    int ret;
+
+    buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
+    if (!buf) {
+        ret = 3; /* option absent: default to version 3 */
+    } else if (!strcmp(buf, "0.10")) {
+        ret = 2; /* compat level 0.10 selects version 2 */
+    } else if (!strcmp(buf, "1.1")) {
+        ret = 3; /* compat level 1.1 selects version 3 */
+    } else {
+        error_setg(errp, "Invalid compatibility level: '%s'", buf);
+        ret = -EINVAL; /* unrecognized string; *errp is set */
+    }
+    g_free(buf); /* buf is released here on every path */
+    return ret;
+}
+
+static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
+                                                Error **errp)
+{
+    uint64_t refcount_bits;
+
+    refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);
+    if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
+        error_setg(errp, "Refcount width must be a power of two and may not "
+                   "exceed 64 bits");
+        return 0; /* failure: *errp is set */
+    }
+
+    if (version < 3 && refcount_bits != 16) {
+        error_setg(errp, "Different refcount widths than 16 bits require "
+                   "compatibility level 1.1 or above (use compat=1.1 or "
+                   "greater)");
+        return 0; /* failure: *errp is set */
      }
  
+    return refcount_bits; /* width in bits, accepted by both checks above */
+}
+
+static int qcow2_create2(const char *filename, int64_t total_size,
+                         const char *backing_file, const char *backing_format,
+                         int flags, size_t cluster_size, PreallocMode prealloc,
+                         QemuOpts *opts, int version, int refcount_order,
+                         const char *encryptfmt, Error **errp)
+{
+    QDict *options;
+
      /*
       * Open the image file and write a minimal qcow2 header.
       *
@@ -2652,7 +2716,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
          int64_t prealloc_size =
              qcow2_calc_prealloc_size(total_size, cluster_size, refcount_order);
          qemu_opt_set_number(opts, BLOCK_OPT_SIZE, prealloc_size, &error_abort);
-        qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_lookup[prealloc],
+        qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_str(prealloc),
                       &error_abort);
      }
  
@@ -2678,7 +2742,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
      *header = (QCowHeader) {
          .magic                      = cpu_to_be32(QCOW_MAGIC),
          .version                    = cpu_to_be32(version),
-        .cluster_bits               = cpu_to_be32(cluster_bits),
+        .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
          .size                       = cpu_to_be64(0),
          .l1_table_offset            = cpu_to_be64(0),
          .l1_size                    = cpu_to_be32(0),
@@ -2752,7 +2816,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
      }
  
      /* Okay, now that we have a valid image, let's give it the right size */
-    ret = blk_truncate(blk, total_size, errp);
+    ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
      if (ret < 0) {
          error_prepend(errp, "Could not resize image: ");
          goto out;
@@ -2778,10 +2842,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
  
      /* And if we're supposed to preallocate metadata, do that now */
      if (prealloc != PREALLOC_MODE_OFF) {
-        BDRVQcow2State *s = blk_bs(blk)->opaque;
-        qemu_co_mutex_lock(&s->lock);
-        ret = preallocate(blk_bs(blk));
-        qemu_co_mutex_unlock(&s->lock);
+        ret = preallocate(blk_bs(blk), 0, total_size);
          if (ret < 0) {
              error_setg_errno(errp, -ret, "Could not preallocate metadata");
              goto out;
@@ -2825,10 +2886,10 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
      int flags = 0;
      size_t cluster_size = DEFAULT_CLUSTER_SIZE;
      PreallocMode prealloc;
-    int version = 3;
-    uint64_t refcount_bits = 16;
+    int version;
+    uint64_t refcount_bits;
      int refcount_order;
-    const char *encryptfmt = NULL;
+    char *encryptfmt = NULL;
      Error *local_err = NULL;
      int ret;
  
@@ -2839,36 +2900,33 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
      backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
      encryptfmt = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT);
      if (encryptfmt) {
-        if (qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT)) {
+        if (qemu_opt_get(opts, BLOCK_OPT_ENCRYPT)) {
              error_setg(errp, "Options " BLOCK_OPT_ENCRYPT " and "
                         BLOCK_OPT_ENCRYPT_FORMAT " are mutually exclusive");
              ret = -EINVAL;
              goto finish;
          }
      } else if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
-        encryptfmt = "aes";
+        encryptfmt = g_strdup("aes");
+    }
+    cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        ret = -EINVAL;
+        goto finish;
      }
-    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
-                                         DEFAULT_CLUSTER_SIZE);
      buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
-    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
-                               PREALLOC_MODE__MAX, PREALLOC_MODE_OFF,
-                               &local_err);
+    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
+                               PREALLOC_MODE_OFF, &local_err);
      if (local_err) {
          error_propagate(errp, local_err);
          ret = -EINVAL;
          goto finish;
      }
-    g_free(buf);
-    buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
-    if (!buf) {
-        /* keep the default */
-    } else if (!strcmp(buf, "0.10")) {
-        version = 2;
-    } else if (!strcmp(buf, "1.1")) {
-        version = 3;
-    } else {
-        error_setg(errp, "Invalid compatibility level: '%s'", buf);
+
+    version = qcow2_opt_get_version_del(opts, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
          ret = -EINVAL;
          goto finish;
      }
@@ -2891,19 +2949,9 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
          goto finish;
      }
  
-    refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS,
-                                            refcount_bits);
-    if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
-        error_setg(errp, "Refcount width must be a power of two and may not "
-                   "exceed 64 bits");
-        ret = -EINVAL;
-        goto finish;
-    }
-
-    if (version < 3 && refcount_bits != 16) {
-        error_setg(errp, "Different refcount widths than 16 bits require "
-                   "compatibility level 1.1 or above (use compat=1.1 or "
-                   "greater)");
+    refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
          ret = -EINVAL;
          goto finish;
      }
@@ -2918,28 +2966,33 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
  finish:
      g_free(backing_file);
      g_free(backing_fmt);
+    g_free(encryptfmt);
      g_free(buf);
      return ret;
  }
  
  
-static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
-                            uint32_t count)
+static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
  {
-    int nr;
-    BlockDriverState *file;
-    int64_t res;
+    int64_t nr;
+    int res;
+    int64_t start;
+
+    /* TODO: Widening to sector boundaries should only be needed as
+     * long as we can't query finer granularity. */
+    start = QEMU_ALIGN_DOWN(offset, BDRV_SECTOR_SIZE);
+    bytes = QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE) - start;
  
-    if (start + count > bs->total_sectors) {
-        count = bs->total_sectors - start;
+    /* Clamp to image length, before checking status of underlying sectors */
+    if (start + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
+        bytes = bs->total_sectors * BDRV_SECTOR_SIZE - start;
      }
  
-    if (!count) {
+    if (!bytes) { /* nothing left to query: an empty range counts as zero */
          return true;
      }
-    res = bdrv_get_block_status_above(bs, NULL, start, count,
-                                      &nr, &file);
-    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
+    res = bdrv_block_status_above(bs, NULL, start, bytes, &nr, NULL, NULL);
+    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes; /* all of it */
  }
  
  static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
@@ -2957,24 +3010,21 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
      }
  
      if (head || tail) {
-        int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS;
          uint64_t off;
          unsigned int nr;
  
          assert(head + bytes <= s->cluster_size);
  
          /* check whether remainder of cluster already reads as zero */
-        if (!(is_zero_sectors(bs, cl_start,
-                              DIV_ROUND_UP(head, BDRV_SECTOR_SIZE)) &&
-              is_zero_sectors(bs, (offset + bytes) >> BDRV_SECTOR_BITS,
-                              DIV_ROUND_UP(-tail & (s->cluster_size - 1),
-                                           BDRV_SECTOR_SIZE)))) {
+        if (!(is_zero(bs, offset - head, head) &&
+              is_zero(bs, offset + bytes,
+                      tail ? s->cluster_size - tail : 0))) {
              return -ENOTSUP;
          }
  
          qemu_co_mutex_lock(&s->lock);
          /* We can have new write after previous check */
-        offset = cl_start << BDRV_SECTOR_BITS;
+        offset = QEMU_ALIGN_DOWN(offset, s->cluster_size);
          bytes = s->cluster_size;
          nr = s->cluster_size;
          ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
@@ -3020,12 +3070,22 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
      return ret;
  }
  
-static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
+static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
+                          PreallocMode prealloc, Error **errp)
  {
      BDRVQcow2State *s = bs->opaque;
+    uint64_t old_length;
      int64_t new_l1_size;
      int ret;
  
+    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
+        prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
+    {
+        error_setg(errp, "Unsupported preallocation mode '%s'",
+                   PreallocMode_str(prealloc));
+        return -ENOTSUP;
+    }
+
      if (offset & 511) {
          error_setg(errp, "The new size must be a multiple of 512");
          return -EINVAL;
@@ -3044,17 +3104,189 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
          return -ENOTSUP;
      }
  
-    /* shrinking is currently not supported */
-    if (offset < bs->total_sectors * 512) {
-        error_setg(errp, "qcow2 doesn't support shrinking images yet");
-        return -ENOTSUP;
+    old_length = bs->total_sectors * 512;
+    new_l1_size = size_to_l1(s, offset);
+
+    if (offset < old_length) {
+        int64_t last_cluster, old_file_size;
+        if (prealloc != PREALLOC_MODE_OFF) {
+            error_setg(errp,
+                       "Preallocation can't be used for shrinking an image");
+            return -EINVAL;
+        }
+
+        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
+                                    old_length - ROUND_UP(offset,
+                                                          s->cluster_size),
+                                    QCOW2_DISCARD_ALWAYS, true);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
+            return ret;
+        }
+
+        ret = qcow2_shrink_l1_table(bs, new_l1_size);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret,
+                             "Failed to reduce the number of L2 tables");
+            return ret;
+        }
+
+        ret = qcow2_shrink_reftable(bs);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret,
+                             "Failed to discard unused refblocks");
+            return ret;
+        }
+
+        old_file_size = bdrv_getlength(bs->file->bs);
+        if (old_file_size < 0) {
+            error_setg_errno(errp, -old_file_size,
+                             "Failed to inquire current file length");
+            return old_file_size;
+        }
+        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
+        if (last_cluster < 0) {
+            error_setg_errno(errp, -last_cluster,
+                             "Failed to find the last cluster");
+            return last_cluster;
+        }
+        if ((last_cluster + 1) * s->cluster_size < old_file_size) {
+            ret = bdrv_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
+                                PREALLOC_MODE_OFF, NULL);
+            if (ret < 0) {
+                warn_report("Failed to truncate the tail of the image: %s",
+                            strerror(-ret));
+                ret = 0;
+            }
+        }
+    } else {
+        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
+            return ret;
+        }
      }
  
-    new_l1_size = size_to_l1(s, offset);
-    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
-        return ret;
+    switch (prealloc) {
+    case PREALLOC_MODE_OFF:
+        break;
+
+    case PREALLOC_MODE_METADATA:
+        ret = preallocate(bs, old_length, offset);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Preallocation failed");
+            return ret;
+        }
+        break;
+
+    case PREALLOC_MODE_FALLOC:
+    case PREALLOC_MODE_FULL:
+    {
+        int64_t allocation_start, host_offset, guest_offset;
+        int64_t clusters_allocated;
+        int64_t old_file_size, new_file_size;
+        uint64_t nb_new_data_clusters, nb_new_l2_tables;
+
+        old_file_size = bdrv_getlength(bs->file->bs);
+        if (old_file_size < 0) {
+            error_setg_errno(errp, -old_file_size,
+                             "Failed to inquire current file length");
+            return old_file_size;
+        }
+
+        nb_new_data_clusters = DIV_ROUND_UP(offset - old_length,
+                                            s->cluster_size);
+
+        /* This is an overestimation; we will not actually allocate space for
+         * these in the file but just make sure the new refcount structures are
+         * able to cover them so we will not have to allocate new refblocks
+         * while entering the data blocks in the potentially new L2 tables.
+         * (We do not actually care where the L2 tables are placed. Maybe they
+         *  are already allocated or they can be placed somewhere before
+         *  @old_file_size. It does not matter because they will be fully
+         *  allocated automatically, so they do not need to be covered by the
+         *  preallocation. All that matters is that we will not have to allocate
+         *  new refcount structures for them.) */
+        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
+                                        s->cluster_size / sizeof(uint64_t));
+        /* The cluster range may not be aligned to L2 boundaries, so add one L2
+         * table for a potential head/tail */
+        nb_new_l2_tables++;
+
+        allocation_start = qcow2_refcount_area(bs, old_file_size,
+                                               nb_new_data_clusters +
+                                               nb_new_l2_tables,
+                                               true, 0, 0);
+        if (allocation_start < 0) {
+            error_setg_errno(errp, -allocation_start,
+                             "Failed to resize refcount structures");
+            return allocation_start;
+        }
+
+        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
+                                                     nb_new_data_clusters);
+        if (clusters_allocated < 0) {
+            error_setg_errno(errp, -clusters_allocated,
+                             "Failed to allocate data clusters");
+            return clusters_allocated; /* already negative; do not negate */
+        }
+
+        assert(clusters_allocated == nb_new_data_clusters);
+
+        /* Allocate the data area */
+        new_file_size = allocation_start +
+                        nb_new_data_clusters * s->cluster_size;
+        ret = bdrv_truncate(bs->file, new_file_size, prealloc, errp);
+        if (ret < 0) {
+            error_prepend(errp, "Failed to resize underlying file: ");
+            qcow2_free_clusters(bs, allocation_start,
+                                nb_new_data_clusters * s->cluster_size,
+                                QCOW2_DISCARD_OTHER);
+            return ret;
+        }
+
+        /* Create the necessary L2 entries */
+        host_offset = allocation_start;
+        guest_offset = old_length;
+        while (nb_new_data_clusters) {
+            int64_t guest_cluster = guest_offset >> s->cluster_bits;
+            int64_t nb_clusters = MIN(nb_new_data_clusters,
+                                      s->l2_size - guest_cluster % s->l2_size);
+            QCowL2Meta allocation = {
+                .offset       = guest_offset,
+                .alloc_offset = host_offset,
+                .nb_clusters  = nb_clusters,
+            };
+            qemu_co_queue_init(&allocation.dependent_requests);
+
+            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
+            if (ret < 0) {
+                error_setg_errno(errp, -ret, "Failed to update L2 tables");
+                qcow2_free_clusters(bs, host_offset,
+                                    nb_new_data_clusters * s->cluster_size,
+                                    QCOW2_DISCARD_OTHER);
+                return ret;
+            }
+
+            guest_offset += nb_clusters * s->cluster_size;
+            host_offset += nb_clusters * s->cluster_size;
+            nb_new_data_clusters -= nb_clusters;
+        }
+        break;
+    }
+
+    default:
+        g_assert_not_reached();
+    }
+
+    if (prealloc != PREALLOC_MODE_OFF) {
+        /* Flush metadata before actually changing the image size */
+        ret = bdrv_flush(bs);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret,
+                             "Failed to flush the preallocated area to disk");
+            return ret;
+        }
      }
  
      /* write updated header.size */
@@ -3082,13 +3314,16 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
      z_stream strm;
      int ret, out_len;
      uint8_t *buf, *out_buf;
-    uint64_t cluster_offset;
+    int64_t cluster_offset;
  
      if (bytes == 0) {
          /* align end of file to a sector boundary to ease reading with
             sector based I/Os */
          cluster_offset = bdrv_getlength(bs->file->bs);
-        return bdrv_truncate(bs->file, cluster_offset, NULL);
+        if (cluster_offset < 0) {
+            return cluster_offset;
+        }
+        return bdrv_truncate(bs->file, cluster_offset, PREALLOC_MODE_OFF, NULL);
      }
  
      buf = qemu_blockalign(bs, s->cluster_size);
@@ -3304,7 +3539,7 @@ static int make_completely_empty(BlockDriverState *bs)
      }
  
      ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size,
-                        &local_err);
+                        PREALLOC_MODE_OFF, &local_err);
      if (ret < 0) {
          error_report_err(local_err);
          goto fail;
@@ -3388,6 +3623,133 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
      return 0;
  }
  
+static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
+                                       Error **errp)
+{
+    Error *local_err = NULL;
+    BlockMeasureInfo *info; /* result, returned to the caller on success */
+    uint64_t required = 0; /* data bytes that count toward required size */
+    uint64_t virtual_size; /* disk size as seen by guest, cluster-aligned */
+    uint64_t refcount_bits;
+    uint64_t l2_tables; /* L2 tables needed to map virtual_size */
+    size_t cluster_size;
+    int version;
+    char *optstr;
+    PreallocMode prealloc;
+    bool has_backing_file;
+
+    /* Parse image creation options */
+    cluster_size = qcow2_opt_get_cluster_size_del(opts, &local_err);
+    if (local_err) {
+        goto err;
+    }
+
+    version = qcow2_opt_get_version_del(opts, &local_err);
+    if (local_err) {
+        goto err;
+    }
+
+    refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
+    if (local_err) {
+        goto err;
+    }
+
+    optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+    prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
+                               PREALLOC_MODE_OFF, &local_err);
+    g_free(optstr);
+    if (local_err) {
+        goto err;
+    }
+
+    optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
+    has_backing_file = !!optstr;
+    g_free(optstr);
+
+    virtual_size = align_offset(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                                cluster_size);
+
+    /* Reject virtual sizes whose L1 table exceeds QCOW_MAX_L1_SIZE */
+    l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
+                             cluster_size / sizeof(uint64_t));
+    if (l2_tables * sizeof(uint64_t) > QCOW_MAX_L1_SIZE) {
+        error_setg(&local_err, "The image size is too large "
+                               "(try using a larger cluster size)");
+        goto err;
+    }
+
+    /* Account for input image; its length replaces the size option */
+    if (in_bs) {
+        int64_t ssize = bdrv_getlength(in_bs);
+        if (ssize < 0) {
+            error_setg_errno(&local_err, -ssize,
+                             "Unable to get image virtual_size");
+            goto err;
+        }
+
+        virtual_size = align_offset(ssize, cluster_size);
+
+        if (has_backing_file) {
+            /* We don't know how much of the backing chain is shared by the
+             * input image and the new image file.  In the worst case the new
+             * image's backing file has nothing in common with the input image.
+             * Be conservative and assume all clusters need to be written.
+             */
+            required = virtual_size;
+        } else {
+            int64_t offset;
+            int64_t pnum = 0;
+
+            for (offset = 0; offset < ssize; offset += pnum) {
+                int ret;
+
+                ret = bdrv_block_status_above(in_bs, NULL, offset,
+                                              ssize - offset, &pnum, NULL,
+                                              NULL);
+                if (ret < 0) {
+                    error_setg_errno(&local_err, -ret,
+                                     "Unable to get block status");
+                    goto err;
+                }
+
+                if (ret & BDRV_BLOCK_ZERO) {
+                    /* Skip zero regions (safe with no backing file) */
+                } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
+                           (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
+                    /* Extend pnum to end of cluster for next iteration */
+                    pnum = ROUND_UP(offset + pnum, cluster_size) - offset;
+
+                    /* Count whole clusters touched by this range */
+                    required += offset % cluster_size + pnum;
+                }
+            }
+        }
+    }
+
+    /* Take into account preallocation.  Nothing special is needed for
+     * PREALLOC_MODE_METADATA since metadata is always counted.
+     */
+    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
+        required = virtual_size;
+    }
+
+    info = g_new(BlockMeasureInfo, 1);
+    info->fully_allocated =
+        qcow2_calc_prealloc_size(virtual_size, cluster_size,
+                                 ctz32(refcount_bits));
+
+    /* Remove data clusters that are not required.  This overestimates the
+     * required size because metadata needed for the fully allocated file is
+     * still counted.
+     */
+    info->required = info->fully_allocated - virtual_size + required;
+    return info;
+
+err:
+    error_propagate(errp, local_err);
+    return NULL;
+}
+
  static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
  {
      BDRVQcow2State *s = bs->opaque;
@@ -3462,27 +3824,6 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
      return spec_info;
  }
  
-#if 0
-static void dump_refcounts(BlockDriverState *bs)
-{
-    BDRVQcow2State *s = bs->opaque;
-    int64_t nb_clusters, k, k1, size;
-    int refcount;
-
-    size = bdrv_getlength(bs->file->bs);
-    nb_clusters = size_to_clusters(s, size);
-    for(k = 0; k < nb_clusters;) {
-        k1 = k;
-        refcount = get_refcount(bs, k);
-        k++;
-        while (k < nb_clusters && get_refcount(bs, k) == refcount)
-            k++;
-        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
-               k - k1);
-    }
-}
-#endif
-
  static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
                                int64_t pos)
  {
@@ -3814,7 +4155,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
              return ret;
          }
  
-        ret = blk_truncate(blk, new_size, &local_err);
+        ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, &local_err);
          blk_unref(blk);
          if (ret < 0) {
              error_report_err(local_err);
@@ -3986,6 +4327,7 @@ BlockDriver bdrv_qcow2 = {
      .bdrv_snapshot_delete   = qcow2_snapshot_delete,
      .bdrv_snapshot_list     = qcow2_snapshot_list,
      .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
+    .bdrv_measure           = qcow2_measure,
      .bdrv_get_info          = qcow2_get_info,
      .bdrv_get_specific_info = qcow2_get_specific_info,