Merge remote-tracking branch 'remotes/kraxel/tags/pull-vnc-3' into staging

[qemu.git] / block / qcow2-refcount.c
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c

index 6d35e49d32ded07a0ba3c40d299004a1815b0750..9507aef8471c3b3f93769302e35ef4c776d08556 100644 (file)
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -25,8 +25,10 @@
  #include "qemu-common.h"
  #include "block/block_int.h"
  #include "block/qcow2.h"
+#include "qemu/range.h"
+#include "qapi/qmp/types.h"
  
-static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
+static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                              int64_t offset, int64_t length,
                              int addend, enum qcow2_discard_type type);
@@ -38,8 +40,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
  int qcow2_refcount_init(BlockDriverState *bs)
  {
      BDRVQcowState *s = bs->opaque;
-    int ret, refcount_table_size2, i;
+    unsigned int refcount_table_size2, i;
+    int ret;
  
+    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
      refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
      s->refcount_table = g_malloc(refcount_table_size2);
      if (s->refcount_table_size > 0) {
@@ -85,7 +89,7 @@ static int load_refcount_block(BlockDriverState *bs,
  static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
  {
      BDRVQcowState *s = bs->opaque;
-    int refcount_table_index, block_index;
+    uint64_t refcount_table_index, block_index;
      int64_t refcount_block_offset;
      int ret;
      uint16_t *refcount_block;
@@ -94,7 +98,8 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
      refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
      if (refcount_table_index >= s->refcount_table_size)
          return 0;
-    refcount_block_offset = s->refcount_table[refcount_table_index];
+    refcount_block_offset =
+        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
      if (!refcount_block_offset)
          return 0;
  
@@ -189,10 +194,11 @@ static int alloc_refcount_block(BlockDriverState *bs,
       *   they can describe them themselves.
       *
       * - We need to consider that at this point we are inside update_refcounts
-     *   and doing the initial refcount increase. This means that some clusters
-     *   have already been allocated by the caller, but their refcount isn't
-     *   accurate yet. free_cluster_index tells us where this allocation ends
-     *   as long as we don't overwrite it by freeing clusters.
+     *   and potentially doing an initial refcount increase. This means that
+     *   some clusters have already been allocated by the caller, but their
+     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
+     *   need to return -EAGAIN to signal the caller that it needs to restart
+     *   the search for free clusters.
       *
       * - alloc_clusters_noref and qcow2_free_clusters may load a different
       *   refcount block into the cache
@@ -277,7 +283,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
          }
  
          s->refcount_table[refcount_table_index] = new_block;
-        return 0;
+
+        /* The new refcount block may be where the caller intended to put its
+         * data, so let it restart the search. */
+        return -EAGAIN;
      }
  
      ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block);
@@ -300,8 +309,11 @@ static int alloc_refcount_block(BlockDriverState *bs,
  
      /* Calculate the number of refcount blocks needed so far */
      uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
-    uint64_t blocks_used = (s->free_cluster_index +
-        refcount_block_clusters - 1) / refcount_block_clusters;
+    uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters);
+
+    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
+        return -EFBIG;
+    }
  
      /* And now we need at least one block more for the new metadata */
      uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
@@ -334,8 +346,6 @@ static int alloc_refcount_block(BlockDriverState *bs,
      uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size);
      uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t));
  
-    assert(meta_offset >= (s->free_cluster_index * s->cluster_size));
-
      /* Fill the new refcount table */
      memcpy(new_table, s->refcount_table,
          s->refcount_table_size * sizeof(uint64_t));
@@ -398,18 +408,19 @@ static int alloc_refcount_block(BlockDriverState *bs,
      s->refcount_table_size = table_size;
      s->refcount_table_offset = table_offset;
  
-    /* Free old table. Remember, we must not change free_cluster_index */
-    uint64_t old_free_cluster_index = s->free_cluster_index;
+    /* Free old table. */
      qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                          QCOW2_DISCARD_OTHER);
-    s->free_cluster_index = old_free_cluster_index;
  
      ret = load_refcount_block(bs, new_block, (void**) refcount_block);
      if (ret < 0) {
          return ret;
      }
  
-    return 0;
+    /* If we were trying to do the initial refcount update for some cluster
+     * allocation, we might have used the same clusters to store newly
+     * allocated metadata. Make the caller search some new space. */
+    return -EAGAIN;
  
  fail_table:
      g_free(new_table);
@@ -420,6 +431,74 @@ fail_block:
      return ret;
  }
  
+/*
+ * Drains the queue of pending discard regions kept in s->discards.
+ *
+ * Every queued region is unlinked from the queue and freed. The region is
+ * only passed on to bdrv_discard() on bs->file when @ret is non-negative;
+ * with a negative @ret the queued regions are dropped without issuing any
+ * discard.
+ */
+void qcow2_process_discards(BlockDriverState *bs, int ret)
+{
+    BDRVQcowState *s = bs->opaque;
+    Qcow2DiscardRegion *region, *next_region;
+    bool issue_discards = (ret >= 0);
+
+    QTAILQ_FOREACH_SAFE(region, &s->discards, next, next_region) {
+        QTAILQ_REMOVE(&s->discards, region, next);
+
+        if (issue_discards) {
+            /* Discarding is optional, so the result is deliberately not
+             * checked */
+            bdrv_discard(bs->file,
+                         region->offset >> BDRV_SECTOR_BITS,
+                         region->bytes >> BDRV_SECTOR_BITS);
+        }
+
+        g_free(region);
+    }
+}
+
+/*
+ * Records the byte range [@offset, @offset + @length) of the image file in
+ * the s->discards queue, from which qcow2_process_discards() later issues the
+ * actual discard requests.
+ *
+ * If the range directly adjoins a region that is already queued, that region
+ * is extended instead of queueing a new one. Afterwards every other queued
+ * region that adjoins the resulting region is folded into it and released.
+ *
+ * Ranges that overlap an already queued region trigger an assertion failure.
+ */
+static void update_refcount_discard(BlockDriverState *bs,
+                                    uint64_t offset, uint64_t length)
+{
+    BDRVQcowState *s = bs->opaque;
+    Qcow2DiscardRegion *d, *p, *next;
+
+    /* First try to grow an existing region that touches the new range */
+    QTAILQ_FOREACH(d, &s->discards, next) {
+        uint64_t new_start = MIN(offset, d->offset);
+        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
+
+        if (new_end - new_start <= length + d->bytes) {
+            /* There can't be any overlap, areas ending up here have no
+             * references any more and therefore shouldn't get freed another
+             * time. */
+            assert(d->bytes + length == new_end - new_start);
+            d->offset = new_start;
+            d->bytes = new_end - new_start;
+            goto found;
+        }
+    }
+
+    /* Nothing adjacent is queued yet, so append a fresh region */
+    d = g_malloc(sizeof(*d));
+    *d = (Qcow2DiscardRegion) {
+        .bs     = bs,
+        .offset = offset,
+        .bytes  = length,
+    };
+    QTAILQ_INSERT_TAIL(&s->discards, d, next);
+
+found:
+    /* Merge discard requests if they are adjacent now */
+    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
+        if (p == d
+            || p->offset > d->offset + d->bytes
+            || d->offset > p->offset + p->bytes)
+        {
+            continue;
+        }
+
+        /* Still no overlap possible */
+        assert(p->offset == d->offset + d->bytes
+            || d->offset == p->offset + p->bytes);
+
+        QTAILQ_REMOVE(&s->discards, p, next);
+        d->offset = MIN(d->offset, p->offset);
+        d->bytes += p->bytes;
+        /* p is no longer on the queue and its range now lives in d, so it
+         * has to be released here or it would be leaked */
+        g_free(p);
+    }
+}
+
  /* XXX: cache several refcount block clusters ? */
  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
      int64_t offset, int64_t length, int addend, enum qcow2_discard_type type)
@@ -445,8 +524,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
              s->l2_table_cache);
      }
  
-    start = offset & ~(s->cluster_size - 1);
-    last = (offset + length - 1) & ~(s->cluster_size - 1);
+    start = start_of_cluster(s, offset);
+    last = start_of_cluster(s, offset + length - 1);
      for(cluster_offset = start; cluster_offset <= last;
          cluster_offset += s->cluster_size)
      {
@@ -488,10 +567,18 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
              s->free_cluster_index = cluster_index;
          }
          refcount_block[block_index] = cpu_to_be16(refcount);
+
+        if (refcount == 0 && s->discard_passthrough[type]) {
+            update_refcount_discard(bs, cluster_offset, s->cluster_size);
+        }
      }
  
      ret = 0;
  fail:
+    if (!s->cache_discards) {
+        qcow2_process_discards(bs, ret);
+    }
+
      /* Write last changed block to disk */
      if (refcount_block) {
          int wret;
@@ -523,10 +610,10 @@ fail:
   * If the return value is non-negative, it is the new refcount of the cluster.
   * If it is negative, it is -errno and indicates an error.
   */
-static int update_cluster_refcount(BlockDriverState *bs,
-                                   int64_t cluster_index,
-                                   int addend,
-                                   enum qcow2_discard_type type)
+int qcow2_update_cluster_refcount(BlockDriverState *bs,
+                                  int64_t cluster_index,
+                                  int addend,
+                                  enum qcow2_discard_type type)
  {
      BDRVQcowState *s = bs->opaque;
      int ret;
@@ -548,15 +635,16 @@ static int update_cluster_refcount(BlockDriverState *bs,
  
  
  /* return < 0 if error */
-static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
+static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
  {
      BDRVQcowState *s = bs->opaque;
-    int i, nb_clusters, refcount;
+    uint64_t i, nb_clusters;
+    int refcount;
  
      nb_clusters = size_to_clusters(s, size);
  retry:
      for(i = 0; i < nb_clusters; i++) {
-        int64_t next_cluster_index = s->free_cluster_index++;
+        uint64_t next_cluster_index = s->free_cluster_index++;
          refcount = get_refcount(bs, next_cluster_index);
  
          if (refcount < 0) {
@@ -565,6 +653,15 @@ retry:
              goto retry;
          }
      }
+
+    /* Make sure that all offsets in the "allocated" range are representable
+     * in an int64_t */
+    if (s->free_cluster_index > 0 &&
+        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
+    {
+        return -EFBIG;
+    }
+
  #ifdef DEBUG_ALLOC2
      fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
              size,
@@ -573,18 +670,21 @@ retry:
      return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
  }
  
-int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
  {
      int64_t offset;
      int ret;
  
      BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
-    offset = alloc_clusters_noref(bs, size);
-    if (offset < 0) {
-        return offset;
-    }
+    do {
+        offset = alloc_clusters_noref(bs, size);
+        if (offset < 0) {
+            return offset;
+        }
+
+        ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+    } while (ret == -EAGAIN);
  
-    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
      if (ret < 0) {
          return ret;
      }
@@ -597,33 +697,36 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
  {
      BDRVQcowState *s = bs->opaque;
      uint64_t cluster_index;
-    uint64_t old_free_cluster_index;
-    int i, refcount, ret;
+    uint64_t i;
+    int refcount, ret;
  
-    /* Check how many clusters there are free */
-    cluster_index = offset >> s->cluster_bits;
-    for(i = 0; i < nb_clusters; i++) {
-        refcount = get_refcount(bs, cluster_index++);
+    assert(nb_clusters >= 0);
+    if (nb_clusters == 0) {
+        return 0;
+    }
  
-        if (refcount < 0) {
-            return refcount;
-        } else if (refcount != 0) {
-            break;
+    do {
+        /* Check how many clusters there are free */
+        cluster_index = offset >> s->cluster_bits;
+        for(i = 0; i < nb_clusters; i++) {
+            refcount = get_refcount(bs, cluster_index++);
+
+            if (refcount < 0) {
+                return refcount;
+            } else if (refcount != 0) {
+                break;
+            }
          }
-    }
  
-    /* And then allocate them */
-    old_free_cluster_index = s->free_cluster_index;
-    s->free_cluster_index = cluster_index + i;
+        /* And then allocate them */
+        ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
+                              QCOW2_DISCARD_NEVER);
+    } while (ret == -EAGAIN);
  
-    ret = update_refcount(bs, offset, i << s->cluster_bits, 1,
-                          QCOW2_DISCARD_NEVER);
      if (ret < 0) {
          return ret;
      }
  
-    s->free_cluster_index = old_free_cluster_index;
-
      return i;
  }
  
@@ -646,7 +749,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
      }
   redo:
      free_in_cluster = s->cluster_size -
-        (s->free_byte_offset & (s->cluster_size - 1));
+        offset_into_cluster(s, s->free_byte_offset);
      if (size <= free_in_cluster) {
          /* enough space in current cluster */
          offset = s->free_byte_offset;
@@ -654,20 +757,20 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
          free_in_cluster -= size;
          if (free_in_cluster == 0)
              s->free_byte_offset = 0;
-        if ((offset & (s->cluster_size - 1)) != 0)
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
-                                    QCOW2_DISCARD_NEVER);
+        if (offset_into_cluster(s, offset) != 0)
+            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+                                          QCOW2_DISCARD_NEVER);
      } else {
          offset = qcow2_alloc_clusters(bs, s->cluster_size);
          if (offset < 0) {
              return offset;
          }
-        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
+        cluster_offset = start_of_cluster(s, s->free_byte_offset);
          if ((cluster_offset + s->cluster_size) == offset) {
              /* we are lucky: contiguous data */
              offset = s->free_byte_offset;
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
-                                    QCOW2_DISCARD_NEVER);
+            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+                                          QCOW2_DISCARD_NEVER);
              s->free_byte_offset += size;
          } else {
              s->free_byte_offset = offset;
@@ -676,8 +779,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
      }
  
      /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
-     * or explicitly by update_cluster_refcount().  Refcount blocks must be
-     * flushed before the caller's L2 table updates.
+     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
+     * be flushed before the caller's L2 table updates.
       */
      qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
      return offset;
@@ -718,11 +821,13 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
          }
          break;
      case QCOW2_CLUSTER_NORMAL:
-        qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
-                            nb_clusters << s->cluster_bits, type);
+    case QCOW2_CLUSTER_ZERO:
+        if (l2_entry & L2E_OFFSET_MASK) {
+            qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
+                                nb_clusters << s->cluster_bits, type);
+        }
          break;
      case QCOW2_CLUSTER_UNALLOCATED:
-    case QCOW2_CLUSTER_ZERO:
          break;
      default:
          abort();
@@ -750,6 +855,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
      l1_table = NULL;
      l1_size2 = l1_size * sizeof(uint64_t);
  
+    s->cache_discards = true;
+
      /* WARNING: qcow2_snapshot_goto relies on this function not using the
       * l1_table_offset when it is the current s->l1_table_offset! Be careful
       * when changing this! */
@@ -783,15 +890,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
              }
  
              for(j = 0; j < s->l2_size; j++) {
+                uint64_t cluster_index;
+
                  offset = be64_to_cpu(l2_table[j]);
-                if (offset != 0) {
-                    old_offset = offset;
-                    offset &= ~QCOW_OFLAG_COPIED;
-                    if (offset & QCOW_OFLAG_COMPRESSED) {
+                old_offset = offset;
+                offset &= ~QCOW_OFLAG_COPIED;
+
+                switch (qcow2_get_cluster_type(offset)) {
+                    case QCOW2_CLUSTER_COMPRESSED:
                          nb_csectors = ((offset >> s->csize_shift) &
                                         s->csize_mask) + 1;
                          if (addend != 0) {
-                            int ret;
                              ret = update_refcount(bs,
                                  (offset & s->cluster_offset_mask) & ~511,
                                  nb_csectors * 512, addend,
@@ -802,11 +911,20 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                          }
                          /* compressed clusters are never modified */
                          refcount = 2;
-                    } else {
-                        uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
+                        break;
+
+                    case QCOW2_CLUSTER_NORMAL:
+                    case QCOW2_CLUSTER_ZERO:
+                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
+                        if (!cluster_index) {
+                            /* unallocated */
+                            refcount = 0;
+                            break;
+                        }
                          if (addend != 0) {
-                            refcount = update_cluster_refcount(bs, cluster_index, addend,
-                                                               QCOW2_DISCARD_SNAPSHOT);
+                            refcount = qcow2_update_cluster_refcount(bs,
+                                    cluster_index, addend,
+                                    QCOW2_DISCARD_SNAPSHOT);
                          } else {
                              refcount = get_refcount(bs, cluster_index);
                          }
@@ -815,19 +933,26 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                              ret = refcount;
                              goto fail;
                          }
-                    }
+                        break;
  
-                    if (refcount == 1) {
-                        offset |= QCOW_OFLAG_COPIED;
-                    }
-                    if (offset != old_offset) {
-                        if (addend > 0) {
-                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
-                                s->refcount_block_cache);
-                        }
-                        l2_table[j] = cpu_to_be64(offset);
-                        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+                    case QCOW2_CLUSTER_UNALLOCATED:
+                        refcount = 0;
+                        break;
+
+                    default:
+                        abort();
+                }
+
+                if (refcount == 1) {
+                    offset |= QCOW_OFLAG_COPIED;
+                }
+                if (offset != old_offset) {
+                    if (addend > 0) {
+                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
+                            s->refcount_block_cache);
                      }
+                    l2_table[j] = cpu_to_be64(offset);
+                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
                  }
              }
  
@@ -838,8 +963,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
  
  
              if (addend != 0) {
-                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend,
-                                                   QCOW2_DISCARD_SNAPSHOT);
+                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
+                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
              } else {
                  refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
              }
@@ -862,6 +987,9 @@ fail:
          qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
      }
  
+    s->cache_discards = false;
+    qcow2_process_discards(bs, ret);
+
      /* Update L1 only if it isn't deleted anyway (addend = -1) */
      if (ret == 0 && addend >= 0 && l1_modified) {
          for (i = 0; i < l1_size; i++) {
@@ -901,22 +1029,17 @@ static void inc_refcounts(BlockDriverState *bs,
                            int64_t offset, int64_t size)
  {
      BDRVQcowState *s = bs->opaque;
-    int64_t start, last, cluster_offset;
-    int k;
+    uint64_t start, last, cluster_offset, k;
  
      if (size <= 0)
          return;
  
-    start = offset & ~(s->cluster_size - 1);
-    last = (offset + size - 1) & ~(s->cluster_size - 1);
+    start = start_of_cluster(s, offset);
+    last = start_of_cluster(s, offset + size - 1);
      for(cluster_offset = start; cluster_offset <= last;
          cluster_offset += s->cluster_size) {
          k = cluster_offset >> s->cluster_bits;
-        if (k < 0) {
-            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
-                cluster_offset);
-            res->corruptions++;
-        } else if (k >= refcount_table_size) {
+        if (k >= refcount_table_size) {
              fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
                  "the end of the image file, can't properly check refcounts.\n",
                  cluster_offset);
@@ -933,7 +1056,6 @@ static void inc_refcounts(BlockDriverState *bs,
  
  /* Flags for check_refcounts_l1() and check_refcounts_l2() */
  enum {
-    CHECK_OFLAG_COPIED = 0x1,   /* check QCOW_OFLAG_COPIED matches refcount */
      CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
  };
  
@@ -952,7 +1074,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
      BDRVQcowState *s = bs->opaque;
      uint64_t *l2_table, l2_entry;
      uint64_t next_contiguous_offset = 0;
-    int i, l2_size, nb_csectors, refcount;
+    int i, l2_size, nb_csectors;
  
      /* Read L2 table from disk */
      l2_size = s->l2_size * sizeof(uint64_t);
@@ -1004,23 +1126,8 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
  
          case QCOW2_CLUSTER_NORMAL:
          {
-            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
              uint64_t offset = l2_entry & L2E_OFFSET_MASK;
  
-            if (flags & CHECK_OFLAG_COPIED) {
-                refcount = get_refcount(bs, offset >> s->cluster_bits);
-                if (refcount < 0) {
-                    fprintf(stderr, "Can't get refcount for offset %"
-                        PRIx64 ": %s\n", l2_entry, strerror(-refcount));
-                    goto fail;
-                }
-                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
-                    fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
-                        PRIx64 " refcount=%d\n", l2_entry, refcount);
-                    res->corruptions++;
-                }
-            }
-
              if (flags & CHECK_FRAG_INFO) {
                  res->bfi.allocated_clusters++;
                  if (next_contiguous_offset &&
@@ -1035,7 +1142,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                  offset, s->cluster_size);
  
              /* Correct offsets are cluster aligned */
-            if (offset & (s->cluster_size - 1)) {
+            if (offset_into_cluster(s, offset)) {
                  fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
                      "properly aligned; L2 entry corrupted.\n", offset);
                  res->corruptions++;
@@ -1077,7 +1184,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
  {
      BDRVQcowState *s = bs->opaque;
      uint64_t *l1_table, l2_offset, l1_size2;
-    int i, refcount, ret;
+    int i, ret;
  
      l1_size2 = l1_size * sizeof(uint64_t);
  
@@ -1101,29 +1208,13 @@ static int check_refcounts_l1(BlockDriverState *bs,
      for(i = 0; i < l1_size; i++) {
          l2_offset = l1_table[i];
          if (l2_offset) {
-            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
-            if (flags & CHECK_OFLAG_COPIED) {
-                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
-                    >> s->cluster_bits);
-                if (refcount < 0) {
-                    fprintf(stderr, "Can't get refcount for l2_offset %"
-                        PRIx64 ": %s\n", l2_offset, strerror(-refcount));
-                    goto fail;
-                }
-                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
-                    fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
-                        " refcount=%d\n", l2_offset, refcount);
-                    res->corruptions++;
-                }
-            }
-
              /* Mark L2 table as used */
              l2_offset &= L1E_OFFSET_MASK;
              inc_refcounts(bs, res, refcount_table, refcount_table_size,
                  l2_offset, s->cluster_size);
  
              /* L2 tables are cluster aligned */
-            if (l2_offset & (s->cluster_size - 1)) {
+            if (offset_into_cluster(s, l2_offset)) {
                  fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
                      "cluster aligned; L1 entry corrupted\n", l2_offset);
                  res->corruptions++;
@@ -1147,6 +1238,240 @@ fail:
      return -EIO;
  }
  
+/*
+ * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
+ *
+ * For every L1 entry with a non-zero L2 table offset, and for every L2 entry
+ * of type QCOW2_CLUSTER_NORMAL (or QCOW2_CLUSTER_ZERO with a non-zero data
+ * offset), QCOW_OFLAG_COPIED is expected to be set if and only if
+ * get_refcount() reports a refcount of exactly 1 for the referenced cluster.
+ *
+ * Mismatches are reported on stderr. If BDRV_FIX_ERRORS is set in @fix, the
+ * flag is corrected and res->corruptions_fixed is incremented: L1 entries are
+ * written back with qcow2_write_l1_entry(), modified L2 tables are written
+ * back as a whole cluster with bdrv_pwrite() after passing
+ * qcow2_pre_write_overlap_check(). Without BDRV_FIX_ERRORS only
+ * res->corruptions is incremented.
+ *
+ * This function does not print an error message nor does it increment
+ * check_errors if get_refcount fails (this is because such an error will have
+ * been already detected and sufficiently signaled by the calling function
+ * (qcow2_check_refcounts) by the time this function is called).
+ *
+ * Returns 0 on success. If reading an L2 table or writing back an L1 entry or
+ * L2 table fails, res->check_errors is incremented and the negative value
+ * returned by the failing helper is passed on.
+ */
+static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
+                              BdrvCheckMode fix)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
+    int ret;
+    int refcount;
+    int i, j;
+
+    for (i = 0; i < s->l1_size; i++) {
+        uint64_t l1_entry = s->l1_table[i];
+        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
+        bool l2_dirty = false;
+
+        if (!l2_offset) {
+            continue;
+        }
+
+        refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+        if (refcount < 0) {
+            /* don't print message nor increment check_errors; note that the
+             * L2 table behind this L1 entry is not examined either */
+            continue;
+        }
+        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
+            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
+                    "l1_entry=%" PRIx64 " refcount=%d\n",
+                    fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                            "ERROR",
+                    i, l1_entry, refcount);
+            if (fix & BDRV_FIX_ERRORS) {
+                s->l1_table[i] = refcount == 1
+                               ? l1_entry |  QCOW_OFLAG_COPIED
+                               : l1_entry & ~QCOW_OFLAG_COPIED;
+                ret = qcow2_write_l1_entry(bs, i);
+                if (ret < 0) {
+                    res->check_errors++;
+                    goto fail;
+                }
+                res->corruptions_fixed++;
+            } else {
+                res->corruptions++;
+            }
+        }
+
+        ret = bdrv_pread(bs->file, l2_offset, l2_table,
+                         s->l2_size * sizeof(uint64_t));
+        if (ret < 0) {
+            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
+                    strerror(-ret));
+            res->check_errors++;
+            goto fail;
+        }
+
+        for (j = 0; j < s->l2_size; j++) {
+            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
+            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
+            int cluster_type = qcow2_get_cluster_type(l2_entry);
+
+            if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
+                ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
+                refcount = get_refcount(bs, data_offset >> s->cluster_bits);
+                if (refcount < 0) {
+                    /* don't print message nor increment check_errors; only
+                     * this one L2 entry is skipped */
+                    continue;
+                }
+                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
+                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
+                            "l2_entry=%" PRIx64 " refcount=%d\n",
+                            fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                                    "ERROR",
+                            l2_entry, refcount);
+                    if (fix & BDRV_FIX_ERRORS) {
+                        l2_table[j] = cpu_to_be64(refcount == 1
+                                    ? l2_entry |  QCOW_OFLAG_COPIED
+                                    : l2_entry & ~QCOW_OFLAG_COPIED);
+                        l2_dirty = true;
+                        res->corruptions_fixed++;
+                    } else {
+                        res->corruptions++;
+                    }
+                }
+            }
+        }
+
+        if (l2_dirty) {
+            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
+                                                l2_offset, s->cluster_size);
+            if (ret < 0) {
+                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
+                        "overlap check failed: %s\n", strerror(-ret));
+                res->check_errors++;
+                goto fail;
+            }
+
+            ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size);
+            if (ret < 0) {
+                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
+                        strerror(-ret));
+                res->check_errors++;
+                goto fail;
+            }
+        }
+    }
+
+    ret = 0;
+
+fail:
+    qemu_vfree(l2_table);
+    return ret;
+}
+
+/*
+ * Writes the 512-byte sector of the refcount table that contains the entry
+ * @rt_index to the image file.
+ *
+ * All entries of that sector are taken from s->refcount_table, converted with
+ * cpu_to_be64() and written synchronously after the metadata overlap check
+ * has passed.
+ *
+ * Returns 0 on success, or the negative value reported by the overlap check
+ * or by the write.
+ */
+#define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
+static int write_reftable_entry(BlockDriverState *bs, int rt_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t sector_buf[RT_ENTRIES_PER_SECTOR];
+    /* Round the index down to the first entry of its sector */
+    int first_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
+    int k, ret;
+
+    for (k = 0; k < RT_ENTRIES_PER_SECTOR; k++) {
+        sector_buf[k] = cpu_to_be64(s->refcount_table[first_index + k]);
+    }
+
+    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE,
+            s->refcount_table_offset + first_index * sizeof(uint64_t),
+            sizeof(sector_buf));
+    if (ret < 0) {
+        return ret;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
+    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
+            first_index * sizeof(uint64_t), sector_buf, sizeof(sector_buf));
+
+    /* Any non-negative result of the write is reported as plain success */
+    return ret < 0 ? ret : 0;
+}
+
+/*
+ * Allocates a new cluster for the given refcount block (represented by its
+ * offset in the image file) and copies the current content there. This function
+ * does _not_ decrement the reference count for the currently occupied cluster.
+ *
+ * @reftable_index is the index into s->refcount_table whose entry is replaced
+ * by the offset of the new cluster; the sector containing that entry is then
+ * written to the image with write_reftable_entry().
+ *
+ * This function prints an informative message to stderr on error (and returns
+ * -errno); on success, the offset of the newly allocated cluster is returned.
+ *
+ * When fetching the old block, writing the new block or updating the refcount
+ * table fails, the newly allocated cluster is freed again. It is not freed
+ * when the metadata overlap check fails.
+ *
+ * NOTE(review): if write_reftable_entry() fails, s->refcount_table[] has
+ * already been changed to the new offset in memory and is not restored before
+ * the new cluster is freed -- confirm that callers can cope with this.
+ */
+static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index,
+                                      uint64_t offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t new_offset = 0;
+    void *refcount_block = NULL;
+    int ret;
+
+    /* allocate new refcount block */
+    new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
+    if (new_offset < 0) {
+        fprintf(stderr, "Could not allocate new cluster: %s\n",
+                strerror(-new_offset));
+        ret = new_offset;
+        goto done;
+    }
+
+    /* fetch current refcount block content (through the refcount block cache;
+     * the reference is dropped again at the "done" label) */
+    ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block);
+    if (ret < 0) {
+        fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret));
+        goto fail_free_cluster;
+    }
+
+    /* new block has not yet been entered into refcount table, therefore it is
+     * no refcount block yet (regarding this check) */
+    ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size);
+    if (ret < 0) {
+        fprintf(stderr, "Could not write refcount block; metadata overlap "
+                "check failed: %s\n", strerror(-ret));
+        /* the image will be marked corrupt, so don't even attempt on freeing
+         * the cluster */
+        goto done;
+    }
+
+    /* write to new block */
+    ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block,
+            s->cluster_sectors);
+    if (ret < 0) {
+        fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret));
+        goto fail_free_cluster;
+    }
+
+    /* update refcount table: first the in-memory entry, then the sector of
+     * the on-disk table that contains it */
+    assert(!offset_into_cluster(s, new_offset));
+    s->refcount_table[reftable_index] = new_offset;
+    ret = write_reftable_entry(bs, reftable_index);
+    if (ret < 0) {
+        fprintf(stderr, "Could not update refcount table: %s\n",
+                strerror(-ret));
+        goto fail_free_cluster;
+    }
+
+    goto done;
+
+fail_free_cluster:
+    qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER);
+
+done:
+    if (refcount_block) {
+        /* This should never fail, as it would only do so if the given refcount
+         * block cannot be found in the cache. As this is impossible as long as
+         * there are no bugs, assert the success. */
+        int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
+        assert(tmp == 0);
+    }
+
+    if (ret < 0) {
+        return ret;
+    }
+
+    return new_offset;
+}
+
  /*
   * Checks an image for refcount consistency.
   *
@@ -1157,14 +1482,24 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                            BdrvCheckMode fix)
  {
      BDRVQcowState *s = bs->opaque;
-    int64_t size, i, highest_cluster;
-    int nb_clusters, refcount1, refcount2;
+    int64_t size, i, highest_cluster, nb_clusters;
+    int refcount1, refcount2;
      QCowSnapshot *sn;
      uint16_t *refcount_table;
      int ret;
  
      size = bdrv_getlength(bs->file);
+    if (size < 0) {
+        res->check_errors++;
+        return size;
+    }
+
      nb_clusters = size_to_clusters(s, size);
+    if (nb_clusters > INT_MAX) {
+        res->check_errors++;
+        return -EFBIG;
+    }
+
      refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t));
  
      res->bfi.total_clusters =
@@ -1176,8 +1511,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
  
      /* current L1 table */
      ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
-                             s->l1_table_offset, s->l1_size,
-                             CHECK_OFLAG_COPIED | CHECK_FRAG_INFO);
+                             s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
      if (ret < 0) {
          goto fail;
      }
@@ -1205,7 +1539,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
          cluster = offset >> s->cluster_bits;
  
          /* Refcount blocks are cluster aligned */
-        if (offset & (s->cluster_size - 1)) {
+        if (offset_into_cluster(s, offset)) {
              fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                  "cluster aligned; refcount table entry corrupted\n", i);
              res->corruptions++;
@@ -1223,10 +1557,39 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
              inc_refcounts(bs, res, refcount_table, nb_clusters,
                  offset, s->cluster_size);
              if (refcount_table[cluster] != 1) {
-                fprintf(stderr, "ERROR refcount block %" PRId64
+                fprintf(stderr, "%s refcount block %" PRId64
                      " refcount=%d\n",
+                    fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                            "ERROR",
                      i, refcount_table[cluster]);
-                res->corruptions++;
+
+                if (fix & BDRV_FIX_ERRORS) {
+                    int64_t new_offset;
+
+                    new_offset = realloc_refcount_block(bs, i, offset);
+                    if (new_offset < 0) {
+                        res->corruptions++;
+                        continue;
+                    }
+
+                    /* update refcounts */
+                    if ((new_offset >> s->cluster_bits) >= nb_clusters) {
+                        /* increase refcount_table size if necessary */
+                        int old_nb_clusters = nb_clusters;
+                        nb_clusters = (new_offset >> s->cluster_bits) + 1;
+                        refcount_table = g_realloc(refcount_table,
+                                nb_clusters * sizeof(uint16_t));
+                        memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
+                                - old_nb_clusters) * sizeof(uint16_t));
+                    }
+                    refcount_table[cluster]--;
+                    inc_refcounts(bs, res, refcount_table, nb_clusters,
+                            new_offset, s->cluster_size);
+
+                    res->corruptions_fixed++;
+                } else {
+                    res->corruptions++;
+                }
              }
          }
      }
@@ -1282,6 +1645,12 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
          }
      }
  
+    /* check OFLAG_COPIED */
+    ret = check_oflag_copied(bs, res, fix);
+    if (ret < 0) {
+        goto fail;
+    }
+
      res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
      ret = 0;
  
@@ -1291,3 +1660,173 @@ fail:
      return ret;
  }
  
+#define overlaps_with(ofs, sz) \
+    ranges_overlap(offset, size, ofs, sz) /* offset, size: caller's locals */
+
+/*
+ * Checks whether the byte range [offset, offset + size) of the image file
+ * overlaps important metadata sections (L1/L2 tables etc.), i.e. a sanity
+ * check without relying on the refcount tables. A zero size never overlaps.
+ *
+ * Only the checks set in s->overlap_check and not set in ign (a bitmask of
+ * QCow2MetadataOverlap values naming the sections to ignore) are performed.
+ * Apart from the header test, the range is widened to cluster boundaries.
+ * Returns:
+ * - 0 if writing to this range will not affect the mentioned metadata
+ * - a positive QCow2MetadataOverlap value indicating one overlapping section
+ * - a negative value (-errno) indicating an error while performing a check,
+ *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
+ */
+int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
+                                 int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int chk = s->overlap_check & ~ign; /* checks left to perform */
+    int i, j;
+
+    if (!size) {
+        return 0; /* an empty range overlaps nothing */
+    }
+
+    if (chk & QCOW2_OL_MAIN_HEADER) {
+        if (offset < s->cluster_size) { /* tested on the unaligned offset */
+            return QCOW2_OL_MAIN_HEADER;
+        }
+    }
+
+    /* widen the range to test to whole clusters for all remaining checks */
+    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
+    offset = start_of_cluster(s, offset);
+
+    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
+        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
+            return QCOW2_OL_ACTIVE_L1;
+        }
+    }
+
+    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
+        if (overlaps_with(s->refcount_table_offset,
+            s->refcount_table_size * sizeof(uint64_t))) {
+            return QCOW2_OL_REFCOUNT_TABLE;
+        }
+    }
+
+    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
+        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
+            return QCOW2_OL_SNAPSHOT_TABLE;
+        }
+    }
+
+    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
+        for (i = 0; i < s->nb_snapshots; i++) {
+            if (s->snapshots[i].l1_size &&
+                overlaps_with(s->snapshots[i].l1_table_offset,
+                s->snapshots[i].l1_size * sizeof(uint64_t))) {
+                return QCOW2_OL_INACTIVE_L1;
+            }
+        }
+    }
+
+    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
+        for (i = 0; i < s->l1_size; i++) {
+            if ((s->l1_table[i] & L1E_OFFSET_MASK) && /* skip zero offsets */
+                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
+                s->cluster_size)) {
+                return QCOW2_OL_ACTIVE_L2;
+            }
+        }
+    }
+
+    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
+        for (i = 0; i < s->refcount_table_size; i++) {
+            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
+                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
+                s->cluster_size)) {
+                return QCOW2_OL_REFCOUNT_BLOCK;
+            }
+        }
+    }
+
+    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
+        for (i = 0; i < s->nb_snapshots; i++) {
+            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
+            uint32_t l1_sz  = s->snapshots[i].l1_size;
+            uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
+            uint64_t *l1 = g_malloc(l1_sz2); /* freed on every exit path */
+            int ret;
+
+            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); /* load L1 table */
+            if (ret < 0) {
+                g_free(l1);
+                return ret;
+            }
+
+            for (j = 0; j < l1_sz; j++) {
+                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
+                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
+                    g_free(l1);
+                    return QCOW2_OL_INACTIVE_L2;
+                }
+            }
+
+            g_free(l1);
+        }
+    }
+
+    return 0;
+}
+
+static const char *metadata_ol_names[] = { /* indexed by QCOW2_OL_*_BITNR */
+    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
+    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
+    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
+    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
+    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
+    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
+    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
+    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
+};
+
+/*
+ * First performs a check for metadata overlaps (through
+ * qcow2_check_metadata_overlap); if that fails with a negative value (error
+ * while performing a check), that value is returned. If an impending overlap
+ * is detected, it is reported (stderr and a BLOCK_IMAGE_CORRUPTED event), the
+ * qcow2 file is marked corrupt, the BDS made unusable and -EIO returned.
+ *
+ * Returns 0 if there were neither overlaps nor errors while checking for
+ * overlaps; or a negative value (-errno) on error.
+ */
+int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
+                                  int64_t size)
+{
+    int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
+
+    if (ret < 0) {
+        return ret;
+    } else if (ret > 0) { /* ret names the overlapping section */
+        int metadata_ol_bitnr = ffs(ret) - 1; /* index of lowest set bit */
+        char *message;
+        QObject *data;
+
+        assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
+
+        fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps "
+                "with %s); image marked as corrupt.\n",
+                metadata_ol_names[metadata_ol_bitnr]);
+        message = g_strdup_printf("Prevented %s overwrite",
+                metadata_ol_names[metadata_ol_bitnr]);
+        data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %"
+                PRId64 ", 'size': %" PRId64 " }", bs->device_name, message,
+                offset, size);
+        monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);
+        g_free(message);
+        qobject_decref(data);
+
+        qcow2_mark_corrupt(bs);
+        bs->drv = NULL; /* make BDS unusable */
+        return -EIO;
+    }
+
+    return 0;
+}