diff --git a/block/io.c b/block/io.c
index a05ad677d35181f5f44ad42f7d701c0d535a0d12..63e3678036e8c9aae68df5e2487f5b21818f16dd 100644
--- a/block/io.c
+++ b/block/io.c
  */
 
 #include "trace.h"
-#include "sysemu/qtest.h"
+#include "sysemu/block-backend.h"
 #include "block/blockjob.h"
 #include "block/block_int.h"
+#include "block/throttle-groups.h"
+#include "qemu/error-report.h"
 
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
 
@@ -65,7 +67,7 @@ void bdrv_set_io_limits(BlockDriverState *bs,
 {
     int i;
 
-    throttle_config(&bs->throttle_state, cfg);
+    throttle_group_config(bs, cfg);
 
     for (i = 0; i < 2; i++) {
         qemu_co_enter_next(&bs->throttled_reqs[i]);
@@ -95,72 +97,33 @@ static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
 void bdrv_io_limits_disable(BlockDriverState *bs)
 {
     bs->io_limits_enabled = false;
-
     bdrv_start_throttled_reqs(bs);
-
-    throttle_destroy(&bs->throttle_state);
-}
-
-static void bdrv_throttle_read_timer_cb(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-    qemu_co_enter_next(&bs->throttled_reqs[0]);
-}
-
-static void bdrv_throttle_write_timer_cb(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-    qemu_co_enter_next(&bs->throttled_reqs[1]);
+    throttle_group_unregister_bs(bs);
 }
 
 /* should be called before bdrv_set_io_limits if a limit is set */
-void bdrv_io_limits_enable(BlockDriverState *bs)
+void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
 {
-    int clock_type = QEMU_CLOCK_REALTIME;
-
-    if (qtest_enabled()) {
-        /* For testing block IO throttling only */
-        clock_type = QEMU_CLOCK_VIRTUAL;
-    }
     assert(!bs->io_limits_enabled);
-    throttle_init(&bs->throttle_state,
-                  bdrv_get_aio_context(bs),
-                  clock_type,
-                  bdrv_throttle_read_timer_cb,
-                  bdrv_throttle_write_timer_cb,
-                  bs);
+    throttle_group_register_bs(bs, group);
     bs->io_limits_enabled = true;
 }
 
-/* This function makes an IO wait if needed
- *
- * @nb_sectors: the number of sectors of the IO
- * @is_write:   is the IO a write
- */
-static void bdrv_io_limits_intercept(BlockDriverState *bs,
-                                     unsigned int bytes,
-                                     bool is_write)
+void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
 {
-    /* does this io must wait */
-    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
-
-    /* if must wait or any request of this type throttled queue the IO */
-    if (must_wait ||
-        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
-        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+    /* this bs is not part of any group */
+    if (!bs->throttle_state) {
+        return;
     }
 
-    /* the IO will be executed, do the accounting */
-    throttle_account(&bs->throttle_state, is_write, bytes);
-
-
-    /* if the next request must wait -> do nothing */
-    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+    /* this bs is a part of the same group as the one we want */
+    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
         return;
     }
 
-    /* else queue next request for execution */
-    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+    /* need to change the group this bs belongs to */
+    bdrv_io_limits_disable(bs);
+    bdrv_io_limits_enable(bs, group);
 }
 
 void bdrv_setup_io_funcs(BlockDriver *bdrv)
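The new bdrv_io_limits_update_group() boils down to a NULL-safe name compare plus a leave/join pair. A minimal standalone sketch of the same pattern, using toy types rather than QEMU's (only g_strcmp0()/g_strdup()/g_free() are real GLib calls):

    #include <glib.h>
    #include <stdio.h>

    /* Hypothetical stand-in for a throttled device; "group" is NULL when
     * no I/O limits are set. */
    typedef struct {
        char *group;
    } Device;

    static void device_update_group(Device *d, const char *group)
    {
        if (!d->group) {
            return;                    /* not throttled: nothing to move */
        }
        if (!g_strcmp0(d->group, group)) {
            return;                    /* already in the requested group */
        }
        /* leave the old group, then join the new one */
        g_free(d->group);
        d->group = g_strdup(group);
    }

    int main(void)
    {
        Device d = { .group = g_strdup("foo") };

        device_update_group(&d, "bar");
        printf("group: %s\n", d.group);    /* prints "bar" */
        g_free(d.group);
        return 0;
    }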
@@ -194,33 +157,45 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
 
     /* Take some limits from the children as a default */
     if (bs->file) {
-        bdrv_refresh_limits(bs->file, &local_err);
+        bdrv_refresh_limits(bs->file->bs, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
             return;
         }
-        bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length;
-        bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
-        bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
+        bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length;
+        bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length;
+        bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment;
+        bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment;
+        bs->bl.max_iov = bs->file->bs->bl.max_iov;
     } else {
-        bs->bl.opt_mem_alignment = 512;
+        bs->bl.min_mem_alignment = 512;
+        bs->bl.opt_mem_alignment = getpagesize();
+
+        /* Safe default since most protocols use readv()/writev()/etc */
+        bs->bl.max_iov = IOV_MAX;
     }
 
-    if (bs->backing_hd) {
-        bdrv_refresh_limits(bs->backing_hd, &local_err);
+    if (bs->backing) {
+        bdrv_refresh_limits(bs->backing->bs, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
             return;
         }
         bs->bl.opt_transfer_length =
             MAX(bs->bl.opt_transfer_length,
-                bs->backing_hd->bl.opt_transfer_length);
+                bs->backing->bs->bl.opt_transfer_length);
         bs->bl.max_transfer_length =
             MIN_NON_ZERO(bs->bl.max_transfer_length,
-                         bs->backing_hd->bl.max_transfer_length);
+                         bs->backing->bs->bl.max_transfer_length);
         bs->bl.opt_mem_alignment =
             MAX(bs->bl.opt_mem_alignment,
-                bs->backing_hd->bl.opt_mem_alignment);
+                bs->backing->bs->bl.opt_mem_alignment);
+        bs->bl.min_mem_alignment =
+            MAX(bs->bl.min_mem_alignment,
+                bs->backing->bs->bl.min_mem_alignment);
+        bs->bl.max_iov =
+            MIN(bs->bl.max_iov,
+                bs->backing->bs->bl.max_iov);
     }
 
     /* Then let the driver override it */
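The limit-merging rules above are worth spelling out: alignments and optimal sizes combine with MAX (the strictest child wins), while hard maxima combine with MIN_NON_ZERO, where zero means "unlimited" and must not win the comparison. A small self-contained sketch with a hypothetical Limits struct (the MIN_NON_ZERO shape matches QEMU's macro):

    #include <stddef.h>
    #include <stdio.h>

    /* Same shape as QEMU's MIN_NON_ZERO: zero means "no limit" and must
     * never win the minimum. */
    #define MIN_NON_ZERO(a, b) \
        ((a) == 0 ? (b) : ((b) == 0 ? (a) : ((a) < (b) ? (a) : (b))))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    /* Hypothetical subset of BlockLimits. */
    typedef struct {
        int opt_transfer_length;       /* 0 = no preference */
        int max_transfer_length;       /* 0 = unlimited */
        size_t min_mem_alignment;
    } Limits;

    static void merge_limits(Limits *parent, const Limits *child)
    {
        /* Strictest alignment/preference wins... */
        parent->opt_transfer_length = MAX(parent->opt_transfer_length,
                                          child->opt_transfer_length);
        parent->min_mem_alignment = MAX(parent->min_mem_alignment,
                                        child->min_mem_alignment);
        /* ...but a hard maximum only shrinks, and 0 stays "unlimited"
         * only if neither side has a bound. */
        parent->max_transfer_length = MIN_NON_ZERO(parent->max_transfer_length,
                                                   child->max_transfer_length);
    }

    int main(void)
    {
        Limits p = { 0, 0, 512 };
        Limits c = { 8, 1024, 4096 };

        merge_limits(&p, &c);
        printf("opt=%d max=%d align=%zu\n", p.opt_transfer_length,
               p.max_transfer_length, p.min_mem_alignment);
        /* prints: opt=8 max=1024 align=4096 */
        return 0;
    }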
@@ -246,8 +221,10 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs)
 }
 
 /* Check if any requests are in-flight (including throttled requests) */
-static bool bdrv_requests_pending(BlockDriverState *bs)
+bool bdrv_requests_pending(BlockDriverState *bs)
 {
+    BdrvChild *child;
+
     if (!QLIST_EMPTY(&bs->tracked_requests)) {
         return true;
     }
@@ -257,40 +234,49 @@ static bool bdrv_requests_pending(BlockDriverState *bs)
     if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
         return true;
     }
-    if (bs->file && bdrv_requests_pending(bs->file)) {
-        return true;
-    }
-    if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) {
-        return true;
+
+    QLIST_FOREACH(child, &bs->children, next) {
+        if (bdrv_requests_pending(child->bs)) {
+            return true;
+        }
     }
+
     return false;
 }
 
-static bool bdrv_drain_one(BlockDriverState *bs)
+static void bdrv_drain_recurse(BlockDriverState *bs)
 {
-    bool bs_busy;
+    BdrvChild *child;
 
-    bdrv_flush_io_queue(bs);
-    bdrv_start_throttled_reqs(bs);
-    bs_busy = bdrv_requests_pending(bs);
-    bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy);
-    return bs_busy;
+    if (bs->drv && bs->drv->bdrv_drain) {
+        bs->drv->bdrv_drain(bs);
+    }
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_drain_recurse(child->bs);
+    }
 }
 
 /*
- * Wait for pending requests to complete on a single BlockDriverState subtree
- *
- * See the warning in bdrv_drain_all().  This function can only be called if
- * you are sure nothing can generate I/O because you have op blockers
- * installed.
+ * Wait for pending requests to complete on a single BlockDriverState subtree,
+ * and suspend block driver's internal I/O until next request arrives.
  *
  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
  * AioContext.
+ *
+ * Only this BlockDriverState's AioContext is run, so in-flight requests must
+ * not depend on events in other AioContexts.  In that case, use
+ * bdrv_drain_all() instead.
  */
 void bdrv_drain(BlockDriverState *bs)
 {
-    while (bdrv_drain_one(bs)) {
+    bool busy = true;
+
+    bdrv_drain_recurse(bs);
+    while (busy) {
         /* Keep iterating */
+        bdrv_flush_io_queue(bs);
+        busy = bdrv_requests_pending(bs);
+        busy |= aio_poll(bdrv_get_aio_context(bs), busy);
     }
 }
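The rewritten bdrv_drain() first lets drivers quiesce internal I/O via the new bdrv_drain callback, then busy-polls until a pass finds nothing pending. A runnable toy model of that loop shape (all names here are stand-ins, not QEMU APIs):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-ins: three requests complete, one per polling pass. */
    static int outstanding = 3;

    static void flush_io_queue(void) { /* submit anything still queued */ }

    static bool requests_pending(void) { return outstanding > 0; }

    /* In QEMU this is aio_poll(ctx, blocking); returns true on progress. */
    static bool poll_events(bool blocking)
    {
        (void)blocking;
        if (outstanding > 0) {
            outstanding--;
            return true;
        }
        return false;
    }

    /* The loop shape of the new bdrv_drain(): each pass flushes queued
     * requests, rechecks the tracker, and polls -- blocking only while
     * requests remain, so the loop cannot spin on an idle context. */
    static void drain(void)
    {
        bool busy = true;

        while (busy) {
            flush_io_queue();
            busy = requests_pending();
            busy |= poll_events(busy);
        }
    }

    int main(void)
    {
        drain();
        printf("outstanding after drain: %d\n", outstanding);   /* 0 */
        return 0;
    }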
 
@@ -299,18 +285,13 @@ void bdrv_drain(BlockDriverState *bs)
  *
  * This function does not flush data to disk, use bdrv_flush_all() for that
  * after calling this function.
- *
- * Note that completion of an asynchronous I/O operation can trigger any
- * number of other I/O operations on other devices---for example a coroutine
- * can be arbitrarily complex and a constant flow of I/O can come until the
- * coroutine is complete.  Because of this, it is not possible to have a
- * function to drain a single device's I/O queue.
  */
 void bdrv_drain_all(void)
 {
     /* Always run first iteration so any pending completion BHs run */
     bool busy = true;
     BlockDriverState *bs = NULL;
+    GSList *aio_ctxs = NULL, *ctx;
 
     while ((bs = bdrv_next(bs))) {
         AioContext *aio_context = bdrv_get_aio_context(bs);
@@ -320,17 +301,36 @@ void bdrv_drain_all(void)
             block_job_pause(bs->job);
         }
         aio_context_release(aio_context);
+
+        if (!g_slist_find(aio_ctxs, aio_context)) {
+            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
+        }
     }
 
+    /* Note that completion of an asynchronous I/O operation can trigger any
+     * number of other I/O operations on other devices---for example a
+     * coroutine can submit an I/O request to another device in response to
+     * request completion.  Therefore we must keep looping until there is no
+     * more activity rather than simply draining each device independently.
+     */
     while (busy) {
         busy = false;
-        bs = NULL;
 
-        while ((bs = bdrv_next(bs))) {
-            AioContext *aio_context = bdrv_get_aio_context(bs);
+        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
+            AioContext *aio_context = ctx->data;
+            bs = NULL;
 
             aio_context_acquire(aio_context);
-            busy |= bdrv_drain_one(bs);
+            while ((bs = bdrv_next(bs))) {
+                if (aio_context == bdrv_get_aio_context(bs)) {
+                    bdrv_flush_io_queue(bs);
+                    if (bdrv_requests_pending(bs)) {
+                        busy = true;
+                        aio_poll(aio_context, busy);
+                    }
+                }
+            }
+            busy |= aio_poll(aio_context, false);
             aio_context_release(aio_context);
         }
     }
@@ -345,6 +345,7 @@ void bdrv_drain_all(void)
         }
         aio_context_release(aio_context);
     }
+    g_slist_free(aio_ctxs);
 }
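Since several BlockDriverStates can share one AioContext, bdrv_drain_all() now deduplicates contexts up front and polls each context once per round instead of once per device. The GLib idiom in isolation (toy pointers standing in for AioContexts):

    #include <glib.h>
    #include <stdio.h>

    int main(void)
    {
        /* Pretend these are AioContext pointers; two devices share ctx1. */
        int ctx1, ctx2;
        gpointer per_device[] = { &ctx1, &ctx1, &ctx2 };
        GSList *aio_ctxs = NULL;

        for (gsize i = 0; i < G_N_ELEMENTS(per_device); i++) {
            if (!g_slist_find(aio_ctxs, per_device[i])) {
                aio_ctxs = g_slist_prepend(aio_ctxs, per_device[i]);
            }
        }

        printf("unique contexts: %u\n", g_slist_length(aio_ctxs));  /* 2 */
        g_slist_free(aio_ctxs);
        return 0;
    }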
 
 /**
@@ -368,13 +369,14 @@ static void tracked_request_end(BdrvTrackedRequest *req)
 static void tracked_request_begin(BdrvTrackedRequest *req,
                                   BlockDriverState *bs,
                                   int64_t offset,
-                                  unsigned int bytes, bool is_write)
+                                  unsigned int bytes,
+                                  enum BdrvTrackedRequestType type)
 {
     *req = (BdrvTrackedRequest){
         .bs = bs,
         .offset         = offset,
         .bytes          = bytes,
-        .is_write       = is_write,
+        .type           = type,
         .co             = qemu_coroutine_self(),
         .serialising    = false,
         .overlap_offset = offset,
@@ -868,7 +870,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
         mark_request_serialising(req, bdrv_get_cluster_size(bs));
     }
 
-    wait_serialising_requests(req);
+    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
+        wait_serialising_requests(req);
+    }
 
     if (flags & BDRV_REQ_COPY_ON_READ) {
         int pnum;
@@ -929,19 +933,6 @@ out:
     return ret;
 }
 
-static inline uint64_t bdrv_get_align(BlockDriverState *bs)
-{
-    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
-    return MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
-}
-
-static inline bool bdrv_req_is_aligned(BlockDriverState *bs,
-                                       int64_t offset, size_t bytes)
-{
-    int64_t align = bdrv_get_align(bs);
-    return !(offset & (align - 1) || (bytes & (align - 1)));
-}
-
 /*
  * Handle a read request in coroutine context
  */
@@ -952,7 +943,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
     BlockDriver *drv = bs->drv;
     BdrvTrackedRequest req;
 
-    uint64_t align = bdrv_get_align(bs);
+    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
     uint8_t *head_buf = NULL;
     uint8_t *tail_buf = NULL;
     QEMUIOVector local_qiov;
@@ -968,13 +960,14 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
         return ret;
     }
 
-    if (bs->copy_on_read) {
+    /* Don't do copy-on-read if we are reading data just before a write
+     * operation overwrites it */
+    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
         flags |= BDRV_REQ_COPY_ON_READ;
     }
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, bytes, false);
+        throttle_group_co_io_limits_intercept(bs, bytes, false);
     }
 
     /* Align read if necessary by padding qiov */
@@ -1002,7 +995,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
         bytes = ROUND_UP(bytes, align);
     }
 
-    tracked_request_begin(&req, bs, offset, bytes, false);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);
@@ -1037,6 +1030,15 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
 }
 
+int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors);
+
+    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+                            BDRV_REQ_NO_SERIALISING);
+}
+
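BDRV_REQ_NO_SERIALISING reads skip both copy-on-read and the wait on serialising requests; per the comment above, the caller reads data that a pending write is about to overwrite and so must not block behind that write. A hypothetical caller sketch (QEMU-internal types, not a standalone program):

    /* Hypothetical caller: read the old contents of a region that a
     * serialising write request is about to overwrite.  The plain
     * bdrv_co_readv() would wait on that write and could deadlock. */
    static int coroutine_fn read_before_write(BlockDriverState *bs,
                                              int64_t sector_num,
                                              int nb_sectors,
                                              QEMUIOVector *qiov)
    {
        return bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors, qiov);
    }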
 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
@@ -1163,13 +1165,13 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     if (ret < 0) {
         /* Do nothing, write notifier decided to fail this request */
     } else if (flags & BDRV_REQ_ZERO_WRITE) {
-        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
     } else {
-        BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
+        bdrv_debug_event(bs, BLKDBG_PWRITEV);
         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
     }
-    BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
+    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
 
     if (ret == 0 && !bs->enable_write_cache) {
         ret = bdrv_co_flush(bs);
@@ -1177,7 +1179,9 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 
     bdrv_set_dirty(bs, sector_num, nb_sectors);
 
-    block_acct_highest_sector(&bs->stats, sector_num, nb_sectors);
+    if (bs->wr_highest_offset < offset + bytes) {
+        bs->wr_highest_offset = offset + bytes;
+    }
 
     if (ret >= 0) {
         bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
@@ -1186,6 +1190,94 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
     return ret;
 }
 
+static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
+                                                int64_t offset,
+                                                unsigned int bytes,
+                                                BdrvRequestFlags flags,
+                                                BdrvTrackedRequest *req)
+{
+    uint8_t *buf = NULL;
+    QEMUIOVector local_qiov;
+    struct iovec iov;
+    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
+    unsigned int head_padding_bytes, tail_padding_bytes;
+    int ret = 0;
+
+    head_padding_bytes = offset & (align - 1);
+    tail_padding_bytes = align - ((offset + bytes) & (align - 1));
+
+    assert(flags & BDRV_REQ_ZERO_WRITE);
+    if (head_padding_bytes || tail_padding_bytes) {
+        buf = qemu_blockalign(bs, align);
+        iov = (struct iovec) {
+            .iov_base   = buf,
+            .iov_len    = align,
+        };
+        qemu_iovec_init_external(&local_qiov, &iov, 1);
+    }
+    if (head_padding_bytes) {
+        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
+
+        /* RMW the unaligned part before head. */
+        mark_request_serialising(req, align);
+        wait_serialising_requests(req);
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
+        ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
+                                  align, &local_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+
+        memset(buf + head_padding_bytes, 0, zero_bytes);
+        ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
+                                   &local_qiov,
+                                   flags & ~BDRV_REQ_ZERO_WRITE);
+        if (ret < 0) {
+            goto fail;
+        }
+        offset += zero_bytes;
+        bytes -= zero_bytes;
+    }
+
+    assert(!bytes || (offset & (align - 1)) == 0);
+    if (bytes >= align) {
+        /* Write the aligned part in the middle. */
+        uint64_t aligned_bytes = bytes & ~(align - 1);
+        ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes,
+                                   NULL, flags);
+        if (ret < 0) {
+            goto fail;
+        }
+        bytes -= aligned_bytes;
+        offset += aligned_bytes;
+    }
+
+    assert(!bytes || (offset & (align - 1)) == 0);
+    if (bytes) {
+        assert(align == tail_padding_bytes + bytes);
+        /* RMW the unaligned part after tail. */
+        mark_request_serialising(req, align);
+        wait_serialising_requests(req);
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
+        ret = bdrv_aligned_preadv(bs, req, offset, align,
+                                  align, &local_qiov, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+
+        memset(buf, 0, bytes);
+        ret = bdrv_aligned_pwritev(bs, req, offset, align,
+                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
+    }
+fail:
+    qemu_vfree(buf);
+    return ret;
+}
+
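bdrv_co_do_zero_pwritev() splits the byte range into an unaligned head (read-modify-write), an aligned middle (pure zero write), and an unaligned tail (read-modify-write). The arithmetic, extracted into a runnable sketch that just traces the phases:

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Trace how a zero-write request would be split; align must be a
     * power of two. */
    static void split_request(uint64_t offset, uint64_t bytes, uint64_t align)
    {
        uint64_t head = offset & (align - 1);

        if (head) {
            /* unaligned head: RMW one block, zeroing the covered part */
            uint64_t zero_bytes = bytes < align - head ? bytes : align - head;

            printf("head RMW:  block %" PRIu64 ", zero %" PRIu64 " bytes\n",
                   offset & ~(align - 1), zero_bytes);
            offset += zero_bytes;
            bytes -= zero_bytes;
        }

        assert(!bytes || (offset & (align - 1)) == 0);
        if (bytes >= align) {
            /* aligned middle: pure zero write, no bounce buffer needed */
            uint64_t aligned_bytes = bytes & ~(align - 1);

            printf("middle:    %" PRIu64 " bytes at %" PRIu64 "\n",
                   aligned_bytes, offset);
            offset += aligned_bytes;
            bytes -= aligned_bytes;
        }

        if (bytes) {
            /* unaligned tail: RMW the final block */
            printf("tail RMW:  block %" PRIu64 ", zero %" PRIu64 " bytes\n",
                   offset, bytes);
        }
    }

    int main(void)
    {
        split_request(700, 2000, 512);
        return 0;
    }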
 /*
  * Handle a write request in coroutine context
  */
@@ -1194,7 +1286,8 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
     BdrvRequestFlags flags)
 {
     BdrvTrackedRequest req;
-    uint64_t align = bdrv_get_align(bs);
+    /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+    uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment);
     uint8_t *head_buf = NULL;
     uint8_t *tail_buf = NULL;
     QEMUIOVector local_qiov;
@@ -1215,7 +1308,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, bytes, true);
+        throttle_group_co_io_limits_intercept(bs, bytes, true);
     }
 
     /*
@@ -1223,7 +1316,12 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
      * Pad qiov with the read parts and be sure to have a tracked request not
      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
      */
-    tracked_request_begin(&req, bs, offset, bytes, true);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
+
+    if (!qiov) {
+        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
+        goto out;
+    }
 
     if (offset & (align - 1)) {
         QEMUIOVector head_qiov;
@@ -1239,13 +1337,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
         };
         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
 
-        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
         ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
                                   align, &head_qiov, 0);
         if (ret < 0) {
             goto fail;
         }
-        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
 
         qemu_iovec_init(&local_qiov, qiov->niov + 2);
         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
@@ -1273,13 +1371,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
         };
         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
 
-        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
         ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
                                   align, &tail_qiov, 0);
         if (ret < 0) {
             goto fail;
         }
-        BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
+        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
 
         if (!use_local_qiov) {
             qemu_iovec_init(&local_qiov, qiov->niov + 1);
@@ -1293,23 +1391,19 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
         bytes = ROUND_UP(bytes, align);
     }
 
-    if (use_local_qiov) {
-        /* Local buffer may have non-zero data. */
-        flags &= ~BDRV_REQ_ZERO_WRITE;
-    }
     ret = bdrv_aligned_pwritev(bs, &req, offset, bytes,
                                use_local_qiov ? &local_qiov : qiov,
                                flags);
 
 fail:
-    tracked_request_end(&req);
 
     if (use_local_qiov) {
         qemu_iovec_destroy(&local_qiov);
     }
     qemu_vfree(head_buf);
     qemu_vfree(tail_buf);
-
+out:
+    tracked_request_end(&req);
     return ret;
 }
 
@@ -1337,32 +1431,14 @@ int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
                                       int64_t sector_num, int nb_sectors,
                                       BdrvRequestFlags flags)
 {
-    int ret;
-
     trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
 
     if (!(bs->open_flags & BDRV_O_UNMAP)) {
         flags &= ~BDRV_REQ_MAY_UNMAP;
     }
-    if (bdrv_req_is_aligned(bs, sector_num << BDRV_SECTOR_BITS,
-                            nb_sectors << BDRV_SECTOR_BITS)) {
-        ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
-                                BDRV_REQ_ZERO_WRITE | flags);
-    } else {
-        uint8_t *buf;
-        QEMUIOVector local_qiov;
-        size_t bytes = nb_sectors << BDRV_SECTOR_BITS;
 
-        buf = qemu_memalign(bdrv_opt_mem_align(bs), bytes);
-        memset(buf, 0, bytes);
-        qemu_iovec_init(&local_qiov, 1);
-        qemu_iovec_add(&local_qiov, buf, bytes);
-
-        ret = bdrv_co_do_writev(bs, sector_num, nb_sectors, &local_qiov,
-                                BDRV_REQ_ZERO_WRITE | flags);
-        qemu_vfree(buf);
-    }
-    return ret;
+    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
+                             BDRV_REQ_ZERO_WRITE | flags);
 }
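With the bounce-buffer fallback gone, a NULL qiov plus BDRV_REQ_ZERO_WRITE is the single way to request zeros, and alignment is handled centrally by bdrv_co_do_zero_pwritev(). A hypothetical caller (QEMU-internal types, not a standalone program):

    /* Hypothetical caller: zero a region, letting the driver unmap it
     * when the image was opened with BDRV_O_UNMAP (otherwise the flag
     * is stripped above). */
    static int coroutine_fn zero_region(BlockDriverState *bs,
                                        int64_t sector_num, int nb_sectors)
    {
        return bdrv_co_write_zeroes(bs, sector_num, nb_sectors,
                                    BDRV_REQ_MAY_UNMAP);
    }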
 
 int bdrv_flush_all(void)
@@ -1450,19 +1526,17 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
 
     if (ret & BDRV_BLOCK_RAW) {
         assert(ret & BDRV_BLOCK_OFFSET_VALID);
-        return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
                                      *pnum, pnum);
     }
 
     if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
         ret |= BDRV_BLOCK_ALLOCATED;
-    }
-
-    if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) {
+    } else {
         if (bdrv_unallocated_blocks_are_zero(bs)) {
             ret |= BDRV_BLOCK_ZERO;
-        } else if (bs->backing_hd) {
-            BlockDriverState *bs2 = bs->backing_hd;
+        } else if (bs->backing) {
+            BlockDriverState *bs2 = bs->backing->bs;
             int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
             if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
                 ret |= BDRV_BLOCK_ZERO;
@@ -1475,7 +1549,7 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
         (ret & BDRV_BLOCK_OFFSET_VALID)) {
         int file_pnum;
 
-        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+        ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
                                         *pnum, &file_pnum);
         if (ret2 >= 0) {
             /* Ignore errors.  This is just providing extra information, it
@@ -1497,28 +1571,54 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
     return ret;
 }
 
-/* Coroutine wrapper for bdrv_get_block_status() */
-static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
+static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
+        BlockDriverState *base,
+        int64_t sector_num,
+        int nb_sectors,
+        int *pnum)
+{
+    BlockDriverState *p;
+    int64_t ret = 0;
+
+    assert(bs != base);
+    for (p = bs; p != base; p = backing_bs(p)) {
+        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum);
+        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
+            break;
+        }
+        /* [sector_num, pnum] is unallocated on this layer, which may cover
+         * only the first part of [sector_num, nb_sectors].  */
+        nb_sectors = MIN(nb_sectors, *pnum);
+    }
+    return ret;
+}
+
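bdrv_co_get_block_status_above() walks from the active layer toward base, stopping at the first layer that reports the range allocated; nb_sectors shrinks to *pnum on the way down because a "not allocated" answer may hold for only a prefix of the query. A toy model of the walk:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy image layer: "allocated" covers the first alloc_sectors sectors. */
    typedef struct Layer {
        struct Layer *backing;
        long alloc_sectors;
    } Layer;

    /* Does this single layer allocate [sector, sector+n)?  On return *pnum
     * is the number of contiguous sectors for which the answer holds. */
    static bool layer_status(Layer *l, long sector, long n, long *pnum)
    {
        if (sector < l->alloc_sectors) {
            long avail = l->alloc_sectors - sector;
            *pnum = avail < n ? avail : n;
            return true;
        }
        *pnum = n;
        return false;
    }

    /* Walk from the top layer toward (exclusive) base: stop at the first
     * allocating layer, clamping n because a "not allocated" answer may
     * cover only a prefix of the original query. */
    static bool status_above(Layer *top, Layer *base, long sector, long n,
                             long *pnum)
    {
        bool allocated = false;

        for (Layer *p = top; p != base; p = p->backing) {
            allocated = layer_status(p, sector, n, pnum);
            if (allocated) {
                break;
            }
            n = *pnum < n ? *pnum : n;
        }
        return allocated;
    }

    int main(void)
    {
        Layer base = { NULL, 100 };
        Layer top  = { &base, 10 };
        long pnum;
        bool a = status_above(&top, NULL, 50, 20, &pnum);

        printf("allocated=%d pnum=%ld\n", a, pnum);  /* allocated=1 pnum=20 */
        return 0;
    }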
+/* Coroutine wrapper for bdrv_get_block_status_above() */
+static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
 {
     BdrvCoGetBlockStatusData *data = opaque;
-    BlockDriverState *bs = data->bs;
 
-    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
-                                         data->pnum);
+    data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
+                                               data->sector_num,
+                                               data->nb_sectors,
+                                               data->pnum);
     data->done = true;
 }
 
 /*
- * Synchronous wrapper around bdrv_co_get_block_status().
+ * Synchronous wrapper around bdrv_co_get_block_status_above().
  *
- * See bdrv_co_get_block_status() for details.
+ * See bdrv_co_get_block_status_above() for details.
  */
-int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
-                              int nb_sectors, int *pnum)
+int64_t bdrv_get_block_status_above(BlockDriverState *bs,
+                                    BlockDriverState *base,
+                                    int64_t sector_num,
+                                    int nb_sectors, int *pnum)
 {
     Coroutine *co;
     BdrvCoGetBlockStatusData data = {
         .bs = bs,
+        .base = base,
         .sector_num = sector_num,
         .nb_sectors = nb_sectors,
         .pnum = pnum,
@@ -1527,11 +1627,11 @@ int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
 
     if (qemu_in_coroutine()) {
         /* Fast-path if already in coroutine context */
-        bdrv_get_block_status_co_entry(&data);
+        bdrv_get_block_status_above_co_entry(&data);
     } else {
         AioContext *aio_context = bdrv_get_aio_context(bs);
 
-        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
+        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry);
         qemu_coroutine_enter(co, &data);
         while (!data.done) {
             aio_poll(aio_context, true);
@@ -1540,6 +1640,14 @@ int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
     return data.ret;
 }
 
+int64_t bdrv_get_block_status(BlockDriverState *bs,
+                              int64_t sector_num,
+                              int nb_sectors, int *pnum)
+{
+    return bdrv_get_block_status_above(bs, backing_bs(bs),
+                                       sector_num, nb_sectors, pnum);
+}
+
 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                    int nb_sectors, int *pnum)
 {
@@ -1594,7 +1702,7 @@ int bdrv_is_allocated_above(BlockDriverState *top,
             n = pnum_inter;
         }
 
-        intermediate = intermediate->backing_hd;
+        intermediate = backing_bs(intermediate);
     }
 
     *pnum = n;
@@ -1645,7 +1753,7 @@ int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
     } else if (drv->bdrv_save_vmstate) {
         return drv->bdrv_save_vmstate(bs, qiov, pos);
     } else if (bs->file) {
-        return bdrv_writev_vmstate(bs->file, qiov, pos);
+        return bdrv_writev_vmstate(bs->file->bs, qiov, pos);
     }
 
     return -ENOTSUP;
@@ -1660,7 +1768,7 @@ int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
     if (drv->bdrv_load_vmstate)
         return drv->bdrv_load_vmstate(bs, buf, pos, size);
     if (bs->file)
-        return bdrv_load_vmstate(bs->file, buf, pos, size);
+        return bdrv_load_vmstate(bs->file->bs, buf, pos, size);
     return -ENOTSUP;
 }
 
@@ -1781,7 +1889,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
             merge = 1;
         }
 
-        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
+        if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 >
+            bs->bl.max_iov) {
             merge = 0;
         }
 
@@ -1825,7 +1934,10 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
         }
     }
 
-    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+    if (bs->blk) {
+        block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE,
+                              num_reqs - outidx - 1);
+    }
 
     return outidx + 1;
 }
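Merging two requests concatenates their I/O vectors plus possibly one bridging element for a gap, so the merge is now refused against the device's bl.max_iov (which defaults to IOV_MAX, see the refresh_limits hunk above) rather than the host-wide constant. The check in isolation:

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #ifndef IOV_MAX
    #define IOV_MAX 1024               /* fallback for the sketch */
    #endif

    typedef struct {
        int niov;                      /* iovec elements in this request */
    } Req;

    /* Refuse a merge when the combined vector, plus one possible bridge
     * element covering a gap, would exceed the queue's limit. */
    static bool can_merge(const Req *a, const Req *b, int max_iov)
    {
        return a->niov + b->niov + 1 <= max_iov;
    }

    int main(void)
    {
        Req a = { .niov = 600 }, b = { .niov = 500 };

        /* prints 0 when IOV_MAX == 1024: 600 + 500 + 1 > 1024 */
        printf("merge allowed: %d\n", can_merge(&a, &b, IOV_MAX));
        return 0;
    }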
@@ -2140,7 +2252,7 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
 {
     BlockAIOCB *acb;
 
-    acb = g_slice_alloc(aiocb_info->aiocb_size);
+    acb = g_malloc(aiocb_info->aiocb_size);
     acb->aiocb_info = aiocb_info;
     acb->bs = bs;
     acb->cb = cb;
@@ -2160,7 +2272,7 @@ void qemu_aio_unref(void *p)
     BlockAIOCB *acb = p;
     assert(acb->refcnt > 0);
     if (--acb->refcnt == 0) {
-        g_slice_free1(acb->aiocb_info->aiocb_size, acb);
+        g_free(acb);
     }
 }
 
@@ -2230,17 +2342,20 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
     int ret;
+    BdrvTrackedRequest req;
 
-    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
+        bdrv_is_sg(bs)) {
         return 0;
     }
 
+    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
     /* Write back cached data to the OS even with cache=unsafe */
     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
     if (bs->drv->bdrv_co_flush_to_os) {
         ret = bs->drv->bdrv_co_flush_to_os(bs);
         if (ret < 0) {
-            return ret;
+            goto out;
         }
     }
 
@@ -2280,14 +2395,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
         ret = 0;
     }
     if (ret < 0) {
-        return ret;
+        goto out;
     }
 
     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
      * in the case of cache=unsafe, so there are no useless flushes.
      */
 flush_parent:
-    return bdrv_co_flush(bs->file);
+    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+out:
+    tracked_request_end(&req);
+    return ret;
 }
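Flush (and below, discard and ioctl) is now bracketed in a tracked request so bdrv_drain() can see it; every exit path funnels through a single label so tracked_request_end() always runs. The bracketing idiom as a standalone sketch:

    #include <stdio.h>

    static int tracked;

    static void tracked_request_begin(void) { tracked++; }
    static void tracked_request_end(void)   { tracked--; }

    static int do_step(void) { return -1; }    /* pretend the step failed */

    /* Every exit path, including errors, goes through "out" so the
     * begin/end pair stays balanced -- the shape bdrv_co_flush() now has. */
    static int flush(void)
    {
        int ret;

        tracked_request_begin();
        ret = do_step();
        if (ret < 0) {
            goto out;
        }
        ret = 0;
    out:
        tracked_request_end();
        return ret;
    }

    int main(void)
    {
        int ret = flush();

        printf("ret=%d tracked=%d\n", ret, tracked);  /* ret=-1 tracked=0 */
        return 0;
    }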
 
 int bdrv_flush(BlockDriverState *bs)
@@ -2330,6 +2448,7 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                  int nb_sectors)
 {
+    BdrvTrackedRequest req;
     int max_discard, ret;
 
     if (!bs->drv) {
@@ -2343,8 +2462,6 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return -EPERM;
     }
 
-    bdrv_reset_dirty(bs, sector_num, nb_sectors);
-
     /* Do nothing if disabled.  */
     if (!(bs->open_flags & BDRV_O_UNMAP)) {
         return 0;
@@ -2354,6 +2471,10 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
+    tracked_request_begin(&req, bs, sector_num, nb_sectors,
+                          BDRV_TRACKED_DISCARD);
+    bdrv_set_dirty(bs, sector_num, nb_sectors);
+
     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
     while (nb_sectors > 0) {
         int ret;
@@ -2385,20 +2506,24 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                             bdrv_co_io_em_complete, &co);
             if (acb == NULL) {
-                return -EIO;
+                ret = -EIO;
+                goto out;
             } else {
                 qemu_coroutine_yield();
                 ret = co.ret;
             }
         }
         if (ret && ret != -ENOTSUP) {
-            return ret;
+            goto out;
         }
 
         sector_num += num;
         nb_sectors -= num;
     }
-    return 0;
+    ret = 0;
+out:
+    tracked_request_end(&req);
+    return ret;
 }
 
 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
@@ -2427,26 +2552,110 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
     return rwco.ret;
 }
 
-/* needed for generic scsi interface */
+typedef struct {
+    CoroutineIOCompletion *co;
+    QEMUBH *bh;
+} BdrvIoctlCompletionData;
 
-int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+static void bdrv_ioctl_bh_cb(void *opaque)
+{
+    BdrvIoctlCompletionData *data = opaque;
+
+    bdrv_co_io_em_complete(data->co, -ENOTSUP);
+    qemu_bh_delete(data->bh);
+}
+
+static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest tracked_req;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    BlockAIOCB *acb;
 
-    if (drv && drv->bdrv_ioctl)
-        return drv->bdrv_ioctl(bs, req, buf);
-    return -ENOTSUP;
+    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
+    if (!drv || !drv->bdrv_aio_ioctl) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
+    if (!acb) {
+        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
+        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
+                              bdrv_ioctl_bh_cb, data);
+        data->co = &co;
+        qemu_bh_schedule(data->bh);
+    }
+    qemu_coroutine_yield();
+out:
+    tracked_request_end(&tracked_req);
+    return co.ret;
+}
+
+typedef struct {
+    BlockDriverState *bs;
+    int req;
+    void *buf;
+    int ret;
+} BdrvIoctlCoData;
+
+static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
+{
+    BdrvIoctlCoData *data = opaque;
+    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
+}
+
+/* needed for generic scsi interface */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BdrvIoctlCoData data = {
+        .bs = bs,
+        .req = req,
+        .buf = buf,
+        .ret = -EINPROGRESS,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_co_ioctl_entry(&data);
+    } else {
+        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
+
+        qemu_coroutine_enter(co, &data);
+        while (data.ret == -EINPROGRESS) {
+            aio_poll(bdrv_get_aio_context(bs), true);
+        }
+    }
+    return data.ret;
+}
+
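bdrv_ioctl() is now emulated on top of the coroutine path using a recurring QEMU idiom: call the entry function directly when already in coroutine context, otherwise create a coroutine and poll until a sentinel value (-EINPROGRESS here, a done flag in the block-status wrapper earlier) is overwritten by the real result. A toy model of that control flow (stubs, not QEMU APIs):

    #include <stdbool.h>
    #include <stdio.h>

    enum { RET_EINPROGRESS = -115 };   /* stand-in for -EINPROGRESS */

    typedef struct {
        int req;
        int ret;
    } IoctlData;

    /* Stand-in for the coroutine entry; in QEMU it may yield and be
     * resumed later by a completion callback. */
    static void ioctl_entry(IoctlData *data)
    {
        data->ret = 0;                 /* pretend the ioctl succeeded */
    }

    static bool in_coroutine(void) { return false; }   /* stub */

    /* Stand-in for aio_poll(ctx, true): runs handlers and may complete
     * the request. */
    static void poll_once(IoctlData *data) { ioctl_entry(data); }

    static int do_ioctl(int req)
    {
        IoctlData data = { .req = req, .ret = RET_EINPROGRESS };

        if (in_coroutine()) {
            ioctl_entry(&data);        /* fast path: already in coroutine */
        } else {
            /* Slow path: in QEMU, create and enter the coroutine here,
             * then poll until the sentinel is replaced by the result. */
            while (data.ret == RET_EINPROGRESS) {
                poll_once(&data);
            }
        }
        return data.ret;
    }

    int main(void)
    {
        printf("ret=%d\n", do_ioctl(42));   /* ret=0 */
        return 0;
    }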
+static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
+{
+    BlockAIOCBCoroutine *acb = opaque;
+    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
+                                      acb->req.req, acb->req.buf);
+    bdrv_co_complete(acb);
 }
 
 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
         unsigned long int req, void *buf,
         BlockCompletionFunc *cb, void *opaque)
 {
-    BlockDriver *drv = bs->drv;
+    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
+                                            bs, cb, opaque);
+    Coroutine *co;
+
+    acb->need_bh = true;
+    acb->req.error = -EINPROGRESS;
+    acb->req.req = req;
+    acb->req.buf = buf;
+    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
+    qemu_coroutine_enter(co, acb);
 
-    if (drv && drv->bdrv_aio_ioctl)
-        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
-    return NULL;
+    bdrv_co_maybe_schedule_bh(acb);
+    return &acb->common;
 }
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
@@ -2489,7 +2698,7 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 {
     int i;
-    size_t alignment = bdrv_opt_mem_align(bs);
+    size_t alignment = bdrv_min_mem_align(bs);
 
     for (i = 0; i < qiov->niov; i++) {
         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
@@ -2515,7 +2724,7 @@ void bdrv_io_plug(BlockDriverState *bs)
     if (drv && drv->bdrv_io_plug) {
         drv->bdrv_io_plug(bs);
     } else if (bs->file) {
-        bdrv_io_plug(bs->file);
+        bdrv_io_plug(bs->file->bs);
     }
 }
 
@@ -2525,7 +2734,7 @@ void bdrv_io_unplug(BlockDriverState *bs)
     if (drv && drv->bdrv_io_unplug) {
         drv->bdrv_io_unplug(bs);
     } else if (bs->file) {
-        bdrv_io_unplug(bs->file);
+        bdrv_io_unplug(bs->file->bs);
     }
 }
 
@@ -2535,6 +2744,24 @@ void bdrv_flush_io_queue(BlockDriverState *bs)
     if (drv && drv->bdrv_flush_io_queue) {
         drv->bdrv_flush_io_queue(bs);
     } else if (bs->file) {
-        bdrv_flush_io_queue(bs->file);
+        bdrv_flush_io_queue(bs->file->bs);
+    }
+    bdrv_start_throttled_reqs(bs);
+}
+
+void bdrv_drained_begin(BlockDriverState *bs)
+{
+    if (!bs->quiesce_counter++) {
+        aio_disable_external(bdrv_get_aio_context(bs));
+    }
+    bdrv_drain(bs);
+}
+
+void bdrv_drained_end(BlockDriverState *bs)
+{
+    assert(bs->quiesce_counter > 0);
+    if (--bs->quiesce_counter > 0) {
+        return;
     }
+    aio_enable_external(bdrv_get_aio_context(bs));
 }
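bdrv_drained_begin/end() form a nestable quiesce section: a counter ensures only the outermost begin disables external event sources and only the outermost end re-enables them. The counting pattern in isolation:

    #include <assert.h>
    #include <stdio.h>

    static int quiesce_counter;

    static void disable_external(void) { puts("external events off"); }
    static void enable_external(void)  { puts("external events on"); }

    /* Nestable drained section: only the outermost begin/end pair
     * actually toggles external event sources, mirroring
     * bdrv_drained_begin/end above. */
    static void drained_begin(void)
    {
        if (!quiesce_counter++) {
            disable_external();
        }
    }

    static void drained_end(void)
    {
        assert(quiesce_counter > 0);
        if (--quiesce_counter > 0) {
            return;
        }
        enable_external();
    }

    int main(void)
    {
        drained_begin();
        drained_begin();   /* nested: no extra toggle */
        drained_end();
        drained_end();     /* outermost: re-enables events */
        return 0;
    }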