*/
#include "qemu/osdep.h"
+#include "qemu-common.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
uint64_t locked_shared_perm;
int perm_change_fd;
+ int perm_change_flags;
BDRVReopenState *reopen_state;
#ifdef CONFIG_XFS
bool page_cache_inconsistent:1;
bool has_fallocate;
bool needs_alignment;
+ bool drop_cache;
bool check_cache_dropped;
PRManager *pr_mgr;
typedef struct BDRVRawReopenState {
int fd;
int open_flags;
+ bool drop_cache;
bool check_cache_dropped;
} BDRVRawReopenState;
BDRVRawState *s = bs->opaque;
char *buf;
size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
+ size_t alignments[] = {1, 512, 1024, 2048, 4096};
/* For SCSI generic devices the alignment is not really used.
With buffered I/O, we don't have any restrictions. */
}
#endif
- /* If we could not get the sizes so far, we can only guess them */
- if (!s->buf_align) {
+ /*
+ * If we could not get the sizes so far, we can only guess them. First try
+ * to detect request alignment, since it is more likely to succeed. Then
+ * try to detect buf_align, which cannot be detected in some cases (e.g.
+ * Gluster). If buf_align cannot be detected, we fall back to the value of
+ * request_alignment.
+ */
+
+ if (!bs->bl.request_alignment) {
+ int i;
size_t align;
- buf = qemu_memalign(max_align, 2 * max_align);
- for (align = 512; align <= max_align; align <<= 1) {
- if (raw_is_io_aligned(fd, buf + align, max_align)) {
- s->buf_align = align;
+ buf = qemu_memalign(max_align, max_align);
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+ align = alignments[i];
+ if (raw_is_io_aligned(fd, buf, align)) {
+ /* If even 1-byte I/O works, detection failed; fall back to a safe value. */
+ bs->bl.request_alignment = (align != 1) ? align : max_align;
break;
}
}
qemu_vfree(buf);
}
- if (!bs->bl.request_alignment) {
+ if (!s->buf_align) {
+ int i;
size_t align;
- buf = qemu_memalign(s->buf_align, max_align);
- for (align = 512; align <= max_align; align <<= 1) {
- if (raw_is_io_aligned(fd, buf, align)) {
- bs->bl.request_alignment = align;
+ buf = qemu_memalign(max_align, 2 * max_align);
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+ align = alignments[i];
+ if (raw_is_io_aligned(fd, buf + align, max_align)) {
+ /* Fall back to request_alignment if detection failed. */
+ s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
break;
}
}
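/*
 * For reference, a minimal sketch of what the probe helper above is
 * assumed to do (raw_is_io_aligned() exists in this file; the body below
 * is an illustration, not the exact implementation): issue a read at the
 * candidate alignment and treat EINVAL as the kernel rejecting a
 * misaligned O_DIRECT request.
 *
 *     static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 *     {
 *         ssize_t ret = pread(fd, buf, len, 0);
 *
 *         if (ret >= 0) {
 *             return true;
 *         }
 *     #ifdef __linux__
 *         // Linux returns EINVAL for misaligned O_DIRECT reads.
 *         if (errno != EINVAL) {
 *             return true;
 *         }
 *     #endif
 *         return false;
 *     }
 */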
.type = QEMU_OPT_STRING,
.help = "id of persistent reservation manager object (default: none)",
},
+#if defined(__linux__)
+ {
+ .name = "drop-cache",
+ .type = QEMU_OPT_BOOL,
+ .help = "invalidate page cache during live migration (default: on)",
+ },
+#endif
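+/*
+ * Usage sketch (illustrative command line; the option name matches the
+ * entry above, everything else is an example):
+ *   -blockdev driver=file,node-name=disk0,filename=disk.img,drop-cache=off
+ */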
{
.name = "x-check-cache-dropped",
.type = QEMU_OPT_BOOL,
}
}
+ s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
false);
}
#endif
- bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
+ bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
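+/* BDRV_REQ_NO_FALLBACK: callers prefer failure over a slow emulated
+ * zero-write fallback; see the BLKZEROOUT handling below. */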
ret = 0;
fail:
if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
switch (op) {
case RAW_PL_PREPARE:
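+ /*
+ * (s->perm | new_perm) == s->perm: no permission bit is being added;
+ * (s->shared_perm & new_shared) == s->shared_perm: nothing currently
+ * shared is being withdrawn.
+ */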
+ if ((s->perm | new_perm) == s->perm &&
+ (s->shared_perm & new_shared) == s->shared_perm)
+ {
+ /*
+ * We are only going to unlock bytes, and that should not fail. If it
+ * fails for some fs-dependent, permission-unrelated reason (which
+ * happens sometimes on NFS and leads to an abort in
+ * bdrv_replace_child), no check here can prevent the error, and we
+ * ignore such errors in ABORT and COMMIT anyway.
+ */
+ return 0;
+ }
ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
~s->shared_perm | ~new_shared,
false, errp);
goto out;
}
+ rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
rs->check_cache_dropped =
qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
BDRVRawReopenState *rs = state->opaque;
BDRVRawState *s = state->bs->opaque;
+ s->drop_cache = rs->drop_cache;
s->check_cache_dropped = rs->check_cache_dropped;
s->open_flags = rs->open_flags;
s->reopen_state = NULL;
}
-static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
+static int sg_get_max_transfer_length(int fd)
{
#ifdef BLKSECTGET
int max_bytes = 0;
- short max_sectors = 0;
- if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
+
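+ /*
+ * On SG character devices BLKSECTGET reports the limit in bytes,
+ * not as a sector count the way it does for block devices.
+ */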
+ if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
return max_bytes;
- } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
- return max_sectors << BDRV_SECTOR_BITS;
} else {
return -errno;
}
#endif
}
-static int hdev_get_max_segments(const struct stat *st)
+static int sg_get_max_segments(int fd)
{
#ifdef CONFIG_LINUX
char buf[32];
const char *end;
- char *sysfspath;
+ char *sysfspath = NULL;
int ret;
- int fd = -1;
+ int sysfd = -1;
long max_segments;
+ struct stat st;
+
+ if (fstat(fd, &st)) {
+ ret = -errno;
+ goto out;
+ }
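+ /*
+ * Illustrative path: for a disk with major:minor 8:0 this reads
+ * /sys/dev/block/8:0/queue/max_segments.
+ */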
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
- major(st->st_rdev), minor(st->st_rdev));
- fd = open(sysfspath, O_RDONLY);
- if (fd == -1) {
+ major(st.st_rdev), minor(st.st_rdev));
+ sysfd = open(sysfspath, O_RDONLY);
+ if (sysfd == -1) {
ret = -errno;
goto out;
}
do {
- ret = read(fd, buf, sizeof(buf) - 1);
+ ret = read(sysfd, buf, sizeof(buf) - 1);
} while (ret == -1 && errno == EINTR);
if (ret < 0) {
ret = -errno;
}
out:
- if (fd != -1) {
- close(fd);
+ if (sysfd != -1) {
+ close(sysfd);
}
g_free(sysfspath);
return ret;
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
BDRVRawState *s = bs->opaque;
- struct stat st;
- if (!fstat(s->fd, &st)) {
- if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
- int ret = hdev_get_max_transfer_length(bs, s->fd);
- if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
- bs->bl.max_transfer = pow2floor(ret);
- }
- ret = hdev_get_max_segments(&st);
- if (ret > 0) {
- bs->bl.max_transfer = MIN(bs->bl.max_transfer,
- ret * getpagesize());
- }
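+ /*
+ * The kernel already splits requests for regular files and block
+ * devices, so host transfer limits only need to be applied to SG_IO
+ * passthrough, where requests reach the device unmodified.
+ */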
+ if (bs->sg) {
+ int ret = sg_get_max_transfer_length(s->fd);
+
+ if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+ bs->bl.max_transfer = pow2floor(ret);
+ }
+
+ ret = sg_get_max_segments(s->fd);
+ if (ret > 0) {
+ bs->bl.max_transfer = MIN(bs->bl.max_transfer, ret * getpagesize());
}
}
#ifdef CONFIG_XFS
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
+ int64_t len;
struct xfs_flock64 fl;
int err;
+ len = lseek(s->fd, 0, SEEK_END);
+ if (len < 0) {
+ return -errno;
+ }
+
+ if (offset + bytes > len) {
+ /* XFS_IOC_ZERO_RANGE does not increase the file length */
+ if (ftruncate(s->fd, offset + bytes) < 0) {
+ return -errno;
+ }
+ }
+
memset(&fl, 0, sizeof(fl));
fl.l_whence = SEEK_SET;
fl.l_start = offset;
}
#ifdef BLKZEROOUT
- do {
- uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
- if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
- return 0;
- }
- } while (errno == EINTR);
+ /* The BLKZEROOUT implementation in the kernel doesn't set
+ * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
+ * fallbacks. */
+ if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
+ do {
+ uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+ if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+ return 0;
+ }
+ } while (errno == EINTR);
- ret = translate_err(-errno);
+ ret = translate_err(-errno);
+ }
#endif
if (ret == -ENOTSUP) {
off_t data = 0, hole = 0;
int ret;
+ assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+
ret = fd_open(bs);
if (ret < 0) {
return ret;
/* On a data extent, compute bytes to the end of the extent,
* possibly including a partial sector at EOF. */
*pnum = MIN(bytes, hole - offset);
+
+ /*
+ * We are not allowed to return partial sectors, though, so
+ * round up if necessary.
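+ * For example, with request_alignment = 512 and data that ends 200
+ * bytes into the last sector at EOF, *pnum is rounded up from 200
+ * to 512.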
+ */
+ if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
+ int64_t file_length = raw_getlength(bs);
+ if (file_length > 0) {
+ /* Ignore raw_getlength() errors; the assertion is just a safeguard */
+ assert(hole == file_length);
+ }
+ *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
+ }
+
ret = BDRV_BLOCK_DATA;
} else {
/* On a hole, compute bytes to the beginning of the next extent. */
return;
}
+ if (!s->drop_cache) {
+ return;
+ }
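+ /* With drop-cache=off, leave the host page cache untouched. */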
+
if (s->open_flags & O_DIRECT) {
return; /* No host kernel page cache */
}
if (blkdev) {
acb.aio_type |= QEMU_AIO_BLKDEV;
}
+ if (flags & BDRV_REQ_NO_FALLBACK) {
+ acb.aio_type |= QEMU_AIO_NO_FALLBACK;
+ }
if (flags & BDRV_REQ_MAY_UNMAP) {
acb.aio_type |= QEMU_AIO_DISCARD;
{
.name = BLOCK_OPT_PREALLOC,
.type = QEMU_OPT_STRING,
- .help = "Preallocation mode (allowed values: off, falloc, full)"
+ .help = "Preallocation mode (allowed values: off"
+#ifdef CONFIG_POSIX_FALLOCATE
+ ", falloc"
+#endif
+ ", full)"
},
{ /* end of list */ }
}
assert(s->reopen_state->shared_perm == shared);
rs = s->reopen_state->opaque;
s->perm_change_fd = rs->fd;
+ s->perm_change_flags = rs->open_flags;
} else {
/* We may need a new fd if auto-read-only switches the mode */
ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
return ret;
} else if (ret != s->fd) {
s->perm_change_fd = ret;
+ s->perm_change_flags = open_flags;
}
}
if (s->perm_change_fd && s->fd != s->perm_change_fd) {
qemu_close(s->fd);
s->fd = s->perm_change_fd;
+ s->open_flags = s->perm_change_flags;
}
s->perm_change_fd = 0;