*/
#include "qemu/osdep.h"
+#include "qemu-common.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
uint64_t locked_shared_perm;
int perm_change_fd;
+ int perm_change_flags;
BDRVReopenState *reopen_state;
#ifdef CONFIG_XFS
bool page_cache_inconsistent:1;
bool has_fallocate;
bool needs_alignment;
+ bool drop_cache;
bool check_cache_dropped;
PRManager *pr_mgr;
typedef struct BDRVRawReopenState {
int fd;
int open_flags;
+ bool drop_cache;
bool check_cache_dropped;
} BDRVRawReopenState;
BDRVRawState *s = bs->opaque;
char *buf;
size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
+ size_t alignments[] = {1, 512, 1024, 2048, 4096};
/* For SCSI generic devices the alignment is not really used.
With buffered I/O, we don't have any restrictions. */
}
#endif
- /* If we could not get the sizes so far, we can only guess them */
- if (!s->buf_align) {
+ /*
+ * If we could not get the sizes so far, we can only guess them. First try
+ * to detect request alignment, since it is more likely to succeed. Then
+ * try to detect buf_align, which cannot be detected in some cases (e.g.
+ * Gluster). If buf_align cannot be detected, we fall back to the value of
+ * request_alignment.
+ */
+
+ if (!bs->bl.request_alignment) {
+ int i;
size_t align;
- buf = qemu_memalign(max_align, 2 * max_align);
- for (align = 512; align <= max_align; align <<= 1) {
- if (raw_is_io_aligned(fd, buf + align, max_align)) {
- s->buf_align = align;
+ buf = qemu_memalign(max_align, max_align);
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+ align = alignments[i];
+ if (raw_is_io_aligned(fd, buf, align)) {
+ /* If even 1-byte I/O works, detection failed; fall back to a safe value. */
+ bs->bl.request_alignment = (align != 1) ? align : max_align;
break;
}
}
qemu_vfree(buf);
}
- if (!bs->bl.request_alignment) {
+ if (!s->buf_align) {
+ int i;
size_t align;
- buf = qemu_memalign(s->buf_align, max_align);
- for (align = 512; align <= max_align; align <<= 1) {
- if (raw_is_io_aligned(fd, buf, align)) {
- bs->bl.request_alignment = align;
+ buf = qemu_memalign(max_align, 2 * max_align);
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+ align = alignments[i];
+ if (raw_is_io_aligned(fd, buf + align, max_align)) {
+ /* Fall back to request_alignment if detection failed. */
+ s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
break;
}
}
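/*
 * For reference, a minimal sketch of what the probe helper above is
 * assumed to do (raw_is_io_aligned() exists in this file; the body below
 * is an illustration, not the exact implementation): issue a read at the
 * candidate alignment and treat EINVAL as the kernel rejecting a
 * misaligned O_DIRECT request.
 *
 *     static bool raw_is_io_aligned(int fd, void *buf, size_t len)
 *     {
 *         ssize_t ret = pread(fd, buf, len, 0);
 *
 *         if (ret >= 0) {
 *             return true;
 *         }
 *     #ifdef __linux__
 *         // Linux returns EINVAL for misaligned O_DIRECT reads.
 *         if (errno != EINVAL) {
 *             return true;
 *         }
 *     #endif
 *         return false;
 *     }
 */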
.type = QEMU_OPT_STRING,
.help = "id of persistent reservation manager object (default: none)",
},
+#if defined(__linux__)
+ {
+ .name = "drop-cache",
+ .type = QEMU_OPT_BOOL,
+ .help = "invalidate page cache during live migration (default: on)",
+ },
+#endif
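+/*
+ * Usage sketch (illustrative command line; the option name matches the
+ * entry above, everything else is an example):
+ *   -blockdev driver=file,node-name=disk0,filename=disk.img,drop-cache=off
+ */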
{
.name = "x-check-cache-dropped",
.type = QEMU_OPT_BOOL,
}
}
+ s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
false);
}
#endif
- bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
+ bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
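+/* BDRV_REQ_NO_FALLBACK: callers prefer failure over a slow emulated
+ * zero-write fallback; see the BLKZEROOUT handling below. */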
ret = 0;
fail:
if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
switch (op) {
case RAW_PL_PREPARE:
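+ /*
+ * (s->perm | new_perm) == s->perm: no permission bit is being added;
+ * (s->shared_perm & new_shared) == s->shared_perm: nothing currently
+ * shared is being withdrawn.
+ */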
+ if ((s->perm | new_perm) == s->perm &&
+ (s->shared_perm & new_shared) == s->shared_perm)
+ {
+ /*
+ * We are only going to unlock bytes, and that should not fail. If it
+ * fails for some fs-dependent, permission-unrelated reason (which
+ * happens sometimes on NFS and leads to an abort in
+ * bdrv_replace_child), no check here can prevent the error, and we
+ * ignore such errors in ABORT and COMMIT anyway.
+ */
+ return 0;
+ }
ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
~s->shared_perm | ~new_shared,
false, errp);
goto out;
}
+ rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
rs->check_cache_dropped =
qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
BDRVRawReopenState *rs = state->opaque;
BDRVRawState *s = state->bs->opaque;
+ s->drop_cache = rs->drop_cache;
s->check_cache_dropped = rs->check_cache_dropped;
s->open_flags = rs->open_flags;
s->reopen_state = NULL;
}
-static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
+static int sg_get_max_transfer_length(int fd)
{
#ifdef BLKSECTGET
int max_bytes = 0;
- short max_sectors = 0;
- if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
+
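+ /*
+ * On SG character devices BLKSECTGET reports the limit in bytes,
+ * not as a sector count the way it does for block devices.
+ */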
+ if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
return max_bytes;
- } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
- return max_sectors << BDRV_SECTOR_BITS;
} else {
return -errno;
}
#endif
}
-static int hdev_get_max_segments(const struct stat *st)
+static int sg_get_max_segments(int fd)
{
#ifdef CONFIG_LINUX
char buf[32];
const char *end;
- char *sysfspath;
+ char *sysfspath = NULL;
int ret;
- int fd = -1;
+ int sysfd = -1;
long max_segments;
+ struct stat st;
+
+ if (fstat(fd, &st)) {
+ ret = -errno;
+ goto out;
+ }
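+ /*
+ * Illustrative path: for a disk with major:minor 8:0 this reads
+ * /sys/dev/block/8:0/queue/max_segments.
+ */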
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
- major(st->st_rdev), minor(st->st_rdev));
- fd = open(sysfspath, O_RDONLY);
- if (fd == -1) {
+ major(st.st_rdev), minor(st.st_rdev));
+ sysfd = open(sysfspath, O_RDONLY);
+ if (sysfd == -1) {
ret = -errno;
goto out;
}
do {
- ret = read(fd, buf, sizeof(buf) - 1);
+ ret = read(sysfd, buf, sizeof(buf) - 1);
} while (ret == -1 && errno == EINTR);
if (ret < 0) {
ret = -errno;
}
out:
- if (fd != -1) {
- close(fd);
+ if (sysfd != -1) {
+ close(sysfd);
}
g_free(sysfspath);
return ret;
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
BDRVRawState *s = bs->opaque;
- struct stat st;
- if (!fstat(s->fd, &st)) {
- if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
- int ret = hdev_get_max_transfer_length(bs, s->fd);
- if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
- bs->bl.max_transfer = pow2floor(ret);
- }
- ret = hdev_get_max_segments(&st);
- if (ret > 0) {
- bs->bl.max_transfer = MIN(bs->bl.max_transfer,
- ret * getpagesize());
- }
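+ /*
+ * The kernel already splits requests for regular files and block
+ * devices, so host transfer limits only need to be applied to SG_IO
+ * passthrough, where requests reach the device unmodified.
+ */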
+ if (bs->sg) {
+ int ret = sg_get_max_transfer_length(s->fd);
+
+ if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+ bs->bl.max_transfer = pow2floor(ret);
+ }
+
+ ret = sg_get_max_segments(s->fd);
+ if (ret > 0) {
+ bs->bl.max_transfer = MIN(bs->bl.max_transfer, ret * getpagesize());
}
}
#ifdef CONFIG_XFS
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
{
+ int64_t len;
struct xfs_flock64 fl;
int err;
+ len = lseek(s->fd, 0, SEEK_END);
+ if (len < 0) {
+ return -errno;
+ }
+
+ if (offset + bytes > len) {
+ /* XFS_IOC_ZERO_RANGE does not increase the file length */
+ if (ftruncate(s->fd, offset + bytes) < 0) {
+ return -errno;
+ }
+ }
+
memset(&fl, 0, sizeof(fl));
fl.l_whence = SEEK_SET;
fl.l_start = offset;
}
#ifdef BLKZEROOUT
- do {
- uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
- if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
- return 0;
- }
- } while (errno == EINTR);
+ /* The BLKZEROOUT implementation in the kernel doesn't set
+ * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
+ * fallbacks. */
+ if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
+ do {
+ uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+ if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+ return 0;
+ }
+ } while (errno == EINTR);
- ret = translate_err(-errno);
+ ret = translate_err(-errno);
+ }
#endif
if (ret == -ENOTSUP) {
off_t data = 0, hole = 0;
int ret;
+ assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+
ret = fd_open(bs);
if (ret < 0) {
return ret;
/* On a data extent, compute bytes to the end of the extent,
* possibly including a partial sector at EOF. */
*pnum = MIN(bytes, hole - offset);
+
+ /*
+ * We are not allowed to return partial sectors, though, so
+ * round up if necessary.
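+ * For example, with request_alignment = 512 and data that ends 200
+ * bytes into the last sector at EOF, *pnum is rounded up from 200
+ * to 512.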
+ */
+ if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
+ int64_t file_length = raw_getlength(bs);
+ if (file_length > 0) {
+ /* Ignore raw_getlength() errors; the assertion is just a safeguard */
+ assert(hole == file_length);
+ }
+ *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
+ }
+
ret = BDRV_BLOCK_DATA;
} else {
/* On a hole, compute bytes to the beginning of the next extent. */
return;
}
+ if (!s->drop_cache) {
+ return;
+ }
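+ /* With drop-cache=off, leave the host page cache untouched. */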
+
if (s->open_flags & O_DIRECT) {
return; /* No host kernel page cache */
}
if (blkdev) {
acb.aio_type |= QEMU_AIO_BLKDEV;
}
+ if (flags & BDRV_REQ_NO_FALLBACK) {
+ acb.aio_type |= QEMU_AIO_NO_FALLBACK;
+ }
if (flags & BDRV_REQ_MAY_UNMAP) {
acb.aio_type |= QEMU_AIO_DISCARD;
{
.name = BLOCK_OPT_PREALLOC,
.type = QEMU_OPT_STRING,
- .help = "Preallocation mode (allowed values: off, falloc, full)"
+ .help = "Preallocation mode (allowed values: off"
+#ifdef CONFIG_POSIX_FALLOCATE
+ ", falloc"
+#endif
+ ", full)"
},
{ /* end of list */ }
}
assert(s->reopen_state->shared_perm == shared);
rs = s->reopen_state->opaque;
s->perm_change_fd = rs->fd;
+ s->perm_change_flags = rs->open_flags;
} else {
/* We may need a new fd if auto-read-only switches the mode */
ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
return ret;
} else if (ret != s->fd) {
s->perm_change_fd = ret;
+ s->perm_change_flags = open_flags;
}
}
if (s->perm_change_fd && s->fd != s->perm_change_fd) {
qemu_close(s->fd);
s->fd = s->perm_change_fd;
+ s->open_flags = s->perm_change_flags;
}
s->perm_change_fd = 0;