*/
#include "qemu/osdep.h"
+#include "qemu-common.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
uint64_t locked_shared_perm;
int perm_change_fd;
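+ /* Open flags matching perm_change_fd; applied to s->open_flags when
+ * the permission change is committed. */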
+ int perm_change_flags;
BDRVReopenState *reopen_state;
#ifdef CONFIG_XFS
bool needs_alignment;
bool drop_cache;
bool check_cache_dropped;
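+ /* Discard statistics, updated in raw_account_discard() and exposed
+ * via bdrv_get_specific_stats(). */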
+ struct {
+ uint64_t discard_nb_ok;
+ uint64_t discard_nb_failed;
+ uint64_t discard_bytes_ok;
+ } stats;
PRManager *pr_mgr;
} BDRVRawState;
fname = *filename;
dp = strrchr(fname, '/');
if (lstat(fname, &sb) < 0) {
- error_setg_errno(errp, errno, "%s: stat failed", fname);
+ error_setg_file_open(errp, errno, fname);
return -errno;
}
BDRVRawState *s = bs->opaque;
char *buf;
size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
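+ /* Probed smallest first; success with 1 means unaligned I/O "works",
+ * so probing cannot detect the real alignment and a safe fallback is
+ * used instead. */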
+ size_t alignments[] = {1, 512, 1024, 2048, 4096};
/* For SCSI generic devices the alignment is not really used.
With buffered I/O, we don't have any restrictions. */
}
#endif
- /* If we could not get the sizes so far, we can only guess them */
- if (!s->buf_align) {
+ /*
+ * If we could not get the sizes so far, we can only guess them. First try
+ * to detect request alignment, since it is more likely to succeed. Then
+ * try to detect buf_align, which cannot be detected in some cases (e.g.
+ * Gluster). If buf_align cannot be detected, we fall back to the value of
+ * request_alignment.
+ */
+
+ if (!bs->bl.request_alignment) {
+ int i;
size_t align;
- buf = qemu_memalign(max_align, 2 * max_align);
- for (align = 512; align <= max_align; align <<= 1) {
- if (raw_is_io_aligned(fd, buf + align, max_align)) {
- s->buf_align = align;
+ buf = qemu_memalign(max_align, max_align);
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+ align = alignments[i];
+ if (raw_is_io_aligned(fd, buf, align)) {
+ /* If unaligned I/O worked (align == 1), fall back to a safe value. */
+ bs->bl.request_alignment = (align != 1) ? align : max_align;
break;
}
}
qemu_vfree(buf);
}
- if (!bs->bl.request_alignment) {
+ if (!s->buf_align) {
+ int i;
size_t align;
- buf = qemu_memalign(s->buf_align, max_align);
- for (align = 512; align <= max_align; align <<= 1) {
- if (raw_is_io_aligned(fd, buf, align)) {
- bs->bl.request_alignment = align;
+ buf = qemu_memalign(max_align, 2 * max_align);
+ for (i = 0; i < ARRAY_SIZE(alignments); i++) {
+ align = alignments[i];
+ if (raw_is_io_aligned(fd, buf + align, max_align)) {
+ /* If an unaligned buffer worked (align == 1), fall back to request_alignment. */
+ s->buf_align = (align != 1) ? align : bs->bl.request_alignment;
break;
}
}
ret = fd < 0 ? -errno : 0;
if (ret < 0) {
- error_setg_errno(errp, -ret, "Could not open '%s'", filename);
+ error_setg_file_open(errp, -ret, filename);
if (ret == -EROFS) {
ret = -EACCES;
}
}
#endif
- bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
+ bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
ret = 0;
fail:
if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
switch (op) {
case RAW_PL_PREPARE:
+ if ((s->perm | new_perm) == s->perm &&
+ (s->shared_perm & new_shared) == s->shared_perm)
+ {
+ /*
+ * We are only going to unlock bytes, which should not fail. If it
+ * fails due to some filesystem-dependent, permission-unrelated
+ * reason (which occurs sometimes on NFS and leads to an abort in
+ * bdrv_replace_child), no check here can prevent such errors. We
+ * ignore them anyway in ABORT and COMMIT.
+ */
+ return 0;
+ }
ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
~s->shared_perm | ~new_shared,
false, errp);
s->reopen_state = NULL;
}
-static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
+static int sg_get_max_transfer_length(int fd)
{
#ifdef BLKSECTGET
int max_bytes = 0;
- short max_sectors = 0;
- if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
+
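+ /* For SG character devices the kernel reports this limit in bytes;
+ * plain block devices report sectors, but this helper is only called
+ * when bs->sg is set. */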
+ if (ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
return max_bytes;
- } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
- return max_sectors << BDRV_SECTOR_BITS;
} else {
return -errno;
}
#endif
}
-static int hdev_get_max_segments(const struct stat *st)
+static int sg_get_max_segments(int fd)
{
#ifdef CONFIG_LINUX
char buf[32];
const char *end;
- char *sysfspath;
+ char *sysfspath = NULL;
int ret;
- int fd = -1;
+ int sysfd = -1;
long max_segments;
+ struct stat st;
+
+ if (fstat(fd, &st)) {
+ ret = -errno;
+ goto out;
+ }
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
- major(st->st_rdev), minor(st->st_rdev));
- fd = open(sysfspath, O_RDONLY);
- if (fd == -1) {
+ major(st.st_rdev), minor(st.st_rdev));
+ sysfd = open(sysfspath, O_RDONLY);
+ if (sysfd == -1) {
ret = -errno;
goto out;
}
do {
- ret = read(fd, buf, sizeof(buf) - 1);
+ ret = read(sysfd, buf, sizeof(buf) - 1);
} while (ret == -1 && errno == EINTR);
if (ret < 0) {
ret = -errno;
}
out:
- if (fd != -1) {
- close(fd);
+ if (sysfd != -1) {
+ close(sysfd);
}
g_free(sysfspath);
return ret;
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
BDRVRawState *s = bs->opaque;
- struct stat st;
- if (!fstat(s->fd, &st)) {
- if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
- int ret = hdev_get_max_transfer_length(bs, s->fd);
- if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
- bs->bl.max_transfer = pow2floor(ret);
- }
- ret = hdev_get_max_segments(&st);
- if (ret > 0) {
- bs->bl.max_transfer = MIN(bs->bl.max_transfer,
- ret * getpagesize());
- }
+ if (bs->sg) {
+ int ret = sg_get_max_transfer_length(s->fd);
+
+ if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+ bs->bl.max_transfer = pow2floor(ret);
+ }
+
+ ret = sg_get_max_segments(s->fd);
+ if (ret > 0) {
+ bs->bl.max_transfer = MIN(bs->bl.max_transfer, ret * getpagesize());
}
}
}
}
-#ifdef CONFIG_XFS
-static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
-{
- struct xfs_flock64 fl;
- int err;
-
- memset(&fl, 0, sizeof(fl));
- fl.l_whence = SEEK_SET;
- fl.l_start = offset;
- fl.l_len = bytes;
-
- if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
- err = errno;
- trace_file_xfs_write_zeroes(strerror(errno));
- return -err;
- }
-
- return 0;
-}
-
-static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
-{
- struct xfs_flock64 fl;
- int err;
-
- memset(&fl, 0, sizeof(fl));
- fl.l_whence = SEEK_SET;
- fl.l_start = offset;
- fl.l_len = bytes;
-
- if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
- err = errno;
- trace_file_xfs_discard(strerror(errno));
- return -err;
- }
-
- return 0;
-}
-#endif
-
static int translate_err(int err)
{
if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
}
#ifdef BLKZEROOUT
- do {
- uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
- if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
- return 0;
- }
- } while (errno == EINTR);
+ /* The BLKZEROOUT implementation in the kernel doesn't set
+ * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
+ * fallbacks. */
+ if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
+ do {
+ uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+ if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+ return 0;
+ }
+ } while (errno == EINTR);
- ret = translate_err(-errno);
+ ret = translate_err(-errno);
+ if (ret == -ENOTSUP) {
+ s->has_write_zeroes = false;
+ }
+ }
#endif
- if (ret == -ENOTSUP) {
- s->has_write_zeroes = false;
- }
return ret;
}
static int handle_aiocb_write_zeroes(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
-#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
- BDRVRawState *s = aiocb->bs->opaque;
-#endif
#ifdef CONFIG_FALLOCATE
+ BDRVRawState *s = aiocb->bs->opaque;
int64_t len;
#endif
return handle_aiocb_write_zeroes_block(aiocb);
}
-#ifdef CONFIG_XFS
- if (s->is_xfs) {
- return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
- }
-#endif
-
#ifdef CONFIG_FALLOCATE_ZERO_RANGE
if (s->has_write_zeroes) {
int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
aiocb->aio_offset, aiocb->aio_nbytes);
+ if (ret == -EINVAL) {
+ /*
+ * Allow falling back to pwrite for file systems that
+ * do not support fallocate() for an unaligned byte range.
+ */
+ return -ENOTSUP;
+ }
if (ret == 0 || ret != -ENOTSUP) {
return ret;
}
}
#endif
-#ifdef CONFIG_XFS
- if (s->is_xfs) {
- /* xfs_discard() guarantees that the discarded area reads as all-zero
- * afterwards, so we can use it here. */
- return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
- }
-#endif
-
/* If we couldn't manage to unmap while guaranteed that the area reads as
* all-zero afterwards, just write zeroes without unmapping */
ret = handle_aiocb_write_zeroes(aiocb);
ret = -errno;
#endif
} else {
-#ifdef CONFIG_XFS
- if (s->is_xfs) {
- return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
- }
-#endif
-
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
aiocb->aio_offset, aiocb->aio_nbytes);
return ret;
}
+/*
+ * Help alignment probing by allocating the first block.
+ *
+ * When reading with direct I/O from an unallocated area on Gluster backed by
+ * XFS, the read succeeds regardless of request length. In this case we fall
+ * back to a safe alignment, which is not optimal. Allocating the first block
+ * avoids this fallback.
+ *
+ * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
+ * request alignment, so we use safe values.
+ *
+ * Returns: 0 on success, -errno on failure. Since this is an optimization,
+ * the caller may ignore failures.
+ */
+static int allocate_first_block(int fd, size_t max_size)
+{
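+ /*
+ * Writing MAX_BLOCKSIZE at offset 0 would grow an image smaller than
+ * MAX_BLOCKSIZE, so write only a single sector in that case.
+ */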
+ size_t write_size = (max_size < MAX_BLOCKSIZE)
+ ? BDRV_SECTOR_SIZE
+ : MAX_BLOCKSIZE;
+ size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
+ void *buf;
+ ssize_t n;
+ int ret;
+
+ buf = qemu_memalign(max_align, write_size);
+ memset(buf, 0, write_size);
+
+ do {
+ n = pwrite(fd, buf, write_size, 0);
+ } while (n == -1 && errno == EINTR);
+
+ ret = (n == -1) ? -errno : 0;
+
+ qemu_vfree(buf);
+ return ret;
+}
+
static int handle_aiocb_truncate(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
/* posix_fallocate() doesn't set errno. */
error_setg_errno(errp, -result,
"Could not preallocate new data");
+ } else if (current_length == 0) {
+ /*
+ * posix_fallocate() uses fallocate() if the filesystem
+ * supports it, or falls back to writing zeroes manually. If
+ * fallocate() was used, unaligned reads from the fallocated
+ * area in raw_probe_alignment() will succeed, hence we need to
+ * allocate the first block.
+ *
+ * Optimize future alignment probing; ignore failures.
+ */
+ allocate_first_block(fd, offset);
}
} else {
result = 0;
if (ftruncate(fd, offset) != 0) {
result = -errno;
error_setg_errno(errp, -result, "Could not resize file");
+ } else if (current_length == 0 && offset > current_length) {
+ /* Optimize future alignment probing; ignore failures. */
+ allocate_first_block(fd, offset);
}
return result;
default:
off_t data = 0, hole = 0;
int ret;
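+ /* The block layer aligns block-status requests to request_alignment
+ * (see bdrv_co_block_status), so this holds for all callers. */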
+ assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+
ret = fd_open(bs);
if (ret < 0) {
return ret;
/* On a data extent, compute bytes to the end of the extent,
* possibly including a partial sector at EOF. */
*pnum = MIN(bytes, hole - offset);
+
+ /*
+ * We are not allowed to return partial sectors, though, so
+ * round up if necessary.
+ */
+ if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
+ int64_t file_length = raw_getlength(bs);
+ if (file_length > 0) {
+ /* Ignore raw_getlength() errors; the assertion is just a safeguard */
+ assert(hole == file_length);
+ }
+ *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
+ }
+
ret = BDRV_BLOCK_DATA;
} else {
/* On a hole, compute bytes to the beginning of the next extent. */
#endif /* !__linux__ */
}
+static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
+{
+ if (ret) {
+ s->stats.discard_nb_failed++;
+ } else {
+ s->stats.discard_nb_ok++;
+ s->stats.discard_bytes_ok += nbytes;
+ }
+}
+
static coroutine_fn int
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, bool blkdev)
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
+ int ret;
acb = (RawPosixAIOData) {
.bs = bs,
acb.aio_type |= QEMU_AIO_BLKDEV;
}
- return raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
+ ret = raw_thread_pool_submit(bs, handle_aiocb_discard, &acb);
+ raw_account_discard(s, bytes, ret);
+ return ret;
}
static coroutine_fn int
if (blkdev) {
acb.aio_type |= QEMU_AIO_BLKDEV;
}
+ if (flags & BDRV_REQ_NO_FALLBACK) {
+ acb.aio_type |= QEMU_AIO_NO_FALLBACK;
+ }
if (flags & BDRV_REQ_MAY_UNMAP) {
acb.aio_type |= QEMU_AIO_DISCARD;
return 0;
}
+static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
+{
+ BDRVRawState *s = bs->opaque;
+ return (BlockStatsSpecificFile) {
+ .discard_nb_ok = s->stats.discard_nb_ok,
+ .discard_nb_failed = s->stats.discard_nb_failed,
+ .discard_bytes_ok = s->stats.discard_bytes_ok,
+ };
+}
+
+static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
+{
+ BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
+
+ stats->driver = BLOCKDEV_DRIVER_FILE;
+ stats->u.file = get_blockstats_specific_file(bs);
+
+ return stats;
+}
+
+static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
+{
+ BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
+
+ stats->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
+ stats->u.host_device = get_blockstats_specific_file(bs);
+
+ return stats;
+}
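+/*
+ * These stats surface in query-blockstats; following the usual QAPI
+ * naming convention, the C members above would map to discard-nb-ok,
+ * discard-nb-failed and discard-bytes-ok (assuming the schema mirrors
+ * these names).
+ */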
+
static QemuOptsList raw_create_opts = {
.name = "raw-create-opts",
.head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
{
.name = BLOCK_OPT_PREALLOC,
.type = QEMU_OPT_STRING,
- .help = "Preallocation mode (allowed values: off, falloc, full)"
+ .help = "Preallocation mode (allowed values: off"
+#ifdef CONFIG_POSIX_FALLOCATE
+ ", falloc"
+#endif
+ ", full)"
},
{ /* end of list */ }
}
assert(s->reopen_state->shared_perm == shared);
rs = s->reopen_state->opaque;
s->perm_change_fd = rs->fd;
+ s->perm_change_flags = rs->open_flags;
} else {
/* We may need a new fd if auto-read-only switches the mode */
ret = raw_reconfigure_getfd(bs, bs->open_flags, &open_flags, perm,
return ret;
} else if (ret != s->fd) {
s->perm_change_fd = ret;
+ s->perm_change_flags = open_flags;
}
}
if (s->perm_change_fd && s->fd != s->perm_change_fd) {
qemu_close(s->fd);
s->fd = s->perm_change_fd;
+ s->open_flags = s->perm_change_flags;
}
s->perm_change_fd = 0;
.bdrv_co_create = raw_co_create,
.bdrv_co_create_opts = raw_co_create_opts,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
+ .bdrv_has_zero_init_truncate = bdrv_has_zero_init_1,
.bdrv_co_block_status = raw_co_block_status,
.bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
.bdrv_get_info = raw_get_info,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
+ .bdrv_get_specific_stats = raw_get_specific_stats,
.bdrv_check_perm = raw_check_perm,
.bdrv_set_perm = raw_set_perm,
.bdrv_abort_perm_update = raw_abort_perm_update,
static coroutine_fn int
hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
+ BDRVRawState *s = bs->opaque;
int ret;
ret = fd_open(bs);
if (ret < 0) {
+ raw_account_discard(s, bytes, ret);
return ret;
}
return raw_do_pdiscard(bs, offset, bytes, true);
.bdrv_get_info = raw_get_info,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
+ .bdrv_get_specific_stats = hdev_get_specific_stats,
.bdrv_check_perm = raw_check_perm,
.bdrv_set_perm = raw_set_perm,
.bdrv_abort_perm_update = raw_abort_perm_update,