*
*/
-#include "qemu-timer.h"
+#include "qemu/timer.h"
#include "trace.h"
#include "qed.h"
-#include "qerror.h"
+#include "qapi/qmp/qerror.h"
+#include "migration/migration.h"
static void qed_aio_cancel(BlockDriverAIOCB *blockacb)
{
}
}
-static AIOPool qed_aio_pool = {
+static const AIOCBInfo qed_aiocb_info = {
.aiocb_size = sizeof(QEDAIOCB),
.cancel = qed_aio_cancel,
};
le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
}
-static int qed_write_header_sync(BDRVQEDState *s)
+int qed_write_header_sync(BDRVQEDState *s)
{
QEDHeader le;
int ret;
{
QEDWriteHeaderCB *write_header_cb = opaque;
BDRVQEDState *s = write_header_cb->s;
- BlockDriverAIOCB *acb;
if (ret) {
qed_write_header_cb(write_header_cb, ret);
/* Update header */
qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
- acb = bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
- write_header_cb->nsectors, qed_write_header_cb,
- write_header_cb);
- if (!acb) {
- qed_write_header_cb(write_header_cb, -EIO);
- }
+ bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
+ write_header_cb->nsectors, qed_write_header_cb,
+ write_header_cb);
}
/**
* them, and write back.
*/
- BlockDriverAIOCB *acb;
int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
BDRV_SECTOR_SIZE;
size_t len = nsectors * BDRV_SECTOR_SIZE;
write_header_cb->iov.iov_len = len;
qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
- acb = bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
- qed_write_header_read_cb, write_header_cb);
- if (!acb) {
- qed_write_header_cb(write_header_cb, -EIO);
- }
+ bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
+ qed_write_header_read_cb, write_header_cb);
}
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
qemu_del_timer(s->need_check_timer);
}
-static int bdrv_qed_open(BlockDriverState *bs, int flags)
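+/* The block layer calls this when it moves our driver state to a different
+ * BlockDriverState (e.g. during bdrv_append); refresh the cached back-pointer.
+ */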
+static void bdrv_qed_rebind(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+ s->bs = bs;
+}
+
+static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags)
{
BDRVQEDState *s = bs->opaque;
QEDHeader le_header;
qed_header_le_to_cpu(&le_header, &s->header);
if (s->header.magic != QED_MAGIC) {
- return -EINVAL;
+ return -EMEDIUMTYPE;
}
if (s->header.features & ~QED_FEATURE_MASK) {
/* image uses unsupported feature bits */
* feature is no longer valid.
*/
if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
- !bdrv_is_read_only(bs->file)) {
+ !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) {
s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
ret = qed_write_header_sync(s);
}
/* If image was not closed cleanly, check consistency */
- if (s->header.features & QED_F_NEED_CHECK) {
+ if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
/* Read-only images cannot be fixed. There is no risk of corruption
* since write operations are not possible. Therefore, allow
* potentially inconsistent images to be opened read-only. This can
* aid data recovery from an otherwise inconsistent image.
*/
- if (!bdrv_is_read_only(bs->file)) {
+ if (!bdrv_is_read_only(bs->file) &&
+ !(flags & BDRV_O_INCOMING)) {
BdrvCheckResult result = {0};
ret = qed_check(s, &result, true);
if (ret) {
goto out;
}
- if (!result.corruptions && !result.check_errors) {
- /* Ensure fixes reach storage before clearing check bit */
- bdrv_flush(s->bs);
-
- s->header.features &= ~QED_F_NEED_CHECK;
- qed_write_header_sync(s);
- }
}
}
return ret;
}
+/* We have nothing to do for QED reopen; the stub just returns
+ * success */
+static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
+ BlockReopenQueue *queue, Error **errp)
+{
+ return 0;
+}
+
static void bdrv_qed_close(BlockDriverState *bs)
{
BDRVQEDState *s = bs->opaque;
return ret;
}
- ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR | BDRV_O_CACHE_WB);
+ ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB);
if (ret < 0) {
return ret;
}
}
typedef struct {
+ Coroutine *co;
int is_allocated;
int *pnum;
} QEDIsAllocatedCB;
QEDIsAllocatedCB *cb = opaque;
*cb->pnum = len / BDRV_SECTOR_SIZE;
cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
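+    /* Wake the coroutine that is waiting in bdrv_qed_co_is_allocated() */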
+ if (cb->co) {
+ qemu_coroutine_enter(cb->co, NULL);
+ }
}
-static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, int *pnum)
+static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
{
BDRVQEDState *s = bs->opaque;
uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
+ /* Now sleep if the callback wasn't invoked immediately */
while (cb.is_allocated == -1) {
- qemu_aio_wait();
+ cb.co = qemu_coroutine_self();
+ qemu_coroutine_yield();
}
qed_unref_l2_cache_entry(request.l2_table);
QEMUIOVector *qiov,
BlockDriverCompletionFunc *cb, void *opaque)
{
- BlockDriverAIOCB *aiocb;
uint64_t backing_length = 0;
size_t size;
/* Zero all sectors if reading beyond the end of the backing file */
if (pos >= backing_length ||
pos + qiov->size > backing_length) {
- qemu_iovec_memset(qiov, 0, qiov->size);
+ qemu_iovec_memset(qiov, 0, 0, qiov->size);
}
/* Complete now if there are no backing file sectors to read */
/* If the read straddles the end of the backing file, shorten it */
size = MIN((uint64_t)backing_length - pos, qiov->size);
- BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING);
- aiocb = bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
- qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
- if (!aiocb) {
- cb(opaque, -EIO);
- }
+ BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
+ bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE,
+ qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
}
typedef struct {
{
CopyFromBackingFileCB *copy_cb = opaque;
BDRVQEDState *s = copy_cb->s;
- BlockDriverAIOCB *aiocb;
if (ret) {
qed_copy_from_backing_file_cb(copy_cb, ret);
}
BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
- aiocb = bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
- &copy_cb->qiov,
- copy_cb->qiov.size / BDRV_SECTOR_SIZE,
- qed_copy_from_backing_file_cb, copy_cb);
- if (!aiocb) {
- qed_copy_from_backing_file_cb(copy_cb, -EIO);
- }
+ bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
+ &copy_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
+ qed_copy_from_backing_file_cb, copy_cb);
}
/**
qemu_iovec_destroy(&acb->cur_qiov);
qed_unref_l2_cache_entry(acb->request.l2_table);
+ /* Free the buffer we may have allocated for zero writes */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ qemu_vfree(acb->qiov->iov[0].iov_base);
+ acb->qiov->iov[0].iov_base = NULL;
+ }
+
/* Arrange for a bh to invoke the completion function */
acb->bh_ret = ret;
acb->bh = qemu_bh_new(qed_aio_complete_bh, acb);
/**
* Update L2 table with new cluster offsets and write them out
*/
-static void qed_aio_write_l2_update(void *opaque, int ret)
+static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
{
- QEDAIOCB *acb = opaque;
BDRVQEDState *s = acb_to_s(acb);
bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
int index;
index = qed_l2_index(s, acb->cur_pos);
qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
- acb->cur_cluster);
+ offset);
if (need_alloc) {
/* Write out the whole new L2 table */
qed_aio_complete(acb, ret);
}
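+/* Callback wrapper: write the offset of the data cluster just written
+ * (acb->cur_cluster) into the L2 table.
+ */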
+static void qed_aio_write_l2_update_cb(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
+}
+
/**
* Flush new data clusters before updating the L2 table
*
QEDAIOCB *acb = opaque;
BDRVQEDState *s = acb_to_s(acb);
- if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update, opaque)) {
+ if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
qed_aio_complete(acb, -EIO);
}
}
uint64_t offset = acb->cur_cluster +
qed_offset_into_cluster(s, acb->cur_pos);
BlockDriverCompletionFunc *next_fn;
- BlockDriverAIOCB *file_acb;
trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
if (s->bs->backing_hd) {
next_fn = qed_aio_write_flush_before_l2_update;
} else {
- next_fn = qed_aio_write_l2_update;
+ next_fn = qed_aio_write_l2_update_cb;
}
}
BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
- file_acb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
- &acb->cur_qiov,
- acb->cur_qiov.size / BDRV_SECTOR_SIZE,
- next_fn, acb);
- if (!file_acb) {
- qed_aio_complete(acb, -EIO);
- }
+ bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
+ &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+ next_fn, acb);
}
/**
return !(s->header.features & QED_F_NEED_CHECK);
}
+static void qed_aio_write_zero_cluster(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
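+    /* Mark the clusters as zero clusters: offset 1 is the special "zero
+     * cluster" value in the L2 table, so no data cluster is allocated.
+     */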
+ qed_aio_write_l2_update(acb, 0, 1);
+}
+
/**
* Write new data cluster
*
static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
{
BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverCompletionFunc *cb;
/* Cancel timer when the first allocating request comes in */
if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
acb->cur_nclusters = qed_bytes_to_clusters(s,
qed_offset_into_cluster(s, acb->cur_pos) + len);
- acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
- qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+
+ if (acb->flags & QED_AIOCB_ZERO) {
+ /* Skip ahead if the clusters are already zero */
+ if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
+ qed_aio_next_io(acb, 0);
+ return;
+ }
+
+ cb = qed_aio_write_zero_cluster;
+ } else {
+ cb = qed_aio_write_prefill;
+ acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
+ }
if (qed_should_set_need_check(s)) {
s->header.features |= QED_F_NEED_CHECK;
- qed_write_header(s, qed_aio_write_prefill, acb);
+ qed_write_header(s, cb, acb);
} else {
- qed_aio_write_prefill(acb, 0);
+ cb(acb, 0);
}
}
*/
static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
{
+ /* Allocate buffer for zero writes */
+ if (acb->flags & QED_AIOCB_ZERO) {
+ struct iovec *iov = acb->qiov->iov;
+
+ if (!iov->iov_base) {
+ iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len);
+ memset(iov->iov_base, 0, iov->iov_len);
+ }
+ }
+
/* Calculate the I/O vector */
acb->cur_cluster = offset;
- qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
/* Do the actual write */
qed_aio_write_main(acb, 0);
QEDAIOCB *acb = opaque;
BDRVQEDState *s = acb_to_s(acb);
BlockDriverState *bs = acb->common.bs;
- BlockDriverAIOCB *file_acb;
/* Adjust offset into cluster */
offset += qed_offset_into_cluster(s, acb->cur_pos);
goto err;
}
- qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
+ qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
/* Handle zero cluster and backing file reads */
if (ret == QED_CLUSTER_ZERO) {
- qemu_iovec_memset(&acb->cur_qiov, 0, acb->cur_qiov.size);
+ qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
qed_aio_next_io(acb, 0);
return;
} else if (ret != QED_CLUSTER_FOUND) {
}
BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
- file_acb = bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
- &acb->cur_qiov,
- acb->cur_qiov.size / BDRV_SECTOR_SIZE,
- qed_aio_next_io, acb);
- if (!file_acb) {
- ret = -EIO;
- goto err;
- }
+ bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
+ &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+ qed_aio_next_io, acb);
return;
err:
{
QEDAIOCB *acb = opaque;
BDRVQEDState *s = acb_to_s(acb);
- QEDFindClusterFunc *io_fn =
- acb->is_write ? qed_aio_write_data : qed_aio_read_data;
+ QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
+ qed_aio_write_data : qed_aio_read_data;
trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
int64_t sector_num,
QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb,
- void *opaque, bool is_write)
+ void *opaque, int flags)
{
- QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
+ QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
- opaque, is_write);
+ opaque, flags);
- acb->is_write = is_write;
+ acb->flags = flags;
acb->finished = NULL;
acb->qiov = qiov;
acb->qiov_offset = 0;
BlockDriverCompletionFunc *cb,
void *opaque)
{
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, false);
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
BlockDriverCompletionFunc *cb,
void *opaque)
{
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, true);
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QED_AIOCB_WRITE);
}
-static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs,
- BlockDriverCompletionFunc *cb,
- void *opaque)
+typedef struct {
+ Coroutine *co;
+ int ret;
+ bool done;
+} QEDWriteZeroesCB;
+
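+/* Completion callback for the zero-write request: record the return value and
+ * re-enter the coroutine waiting in bdrv_qed_co_write_zeroes(), if it has
+ * already yielded.
+ */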
+static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
{
- return bdrv_aio_flush(bs->file, cb, opaque);
+ QEDWriteZeroesCB *cb = opaque;
+
+ cb->done = true;
+ cb->ret = ret;
+ if (cb->co) {
+ qemu_coroutine_enter(cb->co, NULL);
+ }
+}
+
+static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors)
+{
+ BlockDriverAIOCB *blockacb;
+ BDRVQEDState *s = bs->opaque;
+ QEDWriteZeroesCB cb = { .done = false };
+ QEMUIOVector qiov;
+ struct iovec iov;
+
+ /* Refuse if there are untouched backing file sectors */
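+    /* Zeroing only part of a cluster cannot be expressed as a zero cluster,
+     * because the untouched sectors would lose their backing file contents,
+     * so unaligned requests are rejected with -ENOTSUP.
+     */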
+ if (bs->backing_hd) {
+ if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+ return -ENOTSUP;
+ }
+ if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+ return -ENOTSUP;
+ }
+ }
+
+ /* Zero writes start without an I/O buffer. If a buffer becomes necessary
+ * then it will be allocated during request processing.
+ */
+    iov.iov_base = NULL;
+    iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
+
+ qemu_iovec_init_external(&qiov, &iov, 1);
+ blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
+ qed_co_write_zeroes_cb, &cb,
+ QED_AIOCB_WRITE | QED_AIOCB_ZERO);
+ if (!blockacb) {
+ return -EIO;
+ }
+ if (!cb.done) {
+ cb.co = qemu_coroutine_self();
+ qemu_coroutine_yield();
+ }
+ assert(cb.done);
+ return cb.ret;
}
static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset)
memset(bdi, 0, sizeof(*bdi));
bdi->cluster_size = s->header.cluster_size;
+ bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
return 0;
}
return ret;
}
-static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result)
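+/* Drop cached metadata and re-read it from the image file by closing,
+ * clearing, and reopening the image (used e.g. after incoming live migration
+ * completes).
+ */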
+static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
+{
+ BDRVQEDState *s = bs->opaque;
+
+ bdrv_qed_close(bs);
+ memset(s, 0, sizeof(BDRVQEDState));
+ bdrv_qed_open(bs, NULL, bs->open_flags);
+}
+
+static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result,
+ BdrvCheckMode fix)
{
BDRVQEDState *s = bs->opaque;
- return qed_check(s, result, false);
+ return qed_check(s, result, !!fix);
}
static QEMUOptionParameter qed_create_options[] = {
.create_options = qed_create_options,
.bdrv_probe = bdrv_qed_probe,
+ .bdrv_rebind = bdrv_qed_rebind,
.bdrv_open = bdrv_qed_open,
.bdrv_close = bdrv_qed_close,
+ .bdrv_reopen_prepare = bdrv_qed_reopen_prepare,
.bdrv_create = bdrv_qed_create,
- .bdrv_is_allocated = bdrv_qed_is_allocated,
+ .bdrv_co_is_allocated = bdrv_qed_co_is_allocated,
.bdrv_make_empty = bdrv_qed_make_empty,
.bdrv_aio_readv = bdrv_qed_aio_readv,
.bdrv_aio_writev = bdrv_qed_aio_writev,
- .bdrv_aio_flush = bdrv_qed_aio_flush,
+ .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes,
.bdrv_truncate = bdrv_qed_truncate,
.bdrv_getlength = bdrv_qed_getlength,
.bdrv_get_info = bdrv_qed_get_info,
.bdrv_change_backing_file = bdrv_qed_change_backing_file,
+ .bdrv_invalidate_cache = bdrv_qed_invalidate_cache,
.bdrv_check = bdrv_qed_check,
};