#define SD_OP_READ_OBJ 0x02
#define SD_OP_WRITE_OBJ 0x03
/* 0x04 is used internally by Sheepdog */
-#define SD_OP_DISCARD_OBJ 0x05
#define SD_OP_NEW_VDI 0x11
#define SD_OP_LOCK_VDI 0x12
AIOCB_DISCARD_OBJ,
};
+#define AIOCBOverlapping(x, y) \
+ (!(x->max_affect_data_idx < y->min_affect_data_idx \
+ || y->max_affect_data_idx < x->min_affect_data_idx))
+
struct SheepdogAIOCB {
BlockAIOCB common;
bool cancelable;
int nr_pending;
+
+ uint32_t min_affect_data_idx;
+ uint32_t max_affect_data_idx;
+
+ /*
+ * The difference between affect_data_idx and dirty_data_idx:
+ * affect_data_idx represents range of index of all request types.
+ * dirty_data_idx represents range of index updated by COW requests.
+ * dirty_data_idx is used for updating an inode object.
+ */
+ uint32_t min_dirty_data_idx;
+ uint32_t max_dirty_data_idx;
+
+ QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
};
typedef struct BDRVSheepdogState {
SheepdogInode inode;
- uint32_t min_dirty_data_idx;
- uint32_t max_dirty_data_idx;
-
char name[SD_MAX_VDI_LEN];
bool is_snapshot;
uint32_t cache_flags;
/* Every aio request must be linked to either of these queues. */
QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
- QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
+
+ CoQueue overlapping_queue;
+ QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
} BDRVSheepdogState;
+typedef struct BDRVSheepdogReopenState {
+ int fd;
+ int cache_flags;
+} BDRVSheepdogReopenState;
+
static const char * sd_strerror(int err)
{
int i;
AIOReq *aioreq, *next;
if (sd_acb_cancelable(acb)) {
- /* Remove outstanding requests from pending and failed queues. */
- QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
- next) {
- if (aioreq->aiocb == acb) {
- free_aio_req(s, aioreq);
- }
- }
+ /* Remove outstanding requests from failed queue. */
QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
next) {
if (aioreq->aiocb == acb) {
int64_t sector_num, int nb_sectors)
{
SheepdogAIOCB *acb;
+ uint32_t object_size;
+ BDRVSheepdogState *s = bs->opaque;
+
+ object_size = (UINT32_C(1) << s->inode.block_size_shift);
acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
acb->coroutine = qemu_coroutine_self();
acb->ret = 0;
acb->nr_pending = 0;
+
+ acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
+ acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
+ acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
+
+ acb->min_dirty_data_idx = UINT32_MAX;
+ acb->max_dirty_data_idx = 0;
+
return acb;
}
static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
static void co_write_request(void *opaque);
-static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
-{
- AIOReq *aio_req;
-
- QLIST_FOREACH(aio_req, &s->pending_aio_head, aio_siblings) {
- if (aio_req->oid == oid) {
- return aio_req;
- }
- }
-
- return NULL;
-}
-
-/*
- * This function searchs pending requests to the object `oid', and
- * sends them.
- */
-static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
-{
- AIOReq *aio_req;
- SheepdogAIOCB *acb;
-
- while ((aio_req = find_pending_req(s, oid)) != NULL) {
- acb = aio_req->aiocb;
- /* move aio_req from pending list to inflight one */
- QLIST_REMOVE(aio_req, aio_siblings);
- QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
- acb->aiocb_type);
- }
-}
-
static coroutine_fn void reconnect_to_sdog(void *opaque)
{
BDRVSheepdogState *s = opaque;
*/
if (rsp.result == SD_RES_SUCCESS) {
s->inode.data_vdi_id[idx] = s->inode.vdi_id;
- s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
- s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+ acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
+ acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
}
- /*
- * Some requests may be blocked because simultaneous
- * create requests are not allowed, so we search the
- * pending requests here.
- */
- send_pending_req(s, aio_req->oid);
}
break;
case AIOCB_READ_UDATA:
rsp.result = SD_RES_SUCCESS;
s->discard_supported = false;
break;
- case SD_RES_SUCCESS:
- idx = data_oid_to_idx(aio_req->oid);
- s->inode.data_vdi_id[idx] = 0;
- break;
default:
break;
}
hdr.flags = SD_FLAG_CMD_WRITE | flags;
break;
case AIOCB_DISCARD_OBJ:
- hdr.opcode = SD_OP_DISCARD_OBJ;
+ hdr.opcode = SD_OP_WRITE_OBJ;
+ hdr.flags = SD_FLAG_CMD_WRITE | flags;
+ s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
+ offset = offsetof(SheepdogInode,
+ data_vdi_id[data_oid_to_idx(oid)]);
+ oid = vid_to_vdi_oid(s->inode.vdi_id);
+ wlen = datalen = sizeof(uint32_t);
break;
}
return ret;
}
-/* Return true if the specified request is linked to the pending list. */
-static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req)
-{
- AIOReq *areq;
- QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
- if (areq != aio_req && areq->oid == aio_req->oid) {
- /*
- * Sheepdog cannot handle simultaneous create requests to the same
- * object, so we cannot send the request until the previous request
- * finishes.
- */
- DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid);
- aio_req->flags = 0;
- aio_req->base_oid = 0;
- aio_req->create = false;
- QLIST_REMOVE(aio_req, aio_siblings);
- QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
- return true;
- }
- }
-
- return false;
-}
-
static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
{
SheepdogAIOCB *acb = aio_req->aiocb;
goto out;
}
- if (check_simultaneous_create(s, aio_req)) {
- return;
- }
-
if (s->inode.data_vdi_id[idx]) {
aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
aio_req->flags |= SD_FLAG_CMD_COW;
filename = qemu_opt_get(opts, "filename");
QLIST_INIT(&s->inflight_aio_head);
- QLIST_INIT(&s->pending_aio_head);
QLIST_INIT(&s->failed_aio_head);
+ QLIST_INIT(&s->inflight_aiocb_head);
s->fd = -1;
memset(vdi, 0, sizeof(vdi));
}
memcpy(&s->inode, buf, sizeof(s->inode));
- s->min_dirty_data_idx = UINT32_MAX;
- s->max_dirty_data_idx = 0;
bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
pstrcpy(s->name, sizeof(s->name), vdi);
qemu_co_mutex_init(&s->lock);
+ qemu_co_queue_init(&s->overlapping_queue);
qemu_opts_del(opts);
g_free(buf);
return 0;
return ret;
}
+static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
+ Error **errp)
+{
+ BDRVSheepdogState *s = state->bs->opaque;
+ BDRVSheepdogReopenState *re_s;
+ int ret = 0;
+
+ re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);
+
+ re_s->cache_flags = SD_FLAG_CMD_CACHE;
+ if (state->flags & BDRV_O_NOCACHE) {
+ re_s->cache_flags = SD_FLAG_CMD_DIRECT;
+ }
+
+ re_s->fd = get_sheep_fd(s, errp);
+ if (re_s->fd < 0) {
+ ret = re_s->fd;
+ return ret;
+ }
+
+ return ret;
+}
+
+static void sd_reopen_commit(BDRVReopenState *state)
+{
+ BDRVSheepdogReopenState *re_s = state->opaque;
+ BDRVSheepdogState *s = state->bs->opaque;
+
+ if (s->fd) {
+ aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+ closesocket(s->fd);
+ }
+
+ s->fd = re_s->fd;
+ s->cache_flags = re_s->cache_flags;
+
+ g_free(state->opaque);
+ state->opaque = NULL;
+
+ return;
+}
+
+static void sd_reopen_abort(BDRVReopenState *state)
+{
+ BDRVSheepdogReopenState *re_s = state->opaque;
+ BDRVSheepdogState *s = state->bs->opaque;
+
+ if (re_s == NULL) {
+ return;
+ }
+
+ if (re_s->fd) {
+ aio_set_fd_handler(s->aio_context, re_s->fd, NULL, NULL, NULL);
+ closesocket(re_s->fd);
+ }
+
+ g_free(state->opaque);
+ state->opaque = NULL;
+
+ return;
+}
+
static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
Error **errp)
{
int ret;
ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
- NULL, errp);
+ errp);
if (ret < 0) {
goto out_with_err_set;
}
if ((object_size - 1) & object_size) { /* not a power of 2? */
return -EINVAL;
}
- obj_order = ffs(object_size) - 1;
+ obj_order = ctz32(object_size);
if (obj_order < 20 || obj_order > 31) {
return -EINVAL;
}
}
bs = NULL;
- ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, NULL,
- errp);
+ ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, errp);
if (ret < 0) {
goto out;
}
AIOReq *aio_req;
uint32_t offset, data_len, mn, mx;
- mn = s->min_dirty_data_idx;
- mx = s->max_dirty_data_idx;
+ mn = acb->min_dirty_data_idx;
+ mx = acb->max_dirty_data_idx;
if (mn <= mx) {
/* we need to update the vdi object. */
offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
mn * sizeof(s->inode.data_vdi_id[0]);
data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
- s->min_dirty_data_idx = UINT32_MAX;
- s->max_dirty_data_idx = 0;
+ acb->min_dirty_data_idx = UINT32_MAX;
+ acb->max_dirty_data_idx = 0;
iov.iov_base = &s->inode;
iov.iov_len = sizeof(s->inode);
}
aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
- old_oid, done);
+ old_oid,
+ acb->aiocb_type == AIOCB_DISCARD_OBJ ?
+ 0 : done);
QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- if (create) {
- if (check_simultaneous_create(s, aio_req)) {
- goto done;
- }
- }
-
add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
acb->aiocb_type);
done:
return 1;
}
+static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
+{
+ SheepdogAIOCB *cb;
+
+ QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
+ if (AIOCBOverlapping(aiocb, cb)) {
+ return true;
+ }
+ }
+
+ QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
+ return false;
+}
+
static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov)
{
acb->aio_done_func = sd_write_done;
acb->aiocb_type = AIOCB_WRITE_UDATA;
+retry:
+ if (check_overlapping_aiocb(s, acb)) {
+ qemu_co_queue_wait(&s->overlapping_queue);
+ goto retry;
+ }
+
ret = sd_co_rw_vector(acb);
if (ret <= 0) {
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overlapping_queue);
qemu_aio_unref(acb);
return ret;
}
qemu_coroutine_yield();
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overlapping_queue);
+
return acb->ret;
}
{
SheepdogAIOCB *acb;
int ret;
+ BDRVSheepdogState *s = bs->opaque;
acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
acb->aiocb_type = AIOCB_READ_UDATA;
acb->aio_done_func = sd_finish_aiocb;
+retry:
+ if (check_overlapping_aiocb(s, acb)) {
+ qemu_co_queue_wait(&s->overlapping_queue);
+ goto retry;
+ }
+
ret = sd_co_rw_vector(acb);
if (ret <= 0) {
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overlapping_queue);
qemu_aio_unref(acb);
return ret;
}
qemu_coroutine_yield();
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overlapping_queue);
return acb->ret;
}
if (ret < 0) {
error_report("failed to create inode for snapshot: %s",
error_get_pretty(local_err));
+ error_free(local_err);
goto cleanup;
}
int nb_sectors)
{
SheepdogAIOCB *acb;
- QEMUIOVector dummy;
BDRVSheepdogState *s = bs->opaque;
int ret;
+ QEMUIOVector discard_iov;
+ struct iovec iov;
+ uint32_t zero = 0;
if (!s->discard_supported) {
return 0;
}
- acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors);
+ memset(&discard_iov, 0, sizeof(discard_iov));
+ memset(&iov, 0, sizeof(iov));
+ iov.iov_base = &zero;
+ iov.iov_len = sizeof(zero);
+ discard_iov.iov = &iov;
+ discard_iov.niov = 1;
+ acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors);
acb->aiocb_type = AIOCB_DISCARD_OBJ;
acb->aio_done_func = sd_finish_aiocb;
+retry:
+ if (check_overlapping_aiocb(s, acb)) {
+ qemu_co_queue_wait(&s->overlapping_queue);
+ goto retry;
+ }
+
ret = sd_co_rw_vector(acb);
if (ret <= 0) {
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overlapping_queue);
qemu_aio_unref(acb);
return ret;
}
qemu_coroutine_yield();
+ QLIST_REMOVE(acb, aiocb_siblings);
+ qemu_co_queue_restart_all(&s->overlapping_queue);
+
return acb->ret;
}
.instance_size = sizeof(BDRVSheepdogState),
.bdrv_needs_filename = true,
.bdrv_file_open = sd_open,
+ .bdrv_reopen_prepare = sd_reopen_prepare,
+ .bdrv_reopen_commit = sd_reopen_commit,
+ .bdrv_reopen_abort = sd_reopen_abort,
.bdrv_close = sd_close,
.bdrv_create = sd_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.instance_size = sizeof(BDRVSheepdogState),
.bdrv_needs_filename = true,
.bdrv_file_open = sd_open,
+ .bdrv_reopen_prepare = sd_reopen_prepare,
+ .bdrv_reopen_commit = sd_reopen_commit,
+ .bdrv_reopen_abort = sd_reopen_abort,
.bdrv_close = sd_close,
.bdrv_create = sd_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.instance_size = sizeof(BDRVSheepdogState),
.bdrv_needs_filename = true,
.bdrv_file_open = sd_open,
+ .bdrv_reopen_prepare = sd_reopen_prepare,
+ .bdrv_reopen_commit = sd_reopen_commit,
+ .bdrv_reopen_abort = sd_reopen_abort,
.bdrv_close = sd_close,
.bdrv_create = sd_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,