#define SD_NR_VDIS (1U << 24)
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
+/*
+ * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
+ * (SD_EC_MAX_STRIP - 1) for parity strips
+ *
+ * SD_MAX_COPIES is sum of number of data strips and parity strips.
+ */
+#define SD_EC_MAX_STRIP 16
+#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)
#define SD_INODE_SIZE (sizeof(SheepdogInode))
#define CURRENT_VDI_ID 0
uint32_t data_length;
uint64_t oid;
uint64_t cow_oid;
- uint32_t copies;
- uint32_t rsvd;
+ uint8_t copies;
+ uint8_t copy_policy;
+ uint8_t reserved[6];
uint64_t offset;
} SheepdogObjReq;
uint32_t id;
uint32_t data_length;
uint32_t result;
- uint32_t copies;
+ uint8_t copies;
+ uint8_t copy_policy;
+ uint8_t reserved[2];
uint32_t pad[6];
} SheepdogObjRsp;
uint32_t id;
uint32_t data_length;
uint64_t vdi_size;
- uint32_t vdi_id;
- uint32_t copies;
+ uint32_t base_vdi_id;
+ uint8_t copies;
+ uint8_t copy_policy;
+ uint8_t reserved[2];
uint32_t snapid;
uint32_t pad[3];
} SheepdogVdiReq;
return oid & (MAX_DATA_OBJS - 1);
}
+static inline uint32_t oid_to_vid(uint64_t oid)
+{
+ return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
+}
+
static inline uint64_t vid_to_vdi_oid(uint32_t vid)
{
return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
Coroutine *coroutine;
void (*aio_done_func)(SheepdogAIOCB *);
- bool canceled;
+ bool cancelable;
+ bool *finished;
int nr_pending;
};
typedef struct BDRVSheepdogState {
+ BlockDriverState *bs;
+
SheepdogInode inode;
uint32_t min_dirty_data_idx;
Coroutine *co_recv;
uint32_t aioreq_seq_num;
+
+ /* Every aio request must be linked to either of these queues. */
QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head;
+ QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
} BDRVSheepdogState;
static const char * sd_strerror(int err)
{
SheepdogAIOCB *acb = aio_req->aiocb;
+ acb->cancelable = false;
QLIST_REMOVE(aio_req, aio_siblings);
g_free(aio_req);
static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
{
- if (!acb->canceled) {
- qemu_coroutine_enter(acb->coroutine, NULL);
+ qemu_coroutine_enter(acb->coroutine, NULL);
+ if (acb->finished) {
+ *acb->finished = true;
}
qemu_aio_release(acb);
}
+/*
+ * Check whether the specified acb can be canceled
+ *
+ * We can cancel aio when any request belonging to the acb is:
+ * - Not processed by the sheepdog server.
+ * - Not linked to the inflight queue.
+ */
+static bool sd_acb_cancelable(const SheepdogAIOCB *acb)
+{
+ BDRVSheepdogState *s = acb->common.bs->opaque;
+ AIOReq *aioreq;
+
+ if (!acb->cancelable) {
+ return false;
+ }
+
+ QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) {
+ if (aioreq->aiocb == acb) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
{
SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
+ BDRVSheepdogState *s = acb->common.bs->opaque;
+ AIOReq *aioreq, *next;
+ bool finished = false;
+
+ acb->finished = &finished;
+ while (!finished) {
+ if (sd_acb_cancelable(acb)) {
+ /* Remove outstanding requests from pending and failed queues. */
+ QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings,
+ next) {
+ if (aioreq->aiocb == acb) {
+ free_aio_req(s, aioreq);
+ }
+ }
+ QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
+ next) {
+ if (aioreq->aiocb == acb) {
+ free_aio_req(s, aioreq);
+ }
+ }
- /*
- * Sheepdog cannot cancel the requests which are already sent to
- * the servers, so we just complete the request with -EIO here.
- */
- acb->ret = -EIO;
- qemu_coroutine_enter(acb->coroutine, NULL);
- acb->canceled = true;
+ assert(acb->nr_pending == 0);
+ sd_finish_aiocb(acb);
+ return;
+ }
+ qemu_aio_wait();
+ }
}
static const AIOCBInfo sd_aiocb_info = {
acb->nb_sectors = nb_sectors;
acb->aio_done_func = NULL;
- acb->canceled = false;
+ acb->cancelable = true;
+ acb->finished = NULL;
acb->coroutine = qemu_coroutine_self();
acb->ret = 0;
acb->nr_pending = 0;
return acb;
}
-static int connect_to_sdog(BDRVSheepdogState *s)
+static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
{
int fd;
- Error *err = NULL;
if (s->is_unix) {
- fd = unix_connect(s->host_spec, &err);
+ fd = unix_connect(s->host_spec, errp);
} else {
- fd = inet_connect(s->host_spec, &err);
+ fd = inet_connect(s->host_spec, errp);
- if (err == NULL) {
+ if (fd >= 0) {
int ret = socket_set_nodelay(fd);
if (ret < 0) {
error_report("%s", strerror(errno));
}
}
- if (err != NULL) {
- qerror_report_err(err);
- error_free(err);
- } else {
+ if (fd >= 0) {
qemu_set_nonblock(fd);
}
int ret;
ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
- if (ret < sizeof(*hdr)) {
+ if (ret != sizeof(*hdr)) {
error_report("failed to send a req, %s", strerror(errno));
return ret;
}
ret = qemu_co_send(sockfd, data, *wlen);
- if (ret < *wlen) {
+ if (ret != *wlen) {
error_report("failed to send a req, %s", strerror(errno));
}
qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, co);
ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
- if (ret < sizeof(*hdr)) {
+ if (ret != sizeof(*hdr)) {
error_report("failed to get a rsp, %s", strerror(errno));
ret = -errno;
goto out;
if (*rlen) {
ret = qemu_co_recv(sockfd, data, *rlen);
- if (ret < *rlen) {
+ if (ret != *rlen) {
error_report("failed to get the data, %s", strerror(errno));
ret = -errno;
goto out;
return srco.ret;
}
-static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
+static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
struct iovec *iov, int niov, bool create,
enum AIOCBState aiocb_type);
-static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
-
+static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
+static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
+static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
+static void co_write_request(void *opaque);
static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid)
{
{
AIOReq *aio_req;
SheepdogAIOCB *acb;
- int ret;
while ((aio_req = find_pending_req(s, oid)) != NULL) {
acb = aio_req->aiocb;
/* move aio_req from pending list to inflight one */
QLIST_REMOVE(aio_req, aio_siblings);
QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- ret = add_aio_request(s, aio_req, acb->qiov->iov,
- acb->qiov->niov, false, acb->aiocb_type);
- if (ret < 0) {
- error_report("add_aio_request is failed");
- free_aio_req(s, aio_req);
- if (!acb->nr_pending) {
- sd_finish_aiocb(acb);
- }
+ add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, false,
+ acb->aiocb_type);
+ }
+}
+
+static coroutine_fn void reconnect_to_sdog(void *opaque)
+{
+ Error *local_err = NULL;
+ BDRVSheepdogState *s = opaque;
+ AIOReq *aio_req, *next;
+
+ qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL);
+ close(s->fd);
+ s->fd = -1;
+
+ /* Wait for outstanding write requests to be completed. */
+ while (s->co_send != NULL) {
+ co_write_request(opaque);
+ }
+
+ /* Try to reconnect the sheepdog server every one second. */
+ while (s->fd < 0) {
+ s->fd = get_sheep_fd(s, &local_err);
+ if (s->fd < 0) {
+ DPRINTF("Wait for connection to be established\n");
+ qerror_report_err(local_err);
+ error_free(local_err);
+ co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME,
+ 1000000000ULL);
}
+ };
+
+ /*
+ * Now we have to resend all the request in the inflight queue. However,
+ * resend_aioreq() can yield and newly created requests can be added to the
+ * inflight queue before the coroutine is resumed. To avoid mixing them, we
+ * have to move all the inflight requests to the failed queue before
+ * resend_aioreq() is called.
+ */
+ QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) {
+ QLIST_REMOVE(aio_req, aio_siblings);
+ QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings);
+ }
+
+ /* Resend all the failed aio requests. */
+ while (!QLIST_EMPTY(&s->failed_aio_head)) {
+ aio_req = QLIST_FIRST(&s->failed_aio_head);
+ QLIST_REMOVE(aio_req, aio_siblings);
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+ resend_aioreq(s, aio_req);
}
}
SheepdogAIOCB *acb;
uint64_t idx;
- if (QLIST_EMPTY(&s->inflight_aio_head)) {
- goto out;
- }
-
/* read a header */
ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
- if (ret < 0) {
+ if (ret != sizeof(rsp)) {
error_report("failed to get the header, %s", strerror(errno));
- goto out;
+ goto err;
}
/* find the right aio_req from the inflight aio list */
}
if (!aio_req) {
error_report("cannot find aio_req %x", rsp.id);
- goto out;
+ goto err;
}
acb = aio_req->aiocb;
case AIOCB_READ_UDATA:
ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
aio_req->iov_offset, rsp.data_length);
- if (ret < 0) {
+ if (ret != rsp.data_length) {
error_report("failed to get the data, %s", strerror(errno));
- goto out;
+ goto err;
}
break;
case AIOCB_FLUSH_CACHE:
case SD_RES_SUCCESS:
break;
case SD_RES_READONLY:
- ret = resend_aioreq(s, aio_req);
- if (ret == SD_RES_SUCCESS) {
- goto out;
+ if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) {
+ ret = reload_inode(s, 0, "");
+ if (ret < 0) {
+ goto err;
+ }
}
- /* fall through */
+ if (is_data_obj(aio_req->oid)) {
+ aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
+ data_oid_to_idx(aio_req->oid));
+ } else {
+ aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
+ }
+ resend_aioreq(s, aio_req);
+ goto out;
default:
acb->ret = -EIO;
error_report("%s", sd_strerror(rsp.result));
}
out:
s->co_recv = NULL;
+ return;
+err:
+ s->co_recv = NULL;
+ reconnect_to_sdog(opaque);
}
static void co_read_response(void *opaque)
}
/*
- * Return a socket discriptor to read/write objects.
+ * Return a socket descriptor to read/write objects.
*
- * We cannot use this discriptor for other operations because
+ * We cannot use this descriptor for other operations because
* the block driver may be on waiting response from the server.
*/
-static int get_sheep_fd(BDRVSheepdogState *s)
+static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
{
int fd;
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, errp);
if (fd < 0) {
return fd;
}
static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
uint32_t snapid, const char *tag, uint32_t *vid,
- bool lock)
+ bool lock, Error **errp)
{
int ret, fd;
SheepdogVdiReq hdr;
unsigned int wlen, rlen = 0;
char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, errp);
if (fd < 0) {
return fd;
}
ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
if (ret) {
+ error_setg_errno(errp, -ret, "cannot get vdi info");
goto out;
}
if (rsp->result != SD_RES_SUCCESS) {
- error_report("cannot get vdi info, %s, %s %d %s",
- sd_strerror(rsp->result), filename, snapid, tag);
+ error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s",
+ sd_strerror(rsp->result), filename, snapid, tag);
if (rsp->result == SD_RES_NO_VDI) {
ret = -ENOENT;
} else {
return ret;
}
-static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
+static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
struct iovec *iov, int niov, bool create,
enum AIOCBState aiocb_type)
{
/* send a header */
ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
- if (ret < 0) {
- qemu_co_mutex_unlock(&s->lock);
+ if (ret != sizeof(hdr)) {
error_report("failed to send a req, %s", strerror(errno));
- return -errno;
+ goto out;
}
if (wlen) {
ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
- if (ret < 0) {
- qemu_co_mutex_unlock(&s->lock);
+ if (ret != wlen) {
error_report("failed to send a data, %s", strerror(errno));
- return -errno;
}
}
-
+out:
socket_set_cork(s->fd, 0);
qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, s);
+ s->co_send = NULL;
qemu_co_mutex_unlock(&s->lock);
-
- return 0;
}
-static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
+static int read_write_object(int fd, char *buf, uint64_t oid, uint8_t copies,
unsigned int datalen, uint64_t offset,
bool write, bool create, uint32_t cache_flags)
{
}
}
-static int read_object(int fd, char *buf, uint64_t oid, int copies,
+static int read_object(int fd, char *buf, uint64_t oid, uint8_t copies,
unsigned int datalen, uint64_t offset,
uint32_t cache_flags)
{
false, cache_flags);
}
-static int write_object(int fd, char *buf, uint64_t oid, int copies,
+static int write_object(int fd, char *buf, uint64_t oid, uint8_t copies,
unsigned int datalen, uint64_t offset, bool create,
uint32_t cache_flags)
{
/* update inode with the latest state */
static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
{
+ Error *local_err = NULL;
SheepdogInode *inode;
int ret = 0, fd;
uint32_t vid = 0;
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
return -EIO;
}
inode = g_malloc(sizeof(s->inode));
- ret = find_vdi_name(s, s->name, snapid, tag, &vid, false);
+ ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err);
if (ret) {
+ qerror_report_err(local_err);
+ error_free(local_err);
goto out;
}
return ret;
}
-static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
+/* Return true if the specified request is linked to the pending list. */
+static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req)
{
- SheepdogAIOCB *acb = aio_req->aiocb;
- bool create = false;
- int ret;
-
- ret = reload_inode(s, 0, "");
- if (ret < 0) {
- return ret;
+ AIOReq *areq;
+ QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
+ if (areq != aio_req && areq->oid == aio_req->oid) {
+ /*
+ * Sheepdog cannot handle simultaneous create requests to the same
+ * object, so we cannot send the request until the previous request
+ * finishes.
+ */
+ DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid);
+ aio_req->flags = 0;
+ aio_req->base_oid = 0;
+ QLIST_REMOVE(aio_req, aio_siblings);
+ QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
+ return true;
+ }
}
- aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
- data_oid_to_idx(aio_req->oid));
+ return false;
+}
+
+static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
+{
+ SheepdogAIOCB *acb = aio_req->aiocb;
+ bool create = false;
/* check whether this request becomes a CoW one */
- if (acb->aiocb_type == AIOCB_WRITE_UDATA) {
+ if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) {
int idx = data_oid_to_idx(aio_req->oid);
- AIOReq *areq;
- if (s->inode.data_vdi_id[idx] == 0) {
- create = true;
- goto out;
- }
if (is_data_obj_writable(&s->inode, idx)) {
goto out;
}
- /* link to the pending list if there is another CoW request to
- * the same object */
- QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
- if (areq != aio_req && areq->oid == aio_req->oid) {
- DPRINTF("simultaneous CoW to %" PRIx64 "\n", aio_req->oid);
- QLIST_REMOVE(aio_req, aio_siblings);
- QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings);
- return SD_RES_SUCCESS;
- }
+ if (check_simultaneous_create(s, aio_req)) {
+ return;
}
- aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
- aio_req->flags |= SD_FLAG_CMD_COW;
+ if (s->inode.data_vdi_id[idx]) {
+ aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
+ aio_req->flags |= SD_FLAG_CMD_COW;
+ }
create = true;
}
out:
- return add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
- create, acb->aiocb_type);
+ if (is_data_obj(aio_req->oid)) {
+ add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create,
+ acb->aiocb_type);
+ } else {
+ struct iovec iov;
+ iov.iov_base = &s->inode;
+ iov.iov_len = sizeof(s->inode);
+ add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA);
+ }
}
/* TODO Convert to fine grained options */
},
};
-static int sd_open(BlockDriverState *bs, QDict *options, int flags)
+static int sd_open(BlockDriverState *bs, QDict *options, int flags,
+ Error **errp)
{
int ret, fd;
uint32_t vid = 0;
Error *local_err = NULL;
const char *filename;
- opts = qemu_opts_create_nofail(&runtime_opts);
+ s->bs = bs;
+
+ opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &local_err);
- if (error_is_set(&local_err)) {
- qerror_report_err(local_err);
- error_free(local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
ret = -EINVAL;
goto out;
}
QLIST_INIT(&s->inflight_aio_head);
QLIST_INIT(&s->pending_aio_head);
+ QLIST_INIT(&s->failed_aio_head);
s->fd = -1;
memset(vdi, 0, sizeof(vdi));
ret = parse_vdiname(s, filename, vdi, &snapid, tag);
}
if (ret < 0) {
+ error_setg(errp, "Can't parse filename");
goto out;
}
- s->fd = get_sheep_fd(s);
+ s->fd = get_sheep_fd(s, errp);
if (s->fd < 0) {
ret = s->fd;
goto out;
}
- ret = find_vdi_name(s, vdi, snapid, tag, &vid, true);
+ ret = find_vdi_name(s, vdi, snapid, tag, &vid, true, errp);
if (ret) {
goto out;
}
s->is_snapshot = true;
}
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, errp);
if (fd < 0) {
ret = fd;
goto out;
closesocket(fd);
if (ret) {
+ error_setg(errp, "Can't read snapshot inode");
goto out;
}
return ret;
}
-static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size,
- uint32_t base_vid, uint32_t *vdi_id, int snapshot)
+static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
+ Error **errp)
{
SheepdogVdiReq hdr;
SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
unsigned int wlen, rlen = 0;
char buf[SD_MAX_VDI_LEN];
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, errp);
if (fd < 0) {
return fd;
}
* does not fit in buf? For now, just truncate and avoid buffer overrun.
*/
memset(buf, 0, sizeof(buf));
- pstrcpy(buf, sizeof(buf), filename);
+ pstrcpy(buf, sizeof(buf), s->name);
memset(&hdr, 0, sizeof(hdr));
hdr.opcode = SD_OP_NEW_VDI;
- hdr.vdi_id = base_vid;
+ hdr.base_vdi_id = s->inode.vdi_id;
wlen = SD_MAX_VDI_LEN;
hdr.snapid = snapshot;
hdr.data_length = wlen;
- hdr.vdi_size = vdi_size;
+ hdr.vdi_size = s->inode.vdi_size;
+ hdr.copy_policy = s->inode.copy_policy;
+ hdr.copies = s->inode.nr_copies;
ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
closesocket(fd);
if (ret) {
+ error_setg_errno(errp, -ret, "create failed");
return ret;
}
if (rsp->result != SD_RES_SUCCESS) {
- error_report("%s, %s", sd_strerror(rsp->result), filename);
+ error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name);
return -EIO;
}
return 0;
}
-static int sd_prealloc(const char *filename)
+static int sd_prealloc(const char *filename, Error **errp)
{
BlockDriverState *bs = NULL;
uint32_t idx, max_idx;
void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
int ret;
- ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR);
+ ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
+ NULL, errp);
if (ret < 0) {
- goto out;
+ goto out_with_err_set;
}
vdi_size = bdrv_getlength(bs);
goto out;
}
}
+
out:
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Can't pre-allocate");
+ }
+out_with_err_set:
if (bs) {
bdrv_unref(bs);
}
return ret;
}
-static int sd_create(const char *filename, QEMUOptionParameter *options)
+/*
+ * Sheepdog support two kinds of redundancy, full replication and erasure
+ * coding.
+ *
+ * # create a fully replicated vdi with x copies
+ * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
+ *
+ * # create a erasure coded vdi with x data strips and y parity strips
+ * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
+ */
+static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
+{
+ struct SheepdogInode *inode = &s->inode;
+ const char *n1, *n2;
+ long copy, parity;
+ char p[10];
+
+ pstrcpy(p, sizeof(p), opt);
+ n1 = strtok(p, ":");
+ n2 = strtok(NULL, ":");
+
+ if (!n1) {
+ return -EINVAL;
+ }
+
+ copy = strtol(n1, NULL, 10);
+ if (copy > SD_MAX_COPIES || copy < 1) {
+ return -EINVAL;
+ }
+ if (!n2) {
+ inode->copy_policy = 0;
+ inode->nr_copies = copy;
+ return 0;
+ }
+
+ if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
+ return -EINVAL;
+ }
+
+ parity = strtol(n2, NULL, 10);
+ if (parity >= SD_EC_MAX_STRIP || parity < 1) {
+ return -EINVAL;
+ }
+
+ /*
+ * 4 bits for parity and 4 bits for data.
+ * We have to compress upper data bits because it can't represent 16
+ */
+ inode->copy_policy = ((copy / 2) << 4) + parity;
+ inode->nr_copies = copy + parity;
+
+ return 0;
+}
+
+static int sd_create(const char *filename, QEMUOptionParameter *options,
+ Error **errp)
{
int ret = 0;
- uint32_t vid = 0, base_vid = 0;
- int64_t vdi_size = 0;
+ uint32_t vid = 0;
char *backing_file = NULL;
BDRVSheepdogState *s;
- char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
+ char tag[SD_MAX_VDI_TAG_LEN];
uint32_t snapid;
bool prealloc = false;
s = g_malloc0(sizeof(BDRVSheepdogState));
- memset(vdi, 0, sizeof(vdi));
memset(tag, 0, sizeof(tag));
if (strstr(filename, "://")) {
- ret = sd_parse_uri(s, filename, vdi, &snapid, tag);
+ ret = sd_parse_uri(s, filename, s->name, &snapid, tag);
} else {
- ret = parse_vdiname(s, filename, vdi, &snapid, tag);
+ ret = parse_vdiname(s, filename, s->name, &snapid, tag);
}
if (ret < 0) {
+ error_setg(errp, "Can't parse filename");
goto out;
}
while (options && options->name) {
if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
- vdi_size = options->value.n;
+ s->inode.vdi_size = options->value.n;
} else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
backing_file = options->value.s;
} else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
} else if (!strcmp(options->value.s, "full")) {
prealloc = true;
} else {
- error_report("Invalid preallocation mode: '%s'",
- options->value.s);
+ error_setg(errp, "Invalid preallocation mode: '%s'",
+ options->value.s);
ret = -EINVAL;
goto out;
}
+ } else if (!strcmp(options->name, BLOCK_OPT_REDUNDANCY)) {
+ if (options->value.s) {
+ ret = parse_redundancy(s, options->value.s);
+ if (ret < 0) {
+ error_setg(errp, "Invalid redundancy mode: '%s'",
+ options->value.s);
+ goto out;
+ }
+ }
}
options++;
}
- if (vdi_size > SD_MAX_VDI_SIZE) {
- error_report("too big image size");
+ if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
+ error_setg(errp, "too big image size");
ret = -EINVAL;
goto out;
}
if (backing_file) {
BlockDriverState *bs;
- BDRVSheepdogState *s;
+ BDRVSheepdogState *base;
BlockDriver *drv;
/* Currently, only Sheepdog backing image is supported. */
drv = bdrv_find_protocol(backing_file, true);
if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
- error_report("backing_file must be a sheepdog image");
+ error_setg(errp, "backing_file must be a sheepdog image");
ret = -EINVAL;
goto out;
}
- ret = bdrv_file_open(&bs, backing_file, NULL, 0);
+ bs = NULL;
+ ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, NULL,
+ errp);
if (ret < 0) {
goto out;
}
- s = bs->opaque;
+ base = bs->opaque;
- if (!is_snapshot(&s->inode)) {
- error_report("cannot clone from a non snapshot vdi");
+ if (!is_snapshot(&base->inode)) {
+ error_setg(errp, "cannot clone from a non snapshot vdi");
bdrv_unref(bs);
ret = -EINVAL;
goto out;
}
-
- base_vid = s->inode.vdi_id;
+ s->inode.vdi_id = base->inode.vdi_id;
bdrv_unref(bs);
}
- ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0);
- if (!prealloc || ret) {
+ ret = do_sd_create(s, &vid, 0, errp);
+ if (ret) {
goto out;
}
- ret = sd_prealloc(filename);
+ if (prealloc) {
+ ret = sd_prealloc(filename, errp);
+ }
out:
g_free(s);
return ret;
static void sd_close(BlockDriverState *bs)
{
+ Error *local_err = NULL;
BDRVSheepdogState *s = bs->opaque;
SheepdogVdiReq hdr;
SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
DPRINTF("%s\n", s->name);
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
return;
}
memset(&hdr, 0, sizeof(hdr));
hdr.opcode = SD_OP_RELEASE_VDI;
- hdr.vdi_id = s->inode.vdi_id;
+ hdr.base_vdi_id = s->inode.vdi_id;
wlen = strlen(s->name) + 1;
hdr.data_length = wlen;
hdr.flags = SD_FLAG_CMD_WRITE;
static int sd_truncate(BlockDriverState *bs, int64_t offset)
{
+ Error *local_err = NULL;
BDRVSheepdogState *s = bs->opaque;
int ret, fd;
unsigned int datalen;
return -EINVAL;
}
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
return fd;
}
*/
static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
{
- int ret;
BDRVSheepdogState *s = acb->common.bs->opaque;
struct iovec iov;
AIOReq *aio_req;
aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
data_len, offset, 0, 0, offset);
QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA);
- if (ret) {
- free_aio_req(s, aio_req);
- acb->ret = -EIO;
- goto out;
- }
+ add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA);
acb->aio_done_func = sd_finish_aiocb;
acb->aiocb_type = AIOCB_WRITE_UDATA;
return;
}
-out:
+
sd_finish_aiocb(acb);
}
/* Delete current working VDI on the snapshot chain */
static bool sd_delete(BDRVSheepdogState *s)
{
+ Error *local_err = NULL;
unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
SheepdogVdiReq hdr = {
.opcode = SD_OP_DEL_VDI,
- .vdi_id = s->inode.vdi_id,
+ .base_vdi_id = s->inode.vdi_id,
.data_length = wlen,
.flags = SD_FLAG_CMD_WRITE,
};
SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
int fd, ret;
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
return false;
}
*/
static int sd_create_branch(BDRVSheepdogState *s)
{
+ Error *local_err = NULL;
int ret, fd;
uint32_t vid;
char *buf;
/*
* Even If deletion fails, we will just create extra snapshot based on
- * the workding VDI which was supposed to be deleted. So no need to
+ * the working VDI which was supposed to be deleted. So no need to
* false bail out.
*/
deleted = sd_delete(s);
- ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid,
- !deleted);
+ ret = do_sd_create(s, &vid, !deleted, &local_err);
if (ret) {
+ qerror_report_err(local_err);
+ error_free(local_err);
goto out;
}
DPRINTF("%" PRIx32 " is created.\n", vid);
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
ret = fd;
goto out;
}
}
aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
if (create) {
- AIOReq *areq;
- QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) {
- if (areq->oid == oid) {
- /*
- * Sheepdog cannot handle simultaneous create
- * requests to the same object. So we cannot send
- * the request until the previous request
- * finishes.
- */
- aio_req->flags = 0;
- aio_req->base_oid = 0;
- QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req,
- aio_siblings);
- goto done;
- }
+ if (check_simultaneous_create(s, aio_req)) {
+ goto done;
}
}
- QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
- create, acb->aiocb_type);
- if (ret < 0) {
- error_report("add_aio_request is failed");
- free_aio_req(s, aio_req);
- acb->ret = -EIO;
- goto out;
- }
+ add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create,
+ acb->aiocb_type);
done:
offset = 0;
idx++;
{
SheepdogAIOCB *acb;
int ret;
+ int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
+ BDRVSheepdogState *s = bs->opaque;
- if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
- ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE);
+ if (bs->growable && offset > s->inode.vdi_size) {
+ ret = sd_truncate(bs, offset);
if (ret < 0) {
return ret;
}
- bs->total_sectors = sector_num + nb_sectors;
}
acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
BDRVSheepdogState *s = bs->opaque;
SheepdogAIOCB *acb;
AIOReq *aio_req;
- int ret;
if (s->cache_flags != SD_FLAG_CMD_CACHE) {
return 0;
aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
0, 0, 0, 0, 0);
QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type);
- if (ret < 0) {
- error_report("add_aio_request is failed");
- free_aio_req(s, aio_req);
- qemu_aio_release(acb);
- return ret;
- }
+ add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type);
qemu_coroutine_yield();
return acb->ret;
static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
{
+ Error *local_err = NULL;
BDRVSheepdogState *s = bs->opaque;
int ret, fd;
uint32_t new_vid;
datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
/* refresh inode. */
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
ret = fd;
goto cleanup;
}
goto cleanup;
}
- ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid,
- 1);
+ ret = do_sd_create(s, &new_vid, 1, &local_err);
if (ret < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
error_report("failed to create inode for snapshot. %s",
strerror(errno));
goto cleanup;
* We implement rollback(loadvm) operation to the specified snapshot by
* 1) switch to the snapshot
* 2) rely on sd_create_branch to delete working VDI and
- * 3) create a new working VDI based on the speicified snapshot
+ * 3) create a new working VDI based on the specified snapshot
*/
static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
{
return ret;
}
-static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+static int sd_snapshot_delete(BlockDriverState *bs,
+ const char *snapshot_id,
+ const char *name,
+ Error **errp)
{
/* FIXME: Delete specified snapshot id. */
return 0;
static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
{
+ Error *local_err = NULL;
BDRVSheepdogState *s = bs->opaque;
SheepdogReq req;
int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
vdi_inuse = g_malloc(max);
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
ret = fd;
goto out;
}
hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
start_nr = hval & (SD_NR_VDIS - 1);
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
ret = fd;
goto out;
}
sn_tab[found].vm_state_size = inode.vm_state_size;
sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
- snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
- inode.snap_id);
+ snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
+ "%" PRIu32, inode.snap_id);
pstrcpy(sn_tab[found].name,
MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
inode.tag);
static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
int64_t pos, int size, int load)
{
+ Error *local_err = NULL;
bool create;
int fd, ret = 0, remaining = size;
unsigned int data_len;
uint32_t vdi_index;
uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
- fd = connect_to_sdog(s);
+ fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
+ qerror_report_err(local_err);
+ error_free(local_err);
return fd;
}
{
BDRVSheepdogState *s = bs->opaque;
SheepdogInode *inode = &s->inode;
- unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE,
+ uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
+ unsigned long start = offset / SD_DATA_OBJ_SIZE,
end = DIV_ROUND_UP((sector_num + nb_sectors) *
BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
unsigned long idx;
- int64_t ret = BDRV_BLOCK_DATA;
+ int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
for (idx = start; idx < end; idx++) {
if (inode->data_vdi_id[idx] == 0) {
return ret;
}
+static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
+{
+ BDRVSheepdogState *s = bs->opaque;
+ SheepdogInode *inode = &s->inode;
+ unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
+ uint64_t size = 0;
+
+ for (i = 0; i < last; i++) {
+ if (inode->data_vdi_id[i] == 0) {
+ continue;
+ }
+ size += SD_DATA_OBJ_SIZE;
+ }
+ return size;
+}
+
static QEMUOptionParameter sd_create_options[] = {
{
.name = BLOCK_OPT_SIZE,
.type = OPT_STRING,
.help = "Preallocation mode (allowed values: off, full)"
},
+ {
+ .name = BLOCK_OPT_REDUNDANCY,
+ .type = OPT_STRING,
+ .help = "Redundancy of the image"
+ },
{ NULL }
};
.format_name = "sheepdog",
.protocol_name = "sheepdog",
.instance_size = sizeof(BDRVSheepdogState),
+ .bdrv_needs_filename = true,
.bdrv_file_open = sd_open,
.bdrv_close = sd_close,
.bdrv_create = sd_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_getlength = sd_getlength,
+ .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
.bdrv_truncate = sd_truncate,
.bdrv_co_readv = sd_co_readv,
.format_name = "sheepdog",
.protocol_name = "sheepdog+tcp",
.instance_size = sizeof(BDRVSheepdogState),
+ .bdrv_needs_filename = true,
.bdrv_file_open = sd_open,
.bdrv_close = sd_close,
.bdrv_create = sd_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_getlength = sd_getlength,
+ .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
.bdrv_truncate = sd_truncate,
.bdrv_co_readv = sd_co_readv,
.format_name = "sheepdog",
.protocol_name = "sheepdog+unix",
.instance_size = sizeof(BDRVSheepdogState),
+ .bdrv_needs_filename = true,
.bdrv_file_open = sd_open,
.bdrv_close = sd_close,
.bdrv_create = sd_create,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_getlength = sd_getlength,
+ .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
.bdrv_truncate = sd_truncate,
.bdrv_co_readv = sd_co_readv,