* See the COPYING file in the top-level directory.
*/
-#include <gnutls/gnutls.h>
-#include <gnutls/crypto.h>
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
#include "block/block_int.h"
#include "qapi/qmp/qbool.h"
#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qint.h"
#include "qapi/qmp/qjson.h"
#include "qapi/qmp/qlist.h"
#include "qapi/qmp/qstring.h"
#include "qapi-event.h"
+#include "crypto/hash.h"
#define HASH_LENGTH 32
/* This union holds a vote hash value */
typedef union QuorumVoteValue {
- char h[HASH_LENGTH]; /* SHA-256 hash */
+ uint8_t h[HASH_LENGTH]; /* SHA-256 hash */
int64_t l; /* simpler 64 bits hash */
} QuorumVoteValue;
/* the following structure holds the state of one quorum instance */
typedef struct BDRVQuorumState {
- BlockDriverState **bs; /* children BlockDriverStates */
+ BdrvChild **children; /* children BlockDriverStates */
int num_children; /* children count */
+ unsigned next_child_index; /* the index of the next child that should
+ * be added
+ */
int threshold; /* if less than threshold children reads gave the
* same result a quorum error occurs.
*/
bool is_read;
int vote_ret;
- int child_iter; /* which child to read in fifo pattern */
+ int children_read; /* how many children have been read from */
};
static bool quorum_vote(QuorumAIOCB *acb);
static void quorum_aio_finalize(QuorumAIOCB *acb)
{
- int i, ret = 0;
-
- if (acb->vote_ret) {
- ret = acb->vote_ret;
- }
-
- acb->common.cb(acb->common.opaque, ret);
-
- if (acb->is_read) {
- /* on the quorum case acb->child_iter == s->num_children - 1 */
- for (i = 0; i <= acb->child_iter; i++) {
- qemu_vfree(acb->qcrs[i].buf);
- qemu_iovec_destroy(&acb->qcrs[i].qiov);
- }
- }
-
+ acb->common.cb(acb->common.opaque, acb->vote_ret);
g_free(acb->qcrs);
qemu_aio_unref(acb);
}
return acb;
}
-static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret)
+static void quorum_report_bad(QuorumOpType type, uint64_t sector_num,
+ int nb_sectors, char *node_name, int ret)
{
const char *msg = NULL;
if (ret < 0) {
msg = strerror(-ret);
}
- qapi_event_send_quorum_report_bad(!!msg, msg, node_name,
- acb->sector_num, acb->nb_sectors, &error_abort);
+
+ qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name,
+ sector_num, nb_sectors, &error_abort);
}
static void quorum_report_failure(QuorumAIOCB *acb)
}
}
-static void quorum_aio_cb(void *opaque, int ret)
+static void quorum_report_bad_acb(QuorumChildRequest *sacb, int ret)
+{
+ QuorumAIOCB *acb = sacb->parent;
+ QuorumOpType type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
+ quorum_report_bad(type, acb->sector_num, acb->nb_sectors,
+ sacb->aiocb->bs->node_name, ret);
+}
+
+static void quorum_fifo_aio_cb(void *opaque, int ret)
{
QuorumChildRequest *sacb = opaque;
QuorumAIOCB *acb = sacb->parent;
BDRVQuorumState *s = acb->common.bs->opaque;
- bool rewrite = false;
- if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
+ assert(acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO);
+
+ if (ret < 0) {
+ quorum_report_bad_acb(sacb, ret);
+
/* We try to read next child in FIFO order if we fail to read */
- if (ret < 0 && ++acb->child_iter < s->num_children) {
+ if (acb->children_read < s->num_children) {
read_fifo_child(acb);
return;
}
-
- if (ret == 0) {
- quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov);
- }
- acb->vote_ret = ret;
- quorum_aio_finalize(acb);
- return;
}
+ acb->vote_ret = ret;
+
+ /* FIXME: rewrite failed children if acb->children_read > 1? */
+ quorum_aio_finalize(acb);
+}
+
+static void quorum_aio_cb(void *opaque, int ret)
+{
+ QuorumChildRequest *sacb = opaque;
+ QuorumAIOCB *acb = sacb->parent;
+ BDRVQuorumState *s = acb->common.bs->opaque;
+ bool rewrite = false;
+ int i;
+
sacb->ret = ret;
- acb->count++;
if (ret == 0) {
acb->success_count++;
} else {
- quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret);
+ quorum_report_bad_acb(sacb, ret);
}
+ acb->count++;
assert(acb->count <= s->num_children);
assert(acb->success_count <= s->num_children);
if (acb->count < s->num_children) {
/* Do the vote on read */
if (acb->is_read) {
rewrite = quorum_vote(acb);
+ for (i = 0; i < s->num_children; i++) {
+ qemu_vfree(acb->qcrs[i].buf);
+ qemu_iovec_destroy(&acb->qcrs[i].qiov);
+ }
} else {
quorum_has_too_much_io_failed(acb);
}
continue;
}
QLIST_FOREACH(item, &version->items, next) {
- quorum_report_bad(acb, s->bs[item->index]->node_name, 0);
+ quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num,
+ acb->nb_sectors,
+ s->children[item->index]->bs->node_name, 0);
}
}
}
continue;
}
QLIST_FOREACH(item, &version->items, next) {
- bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov,
- acb->nb_sectors, quorum_rewrite_aio_cb, acb);
+ bdrv_aio_writev(s->children[item->index], acb->sector_num,
+ acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb,
+ acb);
}
}
static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash)
{
- int j, ret;
- gnutls_hash_hd_t dig;
QEMUIOVector *qiov = &acb->qcrs[i].qiov;
-
- ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
-
- if (ret < 0) {
- return ret;
- }
-
- for (j = 0; j < qiov->niov; j++) {
- ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len);
- if (ret < 0) {
- break;
- }
+ size_t len = sizeof(hash->h);
+ uint8_t *data = hash->h;
+
+ /* XXX - would be nice if we could pass in the Error **
+ * and propagate that back, but this quorum code is
+ * restricted to just errno values currently */
+ if (qcrypto_hash_bytesv(QCRYPTO_HASH_ALG_SHA256,
+ qiov->iov, qiov->niov,
+ &data, &len,
+ NULL) < 0) {
+ return -EINVAL;
}
- gnutls_hash_deinit(dig, (void *) hash);
- return ret;
+ return 0;
}
static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
BDRVQuorumState *s = acb->common.bs->opaque;
int i;
+ acb->children_read = s->num_children;
for (i = 0; i < s->num_children; i++) {
- acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
+ acb->qcrs[i].buf = qemu_blockalign(s->children[i]->bs, acb->qiov->size);
qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
}
for (i = 0; i < s->num_children; i++) {
- bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov,
- acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]);
+ acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i], acb->sector_num,
+ &acb->qcrs[i].qiov, acb->nb_sectors,
+ quorum_aio_cb, &acb->qcrs[i]);
}
return &acb->common;
static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
{
BDRVQuorumState *s = acb->common.bs->opaque;
+ int n = acb->children_read++;
- acb->qcrs[acb->child_iter].buf = qemu_blockalign(s->bs[acb->child_iter],
- acb->qiov->size);
- qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
- qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
- acb->qcrs[acb->child_iter].buf);
- bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num,
- &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
- quorum_aio_cb, &acb->qcrs[acb->child_iter]);
+ acb->qcrs[n].aiocb = bdrv_aio_readv(s->children[n], acb->sector_num,
+ acb->qiov, acb->nb_sectors,
+ quorum_fifo_aio_cb, &acb->qcrs[n]);
return &acb->common;
}
QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
nb_sectors, cb, opaque);
acb->is_read = true;
+ acb->children_read = 0;
if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
- acb->child_iter = s->num_children - 1;
return read_quorum_children(acb);
}
- acb->child_iter = 0;
return read_fifo_child(acb);
}
int i;
for (i = 0; i < s->num_children; i++) {
- acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov,
- nb_sectors, &quorum_aio_cb,
+ acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i], sector_num,
+ qiov, nb_sectors, &quorum_aio_cb,
&acb->qcrs[i]);
}
int i;
/* check that all file have the same length */
- result = bdrv_getlength(s->bs[0]);
+ result = bdrv_getlength(s->children[0]->bs);
if (result < 0) {
return result;
}
for (i = 1; i < s->num_children; i++) {
- int64_t value = bdrv_getlength(s->bs[i]);
+ int64_t value = bdrv_getlength(s->children[i]->bs);
if (value < 0) {
return value;
}
return result;
}
-static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp)
-{
- BDRVQuorumState *s = bs->opaque;
- Error *local_err = NULL;
- int i;
-
- for (i = 0; i < s->num_children; i++) {
- bdrv_invalidate_cache(s->bs[i], &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- return;
- }
- }
-}
-
static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
{
BDRVQuorumState *s = bs->opaque;
QuorumVoteValue result_value;
int i;
int result = 0;
+ int success_count = 0;
QLIST_INIT(&error_votes.vote_list);
error_votes.compare = quorum_64bits_compare;
for (i = 0; i < s->num_children; i++) {
- result = bdrv_co_flush(s->bs[i]);
- result_value.l = result;
- quorum_count_vote(&error_votes, &result_value, i);
+ result = bdrv_co_flush(s->children[i]->bs);
+ if (result) {
+ quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0,
+ bdrv_nb_sectors(s->children[i]->bs),
+ s->children[i]->bs->node_name, result);
+ result_value.l = result;
+ quorum_count_vote(&error_votes, &result_value, i);
+ } else {
+ success_count++;
+ }
}
- winner = quorum_get_vote_winner(&error_votes);
- result = winner->value.l;
-
+ if (success_count >= s->threshold) {
+ result = 0;
+ } else {
+ winner = quorum_get_vote_winner(&error_votes);
+ result = winner->value.l;
+ }
quorum_free_vote_list(&error_votes);
return result;
int i;
for (i = 0; i < s->num_children; i++) {
- bool perm = bdrv_recurse_is_first_non_filter(s->bs[i],
+ bool perm = bdrv_recurse_is_first_non_filter(s->children[i]->bs,
candidate);
if (perm) {
return true;
{
if (threshold < 1) {
- error_set(errp, QERR_INVALID_PARAMETER_VALUE,
- "vote-threshold", "value >= 1");
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+ "vote-threshold", "value >= 1");
return -ERANGE;
}
return QUORUM_READ_PATTERN_QUORUM;
}
- for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) {
+ for (i = 0; i < QUORUM_READ_PATTERN__MAX; i++) {
if (!strcmp(opt, QuorumReadPattern_lookup[i])) {
return i;
}
Error *local_err = NULL;
QemuOpts *opts = NULL;
bool *opened;
- QDict *sub = NULL;
- QList *list = NULL;
- const QListEntry *lentry;
int i;
int ret = 0;
qdict_flatten(options);
- qdict_extract_subqdict(options, &sub, "children.");
- qdict_array_split(sub, &list);
- if (qdict_size(sub)) {
- error_setg(&local_err, "Invalid option children.%s",
- qdict_first(sub)->key);
+ /* count how many different children are present */
+ s->num_children = qdict_array_entries(options, "children.");
+ if (s->num_children < 0) {
+ error_setg(&local_err, "Option children is not a valid array");
ret = -EINVAL;
goto exit;
}
-
- /* count how many different children are present */
- s->num_children = qlist_size(list);
- if (s->num_children < 2) {
+ if (s->num_children < 1) {
error_setg(&local_err,
- "Number of provided children must be greater than 1");
+ "Number of provided children must be 1 or more");
ret = -EINVAL;
goto exit;
}
}
s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
+ /* and validate it against s->num_children */
+ ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+ if (ret < 0) {
+ goto exit;
+ }
+
ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
if (ret < 0) {
error_setg(&local_err, "Please set read-pattern as fifo or quorum");
s->read_pattern = ret;
if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
- /* and validate it against s->num_children */
- ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
- if (ret < 0) {
- goto exit;
- }
-
/* is the driver in blkverify mode */
if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
s->num_children == 2 && s->threshold == 2) {
}
}
- /* allocate the children BlockDriverState array */
- s->bs = g_new0(BlockDriverState *, s->num_children);
+ /* allocate the children array */
+ s->children = g_new0(BdrvChild *, s->num_children);
opened = g_new0(bool, s->num_children);
- for (i = 0, lentry = qlist_first(list); lentry;
- lentry = qlist_next(lentry), i++) {
- QDict *d;
- QString *string;
-
- switch (qobject_type(lentry->value))
- {
- /* List of options */
- case QTYPE_QDICT:
- d = qobject_to_qdict(lentry->value);
- QINCREF(d);
- ret = bdrv_open(&s->bs[i], NULL, NULL, d, flags, NULL,
- &local_err);
- break;
-
- /* QMP reference */
- case QTYPE_QSTRING:
- string = qobject_to_qstring(lentry->value);
- ret = bdrv_open(&s->bs[i], NULL, qstring_get_str(string), NULL,
- flags, NULL, &local_err);
- break;
-
- default:
- error_setg(&local_err, "Specification of child block device %i "
- "is invalid", i);
- ret = -EINVAL;
- }
+ for (i = 0; i < s->num_children; i++) {
+ char indexstr[32];
+ ret = snprintf(indexstr, 32, "children.%d", i);
+ assert(ret < 32);
- if (ret < 0) {
+ s->children[i] = bdrv_open_child(NULL, options, indexstr, bs,
+ &child_format, false, &local_err);
+ if (local_err) {
+ ret = -EINVAL;
goto close_exit;
}
+
opened[i] = true;
}
+ s->next_child_index = s->num_children;
g_free(opened);
goto exit;
if (!opened[i]) {
continue;
}
- bdrv_unref(s->bs[i]);
+ bdrv_unref_child(bs, s->children[i]);
}
- g_free(s->bs);
+ g_free(s->children);
g_free(opened);
exit:
qemu_opts_del(opts);
/* propagate error */
- if (local_err) {
- error_propagate(errp, local_err);
- }
- QDECREF(list);
- QDECREF(sub);
+ error_propagate(errp, local_err);
return ret;
}
int i;
for (i = 0; i < s->num_children; i++) {
- bdrv_unref(s->bs[i]);
+ bdrv_unref_child(bs, s->children[i]);
}
- g_free(s->bs);
+ g_free(s->children);
}
-static void quorum_detach_aio_context(BlockDriverState *bs)
+static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
+ Error **errp)
{
BDRVQuorumState *s = bs->opaque;
- int i;
+ BdrvChild *child;
+ char indexstr[32];
+ int ret;
- for (i = 0; i < s->num_children; i++) {
- bdrv_detach_aio_context(s->bs[i]);
+ assert(s->num_children <= INT_MAX / sizeof(BdrvChild *));
+ if (s->num_children == INT_MAX / sizeof(BdrvChild *) ||
+ s->next_child_index == UINT_MAX) {
+ error_setg(errp, "Too many children");
+ return;
}
+
+ ret = snprintf(indexstr, 32, "children.%u", s->next_child_index);
+ if (ret < 0 || ret >= 32) {
+ error_setg(errp, "cannot generate child name");
+ return;
+ }
+ s->next_child_index++;
+
+ bdrv_drained_begin(bs);
+
+ /* We can safely add the child now */
+ bdrv_ref(child_bs);
+ child = bdrv_attach_child(bs, child_bs, indexstr, &child_format);
+ s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
+ s->children[s->num_children++] = child;
+
+ bdrv_drained_end(bs);
}
-static void quorum_attach_aio_context(BlockDriverState *bs,
- AioContext *new_context)
+static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
+ Error **errp)
{
BDRVQuorumState *s = bs->opaque;
int i;
for (i = 0; i < s->num_children; i++) {
- bdrv_attach_aio_context(s->bs[i], new_context);
+ if (s->children[i] == child) {
+ break;
+ }
+ }
+
+ /* we have checked it in bdrv_del_child() */
+ assert(i < s->num_children);
+
+ if (s->num_children <= s->threshold) {
+ error_setg(errp,
+ "The number of children cannot be lower than the vote threshold %d",
+ s->threshold);
+ return;
}
+
+ bdrv_drained_begin(bs);
+
+ /* We can safely remove this child now */
+ memmove(&s->children[i], &s->children[i + 1],
+ (s->num_children - i - 1) * sizeof(BdrvChild *));
+ s->children = g_renew(BdrvChild *, s->children, --s->num_children);
+ bdrv_unref_child(bs, child);
+
+ bdrv_drained_end(bs);
}
-static void quorum_refresh_filename(BlockDriverState *bs)
+static void quorum_refresh_filename(BlockDriverState *bs, QDict *options)
{
BDRVQuorumState *s = bs->opaque;
QDict *opts;
int i;
for (i = 0; i < s->num_children; i++) {
- bdrv_refresh_filename(s->bs[i]);
- if (!s->bs[i]->full_open_options) {
+ bdrv_refresh_filename(s->children[i]->bs);
+ if (!s->children[i]->bs->full_open_options) {
return;
}
}
children = qlist_new();
for (i = 0; i < s->num_children; i++) {
- QINCREF(s->bs[i]->full_open_options);
- qlist_append_obj(children, QOBJECT(s->bs[i]->full_open_options));
+ QINCREF(s->children[i]->bs->full_open_options);
+ qlist_append_obj(children,
+ QOBJECT(s->children[i]->bs->full_open_options));
}
opts = qdict_new();
qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD,
QOBJECT(qint_from_int(s->threshold)));
qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY,
- QOBJECT(qbool_from_int(s->is_blkverify)));
+ QOBJECT(qbool_from_bool(s->is_blkverify)));
qdict_put_obj(opts, QUORUM_OPT_REWRITE,
- QOBJECT(qbool_from_int(s->rewrite_corrupted)));
+ QOBJECT(qbool_from_bool(s->rewrite_corrupted)));
qdict_put_obj(opts, "children", QOBJECT(children));
bs->full_open_options = opts;
.bdrv_aio_readv = quorum_aio_readv,
.bdrv_aio_writev = quorum_aio_writev,
- .bdrv_invalidate_cache = quorum_invalidate_cache,
- .bdrv_detach_aio_context = quorum_detach_aio_context,
- .bdrv_attach_aio_context = quorum_attach_aio_context,
+ .bdrv_add_child = quorum_add_child,
+ .bdrv_del_child = quorum_del_child,
.is_filter = true,
.bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter,
static void bdrv_quorum_init(void)
{
+ if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA256)) {
+ /* SHA256 hash support is required for quorum device */
+ return;
+ }
bdrv_register(&bdrv_quorum);
}