2 * GlusterFS backend for QEMU
6 * This work is licensed under the terms of the GNU GPL, version 2 or later.
7 * See the COPYING file in the top-level directory.
10 #include <glusterfs/api/glfs.h>
11 #include "block/block_int.h"
14 typedef struct GlusterAIOCB {
21 typedef struct BDRVGlusterState {
26 typedef struct GlusterConf {
34 static void qemu_gluster_gconf_free(GlusterConf *gconf)
37 g_free(gconf->server);
38 g_free(gconf->volname);
40 g_free(gconf->transport);
45 static int parse_volume_options(GlusterConf *gconf, char *path)
54 p = q = path + strspn(path, "/");
59 gconf->volname = g_strndup(q, p - q);
66 gconf->image = g_strdup(p);
71 * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
73 * 'gluster' is the protocol.
75 * 'transport' specifies the transport type used to connect to gluster
76 * management daemon (glusterd). Valid transport types are
77 * tcp, unix and rdma. If a transport type isn't specified, then tcp
80 * 'server' specifies the server where the volume file specification for
81 * the given volume resides. This can be either a hostname, an ipv4 address
82 * or an ipv6 address. An ipv6 address needs to be within square brackets [ ].
83 * If the transport type is 'unix', then the 'server' field should not be specified.
84 * The 'socket' field needs to be populated with the path to unix domain
87 * 'port' is the port number on which glusterd is listening. This is optional
88 * and if not specified, QEMU will send 0, which will make gluster use the
89 * default port. If the transport type is unix, then 'port' should not be
92 * 'volname' is the name of the gluster volume which contains the VM image.
94 * 'image' is the path to the actual VM image that resides on gluster volume.
98 * file=gluster://1.2.3.4/testvol/a.img
99 * file=gluster+tcp://1.2.3.4/testvol/a.img
100 * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
101 * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
102 * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
103 * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
104 * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
105 * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
107 static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
110 QueryParams *qp = NULL;
111 bool is_unix = false;
114 uri = uri_parse(filename);
120 if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
121 gconf->transport = g_strdup("tcp");
122 } else if (!strcmp(uri->scheme, "gluster+tcp")) {
123 gconf->transport = g_strdup("tcp");
124 } else if (!strcmp(uri->scheme, "gluster+unix")) {
125 gconf->transport = g_strdup("unix");
127 } else if (!strcmp(uri->scheme, "gluster+rdma")) {
128 gconf->transport = g_strdup("rdma");
134 ret = parse_volume_options(gconf, uri->path);
139 qp = query_params_parse(uri->query);
140 if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
146 if (uri->server || uri->port) {
150 if (strcmp(qp->p[0].name, "socket")) {
154 gconf->server = g_strdup(qp->p[0].value);
156 gconf->server = g_strdup(uri->server ? uri->server : "localhost");
157 gconf->port = uri->port;
162 query_params_free(qp);
168 static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename,
171 struct glfs *glfs = NULL;
175 ret = qemu_gluster_parseuri(gconf, filename);
177 error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/"
178 "volname/image[?socket=...]");
183 glfs = glfs_new(gconf->volname);
188 ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
195 * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
196 * GlusterFS makes GF_LOG_* macros available to libgfapi users.
198 ret = glfs_set_logging(glfs, "-", 4);
203 ret = glfs_init(glfs);
205 error_setg_errno(errp, errno,
206 "Gluster connection failed for server=%s port=%d "
207 "volume=%s image=%s transport=%s", gconf->server,
208 gconf->port, gconf->volname, gconf->image,
223 static void qemu_gluster_complete_aio(void *opaque)
225 GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
227 qemu_bh_delete(acb->bh);
229 qemu_coroutine_enter(acb->coroutine, NULL);
233 * AIO callback routine called from GlusterFS thread.
235 static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
237 GlusterAIOCB *acb = (GlusterAIOCB *)arg;
239 if (!ret || ret == acb->size) {
240 acb->ret = 0; /* Success */
241 } else if (ret < 0) {
242 acb->ret = ret; /* Read/Write failed */
244 acb->ret = -EIO; /* Partial read/write - fail it */
247 acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb);
248 qemu_bh_schedule(acb->bh);
251 /* TODO Convert to fine grained options */
252 static QemuOptsList runtime_opts = {
254 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
258 .type = QEMU_OPT_STRING,
259 .help = "URL to the gluster image",
261 { /* end of list */ }
265 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
267 assert(open_flags != NULL);
269 *open_flags |= O_BINARY;
271 if (bdrv_flags & BDRV_O_RDWR) {
272 *open_flags |= O_RDWR;
274 *open_flags |= O_RDONLY;
277 if ((bdrv_flags & BDRV_O_NOCACHE)) {
278 *open_flags |= O_DIRECT;
282 static int qemu_gluster_open(BlockDriverState *bs, QDict *options,
283 int bdrv_flags, Error **errp)
285 BDRVGlusterState *s = bs->opaque;
288 GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
290 Error *local_err = NULL;
291 const char *filename;
293 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
294 qemu_opts_absorb_qdict(opts, options, &local_err);
296 error_propagate(errp, local_err);
301 filename = qemu_opt_get(opts, "filename");
303 s->glfs = qemu_gluster_init(gconf, filename, errp);
309 qemu_gluster_parse_flags(bdrv_flags, &open_flags);
311 s->fd = glfs_open(s->glfs, gconf->image, open_flags);
318 qemu_gluster_gconf_free(gconf);
331 typedef struct BDRVGlusterReopenState {
334 } BDRVGlusterReopenState;
337 static int qemu_gluster_reopen_prepare(BDRVReopenState *state,
338 BlockReopenQueue *queue, Error **errp)
341 BDRVGlusterReopenState *reop_s;
342 GlusterConf *gconf = NULL;
345 assert(state != NULL);
346 assert(state->bs != NULL);
348 state->opaque = g_malloc0(sizeof(BDRVGlusterReopenState));
349 reop_s = state->opaque;
351 qemu_gluster_parse_flags(state->flags, &open_flags);
353 gconf = g_malloc0(sizeof(GlusterConf));
355 reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp);
356 if (reop_s->glfs == NULL) {
361 reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags);
362 if (reop_s->fd == NULL) {
363 /* reop_s->glfs will be cleaned up in _abort */
369 /* state->opaque will be freed in either the _abort or _commit */
370 qemu_gluster_gconf_free(gconf);
374 static void qemu_gluster_reopen_commit(BDRVReopenState *state)
376 BDRVGlusterReopenState *reop_s = state->opaque;
377 BDRVGlusterState *s = state->bs->opaque;
388 /* use the newly opened image / connection */
390 s->glfs = reop_s->glfs;
392 g_free(state->opaque);
393 state->opaque = NULL;
399 static void qemu_gluster_reopen_abort(BDRVReopenState *state)
401 BDRVGlusterReopenState *reop_s = state->opaque;
403 if (reop_s == NULL) {
408 glfs_close(reop_s->fd);
412 glfs_fini(reop_s->glfs);
415 g_free(state->opaque);
416 state->opaque = NULL;
421 #ifdef CONFIG_GLUSTERFS_ZEROFILL
422 static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
423 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
426 GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
427 BDRVGlusterState *s = bs->opaque;
428 off_t size = nb_sectors * BDRV_SECTOR_SIZE;
429 off_t offset = sector_num * BDRV_SECTOR_SIZE;
433 acb->coroutine = qemu_coroutine_self();
435 ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
441 qemu_coroutine_yield();
445 g_slice_free(GlusterAIOCB, acb);
449 static inline bool gluster_supports_zerofill(void)
454 static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
457 return glfs_zerofill(fd, offset, size);
461 static inline bool gluster_supports_zerofill(void)
466 static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset,
473 static int qemu_gluster_create(const char *filename,
474 QEMUOptionParameter *options, Error **errp)
480 int64_t total_size = 0;
481 GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
483 glfs = qemu_gluster_init(gconf, filename, errp);
489 while (options && options->name) {
490 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
491 total_size = options->value.n / BDRV_SECTOR_SIZE;
492 } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
493 if (!options->value.s || !strcmp(options->value.s, "off")) {
495 } else if (!strcmp(options->value.s, "full") &&
496 gluster_supports_zerofill()) {
499 error_setg(errp, "Invalid preallocation mode: '%s'"
500 " or GlusterFS doesn't support zerofill API",
509 fd = glfs_creat(glfs, gconf->image,
510 O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
514 if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) {
515 if (prealloc && qemu_gluster_zerofill(fd, 0,
516 total_size * BDRV_SECTOR_SIZE)) {
523 if (glfs_close(fd) != 0) {
528 qemu_gluster_gconf_free(gconf);
535 static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
536 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
539 GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
540 BDRVGlusterState *s = bs->opaque;
541 size_t size = nb_sectors * BDRV_SECTOR_SIZE;
542 off_t offset = sector_num * BDRV_SECTOR_SIZE;
546 acb->coroutine = qemu_coroutine_self();
549 ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
550 &gluster_finish_aiocb, acb);
552 ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
553 &gluster_finish_aiocb, acb);
561 qemu_coroutine_yield();
565 g_slice_free(GlusterAIOCB, acb);
569 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
572 BDRVGlusterState *s = bs->opaque;
574 ret = glfs_ftruncate(s->fd, offset);
582 static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs,
583 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
585 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0);
588 static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
589 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
591 return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
594 static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
597 GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
598 BDRVGlusterState *s = bs->opaque;
602 acb->coroutine = qemu_coroutine_self();
604 ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
610 qemu_coroutine_yield();
614 g_slice_free(GlusterAIOCB, acb);
618 #ifdef CONFIG_GLUSTERFS_DISCARD
619 static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
620 int64_t sector_num, int nb_sectors)
623 GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
624 BDRVGlusterState *s = bs->opaque;
625 size_t size = nb_sectors * BDRV_SECTOR_SIZE;
626 off_t offset = sector_num * BDRV_SECTOR_SIZE;
630 acb->coroutine = qemu_coroutine_self();
632 ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
638 qemu_coroutine_yield();
642 g_slice_free(GlusterAIOCB, acb);
647 static int64_t qemu_gluster_getlength(BlockDriverState *bs)
649 BDRVGlusterState *s = bs->opaque;
652 ret = glfs_lseek(s->fd, 0, SEEK_END);
660 static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
662 BDRVGlusterState *s = bs->opaque;
666 ret = glfs_fstat(s->fd, &st);
670 return st.st_blocks * 512;
674 static void qemu_gluster_close(BlockDriverState *bs)
676 BDRVGlusterState *s = bs->opaque;
685 static int qemu_gluster_has_zero_init(BlockDriverState *bs)
687 /* GlusterFS volume could be backed by a block device */
691 static QEMUOptionParameter qemu_gluster_create_options[] = {
693 .name = BLOCK_OPT_SIZE,
695 .help = "Virtual disk size"
698 .name = BLOCK_OPT_PREALLOC,
700 .help = "Preallocation mode (allowed values: off, full)"
705 static BlockDriver bdrv_gluster = {
706 .format_name = "gluster",
707 .protocol_name = "gluster",
708 .instance_size = sizeof(BDRVGlusterState),
709 .bdrv_needs_filename = true,
710 .bdrv_file_open = qemu_gluster_open,
711 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
712 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
713 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
714 .bdrv_close = qemu_gluster_close,
715 .bdrv_create = qemu_gluster_create,
716 .bdrv_getlength = qemu_gluster_getlength,
717 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
718 .bdrv_truncate = qemu_gluster_truncate,
719 .bdrv_co_readv = qemu_gluster_co_readv,
720 .bdrv_co_writev = qemu_gluster_co_writev,
721 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
722 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
723 #ifdef CONFIG_GLUSTERFS_DISCARD
724 .bdrv_co_discard = qemu_gluster_co_discard,
726 #ifdef CONFIG_GLUSTERFS_ZEROFILL
727 .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes,
729 .create_options = qemu_gluster_create_options,
732 static BlockDriver bdrv_gluster_tcp = {
733 .format_name = "gluster",
734 .protocol_name = "gluster+tcp",
735 .instance_size = sizeof(BDRVGlusterState),
736 .bdrv_needs_filename = true,
737 .bdrv_file_open = qemu_gluster_open,
738 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
739 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
740 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
741 .bdrv_close = qemu_gluster_close,
742 .bdrv_create = qemu_gluster_create,
743 .bdrv_getlength = qemu_gluster_getlength,
744 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
745 .bdrv_truncate = qemu_gluster_truncate,
746 .bdrv_co_readv = qemu_gluster_co_readv,
747 .bdrv_co_writev = qemu_gluster_co_writev,
748 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
749 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
750 #ifdef CONFIG_GLUSTERFS_DISCARD
751 .bdrv_co_discard = qemu_gluster_co_discard,
753 #ifdef CONFIG_GLUSTERFS_ZEROFILL
754 .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes,
756 .create_options = qemu_gluster_create_options,
759 static BlockDriver bdrv_gluster_unix = {
760 .format_name = "gluster",
761 .protocol_name = "gluster+unix",
762 .instance_size = sizeof(BDRVGlusterState),
763 .bdrv_needs_filename = true,
764 .bdrv_file_open = qemu_gluster_open,
765 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
766 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
767 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
768 .bdrv_close = qemu_gluster_close,
769 .bdrv_create = qemu_gluster_create,
770 .bdrv_getlength = qemu_gluster_getlength,
771 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
772 .bdrv_truncate = qemu_gluster_truncate,
773 .bdrv_co_readv = qemu_gluster_co_readv,
774 .bdrv_co_writev = qemu_gluster_co_writev,
775 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
776 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
777 #ifdef CONFIG_GLUSTERFS_DISCARD
778 .bdrv_co_discard = qemu_gluster_co_discard,
780 #ifdef CONFIG_GLUSTERFS_ZEROFILL
781 .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes,
783 .create_options = qemu_gluster_create_options,
786 static BlockDriver bdrv_gluster_rdma = {
787 .format_name = "gluster",
788 .protocol_name = "gluster+rdma",
789 .instance_size = sizeof(BDRVGlusterState),
790 .bdrv_needs_filename = true,
791 .bdrv_file_open = qemu_gluster_open,
792 .bdrv_reopen_prepare = qemu_gluster_reopen_prepare,
793 .bdrv_reopen_commit = qemu_gluster_reopen_commit,
794 .bdrv_reopen_abort = qemu_gluster_reopen_abort,
795 .bdrv_close = qemu_gluster_close,
796 .bdrv_create = qemu_gluster_create,
797 .bdrv_getlength = qemu_gluster_getlength,
798 .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
799 .bdrv_truncate = qemu_gluster_truncate,
800 .bdrv_co_readv = qemu_gluster_co_readv,
801 .bdrv_co_writev = qemu_gluster_co_writev,
802 .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk,
803 .bdrv_has_zero_init = qemu_gluster_has_zero_init,
804 #ifdef CONFIG_GLUSTERFS_DISCARD
805 .bdrv_co_discard = qemu_gluster_co_discard,
807 #ifdef CONFIG_GLUSTERFS_ZEROFILL
808 .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes,
810 .create_options = qemu_gluster_create_options,
813 static void bdrv_gluster_init(void)
815 bdrv_register(&bdrv_gluster_rdma);
816 bdrv_register(&bdrv_gluster_unix);
817 bdrv_register(&bdrv_gluster_tcp);
818 bdrv_register(&bdrv_gluster);
821 block_init(bdrv_gluster_init);