]> Git Repo - qemu.git/blame - block/sheepdog.c
sheepdog: move coroutine send/recv function to generic code
[qemu.git] / block / sheepdog.c
CommitLineData
33b1db1c
MK
1/*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
10 */
33b1db1c
MK
11
12#include "qemu-common.h"
13#include "qemu-error.h"
14#include "qemu_socket.h"
15#include "block_int.h"
8cffde73 16#include "bitops.h"
33b1db1c
MK
17
18#define SD_PROTO_VER 0x01
19
20#define SD_DEFAULT_ADDR "localhost"
21#define SD_DEFAULT_PORT "7000"
22
23#define SD_OP_CREATE_AND_WRITE_OBJ 0x01
24#define SD_OP_READ_OBJ 0x02
25#define SD_OP_WRITE_OBJ 0x03
26
27#define SD_OP_NEW_VDI 0x11
28#define SD_OP_LOCK_VDI 0x12
29#define SD_OP_RELEASE_VDI 0x13
30#define SD_OP_GET_VDI_INFO 0x14
31#define SD_OP_READ_VDIS 0x15
32
33#define SD_FLAG_CMD_WRITE 0x01
34#define SD_FLAG_CMD_COW 0x02
35
36#define SD_RES_SUCCESS 0x00 /* Success */
37#define SD_RES_UNKNOWN 0x01 /* Unknown error */
38#define SD_RES_NO_OBJ 0x02 /* No object found */
39#define SD_RES_EIO 0x03 /* I/O error */
40#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
41#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
42#define SD_RES_SYSTEM_ERROR 0x06 /* System error */
43#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */
44#define SD_RES_NO_VDI 0x08 /* No vdi found */
45#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */
46#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */
47#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
48#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
49#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */
50#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
51#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */
52#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */
53#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
54#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
55#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */
56#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
57#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
58#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
59#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */
60#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
61
62/*
63 * Object ID rules
64 *
65 * 0 - 19 (20 bits): data object space
66 * 20 - 31 (12 bits): reserved data object space
67 * 32 - 55 (24 bits): vdi object space
68 * 56 - 59 ( 4 bits): reserved vdi object space
7acae208 69 * 60 - 63 ( 4 bits): object type identifier space
33b1db1c
MK
70 */
71
72#define VDI_SPACE_SHIFT 32
73#define VDI_BIT (UINT64_C(1) << 63)
74#define VMSTATE_BIT (UINT64_C(1) << 62)
75#define MAX_DATA_OBJS (UINT64_C(1) << 20)
76#define MAX_CHILDREN 1024
77#define SD_MAX_VDI_LEN 256
78#define SD_MAX_VDI_TAG_LEN 256
79#define SD_NR_VDIS (1U << 24)
80#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
81#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
82#define SECTOR_SIZE 512
83
84#define SD_INODE_SIZE (sizeof(SheepdogInode))
85#define CURRENT_VDI_ID 0
86
/*
 * Generic request header.  The typed request structs in this file
 * (object and vdi requests) are cast to this type when handed to do_req().
 */
typedef struct SheepdogReq {
    uint8_t     proto_ver;              /* protocol version (SD_PROTO_VER) */
    uint8_t     opcode;                 /* one of the SD_OP_* codes */
    uint16_t    flags;                  /* SD_FLAG_CMD_* bits */
    uint32_t    epoch;
    uint32_t    id;                     /* request id echoed in the response */
    uint32_t    data_length;            /* length of the payload that follows */
    uint32_t    opcode_specific[8];     /* laid out per opcode by typed structs */
} SheepdogReq;
96
/* Generic response header; same size as SheepdogReq. */
typedef struct SheepdogRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;                     /* id of the request being answered */
    uint32_t    data_length;            /* length of the payload that follows */
    uint32_t    result;                 /* one of the SD_RES_* codes */
    uint32_t    opcode_specific[7];
} SheepdogRsp;
107
/* Request header for object read/write/create operations. */
typedef struct SheepdogObjReq {
    uint8_t     proto_ver;
    uint8_t     opcode;                 /* SD_OP_READ_OBJ, SD_OP_WRITE_OBJ, ... */
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    oid;                    /* target object id */
    uint64_t    cow_oid;                /* base object id (AIOReq.base_oid) */
    uint32_t    copies;
    uint32_t    rsvd;
    uint64_t    offset;                 /* byte offset inside the object */
} SheepdogObjReq;
121
/* Response header for object operations. */
typedef struct SheepdogObjRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;                     /* matched against AIOReq.id on receipt */
    uint32_t    data_length;
    uint32_t    result;                 /* one of the SD_RES_* codes */
    uint32_t    copies;
    uint32_t    pad[6];
} SheepdogObjRsp;
133
/* Request header for vdi operations (create, lock, get info). */
typedef struct SheepdogVdiReq {
    uint8_t     proto_ver;
    uint8_t     opcode;                 /* SD_OP_NEW_VDI, SD_OP_LOCK_VDI, ... */
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint64_t    vdi_size;
    uint32_t    base_vdi_id;
    uint32_t    copies;
    uint32_t    snapid;
    uint32_t    pad[3];
} SheepdogVdiReq;
147
/* Response header for vdi operations. */
typedef struct SheepdogVdiRsp {
    uint8_t     proto_ver;
    uint8_t     opcode;
    uint16_t    flags;
    uint32_t    epoch;
    uint32_t    id;
    uint32_t    data_length;
    uint32_t    result;                 /* one of the SD_RES_* codes */
    uint32_t    rsvd;
    uint32_t    vdi_id;                 /* id of the vdi found or created */
    uint32_t    pad[5];
} SheepdogVdiRsp;
160
161typedef struct SheepdogInode {
162 char name[SD_MAX_VDI_LEN];
163 char tag[SD_MAX_VDI_TAG_LEN];
164 uint64_t ctime;
165 uint64_t snap_ctime;
166 uint64_t vm_clock_nsec;
167 uint64_t vdi_size;
168 uint64_t vm_state_size;
169 uint16_t copy_policy;
170 uint8_t nr_copies;
171 uint8_t block_size_shift;
172 uint32_t snap_id;
173 uint32_t vdi_id;
174 uint32_t parent_vdi_id;
175 uint32_t child_vdi_id[MAX_CHILDREN];
176 uint32_t data_vdi_id[MAX_DATA_OBJS];
177} SheepdogInode;
178
179/*
180 * 64 bit FNV-1a non-zero initial basis
181 */
182#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
183
/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds `len' bytes starting at `buf' into the running hash `hval' and
 * returns the updated hash.  Pass FNV1A_64_INIT as `hval' for the first
 * buffer; pass the previous result to continue hashing.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *cur = buf;
    const unsigned char *const end = cur + len;

    for (; cur < end; cur++) {
        /* xor the low byte in, then multiply by the 64 bit FNV prime
         * (0x100000001b3), spelled as shifts and adds */
        hval ^= (uint64_t)*cur;
        hval += (hval << 1) + (hval << 4) + (hval << 5) +
                (hval << 7) + (hval << 8) + (hval << 40);
    }

    return hval;
}
198
ebabb67a 199static inline int is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
33b1db1c
MK
200{
201 return inode->vdi_id == inode->data_vdi_id[idx];
202}
203
204static inline int is_data_obj(uint64_t oid)
205{
206 return !(VDI_BIT & oid);
207}
208
209static inline uint64_t data_oid_to_idx(uint64_t oid)
210{
211 return oid & (MAX_DATA_OBJS - 1);
212}
213
214static inline uint64_t vid_to_vdi_oid(uint32_t vid)
215{
216 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
217}
218
219static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
220{
221 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
222}
223
224static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
225{
226 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
227}
228
229static inline int is_snapshot(struct SheepdogInode *inode)
230{
231 return !!inode->snap_ctime;
232}
233
234#undef dprintf
235#ifdef DEBUG_SDOG
236#define dprintf(fmt, args...) \
237 do { \
238 fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \
239 } while (0)
240#else
241#define dprintf(fmt, args...)
242#endif
243
244typedef struct SheepdogAIOCB SheepdogAIOCB;
245
246typedef struct AIOReq {
247 SheepdogAIOCB *aiocb;
248 unsigned int iov_offset;
249
250 uint64_t oid;
251 uint64_t base_oid;
252 uint64_t offset;
253 unsigned int data_len;
254 uint8_t flags;
255 uint32_t id;
256
257 QLIST_ENTRY(AIOReq) outstanding_aio_siblings;
258 QLIST_ENTRY(AIOReq) aioreq_siblings;
259} AIOReq;
260
/* Direction of an AIOCB: user data being written or read. */
enum AIOCBState {
    AIOCB_WRITE_UDATA,  /* write user data */
    AIOCB_READ_UDATA,   /* read user data */
};
265
266struct SheepdogAIOCB {
267 BlockDriverAIOCB common;
268
269 QEMUIOVector *qiov;
270
271 int64_t sector_num;
272 int nb_sectors;
273
274 int ret;
275 enum AIOCBState aiocb_type;
276
2df46246 277 Coroutine *coroutine;
33b1db1c
MK
278 void (*aio_done_func)(SheepdogAIOCB *);
279
280 int canceled;
281
282 QLIST_HEAD(aioreq_head, AIOReq) aioreq_head;
283};
284
285typedef struct BDRVSheepdogState {
286 SheepdogInode inode;
287
288 uint32_t min_dirty_data_idx;
289 uint32_t max_dirty_data_idx;
290
291 char name[SD_MAX_VDI_LEN];
292 int is_snapshot;
293
294 char *addr;
295 char *port;
296 int fd;
297
2df46246
MK
298 CoMutex lock;
299 Coroutine *co_send;
300 Coroutine *co_recv;
301
33b1db1c
MK
302 uint32_t aioreq_seq_num;
303 QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
304} BDRVSheepdogState;
305
306static const char * sd_strerror(int err)
307{
308 int i;
309
310 static const struct {
311 int err;
312 const char *desc;
313 } errors[] = {
314 {SD_RES_SUCCESS, "Success"},
315 {SD_RES_UNKNOWN, "Unknown error"},
316 {SD_RES_NO_OBJ, "No object found"},
317 {SD_RES_EIO, "I/O error"},
318 {SD_RES_VDI_EXIST, "VDI exists already"},
319 {SD_RES_INVALID_PARMS, "Invalid parameters"},
320 {SD_RES_SYSTEM_ERROR, "System error"},
321 {SD_RES_VDI_LOCKED, "VDI is already locked"},
322 {SD_RES_NO_VDI, "No vdi found"},
323 {SD_RES_NO_BASE_VDI, "No base VDI found"},
324 {SD_RES_VDI_READ, "Failed read the requested VDI"},
325 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
326 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
327 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
328 {SD_RES_NO_TAG, "Failed to find the requested tag"},
329 {SD_RES_STARTUP, "The system is still booting"},
330 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
331 {SD_RES_SHUTDOWN, "The system is shutting down"},
332 {SD_RES_NO_MEM, "Out of memory on the server"},
333 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
334 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
335 {SD_RES_NO_SPACE, "Server has no space for new objects"},
336 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
337 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
338 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
339 };
340
341 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
342 if (errors[i].err == err) {
343 return errors[i].desc;
344 }
345 }
346
347 return "Invalid error code";
348}
349
350/*
351 * Sheepdog I/O handling:
352 *
2df46246
MK
353 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
354 * link the requests to the outstanding_list in the
355 * BDRVSheepdogState. The function exits without waiting for
356 * receiving the response.
33b1db1c 357 *
2df46246 358 * 2. We receive the response in aio_read_response, the fd handler to
33b1db1c
MK
359 * the sheepdog connection. If metadata update is needed, we send
360 * the write request to the vdi object in sd_write_done, the write
2df46246
MK
361 * completion function. We switch back to sd_co_readv/writev after
362 * all the requests belonging to the AIOCB are finished.
33b1db1c
MK
363 */
364
365static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
366 uint64_t oid, unsigned int data_len,
367 uint64_t offset, uint8_t flags,
368 uint64_t base_oid, unsigned int iov_offset)
369{
370 AIOReq *aio_req;
371
7267c094 372 aio_req = g_malloc(sizeof(*aio_req));
33b1db1c
MK
373 aio_req->aiocb = acb;
374 aio_req->iov_offset = iov_offset;
375 aio_req->oid = oid;
376 aio_req->base_oid = base_oid;
377 aio_req->offset = offset;
378 aio_req->data_len = data_len;
379 aio_req->flags = flags;
380 aio_req->id = s->aioreq_seq_num++;
381
382 QLIST_INSERT_HEAD(&s->outstanding_aio_head, aio_req,
383 outstanding_aio_siblings);
384 QLIST_INSERT_HEAD(&acb->aioreq_head, aio_req, aioreq_siblings);
385
386 return aio_req;
387}
388
389static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
390{
391 SheepdogAIOCB *acb = aio_req->aiocb;
392 QLIST_REMOVE(aio_req, outstanding_aio_siblings);
393 QLIST_REMOVE(aio_req, aioreq_siblings);
7267c094 394 g_free(aio_req);
33b1db1c
MK
395
396 return !QLIST_EMPTY(&acb->aioreq_head);
397}
398
d8716b41 399static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
33b1db1c
MK
400{
401 if (!acb->canceled) {
2df46246 402 qemu_coroutine_enter(acb->coroutine, NULL);
33b1db1c
MK
403 }
404 qemu_aio_release(acb);
405}
406
407static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
408{
409 SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
410
411 /*
412 * Sheepdog cannot cancel the requests which are already sent to
413 * the servers, so we just complete the request with -EIO here.
414 */
2df46246
MK
415 acb->ret = -EIO;
416 qemu_coroutine_enter(acb->coroutine, NULL);
33b1db1c
MK
417 acb->canceled = 1;
418}
419
420static AIOPool sd_aio_pool = {
421 .aiocb_size = sizeof(SheepdogAIOCB),
422 .cancel = sd_aio_cancel,
423};
424
425static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
426 int64_t sector_num, int nb_sectors,
427 BlockDriverCompletionFunc *cb, void *opaque)
428{
429 SheepdogAIOCB *acb;
430
431 acb = qemu_aio_get(&sd_aio_pool, bs, cb, opaque);
432
433 acb->qiov = qiov;
434
435 acb->sector_num = sector_num;
436 acb->nb_sectors = nb_sectors;
437
438 acb->aio_done_func = NULL;
439 acb->canceled = 0;
2df46246 440 acb->coroutine = qemu_coroutine_self();
33b1db1c
MK
441 acb->ret = 0;
442 QLIST_INIT(&acb->aioreq_head);
443 return acb;
444}
445
33b1db1c
MK
/*
 * Open a TCP connection to a sheepdog server.
 *
 * @addr: host name or address; NULL selects SD_DEFAULT_ADDR and
 *        SD_DEFAULT_PORT (in that case @port is ignored)
 * @port: service name or port number
 *
 * Every address returned by getaddrinfo() is tried in turn until one
 * accepts the connection.
 *
 * Returns the connected socket, or -1 on failure (an error is reported).
 */
static int connect_to_sdog(const char *addr, const char *port)
{
    char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
    int fd, ret;
    struct addrinfo hints, *res, *res0;

    if (!addr) {
        addr = SD_DEFAULT_ADDR;
        port = SD_DEFAULT_PORT;
    }

    memset(&hints, 0, sizeof(hints));
    hints.ai_socktype = SOCK_STREAM;

    ret = getaddrinfo(addr, port, &hints, &res0);
    if (ret) {
        /* getaddrinfo() reports failure through its return value, not
         * through errno, so it has to be decoded with gai_strerror() */
        error_report("unable to get address info %s, %s",
                     addr, gai_strerror(ret));
        return -1;
    }

    for (res = res0; res; res = res->ai_next) {
        ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf),
                          sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV);
        if (ret) {
            continue;
        }

        fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
        if (fd < 0) {
            continue;
        }

        /* retry the connect when it is interrupted by a signal */
        do {
            ret = connect(fd, res->ai_addr, res->ai_addrlen);
        } while (ret < 0 && errno == EINTR);

        if (ret < 0) {
            /* do not leak the socket; keep errno of the failed connect
             * for the caller and move on to the next candidate address */
            int saved_errno = errno;

            closesocket(fd);
            errno = saved_errno;
            continue;
        }

        dprintf("connected to %s:%s\n", addr, port);
        goto success;
    }
    fd = -1;
    error_report("failed connect to %s:%s", addr, port);
success:
    freeaddrinfo(res0);
    return fd;
}
497
33b1db1c
MK
498static int send_req(int sockfd, SheepdogReq *hdr, void *data,
499 unsigned int *wlen)
500{
501 int ret;
33b1db1c 502
8c5135f9
PB
503 ret = qemu_send_full(sockfd, hdr, sizeof(*hdr), 0);
504 if (ret < sizeof(*hdr)) {
505 error_report("failed to send a req, %s", strerror(errno));
33b1db1c
MK
506 }
507
8c5135f9
PB
508 ret = qemu_send_full(sockfd, data, *wlen, 0);
509 if (ret < *wlen) {
6daf194d 510 error_report("failed to send a req, %s", strerror(errno));
33b1db1c
MK
511 }
512
513 return ret;
514}
515
516static int do_req(int sockfd, SheepdogReq *hdr, void *data,
517 unsigned int *wlen, unsigned int *rlen)
518{
519 int ret;
520
8c5135f9 521 socket_set_block(sockfd);
33b1db1c 522 ret = send_req(sockfd, hdr, data, wlen);
8c5135f9 523 if (ret < 0) {
33b1db1c
MK
524 goto out;
525 }
526
8c5135f9
PB
527 ret = qemu_recv_full(sockfd, hdr, sizeof(*hdr), 0);
528 if (ret < sizeof(*hdr)) {
6daf194d 529 error_report("failed to get a rsp, %s", strerror(errno));
33b1db1c
MK
530 goto out;
531 }
532
533 if (*rlen > hdr->data_length) {
534 *rlen = hdr->data_length;
535 }
536
537 if (*rlen) {
8c5135f9
PB
538 ret = qemu_recv_full(sockfd, data, *rlen, 0);
539 if (ret < *rlen) {
6daf194d 540 error_report("failed to get the data, %s", strerror(errno));
33b1db1c
MK
541 goto out;
542 }
543 }
544 ret = 0;
545out:
8c5135f9 546 socket_set_nonblock(sockfd);
33b1db1c
MK
547 return ret;
548}
549
d8716b41 550static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
33b1db1c
MK
551 struct iovec *iov, int niov, int create,
552 enum AIOCBState aiocb_type);
553
554/*
555 * This function searchs pending requests to the object `oid', and
556 * sends them.
557 */
d8716b41 558static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid, uint32_t id)
33b1db1c
MK
559{
560 AIOReq *aio_req, *next;
561 SheepdogAIOCB *acb;
562 int ret;
563
564 QLIST_FOREACH_SAFE(aio_req, &s->outstanding_aio_head,
565 outstanding_aio_siblings, next) {
566 if (id == aio_req->id) {
567 continue;
568 }
569 if (aio_req->oid != oid) {
570 continue;
571 }
572
573 acb = aio_req->aiocb;
574 ret = add_aio_request(s, aio_req, acb->qiov->iov,
575 acb->qiov->niov, 0, acb->aiocb_type);
576 if (ret < 0) {
6daf194d 577 error_report("add_aio_request is failed");
33b1db1c
MK
578 free_aio_req(s, aio_req);
579 if (QLIST_EMPTY(&acb->aioreq_head)) {
580 sd_finish_aiocb(acb);
581 }
582 }
583 }
584}
585
586/*
587 * Receive responses of the I/O requests.
588 *
589 * This function is registered as a fd handler, and called from the
590 * main loop when s->fd is ready for reading responses.
591 */
d8716b41 592static void coroutine_fn aio_read_response(void *opaque)
33b1db1c
MK
593{
594 SheepdogObjRsp rsp;
595 BDRVSheepdogState *s = opaque;
596 int fd = s->fd;
597 int ret;
598 AIOReq *aio_req = NULL;
599 SheepdogAIOCB *acb;
600 int rest;
601 unsigned long idx;
602
603 if (QLIST_EMPTY(&s->outstanding_aio_head)) {
2df46246 604 goto out;
33b1db1c
MK
605 }
606
607 /* read a header */
8c5135f9
PB
608 ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
609 if (ret < 0) {
6daf194d 610 error_report("failed to get the header, %s", strerror(errno));
2df46246 611 goto out;
33b1db1c
MK
612 }
613
614 /* find the right aio_req from the outstanding_aio list */
615 QLIST_FOREACH(aio_req, &s->outstanding_aio_head, outstanding_aio_siblings) {
616 if (aio_req->id == rsp.id) {
617 break;
618 }
619 }
620 if (!aio_req) {
6daf194d 621 error_report("cannot find aio_req %x", rsp.id);
2df46246 622 goto out;
33b1db1c
MK
623 }
624
625 acb = aio_req->aiocb;
626
627 switch (acb->aiocb_type) {
628 case AIOCB_WRITE_UDATA:
629 if (!is_data_obj(aio_req->oid)) {
630 break;
631 }
632 idx = data_oid_to_idx(aio_req->oid);
633
634 if (s->inode.data_vdi_id[idx] != s->inode.vdi_id) {
635 /*
636 * If the object is newly created one, we need to update
637 * the vdi object (metadata object). min_dirty_data_idx
638 * and max_dirty_data_idx are changed to include updated
639 * index between them.
640 */
641 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
642 s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
643 s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
644
645 /*
646 * Some requests may be blocked because simultaneous
647 * create requests are not allowed, so we search the
648 * pending requests here.
649 */
650 send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx), rsp.id);
651 }
652 break;
653 case AIOCB_READ_UDATA:
8c5135f9
PB
654 ret = qemu_co_recvv(fd, acb->qiov->iov, rsp.data_length,
655 aio_req->iov_offset);
656 if (ret < 0) {
6daf194d 657 error_report("failed to get the data, %s", strerror(errno));
2df46246 658 goto out;
33b1db1c
MK
659 }
660 break;
661 }
662
663 if (rsp.result != SD_RES_SUCCESS) {
664 acb->ret = -EIO;
6daf194d 665 error_report("%s", sd_strerror(rsp.result));
33b1db1c
MK
666 }
667
668 rest = free_aio_req(s, aio_req);
669 if (!rest) {
670 /*
671 * We've finished all requests which belong to the AIOCB, so
2df46246 672 * we can switch back to sd_co_readv/writev now.
33b1db1c
MK
673 */
674 acb->aio_done_func(acb);
675 }
2df46246
MK
676out:
677 s->co_recv = NULL;
678}
679
680static void co_read_response(void *opaque)
681{
682 BDRVSheepdogState *s = opaque;
683
684 if (!s->co_recv) {
685 s->co_recv = qemu_coroutine_create(aio_read_response);
686 }
687
688 qemu_coroutine_enter(s->co_recv, opaque);
689}
690
691static void co_write_request(void *opaque)
692{
693 BDRVSheepdogState *s = opaque;
694
695 qemu_coroutine_enter(s->co_send, NULL);
33b1db1c
MK
696}
697
698static int aio_flush_request(void *opaque)
699{
700 BDRVSheepdogState *s = opaque;
701
702 return !QLIST_EMPTY(&s->outstanding_aio_head);
703}
704
/*
 * Set the TCP_CORK option of `fd' to `v'.
 *
 * Where SOL_TCP or TCP_CORK is not available this does nothing and
 * returns 0; otherwise the result of setsockopt() is returned.
 */
static int set_cork(int fd, int v)
{
#if defined(SOL_TCP) && defined(TCP_CORK)
    return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
#else
    return 0;
#endif
}
720
/* Enable TCP_NODELAY on `fd'; returns the result of setsockopt(). */
static int set_nodelay(int fd)
{
    int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable,
                      sizeof(enable));
}
729
730/*
731 * Return a socket discriptor to read/write objects.
732 *
733 * We cannot use this discriptor for other operations because
734 * the block driver may be on waiting response from the server.
735 */
736static int get_sheep_fd(BDRVSheepdogState *s)
737{
738 int ret, fd;
739
740 fd = connect_to_sdog(s->addr, s->port);
741 if (fd < 0) {
6daf194d 742 error_report("%s", strerror(errno));
33b1db1c
MK
743 return -1;
744 }
745
746 socket_set_nonblock(fd);
747
748 ret = set_nodelay(fd);
749 if (ret) {
6daf194d 750 error_report("%s", strerror(errno));
33b1db1c
MK
751 closesocket(fd);
752 return -1;
753 }
754
2df46246 755 qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request,
33b1db1c
MK
756 NULL, s);
757 return fd;
758}
759
760/*
761 * Parse a filename
762 *
763 * filename must be one of the following formats:
764 * 1. [vdiname]
765 * 2. [vdiname]:[snapid]
766 * 3. [vdiname]:[tag]
767 * 4. [hostname]:[port]:[vdiname]
768 * 5. [hostname]:[port]:[vdiname]:[snapid]
769 * 6. [hostname]:[port]:[vdiname]:[tag]
770 *
771 * You can boot from the snapshot images by specifying `snapid` or
772 * `tag'.
773 *
774 * You can run VMs outside the Sheepdog cluster by specifying
775 * `hostname' and `port' (experimental).
776 */
777static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
778 char *vdi, uint32_t *snapid, char *tag)
779{
780 char *p, *q;
781 int nr_sep;
782
7267c094 783 p = q = g_strdup(filename);
33b1db1c
MK
784
785 /* count the number of separators */
786 nr_sep = 0;
787 while (*p) {
788 if (*p == ':') {
789 nr_sep++;
790 }
791 p++;
792 }
793 p = q;
794
795 /* use the first two tokens as hostname and port number. */
796 if (nr_sep >= 2) {
797 s->addr = p;
798 p = strchr(p, ':');
799 *p++ = '\0';
800
801 s->port = p;
802 p = strchr(p, ':');
803 *p++ = '\0';
804 } else {
805 s->addr = NULL;
806 s->port = 0;
807 }
808
809 strncpy(vdi, p, SD_MAX_VDI_LEN);
810
811 p = strchr(vdi, ':');
812 if (p) {
813 *p++ = '\0';
814 *snapid = strtoul(p, NULL, 10);
815 if (*snapid == 0) {
816 strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
817 }
818 } else {
819 *snapid = CURRENT_VDI_ID; /* search current vdi */
820 }
821
822 if (s->addr == NULL) {
7267c094 823 g_free(q);
33b1db1c
MK
824 }
825
826 return 0;
827}
828
829static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
830 char *tag, uint32_t *vid, int for_snapshot)
831{
832 int ret, fd;
833 SheepdogVdiReq hdr;
834 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
835 unsigned int wlen, rlen = 0;
836 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
837
838 fd = connect_to_sdog(s->addr, s->port);
839 if (fd < 0) {
840 return -1;
841 }
842
843 memset(buf, 0, sizeof(buf));
844 strncpy(buf, filename, SD_MAX_VDI_LEN);
845 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
846
847 memset(&hdr, 0, sizeof(hdr));
848 if (for_snapshot) {
849 hdr.opcode = SD_OP_GET_VDI_INFO;
850 } else {
851 hdr.opcode = SD_OP_LOCK_VDI;
852 }
853 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
854 hdr.proto_ver = SD_PROTO_VER;
855 hdr.data_length = wlen;
856 hdr.snapid = snapid;
857 hdr.flags = SD_FLAG_CMD_WRITE;
858
859 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
860 if (ret) {
861 ret = -1;
862 goto out;
863 }
864
865 if (rsp->result != SD_RES_SUCCESS) {
6daf194d 866 error_report("cannot get vdi info, %s, %s %d %s",
33b1db1c
MK
867 sd_strerror(rsp->result), filename, snapid, tag);
868 ret = -1;
869 goto out;
870 }
871 *vid = rsp->vdi_id;
872
873 ret = 0;
874out:
875 closesocket(fd);
876 return ret;
877}
878
d8716b41 879static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
33b1db1c
MK
880 struct iovec *iov, int niov, int create,
881 enum AIOCBState aiocb_type)
882{
883 int nr_copies = s->inode.nr_copies;
884 SheepdogObjReq hdr;
885 unsigned int wlen;
886 int ret;
887 uint64_t oid = aio_req->oid;
888 unsigned int datalen = aio_req->data_len;
889 uint64_t offset = aio_req->offset;
890 uint8_t flags = aio_req->flags;
891 uint64_t old_oid = aio_req->base_oid;
892
893 if (!nr_copies) {
6daf194d 894 error_report("bug");
33b1db1c
MK
895 }
896
897 memset(&hdr, 0, sizeof(hdr));
898
899 if (aiocb_type == AIOCB_READ_UDATA) {
900 wlen = 0;
901 hdr.opcode = SD_OP_READ_OBJ;
902 hdr.flags = flags;
903 } else if (create) {
904 wlen = datalen;
905 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
906 hdr.flags = SD_FLAG_CMD_WRITE | flags;
907 } else {
908 wlen = datalen;
909 hdr.opcode = SD_OP_WRITE_OBJ;
910 hdr.flags = SD_FLAG_CMD_WRITE | flags;
911 }
912
913 hdr.oid = oid;
914 hdr.cow_oid = old_oid;
915 hdr.copies = s->inode.nr_copies;
916
917 hdr.data_length = datalen;
918 hdr.offset = offset;
919
920 hdr.id = aio_req->id;
921
2df46246
MK
922 qemu_co_mutex_lock(&s->lock);
923 s->co_send = qemu_coroutine_self();
924 qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request,
925 aio_flush_request, NULL, s);
33b1db1c
MK
926 set_cork(s->fd, 1);
927
928 /* send a header */
8c5135f9
PB
929 ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
930 if (ret < 0) {
c3fecea5 931 qemu_co_mutex_unlock(&s->lock);
6daf194d 932 error_report("failed to send a req, %s", strerror(errno));
33b1db1c
MK
933 return -EIO;
934 }
935
936 if (wlen) {
8c5135f9
PB
937 ret = qemu_co_sendv(s->fd, iov, wlen, aio_req->iov_offset);
938 if (ret < 0) {
c3fecea5 939 qemu_co_mutex_unlock(&s->lock);
6daf194d 940 error_report("failed to send a data, %s", strerror(errno));
33b1db1c
MK
941 return -EIO;
942 }
943 }
944
945 set_cork(s->fd, 0);
2df46246
MK
946 qemu_aio_set_fd_handler(s->fd, co_read_response, NULL,
947 aio_flush_request, NULL, s);
948 qemu_co_mutex_unlock(&s->lock);
33b1db1c
MK
949
950 return 0;
951}
952
953static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
954 unsigned int datalen, uint64_t offset,
955 int write, int create)
956{
957 SheepdogObjReq hdr;
958 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
959 unsigned int wlen, rlen;
960 int ret;
961
962 memset(&hdr, 0, sizeof(hdr));
963
964 if (write) {
965 wlen = datalen;
966 rlen = 0;
967 hdr.flags = SD_FLAG_CMD_WRITE;
968 if (create) {
969 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
970 } else {
971 hdr.opcode = SD_OP_WRITE_OBJ;
972 }
973 } else {
974 wlen = 0;
975 rlen = datalen;
976 hdr.opcode = SD_OP_READ_OBJ;
977 }
978 hdr.oid = oid;
979 hdr.data_length = datalen;
980 hdr.offset = offset;
981 hdr.copies = copies;
982
983 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
984 if (ret) {
6daf194d 985 error_report("failed to send a request to the sheep");
33b1db1c
MK
986 return -1;
987 }
988
989 switch (rsp->result) {
990 case SD_RES_SUCCESS:
991 return 0;
992 default:
6daf194d 993 error_report("%s", sd_strerror(rsp->result));
33b1db1c
MK
994 return -1;
995 }
996}
997
/* Synchronous object read; see read_write_object() for the return value. */
static int read_object(int fd, char *buf, uint64_t oid, int copies,
                       unsigned int datalen, uint64_t offset)
{
    const int is_write = 0;
    const int do_create = 0;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, do_create);
}
1003
/*
 * Synchronous object write; `create' asks for the object to be created.
 * See read_write_object() for the return value.
 */
static int write_object(int fd, char *buf, uint64_t oid, int copies,
                        unsigned int datalen, uint64_t offset, int create)
{
    const int is_write = 1;

    return read_write_object(fd, buf, oid, copies, datalen, offset,
                             is_write, create);
}
1009
1010static int sd_open(BlockDriverState *bs, const char *filename, int flags)
1011{
1012 int ret, fd;
1013 uint32_t vid = 0;
1014 BDRVSheepdogState *s = bs->opaque;
1015 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1016 uint32_t snapid;
1017 char *buf = NULL;
1018
1019 strstart(filename, "sheepdog:", (const char **)&filename);
1020
1021 QLIST_INIT(&s->outstanding_aio_head);
1022 s->fd = -1;
1023
1024 memset(vdi, 0, sizeof(vdi));
1025 memset(tag, 0, sizeof(tag));
1026 if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
1027 goto out;
1028 }
1029 s->fd = get_sheep_fd(s);
1030 if (s->fd < 0) {
1031 goto out;
1032 }
1033
1034 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 0);
1035 if (ret) {
1036 goto out;
1037 }
1038
1039 if (snapid) {
1040 dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
1041 s->is_snapshot = 1;
1042 }
1043
1044 fd = connect_to_sdog(s->addr, s->port);
1045 if (fd < 0) {
6daf194d 1046 error_report("failed to connect");
33b1db1c
MK
1047 goto out;
1048 }
1049
7267c094 1050 buf = g_malloc(SD_INODE_SIZE);
33b1db1c
MK
1051 ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);
1052
1053 closesocket(fd);
1054
1055 if (ret) {
1056 goto out;
1057 }
1058
1059 memcpy(&s->inode, buf, sizeof(s->inode));
1060 s->min_dirty_data_idx = UINT32_MAX;
1061 s->max_dirty_data_idx = 0;
1062
1063 bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
1064 strncpy(s->name, vdi, sizeof(s->name));
2df46246 1065 qemu_co_mutex_init(&s->lock);
7267c094 1066 g_free(buf);
33b1db1c
MK
1067 return 0;
1068out:
1069 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1070 if (s->fd >= 0) {
1071 closesocket(s->fd);
1072 }
7267c094 1073 g_free(buf);
33b1db1c
MK
1074 return -1;
1075}
1076
1077static int do_sd_create(char *filename, int64_t vdi_size,
1078 uint32_t base_vid, uint32_t *vdi_id, int snapshot,
1079 const char *addr, const char *port)
1080{
1081 SheepdogVdiReq hdr;
1082 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1083 int fd, ret;
1084 unsigned int wlen, rlen = 0;
1085 char buf[SD_MAX_VDI_LEN];
1086
1087 fd = connect_to_sdog(addr, port);
1088 if (fd < 0) {
1089 return -EIO;
1090 }
1091
1092 memset(buf, 0, sizeof(buf));
1093 strncpy(buf, filename, SD_MAX_VDI_LEN);
1094
1095 memset(&hdr, 0, sizeof(hdr));
1096 hdr.opcode = SD_OP_NEW_VDI;
1097 hdr.base_vdi_id = base_vid;
1098
1099 wlen = SD_MAX_VDI_LEN;
1100
1101 hdr.flags = SD_FLAG_CMD_WRITE;
1102 hdr.snapid = snapshot;
1103
1104 hdr.data_length = wlen;
1105 hdr.vdi_size = vdi_size;
1106
1107 ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
1108
1109 closesocket(fd);
1110
1111 if (ret) {
1112 return -EIO;
1113 }
1114
1115 if (rsp->result != SD_RES_SUCCESS) {
6daf194d 1116 error_report("%s, %s", sd_strerror(rsp->result), filename);
33b1db1c
MK
1117 return -EIO;
1118 }
1119
1120 if (vdi_id) {
1121 *vdi_id = rsp->vdi_id;
1122 }
1123
1124 return 0;
1125}
1126
a8e0fdd7
MK
1127static int sd_prealloc(const char *filename)
1128{
1129 BlockDriverState *bs = NULL;
1130 uint32_t idx, max_idx;
1131 int64_t vdi_size;
7267c094 1132 void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
a8e0fdd7
MK
1133 int ret;
1134
1135 ret = bdrv_file_open(&bs, filename, BDRV_O_RDWR);
1136 if (ret < 0) {
1137 goto out;
1138 }
1139
1140 vdi_size = bdrv_getlength(bs);
1141 if (vdi_size < 0) {
1142 ret = vdi_size;
1143 goto out;
1144 }
1145 max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
1146
1147 for (idx = 0; idx < max_idx; idx++) {
1148 /*
1149 * The created image can be a cloned image, so we need to read
1150 * a data from the source image.
1151 */
1152 ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1153 if (ret < 0) {
1154 goto out;
1155 }
1156 ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
1157 if (ret < 0) {
1158 goto out;
1159 }
1160 }
1161out:
1162 if (bs) {
1163 bdrv_delete(bs);
1164 }
7267c094 1165 g_free(buf);
a8e0fdd7
MK
1166
1167 return ret;
1168}
1169
33b1db1c
MK
1170static int sd_create(const char *filename, QEMUOptionParameter *options)
1171{
1172 int ret;
b4447363 1173 uint32_t vid = 0, base_vid = 0;
33b1db1c
MK
1174 int64_t vdi_size = 0;
1175 char *backing_file = NULL;
b4447363
MK
1176 BDRVSheepdogState s;
1177 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1178 uint32_t snapid;
a8e0fdd7
MK
1179 int prealloc = 0;
1180 const char *vdiname;
33b1db1c 1181
a8e0fdd7 1182 strstart(filename, "sheepdog:", &vdiname);
33b1db1c 1183
b4447363
MK
1184 memset(&s, 0, sizeof(s));
1185 memset(vdi, 0, sizeof(vdi));
1186 memset(tag, 0, sizeof(tag));
a8e0fdd7 1187 if (parse_vdiname(&s, vdiname, vdi, &snapid, tag) < 0) {
6daf194d 1188 error_report("invalid filename");
b4447363
MK
1189 return -EINVAL;
1190 }
1191
33b1db1c
MK
1192 while (options && options->name) {
1193 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
1194 vdi_size = options->value.n;
1195 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
1196 backing_file = options->value.s;
a8e0fdd7
MK
1197 } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
1198 if (!options->value.s || !strcmp(options->value.s, "off")) {
1199 prealloc = 0;
1200 } else if (!strcmp(options->value.s, "full")) {
1201 prealloc = 1;
1202 } else {
1203 error_report("Invalid preallocation mode: '%s'",
1204 options->value.s);
1205 return -EINVAL;
1206 }
33b1db1c
MK
1207 }
1208 options++;
1209 }
1210
1211 if (vdi_size > SD_MAX_VDI_SIZE) {
6daf194d 1212 error_report("too big image size");
33b1db1c
MK
1213 return -EINVAL;
1214 }
1215
1216 if (backing_file) {
1217 BlockDriverState *bs;
1218 BDRVSheepdogState *s;
1219 BlockDriver *drv;
1220
1221 /* Currently, only Sheepdog backing image is supported. */
1222 drv = bdrv_find_protocol(backing_file);
1223 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
6daf194d 1224 error_report("backing_file must be a sheepdog image");
33b1db1c
MK
1225 return -EINVAL;
1226 }
1227
1228 ret = bdrv_file_open(&bs, backing_file, 0);
1229 if (ret < 0)
1230 return -EIO;
1231
1232 s = bs->opaque;
1233
1234 if (!is_snapshot(&s->inode)) {
6daf194d 1235 error_report("cannot clone from a non snapshot vdi");
33b1db1c
MK
1236 bdrv_delete(bs);
1237 return -EINVAL;
1238 }
1239
b4447363 1240 base_vid = s->inode.vdi_id;
33b1db1c
MK
1241 bdrv_delete(bs);
1242 }
1243
a8e0fdd7
MK
1244 ret = do_sd_create(vdi, vdi_size, base_vid, &vid, 0, s.addr, s.port);
1245 if (!prealloc || ret) {
1246 return ret;
1247 }
1248
1249 return sd_prealloc(filename);
33b1db1c
MK
1250}
1251
1252static void sd_close(BlockDriverState *bs)
1253{
1254 BDRVSheepdogState *s = bs->opaque;
1255 SheepdogVdiReq hdr;
1256 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1257 unsigned int wlen, rlen = 0;
1258 int fd, ret;
1259
1260 dprintf("%s\n", s->name);
1261
1262 fd = connect_to_sdog(s->addr, s->port);
1263 if (fd < 0) {
1264 return;
1265 }
1266
1267 memset(&hdr, 0, sizeof(hdr));
1268
1269 hdr.opcode = SD_OP_RELEASE_VDI;
1270 wlen = strlen(s->name) + 1;
1271 hdr.data_length = wlen;
1272 hdr.flags = SD_FLAG_CMD_WRITE;
1273
1274 ret = do_req(fd, (SheepdogReq *)&hdr, s->name, &wlen, &rlen);
1275
1276 closesocket(fd);
1277
1278 if (!ret && rsp->result != SD_RES_SUCCESS &&
1279 rsp->result != SD_RES_VDI_NOT_LOCKED) {
6daf194d 1280 error_report("%s, %s", sd_strerror(rsp->result), s->name);
33b1db1c
MK
1281 }
1282
1283 qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
1284 closesocket(s->fd);
7267c094 1285 g_free(s->addr);
33b1db1c
MK
1286}
1287
1288static int64_t sd_getlength(BlockDriverState *bs)
1289{
1290 BDRVSheepdogState *s = bs->opaque;
1291
1292 return s->inode.vdi_size;
1293}
1294
1295static int sd_truncate(BlockDriverState *bs, int64_t offset)
1296{
1297 BDRVSheepdogState *s = bs->opaque;
1298 int ret, fd;
1299 unsigned int datalen;
1300
1301 if (offset < s->inode.vdi_size) {
6daf194d 1302 error_report("shrinking is not supported");
33b1db1c
MK
1303 return -EINVAL;
1304 } else if (offset > SD_MAX_VDI_SIZE) {
6daf194d 1305 error_report("too big image size");
33b1db1c
MK
1306 return -EINVAL;
1307 }
1308
1309 fd = connect_to_sdog(s->addr, s->port);
1310 if (fd < 0) {
1311 return -EIO;
1312 }
1313
1314 /* we don't need to update entire object */
1315 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1316 s->inode.vdi_size = offset;
1317 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1318 s->inode.nr_copies, datalen, 0, 0);
1319 close(fd);
1320
1321 if (ret < 0) {
6daf194d 1322 error_report("failed to update an inode.");
33b1db1c
MK
1323 return -EIO;
1324 }
1325
1326 return 0;
1327}
1328
1329/*
1330 * This function is called after writing data objects. If we need to
1331 * update metadata, this sends a write request to the vdi object.
2df46246 1332 * Otherwise, this switches back to sd_co_readv/writev.
33b1db1c 1333 */
d8716b41 1334static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
33b1db1c
MK
1335{
1336 int ret;
1337 BDRVSheepdogState *s = acb->common.bs->opaque;
1338 struct iovec iov;
1339 AIOReq *aio_req;
1340 uint32_t offset, data_len, mn, mx;
1341
1342 mn = s->min_dirty_data_idx;
1343 mx = s->max_dirty_data_idx;
1344 if (mn <= mx) {
1345 /* we need to update the vdi object. */
1346 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
1347 mn * sizeof(s->inode.data_vdi_id[0]);
1348 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
1349
1350 s->min_dirty_data_idx = UINT32_MAX;
1351 s->max_dirty_data_idx = 0;
1352
1353 iov.iov_base = &s->inode;
1354 iov.iov_len = sizeof(s->inode);
1355 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
1356 data_len, offset, 0, 0, offset);
1357 ret = add_aio_request(s, aio_req, &iov, 1, 0, AIOCB_WRITE_UDATA);
1358 if (ret) {
1359 free_aio_req(s, aio_req);
1360 acb->ret = -EIO;
1361 goto out;
1362 }
1363
1364 acb->aio_done_func = sd_finish_aiocb;
1365 acb->aiocb_type = AIOCB_WRITE_UDATA;
1366 return;
1367 }
1368out:
1369 sd_finish_aiocb(acb);
1370}
1371
1372/*
1373 * Create a writable VDI from a snapshot
1374 */
1375static int sd_create_branch(BDRVSheepdogState *s)
1376{
1377 int ret, fd;
1378 uint32_t vid;
1379 char *buf;
1380
1381 dprintf("%" PRIx32 " is snapshot.\n", s->inode.vdi_id);
1382
7267c094 1383 buf = g_malloc(SD_INODE_SIZE);
33b1db1c
MK
1384
1385 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, 1,
1386 s->addr, s->port);
1387 if (ret) {
1388 goto out;
1389 }
1390
1391 dprintf("%" PRIx32 " is created.\n", vid);
1392
1393 fd = connect_to_sdog(s->addr, s->port);
1394 if (fd < 0) {
6daf194d 1395 error_report("failed to connect");
33b1db1c
MK
1396 goto out;
1397 }
1398
1399 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1400 SD_INODE_SIZE, 0);
1401
1402 closesocket(fd);
1403
1404 if (ret < 0) {
1405 goto out;
1406 }
1407
1408 memcpy(&s->inode, buf, sizeof(s->inode));
1409
1410 s->is_snapshot = 0;
1411 ret = 0;
1412 dprintf("%" PRIx32 " was newly created.\n", s->inode.vdi_id);
1413
1414out:
7267c094 1415 g_free(buf);
33b1db1c
MK
1416
1417 return ret;
1418}
1419
1420/*
1421 * Send I/O requests to the server.
1422 *
1423 * This function sends requests to the server, links the requests to
1424 * the outstanding_list in BDRVSheepdogState, and exits without
1425 * waiting the response. The responses are received in the
1426 * `aio_read_response' function which is called from the main loop as
1427 * a fd handler.
2df46246
MK
1428 *
1429 * Returns 1 when we need to wait a response, 0 when there is no sent
1430 * request and -errno in error cases.
33b1db1c 1431 */
d8716b41 1432static int coroutine_fn sd_co_rw_vector(void *p)
33b1db1c
MK
1433{
1434 SheepdogAIOCB *acb = p;
1435 int ret = 0;
1436 unsigned long len, done = 0, total = acb->nb_sectors * SECTOR_SIZE;
1437 unsigned long idx = acb->sector_num * SECTOR_SIZE / SD_DATA_OBJ_SIZE;
1438 uint64_t oid;
1439 uint64_t offset = (acb->sector_num * SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
1440 BDRVSheepdogState *s = acb->common.bs->opaque;
1441 SheepdogInode *inode = &s->inode;
1442 AIOReq *aio_req;
1443
33b1db1c
MK
1444 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
1445 /*
1446 * In the case we open the snapshot VDI, Sheepdog creates the
1447 * writable VDI when we do a write operation first.
1448 */
1449 ret = sd_create_branch(s);
1450 if (ret) {
1451 acb->ret = -EIO;
1452 goto out;
1453 }
1454 }
1455
1456 while (done != total) {
1457 uint8_t flags = 0;
1458 uint64_t old_oid = 0;
1459 int create = 0;
1460
1461 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
1462
1463 len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
1464
1465 if (!inode->data_vdi_id[idx]) {
1466 if (acb->aiocb_type == AIOCB_READ_UDATA) {
1467 goto done;
1468 }
1469
1470 create = 1;
1471 } else if (acb->aiocb_type == AIOCB_WRITE_UDATA
ebabb67a 1472 && !is_data_obj_writable(inode, idx)) {
33b1db1c
MK
1473 /* Copy-On-Write */
1474 create = 1;
1475 old_oid = oid;
1476 flags = SD_FLAG_CMD_COW;
1477 }
1478
1479 if (create) {
1480 dprintf("update ino (%" PRIu32") %" PRIu64 " %" PRIu64
1481 " %" PRIu64 "\n", inode->vdi_id, oid,
1482 vid_to_data_oid(inode->data_vdi_id[idx], idx), idx);
1483 oid = vid_to_data_oid(inode->vdi_id, idx);
1484 dprintf("new oid %lx\n", oid);
1485 }
1486
1487 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done);
1488
1489 if (create) {
1490 AIOReq *areq;
1491 QLIST_FOREACH(areq, &s->outstanding_aio_head,
1492 outstanding_aio_siblings) {
1493 if (areq == aio_req) {
1494 continue;
1495 }
1496 if (areq->oid == oid) {
1497 /*
1498 * Sheepdog cannot handle simultaneous create
1499 * requests to the same object. So we cannot send
1500 * the request until the previous request
1501 * finishes.
1502 */
1503 aio_req->flags = 0;
1504 aio_req->base_oid = 0;
1505 goto done;
1506 }
1507 }
1508 }
1509
1510 ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
1511 create, acb->aiocb_type);
1512 if (ret < 0) {
6daf194d 1513 error_report("add_aio_request is failed");
33b1db1c
MK
1514 free_aio_req(s, aio_req);
1515 acb->ret = -EIO;
1516 goto out;
1517 }
1518 done:
1519 offset = 0;
1520 idx++;
1521 done += len;
1522 }
1523out:
1524 if (QLIST_EMPTY(&acb->aioreq_head)) {
2df46246 1525 return acb->ret;
33b1db1c 1526 }
2df46246 1527 return 1;
33b1db1c
MK
1528}
1529
a968168c 1530static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
2df46246 1531 int nb_sectors, QEMUIOVector *qiov)
33b1db1c
MK
1532{
1533 SheepdogAIOCB *acb;
2df46246 1534 int ret;
33b1db1c
MK
1535
1536 if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
1537 /* TODO: shouldn't block here */
1538 if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
2df46246 1539 return -EIO;
33b1db1c
MK
1540 }
1541 bs->total_sectors = sector_num + nb_sectors;
1542 }
1543
2df46246 1544 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
33b1db1c
MK
1545 acb->aio_done_func = sd_write_done;
1546 acb->aiocb_type = AIOCB_WRITE_UDATA;
1547
2df46246
MK
1548 ret = sd_co_rw_vector(acb);
1549 if (ret <= 0) {
1550 qemu_aio_release(acb);
1551 return ret;
1552 }
1553
1554 qemu_coroutine_yield();
1555
1556 return acb->ret;
33b1db1c
MK
1557}
1558
a968168c 1559static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
2df46246 1560 int nb_sectors, QEMUIOVector *qiov)
33b1db1c
MK
1561{
1562 SheepdogAIOCB *acb;
2df46246 1563 int i, ret;
33b1db1c 1564
2df46246 1565 acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
33b1db1c
MK
1566 acb->aiocb_type = AIOCB_READ_UDATA;
1567 acb->aio_done_func = sd_finish_aiocb;
1568
1569 /*
1570 * TODO: we can do better; we don't need to initialize
1571 * blindly.
1572 */
1573 for (i = 0; i < qiov->niov; i++) {
1574 memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
1575 }
1576
2df46246
MK
1577 ret = sd_co_rw_vector(acb);
1578 if (ret <= 0) {
1579 qemu_aio_release(acb);
1580 return ret;
1581 }
1582
1583 qemu_coroutine_yield();
1584
1585 return acb->ret;
33b1db1c
MK
1586}
1587
1588static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
1589{
1590 BDRVSheepdogState *s = bs->opaque;
1591 int ret, fd;
1592 uint32_t new_vid;
1593 SheepdogInode *inode;
1594 unsigned int datalen;
1595
1596 dprintf("sn_info: name %s id_str %s s: name %s vm_state_size %d "
1597 "is_snapshot %d\n", sn_info->name, sn_info->id_str,
1598 s->name, sn_info->vm_state_size, s->is_snapshot);
1599
1600 if (s->is_snapshot) {
1601 error_report("You can't create a snapshot of a snapshot VDI, "
6daf194d 1602 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
33b1db1c
MK
1603
1604 return -EINVAL;
1605 }
1606
1607 dprintf("%s %s\n", sn_info->name, sn_info->id_str);
1608
1609 s->inode.vm_state_size = sn_info->vm_state_size;
1610 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
1611 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
1612 /* we don't need to update entire object */
1613 datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
1614
1615 /* refresh inode. */
1616 fd = connect_to_sdog(s->addr, s->port);
1617 if (fd < 0) {
1618 ret = -EIO;
1619 goto cleanup;
1620 }
1621
1622 ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
1623 s->inode.nr_copies, datalen, 0, 0);
1624 if (ret < 0) {
6daf194d 1625 error_report("failed to write snapshot's inode.");
33b1db1c
MK
1626 ret = -EIO;
1627 goto cleanup;
1628 }
1629
1630 ret = do_sd_create(s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, 1,
1631 s->addr, s->port);
1632 if (ret < 0) {
6daf194d 1633 error_report("failed to create inode for snapshot. %s",
33b1db1c
MK
1634 strerror(errno));
1635 ret = -EIO;
1636 goto cleanup;
1637 }
1638
7267c094 1639 inode = (SheepdogInode *)g_malloc(datalen);
33b1db1c
MK
1640
1641 ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
1642 s->inode.nr_copies, datalen, 0);
1643
1644 if (ret < 0) {
6daf194d 1645 error_report("failed to read new inode info. %s", strerror(errno));
33b1db1c
MK
1646 ret = -EIO;
1647 goto cleanup;
1648 }
1649
1650 memcpy(&s->inode, inode, datalen);
1651 dprintf("s->inode: name %s snap_id %x oid %x\n",
1652 s->inode.name, s->inode.snap_id, s->inode.vdi_id);
1653
1654cleanup:
1655 closesocket(fd);
1656 return ret;
1657}
1658
1659static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
1660{
1661 BDRVSheepdogState *s = bs->opaque;
1662 BDRVSheepdogState *old_s;
1663 char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN];
1664 char *buf = NULL;
1665 uint32_t vid;
1666 uint32_t snapid = 0;
1667 int ret = -ENOENT, fd;
1668
7267c094 1669 old_s = g_malloc(sizeof(BDRVSheepdogState));
33b1db1c
MK
1670
1671 memcpy(old_s, s, sizeof(BDRVSheepdogState));
1672
1673 memset(vdi, 0, sizeof(vdi));
1674 strncpy(vdi, s->name, sizeof(vdi));
1675
1676 memset(tag, 0, sizeof(tag));
1677 snapid = strtoul(snapshot_id, NULL, 10);
1678 if (!snapid) {
1679 strncpy(tag, s->name, sizeof(tag));
1680 }
1681
1682 ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
1683 if (ret) {
6daf194d 1684 error_report("Failed to find_vdi_name");
33b1db1c
MK
1685 ret = -ENOENT;
1686 goto out;
1687 }
1688
1689 fd = connect_to_sdog(s->addr, s->port);
1690 if (fd < 0) {
6daf194d 1691 error_report("failed to connect");
33b1db1c
MK
1692 goto out;
1693 }
1694
7267c094 1695 buf = g_malloc(SD_INODE_SIZE);
33b1db1c
MK
1696 ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
1697 SD_INODE_SIZE, 0);
1698
1699 closesocket(fd);
1700
1701 if (ret) {
1702 ret = -ENOENT;
1703 goto out;
1704 }
1705
1706 memcpy(&s->inode, buf, sizeof(s->inode));
1707
1708 if (!s->inode.vm_state_size) {
6daf194d 1709 error_report("Invalid snapshot");
33b1db1c
MK
1710 ret = -ENOENT;
1711 goto out;
1712 }
1713
1714 s->is_snapshot = 1;
1715
7267c094
AL
1716 g_free(buf);
1717 g_free(old_s);
33b1db1c
MK
1718
1719 return 0;
1720out:
1721 /* recover bdrv_sd_state */
1722 memcpy(s, old_s, sizeof(BDRVSheepdogState));
7267c094
AL
1723 g_free(buf);
1724 g_free(old_s);
33b1db1c 1725
6daf194d 1726 error_report("failed to open. recover old bdrv_sd_state.");
33b1db1c
MK
1727
1728 return ret;
1729}
1730
1731static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
1732{
1733 /* FIXME: Delete specified snapshot id. */
1734 return 0;
1735}
1736
33b1db1c
MK
1737static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
1738{
1739 BDRVSheepdogState *s = bs->opaque;
1740 SheepdogReq req;
1741 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
1742 QEMUSnapshotInfo *sn_tab = NULL;
1743 unsigned wlen, rlen;
1744 int found = 0;
1745 static SheepdogInode inode;
1746 unsigned long *vdi_inuse;
1747 unsigned int start_nr;
1748 uint64_t hval;
1749 uint32_t vid;
1750
7267c094 1751 vdi_inuse = g_malloc(max);
33b1db1c
MK
1752
1753 fd = connect_to_sdog(s->addr, s->port);
1754 if (fd < 0) {
1755 goto out;
1756 }
1757
1758 rlen = max;
1759 wlen = 0;
1760
1761 memset(&req, 0, sizeof(req));
1762
1763 req.opcode = SD_OP_READ_VDIS;
1764 req.data_length = max;
1765
1766 ret = do_req(fd, (SheepdogReq *)&req, vdi_inuse, &wlen, &rlen);
1767
1768 closesocket(fd);
1769 if (ret) {
1770 goto out;
1771 }
1772
7267c094 1773 sn_tab = g_malloc0(nr * sizeof(*sn_tab));
33b1db1c
MK
1774
1775 /* calculate a vdi id with hash function */
1776 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
1777 start_nr = hval & (SD_NR_VDIS - 1);
1778
1779 fd = connect_to_sdog(s->addr, s->port);
1780 if (fd < 0) {
6daf194d 1781 error_report("failed to connect");
33b1db1c
MK
1782 goto out;
1783 }
1784
1785 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
1786 if (!test_bit(vid, vdi_inuse)) {
1787 break;
1788 }
1789
1790 /* we don't need to read entire object */
1791 ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
1792 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
1793
1794 if (ret) {
1795 continue;
1796 }
1797
1798 if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
1799 sn_tab[found].date_sec = inode.snap_ctime >> 32;
1800 sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
1801 sn_tab[found].vm_state_size = inode.vm_state_size;
1802 sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
1803
1804 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
1805 inode.snap_id);
1806 strncpy(sn_tab[found].name, inode.tag,
1807 MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
1808 found++;
1809 }
1810 }
1811
1812 closesocket(fd);
1813out:
1814 *psn_tab = sn_tab;
1815
7267c094 1816 g_free(vdi_inuse);
33b1db1c
MK
1817
1818 return found;
1819}
1820
1821static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
1822 int64_t pos, int size, int load)
1823{
1824 int fd, create;
1825 int ret = 0;
1826 unsigned int data_len;
1827 uint64_t vmstate_oid;
1828 uint32_t vdi_index;
1829 uint64_t offset;
1830
1831 fd = connect_to_sdog(s->addr, s->port);
1832 if (fd < 0) {
1833 ret = -EIO;
1834 goto cleanup;
1835 }
1836
1837 while (size) {
1838 vdi_index = pos / SD_DATA_OBJ_SIZE;
1839 offset = pos % SD_DATA_OBJ_SIZE;
1840
1841 data_len = MIN(size, SD_DATA_OBJ_SIZE);
1842
1843 vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
1844
1845 create = (offset == 0);
1846 if (load) {
1847 ret = read_object(fd, (char *)data, vmstate_oid,
1848 s->inode.nr_copies, data_len, offset);
1849 } else {
1850 ret = write_object(fd, (char *)data, vmstate_oid,
1851 s->inode.nr_copies, data_len, offset, create);
1852 }
1853
1854 if (ret < 0) {
6daf194d 1855 error_report("failed to save vmstate %s", strerror(errno));
33b1db1c
MK
1856 ret = -EIO;
1857 goto cleanup;
1858 }
1859
1860 pos += data_len;
1861 size -= data_len;
1862 ret += data_len;
1863 }
1864cleanup:
1865 closesocket(fd);
1866 return ret;
1867}
1868
1869static int sd_save_vmstate(BlockDriverState *bs, const uint8_t *data,
1870 int64_t pos, int size)
1871{
1872 BDRVSheepdogState *s = bs->opaque;
1873
1874 return do_load_save_vmstate(s, (uint8_t *)data, pos, size, 0);
1875}
1876
1877static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data,
1878 int64_t pos, int size)
1879{
1880 BDRVSheepdogState *s = bs->opaque;
1881
1882 return do_load_save_vmstate(s, data, pos, size, 1);
1883}
1884
1885
1886static QEMUOptionParameter sd_create_options[] = {
1887 {
1888 .name = BLOCK_OPT_SIZE,
1889 .type = OPT_SIZE,
1890 .help = "Virtual disk size"
1891 },
1892 {
1893 .name = BLOCK_OPT_BACKING_FILE,
1894 .type = OPT_STRING,
1895 .help = "File name of a base image"
1896 },
a8e0fdd7
MK
1897 {
1898 .name = BLOCK_OPT_PREALLOC,
1899 .type = OPT_STRING,
1900 .help = "Preallocation mode (allowed values: off, full)"
1901 },
33b1db1c
MK
1902 { NULL }
1903};
1904
1905BlockDriver bdrv_sheepdog = {
1906 .format_name = "sheepdog",
1907 .protocol_name = "sheepdog",
1908 .instance_size = sizeof(BDRVSheepdogState),
1909 .bdrv_file_open = sd_open,
1910 .bdrv_close = sd_close,
1911 .bdrv_create = sd_create,
1912 .bdrv_getlength = sd_getlength,
1913 .bdrv_truncate = sd_truncate,
1914
2df46246
MK
1915 .bdrv_co_readv = sd_co_readv,
1916 .bdrv_co_writev = sd_co_writev,
33b1db1c
MK
1917
1918 .bdrv_snapshot_create = sd_snapshot_create,
1919 .bdrv_snapshot_goto = sd_snapshot_goto,
1920 .bdrv_snapshot_delete = sd_snapshot_delete,
1921 .bdrv_snapshot_list = sd_snapshot_list,
1922
1923 .bdrv_save_vmstate = sd_save_vmstate,
1924 .bdrv_load_vmstate = sd_load_vmstate,
1925
1926 .create_options = sd_create_options,
1927};
1928
1929static void bdrv_sheepdog_init(void)
1930{
1931 bdrv_register(&bdrv_sheepdog);
1932}
1933block_init(bdrv_sheepdog_init);
This page took 0.345187 seconds and 4 git commands to generate.