]> Git Repo - qemu.git/blame - block/sheepdog.c
Include qemu/module.h where needed, drop it from qemu-common.h
[qemu.git] / block / sheepdog.c
CommitLineData
33b1db1c
MK
1/*
2 * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License version
6 * 2 as published by the Free Software Foundation.
7 *
8 * You should have received a copy of the GNU General Public License
9 * along with this program. If not, see <http://www.gnu.org/licenses/>.
6b620ca3
PB
10 *
11 * Contributions after 2012-01-13 are licensed under the terms of the
12 * GNU GPL, version 2 or (at your option) any later version.
33b1db1c 13 */
33b1db1c 14
80c71a24 15#include "qemu/osdep.h"
da34e65c 16#include "qapi/error.h"
9af23989 17#include "qapi/qapi-visit-sockets.h"
63fd65a0 18#include "qapi/qapi-visit-block-core.h"
831acdc9 19#include "qapi/qmp/qdict.h"
d1c13688 20#include "qapi/qobject-input-visitor.h"
63fd65a0 21#include "qapi/qobject-output-visitor.h"
5d6768e3 22#include "qemu/uri.h"
1de7afc9 23#include "qemu/error-report.h"
0b8fa32f 24#include "qemu/module.h"
922a01a0 25#include "qemu/option.h"
1de7afc9 26#include "qemu/sockets.h"
737e150e 27#include "block/block_int.h"
609f45ea 28#include "block/qdict.h"
fba98d45 29#include "sysemu/block-backend.h"
1de7afc9 30#include "qemu/bitops.h"
f348b6d1 31#include "qemu/cutils.h"
70018a14 32#include "trace.h"
33b1db1c
MK
33
34#define SD_PROTO_VER 0x01
35
36#define SD_DEFAULT_ADDR "localhost"
25af257d 37#define SD_DEFAULT_PORT 7000
33b1db1c
MK
38
39#define SD_OP_CREATE_AND_WRITE_OBJ 0x01
40#define SD_OP_READ_OBJ 0x02
41#define SD_OP_WRITE_OBJ 0x03
cac8f4a6 42/* 0x04 is used internally by Sheepdog */
33b1db1c
MK
43
44#define SD_OP_NEW_VDI 0x11
45#define SD_OP_LOCK_VDI 0x12
46#define SD_OP_RELEASE_VDI 0x13
47#define SD_OP_GET_VDI_INFO 0x14
48#define SD_OP_READ_VDIS 0x15
47622c44 49#define SD_OP_FLUSH_VDI 0x16
859e5553 50#define SD_OP_DEL_VDI 0x17
876eb1b0 51#define SD_OP_GET_CLUSTER_DEFAULT 0x18
33b1db1c
MK
52
53#define SD_FLAG_CMD_WRITE 0x01
54#define SD_FLAG_CMD_COW 0x02
0e7106d8
LY
55#define SD_FLAG_CMD_CACHE 0x04 /* Writeback mode for cache */
56#define SD_FLAG_CMD_DIRECT 0x08 /* Don't use cache */
33b1db1c
MK
57
58#define SD_RES_SUCCESS 0x00 /* Success */
59#define SD_RES_UNKNOWN 0x01 /* Unknown error */
60#define SD_RES_NO_OBJ 0x02 /* No object found */
61#define SD_RES_EIO 0x03 /* I/O error */
62#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */
63#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */
64#define SD_RES_SYSTEM_ERROR 0x06 /* System error */
65#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */
66#define SD_RES_NO_VDI 0x08 /* No vdi found */
67#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */
68#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */
69#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */
70#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */
71#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */
72#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */
73#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */
74#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */
75#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */
76#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */
77#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */
78#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */
79#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */
80#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */
81#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */
82#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */
fca23f0a 83#define SD_RES_HALT 0x19 /* Sheepdog is stopped serving IO request */
6a0b5490 84#define SD_RES_READONLY 0x1A /* Object is read-only */
33b1db1c
MK
85
86/*
87 * Object ID rules
88 *
89 * 0 - 19 (20 bits): data object space
90 * 20 - 31 (12 bits): reserved data object space
91 * 32 - 55 (24 bits): vdi object space
92 * 56 - 59 ( 4 bits): reserved vdi object space
7acae208 93 * 60 - 63 ( 4 bits): object type identifier space
33b1db1c
MK
94 */
95
96#define VDI_SPACE_SHIFT 32
97#define VDI_BIT (UINT64_C(1) << 63)
98#define VMSTATE_BIT (UINT64_C(1) << 62)
99#define MAX_DATA_OBJS (UINT64_C(1) << 20)
100#define MAX_CHILDREN 1024
101#define SD_MAX_VDI_LEN 256
102#define SD_MAX_VDI_TAG_LEN 256
103#define SD_NR_VDIS (1U << 24)
104#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
105#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
876eb1b0 106#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
b3af018f
LY
107/*
108 * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
109 * (SD_EC_MAX_STRIP - 1) for parity strips
110 *
111 * SD_MAX_COPIES is sum of number of data strips and parity strips.
112 */
113#define SD_EC_MAX_STRIP 16
114#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1)
33b1db1c
MK
115
116#define SD_INODE_SIZE (sizeof(SheepdogInode))
117#define CURRENT_VDI_ID 0
118
1dbfafed
HM
119#define LOCK_TYPE_NORMAL 0
120#define LOCK_TYPE_SHARED 1 /* for iSCSI multipath */
121
33b1db1c
MK
/*
 * Generic request header: four common fields, the payload length, and
 * eight opcode-specific 32-bit words.  do_co_req() also receives the
 * response header into a buffer of this type and reads data_length from it.
 */
122typedef struct SheepdogReq {
123 uint8_t proto_ver;
124 uint8_t opcode;
125 uint16_t flags;
126 uint32_t epoch;
127 uint32_t id;
128 uint32_t data_length;
129 uint32_t opcode_specific[8];
130} SheepdogReq;
131
/*
 * Generic response header.  Same leading fields as SheepdogReq; the first
 * opcode-specific word is replaced by a result code (presumably one of the
 * SD_RES_* values), leaving seven opcode-specific words.
 */
132typedef struct SheepdogRsp {
133 uint8_t proto_ver;
134 uint8_t opcode;
135 uint16_t flags;
136 uint32_t epoch;
137 uint32_t id;
138 uint32_t data_length;
139 uint32_t result;
140 uint32_t opcode_specific[7];
141} SheepdogRsp;
142
/*
 * Object request header: the common header fields followed by the target
 * object id, a second object id (cow_oid), copy settings, reserved bytes
 * and a 64-bit offset.
 */
143typedef struct SheepdogObjReq {
144 uint8_t proto_ver;
145 uint8_t opcode;
146 uint16_t flags;
147 uint32_t epoch;
148 uint32_t id;
149 uint32_t data_length;
150 uint64_t oid;
151 uint64_t cow_oid;
29a67f7e 152 uint8_t copies;
1841f880
LY
153 uint8_t copy_policy;
154 uint8_t reserved[6];
33b1db1c
MK
155 uint64_t offset;
156} SheepdogObjReq;
157
/*
 * Object response header.  aio_read_response() receives one of these per
 * response and uses id (to find the matching AIOReq), data_length (bytes
 * of read payload that follow) and result (compared against SD_RES_*).
 */
158typedef struct SheepdogObjRsp {
159 uint8_t proto_ver;
160 uint8_t opcode;
161 uint16_t flags;
162 uint32_t epoch;
163 uint32_t id;
164 uint32_t data_length;
165 uint32_t result;
29a67f7e 166 uint8_t copies;
1841f880
LY
167 uint8_t copy_policy;
168 uint8_t reserved[2];
33b1db1c
MK
169 uint32_t pad[6];
170} SheepdogObjRsp;
171
/*
 * VDI request header: the common header fields followed by VDI size, base
 * VDI id, copy/store policy, block size shift, snapshot id and a type
 * word (presumably one of the LOCK_TYPE_* values -- confirm against the
 * code that fills it in), padded with two trailing words.
 */
172typedef struct SheepdogVdiReq {
173 uint8_t proto_ver;
174 uint8_t opcode;
175 uint16_t flags;
176 uint32_t epoch;
177 uint32_t id;
178 uint32_t data_length;
179 uint64_t vdi_size;
9f23fce7 180 uint32_t base_vdi_id;
29a67f7e 181 uint8_t copies;
1841f880 182 uint8_t copy_policy;
876eb1b0
TI
183 uint8_t store_policy;
184 uint8_t block_size_shift;
33b1db1c 185 uint32_t snapid;
1dbfafed
HM
186 uint32_t type;
187 uint32_t pad[2];
33b1db1c
MK
188} SheepdogVdiReq;
189
/*
 * VDI response header: the common header fields, a result code, one
 * reserved word, the VDI id and five padding words.
 */
190typedef struct SheepdogVdiRsp {
191 uint8_t proto_ver;
192 uint8_t opcode;
193 uint16_t flags;
194 uint32_t epoch;
195 uint32_t id;
196 uint32_t data_length;
197 uint32_t result;
198 uint32_t rsvd;
199 uint32_t vdi_id;
200 uint32_t pad[5];
201} SheepdogVdiRsp;
202
876eb1b0
TI
/*
 * Cluster response header: the common header fields and result code,
 * followed by the number of copies, copy policy and block size shift
 * (presumably the reply to SD_OP_GET_CLUSTER_DEFAULT -- confirm at the
 * call site), then padding.
 */
203typedef struct SheepdogClusterRsp {
204 uint8_t proto_ver;
205 uint8_t opcode;
206 uint16_t flags;
207 uint32_t epoch;
208 uint32_t id;
209 uint32_t data_length;
210 uint32_t result;
211 uint8_t nr_copies;
212 uint8_t copy_policy;
213 uint8_t block_size_shift;
214 uint8_t __pad1;
215 uint32_t __pad2[6];
216} SheepdogClusterRsp;
217
33b1db1c
MK
/*
 * VDI inode (metadata object).  Everything before data_vdi_id[] is the
 * inode header (see SD_INODE_HEADER_SIZE).  data_vdi_id[idx] holds the id
 * of the VDI that owns data object idx: is_data_obj_writable() compares it
 * with vdi_id, and aio_read_response() sets it to vdi_id after a
 * successful object creation.  A non-zero snap_ctime marks a snapshot
 * (see is_snapshot()).
 */
218typedef struct SheepdogInode {
219 char name[SD_MAX_VDI_LEN];
220 char tag[SD_MAX_VDI_TAG_LEN];
221 uint64_t ctime;
222 uint64_t snap_ctime;
223 uint64_t vm_clock_nsec;
224 uint64_t vdi_size;
225 uint64_t vm_state_size;
226 uint16_t copy_policy;
227 uint8_t nr_copies;
228 uint8_t block_size_shift;
229 uint32_t snap_id;
230 uint32_t vdi_id;
231 uint32_t parent_vdi_id;
232 uint32_t child_vdi_id[MAX_CHILDREN];
233 uint32_t data_vdi_id[MAX_DATA_OBJS];
234} SheepdogInode;
235
5d039bab
HM
236#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id)
237
33b1db1c
MK
238/*
239 * 64 bit FNV-1a non-zero initial basis
240 */
241#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
242
/*
 * 64 bit Fowler/Noll/Vo FNV-1a hash code
 *
 * Folds the @len bytes at @buf into the running hash @hval and returns the
 * updated value.  Pass FNV1A_64_INIT as @hval to start a new hash.
 */
static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
    const unsigned char *bytes = buf;
    size_t i;

    for (i = 0; i < len; i++) {
        hval ^= (uint64_t)bytes[i];
        /*
         * 0x100000001b3 == 1 + 2^1 + 2^4 + 2^5 + 2^7 + 2^8 + 2^40, so this
         * is the same modulo-2^64 result as summing the shifted copies.
         */
        hval *= UINT64_C(0x100000001b3);
    }

    return hval;
}
257
2f536801 258static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx)
33b1db1c
MK
259{
260 return inode->vdi_id == inode->data_vdi_id[idx];
261}
262
2f536801 263static inline bool is_data_obj(uint64_t oid)
33b1db1c
MK
264{
265 return !(VDI_BIT & oid);
266}
267
268static inline uint64_t data_oid_to_idx(uint64_t oid)
269{
270 return oid & (MAX_DATA_OBJS - 1);
271}
272
72e0996c
MK
273static inline uint32_t oid_to_vid(uint64_t oid)
274{
275 return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT;
276}
277
33b1db1c
MK
278static inline uint64_t vid_to_vdi_oid(uint32_t vid)
279{
280 return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
281}
282
283static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
284{
285 return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
286}
287
288static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx)
289{
290 return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
291}
292
2f536801 293static inline bool is_snapshot(struct SheepdogInode *inode)
33b1db1c
MK
294{
295 return !!inode->snap_ctime;
296}
297
eab8eb8d
VT
298static inline size_t count_data_objs(const struct SheepdogInode *inode)
299{
300 return DIV_ROUND_UP(inode->vdi_size,
301 (1UL << inode->block_size_shift));
302}
303
33b1db1c 304typedef struct SheepdogAIOCB SheepdogAIOCB;
28ddd08c 305typedef struct BDRVSheepdogState BDRVSheepdogState;
33b1db1c
MK
306
/*
 * One request sent to the server on behalf of a SheepdogAIOCB.  Filled in
 * by alloc_aio_req(); aio_read_response() matches a response to its
 * request by comparing the response id with the id field here.
 */
307typedef struct AIOReq {
308 SheepdogAIOCB *aiocb;
309 unsigned int iov_offset;
310
311 uint64_t oid;
312 uint64_t base_oid;
313 uint64_t offset;
314 unsigned int data_len;
315 uint8_t flags;
316 uint32_t id;
b544c1ab 317 bool create;
33b1db1c 318
c292ee6a 319 QLIST_ENTRY(AIOReq) aio_siblings;
33b1db1c
MK
320} AIOReq;
321
/*
 * Kind of operation a SheepdogAIOCB performs.  aio_read_response()
 * switches on this value to decide how to handle a response.
 */
322enum AIOCBState {
323 AIOCB_WRITE_UDATA,
324 AIOCB_READ_UDATA,
47783072 325 AIOCB_FLUSH_CACHE,
cac8f4a6 326 AIOCB_DISCARD_OBJ,
33b1db1c
MK
327};
328
/*
 * True when the [min_affect_data_idx, max_affect_data_idx] ranges of the
 * two AIOCBs pointed to by @x and @y intersect (bounds are inclusive).
 *
 * Arguments are parenthesized so that any pointer expression (e.g. &acb)
 * can be passed; each argument is evaluated twice, so avoid side effects.
 */
#define AIOCBOverlapping(x, y)                                      \
    (!((x)->max_affect_data_idx < (y)->min_affect_data_idx          \
       || (y)->max_affect_data_idx < (x)->min_affect_data_idx))
332
/*
 * State of one block-layer I/O operation.  Initialized by sd_aio_setup().
 * nr_pending counts the AIOReqs still outstanding for this operation;
 * aio_read_response() wakes the stored coroutine when it drops to zero
 * and sets ret to -EIO when a response carries an unhandled error result.
 */
33b1db1c 333struct SheepdogAIOCB {
28ddd08c 334 BDRVSheepdogState *s;
33b1db1c
MK
335
336 QEMUIOVector *qiov;
337
338 int64_t sector_num;
339 int nb_sectors;
340
341 int ret;
342 enum AIOCBState aiocb_type;
343
2df46246 344 Coroutine *coroutine;
1d732d7d 345 int nr_pending;
6a55c82c
HM
346
347 uint32_t min_affect_data_idx;
348 uint32_t max_affect_data_idx;
349
498f2140
HM
350 /*
351 * The difference between affect_data_idx and dirty_data_idx:
352 * affect_data_idx represents range of index of all request types.
353 * dirty_data_idx represents range of index updated by COW requests.
354 * dirty_data_idx is used for updating an inode object.
355 */
356 uint32_t min_dirty_data_idx;
357 uint32_t max_dirty_data_idx;
358
6a55c82c 359 QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
33b1db1c
MK
360};
361
/*
 * Per-device driver state: the cached inode, the server address and the
 * connected socket (fd, set to -1 by reconnect_to_sdog() while
 * disconnected), the send/receive coroutines, and the request lists.
 * queue_lock is taken around updates of the aio lists and of
 * inflight_aiocb_head in the functions visible here.
 */
28ddd08c 362struct BDRVSheepdogState {
011603ca 363 BlockDriverState *bs;
84390bed 364 AioContext *aio_context;
011603ca 365
33b1db1c
MK
366 SheepdogInode inode;
367
33b1db1c 368 char name[SD_MAX_VDI_LEN];
2f536801 369 bool is_snapshot;
0e7106d8 370 uint32_t cache_flags;
cac8f4a6 371 bool discard_supported;
33b1db1c 372
bd269ebc 373 SocketAddress *addr;
33b1db1c
MK
374 int fd;
375
2df46246
MK
376 CoMutex lock;
377 Coroutine *co_send;
378 Coroutine *co_recv;
379
33b1db1c 380 uint32_t aioreq_seq_num;
011603ca
MK
381
382 /* Every aio request must be linked to either of these queues. */
b58deb34
PB
383 QLIST_HEAD(, AIOReq) inflight_aio_head;
384 QLIST_HEAD(, AIOReq) failed_aio_head;
6a55c82c 385
f1af3251 386 CoMutex queue_lock;
498f2140 387 CoQueue overlapping_queue;
b58deb34 388 QLIST_HEAD(, SheepdogAIOCB) inflight_aiocb_head;
28ddd08c 389};
33b1db1c 390
4da65c80
LY
/*
 * Holds a socket descriptor and cache flags, presumably the values
 * prepared during a reopen before they replace those in
 * BDRVSheepdogState -- the users are not visible in this part of the file.
 */
391typedef struct BDRVSheepdogReopenState {
392 int fd;
393 int cache_flags;
394} BDRVSheepdogReopenState;
395
d507c5f6 396static const char *sd_strerror(int err)
33b1db1c
MK
397{
398 int i;
399
400 static const struct {
401 int err;
402 const char *desc;
403 } errors[] = {
404 {SD_RES_SUCCESS, "Success"},
405 {SD_RES_UNKNOWN, "Unknown error"},
406 {SD_RES_NO_OBJ, "No object found"},
407 {SD_RES_EIO, "I/O error"},
408 {SD_RES_VDI_EXIST, "VDI exists already"},
409 {SD_RES_INVALID_PARMS, "Invalid parameters"},
410 {SD_RES_SYSTEM_ERROR, "System error"},
411 {SD_RES_VDI_LOCKED, "VDI is already locked"},
412 {SD_RES_NO_VDI, "No vdi found"},
413 {SD_RES_NO_BASE_VDI, "No base VDI found"},
414 {SD_RES_VDI_READ, "Failed read the requested VDI"},
415 {SD_RES_VDI_WRITE, "Failed to write the requested VDI"},
416 {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"},
417 {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"},
418 {SD_RES_NO_TAG, "Failed to find the requested tag"},
419 {SD_RES_STARTUP, "The system is still booting"},
420 {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"},
421 {SD_RES_SHUTDOWN, "The system is shutting down"},
422 {SD_RES_NO_MEM, "Out of memory on the server"},
423 {SD_RES_FULL_VDI, "We already have the maximum vdis"},
424 {SD_RES_VER_MISMATCH, "Protocol version mismatch"},
425 {SD_RES_NO_SPACE, "Server has no space for new objects"},
426 {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"},
427 {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"},
428 {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"},
fca23f0a 429 {SD_RES_HALT, "Sheepdog is stopped serving IO request"},
6a0b5490 430 {SD_RES_READONLY, "Object is read-only"},
33b1db1c
MK
431 };
432
433 for (i = 0; i < ARRAY_SIZE(errors); ++i) {
434 if (errors[i].err == err) {
435 return errors[i].desc;
436 }
437 }
438
439 return "Invalid error code";
440}
441
442/*
443 * Sheepdog I/O handling:
444 *
2df46246 445 * 1. In sd_co_rw_vector, we send the I/O requests to the server and
c292ee6a 446 * link the requests to the inflight_aio_head list in the
e80ab33d 447 * BDRVSheepdogState. The function yields while waiting for
2df46246 448 * receiving the response.
33b1db1c 449 *
2df46246 450 * 2. We receive the response in aio_read_response, the fd handler to
e80ab33d
PB
451 * the sheepdog connection. We switch back to sd_co_readv/sd_co_writev
452 * after all the requests belonging to the AIOCB are finished. If
453 * needed, sd_co_writev will send another request for the vdi object.
33b1db1c
MK
454 */
455
456static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
457 uint64_t oid, unsigned int data_len,
b544c1ab 458 uint64_t offset, uint8_t flags, bool create,
33b1db1c
MK
459 uint64_t base_oid, unsigned int iov_offset)
460{
461 AIOReq *aio_req;
462
7267c094 463 aio_req = g_malloc(sizeof(*aio_req));
33b1db1c
MK
464 aio_req->aiocb = acb;
465 aio_req->iov_offset = iov_offset;
466 aio_req->oid = oid;
467 aio_req->base_oid = base_oid;
468 aio_req->offset = offset;
469 aio_req->data_len = data_len;
470 aio_req->flags = flags;
471 aio_req->id = s->aioreq_seq_num++;
b544c1ab 472 aio_req->create = create;
33b1db1c 473
1d732d7d 474 acb->nr_pending++;
33b1db1c
MK
475 return aio_req;
476}
477
acf6e5f0
PB
478static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
479{
480 SheepdogAIOCB *cb;
481
482retry:
483 QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
484 if (AIOCBOverlapping(acb, cb)) {
f1af3251 485 qemu_co_queue_wait(&s->overlapping_queue, &s->queue_lock);
acf6e5f0
PB
486 goto retry;
487 }
488 }
489}
490
28ddd08c
PB
491static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
492 QEMUIOVector *qiov, int64_t sector_num, int nb_sectors,
493 int type)
33b1db1c 494{
6a55c82c 495 uint32_t object_size;
6a55c82c
HM
496
497 object_size = (UINT32_C(1) << s->inode.block_size_shift);
33b1db1c 498
28ddd08c 499 acb->s = s;
33b1db1c
MK
500
501 acb->qiov = qiov;
502
503 acb->sector_num = sector_num;
504 acb->nb_sectors = nb_sectors;
505
2df46246 506 acb->coroutine = qemu_coroutine_self();
33b1db1c 507 acb->ret = 0;
1d732d7d 508 acb->nr_pending = 0;
6a55c82c
HM
509
510 acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
511 acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
512 acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
513
498f2140
HM
514 acb->min_dirty_data_idx = UINT32_MAX;
515 acb->max_dirty_data_idx = 0;
28ddd08c 516 acb->aiocb_type = type;
acf6e5f0
PB
517
518 if (type == AIOCB_FLUSH_CACHE) {
519 return;
520 }
521
f1af3251 522 qemu_co_mutex_lock(&s->queue_lock);
acf6e5f0
PB
523 wait_for_overlapping_aiocb(s, acb);
524 QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
f1af3251 525 qemu_co_mutex_unlock(&s->queue_lock);
33b1db1c
MK
526}
527
bd269ebc 528static SocketAddress *sd_server_config(QDict *options, Error **errp)
d1c13688
MA
529{
530 QDict *server = NULL;
d1c13688 531 Visitor *iv = NULL;
bd269ebc 532 SocketAddress *saddr = NULL;
d1c13688
MA
533 Error *local_err = NULL;
534
535 qdict_extract_subqdict(options, &server, "server.");
536
af91062e
MA
537 iv = qobject_input_visitor_new_flat_confused(server, errp);
538 if (!iv) {
d1c13688
MA
539 goto done;
540 }
541
bd269ebc 542 visit_type_SocketAddress(iv, NULL, &saddr, &local_err);
d1c13688
MA
543 if (local_err) {
544 error_propagate(errp, local_err);
545 goto done;
546 }
547
d1c13688 548done:
d1c13688 549 visit_free(iv);
cb3e7f08 550 qobject_unref(server);
d1c13688
MA
551 return saddr;
552}
553
833a7cc3 554/* Return -EIO in case of error, file descriptor on success */
dfb12bf8 555static int connect_to_sdog(BDRVSheepdogState *s, Error **errp)
33b1db1c 556{
25af257d 557 int fd;
33b1db1c 558
b2587932 559 fd = socket_connect(s->addr, errp);
1b8bbb46 560
bd269ebc 561 if (s->addr->type == SOCKET_ADDRESS_TYPE_INET && fd >= 0) {
8ecc2f9e
MA
562 int ret = socket_set_nodelay(fd);
563 if (ret < 0) {
5197f445 564 warn_report("can't set TCP_NODELAY: %s", strerror(errno));
1b8bbb46
MK
565 }
566 }
33b1db1c 567
dfb12bf8 568 if (fd >= 0) {
f9e8cacc 569 qemu_set_nonblock(fd);
833a7cc3
LY
570 } else {
571 fd = -EIO;
33b1db1c
MK
572 }
573
33b1db1c
MK
574 return fd;
575}
576
833a7cc3 577/* Return 0 on success and -errno in case of error */
e0d93a89
MK
578static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
579 unsigned int *wlen)
47622c44
LY
580{
581 int ret;
582
583 ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
80731d9d 584 if (ret != sizeof(*hdr)) {
47622c44 585 error_report("failed to send a req, %s", strerror(errno));
b16a44e1 586 return -errno;
47622c44
LY
587 }
588
589 ret = qemu_co_send(sockfd, data, *wlen);
80731d9d 590 if (ret != *wlen) {
47622c44 591 error_report("failed to send a req, %s", strerror(errno));
b16a44e1 592 return -errno;
47622c44
LY
593 }
594
595 return ret;
596}
e0d93a89 597
cddd4ac7
MK
/*
 * Arguments and result of one synchronous request, shared between
 * do_req() (which fills it in and waits for finished to become true) and
 * do_co_req() (which performs the exchange, stores the outcome in ret and
 * sets finished).  co is the coroutine running do_co_req(), or NULL when
 * it is not running.
 */
598typedef struct SheepdogReqCo {
599 int sockfd;
f11672db 600 BlockDriverState *bs;
84390bed 601 AioContext *aio_context;
cddd4ac7
MK
602 SheepdogReq *hdr;
603 void *data;
604 unsigned int *wlen;
605 unsigned int *rlen;
606 int ret;
607 bool finished;
9d456654 608 Coroutine *co;
cddd4ac7
MK
609} SheepdogReqCo;
610
9d456654
PB
611static void restart_co_req(void *opaque)
612{
613 SheepdogReqCo *srco = opaque;
614
615 aio_co_wake(srco->co);
616}
617
cddd4ac7 618static coroutine_fn void do_co_req(void *opaque)
47622c44
LY
619{
620 int ret;
cddd4ac7
MK
621 SheepdogReqCo *srco = opaque;
622 int sockfd = srco->sockfd;
623 SheepdogReq *hdr = srco->hdr;
624 void *data = srco->data;
625 unsigned int *wlen = srco->wlen;
626 unsigned int *rlen = srco->rlen;
2dfcca3b 627
9d456654 628 srco->co = qemu_coroutine_self();
dca21ef2 629 aio_set_fd_handler(srco->aio_context, sockfd, false,
9d456654 630 NULL, restart_co_req, NULL, srco);
47622c44 631
47622c44
LY
632 ret = send_co_req(sockfd, hdr, data, wlen);
633 if (ret < 0) {
634 goto out;
635 }
636
dca21ef2 637 aio_set_fd_handler(srco->aio_context, sockfd, false,
9d456654 638 restart_co_req, NULL, NULL, srco);
2dfcca3b 639
47622c44 640 ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
80731d9d 641 if (ret != sizeof(*hdr)) {
47622c44 642 error_report("failed to get a rsp, %s", strerror(errno));
cb595887 643 ret = -errno;
47622c44
LY
644 goto out;
645 }
646
647 if (*rlen > hdr->data_length) {
648 *rlen = hdr->data_length;
649 }
650
651 if (*rlen) {
652 ret = qemu_co_recv(sockfd, data, *rlen);
80731d9d 653 if (ret != *rlen) {
47622c44 654 error_report("failed to get the data, %s", strerror(errno));
cb595887 655 ret = -errno;
47622c44
LY
656 goto out;
657 }
658 }
659 ret = 0;
660out:
ed9ba724
MK
661 /* there is at most one request for this sockfd, so it is safe to
662 * set each handler to NULL. */
dca21ef2 663 aio_set_fd_handler(srco->aio_context, sockfd, false,
f6a51c84 664 NULL, NULL, NULL, NULL);
cddd4ac7 665
9d456654 666 srco->co = NULL;
cddd4ac7 667 srco->ret = ret;
e2a6ae7f
PB
668 /* Set srco->finished before reading bs->wakeup. */
669 atomic_mb_set(&srco->finished, true);
c9d1a561
PB
670 if (srco->bs) {
671 bdrv_wakeup(srco->bs);
672 }
cddd4ac7
MK
673}
674
833a7cc3
LY
675/*
676 * Send the request to the sheep in a synchronous manner.
677 *
678 * Return 0 on success, -errno in case of error.
679 */
f11672db 680static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
84390bed 681 void *data, unsigned int *wlen, unsigned int *rlen)
cddd4ac7
MK
682{
683 Coroutine *co;
684 SheepdogReqCo srco = {
685 .sockfd = sockfd,
f11672db
PB
686 .aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
687 .bs = bs,
cddd4ac7
MK
688 .hdr = hdr,
689 .data = data,
690 .wlen = wlen,
691 .rlen = rlen,
692 .ret = 0,
693 .finished = false,
694 };
695
696 if (qemu_in_coroutine()) {
697 do_co_req(&srco);
698 } else {
0b8b8753 699 co = qemu_coroutine_create(do_co_req, &srco);
f11672db 700 if (bs) {
76296dff 701 bdrv_coroutine_enter(bs, co);
f11672db
PB
702 BDRV_POLL_WHILE(bs, !srco.finished);
703 } else {
704 qemu_coroutine_enter(co);
705 while (!srco.finished) {
706 aio_poll(qemu_get_aio_context(), true);
707 }
cddd4ac7
MK
708 }
709 }
710
711 return srco.ret;
47622c44
LY
712}
713
a37dcdf9 714static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
b544c1ab
HM
715 struct iovec *iov, int niov,
716 enum AIOCBState aiocb_type);
a37dcdf9 717static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req);
72e0996c 718static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag);
356b4ca2 719static int get_sheep_fd(BDRVSheepdogState *s, Error **errp);
011603ca 720static void co_write_request(void *opaque);
7dc1cde0 721
011603ca
MK
/*
 * Drop the current connection, reconnect, and resend outstanding requests.
 * @opaque is the BDRVSheepdogState.
 *
 * Steps: unregister the fd handlers and close s->fd; wake the sending
 * coroutine until s->co_send is NULL; retry get_sheep_fd() until it
 * succeeds, sleeping between attempts; then move every in-flight request
 * to the failed list and resend them one by one.
 */
722static coroutine_fn void reconnect_to_sdog(void *opaque)
723{
724 BDRVSheepdogState *s = opaque;
725 AIOReq *aio_req, *next;
726
dca21ef2 727 aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
f6a51c84 728 NULL, NULL, NULL);
011603ca
MK
729 close(s->fd);
730 s->fd = -1;
731
732 /* Wait for outstanding write requests to be completed. */
733 while (s->co_send != NULL) {
734 co_write_request(opaque);
735 }
736
737 /* Try to reconnect the sheepdog server every one second. */
738 while (s->fd < 0) {
a780dea0 739 Error *local_err = NULL;
356b4ca2 740 s->fd = get_sheep_fd(s, &local_err);
011603ca 741 if (s->fd < 0) {
70018a14 742 trace_sheepdog_reconnect_to_sdog();
565f65d2 743 error_report_err(local_err);
78f1d3d6 744 qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 1000000000ULL);
011603ca
MK
745 }
746 };
747
748 /*
749 * Now we have to resend all the request in the inflight queue. However,
750 * resend_aioreq() can yield and newly created requests can be added to the
751 * inflight queue before the coroutine is resumed. To avoid mixing them, we
752 * have to move all the inflight requests to the failed queue before
753 * resend_aioreq() is called.
754 */
f1af3251 755 qemu_co_mutex_lock(&s->queue_lock);
011603ca
MK
756 QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) {
757 QLIST_REMOVE(aio_req, aio_siblings);
758 QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings);
759 }
760
761 /* Resend all the failed aio requests. */
/* queue_lock is released around each resend_aioreq() call and retaken
 * before the list is examined again. */
762 while (!QLIST_EMPTY(&s->failed_aio_head)) {
763 aio_req = QLIST_FIRST(&s->failed_aio_head);
764 QLIST_REMOVE(aio_req, aio_siblings);
f1af3251 765 qemu_co_mutex_unlock(&s->queue_lock);
011603ca 766 resend_aioreq(s, aio_req);
f1af3251 767 qemu_co_mutex_lock(&s->queue_lock);
011603ca 768 }
f1af3251 769 qemu_co_mutex_unlock(&s->queue_lock);
011603ca
MK
770}
771
33b1db1c
MK
772/*
773 * Receive responses of the I/O requests.
774 *
775 * This function is registered as a fd handler, and called from the
776 * main loop when s->fd is ready for reading responses.
777 */
/*
 * Handle one response.  @opaque is the BDRVSheepdogState.
 *
 * Reads a SheepdogObjRsp from s->fd, finds the in-flight AIOReq whose id
 * matches, performs handling specific to the owning AIOCB's type, and
 * removes the request from its list.  Then, depending on the result code:
 * success or any unhandled error frees the request and wakes the AIOCB's
 * coroutine once its nr_pending reaches zero; SD_RES_READONLY retargets
 * the request at the current VDI id and resends it instead.
 *
 * A failed receive, an unknown response id, or a failed reload_inode()
 * jumps to err, which calls reconnect_to_sdog().
 */
d8716b41 778static void coroutine_fn aio_read_response(void *opaque)
33b1db1c
MK
779{
780 SheepdogObjRsp rsp;
781 BDRVSheepdogState *s = opaque;
782 int fd = s->fd;
783 int ret;
784 AIOReq *aio_req = NULL;
785 SheepdogAIOCB *acb;
cac8f4a6 786 uint64_t idx;
33b1db1c 787
33b1db1c 788 /* read a header */
8c5135f9 789 ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
80731d9d 790 if (ret != sizeof(rsp)) {
6daf194d 791 error_report("failed to get the header, %s", strerror(errno));
011603ca 792 goto err;
33b1db1c
MK
793 }
794
c292ee6a
MK
795 /* find the right aio_req from the inflight aio list */
796 QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) {
33b1db1c
MK
797 if (aio_req->id == rsp.id) {
798 break;
799 }
800 }
801 if (!aio_req) {
6daf194d 802 error_report("cannot find aio_req %x", rsp.id);
011603ca 803 goto err;
33b1db1c
MK
804 }
805
806 acb = aio_req->aiocb;
807
808 switch (acb->aiocb_type) {
809 case AIOCB_WRITE_UDATA:
/* responses for non-data (VDI) objects need no inode bookkeeping */
810 if (!is_data_obj(aio_req->oid)) {
811 break;
812 }
813 idx = data_oid_to_idx(aio_req->oid);
814
b544c1ab 815 if (aio_req->create) {
33b1db1c
MK
816 /*
817 * If the object is newly created one, we need to update
818 * the vdi object (metadata object). min_dirty_data_idx
819 * and max_dirty_data_idx are changed to include updated
820 * index between them.
821 */
bd751f22
LY
822 if (rsp.result == SD_RES_SUCCESS) {
823 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
498f2140
HM
824 acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
825 acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
bd751f22 826 }
33b1db1c
MK
827 }
828 break;
829 case AIOCB_READ_UDATA:
/* the payload follows the header: receive it into the caller's iovec
 * at this request's offset */
2fc8ae1d
MT
830 ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov,
831 aio_req->iov_offset, rsp.data_length);
80731d9d 832 if (ret != rsp.data_length) {
6daf194d 833 error_report("failed to get the data, %s", strerror(errno));
011603ca 834 goto err;
33b1db1c
MK
835 }
836 break;
47783072
LY
837 case AIOCB_FLUSH_CACHE:
/* SD_RES_INVALID_PARMS on a flush is treated as success and switches
 * cache_flags to SD_FLAG_CMD_DIRECT */
838 if (rsp.result == SD_RES_INVALID_PARMS) {
70018a14 839 trace_sheepdog_aio_read_response();
47783072
LY
840 s->cache_flags = SD_FLAG_CMD_DIRECT;
841 rsp.result = SD_RES_SUCCESS;
842 }
843 break;
cac8f4a6
LY
844 case AIOCB_DISCARD_OBJ:
845 switch (rsp.result) {
846 case SD_RES_INVALID_PARMS:
8ecc2f9e 847 error_report("server doesn't support discard command");
cac8f4a6
LY
848 rsp.result = SD_RES_SUCCESS;
849 s->discard_supported = false;
850 break;
cac8f4a6
LY
851 default:
852 break;
853 }
33b1db1c
MK
854 }
855
e80ab33d
PB
856 /* No more data for this aio_req (reload_inode below uses its own file
857 * descriptor handler which doesn't use co_recv).
858 */
859 s->co_recv = NULL;
860
f1af3251 861 qemu_co_mutex_lock(&s->queue_lock);
c4080e93 862 QLIST_REMOVE(aio_req, aio_siblings);
f1af3251
PB
863 qemu_co_mutex_unlock(&s->queue_lock);
864
13c31de2
MK
865 switch (rsp.result) {
866 case SD_RES_SUCCESS:
867 break;
868 case SD_RES_READONLY:
/* reload the inode only when the request targeted the current VDI id,
 * then retarget the request at s->inode.vdi_id and resend it; this path
 * returns without freeing aio_req or decrementing nr_pending */
72e0996c
MK
869 if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) {
870 ret = reload_inode(s, 0, "");
871 if (ret < 0) {
011603ca 872 goto err;
72e0996c
MK
873 }
874 }
72e0996c
MK
875 if (is_data_obj(aio_req->oid)) {
876 aio_req->oid = vid_to_data_oid(s->inode.vdi_id,
877 data_oid_to_idx(aio_req->oid));
878 } else {
879 aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
880 }
a37dcdf9 881 resend_aioreq(s, aio_req);
e80ab33d 882 return;
13c31de2 883 default:
33b1db1c 884 acb->ret = -EIO;
6daf194d 885 error_report("%s", sd_strerror(rsp.result));
13c31de2 886 break;
33b1db1c
MK
887 }
888
c4080e93
PB
889 g_free(aio_req);
890
891 if (!--acb->nr_pending) {
33b1db1c
MK
892 /*
893 * We've finished all requests which belong to the AIOCB, so
2df46246 894 * we can switch back to sd_co_readv/writev now.
33b1db1c 895 */
9d456654 896 aio_co_wake(acb->coroutine);
33b1db1c 897 }
e80ab33d 898
011603ca 899 return;
e80ab33d 900
011603ca 901err:
011603ca 902 reconnect_to_sdog(opaque);
2df46246
MK
903}
904
905static void co_read_response(void *opaque)
906{
907 BDRVSheepdogState *s = opaque;
908
909 if (!s->co_recv) {
0b8b8753 910 s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
2df46246
MK
911 }
912
5eceb01a 913 aio_co_enter(s->aio_context, s->co_recv);
2df46246
MK
914}
915
916static void co_write_request(void *opaque)
917{
918 BDRVSheepdogState *s = opaque;
919
9d456654 920 aio_co_wake(s->co_send);
33b1db1c
MK
921}
922
33b1db1c 923/*
dc6fb73d 924 * Return a socket descriptor to read/write objects.
33b1db1c 925 *
dc6fb73d 926 * We cannot use this descriptor for other operations because
33b1db1c
MK
927 * the block driver may be waiting for a response from the server.
928 */
356b4ca2 929static int get_sheep_fd(BDRVSheepdogState *s, Error **errp)
33b1db1c 930{
1b8bbb46 931 int fd;
33b1db1c 932
356b4ca2 933 fd = connect_to_sdog(s, errp);
33b1db1c 934 if (fd < 0) {
cb595887 935 return fd;
33b1db1c
MK
936 }
937
dca21ef2 938 aio_set_fd_handler(s->aio_context, fd, false,
f6a51c84 939 co_read_response, NULL, NULL, s);
33b1db1c
MK
940 return fd;
941}
942
89e2a31d
MA
943/*
944 * Parse numeric snapshot ID in @str
945 * If @str can't be parsed as number, return false.
946 * Else, if the number is zero or too large, set *@snapid to zero and
947 * return true.
948 * Else, set *@snapid to the number and return true.
949 */
950static bool sd_parse_snapid(const char *str, uint32_t *snapid)
951{
952 unsigned long ul;
953 int ret;
954
955 ret = qemu_strtoul(str, NULL, 10, &ul);
956 if (ret == -ERANGE) {
957 ul = ret = 0;
958 }
959 if (ret) {
960 return false;
961 }
962 if (ul > UINT32_MAX) {
963 ul = 0;
964 }
965
966 *snapid = ul;
967 return true;
968}
969
970static bool sd_parse_snapid_or_tag(const char *str,
971 uint32_t *snapid, char tag[])
972{
973 if (!sd_parse_snapid(str, snapid)) {
974 *snapid = 0;
975 if (g_strlcpy(tag, str, SD_MAX_VDI_TAG_LEN) >= SD_MAX_VDI_TAG_LEN) {
976 return false;
977 }
978 } else if (!*snapid) {
979 return false;
980 } else {
981 tag[0] = 0;
982 }
983 return true;
984}
985
831acdc9
MA
/*
 * Result of parsing a sheepdog URI (see sd_parse_uri()).  path, host and
 * the parsed query values point into the uri/qp objects kept at the end
 * of the struct, so they stay valid only until sd_config_done() is called.
 */
986typedef struct {
987 const char *path; /* non-null iff transport is unix */
988 const char *host; /* valid when transport is tcp */
989 int port; /* valid when transport is tcp */
990 char vdi[SD_MAX_VDI_LEN];
991 char tag[SD_MAX_VDI_TAG_LEN];
992 uint32_t snap_id;
993 /* Remainder is only for sd_config_done() */
994 URI *uri;
995 QueryParams *qp;
996} SheepdogConfig;
997
998static void sd_config_done(SheepdogConfig *cfg)
999{
1000 if (cfg->qp) {
1001 query_params_free(cfg->qp);
1002 }
1003 uri_free(cfg->uri);
1004}
1005
1006static void sd_parse_uri(SheepdogConfig *cfg, const char *filename,
36bcac16 1007 Error **errp)
5d6768e3 1008{
36bcac16 1009 Error *err = NULL;
5d6768e3 1010 QueryParams *qp = NULL;
8ecc2f9e
MA
1011 bool is_unix;
1012 URI *uri;
5d6768e3 1013
831acdc9
MA
1014 memset(cfg, 0, sizeof(*cfg));
1015
1016 cfg->uri = uri = uri_parse(filename);
5d6768e3 1017 if (!uri) {
44acd46f 1018 error_setg(&err, "invalid URI '%s'", filename);
36bcac16 1019 goto out;
5d6768e3
MK
1020 }
1021
1b8bbb46 1022 /* transport */
f69165a8 1023 if (!g_strcmp0(uri->scheme, "sheepdog")) {
8ecc2f9e 1024 is_unix = false;
f69165a8 1025 } else if (!g_strcmp0(uri->scheme, "sheepdog+tcp")) {
8ecc2f9e 1026 is_unix = false;
f69165a8 1027 } else if (!g_strcmp0(uri->scheme, "sheepdog+unix")) {
8ecc2f9e 1028 is_unix = true;
1b8bbb46 1029 } else {
36bcac16
MA
1030 error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
1031 " or 'sheepdog+unix'");
1b8bbb46
MK
1032 goto out;
1033 }
1034
5d6768e3 1035 if (uri->path == NULL || !strcmp(uri->path, "/")) {
36bcac16 1036 error_setg(&err, "missing file path in URI");
5d6768e3
MK
1037 goto out;
1038 }
831acdc9
MA
1039 if (g_strlcpy(cfg->vdi, uri->path + 1, SD_MAX_VDI_LEN)
1040 >= SD_MAX_VDI_LEN) {
36bcac16 1041 error_setg(&err, "VDI name is too long");
daa0b0d4
MA
1042 goto out;
1043 }
5d6768e3 1044
831acdc9 1045 cfg->qp = qp = query_params_parse(uri->query);
1b8bbb46 1046
8ecc2f9e 1047 if (is_unix) {
1b8bbb46 1048 /* sheepdog+unix:///vdiname?socket=path */
36bcac16
MA
1049 if (uri->server || uri->port) {
1050 error_setg(&err, "URI scheme %s doesn't accept a server address",
1051 uri->scheme);
1052 goto out;
1053 }
1054 if (!qp->n) {
1055 error_setg(&err,
1056 "URI scheme %s requires query parameter 'socket'",
1057 uri->scheme);
1058 goto out;
1059 }
1060 if (qp->n != 1 || strcmp(qp->p[0].name, "socket")) {
1061 error_setg(&err, "unexpected query parameters");
1b8bbb46
MK
1062 goto out;
1063 }
831acdc9 1064 cfg->path = qp->p[0].value;
1b8bbb46
MK
1065 } else {
1066 /* sheepdog[+tcp]://[host:port]/vdiname */
36bcac16
MA
1067 if (qp->n) {
1068 error_setg(&err, "unexpected query parameters");
1069 goto out;
1070 }
831acdc9
MA
1071 cfg->host = uri->server;
1072 cfg->port = uri->port;
1b8bbb46 1073 }
5d6768e3
MK
1074
1075 /* snapshot tag */
1076 if (uri->fragment) {
831acdc9
MA
1077 if (!sd_parse_snapid_or_tag(uri->fragment,
1078 &cfg->snap_id, cfg->tag)) {
36bcac16
MA
1079 error_setg(&err, "'%s' is not a valid snapshot ID",
1080 uri->fragment);
89e2a31d 1081 goto out;
5d6768e3
MK
1082 }
1083 } else {
831acdc9 1084 cfg->snap_id = CURRENT_VDI_ID; /* search current vdi */
5d6768e3
MK
1085 }
1086
1087out:
8ecc2f9e
MA
1088 if (err) {
1089 error_propagate(errp, err);
831acdc9 1090 sd_config_done(cfg);
5d6768e3 1091 }
5d6768e3
MK
1092}
1093
33b1db1c 1094/*
5d6768e3 1095 * Parse a filename (old syntax)
33b1db1c
MK
1096 *
1097 * filename must be one of the following formats:
1098 * 1. [vdiname]
1099 * 2. [vdiname]:[snapid]
1100 * 3. [vdiname]:[tag]
1101 * 4. [hostname]:[port]:[vdiname]
1102 * 5. [hostname]:[port]:[vdiname]:[snapid]
1103 * 6. [hostname]:[port]:[vdiname]:[tag]
1104 *
1105 * You can boot from the snapshot images by specifying `snapid` or
1106 * `tag'.
1107 *
1108 * You can run VMs outside the Sheepdog cluster by specifying
1109 * `hostname' and `port' (experimental).
1110 */
831acdc9 1111static void parse_vdiname(SheepdogConfig *cfg, const char *filename,
36bcac16 1112 Error **errp)
33b1db1c 1113{
36bcac16 1114 Error *err = NULL;
5d6768e3
MK
1115 char *p, *q, *uri;
1116 const char *host_spec, *vdi_spec;
36bcac16 1117 int nr_sep;
33b1db1c 1118
11d816a5 1119 strstart(filename, "sheepdog:", &filename);
7267c094 1120 p = q = g_strdup(filename);
33b1db1c
MK
1121
1122 /* count the number of separators */
1123 nr_sep = 0;
1124 while (*p) {
1125 if (*p == ':') {
1126 nr_sep++;
1127 }
1128 p++;
1129 }
1130 p = q;
1131
5d6768e3 1132 /* use the first two tokens as host_spec. */
33b1db1c 1133 if (nr_sep >= 2) {
5d6768e3 1134 host_spec = p;
33b1db1c 1135 p = strchr(p, ':');
5d6768e3 1136 p++;
33b1db1c
MK
1137 p = strchr(p, ':');
1138 *p++ = '\0';
1139 } else {
5d6768e3 1140 host_spec = "";
33b1db1c
MK
1141 }
1142
5d6768e3 1143 vdi_spec = p;
33b1db1c 1144
5d6768e3 1145 p = strchr(vdi_spec, ':');
33b1db1c 1146 if (p) {
5d6768e3 1147 *p++ = '#';
33b1db1c
MK
1148 }
1149
5d6768e3 1150 uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec);
33b1db1c 1151
36bcac16
MA
1152 /*
1153 * FIXME We to escape URI meta-characters, e.g. "x?y=z"
1154 * produces "sheepdog://x?y=z". Because of that ...
1155 */
831acdc9 1156 sd_parse_uri(cfg, uri, &err);
36bcac16
MA
1157 if (err) {
1158 /*
1159 * ... this can fail, but the error message is misleading.
1160 * Replace it by the traditional useless one until the
1161 * escaping is fixed.
1162 */
1163 error_free(err);
1164 error_setg(errp, "Can't parse filename");
1165 }
5d6768e3
MK
1166
1167 g_free(q);
1168 g_free(uri);
33b1db1c
MK
1169}
1170
831acdc9
MA
1171static void sd_parse_filename(const char *filename, QDict *options,
1172 Error **errp)
1173{
1174 Error *err = NULL;
1175 SheepdogConfig cfg;
1176 char buf[32];
1177
1178 if (strstr(filename, "://")) {
1179 sd_parse_uri(&cfg, filename, &err);
1180 } else {
1181 parse_vdiname(&cfg, filename, &err);
1182 }
1183 if (err) {
1184 error_propagate(errp, err);
1185 return;
1186 }
1187
831acdc9 1188 if (cfg.path) {
d1c13688
MA
1189 qdict_set_default_str(options, "server.path", cfg.path);
1190 qdict_set_default_str(options, "server.type", "unix");
1191 } else {
1192 qdict_set_default_str(options, "server.type", "inet");
1193 qdict_set_default_str(options, "server.host",
1194 cfg.host ?: SD_DEFAULT_ADDR);
1195 snprintf(buf, sizeof(buf), "%d", cfg.port ?: SD_DEFAULT_PORT);
1196 qdict_set_default_str(options, "server.port", buf);
831acdc9
MA
1197 }
1198 qdict_set_default_str(options, "vdi", cfg.vdi);
1199 qdict_set_default_str(options, "tag", cfg.tag);
1200 if (cfg.snap_id) {
1201 snprintf(buf, sizeof(buf), "%d", cfg.snap_id);
1202 qdict_set_default_str(options, "snap-id", buf);
1203 }
1204
1205 sd_config_done(&cfg);
1206}
1207
982dcbf4
MK
1208static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
1209 uint32_t snapid, const char *tag, uint32_t *vid,
dc83cd42 1210 bool lock, Error **errp)
33b1db1c
MK
1211{
1212 int ret, fd;
1213 SheepdogVdiReq hdr;
1214 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1215 unsigned int wlen, rlen = 0;
97b583f4 1216 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN] QEMU_NONSTRING;
33b1db1c 1217
dc83cd42 1218 fd = connect_to_sdog(s, errp);
33b1db1c 1219 if (fd < 0) {
cb595887 1220 return fd;
33b1db1c
MK
1221 }
1222
3178e275
JM
1223 /* This pair of strncpy calls ensures that the buffer is zero-filled,
1224 * which is desirable since we'll soon be sending those bytes, and
1225 * don't want the send_req to read uninitialized data.
1226 */
33b1db1c
MK
1227 strncpy(buf, filename, SD_MAX_VDI_LEN);
1228 strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
1229
1230 memset(&hdr, 0, sizeof(hdr));
982dcbf4 1231 if (lock) {
33b1db1c 1232 hdr.opcode = SD_OP_LOCK_VDI;
1dbfafed 1233 hdr.type = LOCK_TYPE_NORMAL;
982dcbf4
MK
1234 } else {
1235 hdr.opcode = SD_OP_GET_VDI_INFO;
33b1db1c
MK
1236 }
1237 wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN;
1238 hdr.proto_ver = SD_PROTO_VER;
1239 hdr.data_length = wlen;
1240 hdr.snapid = snapid;
1241 hdr.flags = SD_FLAG_CMD_WRITE;
1242
f11672db 1243 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c 1244 if (ret) {
dc83cd42 1245 error_setg_errno(errp, -ret, "cannot get vdi info");
33b1db1c
MK
1246 goto out;
1247 }
1248
1249 if (rsp->result != SD_RES_SUCCESS) {
dc83cd42
MA
1250 error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s",
1251 sd_strerror(rsp->result), filename, snapid, tag);
cb595887
MK
1252 if (rsp->result == SD_RES_NO_VDI) {
1253 ret = -ENOENT;
38890b24
HM
1254 } else if (rsp->result == SD_RES_VDI_LOCKED) {
1255 ret = -EBUSY;
cb595887
MK
1256 } else {
1257 ret = -EIO;
1258 }
33b1db1c
MK
1259 goto out;
1260 }
1261 *vid = rsp->vdi_id;
1262
1263 ret = 0;
1264out:
1265 closesocket(fd);
1266 return ret;
1267}
1268
a37dcdf9 1269static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
b544c1ab
HM
1270 struct iovec *iov, int niov,
1271 enum AIOCBState aiocb_type)
33b1db1c
MK
1272{
1273 int nr_copies = s->inode.nr_copies;
1274 SheepdogObjReq hdr;
47783072 1275 unsigned int wlen = 0;
33b1db1c
MK
1276 int ret;
1277 uint64_t oid = aio_req->oid;
1278 unsigned int datalen = aio_req->data_len;
1279 uint64_t offset = aio_req->offset;
1280 uint8_t flags = aio_req->flags;
1281 uint64_t old_oid = aio_req->base_oid;
b544c1ab 1282 bool create = aio_req->create;
33b1db1c 1283
f1af3251 1284 qemu_co_mutex_lock(&s->queue_lock);
c4080e93 1285 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
f1af3251 1286 qemu_co_mutex_unlock(&s->queue_lock);
c4080e93 1287
33b1db1c 1288 if (!nr_copies) {
6daf194d 1289 error_report("bug");
33b1db1c
MK
1290 }
1291
1292 memset(&hdr, 0, sizeof(hdr));
1293
47783072
LY
1294 switch (aiocb_type) {
1295 case AIOCB_FLUSH_CACHE:
1296 hdr.opcode = SD_OP_FLUSH_VDI;
1297 break;
1298 case AIOCB_READ_UDATA:
33b1db1c
MK
1299 hdr.opcode = SD_OP_READ_OBJ;
1300 hdr.flags = flags;
47783072
LY
1301 break;
1302 case AIOCB_WRITE_UDATA:
1303 if (create) {
1304 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1305 } else {
1306 hdr.opcode = SD_OP_WRITE_OBJ;
1307 }
33b1db1c 1308 wlen = datalen;
33b1db1c 1309 hdr.flags = SD_FLAG_CMD_WRITE | flags;
47783072 1310 break;
cac8f4a6 1311 case AIOCB_DISCARD_OBJ:
e6fd57ea
HM
1312 hdr.opcode = SD_OP_WRITE_OBJ;
1313 hdr.flags = SD_FLAG_CMD_WRITE | flags;
1314 s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
1315 offset = offsetof(SheepdogInode,
1316 data_vdi_id[data_oid_to_idx(oid)]);
1317 oid = vid_to_vdi_oid(s->inode.vdi_id);
1318 wlen = datalen = sizeof(uint32_t);
cac8f4a6 1319 break;
33b1db1c
MK
1320 }
1321
0e7106d8
LY
1322 if (s->cache_flags) {
1323 hdr.flags |= s->cache_flags;
47622c44
LY
1324 }
1325
33b1db1c
MK
1326 hdr.oid = oid;
1327 hdr.cow_oid = old_oid;
1328 hdr.copies = s->inode.nr_copies;
1329
1330 hdr.data_length = datalen;
1331 hdr.offset = offset;
1332
1333 hdr.id = aio_req->id;
1334
2df46246
MK
1335 qemu_co_mutex_lock(&s->lock);
1336 s->co_send = qemu_coroutine_self();
dca21ef2 1337 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1338 co_read_response, co_write_request, NULL, s);
128aa589 1339 socket_set_cork(s->fd, 1);
33b1db1c
MK
1340
1341 /* send a header */
8c5135f9 1342 ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
80731d9d 1343 if (ret != sizeof(hdr)) {
6daf194d 1344 error_report("failed to send a req, %s", strerror(errno));
011603ca 1345 goto out;
33b1db1c
MK
1346 }
1347
1348 if (wlen) {
2fc8ae1d 1349 ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen);
80731d9d 1350 if (ret != wlen) {
6daf194d 1351 error_report("failed to send a data, %s", strerror(errno));
33b1db1c
MK
1352 }
1353 }
011603ca 1354out:
128aa589 1355 socket_set_cork(s->fd, 0);
dca21ef2 1356 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1357 co_read_response, NULL, NULL, s);
011603ca 1358 s->co_send = NULL;
2df46246 1359 qemu_co_mutex_unlock(&s->lock);
33b1db1c
MK
1360}
1361
f11672db 1362static int read_write_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1363 uint64_t oid, uint8_t copies,
33b1db1c 1364 unsigned int datalen, uint64_t offset,
0e7106d8 1365 bool write, bool create, uint32_t cache_flags)
33b1db1c
MK
1366{
1367 SheepdogObjReq hdr;
1368 SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
1369 unsigned int wlen, rlen;
1370 int ret;
1371
1372 memset(&hdr, 0, sizeof(hdr));
1373
1374 if (write) {
1375 wlen = datalen;
1376 rlen = 0;
1377 hdr.flags = SD_FLAG_CMD_WRITE;
1378 if (create) {
1379 hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ;
1380 } else {
1381 hdr.opcode = SD_OP_WRITE_OBJ;
1382 }
1383 } else {
1384 wlen = 0;
1385 rlen = datalen;
1386 hdr.opcode = SD_OP_READ_OBJ;
1387 }
47622c44 1388
0e7106d8 1389 hdr.flags |= cache_flags;
47622c44 1390
33b1db1c
MK
1391 hdr.oid = oid;
1392 hdr.data_length = datalen;
1393 hdr.offset = offset;
1394 hdr.copies = copies;
1395
f11672db 1396 ret = do_req(fd, bs, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c 1397 if (ret) {
6daf194d 1398 error_report("failed to send a request to the sheep");
cb595887 1399 return ret;
33b1db1c
MK
1400 }
1401
1402 switch (rsp->result) {
1403 case SD_RES_SUCCESS:
1404 return 0;
1405 default:
6daf194d 1406 error_report("%s", sd_strerror(rsp->result));
cb595887 1407 return -EIO;
33b1db1c
MK
1408 }
1409}
1410
f11672db 1411static int read_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1412 uint64_t oid, uint8_t copies,
0e7106d8
LY
1413 unsigned int datalen, uint64_t offset,
1414 uint32_t cache_flags)
33b1db1c 1415{
f11672db 1416 return read_write_object(fd, bs, buf, oid, copies,
84390bed 1417 datalen, offset, false,
0e7106d8 1418 false, cache_flags);
33b1db1c
MK
1419}
1420
f11672db 1421static int write_object(int fd, BlockDriverState *bs, char *buf,
84390bed 1422 uint64_t oid, uint8_t copies,
2f536801 1423 unsigned int datalen, uint64_t offset, bool create,
0e7106d8 1424 uint32_t cache_flags)
33b1db1c 1425{
f11672db 1426 return read_write_object(fd, bs, buf, oid, copies,
84390bed 1427 datalen, offset, true,
0e7106d8 1428 create, cache_flags);
33b1db1c
MK
1429}
1430
9ff53a0e
MK
1431/* update inode with the latest state */
1432static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag)
1433{
dfb12bf8 1434 Error *local_err = NULL;
9ff53a0e
MK
1435 SheepdogInode *inode;
1436 int ret = 0, fd;
1437 uint32_t vid = 0;
1438
dfb12bf8 1439 fd = connect_to_sdog(s, &local_err);
9ff53a0e 1440 if (fd < 0) {
565f65d2 1441 error_report_err(local_err);
9ff53a0e
MK
1442 return -EIO;
1443 }
1444
5d039bab 1445 inode = g_malloc(SD_INODE_HEADER_SIZE);
9ff53a0e 1446
dc83cd42 1447 ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err);
9ff53a0e 1448 if (ret) {
565f65d2 1449 error_report_err(local_err);
9ff53a0e
MK
1450 goto out;
1451 }
1452
f11672db 1453 ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
5d039bab
HM
1454 s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
1455 s->cache_flags);
9ff53a0e
MK
1456 if (ret < 0) {
1457 goto out;
1458 }
1459
1460 if (inode->vdi_id != s->inode.vdi_id) {
5d039bab 1461 memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE);
9ff53a0e
MK
1462 }
1463
1464out:
1465 g_free(inode);
1466 closesocket(fd);
1467
1468 return ret;
1469}
1470
a37dcdf9 1471static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req)
13c31de2
MK
1472{
1473 SheepdogAIOCB *acb = aio_req->aiocb;
b544c1ab
HM
1474
1475 aio_req->create = false;
13c31de2
MK
1476
1477 /* check whether this request becomes a CoW one */
2412aec7 1478 if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) {
13c31de2 1479 int idx = data_oid_to_idx(aio_req->oid);
13c31de2 1480
13c31de2
MK
1481 if (is_data_obj_writable(&s->inode, idx)) {
1482 goto out;
1483 }
1484
80308d33
MK
1485 if (s->inode.data_vdi_id[idx]) {
1486 aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx);
1487 aio_req->flags |= SD_FLAG_CMD_COW;
1488 }
b544c1ab 1489 aio_req->create = true;
13c31de2
MK
1490 }
1491out:
2412aec7 1492 if (is_data_obj(aio_req->oid)) {
b544c1ab 1493 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
a37dcdf9 1494 acb->aiocb_type);
2412aec7
MK
1495 } else {
1496 struct iovec iov;
1497 iov.iov_base = &s->inode;
1498 iov.iov_len = sizeof(s->inode);
b544c1ab 1499 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
2412aec7 1500 }
13c31de2
MK
1501}
1502
84390bed
SH
1503static void sd_detach_aio_context(BlockDriverState *bs)
1504{
1505 BDRVSheepdogState *s = bs->opaque;
1506
dca21ef2 1507 aio_set_fd_handler(s->aio_context, s->fd, false, NULL,
f6a51c84 1508 NULL, NULL, NULL);
84390bed
SH
1509}
1510
1511static void sd_attach_aio_context(BlockDriverState *bs,
1512 AioContext *new_context)
1513{
1514 BDRVSheepdogState *s = bs->opaque;
1515
1516 s->aio_context = new_context;
dca21ef2 1517 aio_set_fd_handler(new_context, s->fd, false,
f6a51c84 1518 co_read_response, NULL, NULL, s);
84390bed
SH
1519}
1520
c8c96350
KW
1521static QemuOptsList runtime_opts = {
1522 .name = "sheepdog",
1523 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
1524 .desc = {
831acdc9
MA
1525 {
1526 .name = "vdi",
1527 .type = QEMU_OPT_STRING,
1528 },
1529 {
1530 .name = "snap-id",
1531 .type = QEMU_OPT_NUMBER,
1532 },
1533 {
1534 .name = "tag",
c8c96350 1535 .type = QEMU_OPT_STRING,
c8c96350
KW
1536 },
1537 { /* end of list */ }
1538 },
1539};
1540
015a1036
HR
1541static int sd_open(BlockDriverState *bs, QDict *options, int flags,
1542 Error **errp)
33b1db1c
MK
1543{
1544 int ret, fd;
1545 uint32_t vid = 0;
1546 BDRVSheepdogState *s = bs->opaque;
d1c13688 1547 const char *vdi, *snap_id_str, *tag;
831acdc9 1548 uint64_t snap_id;
33b1db1c 1549 char *buf = NULL;
c8c96350
KW
1550 QemuOpts *opts;
1551 Error *local_err = NULL;
c8c96350 1552
011603ca 1553 s->bs = bs;
84390bed 1554 s->aio_context = bdrv_get_aio_context(bs);
011603ca 1555
87ea75d5 1556 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
c8c96350 1557 qemu_opts_absorb_qdict(opts, options, &local_err);
84d18f06 1558 if (local_err) {
e67c3993 1559 error_propagate(errp, local_err);
c8c96350 1560 ret = -EINVAL;
cbc488ee 1561 goto err_no_fd;
c8c96350
KW
1562 }
1563
d1c13688
MA
1564 s->addr = sd_server_config(options, errp);
1565 if (!s->addr) {
1566 ret = -EINVAL;
1567 goto err_no_fd;
1568 }
1569
831acdc9
MA
1570 vdi = qemu_opt_get(opts, "vdi");
1571 snap_id_str = qemu_opt_get(opts, "snap-id");
1572 snap_id = qemu_opt_get_number(opts, "snap-id", CURRENT_VDI_ID);
1573 tag = qemu_opt_get(opts, "tag");
33b1db1c 1574
831acdc9
MA
1575 if (!vdi) {
1576 error_setg(errp, "parameter 'vdi' is missing");
1577 ret = -EINVAL;
1578 goto err_no_fd;
1579 }
1580 if (strlen(vdi) >= SD_MAX_VDI_LEN) {
1581 error_setg(errp, "value of parameter 'vdi' is too long");
1582 ret = -EINVAL;
1583 goto err_no_fd;
1584 }
33b1db1c 1585
831acdc9
MA
1586 if (snap_id > UINT32_MAX) {
1587 snap_id = 0;
1588 }
1589 if (snap_id_str && !snap_id) {
1590 error_setg(errp, "'snap-id=%s' is not a valid snapshot ID",
1591 snap_id_str);
1592 ret = -EINVAL;
1593 goto err_no_fd;
1594 }
5d6768e3 1595
831acdc9
MA
1596 if (!tag) {
1597 tag = "";
5d6768e3 1598 }
ac90dad9 1599 if (strlen(tag) >= SD_MAX_VDI_TAG_LEN) {
831acdc9 1600 error_setg(errp, "value of parameter 'tag' is too long");
36bcac16 1601 ret = -EINVAL;
cbc488ee 1602 goto err_no_fd;
33b1db1c 1603 }
831acdc9 1604
831acdc9
MA
1605 QLIST_INIT(&s->inflight_aio_head);
1606 QLIST_INIT(&s->failed_aio_head);
1607 QLIST_INIT(&s->inflight_aiocb_head);
1608
e67c3993 1609 s->fd = get_sheep_fd(s, errp);
33b1db1c 1610 if (s->fd < 0) {
cb595887 1611 ret = s->fd;
cbc488ee 1612 goto err_no_fd;
33b1db1c
MK
1613 }
1614
831acdc9 1615 ret = find_vdi_name(s, vdi, (uint32_t)snap_id, tag, &vid, true, errp);
33b1db1c 1616 if (ret) {
cbc488ee 1617 goto err;
33b1db1c
MK
1618 }
1619
0e7106d8
LY
1620 /*
1621 * QEMU block layer emulates writethrough cache as 'writeback + flush', so
1622 * we always set SD_FLAG_CMD_CACHE (writeback cache) as default.
1623 */
1624 s->cache_flags = SD_FLAG_CMD_CACHE;
1625 if (flags & BDRV_O_NOCACHE) {
1626 s->cache_flags = SD_FLAG_CMD_DIRECT;
1627 }
cac8f4a6 1628 s->discard_supported = true;
0e7106d8 1629
831acdc9 1630 if (snap_id || tag[0]) {
70018a14 1631 trace_sheepdog_open(vid);
2f536801 1632 s->is_snapshot = true;
33b1db1c
MK
1633 }
1634
e67c3993 1635 fd = connect_to_sdog(s, errp);
33b1db1c 1636 if (fd < 0) {
cb595887 1637 ret = fd;
cbc488ee 1638 goto err;
33b1db1c
MK
1639 }
1640
7267c094 1641 buf = g_malloc(SD_INODE_SIZE);
f11672db 1642 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
84390bed 1643 0, SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
1644
1645 closesocket(fd);
1646
1647 if (ret) {
efde4b62 1648 error_setg(errp, "Can't read snapshot inode");
cbc488ee 1649 goto err;
33b1db1c
MK
1650 }
1651
1652 memcpy(&s->inode, buf, sizeof(s->inode));
33b1db1c 1653
e8bfaa2f 1654 bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
3178e275 1655 pstrcpy(s->name, sizeof(s->name), vdi);
2df46246 1656 qemu_co_mutex_init(&s->lock);
f1af3251 1657 qemu_co_mutex_init(&s->queue_lock);
498f2140 1658 qemu_co_queue_init(&s->overlapping_queue);
c8c96350 1659 qemu_opts_del(opts);
7267c094 1660 g_free(buf);
33b1db1c 1661 return 0;
cbc488ee
MA
1662
1663err:
dca21ef2 1664 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
f6a51c84 1665 false, NULL, NULL, NULL, NULL);
cbc488ee
MA
1666 closesocket(s->fd);
1667err_no_fd:
c8c96350 1668 qemu_opts_del(opts);
7267c094 1669 g_free(buf);
cb595887 1670 return ret;
33b1db1c
MK
1671}
1672
4da65c80
LY
1673static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
1674 Error **errp)
1675{
1676 BDRVSheepdogState *s = state->bs->opaque;
1677 BDRVSheepdogReopenState *re_s;
1678 int ret = 0;
1679
1680 re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);
1681
1682 re_s->cache_flags = SD_FLAG_CMD_CACHE;
1683 if (state->flags & BDRV_O_NOCACHE) {
1684 re_s->cache_flags = SD_FLAG_CMD_DIRECT;
1685 }
1686
1687 re_s->fd = get_sheep_fd(s, errp);
1688 if (re_s->fd < 0) {
1689 ret = re_s->fd;
1690 return ret;
1691 }
1692
1693 return ret;
1694}
1695
1696static void sd_reopen_commit(BDRVReopenState *state)
1697{
1698 BDRVSheepdogReopenState *re_s = state->opaque;
1699 BDRVSheepdogState *s = state->bs->opaque;
1700
1701 if (s->fd) {
dca21ef2 1702 aio_set_fd_handler(s->aio_context, s->fd, false,
f6a51c84 1703 NULL, NULL, NULL, NULL);
4da65c80
LY
1704 closesocket(s->fd);
1705 }
1706
1707 s->fd = re_s->fd;
1708 s->cache_flags = re_s->cache_flags;
1709
1710 g_free(state->opaque);
1711 state->opaque = NULL;
1712
1713 return;
1714}
1715
1716static void sd_reopen_abort(BDRVReopenState *state)
1717{
1718 BDRVSheepdogReopenState *re_s = state->opaque;
1719 BDRVSheepdogState *s = state->bs->opaque;
1720
1721 if (re_s == NULL) {
1722 return;
1723 }
1724
1725 if (re_s->fd) {
dca21ef2 1726 aio_set_fd_handler(s->aio_context, re_s->fd, false,
f6a51c84 1727 NULL, NULL, NULL, NULL);
4da65c80
LY
1728 closesocket(re_s->fd);
1729 }
1730
1731 g_free(state->opaque);
1732 state->opaque = NULL;
1733
1734 return;
1735}
1736
7d2d3e74
MA
1737static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
1738 Error **errp)
33b1db1c
MK
1739{
1740 SheepdogVdiReq hdr;
1741 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
1742 int fd, ret;
1743 unsigned int wlen, rlen = 0;
1744 char buf[SD_MAX_VDI_LEN];
1745
7d2d3e74 1746 fd = connect_to_sdog(s, errp);
33b1db1c 1747 if (fd < 0) {
cb595887 1748 return fd;
33b1db1c
MK
1749 }
1750
3178e275
JM
1751 /* FIXME: would it be better to fail (e.g., return -EIO) when filename
1752 * does not fit in buf? For now, just truncate and avoid buffer overrun.
1753 */
33b1db1c 1754 memset(buf, 0, sizeof(buf));
c31d482f 1755 pstrcpy(buf, sizeof(buf), s->name);
33b1db1c
MK
1756
1757 memset(&hdr, 0, sizeof(hdr));
1758 hdr.opcode = SD_OP_NEW_VDI;
9f23fce7 1759 hdr.base_vdi_id = s->inode.vdi_id;
33b1db1c
MK
1760
1761 wlen = SD_MAX_VDI_LEN;
1762
1763 hdr.flags = SD_FLAG_CMD_WRITE;
1764 hdr.snapid = snapshot;
1765
1766 hdr.data_length = wlen;
c31d482f
LY
1767 hdr.vdi_size = s->inode.vdi_size;
1768 hdr.copy_policy = s->inode.copy_policy;
b3af018f 1769 hdr.copies = s->inode.nr_copies;
876eb1b0 1770 hdr.block_size_shift = s->inode.block_size_shift;
33b1db1c 1771
f11672db 1772 ret = do_req(fd, NULL, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
33b1db1c
MK
1773
1774 closesocket(fd);
1775
1776 if (ret) {
7d2d3e74 1777 error_setg_errno(errp, -ret, "create failed");
cb595887 1778 return ret;
33b1db1c
MK
1779 }
1780
1781 if (rsp->result != SD_RES_SUCCESS) {
7d2d3e74 1782 error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name);
33b1db1c
MK
1783 return -EIO;
1784 }
1785
1786 if (vdi_id) {
1787 *vdi_id = rsp->vdi_id;
1788 }
1789
1790 return 0;
1791}
1792
1a62baf6
HR
1793static int sd_prealloc(BlockDriverState *bs, int64_t old_size, int64_t new_size,
1794 Error **errp)
a8e0fdd7 1795{
fba98d45 1796 BlockBackend *blk = NULL;
8b9ad56e 1797 BDRVSheepdogState *base = bs->opaque;
876eb1b0 1798 unsigned long buf_size;
a8e0fdd7 1799 uint32_t idx, max_idx;
876eb1b0 1800 uint32_t object_size;
876eb1b0 1801 void *buf = NULL;
a8e0fdd7
MK
1802 int ret;
1803
d861ab3a
KW
1804 blk = blk_new(bdrv_get_aio_context(bs),
1805 BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
8b9ad56e
HR
1806 BLK_PERM_ALL);
1807
1808 ret = blk_insert_bs(blk, bs, errp);
1809 if (ret < 0) {
318df29e 1810 goto out_with_err_set;
a8e0fdd7
MK
1811 }
1812
fba98d45
KW
1813 blk_set_allow_write_beyond_eof(blk, true);
1814
876eb1b0
TI
1815 object_size = (UINT32_C(1) << base->inode.block_size_shift);
1816 buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
1817 buf = g_malloc0(buf_size);
1818
1a62baf6 1819 max_idx = DIV_ROUND_UP(new_size, buf_size);
a8e0fdd7 1820
1a62baf6 1821 for (idx = old_size / buf_size; idx < max_idx; idx++) {
a8e0fdd7
MK
1822 /*
1823 * The created image can be a cloned image, so we need to read
1824 * a data from the source image.
1825 */
fba98d45 1826 ret = blk_pread(blk, idx * buf_size, buf, buf_size);
a8e0fdd7
MK
1827 if (ret < 0) {
1828 goto out;
1829 }
8341f00d 1830 ret = blk_pwrite(blk, idx * buf_size, buf, buf_size, 0);
a8e0fdd7
MK
1831 if (ret < 0) {
1832 goto out;
1833 }
1834 }
318df29e 1835
fba98d45 1836 ret = 0;
a8e0fdd7 1837out:
318df29e
MA
1838 if (ret < 0) {
1839 error_setg_errno(errp, -ret, "Can't pre-allocate");
1840 }
1841out_with_err_set:
ae8622ec 1842 blk_unref(blk);
7267c094 1843 g_free(buf);
a8e0fdd7
MK
1844
1845 return ret;
1846}
1847
63fd65a0
KW
1848static int sd_create_prealloc(BlockdevOptionsSheepdog *location, int64_t size,
1849 Error **errp)
1850{
1851 BlockDriverState *bs;
1852 Visitor *v;
1853 QObject *obj = NULL;
1854 QDict *qdict;
1855 Error *local_err = NULL;
1856 int ret;
1857
1858 v = qobject_output_visitor_new(&obj);
1859 visit_type_BlockdevOptionsSheepdog(v, NULL, &location, &local_err);
1860 visit_free(v);
1861
1862 if (local_err) {
1863 error_propagate(errp, local_err);
cb3e7f08 1864 qobject_unref(obj);
63fd65a0
KW
1865 return -EINVAL;
1866 }
1867
7dc847eb 1868 qdict = qobject_to(QDict, obj);
63fd65a0
KW
1869 qdict_flatten(qdict);
1870
1871 qdict_put_str(qdict, "driver", "sheepdog");
1872
1873 bs = bdrv_open(NULL, NULL, qdict, BDRV_O_PROTOCOL | BDRV_O_RDWR, errp);
1874 if (bs == NULL) {
1875 ret = -EIO;
1876 goto fail;
1877 }
1878
1879 ret = sd_prealloc(bs, 0, size, errp);
1880fail:
1881 bdrv_unref(bs);
cb3e7f08 1882 qobject_unref(qdict);
63fd65a0
KW
1883 return ret;
1884}
1885
a595e4bc
KW
1886static int parse_redundancy(BDRVSheepdogState *s, SheepdogRedundancy *opt)
1887{
1888 struct SheepdogInode *inode = &s->inode;
1889
1890 switch (opt->type) {
1891 case SHEEPDOG_REDUNDANCY_TYPE_FULL:
1892 if (opt->u.full.copies > SD_MAX_COPIES || opt->u.full.copies < 1) {
1893 return -EINVAL;
1894 }
1895 inode->copy_policy = 0;
1896 inode->nr_copies = opt->u.full.copies;
1897 return 0;
1898
1899 case SHEEPDOG_REDUNDANCY_TYPE_ERASURE_CODED:
1900 {
1901 int64_t copy = opt->u.erasure_coded.data_strips;
1902 int64_t parity = opt->u.erasure_coded.parity_strips;
1903
1904 if (copy != 2 && copy != 4 && copy != 8 && copy != 16) {
1905 return -EINVAL;
1906 }
1907
1908 if (parity >= SD_EC_MAX_STRIP || parity < 1) {
1909 return -EINVAL;
1910 }
1911
1912 /*
1913 * 4 bits for parity and 4 bits for data.
1914 * We have to compress upper data bits because it can't represent 16
1915 */
1916 inode->copy_policy = ((copy / 2) << 4) + parity;
1917 inode->nr_copies = copy + parity;
1918 return 0;
1919 }
1920
1921 default:
1922 g_assert_not_reached();
1923 }
1924
1925 return -EINVAL;
1926}
1927
b3af018f
LY
1928/*
1929 * Sheepdog support two kinds of redundancy, full replication and erasure
1930 * coding.
1931 *
1932 * # create a fully replicated vdi with x copies
1933 * -o redundancy=x (1 <= x <= SD_MAX_COPIES)
1934 *
1935 * # create a erasure coded vdi with x data strips and y parity strips
1936 * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP)
1937 */
63fd65a0 1938static SheepdogRedundancy *parse_redundancy_str(const char *opt)
b3af018f 1939{
63fd65a0 1940 SheepdogRedundancy *redundancy;
b3af018f
LY
1941 const char *n1, *n2;
1942 long copy, parity;
1943 char p[10];
a595e4bc 1944 int ret;
b3af018f
LY
1945
1946 pstrcpy(p, sizeof(p), opt);
1947 n1 = strtok(p, ":");
1948 n2 = strtok(NULL, ":");
1949
1950 if (!n1) {
63fd65a0 1951 return NULL;
b3af018f
LY
1952 }
1953
a595e4bc
KW
1954 ret = qemu_strtol(n1, NULL, 10, &copy);
1955 if (ret < 0) {
63fd65a0 1956 return NULL;
b3af018f
LY
1957 }
1958
63fd65a0 1959 redundancy = g_new0(SheepdogRedundancy, 1);
a595e4bc 1960 if (!n2) {
63fd65a0 1961 *redundancy = (SheepdogRedundancy) {
a595e4bc
KW
1962 .type = SHEEPDOG_REDUNDANCY_TYPE_FULL,
1963 .u.full.copies = copy,
1964 };
1965 } else {
1966 ret = qemu_strtol(n2, NULL, 10, &parity);
1967 if (ret < 0) {
a2cb9239 1968 g_free(redundancy);
63fd65a0 1969 return NULL;
a595e4bc 1970 }
b3af018f 1971
63fd65a0 1972 *redundancy = (SheepdogRedundancy) {
a595e4bc
KW
1973 .type = SHEEPDOG_REDUNDANCY_TYPE_ERASURE_CODED,
1974 .u.erasure_coded = {
1975 .data_strips = copy,
1976 .parity_strips = parity,
1977 },
1978 };
b3af018f
LY
1979 }
1980
63fd65a0 1981 return redundancy;
b3af018f
LY
1982}
1983
63fd65a0
KW
1984static int parse_block_size_shift(BDRVSheepdogState *s,
1985 BlockdevCreateOptionsSheepdog *opts)
876eb1b0
TI
1986{
1987 struct SheepdogInode *inode = &s->inode;
1988 uint64_t object_size;
1989 int obj_order;
1990
63fd65a0
KW
1991 if (opts->has_object_size) {
1992 object_size = opts->object_size;
1993
876eb1b0
TI
1994 if ((object_size - 1) & object_size) { /* not a power of 2? */
1995 return -EINVAL;
1996 }
786a4ea8 1997 obj_order = ctz32(object_size);
876eb1b0
TI
1998 if (obj_order < 20 || obj_order > 31) {
1999 return -EINVAL;
2000 }
2001 inode->block_size_shift = (uint8_t)obj_order;
2002 }
2003
2004 return 0;
2005}
2006
63fd65a0 2007static int sd_co_create(BlockdevCreateOptions *options, Error **errp)
33b1db1c 2008{
63fd65a0 2009 BlockdevCreateOptionsSheepdog *opts = &options->u.sheepdog;
b6fc8245 2010 int ret = 0;
c31d482f 2011 uint32_t vid = 0;
33b1db1c 2012 char *backing_file = NULL;
b222237b 2013 char *buf = NULL;
b6fc8245 2014 BDRVSheepdogState *s;
876eb1b0 2015 uint64_t max_vdi_size;
2f536801 2016 bool prealloc = false;
33b1db1c 2017
63fd65a0
KW
2018 assert(options->driver == BLOCKDEV_DRIVER_SHEEPDOG);
2019
5839e53b 2020 s = g_new0(BDRVSheepdogState, 1);
b6fc8245 2021
63fd65a0
KW
2022 /* Steal SocketAddress from QAPI, set NULL to prevent double free */
2023 s->addr = opts->location->server;
2024 opts->location->server = NULL;
2025
2026 if (strlen(opts->location->vdi) >= sizeof(s->name)) {
2027 error_setg(errp, "'vdi' string too long");
2028 ret = -EINVAL;
b6fc8245 2029 goto out;
b4447363 2030 }
63fd65a0 2031 pstrcpy(s->name, sizeof(s->name), opts->location->vdi);
b4447363 2032
63fd65a0
KW
2033 s->inode.vdi_size = opts->size;
2034 backing_file = opts->backing_file;
831acdc9 2035
63fd65a0
KW
2036 if (!opts->has_preallocation) {
2037 opts->preallocation = PREALLOC_MODE_OFF;
2038 }
2039 switch (opts->preallocation) {
2040 case PREALLOC_MODE_OFF:
b222237b 2041 prealloc = false;
63fd65a0
KW
2042 break;
2043 case PREALLOC_MODE_FULL:
b222237b 2044 prealloc = true;
63fd65a0
KW
2045 break;
2046 default:
2047 error_setg(errp, "Preallocation mode not supported for Sheepdog");
b222237b
CL
2048 ret = -EINVAL;
2049 goto out;
2050 }
2051
63fd65a0
KW
2052 if (opts->has_redundancy) {
2053 ret = parse_redundancy(s, opts->redundancy);
b222237b 2054 if (ret < 0) {
63fd65a0 2055 error_setg(errp, "Invalid redundancy mode");
b222237b 2056 goto out;
33b1db1c 2057 }
33b1db1c 2058 }
876eb1b0
TI
2059 ret = parse_block_size_shift(s, opts);
2060 if (ret < 0) {
2061 error_setg(errp, "Invalid object_size."
2062 " obect_size needs to be power of 2"
2063 " and be limited from 2^20 to 2^31");
b6fc8245 2064 goto out;
33b1db1c
MK
2065 }
2066
63fd65a0 2067 if (opts->has_backing_file) {
fba98d45 2068 BlockBackend *blk;
9f23fce7 2069 BDRVSheepdogState *base;
33b1db1c
MK
2070 BlockDriver *drv;
2071
2072 /* Currently, only Sheepdog backing image is supported. */
63fd65a0 2073 drv = bdrv_find_protocol(opts->backing_file, true, NULL);
33b1db1c 2074 if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) {
e67c3993 2075 error_setg(errp, "backing_file must be a sheepdog image");
b6fc8245
MK
2076 ret = -EINVAL;
2077 goto out;
33b1db1c
MK
2078 }
2079
63fd65a0 2080 blk = blk_new_open(opts->backing_file, NULL, NULL,
72e775c7 2081 BDRV_O_PROTOCOL, errp);
fba98d45
KW
2082 if (blk == NULL) {
2083 ret = -EIO;
b6fc8245 2084 goto out;
cb595887 2085 }
33b1db1c 2086
fba98d45 2087 base = blk_bs(blk)->opaque;
33b1db1c 2088
9f23fce7 2089 if (!is_snapshot(&base->inode)) {
e67c3993 2090 error_setg(errp, "cannot clone from a non snapshot vdi");
fba98d45 2091 blk_unref(blk);
b6fc8245
MK
2092 ret = -EINVAL;
2093 goto out;
33b1db1c 2094 }
9f23fce7 2095 s->inode.vdi_id = base->inode.vdi_id;
fba98d45 2096 blk_unref(blk);
33b1db1c
MK
2097 }
2098
5d5da114 2099 s->aio_context = qemu_get_aio_context();
876eb1b0
TI
2100
2101 /* if block_size_shift is not specified, get cluster default value */
2102 if (s->inode.block_size_shift == 0) {
2103 SheepdogVdiReq hdr;
2104 SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
876eb1b0
TI
2105 int fd;
2106 unsigned int wlen = 0, rlen = 0;
2107
48d7c4af 2108 fd = connect_to_sdog(s, errp);
876eb1b0 2109 if (fd < 0) {
48d7c4af 2110 ret = fd;
876eb1b0
TI
2111 goto out;
2112 }
2113
2114 memset(&hdr, 0, sizeof(hdr));
2115 hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
2116 hdr.proto_ver = SD_PROTO_VER;
2117
f11672db 2118 ret = do_req(fd, NULL, (SheepdogReq *)&hdr,
876eb1b0
TI
2119 NULL, &wlen, &rlen);
2120 closesocket(fd);
2121 if (ret) {
2122 error_setg_errno(errp, -ret, "failed to get cluster default");
2123 goto out;
2124 }
2125 if (rsp->result == SD_RES_SUCCESS) {
2126 s->inode.block_size_shift = rsp->block_size_shift;
2127 } else {
2128 s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
2129 }
2130 }
2131
2132 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
2133
2134 if (s->inode.vdi_size > max_vdi_size) {
2135 error_setg(errp, "An image is too large."
2136 " The maximum image size is %"PRIu64 "GB",
2137 max_vdi_size / 1024 / 1024 / 1024);
2138 ret = -EINVAL;
2139 goto out;
2140 }
2141
e67c3993 2142 ret = do_sd_create(s, &vid, 0, errp);
7d2d3e74 2143 if (ret) {
b6fc8245 2144 goto out;
a8e0fdd7
MK
2145 }
2146
7d2d3e74 2147 if (prealloc) {
63fd65a0 2148 ret = sd_create_prealloc(opts->location, opts->size, errp);
318df29e 2149 }
b6fc8245 2150out:
b222237b
CL
2151 g_free(backing_file);
2152 g_free(buf);
63fd65a0 2153 g_free(s->addr);
b6fc8245
MK
2154 g_free(s);
2155 return ret;
33b1db1c
MK
2156}
2157
63fd65a0
KW
2158static int coroutine_fn sd_co_create_opts(const char *filename, QemuOpts *opts,
2159 Error **errp)
2160{
2161 BlockdevCreateOptions *create_options = NULL;
2162 QDict *qdict, *location_qdict;
63fd65a0 2163 Visitor *v;
a2cb9239 2164 char *redundancy;
63fd65a0
KW
2165 Error *local_err = NULL;
2166 int ret;
2167
2168 redundancy = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY);
2169
2170 qdict = qemu_opts_to_qdict(opts, NULL);
2171 qdict_put_str(qdict, "driver", "sheepdog");
2172
2173 location_qdict = qdict_new();
2174 qdict_put(qdict, "location", location_qdict);
2175
2176 sd_parse_filename(filename, location_qdict, &local_err);
2177 if (local_err) {
2178 error_propagate(errp, local_err);
2179 ret = -EINVAL;
2180 goto fail;
2181 }
2182
2183 qdict_flatten(qdict);
2184
2185 /* Change legacy command line options into QMP ones */
2186 static const QDictRenames opt_renames[] = {
2187 { BLOCK_OPT_BACKING_FILE, "backing-file" },
2188 { BLOCK_OPT_OBJECT_SIZE, "object-size" },
2189 { NULL, NULL },
2190 };
2191
2192 if (!qdict_rename_keys(qdict, opt_renames, errp)) {
2193 ret = -EINVAL;
2194 goto fail;
2195 }
2196
2197 /* Get the QAPI object */
af91062e
MA
2198 v = qobject_input_visitor_new_flat_confused(qdict, errp);
2199 if (!v) {
63fd65a0
KW
2200 ret = -EINVAL;
2201 goto fail;
2202 }
2203
63fd65a0
KW
2204 visit_type_BlockdevCreateOptions(v, NULL, &create_options, &local_err);
2205 visit_free(v);
63fd65a0
KW
2206
2207 if (local_err) {
2208 error_propagate(errp, local_err);
2209 ret = -EINVAL;
2210 goto fail;
2211 }
2212
2213 assert(create_options->driver == BLOCKDEV_DRIVER_SHEEPDOG);
2214 create_options->u.sheepdog.size =
2215 ROUND_UP(create_options->u.sheepdog.size, BDRV_SECTOR_SIZE);
2216
2217 if (redundancy) {
2218 create_options->u.sheepdog.has_redundancy = true;
2219 create_options->u.sheepdog.redundancy =
2220 parse_redundancy_str(redundancy);
2221 if (create_options->u.sheepdog.redundancy == NULL) {
2222 error_setg(errp, "Invalid redundancy mode");
2223 ret = -EINVAL;
2224 goto fail;
2225 }
2226 }
2227
2228 ret = sd_co_create(create_options, errp);
2229fail:
2230 qapi_free_BlockdevCreateOptions(create_options);
cb3e7f08 2231 qobject_unref(qdict);
a2cb9239 2232 g_free(redundancy);
63fd65a0
KW
2233 return ret;
2234}
2235
33b1db1c
MK
2236static void sd_close(BlockDriverState *bs)
2237{
dfb12bf8 2238 Error *local_err = NULL;
33b1db1c
MK
2239 BDRVSheepdogState *s = bs->opaque;
2240 SheepdogVdiReq hdr;
2241 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2242 unsigned int wlen, rlen = 0;
2243 int fd, ret;
2244
70018a14 2245 trace_sheepdog_close(s->name);
33b1db1c 2246
dfb12bf8 2247 fd = connect_to_sdog(s, &local_err);
33b1db1c 2248 if (fd < 0) {
565f65d2 2249 error_report_err(local_err);
33b1db1c
MK
2250 return;
2251 }
2252
2253 memset(&hdr, 0, sizeof(hdr));
2254
2255 hdr.opcode = SD_OP_RELEASE_VDI;
1dbfafed 2256 hdr.type = LOCK_TYPE_NORMAL;
9f23fce7 2257 hdr.base_vdi_id = s->inode.vdi_id;
33b1db1c
MK
2258 wlen = strlen(s->name) + 1;
2259 hdr.data_length = wlen;
2260 hdr.flags = SD_FLAG_CMD_WRITE;
2261
f11672db 2262 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
84390bed 2263 s->name, &wlen, &rlen);
33b1db1c
MK
2264
2265 closesocket(fd);
2266
2267 if (!ret && rsp->result != SD_RES_SUCCESS &&
2268 rsp->result != SD_RES_VDI_NOT_LOCKED) {
6daf194d 2269 error_report("%s, %s", sd_strerror(rsp->result), s->name);
33b1db1c
MK
2270 }
2271
dca21ef2 2272 aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd,
f6a51c84 2273 false, NULL, NULL, NULL, NULL);
33b1db1c 2274 closesocket(s->fd);
bd269ebc 2275 qapi_free_SocketAddress(s->addr);
33b1db1c
MK
2276}
2277
2278static int64_t sd_getlength(BlockDriverState *bs)
2279{
2280 BDRVSheepdogState *s = bs->opaque;
2281
2282 return s->inode.vdi_size;
2283}
2284
061ca8a3
KW
2285static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset,
2286 PreallocMode prealloc, Error **errp)
33b1db1c
MK
2287{
2288 BDRVSheepdogState *s = bs->opaque;
2289 int ret, fd;
2290 unsigned int datalen;
876eb1b0 2291 uint64_t max_vdi_size;
74f1eabf 2292 int64_t old_size = s->inode.vdi_size;
33b1db1c 2293
74f1eabf 2294 if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_FULL) {
8243ccb7 2295 error_setg(errp, "Unsupported preallocation mode '%s'",
977c736f 2296 PreallocMode_str(prealloc));
8243ccb7
HR
2297 return -ENOTSUP;
2298 }
2299
876eb1b0 2300 max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
74f1eabf 2301 if (offset < old_size) {
4bff28b8 2302 error_setg(errp, "shrinking is not supported");
33b1db1c 2303 return -EINVAL;
876eb1b0 2304 } else if (offset > max_vdi_size) {
4bff28b8 2305 error_setg(errp, "too big image size");
33b1db1c
MK
2306 return -EINVAL;
2307 }
2308
4bff28b8 2309 fd = connect_to_sdog(s, errp);
33b1db1c 2310 if (fd < 0) {
cb595887 2311 return fd;
33b1db1c
MK
2312 }
2313
2314 /* we don't need to update entire object */
03b036cc 2315 datalen = SD_INODE_HEADER_SIZE;
33b1db1c 2316 s->inode.vdi_size = offset;
f11672db 2317 ret = write_object(fd, s->bs, (char *)&s->inode,
84390bed
SH
2318 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2319 datalen, 0, false, s->cache_flags);
33b1db1c
MK
2320 close(fd);
2321
2322 if (ret < 0) {
4bff28b8 2323 error_setg_errno(errp, -ret, "failed to update an inode");
74f1eabf 2324 return ret;
33b1db1c
MK
2325 }
2326
74f1eabf
HR
2327 if (prealloc == PREALLOC_MODE_FULL) {
2328 ret = sd_prealloc(bs, old_size, offset, errp);
2329 if (ret < 0) {
2330 return ret;
2331 }
2332 }
2333
2334 return 0;
33b1db1c
MK
2335}
2336
2337/*
2338 * This function is called after writing data objects. If we need to
2339 * update metadata, this sends a write request to the vdi object.
33b1db1c 2340 */
d8716b41 2341static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
33b1db1c 2342{
28ddd08c 2343 BDRVSheepdogState *s = acb->s;
33b1db1c
MK
2344 struct iovec iov;
2345 AIOReq *aio_req;
2346 uint32_t offset, data_len, mn, mx;
2347
498f2140
HM
2348 mn = acb->min_dirty_data_idx;
2349 mx = acb->max_dirty_data_idx;
33b1db1c
MK
2350 if (mn <= mx) {
2351 /* we need to update the vdi object. */
e80ab33d 2352 ++acb->nr_pending;
33b1db1c
MK
2353 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
2354 mn * sizeof(s->inode.data_vdi_id[0]);
2355 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
2356
498f2140
HM
2357 acb->min_dirty_data_idx = UINT32_MAX;
2358 acb->max_dirty_data_idx = 0;
33b1db1c
MK
2359
2360 iov.iov_base = &s->inode;
2361 iov.iov_len = sizeof(s->inode);
2362 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
b544c1ab 2363 data_len, offset, 0, false, 0, offset);
b544c1ab 2364 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
e80ab33d
PB
2365 if (--acb->nr_pending) {
2366 qemu_coroutine_yield();
2367 }
33b1db1c 2368 }
33b1db1c
MK
2369}
2370
859e5553
LY
2371/* Delete current working VDI on the snapshot chain */
2372static bool sd_delete(BDRVSheepdogState *s)
2373{
dfb12bf8 2374 Error *local_err = NULL;
859e5553
LY
2375 unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0;
2376 SheepdogVdiReq hdr = {
2377 .opcode = SD_OP_DEL_VDI,
9f23fce7 2378 .base_vdi_id = s->inode.vdi_id,
859e5553
LY
2379 .data_length = wlen,
2380 .flags = SD_FLAG_CMD_WRITE,
2381 };
2382 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2383 int fd, ret;
2384
dfb12bf8 2385 fd = connect_to_sdog(s, &local_err);
859e5553 2386 if (fd < 0) {
565f65d2 2387 error_report_err(local_err);
859e5553
LY
2388 return false;
2389 }
2390
f11672db 2391 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
84390bed 2392 s->name, &wlen, &rlen);
859e5553
LY
2393 closesocket(fd);
2394 if (ret) {
2395 return false;
2396 }
2397 switch (rsp->result) {
2398 case SD_RES_NO_VDI:
2399 error_report("%s was already deleted", s->name);
2400 /* fall through */
2401 case SD_RES_SUCCESS:
2402 break;
2403 default:
2404 error_report("%s, %s", sd_strerror(rsp->result), s->name);
2405 return false;
2406 }
2407
2408 return true;
2409}
2410
33b1db1c
MK
2411/*
2412 * Create a writable VDI from a snapshot
2413 */
2414static int sd_create_branch(BDRVSheepdogState *s)
2415{
dfb12bf8 2416 Error *local_err = NULL;
33b1db1c
MK
2417 int ret, fd;
2418 uint32_t vid;
2419 char *buf;
859e5553 2420 bool deleted;
33b1db1c 2421
70018a14 2422 trace_sheepdog_create_branch_snapshot(s->inode.vdi_id);
33b1db1c 2423
7267c094 2424 buf = g_malloc(SD_INODE_SIZE);
33b1db1c 2425
859e5553
LY
2426 /*
2427 * Even If deletion fails, we will just create extra snapshot based on
dc6fb73d 2428 * the working VDI which was supposed to be deleted. So no need to
859e5553
LY
2429 * false bail out.
2430 */
2431 deleted = sd_delete(s);
7d2d3e74 2432 ret = do_sd_create(s, &vid, !deleted, &local_err);
33b1db1c 2433 if (ret) {
565f65d2 2434 error_report_err(local_err);
33b1db1c
MK
2435 goto out;
2436 }
2437
70018a14 2438 trace_sheepdog_create_branch_created(vid);
33b1db1c 2439
dfb12bf8 2440 fd = connect_to_sdog(s, &local_err);
33b1db1c 2441 if (fd < 0) {
565f65d2 2442 error_report_err(local_err);
cb595887 2443 ret = fd;
33b1db1c
MK
2444 goto out;
2445 }
2446
f11672db 2447 ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
84390bed 2448 s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags);
33b1db1c
MK
2449
2450 closesocket(fd);
2451
2452 if (ret < 0) {
2453 goto out;
2454 }
2455
2456 memcpy(&s->inode, buf, sizeof(s->inode));
2457
2f536801 2458 s->is_snapshot = false;
33b1db1c 2459 ret = 0;
70018a14 2460 trace_sheepdog_create_branch_new(s->inode.vdi_id);
33b1db1c
MK
2461
2462out:
7267c094 2463 g_free(buf);
33b1db1c
MK
2464
2465 return ret;
2466}
2467
2468/*
2469 * Send I/O requests to the server.
2470 *
2471 * This function sends requests to the server, links the requests to
c292ee6a 2472 * the inflight_list in BDRVSheepdogState, and exits without
33b1db1c
MK
2473 * waiting the response. The responses are received in the
2474 * `aio_read_response' function which is called from the main loop as
2475 * a fd handler.
2df46246
MK
2476 *
2477 * Returns 1 when we need to wait a response, 0 when there is no sent
2478 * request and -errno in error cases.
33b1db1c 2479 */
28ddd08c 2480static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
33b1db1c 2481{
33b1db1c 2482 int ret = 0;
e8bfaa2f 2483 unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
876eb1b0
TI
2484 unsigned long idx;
2485 uint32_t object_size;
33b1db1c 2486 uint64_t oid;
876eb1b0 2487 uint64_t offset;
28ddd08c 2488 BDRVSheepdogState *s = acb->s;
33b1db1c
MK
2489 SheepdogInode *inode = &s->inode;
2490 AIOReq *aio_req;
2491
33b1db1c
MK
2492 if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
2493 /*
2494 * In the case we open the snapshot VDI, Sheepdog creates the
2495 * writable VDI when we do a write operation first.
2496 */
2497 ret = sd_create_branch(s);
2498 if (ret) {
2499 acb->ret = -EIO;
e80ab33d 2500 return;
33b1db1c
MK
2501 }
2502 }
2503
876eb1b0
TI
2504 object_size = (UINT32_C(1) << inode->block_size_shift);
2505 idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
2506 offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
2507
1d732d7d
MK
2508 /*
2509 * Make sure we don't free the aiocb before we are done with all requests.
2510 * This additional reference is dropped at the end of this function.
2511 */
2512 acb->nr_pending++;
2513
33b1db1c
MK
2514 while (done != total) {
2515 uint8_t flags = 0;
2516 uint64_t old_oid = 0;
2f536801 2517 bool create = false;
33b1db1c
MK
2518
2519 oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
2520
876eb1b0 2521 len = MIN(total - done, object_size - offset);
33b1db1c 2522
19db9b90
CH
2523 switch (acb->aiocb_type) {
2524 case AIOCB_READ_UDATA:
2525 if (!inode->data_vdi_id[idx]) {
2526 qemu_iovec_memset(acb->qiov, done, 0, len);
33b1db1c
MK
2527 goto done;
2528 }
19db9b90
CH
2529 break;
2530 case AIOCB_WRITE_UDATA:
2531 if (!inode->data_vdi_id[idx]) {
2f536801 2532 create = true;
19db9b90
CH
2533 } else if (!is_data_obj_writable(inode, idx)) {
2534 /* Copy-On-Write */
2f536801 2535 create = true;
19db9b90
CH
2536 old_oid = oid;
2537 flags = SD_FLAG_CMD_COW;
2538 }
2539 break;
cac8f4a6
LY
2540 case AIOCB_DISCARD_OBJ:
2541 /*
2542 * We discard the object only when the whole object is
2543 * 1) allocated 2) trimmed. Otherwise, simply skip it.
2544 */
876eb1b0 2545 if (len != object_size || inode->data_vdi_id[idx] == 0) {
cac8f4a6
LY
2546 goto done;
2547 }
2548 break;
19db9b90
CH
2549 default:
2550 break;
33b1db1c
MK
2551 }
2552
2553 if (create) {
70018a14
LV
2554 trace_sheepdog_co_rw_vector_update(inode->vdi_id, oid,
2555 vid_to_data_oid(inode->data_vdi_id[idx], idx),
2556 idx);
33b1db1c 2557 oid = vid_to_data_oid(inode->vdi_id, idx);
70018a14 2558 trace_sheepdog_co_rw_vector_new(oid);
33b1db1c
MK
2559 }
2560
b544c1ab 2561 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
e6fd57ea
HM
2562 old_oid,
2563 acb->aiocb_type == AIOCB_DISCARD_OBJ ?
2564 0 : done);
b544c1ab 2565 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
a37dcdf9 2566 acb->aiocb_type);
33b1db1c
MK
2567 done:
2568 offset = 0;
2569 idx++;
2570 done += len;
2571 }
e80ab33d
PB
2572 if (--acb->nr_pending) {
2573 qemu_coroutine_yield();
33b1db1c
MK
2574 }
2575}
2576
acf6e5f0 2577static void sd_aio_complete(SheepdogAIOCB *acb)
6a55c82c 2578{
f1af3251 2579 BDRVSheepdogState *s;
acf6e5f0
PB
2580 if (acb->aiocb_type == AIOCB_FLUSH_CACHE) {
2581 return;
6a55c82c
HM
2582 }
2583
f1af3251
PB
2584 s = acb->s;
2585 qemu_co_mutex_lock(&s->queue_lock);
acf6e5f0 2586 QLIST_REMOVE(acb, aiocb_siblings);
f1af3251
PB
2587 qemu_co_queue_restart_all(&s->overlapping_queue);
2588 qemu_co_mutex_unlock(&s->queue_lock);
6a55c82c
HM
2589}
2590
a968168c 2591static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
e18a58b4
EB
2592 int nb_sectors, QEMUIOVector *qiov,
2593 int flags)
33b1db1c 2594{
28ddd08c 2595 SheepdogAIOCB acb;
2df46246 2596 int ret;
e50d7607
LY
2597 int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
2598 BDRVSheepdogState *s = bs->opaque;
33b1db1c 2599
e18a58b4 2600 assert(!flags);
c0191e76 2601 if (offset > s->inode.vdi_size) {
061ca8a3 2602 ret = sd_co_truncate(bs, offset, PREALLOC_MODE_OFF, NULL);
cb595887
MK
2603 if (ret < 0) {
2604 return ret;
33b1db1c 2605 }
33b1db1c
MK
2606 }
2607
28ddd08c 2608 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
28ddd08c
PB
2609 sd_co_rw_vector(&acb);
2610 sd_write_done(&acb);
acf6e5f0 2611 sd_aio_complete(&acb);
2df46246 2612
28ddd08c 2613 return acb.ret;
33b1db1c
MK
2614}
2615
a968168c 2616static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
2df46246 2617 int nb_sectors, QEMUIOVector *qiov)
33b1db1c 2618{
28ddd08c 2619 SheepdogAIOCB acb;
6a55c82c 2620 BDRVSheepdogState *s = bs->opaque;
33b1db1c 2621
28ddd08c 2622 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
28ddd08c 2623 sd_co_rw_vector(&acb);
acf6e5f0 2624 sd_aio_complete(&acb);
2df46246 2625
28ddd08c 2626 return acb.ret;
33b1db1c
MK
2627}
2628
47622c44
LY
2629static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
2630{
2631 BDRVSheepdogState *s = bs->opaque;
28ddd08c 2632 SheepdogAIOCB acb;
47783072 2633 AIOReq *aio_req;
47622c44 2634
0e7106d8 2635 if (s->cache_flags != SD_FLAG_CMD_CACHE) {
47622c44
LY
2636 return 0;
2637 }
2638
28ddd08c 2639 sd_aio_setup(&acb, s, NULL, 0, 0, AIOCB_FLUSH_CACHE);
47622c44 2640
28ddd08c
PB
2641 acb.nr_pending++;
2642 aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
b544c1ab 2643 0, 0, 0, false, 0, 0);
28ddd08c 2644 add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
47622c44 2645
28ddd08c 2646 if (--acb.nr_pending) {
e80ab33d
PB
2647 qemu_coroutine_yield();
2648 }
acf6e5f0
PB
2649
2650 sd_aio_complete(&acb);
28ddd08c 2651 return acb.ret;
47622c44
LY
2652}
2653
33b1db1c
MK
2654static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
2655{
dfb12bf8 2656 Error *local_err = NULL;
33b1db1c
MK
2657 BDRVSheepdogState *s = bs->opaque;
2658 int ret, fd;
2659 uint32_t new_vid;
2660 SheepdogInode *inode;
2661 unsigned int datalen;
2662
70018a14
LV
2663 trace_sheepdog_snapshot_create_info(sn_info->name, sn_info->id_str, s->name,
2664 sn_info->vm_state_size, s->is_snapshot);
33b1db1c
MK
2665
2666 if (s->is_snapshot) {
2667 error_report("You can't create a snapshot of a snapshot VDI, "
6daf194d 2668 "%s (%" PRIu32 ").", s->name, s->inode.vdi_id);
33b1db1c
MK
2669
2670 return -EINVAL;
2671 }
2672
70018a14 2673 trace_sheepdog_snapshot_create(sn_info->name, sn_info->id_str);
33b1db1c
MK
2674
2675 s->inode.vm_state_size = sn_info->vm_state_size;
2676 s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
3178e275
JM
2677 /* It appears that inode.tag does not require a NUL terminator,
2678 * which means this use of strncpy is ok.
2679 */
33b1db1c
MK
2680 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
2681 /* we don't need to update entire object */
03b036cc 2682 datalen = SD_INODE_HEADER_SIZE;
2df5fee2 2683 inode = g_malloc(datalen);
33b1db1c
MK
2684
2685 /* refresh inode. */
dfb12bf8 2686 fd = connect_to_sdog(s, &local_err);
33b1db1c 2687 if (fd < 0) {
565f65d2 2688 error_report_err(local_err);
cb595887 2689 ret = fd;
33b1db1c
MK
2690 goto cleanup;
2691 }
2692
f11672db 2693 ret = write_object(fd, s->bs, (char *)&s->inode,
84390bed
SH
2694 vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
2695 datalen, 0, false, s->cache_flags);
33b1db1c 2696 if (ret < 0) {
6daf194d 2697 error_report("failed to write snapshot's inode.");
33b1db1c
MK
2698 goto cleanup;
2699 }
2700
7d2d3e74 2701 ret = do_sd_create(s, &new_vid, 1, &local_err);
33b1db1c 2702 if (ret < 0) {
c29b77f9
MA
2703 error_reportf_err(local_err,
2704 "failed to create inode for snapshot: ");
33b1db1c
MK
2705 goto cleanup;
2706 }
2707
f11672db 2708 ret = read_object(fd, s->bs, (char *)inode,
84390bed
SH
2709 vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0,
2710 s->cache_flags);
33b1db1c
MK
2711
2712 if (ret < 0) {
6daf194d 2713 error_report("failed to read new inode info. %s", strerror(errno));
33b1db1c
MK
2714 goto cleanup;
2715 }
2716
2717 memcpy(&s->inode, inode, datalen);
70018a14
LV
2718 trace_sheepdog_snapshot_create_inode(s->inode.name, s->inode.snap_id,
2719 s->inode.vdi_id);
33b1db1c
MK
2720
2721cleanup:
2df5fee2 2722 g_free(inode);
33b1db1c
MK
2723 closesocket(fd);
2724 return ret;
2725}
2726
859e5553
LY
2727/*
2728 * We implement rollback(loadvm) operation to the specified snapshot by
2729 * 1) switch to the snapshot
2730 * 2) rely on sd_create_branch to delete working VDI and
dc6fb73d 2731 * 3) create a new working VDI based on the specified snapshot
859e5553 2732 */
33b1db1c
MK
2733static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
2734{
2735 BDRVSheepdogState *s = bs->opaque;
2736 BDRVSheepdogState *old_s;
9ff53a0e 2737 char tag[SD_MAX_VDI_TAG_LEN];
33b1db1c 2738 uint32_t snapid = 0;
89e2a31d
MA
2739 int ret;
2740
2741 if (!sd_parse_snapid_or_tag(snapshot_id, &snapid, tag)) {
2742 return -EINVAL;
2743 }
33b1db1c 2744
5839e53b 2745 old_s = g_new(BDRVSheepdogState, 1);
33b1db1c
MK
2746
2747 memcpy(old_s, s, sizeof(BDRVSheepdogState));
2748
9ff53a0e 2749 ret = reload_inode(s, snapid, tag);
33b1db1c 2750 if (ret) {
33b1db1c
MK
2751 goto out;
2752 }
2753
cede621f
LY
2754 ret = sd_create_branch(s);
2755 if (ret) {
33b1db1c
MK
2756 goto out;
2757 }
2758
7267c094 2759 g_free(old_s);
33b1db1c
MK
2760
2761 return 0;
2762out:
2763 /* recover bdrv_sd_state */
2764 memcpy(s, old_s, sizeof(BDRVSheepdogState));
7267c094 2765 g_free(old_s);
33b1db1c 2766
6daf194d 2767 error_report("failed to open. recover old bdrv_sd_state.");
33b1db1c
MK
2768
2769 return ret;
2770}
2771
eab8eb8d
VT
2772#define NR_BATCHED_DISCARD 128
2773
e25cad69 2774static int remove_objects(BDRVSheepdogState *s, Error **errp)
eab8eb8d
VT
2775{
2776 int fd, i = 0, nr_objs = 0;
e25cad69 2777 int ret;
eab8eb8d
VT
2778 SheepdogInode *inode = &s->inode;
2779
e25cad69 2780 fd = connect_to_sdog(s, errp);
eab8eb8d 2781 if (fd < 0) {
e25cad69 2782 return fd;
eab8eb8d
VT
2783 }
2784
2785 nr_objs = count_data_objs(inode);
2786 while (i < nr_objs) {
2787 int start_idx, nr_filled_idx;
2788
2789 while (i < nr_objs && !inode->data_vdi_id[i]) {
2790 i++;
2791 }
2792 start_idx = i;
2793
2794 nr_filled_idx = 0;
2795 while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) {
2796 if (inode->data_vdi_id[i]) {
2797 inode->data_vdi_id[i] = 0;
2798 nr_filled_idx++;
2799 }
2800
2801 i++;
2802 }
2803
f11672db 2804 ret = write_object(fd, s->bs,
eab8eb8d
VT
2805 (char *)&inode->data_vdi_id[start_idx],
2806 vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies,
2807 (i - start_idx) * sizeof(uint32_t),
2808 offsetof(struct SheepdogInode,
2809 data_vdi_id[start_idx]),
2810 false, s->cache_flags);
2811 if (ret < 0) {
e25cad69 2812 error_setg(errp, "Failed to discard snapshot inode");
eab8eb8d
VT
2813 goto out;
2814 }
2815 }
2816
e25cad69 2817 ret = 0;
eab8eb8d
VT
2818out:
2819 closesocket(fd);
e25cad69 2820 return ret;
eab8eb8d
VT
2821}
2822
a89d89d3
WX
2823static int sd_snapshot_delete(BlockDriverState *bs,
2824 const char *snapshot_id,
2825 const char *name,
2826 Error **errp)
33b1db1c 2827{
a0dc0e2b
MA
2828 /*
2829 * FIXME should delete the snapshot matching both @snapshot_id and
2830 * @name, but @name not used here
2831 */
03c698f0 2832 unsigned long snap_id = 0;
eab8eb8d 2833 char snap_tag[SD_MAX_VDI_TAG_LEN];
eab8eb8d
VT
2834 int fd, ret;
2835 char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN];
2836 BDRVSheepdogState *s = bs->opaque;
2837 unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0;
2838 uint32_t vid;
2839 SheepdogVdiReq hdr = {
2840 .opcode = SD_OP_DEL_VDI,
2841 .data_length = wlen,
2842 .flags = SD_FLAG_CMD_WRITE,
2843 };
2844 SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr;
2845
e25cad69
MA
2846 ret = remove_objects(s, errp);
2847 if (ret) {
2848 return ret;
eab8eb8d
VT
2849 }
2850
2851 memset(buf, 0, sizeof(buf));
2852 memset(snap_tag, 0, sizeof(snap_tag));
2853 pstrcpy(buf, SD_MAX_VDI_LEN, s->name);
89e2a31d 2854 /* TODO Use sd_parse_snapid() once this mess is cleaned up */
03c698f0
JC
2855 ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id);
2856 if (ret || snap_id > UINT32_MAX) {
a0dc0e2b
MA
2857 /*
2858 * FIXME Since qemu_strtoul() returns -EINVAL when
2859 * @snapshot_id is null, @snapshot_id is mandatory. Correct
2860 * would be to require at least one of @snapshot_id and @name.
2861 */
03c698f0
JC
2862 error_setg(errp, "Invalid snapshot ID: %s",
2863 snapshot_id ? snapshot_id : "<null>");
2864 return -EINVAL;
eab8eb8d
VT
2865 }
2866
2867 if (snap_id) {
03c698f0 2868 hdr.snapid = (uint32_t) snap_id;
eab8eb8d 2869 } else {
a0dc0e2b 2870 /* FIXME I suspect we should use @name here */
89e2a31d 2871 /* FIXME don't truncate silently */
eab8eb8d
VT
2872 pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id);
2873 pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag);
2874 }
2875
e25cad69 2876 ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, errp);
eab8eb8d
VT
2877 if (ret) {
2878 return ret;
2879 }
2880
e25cad69 2881 fd = connect_to_sdog(s, errp);
eab8eb8d 2882 if (fd < 0) {
e25cad69 2883 return fd;
eab8eb8d
VT
2884 }
2885
f11672db 2886 ret = do_req(fd, s->bs, (SheepdogReq *)&hdr,
eab8eb8d
VT
2887 buf, &wlen, &rlen);
2888 closesocket(fd);
2889 if (ret) {
e25cad69 2890 error_setg_errno(errp, -ret, "Couldn't send request to server");
eab8eb8d
VT
2891 return ret;
2892 }
2893
2894 switch (rsp->result) {
2895 case SD_RES_NO_VDI:
e25cad69
MA
2896 error_setg(errp, "Can't find the snapshot");
2897 return -ENOENT;
eab8eb8d
VT
2898 case SD_RES_SUCCESS:
2899 break;
2900 default:
e25cad69
MA
2901 error_setg(errp, "%s", sd_strerror(rsp->result));
2902 return -EIO;
eab8eb8d
VT
2903 }
2904
e25cad69 2905 return 0;
33b1db1c
MK
2906}
2907
33b1db1c
MK
2908static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
2909{
dfb12bf8 2910 Error *local_err = NULL;
33b1db1c
MK
2911 BDRVSheepdogState *s = bs->opaque;
2912 SheepdogReq req;
2913 int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long);
2914 QEMUSnapshotInfo *sn_tab = NULL;
2915 unsigned wlen, rlen;
2916 int found = 0;
68acc99f 2917 SheepdogInode *inode;
33b1db1c
MK
2918 unsigned long *vdi_inuse;
2919 unsigned int start_nr;
2920 uint64_t hval;
2921 uint32_t vid;
2922
7267c094 2923 vdi_inuse = g_malloc(max);
68acc99f 2924 inode = g_malloc(SD_INODE_HEADER_SIZE);
33b1db1c 2925
dfb12bf8 2926 fd = connect_to_sdog(s, &local_err);
33b1db1c 2927 if (fd < 0) {
565f65d2 2928 error_report_err(local_err);
cb595887 2929 ret = fd;
33b1db1c
MK
2930 goto out;
2931 }
2932
2933 rlen = max;
2934 wlen = 0;
2935
2936 memset(&req, 0, sizeof(req));
2937
2938 req.opcode = SD_OP_READ_VDIS;
2939 req.data_length = max;
2940
f11672db 2941 ret = do_req(fd, s->bs, &req, vdi_inuse, &wlen, &rlen);
33b1db1c
MK
2942
2943 closesocket(fd);
2944 if (ret) {
2945 goto out;
2946 }
2947
02c4f26b 2948 sn_tab = g_new0(QEMUSnapshotInfo, nr);
33b1db1c
MK
2949
2950 /* calculate a vdi id with hash function */
2951 hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT);
2952 start_nr = hval & (SD_NR_VDIS - 1);
2953
dfb12bf8 2954 fd = connect_to_sdog(s, &local_err);
33b1db1c 2955 if (fd < 0) {
565f65d2 2956 error_report_err(local_err);
cb595887 2957 ret = fd;
33b1db1c
MK
2958 goto out;
2959 }
2960
2961 for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) {
2962 if (!test_bit(vid, vdi_inuse)) {
2963 break;
2964 }
2965
2966 /* we don't need to read entire object */
68acc99f 2967 ret = read_object(fd, s->bs, (char *)inode,
84390bed 2968 vid_to_vdi_oid(vid),
03b036cc 2969 0, SD_INODE_HEADER_SIZE, 0,
0e7106d8 2970 s->cache_flags);
33b1db1c
MK
2971
2972 if (ret) {
2973 continue;
2974 }
2975
68acc99f
PB
2976 if (!strcmp(inode->name, s->name) && is_snapshot(inode)) {
2977 sn_tab[found].date_sec = inode->snap_ctime >> 32;
2978 sn_tab[found].date_nsec = inode->snap_ctime & 0xffffffff;
2979 sn_tab[found].vm_state_size = inode->vm_state_size;
2980 sn_tab[found].vm_clock_nsec = inode->vm_clock_nsec;
33b1db1c 2981
521b2b5d 2982 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
68acc99f 2983 "%" PRIu32, inode->snap_id);
3178e275 2984 pstrcpy(sn_tab[found].name,
68acc99f
PB
2985 MIN(sizeof(sn_tab[found].name), sizeof(inode->tag)),
2986 inode->tag);
33b1db1c
MK
2987 found++;
2988 }
2989 }
2990
2991 closesocket(fd);
2992out:
2993 *psn_tab = sn_tab;
2994
7267c094 2995 g_free(vdi_inuse);
68acc99f 2996 g_free(inode);
33b1db1c 2997
cb595887
MK
2998 if (ret < 0) {
2999 return ret;
3000 }
3001
33b1db1c
MK
3002 return found;
3003}
3004
3005static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
3006 int64_t pos, int size, int load)
3007{
dfb12bf8 3008 Error *local_err = NULL;
2f536801
MK
3009 bool create;
3010 int fd, ret = 0, remaining = size;
33b1db1c
MK
3011 unsigned int data_len;
3012 uint64_t vmstate_oid;
33b1db1c 3013 uint64_t offset;
cede621f
LY
3014 uint32_t vdi_index;
3015 uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
876eb1b0 3016 uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
33b1db1c 3017
dfb12bf8 3018 fd = connect_to_sdog(s, &local_err);
33b1db1c 3019 if (fd < 0) {
565f65d2 3020 error_report_err(local_err);
cb595887 3021 return fd;
33b1db1c
MK
3022 }
3023
6f3c714e 3024 while (remaining) {
876eb1b0
TI
3025 vdi_index = pos / object_size;
3026 offset = pos % object_size;
33b1db1c 3027
876eb1b0 3028 data_len = MIN(remaining, object_size - offset);
33b1db1c 3029
cede621f 3030 vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
33b1db1c
MK
3031
3032 create = (offset == 0);
3033 if (load) {
f11672db 3034 ret = read_object(fd, s->bs, (char *)data, vmstate_oid,
47622c44 3035 s->inode.nr_copies, data_len, offset,
0e7106d8 3036 s->cache_flags);
33b1db1c 3037 } else {
f11672db 3038 ret = write_object(fd, s->bs, (char *)data, vmstate_oid,
47622c44 3039 s->inode.nr_copies, data_len, offset, create,
0e7106d8 3040 s->cache_flags);
33b1db1c
MK
3041 }
3042
3043 if (ret < 0) {
6daf194d 3044 error_report("failed to save vmstate %s", strerror(errno));
33b1db1c
MK
3045 goto cleanup;
3046 }
3047
3048 pos += data_len;
1f7a48de 3049 data += data_len;
6f3c714e 3050 remaining -= data_len;
33b1db1c 3051 }
6f3c714e 3052 ret = size;
33b1db1c
MK
3053cleanup:
3054 closesocket(fd);
3055 return ret;
3056}
3057
cf8074b3
KW
3058static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
3059 int64_t pos)
33b1db1c
MK
3060{
3061 BDRVSheepdogState *s = bs->opaque;
cf8074b3
KW
3062 void *buf;
3063 int ret;
33b1db1c 3064
cf8074b3
KW
3065 buf = qemu_blockalign(bs, qiov->size);
3066 qemu_iovec_to_buf(qiov, 0, buf, qiov->size);
3067 ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0);
3068 qemu_vfree(buf);
3069
3070 return ret;
33b1db1c
MK
3071}
3072
5ddda0b8
KW
3073static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
3074 int64_t pos)
33b1db1c
MK
3075{
3076 BDRVSheepdogState *s = bs->opaque;
5ddda0b8
KW
3077 void *buf;
3078 int ret;
33b1db1c 3079
5ddda0b8
KW
3080 buf = qemu_blockalign(bs, qiov->size);
3081 ret = do_load_save_vmstate(s, buf, pos, qiov->size, 1);
3082 qemu_iovec_from_buf(qiov, 0, buf, qiov->size);
3083 qemu_vfree(buf);
3084
3085 return ret;
33b1db1c
MK
3086}
3087
3088
dde47537 3089static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
f5a5ca79 3090 int bytes)
cac8f4a6 3091{
28ddd08c 3092 SheepdogAIOCB acb;
cac8f4a6 3093 BDRVSheepdogState *s = bs->opaque;
e6fd57ea
HM
3094 QEMUIOVector discard_iov;
3095 struct iovec iov;
3096 uint32_t zero = 0;
cac8f4a6
LY
3097
3098 if (!s->discard_supported) {
dde47537 3099 return 0;
cac8f4a6
LY
3100 }
3101
e6fd57ea
HM
3102 memset(&discard_iov, 0, sizeof(discard_iov));
3103 memset(&iov, 0, sizeof(iov));
3104 iov.iov_base = &zero;
3105 iov.iov_len = sizeof(zero);
3106 discard_iov.iov = &iov;
3107 discard_iov.niov = 1;
f5a5ca79 3108 if (!QEMU_IS_ALIGNED(offset | bytes, BDRV_SECTOR_SIZE)) {
49228d1e
EB
3109 return -ENOTSUP;
3110 }
28ddd08c 3111 sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
f5a5ca79 3112 bytes >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
28ddd08c 3113 sd_co_rw_vector(&acb);
acf6e5f0 3114 sd_aio_complete(&acb);
cac8f4a6 3115
28ddd08c 3116 return acb.ret;
cac8f4a6
LY
3117}
3118
47943e98
EB
3119static coroutine_fn int
3120sd_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
3121 int64_t bytes, int64_t *pnum, int64_t *map,
3122 BlockDriverState **file)
8d71c631
LY
3123{
3124 BDRVSheepdogState *s = bs->opaque;
3125 SheepdogInode *inode = &s->inode;
876eb1b0 3126 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
876eb1b0 3127 unsigned long start = offset / object_size,
47943e98 3128 end = DIV_ROUND_UP(offset + bytes, object_size);
8d71c631 3129 unsigned long idx;
47943e98
EB
3130 *map = offset;
3131 int ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
8d71c631
LY
3132
3133 for (idx = start; idx < end; idx++) {
3134 if (inode->data_vdi_id[idx] == 0) {
3135 break;
3136 }
3137 }
3138 if (idx == start) {
3139 /* Get the longest length of unallocated sectors */
3140 ret = 0;
3141 for (idx = start + 1; idx < end; idx++) {
3142 if (inode->data_vdi_id[idx] != 0) {
3143 break;
3144 }
3145 }
3146 }
3147
47943e98
EB
3148 *pnum = (idx - start) * object_size;
3149 if (*pnum > bytes) {
3150 *pnum = bytes;
8d71c631 3151 }
d234c929
FZ
3152 if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
3153 *file = bs;
3154 }
8d71c631
LY
3155 return ret;
3156}
3157
85829722
LY
3158static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
3159{
3160 BDRVSheepdogState *s = bs->opaque;
3161 SheepdogInode *inode = &s->inode;
876eb1b0
TI
3162 uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
3163 unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
85829722
LY
3164 uint64_t size = 0;
3165
3166 for (i = 0; i < last; i++) {
3167 if (inode->data_vdi_id[i] == 0) {
3168 continue;
3169 }
876eb1b0 3170 size += object_size;
85829722
LY
3171 }
3172 return size;
3173}
3174
b222237b
CL
3175static QemuOptsList sd_create_opts = {
3176 .name = "sheepdog-create-opts",
3177 .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head),
3178 .desc = {
3179 {
3180 .name = BLOCK_OPT_SIZE,
3181 .type = QEMU_OPT_SIZE,
3182 .help = "Virtual disk size"
3183 },
3184 {
3185 .name = BLOCK_OPT_BACKING_FILE,
3186 .type = QEMU_OPT_STRING,
3187 .help = "File name of a base image"
3188 },
3189 {
3190 .name = BLOCK_OPT_PREALLOC,
3191 .type = QEMU_OPT_STRING,
3192 .help = "Preallocation mode (allowed values: off, full)"
3193 },
3194 {
3195 .name = BLOCK_OPT_REDUNDANCY,
3196 .type = QEMU_OPT_STRING,
3197 .help = "Redundancy of the image"
3198 },
876eb1b0
TI
3199 {
3200 .name = BLOCK_OPT_OBJECT_SIZE,
3201 .type = QEMU_OPT_SIZE,
3202 .help = "Object size of the image"
3203 },
b222237b
CL
3204 { /* end of list */ }
3205 }
33b1db1c
MK
3206};
3207
2654267c
HR
/*
 * NULL-terminated list of option names referenced by the
 * .strong_runtime_opts field of the drivers.
 */
static const char *const sd_strong_runtime_opts[] = {
    "vdi",
    "snap-id",
    "tag",
    "server.",

    NULL
};
3216
5d6768e3 3217static BlockDriver bdrv_sheepdog = {
d507c5f6
JC
3218 .format_name = "sheepdog",
3219 .protocol_name = "sheepdog",
3220 .instance_size = sizeof(BDRVSheepdogState),
3221 .bdrv_parse_filename = sd_parse_filename,
3222 .bdrv_file_open = sd_open,
3223 .bdrv_reopen_prepare = sd_reopen_prepare,
3224 .bdrv_reopen_commit = sd_reopen_commit,
3225 .bdrv_reopen_abort = sd_reopen_abort,
3226 .bdrv_close = sd_close,
63fd65a0 3227 .bdrv_co_create = sd_co_create,
efc75e2a 3228 .bdrv_co_create_opts = sd_co_create_opts,
d507c5f6
JC
3229 .bdrv_has_zero_init = bdrv_has_zero_init_1,
3230 .bdrv_getlength = sd_getlength,
85829722 3231 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
061ca8a3 3232 .bdrv_co_truncate = sd_co_truncate,
33b1db1c 3233
d507c5f6
JC
3234 .bdrv_co_readv = sd_co_readv,
3235 .bdrv_co_writev = sd_co_writev,
3236 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
3237 .bdrv_co_pdiscard = sd_co_pdiscard,
47943e98 3238 .bdrv_co_block_status = sd_co_block_status,
33b1db1c 3239
d507c5f6
JC
3240 .bdrv_snapshot_create = sd_snapshot_create,
3241 .bdrv_snapshot_goto = sd_snapshot_goto,
3242 .bdrv_snapshot_delete = sd_snapshot_delete,
3243 .bdrv_snapshot_list = sd_snapshot_list,
33b1db1c 3244
d507c5f6
JC
3245 .bdrv_save_vmstate = sd_save_vmstate,
3246 .bdrv_load_vmstate = sd_load_vmstate,
33b1db1c 3247
d507c5f6
JC
3248 .bdrv_detach_aio_context = sd_detach_aio_context,
3249 .bdrv_attach_aio_context = sd_attach_aio_context,
84390bed 3250
d507c5f6 3251 .create_opts = &sd_create_opts,
2654267c 3252 .strong_runtime_opts = sd_strong_runtime_opts,
33b1db1c
MK
3253};
3254
5d6768e3 3255static BlockDriver bdrv_sheepdog_tcp = {
d507c5f6
JC
3256 .format_name = "sheepdog",
3257 .protocol_name = "sheepdog+tcp",
3258 .instance_size = sizeof(BDRVSheepdogState),
3259 .bdrv_parse_filename = sd_parse_filename,
3260 .bdrv_file_open = sd_open,
3261 .bdrv_reopen_prepare = sd_reopen_prepare,
3262 .bdrv_reopen_commit = sd_reopen_commit,
3263 .bdrv_reopen_abort = sd_reopen_abort,
3264 .bdrv_close = sd_close,
63fd65a0 3265 .bdrv_co_create = sd_co_create,
efc75e2a 3266 .bdrv_co_create_opts = sd_co_create_opts,
d507c5f6
JC
3267 .bdrv_has_zero_init = bdrv_has_zero_init_1,
3268 .bdrv_getlength = sd_getlength,
85829722 3269 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
061ca8a3 3270 .bdrv_co_truncate = sd_co_truncate,
5d6768e3 3271
d507c5f6
JC
3272 .bdrv_co_readv = sd_co_readv,
3273 .bdrv_co_writev = sd_co_writev,
3274 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
3275 .bdrv_co_pdiscard = sd_co_pdiscard,
47943e98 3276 .bdrv_co_block_status = sd_co_block_status,
5d6768e3 3277
d507c5f6
JC
3278 .bdrv_snapshot_create = sd_snapshot_create,
3279 .bdrv_snapshot_goto = sd_snapshot_goto,
3280 .bdrv_snapshot_delete = sd_snapshot_delete,
3281 .bdrv_snapshot_list = sd_snapshot_list,
5d6768e3 3282
d507c5f6
JC
3283 .bdrv_save_vmstate = sd_save_vmstate,
3284 .bdrv_load_vmstate = sd_load_vmstate,
5d6768e3 3285
d507c5f6
JC
3286 .bdrv_detach_aio_context = sd_detach_aio_context,
3287 .bdrv_attach_aio_context = sd_attach_aio_context,
84390bed 3288
d507c5f6 3289 .create_opts = &sd_create_opts,
2654267c 3290 .strong_runtime_opts = sd_strong_runtime_opts,
5d6768e3
MK
3291};
3292
1b8bbb46 3293static BlockDriver bdrv_sheepdog_unix = {
d507c5f6
JC
3294 .format_name = "sheepdog",
3295 .protocol_name = "sheepdog+unix",
3296 .instance_size = sizeof(BDRVSheepdogState),
3297 .bdrv_parse_filename = sd_parse_filename,
3298 .bdrv_file_open = sd_open,
3299 .bdrv_reopen_prepare = sd_reopen_prepare,
3300 .bdrv_reopen_commit = sd_reopen_commit,
3301 .bdrv_reopen_abort = sd_reopen_abort,
3302 .bdrv_close = sd_close,
63fd65a0 3303 .bdrv_co_create = sd_co_create,
efc75e2a 3304 .bdrv_co_create_opts = sd_co_create_opts,
d507c5f6
JC
3305 .bdrv_has_zero_init = bdrv_has_zero_init_1,
3306 .bdrv_getlength = sd_getlength,
85829722 3307 .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
061ca8a3 3308 .bdrv_co_truncate = sd_co_truncate,
1b8bbb46 3309
d507c5f6
JC
3310 .bdrv_co_readv = sd_co_readv,
3311 .bdrv_co_writev = sd_co_writev,
3312 .bdrv_co_flush_to_disk = sd_co_flush_to_disk,
3313 .bdrv_co_pdiscard = sd_co_pdiscard,
47943e98 3314 .bdrv_co_block_status = sd_co_block_status,
1b8bbb46 3315
d507c5f6
JC
3316 .bdrv_snapshot_create = sd_snapshot_create,
3317 .bdrv_snapshot_goto = sd_snapshot_goto,
3318 .bdrv_snapshot_delete = sd_snapshot_delete,
3319 .bdrv_snapshot_list = sd_snapshot_list,
1b8bbb46 3320
d507c5f6
JC
3321 .bdrv_save_vmstate = sd_save_vmstate,
3322 .bdrv_load_vmstate = sd_load_vmstate,
1b8bbb46 3323
d507c5f6
JC
3324 .bdrv_detach_aio_context = sd_detach_aio_context,
3325 .bdrv_attach_aio_context = sd_attach_aio_context,
84390bed 3326
d507c5f6 3327 .create_opts = &sd_create_opts,
2654267c 3328 .strong_runtime_opts = sd_strong_runtime_opts,
1b8bbb46
MK
3329};
3330
33b1db1c
MK
3331static void bdrv_sheepdog_init(void)
3332{
3333 bdrv_register(&bdrv_sheepdog);
5d6768e3 3334 bdrv_register(&bdrv_sheepdog_tcp);
1b8bbb46 3335 bdrv_register(&bdrv_sheepdog_unix);
33b1db1c
MK
3336}
3337block_init(bdrv_sheepdog_init);
This page took 1.069704 seconds and 4 git commands to generate.