]>
Commit | Line | Data |
---|---|---|
75818250 TS |
1 | /* |
2 | * QEMU Block driver for NBD | |
3 | * | |
4 | * Copyright (C) 2008 Bull S.A.S. | |
bd5921b4 | 5 | * Author: Laurent Vivier <[email protected]> |
75818250 TS |
6 | * |
7 | * Some parts: | |
8 | * Copyright (C) 2007 Anthony Liguori <[email protected]> | |
9 | * | |
10 | * Permission is hereby granted, free of charge, to any person obtaining a copy | |
11 | * of this software and associated documentation files (the "Software"), to deal | |
12 | * in the Software without restriction, including without limitation the rights | |
13 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
14 | * copies of the Software, and to permit persons to whom the Software is | |
15 | * furnished to do so, subject to the following conditions: | |
16 | * | |
17 | * The above copyright notice and this permission notice shall be included in | |
18 | * all copies or substantial portions of the Software. | |
19 | * | |
20 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
21 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | |
23 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
24 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
25 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
26 | * THE SOFTWARE. | |
27 | */ | |
28 | ||
29 | #include "qemu-common.h" | |
30 | #include "nbd.h" | |
ab359cd1 | 31 | #include "block_int.h" |
5efa9d5a | 32 | #include "module.h" |
33897dc7 | 33 | #include "qemu_socket.h" |
75818250 TS |
34 | |
35 | #include <sys/types.h> | |
36 | #include <unistd.h> | |
75818250 | 37 | |
1d45f8b5 LV |
/* Marker that introduces the optional export name inside an nbd: filename;
 * nbd_config() splits the filename at this string. */
#define EN_OPTSTR ":exportname="

/* Uncomment the following define to enable debug traces on stderr. */
/* #define DEBUG_NBD */

#if defined(DEBUG_NBD)
/* Print a trace line prefixed with "nbd" and the calling function's name. */
#define logout(fmt, ...) \
    fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__)
#else
/* Tracing disabled: logout() expands to a no-op expression. */
#define logout(fmt, ...) ((void)0)
#endif
48 | ||
ecda3447 PB |
/* Maximum number of requests that may be outstanding at once; also the
 * number of entries in BDRVNBDState.recv_coroutine[]. */
#define MAX_NBD_REQUESTS 16

/* Request handles are the slot index XORed with the address of the state
 * object, so the two macros below are inverses of each other.  The 'bs'
 * argument is parenthesized so that any pointer expression can be passed
 * (not only a plain identifier) without the cast binding to just its first
 * operand. */
#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)(bs)))
#define INDEX_TO_HANDLE(bs, index)  ((index) ^ ((uint64_t)(intptr_t)(bs)))
52 | ||
75818250 TS |
/* Per-device state of the NBD block driver (stored in bs->opaque). */
typedef struct BDRVNBDState {
    int sock;           /* socket connected to the NBD server */
    uint32_t nbdflags;  /* flags filled in by nbd_receive_negotiate() */
    off_t size;         /* size obtained at negotiation; see nbd_getlength() */
    size_t blocksize;   /* block size obtained at negotiation */
    char *export_name; /* An NBD server may export several devices */

    CoMutex send_mutex;         /* held by the coroutine that is sending */
    CoMutex free_sema;          /* "poor man's semaphore" limiting requests */
    Coroutine *send_coroutine;  /* sender resumed by nbd_restart_write() */
    int in_flight;              /* requests started but not yet ended */

    /* One entry per request slot: the coroutine waiting for that reply,
     * or NULL when the slot is free. */
    Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
    /* Last reply header read from the socket; handle == 0 means that the
     * next header still has to be fetched. */
    struct nbd_reply reply;

    /* If it begins with '/', this is a UNIX domain socket. Otherwise,
     * it's a string of the form <hostname|ip4|\[ip6\]>:port
     */
    char *host_spec;
} BDRVNBDState;
73 | ||
33897dc7 | 74 | static int nbd_config(BDRVNBDState *s, const char *filename, int flags) |
75818250 | 75 | { |
1d45f8b5 | 76 | char *file; |
33897dc7 NT |
77 | char *export_name; |
78 | const char *host_spec; | |
75818250 | 79 | const char *unixpath; |
1d45f8b5 | 80 | int err = -EINVAL; |
75818250 | 81 | |
7267c094 | 82 | file = g_strdup(filename); |
1d45f8b5 | 83 | |
33897dc7 NT |
84 | export_name = strstr(file, EN_OPTSTR); |
85 | if (export_name) { | |
86 | if (export_name[strlen(EN_OPTSTR)] == 0) { | |
1d45f8b5 LV |
87 | goto out; |
88 | } | |
33897dc7 NT |
89 | export_name[0] = 0; /* truncate 'file' */ |
90 | export_name += strlen(EN_OPTSTR); | |
7267c094 | 91 | s->export_name = g_strdup(export_name); |
1d45f8b5 LV |
92 | } |
93 | ||
33897dc7 NT |
94 | /* extract the host_spec - fail if it's not nbd:... */ |
95 | if (!strstart(file, "nbd:", &host_spec)) { | |
1d45f8b5 LV |
96 | goto out; |
97 | } | |
75818250 | 98 | |
33897dc7 NT |
99 | /* are we a UNIX or TCP socket? */ |
100 | if (strstart(host_spec, "unix:", &unixpath)) { | |
101 | if (unixpath[0] != '/') { /* We demand an absolute path*/ | |
1d45f8b5 LV |
102 | goto out; |
103 | } | |
7267c094 | 104 | s->host_spec = g_strdup(unixpath); |
75818250 | 105 | } else { |
7267c094 | 106 | s->host_spec = g_strdup(host_spec); |
33897dc7 | 107 | } |
75818250 | 108 | |
33897dc7 | 109 | err = 0; |
75818250 | 110 | |
33897dc7 | 111 | out: |
7267c094 | 112 | g_free(file); |
33897dc7 | 113 | if (err != 0) { |
7267c094 AL |
114 | g_free(s->export_name); |
115 | g_free(s->host_spec); | |
33897dc7 NT |
116 | } |
117 | return err; | |
118 | } | |
1d45f8b5 | 119 | |
ae255e52 PB |
120 | static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request) |
121 | { | |
ecda3447 PB |
122 | int i; |
123 | ||
124 | /* Poor man semaphore. The free_sema is locked when no other request | |
125 | * can be accepted, and unlocked after receiving one reply. */ | |
126 | if (s->in_flight >= MAX_NBD_REQUESTS - 1) { | |
127 | qemu_co_mutex_lock(&s->free_sema); | |
128 | assert(s->in_flight < MAX_NBD_REQUESTS); | |
129 | } | |
130 | s->in_flight++; | |
131 | ||
132 | for (i = 0; i < MAX_NBD_REQUESTS; i++) { | |
133 | if (s->recv_coroutine[i] == NULL) { | |
134 | s->recv_coroutine[i] = qemu_coroutine_self(); | |
135 | break; | |
136 | } | |
137 | } | |
138 | ||
139 | assert(i < MAX_NBD_REQUESTS); | |
140 | request->handle = INDEX_TO_HANDLE(s, i); | |
ae255e52 PB |
141 | } |
142 | ||
143 | static int nbd_have_request(void *opaque) | |
144 | { | |
145 | BDRVNBDState *s = opaque; | |
146 | ||
ecda3447 | 147 | return s->in_flight > 0; |
ae255e52 PB |
148 | } |
149 | ||
150 | static void nbd_reply_ready(void *opaque) | |
151 | { | |
152 | BDRVNBDState *s = opaque; | |
ecda3447 | 153 | int i; |
ae255e52 PB |
154 | |
155 | if (s->reply.handle == 0) { | |
156 | /* No reply already in flight. Fetch a header. */ | |
157 | if (nbd_receive_reply(s->sock, &s->reply) < 0) { | |
158 | s->reply.handle = 0; | |
ecda3447 | 159 | goto fail; |
ae255e52 PB |
160 | } |
161 | } | |
162 | ||
163 | /* There's no need for a mutex on the receive side, because the | |
164 | * handler acts as a synchronization point and ensures that only | |
165 | * one coroutine is called until the reply finishes. */ | |
ecda3447 PB |
166 | i = HANDLE_TO_INDEX(s, s->reply.handle); |
167 | if (s->recv_coroutine[i]) { | |
168 | qemu_coroutine_enter(s->recv_coroutine[i], NULL); | |
169 | return; | |
170 | } | |
171 | ||
172 | fail: | |
173 | for (i = 0; i < MAX_NBD_REQUESTS; i++) { | |
174 | if (s->recv_coroutine[i]) { | |
175 | qemu_coroutine_enter(s->recv_coroutine[i], NULL); | |
176 | } | |
ae255e52 PB |
177 | } |
178 | } | |
179 | ||
180 | static void nbd_restart_write(void *opaque) | |
181 | { | |
182 | BDRVNBDState *s = opaque; | |
ecda3447 | 183 | qemu_coroutine_enter(s->send_coroutine, NULL); |
ae255e52 PB |
184 | } |
185 | ||
186 | static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request, | |
187 | struct iovec *iov, int offset) | |
188 | { | |
189 | int rc, ret; | |
190 | ||
ecda3447 PB |
191 | qemu_co_mutex_lock(&s->send_mutex); |
192 | s->send_coroutine = qemu_coroutine_self(); | |
ae255e52 PB |
193 | qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, |
194 | nbd_have_request, NULL, s); | |
195 | rc = nbd_send_request(s->sock, request); | |
196 | if (rc != -1 && iov) { | |
197 | ret = qemu_co_sendv(s->sock, iov, request->len, offset); | |
198 | if (ret != request->len) { | |
199 | errno = -EIO; | |
200 | rc = -1; | |
201 | } | |
202 | } | |
203 | qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, | |
204 | nbd_have_request, NULL, s); | |
ecda3447 PB |
205 | s->send_coroutine = NULL; |
206 | qemu_co_mutex_unlock(&s->send_mutex); | |
ae255e52 PB |
207 | return rc; |
208 | } | |
209 | ||
210 | static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request, | |
211 | struct nbd_reply *reply, | |
212 | struct iovec *iov, int offset) | |
213 | { | |
214 | int ret; | |
215 | ||
ecda3447 PB |
216 | /* Wait until we're woken up by the read handler. TODO: perhaps |
217 | * peek at the next reply and avoid yielding if it's ours? */ | |
ae255e52 PB |
218 | qemu_coroutine_yield(); |
219 | *reply = s->reply; | |
220 | if (reply->handle != request->handle) { | |
221 | reply->error = EIO; | |
222 | } else { | |
223 | if (iov && reply->error == 0) { | |
224 | ret = qemu_co_recvv(s->sock, iov, request->len, offset); | |
225 | if (ret != request->len) { | |
226 | reply->error = EIO; | |
227 | } | |
228 | } | |
229 | ||
230 | /* Tell the read handler to read another header. */ | |
231 | s->reply.handle = 0; | |
232 | } | |
233 | } | |
234 | ||
235 | static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request) | |
236 | { | |
ecda3447 PB |
237 | int i = HANDLE_TO_INDEX(s, request->handle); |
238 | s->recv_coroutine[i] = NULL; | |
239 | if (s->in_flight-- == MAX_NBD_REQUESTS) { | |
240 | qemu_co_mutex_unlock(&s->free_sema); | |
241 | } | |
ae255e52 PB |
242 | } |
243 | ||
33897dc7 NT |
244 | static int nbd_establish_connection(BlockDriverState *bs) |
245 | { | |
246 | BDRVNBDState *s = bs->opaque; | |
247 | int sock; | |
248 | int ret; | |
249 | off_t size; | |
250 | size_t blocksize; | |
75818250 | 251 | |
33897dc7 NT |
252 | if (s->host_spec[0] == '/') { |
253 | sock = unix_socket_outgoing(s->host_spec); | |
254 | } else { | |
255 | sock = tcp_socket_outgoing_spec(s->host_spec); | |
75818250 TS |
256 | } |
257 | ||
33897dc7 | 258 | /* Failed to establish connection */ |
1d45f8b5 | 259 | if (sock == -1) { |
33897dc7 NT |
260 | logout("Failed to establish connection to NBD server\n"); |
261 | return -errno; | |
1d45f8b5 | 262 | } |
75818250 | 263 | |
33897dc7 | 264 | /* NBD handshake */ |
b90fb4b8 | 265 | ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size, |
33897dc7 | 266 | &blocksize); |
1d45f8b5 | 267 | if (ret == -1) { |
33897dc7 NT |
268 | logout("Failed to negotiate with the NBD server\n"); |
269 | closesocket(sock); | |
270 | return -errno; | |
1d45f8b5 | 271 | } |
75818250 | 272 | |
ae255e52 PB |
273 | /* Now that we're connected, set the socket to be non-blocking and |
274 | * kick the reply mechanism. */ | |
33897dc7 | 275 | socket_set_nonblock(sock); |
ae255e52 PB |
276 | qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, |
277 | nbd_have_request, NULL, s); | |
33897dc7 | 278 | |
75818250 TS |
279 | s->sock = sock; |
280 | s->size = size; | |
281 | s->blocksize = blocksize; | |
282 | ||
33897dc7 NT |
283 | logout("Established connection with NBD server\n"); |
284 | return 0; | |
285 | } | |
286 | ||
287 | static void nbd_teardown_connection(BlockDriverState *bs) | |
288 | { | |
289 | BDRVNBDState *s = bs->opaque; | |
290 | struct nbd_request request; | |
291 | ||
292 | request.type = NBD_CMD_DISC; | |
33897dc7 NT |
293 | request.from = 0; |
294 | request.len = 0; | |
295 | nbd_send_request(s->sock, &request); | |
296 | ||
ae255e52 | 297 | qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL, NULL); |
33897dc7 NT |
298 | closesocket(s->sock); |
299 | } | |
300 | ||
301 | static int nbd_open(BlockDriverState *bs, const char* filename, int flags) | |
302 | { | |
303 | BDRVNBDState *s = bs->opaque; | |
304 | int result; | |
305 | ||
ecda3447 PB |
306 | qemu_co_mutex_init(&s->send_mutex); |
307 | qemu_co_mutex_init(&s->free_sema); | |
ae255e52 | 308 | |
33897dc7 NT |
309 | /* Pop the config into our state object. Exit if invalid. */ |
310 | result = nbd_config(s, filename, flags); | |
311 | if (result != 0) { | |
312 | return result; | |
313 | } | |
314 | ||
315 | /* establish TCP connection, return error if it fails | |
316 | * TODO: Configurable retry-until-timeout behaviour. | |
317 | */ | |
318 | result = nbd_establish_connection(bs); | |
319 | ||
320 | return result; | |
75818250 TS |
321 | } |
322 | ||
d9b09f13 PB |
323 | static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, |
324 | int nb_sectors, QEMUIOVector *qiov, | |
325 | int offset) | |
75818250 TS |
326 | { |
327 | BDRVNBDState *s = bs->opaque; | |
328 | struct nbd_request request; | |
329 | struct nbd_reply reply; | |
330 | ||
331 | request.type = NBD_CMD_READ; | |
3a93113a | 332 | request.from = sector_num * 512; |
75818250 TS |
333 | request.len = nb_sectors * 512; |
334 | ||
ae255e52 PB |
335 | nbd_coroutine_start(s, &request); |
336 | if (nbd_co_send_request(s, &request, NULL, 0) == -1) { | |
337 | reply.error = errno; | |
338 | } else { | |
d9b09f13 | 339 | nbd_co_receive_reply(s, &request, &reply, qiov->iov, offset); |
ae255e52 PB |
340 | } |
341 | nbd_coroutine_end(s, &request); | |
342 | return -reply.error; | |
75818250 | 343 | |
75818250 TS |
344 | } |
345 | ||
d9b09f13 PB |
346 | static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, |
347 | int nb_sectors, QEMUIOVector *qiov, | |
348 | int offset) | |
75818250 TS |
349 | { |
350 | BDRVNBDState *s = bs->opaque; | |
351 | struct nbd_request request; | |
352 | struct nbd_reply reply; | |
353 | ||
354 | request.type = NBD_CMD_WRITE; | |
2c7989a9 PB |
355 | if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) { |
356 | request.type |= NBD_CMD_FLAG_FUA; | |
357 | } | |
358 | ||
3a93113a | 359 | request.from = sector_num * 512; |
75818250 TS |
360 | request.len = nb_sectors * 512; |
361 | ||
ae255e52 | 362 | nbd_coroutine_start(s, &request); |
d9b09f13 | 363 | if (nbd_co_send_request(s, &request, qiov->iov, offset) == -1) { |
ae255e52 PB |
364 | reply.error = errno; |
365 | } else { | |
366 | nbd_co_receive_reply(s, &request, &reply, NULL, 0); | |
367 | } | |
368 | nbd_coroutine_end(s, &request); | |
369 | return -reply.error; | |
e183ef75 PB |
370 | } |
371 | ||
d9b09f13 PB |
372 | /* qemu-nbd has a limit of slightly less than 1M per request. Try to |
373 | * remain aligned to 4K. */ | |
374 | #define NBD_MAX_SECTORS 2040 | |
375 | ||
376 | static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, | |
377 | int nb_sectors, QEMUIOVector *qiov) | |
378 | { | |
379 | int offset = 0; | |
380 | int ret; | |
381 | while (nb_sectors > NBD_MAX_SECTORS) { | |
382 | ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); | |
383 | if (ret < 0) { | |
384 | return ret; | |
385 | } | |
386 | offset += NBD_MAX_SECTORS * 512; | |
387 | sector_num += NBD_MAX_SECTORS; | |
388 | nb_sectors -= NBD_MAX_SECTORS; | |
389 | } | |
390 | return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); | |
391 | } | |
392 | ||
393 | static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, | |
394 | int nb_sectors, QEMUIOVector *qiov) | |
395 | { | |
396 | int offset = 0; | |
397 | int ret; | |
398 | while (nb_sectors > NBD_MAX_SECTORS) { | |
399 | ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); | |
400 | if (ret < 0) { | |
401 | return ret; | |
402 | } | |
403 | offset += NBD_MAX_SECTORS * 512; | |
404 | sector_num += NBD_MAX_SECTORS; | |
405 | nb_sectors -= NBD_MAX_SECTORS; | |
406 | } | |
407 | return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset); | |
408 | } | |
409 | ||
1486d04a PB |
410 | static int nbd_co_flush(BlockDriverState *bs) |
411 | { | |
412 | BDRVNBDState *s = bs->opaque; | |
413 | struct nbd_request request; | |
414 | struct nbd_reply reply; | |
415 | ||
416 | if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) { | |
417 | return 0; | |
418 | } | |
419 | ||
420 | request.type = NBD_CMD_FLUSH; | |
421 | if (s->nbdflags & NBD_FLAG_SEND_FUA) { | |
422 | request.type |= NBD_CMD_FLAG_FUA; | |
423 | } | |
424 | ||
425 | request.from = 0; | |
426 | request.len = 0; | |
427 | ||
428 | nbd_coroutine_start(s, &request); | |
429 | if (nbd_co_send_request(s, &request, NULL, 0) == -1) { | |
430 | reply.error = errno; | |
431 | } else { | |
432 | nbd_co_receive_reply(s, &request, &reply, NULL, 0); | |
433 | } | |
434 | nbd_coroutine_end(s, &request); | |
435 | return -reply.error; | |
436 | } | |
437 | ||
7a706633 PB |
438 | static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, |
439 | int nb_sectors) | |
440 | { | |
441 | BDRVNBDState *s = bs->opaque; | |
442 | struct nbd_request request; | |
443 | struct nbd_reply reply; | |
444 | ||
445 | if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) { | |
446 | return 0; | |
447 | } | |
448 | request.type = NBD_CMD_TRIM; | |
449 | request.from = sector_num * 512;; | |
450 | request.len = nb_sectors * 512; | |
451 | ||
452 | nbd_coroutine_start(s, &request); | |
453 | if (nbd_co_send_request(s, &request, NULL, 0) == -1) { | |
454 | reply.error = errno; | |
455 | } else { | |
456 | nbd_co_receive_reply(s, &request, &reply, NULL, 0); | |
457 | } | |
458 | nbd_coroutine_end(s, &request); | |
459 | return -reply.error; | |
460 | } | |
461 | ||
75818250 TS |
462 | static void nbd_close(BlockDriverState *bs) |
463 | { | |
d2d979c6 | 464 | BDRVNBDState *s = bs->opaque; |
7267c094 AL |
465 | g_free(s->export_name); |
466 | g_free(s->host_spec); | |
d2d979c6 | 467 | |
33897dc7 | 468 | nbd_teardown_connection(bs); |
75818250 TS |
469 | } |
470 | ||
471 | static int64_t nbd_getlength(BlockDriverState *bs) | |
472 | { | |
473 | BDRVNBDState *s = bs->opaque; | |
474 | ||
475 | return s->size; | |
476 | } | |
477 | ||
/* Driver description registered by bdrv_nbd_init(); wires the block-layer
 * callbacks to the functions defined in this file. */
static BlockDriver bdrv_nbd = {
    .format_name         = "nbd",
    .instance_size       = sizeof(BDRVNBDState), /* size of bs->opaque */
    .bdrv_file_open      = nbd_open,
    .bdrv_co_readv       = nbd_co_readv,
    .bdrv_co_writev      = nbd_co_writev,
    .bdrv_close          = nbd_close,
    .bdrv_co_flush_to_os = nbd_co_flush,
    .bdrv_co_discard     = nbd_co_discard,
    .bdrv_getlength      = nbd_getlength,
    .protocol_name       = "nbd",
};
5efa9d5a AL |
490 | |
/* Register the NBD driver description with the block layer. */
static void bdrv_nbd_init(void)
{
    bdrv_register(&bdrv_nbd);
}

/* Hand bdrv_nbd_init to the block_init() module hook. */
block_init(bdrv_nbd_init);