/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 * tunable by the guest. If we get more outstanding requests at a time
 * than this, we will get EAGAIN from io_submit, which is communicated to
 * the guest as an I/O error.
 */
#define MAX_EVENTS 128

struct qemu_laiocb {
    BlockAIOCB common;
    Coroutine *co;
    LinuxAioState *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
    QEMUIOVector *qiov;
    bool is_read;
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

typedef struct {
    int plugged;
    unsigned int in_queue;
    unsigned int in_flight;
    bool blocked;
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

struct LinuxAioState {
    AioContext *aio_context;

    io_context_t ctx;
    EventNotifier e;

    /* I/O queue for batched submission */
    LaioQueue io_q;

    /* I/O completion processing */
    QEMUBH *completion_bh;
    int event_idx;
    int event_max;
};

static void ioq_submit(LinuxAioState *s);

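/*
 * Reassemble the completion status of an io_event: ev->res2 is treated as
 * the high 32 bits and ev->res as the low 32 bits of a single ssize_t,
 * i.e. a byte count on success or a negative errno on failure.
 */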
static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}

/*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
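/*
 * In short: a request that transferred exactly nbytes completes with 0;
 * a short read is treated as hitting end-of-file and the tail of the
 * buffer is zero-filled; a short write is reported as -ENOSPC.
 */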
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                                  laiocb->qiov->size - ret);
            } else {
                ret = -ENOSPC;
            }
        }
    }

    laiocb->ret = ret;
    if (laiocb->co) {
        /* If the coroutine is already entered it must be in ioq_submit() and
         * will notice laiocb->ret has been filled in when it eventually runs
         * later. Coroutines cannot be entered recursively so avoid doing
         * that!
         */
        if (!qemu_coroutine_entered(laiocb->co)) {
            qemu_coroutine_enter(laiocb->co);
        }
    } else {
        laiocb->common.cb(laiocb->common.opaque, ret);
        qemu_aio_unref(laiocb);
    }
}

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; a common header does not exist,
 * but AIO has been around for ages, so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned id;    /* kernel internal index number */
    unsigned nr;    /* number of io_events */
    unsigned head;  /* Written to by userland or by kernel. */
    unsigned tail;

    unsigned magic;
    unsigned compat_features;
    unsigned incompat_features;
    unsigned header_length; /* size of aio_ring */

    struct io_event io_events[0];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer to the events array (output value)
 *
 * Returns the number of completed events and sets a pointer
 * to the events array. This function does not update the internal
 * ring buffer; it only reads head and tail. When @events has been
 * processed, io_getevents_commit() must be called.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of s->events[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

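/*
 * Worked example (illustrative, numbers assumed): with ring->nr == 128,
 * head == 120 and tail == 8, io_getevents_peek() returns the 8 events at
 * indices 120..127. The events that wrapped around to indices 0..7 are
 * only returned by a further peek, once io_getevents_commit() has
 * advanced head past the end of the buffer ((120 + 8) % 128 == 0).
 */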
/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;

    if (nr) {
        ring->head = (ring->head + nr) % ring->nr;
    }
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: pointer to the events array (output value)
 * @nr: the number of events by which head should be advanced
 *
 * Advances the head of the ring buffer and returns the number of
 * elements left.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}

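/*
 * Taken together, the helpers above form a drain loop of this shape
 * (minimal sketch; qemu_laio_process_completions() below is the real
 * instance of the pattern):
 *
 *     unsigned int nr = 0;
 *     struct io_event *events;
 *
 *     while ((nr = io_getevents_advance_and_peek(ctx, &events, nr))) {
 *         ... process events[0] .. events[nr - 1] ...
 *     }
 *
 * Each iteration commits the batch processed by the previous one and
 * then peeks at the next contiguous batch.
 */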
/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll(). In order to do this,
 * indices are kept in LinuxAioState. The function schedules the completion BH
 * so that it can be called again from a nested event loop; when there are no
 * events left to complete, the BH is canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested. */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested, we have to notify the level above that we are done
     * by setting event_max to zero; the upper level will then jump out of
     * its own `for` loop. If we are the last level, all counters have
     * dropped to zero. */
    s->event_max = 0;
    s->event_idx = 0;
}

static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    qemu_laio_process_completions(s);
    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_laio_process_completions_and_submit(s);
    }
}

static void laio_cancel(BlockAIOCB *blockacb)
{
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

    if (laiocb->ret != -EINPROGRESS) {
        return;
    }
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
    laiocb->ret = -ECANCELED;
    if (ret != 0) {
        /* iocb is not cancelled, cb will be called by the event loop later */
        return;
    }

    laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
}

static const AIOCBInfo laio_aiocb_info = {
    .aiocb_size   = sizeof(struct qemu_laiocb),
    .cancel_async = laio_cancel,
};

static void ioq_init(LaioQueue *io_q)
{
    QSIMPLEQ_INIT(&io_q->pending);
    io_q->plugged = 0;
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    io_q->blocked = false;
}

static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something right away if there are
         * still requests in flight. */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0). We do not attempt
         * to repeat submission in order to avoid an I/O hang. The reason is
         * simple: s->e is still set, the completion callback will be invoked
         * shortly, and all pending requests will be submitted from there.
         */
    }
}

void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
    s->io_q.plugged++;
}

void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
{
    assert(s->io_q.plugged);
    if (--s->io_q.plugged == 0 &&
        !s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

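/*
 * Plug/unplug usage (minimal sketch; "bs", "s", "fd", and the per-request
 * arrays "sector", "qiov", "nb_sectors" plus "cb"/"opaque" are assumed to
 * be provided by the caller):
 *
 *     laio_io_plug(bs, s);
 *     for (i = 0; i < n; i++) {
 *         laio_submit(bs, s, fd, sector[i], qiov[i], nb_sectors[i],
 *                     cb, opaque, QEMU_AIO_READ);
 *     }
 *     laio_io_unplug(bs, s);
 *
 * While plugged, requests accumulate in io_q.pending (until the in-flight
 * plus queued count reaches the MAX_EVENTS batch limit) and are flushed
 * to the kernel in batches on the final laio_io_unplug().
 */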
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *iocbs = &laiocb->iocb;
    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* Currently the Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                __func__, type);
        return -EIO;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;
    if (!s->io_q.blocked &&
        (!s->io_q.plugged ||
         s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
        ioq_submit(s);
    }

    return 0;
}

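/*
 * Note on the submission policy above: a prepared request is queued on
 * io_q.pending and pushed to the kernel immediately, unless submission is
 * currently blocked (a previous io_submit() could not accept the whole
 * queue) or the caller holds a plug and the queue has not yet reached the
 * MAX_EVENTS batch limit.
 */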
int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                                uint64_t offset, QEMUIOVector *qiov, int type)
{
    int ret;
    struct qemu_laiocb laiocb = {
        .co      = qemu_coroutine_self(),
        .nbytes  = qiov->size,
        .ctx     = s,
        .ret     = -EINPROGRESS,
        .is_read = (type == QEMU_AIO_READ),
        .qiov    = qiov,
    };

    ret = laio_do_submit(fd, &laiocb, offset, type);
    if (ret < 0) {
        return ret;
    }

    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return laiocb.ret;
}

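/*
 * Coroutine usage (minimal sketch; "fd", "offset" and "qiov" are assumed
 * to be prepared by the caller):
 *
 *     ret = laio_co_submit(bs, s, fd, offset, qiov, QEMU_AIO_READ);
 *     if (ret < 0) {
 *         ... handle submission or I/O error ...
 *     }
 *
 * The call yields until the request completes, unless the request has
 * already completed during submission (laiocb.ret filled in before the
 * yield).
 */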
BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
                        BlockCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_laiocb *laiocb;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;
    int ret;

    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
    laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;
    laiocb->is_read = (type == QEMU_AIO_READ);
    laiocb->qiov = qiov;

    ret = laio_do_submit(fd, laiocb, offset, type);
    if (ret < 0) {
        qemu_aio_unref(laiocb);
        return NULL;
    }

    return &laiocb->common;
}

void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, false, NULL);
    qemu_bh_delete(s->completion_bh);
}

void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb);
}

LinuxAioState *laio_init(void)
{
    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    if (event_notifier_init(&s->e, false) < 0) {
        goto out_free_state;
    }

    if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}

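/*
 * Lifecycle sketch (illustrative; error handling elided and "ctx" assumed
 * to be the caller's AioContext). Note that laio_init() returns NULL on
 * failure:
 *
 *     LinuxAioState *s = laio_init();
 *     laio_attach_aio_context(s, ctx);
 *     ... submit I/O via laio_submit() / laio_co_submit() ...
 *     laio_detach_aio_context(s, ctx);
 *     laio_cleanup(s);
 */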
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                __func__, &s->ctx);
    }
    g_free(s);
}