]>
Commit | Line | Data |
---|---|---|
5c6c3a6c CH |
1 | /* |
2 | * Linux native AIO support. | |
3 | * | |
4 | * Copyright (C) 2009 IBM, Corp. | |
5 | * Copyright (C) 2009 Red Hat, Inc. | |
6 | * | |
7 | * This work is licensed under the terms of the GNU GPL, version 2 or later. | |
8 | * See the COPYING file in the top-level directory. | |
9 | */ | |
80c71a24 | 10 | #include "qemu/osdep.h" |
5c6c3a6c | 11 | #include "qemu-common.h" |
737e150e | 12 | #include "block/aio.h" |
1de7afc9 | 13 | #include "qemu/queue.h" |
2174f12b | 14 | #include "block/block.h" |
9f8540ec | 15 | #include "block/raw-aio.h" |
1de7afc9 | 16 | #include "qemu/event_notifier.h" |
2174f12b | 17 | #include "qemu/coroutine.h" |
5c6c3a6c | 18 | |
5c6c3a6c CH |
19 | #include <libaio.h> |
20 | ||
21 | /* | |
22 | * Queue size (per-device). | |
23 | * | |
24 | * XXX: eventually we need to communicate this to the guest and/or make it | |
25 | * tunable by the guest. If we get more outstanding requests at a time | |
26 | * than this we will get EAGAIN from io_submit which is communicated to | |
27 | * the guest as an I/O error. | |
28 | */ | |
29 | #define MAX_EVENTS 128 | |
30 | ||
1b3abdcc ML |
31 | #define MAX_QUEUED_IO 128 |
32 | ||
/*
 * State for one in-flight Linux AIO request.
 *
 * A request is either coroutine-based (co != NULL, set up by laio_co_submit)
 * or callback-based (common.cb, set up by laio_submit).
 */
struct qemu_laiocb {
    BlockAIOCB common;        /* must be first: laio_cancel casts BlockAIOCB* */
    Coroutine *co;            /* coroutine to re-enter on completion, or NULL */
    LinuxAioState *ctx;       /* owning state (holds the kernel io_context_t) */
    struct iocb iocb;         /* kernel AIO control block passed to io_submit */
    ssize_t ret;              /* result: byte count, or negative errno;
                               * -EINPROGRESS while the request is in flight */
    size_t nbytes;            /* expected transfer size in bytes */
    QEMUIOVector *qiov;       /* guest buffers; used to zero-pad short reads */
    bool is_read;             /* short read => pad; short write => -ENOSPC */
    QSIMPLEQ_ENTRY(qemu_laiocb) next;  /* link in LinuxAioState.io_q.pending */
};
44 | ||
/*
 * Batch submission queue: requests accumulate on 'pending' while a plug
 * section is open and are pushed to the kernel in batched io_submit calls.
 */
typedef struct {
    int plugged;       /* non-zero while laio_io_plug holds submission back */
    unsigned int n;    /* number of requests currently on 'pending' */
    bool blocked;      /* io_submit could not take everything (e.g. EAGAIN);
                        * retried from the completion path */
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;  /* not-yet-submitted requests */
} LaioQueue;
51 | ||
/*
 * Per-instance Linux AIO engine state, created by laio_init() and bound to
 * an AioContext via laio_attach_aio_context().
 */
struct LinuxAioState {
    io_context_t ctx;    /* kernel AIO context (io_setup/io_destroy) */
    EventNotifier e;     /* eventfd the kernel signals on completion
                          * (wired up per-iocb with io_set_eventfd) */

    /* io queue for submit at batch */
    LaioQueue io_q;

    /* I/O completion processing */
    QEMUBH *completion_bh;              /* BH that drains 'events' below */
    struct io_event events[MAX_EVENTS]; /* batch fetched by io_getevents */
    int event_idx;                      /* next entry in 'events' to process */
    int event_max;                      /* number of valid entries fetched */
};
65 | ||
dd7f7ed1 | 66 | static void ioq_submit(LinuxAioState *s); |
28b24087 | 67 | |
5c6c3a6c CH |
68 | static inline ssize_t io_event_ret(struct io_event *ev) |
69 | { | |
70 | return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res); | |
71 | } | |
72 | ||
/*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            /* Full transfer: report success. */
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                    laiocb->qiov->size - ret);
            } else {
                /* A short write has no EOF excuse: report out of space. */
                ret = -ENOSPC;
            }
        }
    }

    laiocb->ret = ret;
    if (laiocb->co) {
        /* Coroutine request (laio_co_submit): wake the waiting coroutine,
         * which reads laiocb->ret after resuming. */
        qemu_coroutine_enter(laiocb->co, NULL);
    } else {
        /* Callback request (laio_submit): invoke the callback and drop the
         * reference taken by qemu_aio_get(). */
        laiocb->common.cb(laiocb->common.opaque, ret);
        qemu_aio_unref(laiocb);
    }
}
103 | ||
/* The completion BH fetches completed I/O requests and invokes their
 * callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
 * the completion events array and index are kept in LinuxAioState.  The BH
 * reschedules itself as long as there are completions pending so it will
 * either be called again in a nested event loop or will be called after all
 * events have been completed.  When there are no events left to complete, the
 * BH returns without rescheduling.
 */
static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    /* Fetch more completion events when empty */
    if (s->event_idx == s->event_max) {
        do {
            /* Zero timeout: poll without blocking, retrying on EINTR. */
            struct timespec ts = { 0 };
            s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS,
                                        s->events, &ts);
        } while (s->event_max == -EINTR);

        s->event_idx = 0;
        if (s->event_max <= 0) {
            s->event_max = 0;
            return; /* no more events */
        }
    }

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    /* Process completion events */
    while (s->event_idx < s->event_max) {
        struct iocb *iocb = s->events[s->event_idx].obj;
        struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
        /* Advance before the callback: it may re-enter via a nested loop. */
        s->event_idx++;

        qemu_laio_process_completion(laiocb);
    }

    /* Completion callbacks may have queued new requests (or the queue may
     * have been blocked on EAGAIN): flush unless a plug section is open. */
    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }

    /* Everything was drained in this invocation, so the reschedule above is
     * no longer needed; cancel it to avoid a spurious BH run. */
    qemu_bh_cancel(s->completion_bh);
}
155 | ||
156 | static void qemu_laio_completion_cb(EventNotifier *e) | |
157 | { | |
dd7f7ed1 | 158 | LinuxAioState *s = container_of(e, LinuxAioState, e); |
2cdff7f6 SH |
159 | |
160 | if (event_notifier_test_and_clear(&s->e)) { | |
ccb9dc10 | 161 | qemu_laio_completion_bh(s); |
5c6c3a6c CH |
162 | } |
163 | } | |
164 | ||
/*
 * BlockAIOCB .cancel_async hook: try to cancel an in-flight request.
 *
 * The result is forced to -ECANCELED either way; if io_cancel() fails, the
 * request still completes through the event loop, which then reports the
 * -ECANCELED value set here.
 */
static void laio_cancel(BlockAIOCB *blockacb)
{
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

    if (laiocb->ret != -EINPROGRESS) {
        /* Already completed (or already cancelled): nothing to do. */
        return;
    }
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
    laiocb->ret = -ECANCELED;
    if (ret != 0) {
        /* iocb is not cancelled, cb will be called by the event loop later */
        return;
    }

    /* Cancellation succeeded in the kernel: complete the request here. */
    laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
}
183 | ||
/* AIOCB vtable for callback-based requests allocated by laio_submit(). */
static const AIOCBInfo laio_aiocb_info = {
    .aiocb_size         = sizeof(struct qemu_laiocb),
    .cancel_async       = laio_cancel,
};
188 | ||
1b3abdcc ML |
189 | static void ioq_init(LaioQueue *io_q) |
190 | { | |
28b24087 | 191 | QSIMPLEQ_INIT(&io_q->pending); |
1b3abdcc | 192 | io_q->plugged = 0; |
8455ce05 | 193 | io_q->n = 0; |
43f2376e | 194 | io_q->blocked = false; |
1b3abdcc ML |
195 | } |
196 | ||
/*
 * Push pending requests to the kernel, batching up to MAX_QUEUED_IO iocbs
 * per io_submit() call.
 *
 * On EAGAIN or a partial submission the remaining requests stay on
 * io_q.pending and io_q.blocked is set so the completion path retries
 * later.  Any other io_submit() error aborts the process.
 */
static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_QUEUED_IO];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        /* Gather up to MAX_QUEUED_IO pending iocbs into a flat array. */
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (len == MAX_QUEUED_IO) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            /* Kernel queue full: leave the rest pending, mark blocked below. */
            break;
        }
        if (ret < 0) {
            abort();
        }

        /* Exactly the first 'ret' iocbs were accepted: drop them from the
         * pending list by splitting after the last accepted request. */
        s->io_q.n -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    s->io_q.blocked = (s->io_q.n > 0);
}
227 | ||
/* Open a plug section: laio_do_submit() will queue requests instead of
 * submitting them, until laio_io_unplug() is called.  Not re-entrant. */
void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
{
    assert(!s->io_q.plugged);
    s->io_q.plugged = 1;
}
233 | ||
dd7f7ed1 | 234 | void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s) |
1b3abdcc | 235 | { |
6b98bd64 PB |
236 | assert(s->io_q.plugged); |
237 | s->io_q.plugged = 0; | |
43f2376e | 238 | if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { |
de354644 | 239 | ioq_submit(s); |
1b3abdcc | 240 | } |
1b3abdcc ML |
241 | } |
242 | ||
/*
 * Prepare the iocb for a read or write and queue it for submission.
 *
 * Returns 0 on success or -EIO for an unsupported request type.  The
 * request is flushed to the kernel immediately unless submission is
 * currently blocked, or a plug section is open and fewer than
 * MAX_QUEUED_IO requests have accumulated.
 */
static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *iocbs = &laiocb->iocb;
    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* Currently Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                __func__, type);
        return -EIO;
    }
    /* Have the kernel signal completion through our eventfd. */
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.n++;
    if (!s->io_q.blocked &&
        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
        ioq_submit(s);
    }

    return 0;
}
274 | ||
/*
 * Coroutine interface: submit a read or write and yield until it completes.
 *
 * The ACB lives on this coroutine's stack; qemu_laio_process_completion()
 * re-enters the coroutine, after which laiocb.ret holds the final result
 * (0 on success, negative errno on failure).
 */
int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
                                uint64_t offset, QEMUIOVector *qiov, int type)
{
    int ret;
    struct qemu_laiocb laiocb = {
        .co         = qemu_coroutine_self(),
        .nbytes     = qiov->size,
        .ctx        = s,
        .is_read    = (type == QEMU_AIO_READ),
        .qiov       = qiov,
    };

    ret = laio_do_submit(fd, &laiocb, offset, type);
    if (ret < 0) {
        /* Submission failed synchronously; nothing will wake us. */
        return ret;
    }

    qemu_coroutine_yield();
    return laiocb.ret;
}
295 | ||
/*
 * Callback interface: allocate an ACB, submit a sector-addressed read or
 * write, and have cb(opaque, ret) invoked from the event loop on
 * completion.
 *
 * Returns the ACB, or NULL if the request could not be prepared (the ACB
 * is released before returning in that case).
 */
BlockAIOCB *laio_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_laiocb *laiocb;
    off_t offset = sector_num * BDRV_SECTOR_SIZE;
    int ret;

    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
    laiocb->nbytes = nb_sectors * BDRV_SECTOR_SIZE;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;  /* laio_cancel() checks for this sentinel */
    laiocb->is_read = (type == QEMU_AIO_READ);
    laiocb->qiov = qiov;

    ret = laio_do_submit(fd, laiocb, offset, type);
    if (ret < 0) {
        qemu_aio_unref(laiocb);
        return NULL;
    }

    return &laiocb->common;
}
319 | ||
/* Stop completion processing in old_context: unregister the eventfd handler,
 * then delete the completion BH (counterpart of laio_attach_aio_context). */
void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, false, NULL);
    qemu_bh_delete(s->completion_bh);
}
325 | ||
/* Start completion processing in new_context: create the completion BH and
 * register qemu_laio_completion_cb for the eventfd. */
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, false,
                           qemu_laio_completion_cb);
}
332 | ||
/*
 * Allocate and initialize a LinuxAioState.
 *
 * Returns NULL if the eventfd or the kernel AIO context cannot be created.
 * The caller must bind the state to an AioContext with
 * laio_attach_aio_context() before submitting requests, and release it
 * with laio_cleanup().
 */
LinuxAioState *laio_init(void)
{
    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    if (event_notifier_init(&s->e, false) < 0) {
        goto out_free_state;
    }

    if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

    /* goto-based unwind: release resources in reverse acquisition order. */
out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}
abd269b7 | 356 | |
/* Release a LinuxAioState: tear down the eventfd and the kernel AIO context,
 * then free the state.  Must be detached from any AioContext first. */
void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        /* Best effort on teardown: report the failure but keep going. */
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                __func__, &s->ctx);
    }
    g_free(s);
}