/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu-common.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 * tunable by the guest.  If we get more outstanding requests at a time
 * than this we will get EAGAIN from io_submit which is communicated to
 * the guest as an I/O error.
 */
#define MAX_EVENTS 128

#define MAX_QUEUED_IO 128

struct qemu_laiocb {
    BlockAIOCB common;
    struct qemu_laio_state *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
    QEMUIOVector *qiov;
    bool is_read;
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

typedef struct {
    int plugged;
    unsigned int n;
    bool blocked;
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

struct qemu_laio_state {
    io_context_t ctx;
    EventNotifier e;

    /* I/O queue for batched submission */
    LaioQueue io_q;

    /* I/O completion processing */
    QEMUBH *completion_bh;
    struct io_event events[MAX_EVENTS];
    int event_idx;
    int event_max;
};

static void ioq_submit(struct qemu_laio_state *s);

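/*
 * A completion's status arrives split across the io_event: the helper
 * below treats ev->res as the low 32 bits and ev->res2 as the high 32
 * bits of one 64-bit value, yielding the byte count on success or a
 * negative errno on failure.
 */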
static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}

/*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
static void qemu_laio_process_completion(struct qemu_laio_state *s,
                                         struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                                  laiocb->qiov->size - ret);
            } else {
                ret = -EINVAL;
            }
        }
    }
    laiocb->common.cb(laiocb->common.opaque, ret);

    qemu_aio_unref(laiocb);
}

/* The completion BH fetches completed I/O requests and invokes their
 * callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll().  In order to do this,
 * the completion events array and index are kept in qemu_laio_state.  The BH
 * reschedules itself as long as there are completions pending so it will
 * either be called again in a nested event loop or will be called after all
 * events have been completed.  When there are no events left to complete, the
 * BH returns without rescheduling.
 */
static void qemu_laio_completion_bh(void *opaque)
{
    struct qemu_laio_state *s = opaque;

    /* Fetch more completion events when empty */
    if (s->event_idx == s->event_max) {
        do {
            struct timespec ts = { 0 };
            s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS,
                                        s->events, &ts);
        } while (s->event_max == -EINTR);

        s->event_idx = 0;
        if (s->event_max <= 0) {
            s->event_max = 0;
            return; /* no more events */
        }
    }

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    /* Process completion events */
    while (s->event_idx < s->event_max) {
        struct iocb *iocb = s->events[s->event_idx].obj;
        struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
        s->event_idx++;

        qemu_laio_process_completion(s, laiocb);
    }

    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

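/*
 * Kernel-side completions signal the EventNotifier (an eventfd) that was
 * attached to each iocb with io_set_eventfd().  The test-and-clear below
 * coalesces any number of pending completions into a single BH run.
 */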
static void qemu_laio_completion_cb(EventNotifier *e)
{
    struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_bh_schedule(s->completion_bh);
    }
}

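/*
 * Cancellation is asynchronous: io_cancel() typically fails for requests
 * the kernel has already picked up.  On success the completion callback
 * is invoked here with -ECANCELED; otherwise the request keeps running
 * and the event loop delivers its result later.
 */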
static void laio_cancel(BlockAIOCB *blockacb)
{
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

    if (laiocb->ret != -EINPROGRESS) {
        return;
    }
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
    laiocb->ret = -ECANCELED;
    if (ret != 0) {
        /* iocb is not cancelled, cb will be called by the event loop later */
        return;
    }

    laiocb->common.cb(laiocb->common.opaque, laiocb->ret);
}

static const AIOCBInfo laio_aiocb_info = {
    .aiocb_size         = sizeof(struct qemu_laiocb),
    .cancel_async       = laio_cancel,
};

static void ioq_init(LaioQueue *io_q)
{
    QSIMPLEQ_INIT(&io_q->pending);
    io_q->plugged = 0;
    io_q->n = 0;
    io_q->blocked = false;
}

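/*
 * Drain the pending queue with as few io_submit() calls as possible.
 * io_submit() consumes iocbs from the front of the array and returns how
 * many it accepted, so on a partial submission the queue is split after
 * the last accepted request and the remainder is retried.  -EAGAIN (the
 * kernel ring is full) leaves the rest queued and marks the queue blocked
 * until completions free up space.
 */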
static void ioq_submit(struct qemu_laio_state *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_QUEUED_IO];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (len == MAX_QUEUED_IO) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            abort();
        }

        s->io_q.n -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    s->io_q.blocked = (s->io_q.n > 0);
}

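/*
 * Plug/unplug batches submissions: while at least one plug is outstanding,
 * laio_submit() only queues requests (flushing early once MAX_QUEUED_IO
 * accumulate), and the final unplug submits them in one go.  A caller
 * would typically bracket a burst of requests, e.g. (sketch, not taken
 * from this file):
 *
 *     laio_io_plug(bs, aio_ctx);
 *     for (i = 0; i < n; i++) {
 *         laio_submit(bs, aio_ctx, fd, sector[i], qiov[i], nb[i],
 *                     cb, opaque, QEMU_AIO_WRITE);
 *     }
 *     laio_io_unplug(bs, aio_ctx, true);
 */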
void laio_io_plug(BlockDriverState *bs, void *aio_ctx)
{
    struct qemu_laio_state *s = aio_ctx;

    s->io_q.plugged++;
}

void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug)
{
    struct qemu_laio_state *s = aio_ctx;

    assert(s->io_q.plugged > 0 || !unplug);

    if (unplug && --s->io_q.plugged > 0) {
        return;
    }

    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

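/*
 * Offsets are expressed in 512-byte sectors, matching the rest of the
 * block layer; the request is queued and possibly submitted immediately,
 * and cb(opaque, ret) fires from the event loop once it completes.
 */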
BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_laio_state *s = aio_ctx;
    struct qemu_laiocb *laiocb;
    struct iocb *iocbs;
    off_t offset = sector_num * 512;

    laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque);
    laiocb->nbytes = nb_sectors * 512;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;
    laiocb->is_read = (type == QEMU_AIO_READ);
    laiocb->qiov = qiov;

    iocbs = &laiocb->iocb;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* The Linux kernel does not currently support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                __func__, type);
        goto out_free_aiocb;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.n++;
    if (!s->io_q.blocked &&
        (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
        ioq_submit(s);
    }
    return &laiocb->common;

out_free_aiocb:
    qemu_aio_unref(laiocb);
    return NULL;
}

void laio_detach_aio_context(void *s_, AioContext *old_context)
{
    struct qemu_laio_state *s = s_;

    aio_set_event_notifier(old_context, &s->e, NULL);
    qemu_bh_delete(s->completion_bh);
}

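/*
 * The detach/attach pair lets a block device migrate between AioContexts
 * (e.g. to an IOThread): detach unregisters the notifier and BH from the
 * old context, attach recreates them in the new one.
 */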
void laio_attach_aio_context(void *s_, AioContext *new_context)
{
    struct qemu_laio_state *s = s_;

    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb);
}

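/*
 * Typical lifecycle of this state, as driven by its caller (sketch, not
 * taken from this file):
 *
 *     void *s = laio_init();                // eventfd + io_setup()
 *     laio_attach_aio_context(s, ctx);      // register with an event loop
 *     ... laio_submit(...) / completions ...
 *     laio_detach_aio_context(s, ctx);
 *     laio_cleanup(s);                      // io_destroy() + free
 */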
void *laio_init(void)
{
    struct qemu_laio_state *s;

    s = g_malloc0(sizeof(*s));
    if (event_notifier_init(&s->e, false) < 0) {
        goto out_free_state;
    }

    if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}

void laio_cleanup(void *s_)
{
    struct qemu_laio_state *s = s_;

    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                __func__, &s->ctx);
    }
    g_free(s);
}