/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <[email protected]>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block.h"
#include "qemu/queue.h"
#include "qemu/sockets.h"
#ifdef CONFIG_EPOLL
#include <sys/epoll.h>
#endif

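/* One AioHandler per registered file descriptor.  pfd holds the fd plus the
 * GLib-style event mask and poll results; is_external marks handlers that
 * aio_disable_external() can temporarily suspend.
 */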
struct AioHandler
{
    GPollFD pfd;
    IOHandler *io_read;
    IOHandler *io_write;
    int deleted;
    void *opaque;
    bool is_external;
    QLIST_ENTRY(AioHandler) node;
};

#ifdef CONFIG_EPOLL

/* The fd count threshold above which we switch to epoll */
#define EPOLL_ENABLE_THRESHOLD 64

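/* Stop using epoll for this context and mark it unavailable, so that
 * subsequent aio_poll() calls stick to the qemu_poll_ns() path.
 */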
static void aio_epoll_disable(AioContext *ctx)
{
    ctx->epoll_available = false;
    if (!ctx->epoll_enabled) {
        return;
    }
    ctx->epoll_enabled = false;
    close(ctx->epollfd);
}

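/* Translate a GLib GPollFD event mask into the equivalent EPOLL* bits. */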
static inline int epoll_events_from_pfd(int pfd_events)
{
    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
}

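/* Register every live handler with the epoll instance.  On the first
 * epoll_ctl() failure we report failure; the caller is expected to call
 * aio_epoll_disable() to discard the partially filled epoll set.
 */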
static bool aio_epoll_try_enable(AioContext *ctx)
{
    AioHandler *node;
    struct epoll_event event;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        int r;
        if (node->deleted || !node->pfd.events) {
            continue;
        }
        event.events = epoll_events_from_pfd(node->pfd.events);
        event.data.ptr = node;
        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
        if (r) {
            return false;
        }
    }
    ctx->epoll_enabled = true;
    return true;
}

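/* Propagate a handler change into the epoll set: add, modify or delete the
 * fd depending on whether it is new and whether it still has events.  Any
 * epoll_ctl() failure makes us fall back to the qemu_poll_ns() path.
 */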
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    struct epoll_event event;
    int r;

    if (!ctx->epoll_enabled) {
        return;
    }
    if (!node->pfd.events) {
        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
    } else {
        event.data.ptr = node;
        event.events = epoll_events_from_pfd(node->pfd.events);
        r = epoll_ctl(ctx->epollfd, is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD,
                      node->pfd.fd, &event);
    }
    if (r) {
        aio_epoll_disable(ctx);
    }
}

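/* Wait for events using epoll.  epoll_wait() only understands millisecond
 * timeouts, so for a positive timeout we first block in qemu_poll_ns() on
 * the epoll fd itself (nanosecond resolution) and call epoll_wait() only
 * once events are known to be pending; for timeouts of 0 and -1 the two
 * interfaces agree, so epoll_wait() can take the timeout directly.
 */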
static int aio_epoll(AioContext *ctx, GPollFD *pfds,
                     unsigned npfd, int64_t timeout)
{
    AioHandler *node;
    int i, ret = 0;
    struct epoll_event events[128];

    assert(npfd == 1);
    assert(pfds[0].fd == ctx->epollfd);
    if (timeout > 0) {
        ret = qemu_poll_ns(pfds, npfd, timeout);
    }
    if (timeout <= 0 || ret > 0) {
        ret = epoll_wait(ctx->epollfd, events,
                         sizeof(events) / sizeof(events[0]),
                         timeout);
        if (ret <= 0) {
            goto out;
        }
        for (i = 0; i < ret; i++) {
            int ev = events[i].events;
            node = events[i].data.ptr;
            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
                                (ev & EPOLLOUT ? G_IO_OUT : 0) |
                                (ev & EPOLLHUP ? G_IO_HUP : 0) |
                                (ev & EPOLLERR ? G_IO_ERR : 0);
        }
    }
out:
    return ret;
}

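/* The epoll set contains external handlers too and epoll_wait() cannot
 * filter them out per call, so ppoll (which rebuilds the fd list through
 * aio_node_check()) must be used while external clients are disabled.
 */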
static bool aio_epoll_enabled(AioContext *ctx)
{
    /* Fall back to ppoll when external clients are disabled. */
    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
}

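/* Decide whether this aio_poll() iteration can use epoll.  epoll is turned
 * on lazily, the first time an iteration sees at least
 * EPOLL_ENABLE_THRESHOLD pollfds.
 */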
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    if (!ctx->epoll_available) {
        return false;
    }
    if (aio_epoll_enabled(ctx)) {
        return true;
    }
    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
        if (aio_epoll_try_enable(ctx)) {
            return true;
        } else {
            aio_epoll_disable(ctx);
        }
    }
    return false;
}

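/* Stubs for builds without CONFIG_EPOLL: aio_epoll_check_poll() always
 * returns false, so aio_poll() sticks to qemu_poll_ns() and aio_epoll()
 * itself is unreachable.
 */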
#else

static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
}

static int aio_epoll(AioContext *ctx, GPollFD *pfds,
                     unsigned npfd, int64_t timeout)
{
    assert(false);
}

static bool aio_epoll_enabled(AioContext *ctx)
{
    return false;
}

static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
                                 unsigned npfd, int64_t timeout)
{
    return false;
}

#endif

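/* Return the registered, not-yet-deleted handler for fd, or NULL. */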
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd && !node->deleted) {
            return node;
        }
    }

    return NULL;
}

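/* Register, update or remove the read/write handlers for an fd.  Passing
 * NULL for both io_read and io_write unregisters the fd.  A hypothetical
 * caller (my_read_cb and my_state are illustrative names, not part of this
 * file) might do:
 *
 *     aio_set_fd_handler(ctx, fd, true, my_read_cb, NULL, my_state);
 *     ...
 *     aio_set_fd_handler(ctx, fd, true, NULL, NULL, NULL);
 *
 * Deletion is deferred while walking_handlers is non-zero because
 * aio_dispatch() may be iterating the handler list.
 */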
void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        bool is_external,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        void *opaque)
{
    AioHandler *node;
    bool is_new = false;
    bool deleted = false;

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write) {
        if (node == NULL) {
            /* The fd was never registered; there is nothing to delete and
             * aio_epoll_update() must not be called with a NULL node.
             */
            return;
        }

        g_source_remove_poll(&ctx->source, &node->pfd);

        /* Clear the events so that aio_epoll_update() drops the fd from
         * the epoll set.
         */
        node->pfd.events = 0;

        /* If the lock is held, just mark the node as deleted */
        if (ctx->walking_handlers) {
            node->deleted = 1;
            node->pfd.revents = 0;
        } else {
            /* Otherwise, delete it for real.  We can't just mark it as
             * deleted because deleted nodes are only cleaned up after
             * releasing the walking_handlers lock.  Defer the g_free()
             * until aio_epoll_update() has seen the node.
             */
            QLIST_REMOVE(node, node);
            deleted = true;
        }
    } else {
        if (node == NULL) {
            /* Alloc and insert if it's not already there */
            node = g_new0(AioHandler, 1);
            node->pfd.fd = fd;
            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);

            g_source_add_poll(&ctx->source, &node->pfd);
            is_new = true;
        }
        /* Update handler with latest information */
        node->io_read = io_read;
        node->io_write = io_write;
        node->opaque = opaque;
        node->is_external = is_external;

        node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
    }

    aio_epoll_update(ctx, node, is_new);
    aio_notify(ctx);
    if (deleted) {
        g_free(node);
    }
}

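/* Convenience wrapper around aio_set_fd_handler() for EventNotifiers,
 * which are only ever polled for reads.
 */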
void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            bool is_external,
                            EventNotifierHandler *io_read)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
                       is_external, (IOHandler *)io_read, NULL, notifier);
}

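/* Nothing to prepare before polling on POSIX hosts; only the win32
 * implementation has real work to do here.
 */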
bool aio_prepare(AioContext *ctx)
{
    return false;
}

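/* Return true if any handler has pending revents that aio_dispatch()
 * would act on.
 */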
bool aio_pending(AioContext *ctx)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        int revents;

        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
            return true;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
            return true;
        }
    }

    return false;
}

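/* Dispatch everything that is ready: queued bottom halves, fd handlers
 * whose revents were filled in by the previous poll, and expired timers.
 * Returns true if progress was made.
 */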
bool aio_dispatch(AioContext *ctx)
{
    AioHandler *node;
    bool progress = false;

    /*
     * If there are callbacks left that have been queued, we need to call them.
     * Do not call select in this case, because it is possible that the caller
     * does not need a complete flush (as is the case for aio_poll loops).
     */
    if (aio_bh_poll(ctx)) {
        progress = true;
    }

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    node = QLIST_FIRST(&ctx->aio_handlers);
    while (node) {
        AioHandler *tmp;
        int revents;

        ctx->walking_handlers++;

        revents = node->pfd.revents & node->pfd.events;
        node->pfd.revents = 0;

        if (!node->deleted &&
            (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
            node->io_read) {
            node->io_read(node->opaque);

            /* aio_notify() does not count as progress */
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }
        if (!node->deleted &&
            (revents & (G_IO_OUT | G_IO_ERR)) &&
            node->io_write) {
            node->io_write(node->opaque);
            progress = true;
        }

        tmp = node;
        node = QLIST_NEXT(node, node);

        ctx->walking_handlers--;

        if (!ctx->walking_handlers && tmp->deleted) {
            QLIST_REMOVE(tmp, node);
            g_free(tmp);
        }
    }

    /* Run our timers */
    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}

/* These thread-local variables are used only in a small part of aio_poll
 * around the call to the poll() system call.  In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
 *
 * Stack-allocated arrays would be perfect but they have size limitations;
 * heap allocation is expensive enough that we want to reuse arrays across
 * calls to aio_poll().  And because poll() has to be called without holding
 * any lock, the arrays cannot be stored in AioContext.  Thread-local data
 * has none of the disadvantages of these three options.
 */
static __thread GPollFD *pollfds;
static __thread AioHandler **nodes;
static __thread unsigned npfd, nalloc;
static __thread Notifier pollfds_cleanup_notifier;

static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);
    g_free(pollfds);
    g_free(nodes);
    nalloc = 0;
}

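/* Append one handler to the thread-local pollfds/nodes arrays, growing
 * both geometrically on demand; index i of pollfds corresponds to index i
 * of nodes, so revents can be copied back after polling.
 */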
static void add_pollfd(AioHandler *node)
{
    if (npfd == nalloc) {
        if (nalloc == 0) {
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        } else {
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }
    nodes[npfd] = node;
    pollfds[npfd] = (GPollFD) {
        .fd = node->pfd.fd,
        .events = node->pfd.events,
    };
    npfd++;
}

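/* Poll once: wait (up to aio_compute_timeout() when blocking) for fds to
 * become ready, then dispatch bottom halves, fd handlers and timers.
 * notify_me is raised around the wait so that aio_notify() knows it must
 * really kick the event notifier.  Returns true if progress was made.
 */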
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandler *node;
    int i, ret;
    bool progress;
    int64_t timeout;

    aio_context_acquire(ctx);
    progress = false;

    /* aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    if (blocking) {
        atomic_add(&ctx->notify_me, 2);
    }

    ctx->walking_handlers++;

    assert(npfd == 0);

    /* fill pollfds */
    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (!node->deleted && node->pfd.events
            && !aio_epoll_enabled(ctx)
            && aio_node_check(ctx, node->is_external)) {
            add_pollfd(node);
        }
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;

    /* wait until next event */
    if (timeout) {
        aio_context_release(ctx);
    }
    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
        AioHandler epoll_handler;

        epoll_handler.pfd.fd = ctx->epollfd;
        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
        npfd = 0;
        add_pollfd(&epoll_handler);
        ret = aio_epoll(ctx, pollfds, npfd, timeout);
    } else {
        ret = qemu_poll_ns(pollfds, npfd, timeout);
    }
    if (blocking) {
        atomic_sub(&ctx->notify_me, 2);
    }
    if (timeout) {
        aio_context_acquire(ctx);
    }

    aio_notify_accept(ctx);

    /* if we have any readable fds, dispatch event */
    if (ret > 0) {
        for (i = 0; i < npfd; i++) {
            nodes[i]->pfd.revents = pollfds[i].revents;
        }
    }

    npfd = 0;
    ctx->walking_handlers--;

    /* Run dispatch even if there were no readable fds to run timers */
    if (aio_dispatch(ctx)) {
        progress = true;
    }

    aio_context_release(ctx);

    return progress;
}

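/* One-time AioContext initialization.  With CONFIG_EPOLL this creates the
 * epoll instance; epoll_enabled stays false until the handler count first
 * crosses EPOLL_ENABLE_THRESHOLD.
 */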
void aio_context_setup(AioContext *ctx, Error **errp)
{
#ifdef CONFIG_EPOLL
    assert(!ctx->epollfd);
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
        ctx->epoll_available = false;
    } else {
        ctx->epoll_available = true;
    }
#endif
}