// SPDX-License-Identifier: GPL-2.0
/*
 * Contains the core associated with submission side polling of the SQ
 * ring, offloading submissions from the application to a kernel thread.
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "napi.h"
#include "sqpoll.h"

#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
#define IORING_TW_CAP_ENTRIES_VALUE 8

enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
	IO_SQ_THREAD_SHOULD_PARK,
};

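/*
 * Drop this caller's park request: clear the PARK bit, then re-set it if
 * other parkers are still pending so their requests aren't lost, and
 * release sqd->lock so the SQPOLL thread can resume.
 */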
void io_sq_thread_unpark(struct io_sq_data *sqd)
	__releases(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);

	/*
	 * Do the dance but not conditional clear_bit() because it'd race with
	 * other threads incrementing park_pending and setting the bit.
	 */
	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	if (atomic_dec_return(&sqd->park_pending))
		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_unlock(&sqd->lock);
}

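/*
 * Request that the SQPOLL thread parks: note the pending park, set the PARK
 * bit, take sqd->lock and wake the thread. The caller returns holding
 * sqd->lock, which keeps the thread parked until io_sq_thread_unpark().
 */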
void io_sq_thread_park(struct io_sq_data *sqd)
	__acquires(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);

	atomic_inc(&sqd->park_pending);
	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_lock(&sqd->lock);
	if (sqd->thread)
		wake_up_process(sqd->thread);
}

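/*
 * Tell the SQPOLL thread to exit and wait for it to complete sqd->exited.
 */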
void io_sq_thread_stop(struct io_sq_data *sqd)
{
	WARN_ON_ONCE(sqd->thread == current);
	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));

	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
	mutex_lock(&sqd->lock);
	if (sqd->thread)
		wake_up_process(sqd->thread);
	mutex_unlock(&sqd->lock);
	wait_for_completion(&sqd->exited);
}

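/*
 * Drop a reference to the io_sq_data; the final put stops the SQPOLL thread
 * and frees the structure.
 */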
void io_put_sq_data(struct io_sq_data *sqd)
{
	if (refcount_dec_and_test(&sqd->refs)) {
		WARN_ON_ONCE(atomic_read(&sqd->park_pending));

		io_sq_thread_stop(sqd);
		kfree(sqd);
	}
}

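/* Recompute the idle timeout as the maximum across all attached rings. */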
static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
	struct io_ring_ctx *ctx;
	unsigned sq_thread_idle = 0;

	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
		sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
	sqd->sq_thread_idle = sq_thread_idle;
}

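/*
 * Detach a ring from its io_sq_data: unlink it from the sqd context list
 * with the thread parked, refresh the idle timeout and drop the reference.
 */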
void io_sq_thread_finish(struct io_ring_ctx *ctx)
{
	struct io_sq_data *sqd = ctx->sq_data;

	if (sqd) {
		io_sq_thread_park(sqd);
		list_del_init(&ctx->sqd_list);
		io_sqd_update_thread_idle(sqd);
		io_sq_thread_unpark(sqd);

		io_put_sq_data(sqd);
		ctx->sq_data = NULL;
	}
}

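/*
 * For IORING_SETUP_ATTACH_WQ, look up the io_sq_data of the ring referenced
 * by p->wq_fd and grab a reference to it. Only rings belonging to the same
 * thread group can be attached to.
 */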
static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx_attach;
	struct io_sq_data *sqd;
	struct fd f;

	f = fdget(p->wq_fd);
	if (!f.file)
		return ERR_PTR(-ENXIO);
	if (!io_is_uring_fops(f.file)) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	ctx_attach = f.file->private_data;
	sqd = ctx_attach->sq_data;
	if (!sqd) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}
	if (sqd->task_tgid != current->tgid) {
		fdput(f);
		return ERR_PTR(-EPERM);
	}

	refcount_inc(&sqd->refs);
	fdput(f);
	return sqd;
}

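/*
 * Find the io_sq_data for this ring: attach to an existing one if
 * IORING_SETUP_ATTACH_WQ allows it, otherwise allocate and initialize a new
 * one.
 */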
static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
					 bool *attached)
{
	struct io_sq_data *sqd;

	*attached = false;
	if (p->flags & IORING_SETUP_ATTACH_WQ) {
		sqd = io_attach_sq_data(p);
		if (!IS_ERR(sqd)) {
			*attached = true;
			return sqd;
		}
		/* fall through for EPERM case, setup new sqd/task */
		if (PTR_ERR(sqd) != -EPERM)
			return sqd;
	}

	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
	if (!sqd)
		return ERR_PTR(-ENOMEM);

	atomic_set(&sqd->park_pending, 0);
	refcount_set(&sqd->refs, 1);
	INIT_LIST_HEAD(&sqd->ctx_list);
	mutex_init(&sqd->lock);
	init_waitqueue_head(&sqd->wait);
	init_completion(&sqd->exited);
	return sqd;
}

static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
{
	return READ_ONCE(sqd->state);
}

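/*
 * One submission pass for a single ring: reap IOPOLL completions, submit
 * pending SQEs under the ring's credentials, do any NAPI busy polling and
 * wake waiters blocked on SQ ring space. The return value tells the caller
 * whether any work was done.
 */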
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
	unsigned int to_submit;
	int ret = 0;

	to_submit = io_sqring_entries(ctx);
	/* if we're handling multiple rings, cap submit size for fairness */
	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
		to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;

	if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
		const struct cred *creds = NULL;

		if (ctx->sq_creds != current_cred())
			creds = override_creds(ctx->sq_creds);

		mutex_lock(&ctx->uring_lock);
		if (!wq_list_empty(&ctx->iopoll_list))
			io_do_iopoll(ctx, true);

		/*
		 * Don't submit if refs are dying, good for io_uring_register(),
		 * but also it is relied upon by io_ring_exit_work()
		 */
		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
		    !(ctx->flags & IORING_SETUP_R_DISABLED))
			ret = io_submit_sqes(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);

		if (io_napi(ctx))
			ret += io_napi_sqpoll_busy_poll(ctx);

		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
			wake_up(&ctx->sqo_sq_wait);
		if (creds)
			revert_creds(creds);
	}

	return ret;
}

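/*
 * Handle park/stop requests and pending signals. Dropping and re-taking
 * sqd->lock is what actually parks the thread: a parker holds the lock, so
 * the re-lock blocks until io_sq_thread_unpark(). Returns true if the
 * thread should exit.
 */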
static bool io_sqd_handle_event(struct io_sq_data *sqd)
{
	bool did_sig = false;
	struct ksignal ksig;

	if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
	    signal_pending(current)) {
		mutex_unlock(&sqd->lock);
		if (signal_pending(current))
			did_sig = get_signal(&ksig);
		cond_resched();
		mutex_lock(&sqd->lock);
		sqd->sq_cpu = raw_smp_processor_id();
	}
	return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}

/*
 * Run task_work, processing the retry_list first. The retry_list holds
 * entries that we passed on in the previous run, if we had more task_work
 * than we were asked to process. Newly queued task_work isn't run until the
 * retry list has been fully processed.
 */
static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries)
{
	struct io_uring_task *tctx = current->io_uring;
	unsigned int count = 0;

	if (*retry_list) {
		*retry_list = io_handle_tw_list(*retry_list, &count, max_entries);
		if (count >= max_entries)
			goto out;
		max_entries -= count;
	}
	*retry_list = tctx_task_work_run(tctx, max_entries, &count);
out:
	if (task_work_pending(current))
		task_work_run();
	return count;
}

static bool io_sq_tw_pending(struct llist_node *retry_list)
{
	struct io_uring_task *tctx = current->io_uring;

	return retry_list || !llist_empty(&tctx->task_list);
}

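/*
 * Account the system CPU time spent in this busy period to sqd->work_time,
 * in microseconds.
 */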
static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
{
	struct rusage end;

	getrusage(current, RUSAGE_SELF, &end);
	end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
	end.ru_stime.tv_usec -= start->ru_stime.tv_usec;

	sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
}

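/*
 * Main loop of the SQPOLL kernel thread. While any attached ring has work,
 * keep submitting SQEs and running task_work; once everything has been idle
 * for sq_thread_idle, set IORING_SQ_NEED_WAKEUP and sleep until the
 * application wakes the thread again.
 */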
static int io_sq_thread(void *data)
{
	struct llist_node *retry_list = NULL;
	struct io_sq_data *sqd = data;
	struct io_ring_ctx *ctx;
	struct rusage start;
	unsigned long timeout = 0;
	char buf[TASK_COMM_LEN];
	DEFINE_WAIT(wait);

	/* offload context creation failed, just exit */
	if (!current->io_uring)
		goto err_out;

	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
	set_task_comm(current, buf);

	/* reset to our pid after we've set task_comm, for fdinfo */
	sqd->task_pid = current->pid;

	if (sqd->sq_cpu != -1) {
		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
	} else {
		set_cpus_allowed_ptr(current, cpu_online_mask);
		sqd->sq_cpu = raw_smp_processor_id();
	}

	/*
	 * Force audit context to get setup, in case we do prep side async
	 * operations that would trigger an audit call before any issue side
	 * audit has been done.
	 */
	audit_uring_entry(IORING_OP_NOP);
	audit_uring_exit(true, 0);

	mutex_lock(&sqd->lock);
	while (1) {
		bool cap_entries, sqt_spin = false;

		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
			if (io_sqd_handle_event(sqd))
				break;
			timeout = jiffies + sqd->sq_thread_idle;
		}

		cap_entries = !list_is_singular(&sqd->ctx_list);
		getrusage(current, RUSAGE_SELF, &start);
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			int ret = __io_sq_thread(ctx, cap_entries);

			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
				sqt_spin = true;
		}
		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
			sqt_spin = true;

		if (sqt_spin || !time_after(jiffies, timeout)) {
			if (sqt_spin) {
				io_sq_update_worktime(sqd, &start);
				timeout = jiffies + sqd->sq_thread_idle;
			}
			if (unlikely(need_resched())) {
				mutex_unlock(&sqd->lock);
				cond_resched();
				mutex_lock(&sqd->lock);
				sqd->sq_cpu = raw_smp_processor_id();
			}
			continue;
		}

		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
		if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) {
			bool needs_sched = true;

			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
				atomic_or(IORING_SQ_NEED_WAKEUP,
					  &ctx->rings->sq_flags);
				if ((ctx->flags & IORING_SETUP_IOPOLL) &&
				    !wq_list_empty(&ctx->iopoll_list)) {
					needs_sched = false;
					break;
				}

				/*
				 * Ensure the store of the wakeup flag is not
				 * reordered with the load of the SQ tail
				 */
				smp_mb__after_atomic();

				if (io_sqring_entries(ctx)) {
					needs_sched = false;
					break;
				}
			}

			if (needs_sched) {
				mutex_unlock(&sqd->lock);
				schedule();
				mutex_lock(&sqd->lock);
				sqd->sq_cpu = raw_smp_processor_id();
			}
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				atomic_andnot(IORING_SQ_NEED_WAKEUP,
					      &ctx->rings->sq_flags);
		}

		finish_wait(&sqd->wait, &wait);
		timeout = jiffies + sqd->sq_thread_idle;
	}

	if (retry_list)
		io_sq_tw(&retry_list, UINT_MAX);

	io_uring_cancel_generic(true, sqd);
	sqd->thread = NULL;
	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
		atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
	io_run_task_work();
	mutex_unlock(&sqd->lock);
err_out:
	complete(&sqd->exited);
	do_exit(0);
}

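/*
 * Wait until the SQPOLL thread has made room in the SQ ring, or a signal is
 * pending; used when the SQ ring is found full at submission time.
 */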
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
	DEFINE_WAIT(wait);

	do {
		if (!io_sqring_full(ctx))
			break;
		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);

		if (!io_sqring_full(ctx))
			break;
		schedule();
	} while (!signal_pending(current));

	finish_wait(&ctx->sqo_sq_wait, &wait);
}

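/*
 * Set up SQPOLL offload for a new ring: validate attach requests, create or
 * attach to an io_sq_data, apply the idle timeout and CPU affinity from the
 * setup parameters, and start the SQPOLL thread when a new sqd was created.
 */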
__cold int io_sq_offload_create(struct io_ring_ctx *ctx,
				struct io_uring_params *p)
{
	int ret;

	/* Retain compatibility with failing for an invalid attach attempt */
	if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
				IORING_SETUP_ATTACH_WQ) {
		struct fd f;

		f = fdget(p->wq_fd);
		if (!f.file)
			return -ENXIO;
		if (!io_is_uring_fops(f.file)) {
			fdput(f);
			return -EINVAL;
		}
		fdput(f);
	}
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		struct task_struct *tsk;
		struct io_sq_data *sqd;
		bool attached;

		ret = security_uring_sqpoll();
		if (ret)
			return ret;

		sqd = io_get_sq_data(p, &attached);
		if (IS_ERR(sqd)) {
			ret = PTR_ERR(sqd);
			goto err;
		}

		ctx->sq_creds = get_current_cred();
		ctx->sq_data = sqd;
		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
		if (!ctx->sq_thread_idle)
			ctx->sq_thread_idle = HZ;

		io_sq_thread_park(sqd);
		list_add(&ctx->sqd_list, &sqd->ctx_list);
		io_sqd_update_thread_idle(sqd);
		/* don't attach to a dying SQPOLL thread, would be racy */
		ret = (attached && !sqd->thread) ? -ENXIO : 0;
		io_sq_thread_unpark(sqd);

		if (ret < 0)
			goto err;
		if (attached)
			return 0;

		if (p->flags & IORING_SETUP_SQ_AFF) {
			int cpu = p->sq_thread_cpu;

			ret = -EINVAL;
			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
				goto err_sqpoll;
			sqd->sq_cpu = cpu;
		} else {
			sqd->sq_cpu = -1;
		}

		sqd->task_pid = current->pid;
		sqd->task_tgid = current->tgid;
		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
		if (IS_ERR(tsk)) {
			ret = PTR_ERR(tsk);
			goto err_sqpoll;
		}

		sqd->thread = tsk;
		ret = io_uring_alloc_task_context(tsk, ctx);
		wake_up_new_task(tsk);
		if (ret)
			goto err;
	} else if (p->flags & IORING_SETUP_SQ_AFF) {
		/* Can't have SQ_AFF without SQPOLL */
		ret = -EINVAL;
		goto err;
	}

	return 0;
err_sqpoll:
	complete(&ctx->sq_data->exited);
err:
	io_sq_thread_finish(ctx);
	return ret;
}

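/*
 * Update the io-wq CPU affinity of the task backing this ring's SQPOLL
 * thread, with the thread parked to avoid racing with its exit.
 */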
__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
				     cpumask_var_t mask)
{
	struct io_sq_data *sqd = ctx->sq_data;
	int ret = -EINVAL;

	if (sqd) {
		io_sq_thread_park(sqd);
		/* Don't set affinity for a dying thread */
		if (sqd->thread)
			ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
		io_sq_thread_unpark(sqd);
	}

	return ret;
}