// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

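/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. The
 * probe buffer must be zeroed by the caller (enforced via memchr_inv());
 * on return, ->last_op holds the highest known opcode and each ops[]
 * entry has IO_URING_OP_SUPPORTED set if that opcode is implemented.
 */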
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
	}
	}

	return -EINVAL;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}