// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;
	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);

	if (copy_from_user(p, arg, size))
	if (memchr_inv(p, 0, size))

	p->last_op = IORING_OP_LAST - 1;
	for (i = 0; i < nr_args; i++) {
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}

	if (copy_to_user(arg, p, size))
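
/*
 * Example (illustrative sketch, not upstream code): userspace can probe
 * which opcodes the running kernel supports by issuing IORING_REGISTER_PROBE
 * against a ring fd. The probe buffer must be zeroed before the call;
 * 'ring_fd' is an assumed io_uring file descriptor.
 *
 *	struct io_uring_probe *probe;
 *	size_t len = sizeof(*probe) + IORING_OP_LAST * sizeof(probe->ops[0]);
 *
 *	probe = calloc(1, len);
 *	if (!probe)
 *		return -ENOMEM;
 *	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		    probe, IORING_OP_LAST) == 0 &&
 *	    (probe->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED))
 *		printf("IORING_OP_READ is supported\n");
 *	free(probe);
 */
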
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)

	res = memdup_user(arg, size);

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;

	/* Reset all restrictions if an error happened */
	if (ret)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
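
/*
 * Example (illustrative sketch, not upstream code): a ring created with
 * IORING_SETUP_R_DISABLED can be locked down before it is enabled. The
 * sketch restricts the ring to IORING_OP_READ/IORING_OP_WRITE plus the
 * buffer-update register op, then enables it. 'ring_fd' is an assumed
 * io_uring fd created with IORING_SETUP_R_DISABLED.
 *
 *	struct io_uring_restriction res[3] = {};
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READ;
 *	res[1].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[1].sqe_op = IORING_OP_WRITE;
 *	res[2].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[2].register_op = IORING_REGISTER_BUFFERS_UPDATE;
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_RESTRICTIONS, res, 3) < 0)
 *		return -errno;
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_ENABLE_RINGS, NULL, 0) < 0)
 *		return -errno;
 */
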
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
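
/*
 * Example (illustrative sketch, not upstream code): pinning a ring's io-wq
 * workers (and the SQPOLL thread, if the ring has one) to CPUs 0-1 from
 * userspace. The kernel copies in a raw bitmap, so a cpu_set_t works as the
 * source buffer; 'ring_fd' is an assumed io_uring fd owned by the caller.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_IOWQ_AFF,
 *		    &mask, sizeof(mask)) < 0)
 *		return -errno;
 */
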
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		/*
		 * Observe the correct sqd->lock -> ctx->uring_lock
		 * ordering. Fine to drop uring_lock here, we hold
		 * a ref to the ctx.
		 */
		refcount_inc(&sqd->refs);
		mutex_unlock(&ctx->uring_lock);
		mutex_lock(&sqd->lock);
		mutex_lock(&ctx->uring_lock);
		tctx = sqd->thread->io_uring;
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}
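
/*
 * Example (illustrative sketch, not upstream code): capping the unbounded
 * io-wq worker pool of a ring at 4 while leaving the bounded pool alone.
 * Index 0 is the bounded pool, index 1 the unbounded one; a value of 0
 * leaves that limit unchanged, and the previous limits are written back
 * into the array. 'ring_fd' is an assumed io_uring fd.
 *
 *	unsigned int counts[2] = { 0, 4 };
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2) < 0)
 *		return -errno;
 */
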
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}
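
/*
 * Example (illustrative sketch, not upstream code): switching the clock the
 * ring uses for its wait timeouts to CLOCK_BOOTTIME, so that time spent in
 * suspend appears to count against the timeout; 'ring_fd' is an assumed
 * io_uring fd.
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK,
 *		    &reg, 0) < 0)
 *		return -errno;
 */
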
/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	unsigned short n_ring_pages;
	unsigned short n_sqe_pages;
	struct page **ring_pages;
	struct page **sqe_pages;
	struct io_uring_sqe *sq_sqes;
	struct io_rings *rings;
};
static void io_register_free_rings(struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
				true);
		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
				true);
	} else {
		io_pages_free(&r->ring_pages, r->n_ring_pages);
		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
	}
}
#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	struct io_uring_params p;
	unsigned i, tail;
	void *ptr;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
	if (copy_from_user(&p, arg, sizeof(p)))
	if (p.flags & ~RESIZE_FLAGS)

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)

	if (!(p.flags & IORING_SETUP_NO_MMAP))
		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
	else
		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
					 p.cq_off.user_addr, size);
	if (IS_ERR(n.rings))
		return PTR_ERR(n.rings);

	n.rings->sq_ring_mask = p.sq_entries - 1;
	n.rings->cq_ring_mask = p.cq_entries - 1;
	n.rings->sq_ring_entries = p.sq_entries;
	n.rings->cq_ring_entries = p.cq_entries;
	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(&p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(&p, &n);
		return -EOVERFLOW;
	}

	if (!(p.flags & IORING_SETUP_NO_MMAP))
		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
	else
		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
				     p.sq_off.user_addr, size);
	if (IS_ERR(ptr)) {
		io_register_free_rings(&p, &n);
		return PTR_ERR(ptr);
	}

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to go grab the
	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->resize_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	o.sq_sqes = ctx->sq_sqes;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = o.rings->sq.tail;
	if (tail - o.rings->sq.head > p.sq_entries)
		goto overflow;
	for (i = o.rings->sq.head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & n.rings->sq_ring_mask;

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	n.rings->sq.head = o.rings->sq.head;
	n.rings->sq.tail = o.rings->sq.tail;

	tail = o.rings->cq.tail;
	if (tail - o.rings->cq.head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = o.rings->cq.head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & n.rings->cq_ring_mask;

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	n.rings->cq.head = o.rings->cq.head;
	n.rings->cq.tail = o.rings->cq.tail;
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	n.rings->sq_dropped = o.rings->sq_dropped;
	n.rings->sq_flags = o.rings->sq_flags;
	n.rings->cq_flags = o.rings->cq_flags;
	n.rings->cq_overflow = o.rings->cq_overflow;

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, n_ring_pages);
	swap_old(ctx, o, n, n_sqe_pages);
	swap_old(ctx, o, n, ring_pages);
	swap_old(ctx, o, n, sqe_pages);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->resize_lock);
	io_register_free_rings(&p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);
	return ret;
}
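
/*
 * Example (illustrative sketch, not upstream code): growing a ring in place.
 * Only the entry counts (plus CQSIZE/CLAMP) may be requested; flags such as
 * SQE128/CQE32/NO_MMAP are inherited from the existing ring. 'ring_fd' is an
 * assumed io_uring fd whose submitter task is the caller.
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 512,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *
 *	if (syscall(__NR_io_uring_register, ring_fd,
 *		    IORING_REGISTER_RESIZE_RINGS, &p, 1) < 0)
 *		return -errno;
 */
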
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &ctx->param_region, &rd);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}
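
/*
 * Example (illustrative sketch, not upstream code): registering a
 * user-provided memory region to be used as the extended wait-argument area.
 * The ring is assumed to still be disabled (IORING_SETUP_R_DISABLED) and
 * 'ring_fd' to be its fd; a single page of user memory backs the region.
 *
 *	void *mem;
 *	struct io_uring_region_desc rd = {};
 *	struct io_uring_mem_region_reg mr = {};
 *
 *	if (posix_memalign(&mem, 4096, 4096))
 *		return -ENOMEM;
 *	rd.user_addr = (__u64)(uintptr_t)mem;
 *	rd.size = 4096;
 *	rd.flags = IORING_MEM_REGION_TYPE_USER;
 *	mr.region_uptr = (__u64)(uintptr_t)&rd;
 *	mr.flags = IORING_MEM_REGION_REG_WAIT_ARG;
 *
 *	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_MEM_REGION,
 *		    &mr, 1) < 0)
 *		return -errno;
 */
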
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}
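
/*
 * Example (illustrative sketch, not upstream code): the fdtable lookup on
 * every io_uring_register() call can be avoided by first registering the
 * ring fd itself, then passing the registered index with the
 * IORING_REGISTER_USE_REGISTERED_RING flag ORed into the opcode. 'ring_fd'
 * is an assumed io_uring fd; offset -1U asks the kernel to pick a free slot,
 * which is written back into 'up.offset'.
 *
 *	struct io_uring_rsrc_update up = {
 *		.offset = -1U,
 *		.data = ring_fd,
 *	};
 *
 *	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RING_FDS,
 *		    &up, 1) != 1)
 *		return -errno;
 *	syscall(__NR_io_uring_register, up.offset,
 *		IORING_UNREGISTER_IOWQ_AFF | IORING_REGISTER_USE_REGISTERED_RING,
 *		NULL, 0);
 */
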
859 * "blind" registration opcodes are ones where there's no ring given, and
860 * hence the source fd must be -1.
862 static int io_uring_register_blind(unsigned int opcode, void __user *arg,
863 unsigned int nr_args)
866 case IORING_REGISTER_SEND_MSG_RING: {
867 struct io_uring_sqe sqe;
869 if (!arg || nr_args != 1)
871 if (copy_from_user(&sqe, arg, sizeof(sqe)))
873 /* no flags supported */
876 if (sqe.opcode == IORING_OP_MSG_RING)
877 return io_uring_sync_msg_ring(&sqe);
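
/*
 * Example (illustrative sketch, not upstream code): posting a message CQE to
 * another task's ring without owning a ring, via the "blind" opcode and
 * fd == -1. 'target_fd' is an assumed io_uring fd received e.g. over
 * SCM_RIGHTS; sqe.off carries the user_data value delivered in the target
 * CQE.
 *
 *	struct io_uring_sqe sqe = {};
 *
 *	sqe.opcode = IORING_OP_MSG_RING;
 *	sqe.fd = target_fd;
 *	sqe.off = 0x1234;
 *	if (syscall(__NR_io_uring_register, -1, IORING_REGISTER_SEND_MSG_RING,
 *		    &sqe, 1) < 0)
 *		return -errno;
 */
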
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	struct file *file;
	long ret;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	if (!use_registered_ring)
		fput(file);
	return ret;
}