Merge tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux
author Linus Torvalds <[email protected]>
Wed, 1 Nov 2023 21:09:19 +0000 (11:09 -1000)
committer Linus Torvalds <[email protected]>
Wed, 1 Nov 2023 21:09:19 +0000 (11:09 -1000)
Pull io_uring updates from Jens Axboe:
 "This contains the core io_uring updates, of which there are not many,
  and adds support for using WAITID through io_uring and hence not
  needing to block on these kinds of events.

  Outside of that, tweaks to the legacy provided buffer handling and
  some cleanups related to cancelations for uring_cmd support"
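
As a userspace illustration of the new IORING_OP_WAITID support (a hedged
sketch, not part of this merge; it assumes a liburing release that ships
io_uring_prep_waitid() for this opcode), a child can be reaped through the
ring instead of blocking in waitid(2):

#define _GNU_SOURCE
#include <liburing.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	siginfo_t si = { 0 };
	pid_t pid;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	pid = fork();
	if (pid == 0) {
		usleep(10000);
		_exit(42);
	}

	/* queue an asynchronous waitid; the CQE posts when the child exits */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_waitid(sqe, P_PID, pid, &si, WEXITED, 0);
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	printf("waitid res=%d, child exit status=%d\n", cqe->res, si.si_status);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}

The siginfo is filled in just as waitid(2) would fill it; only the wait
itself is queued on the ring rather than blocking the submitting task.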

* tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux:
  io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups
  io_uring/kbuf: Use slab for struct io_buffer objects
  io_uring/kbuf: Allow the full buffer id space for provided buffers
  io_uring/kbuf: Fix check of BID wrapping in provided buffers
  io_uring/rsrc: cleanup io_pin_pages()
  io_uring: cancelable uring_cmd
  io_uring: retain top 8bits of uring_cmd flags for kernel internal use
  io_uring: add IORING_OP_WAITID support
  exit: add internal include file with helpers
  exit: add kernel_waitid_prepare() helper
  exit: move core of do_wait() into helper
  exit: abstract out should_wake helper for child_wait_callback()
  io_uring/rw: add support for IORING_OP_READ_MULTISHOT
  io_uring/rw: mark readv/writev as vectored in the opcode definition
  io_uring/rw: split io_read() into a helper

io_uring/io_uring.c
io_uring/io_uring.h
io_uring/kbuf.c
io_uring/rw.c

diff --combined io_uring/io_uring.c
index 8d1bc6cdfe712e75638ddf99c2f9ebac2d32d1f5,b9e1af5772f3b663bc115475f115a9a69eb00bad..36ae5ac2b070bf0201fd98bd79001f2850690a51
@@@ -92,6 -92,7 +92,7 @@@
  #include "cancel.h"
  #include "net.h"
  #include "notif.h"
+ #include "waitid.h"
  
  #include "timeout.h"
  #include "poll.h"
@@@ -338,7 -339,6 +339,6 @@@ static __cold struct io_ring_ctx *io_ri
        spin_lock_init(&ctx->completion_lock);
        spin_lock_init(&ctx->timeout_lock);
        INIT_WQ_LIST(&ctx->iopoll_list);
-       INIT_LIST_HEAD(&ctx->io_buffers_pages);
        INIT_LIST_HEAD(&ctx->io_buffers_comp);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
        INIT_LIST_HEAD(&ctx->tctx_list);
        ctx->submit_state.free_list.next = NULL;
        INIT_WQ_LIST(&ctx->locked_free_list);
+       INIT_HLIST_HEAD(&ctx->waitid_list);
        INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
        INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+       INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
        return ctx;
  err:
        kfree(ctx->cancel_table.hbs);
@@@ -2674,11 -2676,7 +2676,11 @@@ static void io_pages_free(struct page *
  
        if (!pages)
                return;
 +
        page_array = *pages;
 +      if (!page_array)
 +              return;
 +
        for (i = 0; i < npages; i++)
                unpin_user_page(page_array[i]);
        kvfree(page_array);
@@@ -2690,7 -2688,7 +2692,7 @@@ static void *__io_uaddr_map(struct pag
  {
        struct page **page_array;
        unsigned int nr_pages;
 -      int ret;
 +      int ret, i;
  
        *npages = 0;
  
         */
        if (page_array[0] != page_array[ret - 1])
                goto err;
 +
 +      /*
 +       * Can't support mapping user allocated ring memory on 32-bit archs
 +       * where it could potentially reside in highmem. Just fail those with
 +       * -EINVAL, just like we did on kernels that didn't support this
 +       * feature.
 +       */
 +      for (i = 0; i < nr_pages; i++) {
 +              if (PageHighMem(page_array[i])) {
 +                      ret = -EINVAL;
 +                      goto err;
 +              }
 +      }
 +
        *pages = page_array;
        *npages = nr_pages;
        return page_to_virt(page_array[0]);
@@@ -2762,9 -2746,7 +2764,9 @@@ static void io_rings_free(struct io_rin
                ctx->sq_sqes = NULL;
        } else {
                io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
 +              ctx->n_ring_pages = 0;
                io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
 +              ctx->n_sqe_pages = 0;
        }
  }
  
@@@ -3276,6 -3258,37 +3278,37 @@@ static __cold bool io_uring_try_cancel_
        return ret;
  }
  
+ static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+               struct task_struct *task, bool cancel_all)
+ {
+       struct hlist_node *tmp;
+       struct io_kiocb *req;
+       bool ret = false;
+
+       lockdep_assert_held(&ctx->uring_lock);
+
+       hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+                       hash_node) {
+               struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+                               struct io_uring_cmd);
+               struct file *file = req->file;
+
+               if (!cancel_all && req->task != task)
+                       continue;
+
+               if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+                       /* ->sqe isn't available if no async data */
+                       if (!req_has_async_data(req))
+                               cmd->sqe = NULL;
+                       file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
+                       ret = true;
+               }
+       }
+       io_submit_flush_completions(ctx);
+       return ret;
+ }
+
  static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                                struct task_struct *task,
                                                bool cancel_all)
        ret |= io_cancel_defer_files(ctx, task, cancel_all);
        mutex_lock(&ctx->uring_lock);
        ret |= io_poll_remove_all(ctx, task, cancel_all);
+       ret |= io_waitid_remove_all(ctx, task, cancel_all);
+       ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
        mutex_unlock(&ctx->uring_lock);
        ret |= io_kill_timeouts(ctx, task, cancel_all);
        if (task)
@@@ -4686,6 -4701,9 +4721,9 @@@ static int __init io_uring_init(void
  
        BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
  
+       /* top 8bits are for internal use */
+       BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+
        io_uring_optable_init();
  
        /*
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
                                offsetof(struct io_kiocb, cmd.data),
                                sizeof_field(struct io_kiocb, cmd.data), NULL);
+       io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
+                                         SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+                                         NULL);
  
  #ifdef CONFIG_SYSCTL
        register_sysctl_init("kernel", kernel_io_uring_disabled_table);
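
The cancelation pass added above only invokes ->uring_cmd(cmd, IO_URING_F_CANCEL)
for commands a driver has marked cancelable. A rough, hypothetical driver-side
sketch of that opt-in (mydrv_uring_cmd is illustrative and the completion path is
driver specific; it assumes the io_uring_cmd_mark_cancelable() and
io_uring_cmd_done() helpers from this series):

#include <linux/errno.h>
#include <linux/io_uring.h>

/* hypothetical ->uring_cmd() handler of a driver's char device */
static int mydrv_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	if (issue_flags & IO_URING_F_CANCEL) {
		/*
		 * io_uring is canceling (e.g. on ring exit): complete the
		 * command that was queued and marked cancelable earlier.
		 */
		io_uring_cmd_done(cmd, -ECANCELED, 0, issue_flags);
		return 0;
	}

	/* the driver would queue the command internally here, then opt in */
	io_uring_cmd_mark_cancelable(cmd, issue_flags);
	return -EIOCBQUEUED;
}

On ring teardown, io_uring_try_cancel_uring_cmd() above walks
ctx->cancelable_uring_cmd and calls this handler with IO_URING_F_CANCEL set.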
diff --combined io_uring/io_uring.h
index 0bc145614a6e66ae2d2ec0933fa7165e11f74229,2ff719ae1b57774956221302981e8f3ff6952b5b..dc6d779b452b9b0e233a74c7c49e0d9760840b20
@@@ -86,33 -86,20 +86,33 @@@ bool __io_alloc_req_refill(struct io_ri
  bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
                        bool cancel_all);
  
 -#define io_lockdep_assert_cq_locked(ctx)                              \
 -      do {                                                            \
 -              lockdep_assert(in_task());                              \
 -                                                                      \
 -              if (ctx->flags & IORING_SETUP_IOPOLL) {                 \
 -                      lockdep_assert_held(&ctx->uring_lock);          \
 -              } else if (!ctx->task_complete) {                       \
 -                      lockdep_assert_held(&ctx->completion_lock);     \
 -              } else if (ctx->submitter_task->flags & PF_EXITING) {   \
 -                      lockdep_assert(current_work());                 \
 -              } else {                                                \
 -                      lockdep_assert(current == ctx->submitter_task); \
 -              }                                                       \
 -      } while (0)
 +#if defined(CONFIG_PROVE_LOCKING)
 +static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 +{
 +      lockdep_assert(in_task());
 +
 +      if (ctx->flags & IORING_SETUP_IOPOLL) {
 +              lockdep_assert_held(&ctx->uring_lock);
 +      } else if (!ctx->task_complete) {
 +              lockdep_assert_held(&ctx->completion_lock);
 +      } else if (ctx->submitter_task) {
 +              /*
 +               * ->submitter_task may be NULL and we can still post a CQE,
 +               * if the ring has been setup with IORING_SETUP_R_DISABLED.
 +               * Not from an SQE, as those cannot be submitted, but via
 +               * updating tagged resources.
 +               */
 +              if (ctx->submitter_task->flags & PF_EXITING)
 +                      lockdep_assert(current_work());
 +              else
 +                      lockdep_assert(current == ctx->submitter_task);
 +      }
 +}
 +#else
 +static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 +{
 +}
 +#endif
  
  static inline void io_req_task_work_add(struct io_kiocb *req)
  {
@@@ -343,6 -330,7 +343,7 @@@ static inline bool io_req_cache_empty(s
  }
  
  extern struct kmem_cache *req_cachep;
+ extern struct kmem_cache *io_buf_cachep;
  
  static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
  {
diff --combined io_uring/kbuf.c
index 9123138aa9f48b1acc216f08b70870e095523632,d5a04467666f7bdae4900a1c776bf9685725d03f..fea06810b43dbb19a62624908a81d71b9b5bf1e2
  
  #define BGID_ARRAY    64
  
+ /* BIDs are addressed by a 16-bit field in a CQE */
+ #define MAX_BIDS_PER_BGID (1 << 16)
+
+ struct kmem_cache *io_buf_cachep;
+
  struct io_provide_buf {
        struct file                     *file;
        __u64                           addr;
        __u32                           len;
        __u32                           bgid;
-       __u16                           nbufs;
+       __u32                           nbufs;
        __u16                           bid;
  };
  
@@@ -255,6 -260,8 +260,8 @@@ static int __io_remove_buffers(struct i
  void io_destroy_buffers(struct io_ring_ctx *ctx)
  {
        struct io_buffer_list *bl;
+       struct list_head *item, *tmp;
+       struct io_buffer *buf;
        unsigned long index;
        int i;
  
                kfree(bl);
        }
  
-       while (!list_empty(&ctx->io_buffers_pages)) {
-               struct page *page;
-               page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
-               list_del_init(&page->lru);
-               __free_page(page);
+       list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
+               buf = list_entry(item, struct io_buffer, list);
+               kmem_cache_free(io_buf_cachep, buf);
        }
  }
  
@@@ -289,7 -293,7 +293,7 @@@ int io_remove_buffers_prep(struct io_ki
                return -EINVAL;
  
        tmp = READ_ONCE(sqe->fd);
-       if (!tmp || tmp > USHRT_MAX)
+       if (!tmp || tmp > MAX_BIDS_PER_BGID)
                return -EINVAL;
  
        memset(p, 0, sizeof(*p));
@@@ -332,7 -336,7 +336,7 @@@ int io_provide_buffers_prep(struct io_k
                return -EINVAL;
  
        tmp = READ_ONCE(sqe->fd);
-       if (!tmp || tmp > USHRT_MAX)
+       if (!tmp || tmp > MAX_BIDS_PER_BGID)
                return -E2BIG;
        p->nbufs = tmp;
        p->addr = READ_ONCE(sqe->addr);
        tmp = READ_ONCE(sqe->off);
        if (tmp > USHRT_MAX)
                return -E2BIG;
-       if (tmp + p->nbufs >= USHRT_MAX)
+       if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
                return -EINVAL;
        p->bid = tmp;
        return 0;
  }
  
+ #define IO_BUFFER_ALLOC_BATCH 64
+
  static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
  {
-       struct io_buffer *buf;
-       struct page *page;
-       int bufs_in_page;
+       struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
+       int allocated;
  
        /*
         * Completions that don't happen inline (eg not under uring_lock) will
  
        /*
         * No free buffers and no completion entries either. Allocate a new
-        * page worth of buffer entries and add those to our freelist.
+        * batch of buffer entries and add those to our freelist.
         */
-       page = alloc_page(GFP_KERNEL_ACCOUNT);
-       if (!page)
-               return -ENOMEM;
-       list_add(&page->lru, &ctx->io_buffers_pages);
  
-       buf = page_address(page);
-       bufs_in_page = PAGE_SIZE / sizeof(*buf);
-       while (bufs_in_page) {
-               list_add_tail(&buf->list, &ctx->io_buffers_cache);
-               buf++;
-               bufs_in_page--;
+       allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
+                                         ARRAY_SIZE(bufs), (void **) bufs);
+       if (unlikely(!allocated)) {
+               /*
+                * Bulk alloc is all-or-nothing. If we fail to get a batch,
+                * retry single alloc to be on the safe side.
+                */
+               bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
+               if (!bufs[0])
+                       return -ENOMEM;
+               allocated = 1;
        }
  
+       while (allocated)
+               list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
        return 0;
  }
  
@@@ -477,7 -485,7 +485,7 @@@ static int io_pin_pbuf_ring(struct io_u
  {
        struct io_uring_buf_ring *br;
        struct page **pages;
 -      int nr_pages;
 +      int i, nr_pages;
  
        pages = io_pin_pages(reg->ring_addr,
                             flex_array_size(br, bufs, reg->ring_entries),
        if (IS_ERR(pages))
                return PTR_ERR(pages);
  
 +      /*
 +       * Apparently some 32-bit boxes (ARM) will return highmem pages,
 +       * which then need to be mapped. We could support that, but it'd
 +       * complicate the code and slowdown the common cases quite a bit.
 +       * So just error out, returning -EINVAL just like we did on kernels
 +       * that didn't support mapped buffer rings.
 +       */
 +      for (i = 0; i < nr_pages; i++)
 +              if (PageHighMem(pages[i]))
 +                      goto error_unpin;
 +
        br = page_address(pages[0]);
  #ifdef SHM_COLOUR
        /*
         * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
         * this transparently.
         */
 -      if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
 -              int i;
 -
 -              for (i = 0; i < nr_pages; i++)
 -                      unpin_user_page(pages[i]);
 -              return -EINVAL;
 -      }
 +      if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
 +              goto error_unpin;
  #endif
        bl->buf_pages = pages;
        bl->buf_nr_pages = nr_pages;
        bl->is_mapped = 1;
        bl->is_mmap = 0;
        return 0;
 +error_unpin:
 +      for (i = 0; i < nr_pages; i++)
 +              unpin_user_page(pages[i]);
 +      kvfree(pages);
 +      return -EINVAL;
  }
  
  static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
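
The kbuf.c changes above widen nbufs and switch the range checks to
MAX_BIDS_PER_BGID, so a single IORING_OP_PROVIDE_BUFFERS request can now cover
the full 16-bit BID space of a group. A hedged userspace sketch (assumes
liburing's io_uring_prep_provide_buffers() helper; the group id and buffer size
are arbitrary, and base must point to NR_BUFS * BUF_SIZE bytes):

#include <liburing.h>

#define NR_BUFS   65536   /* full BID space: bids 0..65535 */
#define BUF_SIZE  4096
#define BGID      1

static int provide_full_group(struct io_uring *ring, void *base)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	/* hand NR_BUFS buffers of BUF_SIZE each to group BGID, starting at bid 0 */
	io_uring_prep_provide_buffers(sqe, base, BUF_SIZE, NR_BUFS, BGID, 0);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe(ring, &cqe);
	if (!ret) {
		ret = cqe->res;   /* 0 on success, -errno on failure */
		io_uring_cqe_seen(ring, cqe);
	}
	return ret;
}

Before this series the USHRT_MAX checks and the 16-bit nbufs field prevented
covering the full space in one request.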
diff --combined io_uring/rw.c
index 8f68d5ad4564fe7b971f9f0c977276179a713faa,ec0cc38ea6824a2a8b6f29841cd08ead84137e6a..3398e1d944c2615f557162bbe0cca7b718a4c1dc
@@@ -123,6 -123,22 +123,22 @@@ int io_prep_rw(struct io_kiocb *req, co
        return 0;
  }
  
+ /*
+  * Multishot read is prepared just like a normal read/write request, only
+  * difference is that we set the MULTISHOT flag.
+  */
+ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+ {
+       int ret;
+
+       ret = io_prep_rw(req, sqe);
+       if (unlikely(ret))
+               return ret;
+
+       req->flags |= REQ_F_APOLL_MULTISHOT;
+       return 0;
+ }
+
  void io_readv_writev_cleanup(struct io_kiocb *req)
  {
        struct io_async_rw *io = req->async_data;
@@@ -339,7 -355,7 +355,7 @@@ static int kiocb_done(struct io_kiocb *
        struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
        unsigned final_ret = io_fixup_rw_res(req, ret);
  
 -      if (req->flags & REQ_F_CUR_POS)
 +      if (ret >= 0 && req->flags & REQ_F_CUR_POS)
                req->file->f_pos = rw->kiocb.ki_pos;
        if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
                if (!__io_complete_rw_common(req, ret)) {
@@@ -388,8 -404,7 +404,7 @@@ static struct iovec *__io_import_iovec(
        buf = u64_to_user_ptr(rw->addr);
        sqe_len = rw->len;
  
-       if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
-           (req->flags & REQ_F_BUFFER_SELECT)) {
+       if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
                if (io_do_buffer_select(req)) {
                        buf = io_buffer_select(req, &sqe_len, issue_flags);
                        if (!buf)
@@@ -708,7 -723,7 +723,7 @@@ static int io_rw_init_file(struct io_ki
        return 0;
  }
  
- int io_read(struct io_kiocb *req, unsigned int issue_flags)
+ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
  {
        struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
        struct io_rw_state __s, *s = &__s;
  
        if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                req->flags &= ~REQ_F_REISSUE;
-               /* if we can poll, just do that */
-               if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+               /*
+                * If we can poll, just do that. For a vectored read, we'll
+                * need to copy state first.
+                */
+               if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
                        return -EAGAIN;
                /* IOPOLL retry should happen for io-wq threads */
                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@@ -853,7 -871,69 +871,69 @@@ done
        /* it's faster to check here then delegate to kfree */
        if (iovec)
                kfree(iovec);
-       return kiocb_done(req, ret, issue_flags);
+       return ret;
+ }
+
+ int io_read(struct io_kiocb *req, unsigned int issue_flags)
+ {
+       int ret;
+
+       ret = __io_read(req, issue_flags);
+       if (ret >= 0)
+               return kiocb_done(req, ret, issue_flags);
+
+       return ret;
+ }
+
+ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
+ {
+       unsigned int cflags = 0;
+       int ret;
+
+       /*
+        * Multishot MUST be used on a pollable file
+        */
+       if (!file_can_poll(req->file))
+               return -EBADFD;
+
+       ret = __io_read(req, issue_flags);
+
+       /*
+        * If we get -EAGAIN, recycle our buffer and just let normal poll
+        * handling arm it.
+        */
+       if (ret == -EAGAIN) {
+               io_kbuf_recycle(req, issue_flags);
+               return -EAGAIN;
+       }
+
+       /*
+        * Any successful return value will keep the multishot read armed.
+        */
+       if (ret > 0) {
+               /*
+                * Put our buffer and post a CQE. If we fail to post a CQE, then
+                * jump to the termination path. This request is then done.
+                */
+               cflags = io_put_kbuf(req, issue_flags);
+               if (io_fill_cqe_req_aux(req,
+                                       issue_flags & IO_URING_F_COMPLETE_DEFER,
+                                       ret, cflags | IORING_CQE_F_MORE)) {
+                       if (issue_flags & IO_URING_F_MULTISHOT)
+                               return IOU_ISSUE_SKIP_COMPLETE;
+                       return -EAGAIN;
+               }
+       }
+
+       /*
+        * Either an error, or we've hit overflow posting the CQE. For any
+        * multishot request, hitting overflow will terminate it.
+        */
+       io_req_set_res(req, ret, cflags);
+       if (issue_flags & IO_URING_F_MULTISHOT)
+               return IOU_STOP_MULTISHOT;
+       return IOU_OK;
  }
  
  int io_write(struct io_kiocb *req, unsigned int issue_flags)
                kiocb_start_write(kiocb);
        kiocb->ki_flags |= IOCB_WRITE;
  
 -      /*
 -       * For non-polled IO, set IOCB_DIO_CALLER_COMP, stating that our handler
 -       * groks deferring the completion to task context. This isn't
 -       * necessary and useful for polled IO as that can always complete
 -       * directly.
 -       */
 -      if (!(kiocb->ki_flags & IOCB_HIPRI))
 -              kiocb->ki_flags |= IOCB_DIO_CALLER_COMP;
 -
        if (likely(req->file->f_op->write_iter))
                ret2 = call_write_iter(req->file, kiocb, &s->iter);
        else if (req->file->f_op->write)
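
To exercise IORING_OP_READ_MULTISHOT as added above, userspace arms a single
read on a pollable file and then reaps one CQE per arriving chunk for as long
as IORING_CQE_F_MORE stays set. A hedged sketch (assumes liburing ships
io_uring_prep_read_multishot(), and that buffer group BGID was populated
beforehand, e.g. as in the provide-buffers sketch above):

#include <liburing.h>
#include <stdio.h>

#define BGID 1

static void read_multishot_loop(struct io_uring *ring, int pipe_rd_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	/* len 0: each completion's size comes from the selected buffer */
	io_uring_prep_read_multishot(sqe, pipe_rd_fd, 0, 0, BGID);
	io_uring_submit(ring);

	for (;;) {
		if (io_uring_wait_cqe(ring, &cqe))
			break;
		if (cqe->res <= 0) {
			/* error or EOF terminates the multishot request */
			io_uring_cqe_seen(ring, cqe);
			break;
		}
		printf("got %d bytes in bid %u, more=%d\n", cqe->res,
		       cqe->flags >> IORING_CQE_BUFFER_SHIFT,
		       !!(cqe->flags & IORING_CQE_F_MORE));
		io_uring_cqe_seen(ring, cqe);
	}
}

As io_read_mshot() above shows, the request rearms itself after each
successful CQE and terminates on error, EOF, or CQ overflow.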