Merge tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux
author Linus Torvalds <[email protected]>
Wed, 1 Nov 2023 21:09:19 +0000 (11:09 -1000)
committer Linus Torvalds <[email protected]>
Wed, 1 Nov 2023 21:09:19 +0000 (11:09 -1000)
Pull io_uring updates from Jens Axboe:
 "This contains the core io_uring updates, of which there are not many,
  and adds support for using WAITID through io_uring and hence not
  needing to block on these kinds of events.

  Outside of that, tweaks to the legacy provided buffer handling and
  some cleanups related to cancelations for uring_cmd support"
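
As a userspace illustration of the new IORING_OP_WAITID support (a hedged
sketch, not part of this merge; it assumes a liburing release that ships
io_uring_prep_waitid() for this opcode), a child can be reaped through the
ring instead of blocking in waitid(2):

#define _GNU_SOURCE
#include <liburing.h>
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	siginfo_t si = { 0 };
	pid_t pid;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	pid = fork();
	if (pid == 0) {
		usleep(10000);
		_exit(42);
	}

	/* queue an asynchronous waitid; the CQE posts when the child exits */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_waitid(sqe, P_PID, pid, &si, WEXITED, 0);
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	printf("waitid res=%d, child exit status=%d\n", cqe->res, si.si_status);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}

The siginfo is filled in just as waitid(2) would fill it; only the wait
itself is queued on the ring rather than blocking the submitting task.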

* tag 'for-6.7/io_uring-2023-10-30' of git://git.kernel.dk/linux:
  io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups
  io_uring/kbuf: Use slab for struct io_buffer objects
  io_uring/kbuf: Allow the full buffer id space for provided buffers
  io_uring/kbuf: Fix check of BID wrapping in provided buffers
  io_uring/rsrc: cleanup io_pin_pages()
  io_uring: cancelable uring_cmd
  io_uring: retain top 8bits of uring_cmd flags for kernel internal use
  io_uring: add IORING_OP_WAITID support
  exit: add internal include file with helpers
  exit: add kernel_waitid_prepare() helper
  exit: move core of do_wait() into helper
  exit: abstract out should_wake helper for child_wait_callback()
  io_uring/rw: add support for IORING_OP_READ_MULTISHOT
  io_uring/rw: mark readv/writev as vectored in the opcode definition
  io_uring/rw: split io_read() into a helper

io_uring/io_uring.c
io_uring/io_uring.h
io_uring/kbuf.c
io_uring/rw.c

diff --combined io_uring/io_uring.c
index 8d1bc6cdfe712e75638ddf99c2f9ebac2d32d1f5,b9e1af5772f3b663bc115475f115a9a69eb00bad..36ae5ac2b070bf0201fd98bd79001f2850690a51
@@@ -92,6 -92,7 +92,7 @@@
  #include "cancel.h"
  #include "net.h"
  #include "notif.h"
+ #include "waitid.h"
  
  #include "timeout.h"
  #include "poll.h"
@@@ -338,7 -339,6 +339,6 @@@ static __cold struct io_ring_ctx *io_ri
        spin_lock_init(&ctx->completion_lock);
        spin_lock_init(&ctx->timeout_lock);
        INIT_WQ_LIST(&ctx->iopoll_list);
-       INIT_LIST_HEAD(&ctx->io_buffers_pages);
        INIT_LIST_HEAD(&ctx->io_buffers_comp);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
        INIT_LIST_HEAD(&ctx->tctx_list);
        ctx->submit_state.free_list.next = NULL;
        INIT_WQ_LIST(&ctx->locked_free_list);
+       INIT_HLIST_HEAD(&ctx->waitid_list);
        INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
        INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+       INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
        return ctx;
  err:
        kfree(ctx->cancel_table.hbs);
@@@ -2674,11 -2676,7 +2676,11 @@@ static void io_pages_free(struct page *
  
        if (!pages)
                return;
 +
        page_array = *pages;
 +      if (!page_array)
 +              return;
 +
        for (i = 0; i < npages; i++)
                unpin_user_page(page_array[i]);
        kvfree(page_array);
@@@ -2690,7 -2688,7 +2692,7 @@@ static void *__io_uaddr_map(struct pag
  {
        struct page **page_array;
        unsigned int nr_pages;
 -      int ret;
 +      int ret, i;
  
        *npages = 0;
  
         */
        if (page_array[0] != page_array[ret - 1])
                goto err;
 +
 +      /*
 +       * Can't support mapping user allocated ring memory on 32-bit archs
 +       * where it could potentially reside in highmem. Just fail those with
 +       * -EINVAL, just like we did on kernels that didn't support this
 +       * feature.
 +       */
 +      for (i = 0; i < nr_pages; i++) {
 +              if (PageHighMem(page_array[i])) {
 +                      ret = -EINVAL;
 +                      goto err;
 +              }
 +      }
 +
        *pages = page_array;
        *npages = nr_pages;
        return page_to_virt(page_array[0]);
@@@ -2762,9 -2746,7 +2764,9 @@@ static void io_rings_free(struct io_rin
                ctx->sq_sqes = NULL;
        } else {
                io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
 +              ctx->n_ring_pages = 0;
                io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
 +              ctx->n_sqe_pages = 0;
        }
  }
  
@@@ -3276,6 -3258,37 +3278,37 @@@ static __cold bool io_uring_try_cancel_
        return ret;
  }
  
+ static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+               struct task_struct *task, bool cancel_all)
+ {
+       struct hlist_node *tmp;
+       struct io_kiocb *req;
+       bool ret = false;
+
+       lockdep_assert_held(&ctx->uring_lock);
+
+       hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+                       hash_node) {
+               struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+                               struct io_uring_cmd);
+               struct file *file = req->file;
+
+               if (!cancel_all && req->task != task)
+                       continue;
+
+               if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+                       /* ->sqe isn't available if no async data */
+                       if (!req_has_async_data(req))
+                               cmd->sqe = NULL;
+                       file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
+                       ret = true;
+               }
+       }
+       io_submit_flush_completions(ctx);
+       return ret;
+ }
+
  static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                                struct task_struct *task,
                                                bool cancel_all)
        ret |= io_cancel_defer_files(ctx, task, cancel_all);
        mutex_lock(&ctx->uring_lock);
        ret |= io_poll_remove_all(ctx, task, cancel_all);
+       ret |= io_waitid_remove_all(ctx, task, cancel_all);
+       ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
        mutex_unlock(&ctx->uring_lock);
        ret |= io_kill_timeouts(ctx, task, cancel_all);
        if (task)
@@@ -4686,6 -4701,9 +4721,9 @@@ static int __init io_uring_init(void
  
        BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
  
+       /* top 8bits are for internal use */
+       BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+
        io_uring_optable_init();
  
        /*
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
                                offsetof(struct io_kiocb, cmd.data),
                                sizeof_field(struct io_kiocb, cmd.data), NULL);
+       io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
+                                         SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+                                         NULL);
  
  #ifdef CONFIG_SYSCTL
        register_sysctl_init("kernel", kernel_io_uring_disabled_table);
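
The cancelation pass added above only invokes ->uring_cmd(cmd, IO_URING_F_CANCEL)
for commands a driver has marked cancelable. A rough, hypothetical driver-side
sketch of that opt-in (mydrv_uring_cmd is illustrative and the completion path is
driver specific; it assumes the io_uring_cmd_mark_cancelable() and
io_uring_cmd_done() helpers from this series):

#include <linux/errno.h>
#include <linux/io_uring.h>

/* hypothetical ->uring_cmd() handler of a driver's char device */
static int mydrv_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	if (issue_flags & IO_URING_F_CANCEL) {
		/*
		 * io_uring is canceling (e.g. on ring exit): complete the
		 * command that was queued and marked cancelable earlier.
		 */
		io_uring_cmd_done(cmd, -ECANCELED, 0, issue_flags);
		return 0;
	}

	/* the driver would queue the command internally here, then opt in */
	io_uring_cmd_mark_cancelable(cmd, issue_flags);
	return -EIOCBQUEUED;
}

On ring teardown, io_uring_try_cancel_uring_cmd() above walks
ctx->cancelable_uring_cmd and calls this handler with IO_URING_F_CANCEL set.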
diff --combined io_uring/io_uring.h
index 0bc145614a6e66ae2d2ec0933fa7165e11f74229,2ff719ae1b57774956221302981e8f3ff6952b5b..dc6d779b452b9b0e233a74c7c49e0d9760840b20
@@@ -86,33 -86,20 +86,33 @@@ bool __io_alloc_req_refill(struct io_ri
  bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
                        bool cancel_all);
  
 -#define io_lockdep_assert_cq_locked(ctx)                              \
 -      do {                                                            \
 -              lockdep_assert(in_task());                              \
 -                                                                      \
 -              if (ctx->flags & IORING_SETUP_IOPOLL) {                 \
 -                      lockdep_assert_held(&ctx->uring_lock);          \
 -              } else if (!ctx->task_complete) {                       \
 -                      lockdep_assert_held(&ctx->completion_lock);     \
 -              } else if (ctx->submitter_task->flags & PF_EXITING) {   \
 -                      lockdep_assert(current_work());                 \
 -              } else {                                                \
 -                      lockdep_assert(current == ctx->submitter_task); \
 -              }                                                       \
 -      } while (0)
 +#if defined(CONFIG_PROVE_LOCKING)
 +static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 +{
 +      lockdep_assert(in_task());
 +
 +      if (ctx->flags & IORING_SETUP_IOPOLL) {
 +              lockdep_assert_held(&ctx->uring_lock);
 +      } else if (!ctx->task_complete) {
 +              lockdep_assert_held(&ctx->completion_lock);
 +      } else if (ctx->submitter_task) {
 +              /*
 +               * ->submitter_task may be NULL and we can still post a CQE,
 +               * if the ring has been setup with IORING_SETUP_R_DISABLED.
 +               * Not from an SQE, as those cannot be submitted, but via
 +               * updating tagged resources.
 +               */
 +              if (ctx->submitter_task->flags & PF_EXITING)
 +                      lockdep_assert(current_work());
 +              else
 +                      lockdep_assert(current == ctx->submitter_task);
 +      }
 +}
 +#else
 +static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 +{
 +}
 +#endif
  
  static inline void io_req_task_work_add(struct io_kiocb *req)
  {
@@@ -343,6 -330,7 +343,7 @@@ static inline bool io_req_cache_empty(s
  }
  
  extern struct kmem_cache *req_cachep;
+ extern struct kmem_cache *io_buf_cachep;
  
  static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
  {
diff --combined io_uring/kbuf.c
index 9123138aa9f48b1acc216f08b70870e095523632,d5a04467666f7bdae4900a1c776bf9685725d03f..fea06810b43dbb19a62624908a81d71b9b5bf1e2
  
  #define BGID_ARRAY    64
  
+ /* BIDs are addressed by a 16-bit field in a CQE */
+ #define MAX_BIDS_PER_BGID (1 << 16)
+
+ struct kmem_cache *io_buf_cachep;
+
  struct io_provide_buf {
        struct file                     *file;
        __u64                           addr;
        __u32                           len;
        __u32                           bgid;
-       __u16                           nbufs;
+       __u32                           nbufs;
        __u16                           bid;
  };
  
@@@ -255,6 -260,8 +260,8 @@@ static int __io_remove_buffers(struct i
  void io_destroy_buffers(struct io_ring_ctx *ctx)
  {
        struct io_buffer_list *bl;
+       struct list_head *item, *tmp;
+       struct io_buffer *buf;
        unsigned long index;
        int i;
  
                kfree(bl);
        }
  
-       while (!list_empty(&ctx->io_buffers_pages)) {
-               struct page *page;
-               page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
-               list_del_init(&page->lru);
-               __free_page(page);
+       list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
+               buf = list_entry(item, struct io_buffer, list);
+               kmem_cache_free(io_buf_cachep, buf);
        }
  }
  
@@@ -289,7 -293,7 +293,7 @@@ int io_remove_buffers_prep(struct io_ki
                return -EINVAL;
  
        tmp = READ_ONCE(sqe->fd);
-       if (!tmp || tmp > USHRT_MAX)
+       if (!tmp || tmp > MAX_BIDS_PER_BGID)
                return -EINVAL;
  
        memset(p, 0, sizeof(*p));
@@@ -332,7 -336,7 +336,7 @@@ int io_provide_buffers_prep(struct io_k
                return -EINVAL;
  
        tmp = READ_ONCE(sqe->fd);
-       if (!tmp || tmp > USHRT_MAX)
+       if (!tmp || tmp > MAX_BIDS_PER_BGID)
                return -E2BIG;
        p->nbufs = tmp;
        p->addr = READ_ONCE(sqe->addr);
        tmp = READ_ONCE(sqe->off);
        if (tmp > USHRT_MAX)
                return -E2BIG;
-       if (tmp + p->nbufs >= USHRT_MAX)
+       if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
                return -EINVAL;
        p->bid = tmp;
        return 0;
  }
  
+ #define IO_BUFFER_ALLOC_BATCH 64
+
  static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
  {
-       struct io_buffer *buf;
-       struct page *page;
-       int bufs_in_page;
+       struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
+       int allocated;
  
        /*
         * Completions that don't happen inline (eg not under uring_lock) will
  
        /*
         * No free buffers and no completion entries either. Allocate a new
-        * page worth of buffer entries and add those to our freelist.
+        * batch of buffer entries and add those to our freelist.
         */
-       page = alloc_page(GFP_KERNEL_ACCOUNT);
-       if (!page)
-               return -ENOMEM;
-       list_add(&page->lru, &ctx->io_buffers_pages);
  
-       buf = page_address(page);
-       bufs_in_page = PAGE_SIZE / sizeof(*buf);
-       while (bufs_in_page) {
-               list_add_tail(&buf->list, &ctx->io_buffers_cache);
-               buf++;
-               bufs_in_page--;
+       allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
+                                         ARRAY_SIZE(bufs), (void **) bufs);
+       if (unlikely(!allocated)) {
+               /*
+                * Bulk alloc is all-or-nothing. If we fail to get a batch,
+                * retry single alloc to be on the safe side.
+                */
+               bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
+               if (!bufs[0])
+                       return -ENOMEM;
+               allocated = 1;
        }
  
+       while (allocated)
+               list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
        return 0;
  }
  
@@@ -477,7 -485,7 +485,7 @@@ static int io_pin_pbuf_ring(struct io_u
  {
        struct io_uring_buf_ring *br;
        struct page **pages;
 -      int nr_pages;
 +      int i, nr_pages;
  
        pages = io_pin_pages(reg->ring_addr,
                             flex_array_size(br, bufs, reg->ring_entries),
        if (IS_ERR(pages))
                return PTR_ERR(pages);
  
 +      /*
 +       * Apparently some 32-bit boxes (ARM) will return highmem pages,
 +       * which then need to be mapped. We could support that, but it'd
 +       * complicate the code and slowdown the common cases quite a bit.
 +       * So just error out, returning -EINVAL just like we did on kernels
 +       * that didn't support mapped buffer rings.
 +       */
 +      for (i = 0; i < nr_pages; i++)
 +              if (PageHighMem(pages[i]))
 +                      goto error_unpin;
 +
        br = page_address(pages[0]);
  #ifdef SHM_COLOUR
        /*
         * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
         * this transparently.
         */
 -      if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
 -              int i;
 -
 -              for (i = 0; i < nr_pages; i++)
 -                      unpin_user_page(pages[i]);
 -              return -EINVAL;
 -      }
 +      if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
 +              goto error_unpin;
  #endif
        bl->buf_pages = pages;
        bl->buf_nr_pages = nr_pages;
        bl->is_mapped = 1;
        bl->is_mmap = 0;
        return 0;
 +error_unpin:
 +      for (i = 0; i < nr_pages; i++)
 +              unpin_user_page(pages[i]);
 +      kvfree(pages);
 +      return -EINVAL;
  }
  
  static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
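
The kbuf.c changes above widen nbufs and switch the range checks to
MAX_BIDS_PER_BGID, so a single IORING_OP_PROVIDE_BUFFERS request can now cover
the full 16-bit BID space of a group. A hedged userspace sketch (assumes
liburing's io_uring_prep_provide_buffers() helper; the group id and buffer size
are arbitrary, and base must point to NR_BUFS * BUF_SIZE bytes):

#include <liburing.h>

#define NR_BUFS   65536   /* full BID space: bids 0..65535 */
#define BUF_SIZE  4096
#define BGID      1

static int provide_full_group(struct io_uring *ring, void *base)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	/* hand NR_BUFS buffers of BUF_SIZE each to group BGID, starting at bid 0 */
	io_uring_prep_provide_buffers(sqe, base, BUF_SIZE, NR_BUFS, BGID, 0);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe(ring, &cqe);
	if (!ret) {
		ret = cqe->res;   /* 0 on success, -errno on failure */
		io_uring_cqe_seen(ring, cqe);
	}
	return ret;
}

Before this series the USHRT_MAX checks and the 16-bit nbufs field prevented
covering the full space in one request.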
diff --combined io_uring/rw.c
index 8f68d5ad4564fe7b971f9f0c977276179a713faa,ec0cc38ea6824a2a8b6f29841cd08ead84137e6a..3398e1d944c2615f557162bbe0cca7b718a4c1dc
@@@ -123,6 -123,22 +123,22 @@@ int io_prep_rw(struct io_kiocb *req, co
        return 0;
  }
  
+ /*
+  * Multishot read is prepared just like a normal read/write request, only
+  * difference is that we set the MULTISHOT flag.
+  */
+ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+ {
+       int ret;
+
+       ret = io_prep_rw(req, sqe);
+       if (unlikely(ret))
+               return ret;
+
+       req->flags |= REQ_F_APOLL_MULTISHOT;
+       return 0;
+ }
+
  void io_readv_writev_cleanup(struct io_kiocb *req)
  {
        struct io_async_rw *io = req->async_data;
@@@ -339,7 -355,7 +355,7 @@@ static int kiocb_done(struct io_kiocb *
        struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
        unsigned final_ret = io_fixup_rw_res(req, ret);
  
 -      if (req->flags & REQ_F_CUR_POS)
 +      if (ret >= 0 && req->flags & REQ_F_CUR_POS)
                req->file->f_pos = rw->kiocb.ki_pos;
        if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
                if (!__io_complete_rw_common(req, ret)) {
@@@ -388,8 -404,7 +404,7 @@@ static struct iovec *__io_import_iovec(
        buf = u64_to_user_ptr(rw->addr);
        sqe_len = rw->len;
  
-       if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
-           (req->flags & REQ_F_BUFFER_SELECT)) {
+       if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
                if (io_do_buffer_select(req)) {
                        buf = io_buffer_select(req, &sqe_len, issue_flags);
                        if (!buf)
@@@ -708,7 -723,7 +723,7 @@@ static int io_rw_init_file(struct io_ki
        return 0;
  }
  
- int io_read(struct io_kiocb *req, unsigned int issue_flags)
+ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
  {
        struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
        struct io_rw_state __s, *s = &__s;
  
        if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                req->flags &= ~REQ_F_REISSUE;
-               /* if we can poll, just do that */
-               if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+               /*
+                * If we can poll, just do that. For a vectored read, we'll
+                * need to copy state first.
+                */
+               if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
                        return -EAGAIN;
                /* IOPOLL retry should happen for io-wq threads */
                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@@ -853,7 -871,69 +871,69 @@@ done
        /* it's faster to check here then delegate to kfree */
        if (iovec)
                kfree(iovec);
-       return kiocb_done(req, ret, issue_flags);
+       return ret;
+ }
+
+ int io_read(struct io_kiocb *req, unsigned int issue_flags)
+ {
+       int ret;
+
+       ret = __io_read(req, issue_flags);
+       if (ret >= 0)
+               return kiocb_done(req, ret, issue_flags);
+
+       return ret;
+ }
+
+ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
+ {
+       unsigned int cflags = 0;
+       int ret;
+
+       /*
+        * Multishot MUST be used on a pollable file
+        */
+       if (!file_can_poll(req->file))
+               return -EBADFD;
+
+       ret = __io_read(req, issue_flags);
+
+       /*
+        * If we get -EAGAIN, recycle our buffer and just let normal poll
+        * handling arm it.
+        */
+       if (ret == -EAGAIN) {
+               io_kbuf_recycle(req, issue_flags);
+               return -EAGAIN;
+       }
+
+       /*
+        * Any successful return value will keep the multishot read armed.
+        */
+       if (ret > 0) {
+               /*
+                * Put our buffer and post a CQE. If we fail to post a CQE, then
+                * jump to the termination path. This request is then done.
+                */
+               cflags = io_put_kbuf(req, issue_flags);
+               if (io_fill_cqe_req_aux(req,
+                                       issue_flags & IO_URING_F_COMPLETE_DEFER,
+                                       ret, cflags | IORING_CQE_F_MORE)) {
+                       if (issue_flags & IO_URING_F_MULTISHOT)
+                               return IOU_ISSUE_SKIP_COMPLETE;
+                       return -EAGAIN;
+               }
+       }
+
+       /*
+        * Either an error, or we've hit overflow posting the CQE. For any
+        * multishot request, hitting overflow will terminate it.
+        */
+       io_req_set_res(req, ret, cflags);
+       if (issue_flags & IO_URING_F_MULTISHOT)
+               return IOU_STOP_MULTISHOT;
+       return IOU_OK;
  }
  
  int io_write(struct io_kiocb *req, unsigned int issue_flags)
                kiocb_start_write(kiocb);
        kiocb->ki_flags |= IOCB_WRITE;
  
 -      /*
 -       * For non-polled IO, set IOCB_DIO_CALLER_COMP, stating that our handler
 -       * groks deferring the completion to task context. This isn't
 -       * necessary and useful for polled IO as that can always complete
 -       * directly.
 -       */
 -      if (!(kiocb->ki_flags & IOCB_HIPRI))
 -              kiocb->ki_flags |= IOCB_DIO_CALLER_COMP;
 -
        if (likely(req->file->f_op->write_iter))
                ret2 = call_write_iter(req->file, kiocb, &s->iter);
        else if (req->file->f_op->write)
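
To exercise IORING_OP_READ_MULTISHOT as added above, userspace arms a single
read on a pollable file and then reaps one CQE per arriving chunk for as long
as IORING_CQE_F_MORE stays set. A hedged sketch (assumes liburing ships
io_uring_prep_read_multishot(), and that buffer group BGID was populated
beforehand, e.g. as in the provide-buffers sketch above):

#include <liburing.h>
#include <stdio.h>

#define BGID 1

static void read_multishot_loop(struct io_uring *ring, int pipe_rd_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	/* len 0: each completion's size comes from the selected buffer */
	io_uring_prep_read_multishot(sqe, pipe_rd_fd, 0, 0, BGID);
	io_uring_submit(ring);

	for (;;) {
		if (io_uring_wait_cqe(ring, &cqe))
			break;
		if (cqe->res <= 0) {
			/* error or EOF terminates the multishot request */
			io_uring_cqe_seen(ring, cqe);
			break;
		}
		printf("got %d bytes in bid %u, more=%d\n", cqe->res,
		       cqe->flags >> IORING_CQE_BUFFER_SHIFT,
		       !!(cqe->flags & IORING_CQE_F_MORE));
		io_uring_cqe_seen(ring, cqe);
	}
}

As io_read_mshot() above shows, the request rearms itself after each
successful CQE and terminates on error, EOF, or CQ overflow.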