[linux.git] / io_uring / kbuf.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#include "memmap.h"

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

struct kmem_cache *io_buf_cachep;

struct io_provide_buf {
        struct file     *file;
        __u64           addr;
        __u32           len;
        __u32           bgid;
        __u32           nbufs;
        __u16           bid;
};

static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
                                                        unsigned int bgid)
{
        lockdep_assert_held(&ctx->uring_lock);

        return xa_load(&ctx->io_bl_xa, bgid);
}

static int io_buffer_add_list(struct io_ring_ctx *ctx,
                              struct io_buffer_list *bl, unsigned int bgid)
{
        /*
         * Store buffer group ID and finally mark the list as visible.
         * The normal lookup doesn't care about the visibility as we're
         * always under the ->uring_lock, but the RCU lookup from mmap does.
         */
        bl->bgid = bgid;
        atomic_set(&bl->refs, 1);
        return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}

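/*
 * Recycle a legacy (classic) provided buffer: put it back on its group's
 * list so it can be selected again, and clear the buffer-selected state
 * from the request.
 */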
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        struct io_buffer *buf;

        io_ring_submit_lock(ctx, issue_flags);

        buf = req->kbuf;
        bl = io_buffer_get_list(ctx, buf->bgid);
        list_add(&buf->list, &bl->buf_list);
        req->flags &= ~REQ_F_BUFFER_SELECTED;
        req->buf_index = buf->bgid;

        io_ring_submit_unlock(ctx, issue_flags);
        return true;
}

void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
        /*
         * We can add this buffer back to two lists:
         *
         * 1) The io_buffers_cache list. This one is protected by the
         *    ctx->uring_lock. If we already hold this lock, add back to this
         *    list as we can grab it from issue as well.
         * 2) The io_buffers_comp list. This one is protected by the
         *    ctx->completion_lock.
         *
         * We migrate buffers from the comp_list to the issue cache list
         * when we need one.
         */
        if (issue_flags & IO_URING_F_UNLOCKED) {
                struct io_ring_ctx *ctx = req->ctx;

                spin_lock(&ctx->completion_lock);
                __io_put_kbuf_list(req, &ctx->io_buffers_comp);
                spin_unlock(&ctx->completion_lock);
        } else {
                lockdep_assert_held(&req->ctx->uring_lock);

                __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
        }
}

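/*
 * Pick the first buffer from a legacy provided-buffer list. The buffer is
 * removed from the list, *len is capped to the buffer size, and
 * REQ_F_BL_EMPTY is set if this was the last entry in the group.
 */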
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
                                              struct io_buffer_list *bl)
{
        if (!list_empty(&bl->buf_list)) {
                struct io_buffer *kbuf;

                kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
                list_del(&kbuf->list);
                if (*len == 0 || *len > kbuf->len)
                        *len = kbuf->len;
                if (list_empty(&bl->buf_list))
                        req->flags |= REQ_F_BL_EMPTY;
                req->flags |= REQ_F_BUFFER_SELECTED;
                req->kbuf = kbuf;
                req->buf_index = kbuf->bid;
                return u64_to_user_ptr(kbuf->addr);
        }
        return NULL;
}

static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
                                      struct io_buffer_list *bl,
                                      struct iovec *iov)
{
        void __user *buf;

        buf = io_provided_buffer_select(req, len, bl);
        if (unlikely(!buf))
                return -ENOBUFS;

        iov[0].iov_base = buf;
        iov[0].iov_len = *len;
        return 0;
}

static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
                                                __u16 head, __u16 mask)
{
        return &br->bufs[head & mask];
}

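/*
 * Select the next buffer from a mapped provided-buffer ring. When running
 * locked against a pollable file, the head update is deferred
 * (REQ_F_BUFFERS_COMMIT) so the buffer can still be recycled; if we are
 * unlocked or the file cannot be polled, the buffer is consumed right here.
 */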
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
                                          struct io_buffer_list *bl,
                                          unsigned int issue_flags)
{
        struct io_uring_buf_ring *br = bl->buf_ring;
        __u16 tail, head = bl->head;
        struct io_uring_buf *buf;

        tail = smp_load_acquire(&br->tail);
        if (unlikely(tail == head))
                return NULL;

        if (head + 1 == tail)
                req->flags |= REQ_F_BL_EMPTY;

        buf = io_ring_head_to_buf(br, head, bl->mask);
        if (*len == 0 || *len > buf->len)
                *len = buf->len;
        req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
        req->buf_list = bl;
        req->buf_index = buf->bid;

        if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
                /*
                 * If we came in unlocked, we have no choice but to consume the
                 * buffer here, otherwise nothing ensures that the buffer won't
                 * get used by others. This does mean it'll be pinned until the
                 * IO completes, coming in unlocked means we're being called from
                 * io-wq context and there may be further retries in async hybrid
                 * mode. For the locked case, the caller must call commit when
                 * the transfer completes (or if we get -EAGAIN and must poll or
                 * retry).
                 */
                req->flags &= ~REQ_F_BUFFERS_COMMIT;
                req->buf_list = NULL;
                bl->head++;
        }
        return u64_to_user_ptr(buf->addr);
}

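/*
 * Select a single buffer for a request, from the buffer group indicated by
 * req->buf_index. Dispatches to the ring or legacy variant depending on how
 * the group was registered.
 */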
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
                              unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        void __user *ret = NULL;

        io_ring_submit_lock(req->ctx, issue_flags);

        bl = io_buffer_get_list(ctx, req->buf_index);
        if (likely(bl)) {
                if (bl->is_buf_ring)
                        ret = io_ring_buffer_select(req, len, bl, issue_flags);
                else
                        ret = io_provided_buffer_select(req, len, bl);
        }
        io_ring_submit_unlock(req->ctx, issue_flags);
        return ret;
}

/* cap it at a reasonable 256, will be one page even for 4K */
#define PEEK_MAX_IMPORT         256

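/*
 * Peek at up to nr_iovs buffers from a buffer ring and map them into
 * arg->iovs without moving the ring head. The iovec array may be grown
 * (KBUF_MODE_EXPAND) when more than nr_iovs buffers are needed to cover
 * arg->max_len. Returns the number of iovecs filled.
 */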
static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
                                struct io_buffer_list *bl)
{
        struct io_uring_buf_ring *br = bl->buf_ring;
        struct iovec *iov = arg->iovs;
        int nr_iovs = arg->nr_iovs;
        __u16 nr_avail, tail, head;
        struct io_uring_buf *buf;

        tail = smp_load_acquire(&br->tail);
        head = bl->head;
        nr_avail = min_t(__u16, tail - head, UIO_MAXIOV);
        if (unlikely(!nr_avail))
                return -ENOBUFS;

        buf = io_ring_head_to_buf(br, head, bl->mask);
        if (arg->max_len) {
                int needed;

                needed = (arg->max_len + buf->len - 1) / buf->len;
                needed = min(needed, PEEK_MAX_IMPORT);
                if (nr_avail > needed)
                        nr_avail = needed;
        }

        /*
         * only alloc a bigger array if we know we have data to map, eg not
         * a speculative peek operation.
         */
        if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
                iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
                if (unlikely(!iov))
                        return -ENOMEM;
                if (arg->mode & KBUF_MODE_FREE)
                        kfree(arg->iovs);
                arg->iovs = iov;
                nr_iovs = nr_avail;
        } else if (nr_avail < nr_iovs) {
                nr_iovs = nr_avail;
        }

        /* set it to max, if not set, so we can use it unconditionally */
        if (!arg->max_len)
                arg->max_len = INT_MAX;

        req->buf_index = buf->bid;
        do {
                /* truncate end piece, if needed */
                if (buf->len > arg->max_len)
                        buf->len = arg->max_len;

                iov->iov_base = u64_to_user_ptr(buf->addr);
                iov->iov_len = buf->len;
                iov++;

                arg->out_len += buf->len;
                arg->max_len -= buf->len;
                if (!arg->max_len)
                        break;

                buf = io_ring_head_to_buf(br, ++head, bl->mask);
        } while (--nr_iovs);

        if (head == tail)
                req->flags |= REQ_F_BL_EMPTY;

        req->flags |= REQ_F_BUFFER_RING;
        req->buf_list = bl;
        return iov - arg->iovs;
}

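/*
 * Select one or more buffers for a request. For buffer rings the selected
 * buffers are committed immediately (the ring head is advanced) and marked
 * non-recyclable; legacy groups fall back to a single-buffer selection.
 */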
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
                      unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = -ENOENT;

        io_ring_submit_lock(ctx, issue_flags);
        bl = io_buffer_get_list(ctx, req->buf_index);
        if (unlikely(!bl))
                goto out_unlock;

        if (bl->is_buf_ring) {
                ret = io_ring_buffers_peek(req, arg, bl);
                /*
                 * Don't recycle these buffers if we need to go through poll.
                 * Nobody else can use them anyway, and holding on to provided
                 * buffers for a send/write operation would happen on the app
                 * side anyway with normal buffers. Besides, we already
                 * committed them, they cannot be put back in the queue.
                 */
                if (ret > 0) {
                        req->flags |= REQ_F_BL_NO_RECYCLE;
                        req->buf_list->head += ret;
                }
        } else {
                ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
        }
out_unlock:
        io_ring_submit_unlock(ctx, issue_flags);
        return ret;
}

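/*
 * Like io_buffers_select(), but the caller must already hold ->uring_lock
 * and the ring head is not advanced - the buffers are only flagged for a
 * later commit.
 */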
int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret;

        lockdep_assert_held(&ctx->uring_lock);

        bl = io_buffer_get_list(ctx, req->buf_index);
        if (unlikely(!bl))
                return -ENOENT;

        if (bl->is_buf_ring) {
                ret = io_ring_buffers_peek(req, arg, bl);
                if (ret > 0)
                        req->flags |= REQ_F_BUFFERS_COMMIT;
                return ret;
        }

        /* don't support multiple buffer selections for legacy */
        return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
}

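/*
 * Remove up to nbufs buffers from a group. For a buffer ring this tears
 * down the ring mapping; for legacy groups the buffers are moved back to
 * the ctx-wide cache. Returns the number of buffers removed.
 */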
static int __io_remove_buffers(struct io_ring_ctx *ctx,
                               struct io_buffer_list *bl, unsigned nbufs)
{
        unsigned i = 0;

        /* shouldn't happen */
        if (!nbufs)
                return 0;

        if (bl->is_buf_ring) {
                i = bl->buf_ring->tail - bl->head;
                if (bl->buf_nr_pages) {
                        int j;

                        if (!bl->is_mmap) {
                                for (j = 0; j < bl->buf_nr_pages; j++)
                                        unpin_user_page(bl->buf_pages[j]);
                        }
                        io_pages_unmap(bl->buf_ring, &bl->buf_pages,
                                       &bl->buf_nr_pages, bl->is_mmap);
                        bl->is_mmap = 0;
                }
                /* make sure it's seen as empty */
                INIT_LIST_HEAD(&bl->buf_list);
                bl->is_buf_ring = 0;
                return i;
        }

        /* protects io_buffers_cache */
        lockdep_assert_held(&ctx->uring_lock);

        while (!list_empty(&bl->buf_list)) {
                struct io_buffer *nxt;

                nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
                list_move(&nxt->list, &ctx->io_buffers_cache);
                if (++i == nbufs)
                        return i;
                cond_resched();
        }

        return i;
}

void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
        if (atomic_dec_and_test(&bl->refs)) {
                __io_remove_buffers(ctx, bl, -1U);
                kfree_rcu(bl, rcu);
        }
}

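/* Drop every buffer group and free all cached buffer entries for this ring. */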
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
        struct io_buffer_list *bl;
        struct list_head *item, *tmp;
        struct io_buffer *buf;
        unsigned long index;

        xa_for_each(&ctx->io_bl_xa, index, bl) {
                xa_erase(&ctx->io_bl_xa, bl->bgid);
                io_put_bl(ctx, bl);
        }

        /*
         * Move deferred locked entries to cache before pruning
         */
        spin_lock(&ctx->completion_lock);
        if (!list_empty(&ctx->io_buffers_comp))
                list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
        spin_unlock(&ctx->completion_lock);

        list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
                buf = list_entry(item, struct io_buffer, list);
                kmem_cache_free(io_buf_cachep, buf);
        }
}

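/*
 * Prepare IORING_OP_REMOVE_BUFFERS: the number of buffers to remove is
 * carried in sqe->fd, the buffer group ID in sqe->buf_group.
 */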
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        u64 tmp;

        if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
            sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > MAX_BIDS_PER_BGID)
                return -EINVAL;

        memset(p, 0, sizeof(*p));
        p->nbufs = tmp;
        p->bgid = READ_ONCE(sqe->buf_group);
        return 0;
}

int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);

        ret = -ENOENT;
        bl = io_buffer_get_list(ctx, p->bgid);
        if (bl) {
                ret = -EINVAL;
                /* can't use provide/remove buffers command on mapped buffers */
                if (!bl->is_buf_ring)
                        ret = __io_remove_buffers(ctx, bl, p->nbufs);
        }
        io_ring_submit_unlock(ctx, issue_flags);
        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

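/*
 * Prepare IORING_OP_PROVIDE_BUFFERS: sqe->fd holds the buffer count,
 * sqe->addr/len describe the user memory, sqe->buf_group the group ID and
 * sqe->off the starting buffer ID.
 */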
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        unsigned long size, tmp_check;
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        u64 tmp;

        if (sqe->rw_flags || sqe->splice_fd_in)
                return -EINVAL;

        tmp = READ_ONCE(sqe->fd);
        if (!tmp || tmp > MAX_BIDS_PER_BGID)
                return -E2BIG;
        p->nbufs = tmp;
        p->addr = READ_ONCE(sqe->addr);
        p->len = READ_ONCE(sqe->len);

        if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
                               &size))
                return -EOVERFLOW;
        if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
                return -EOVERFLOW;

        size = (unsigned long)p->len * p->nbufs;
        if (!access_ok(u64_to_user_ptr(p->addr), size))
                return -EFAULT;

        p->bgid = READ_ONCE(sqe->buf_group);
        tmp = READ_ONCE(sqe->off);
        if (tmp > USHRT_MAX)
                return -E2BIG;
        if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
                return -EINVAL;
        p->bid = tmp;
        return 0;
}

#define IO_BUFFER_ALLOC_BATCH 64

static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
        struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
        int allocated;

        /*
         * Completions that don't happen inline (eg not under uring_lock) will
         * add to ->io_buffers_comp. If we don't have any free buffers, check
         * the completion list and splice those entries first.
         */
        if (!list_empty_careful(&ctx->io_buffers_comp)) {
                spin_lock(&ctx->completion_lock);
                if (!list_empty(&ctx->io_buffers_comp)) {
                        list_splice_init(&ctx->io_buffers_comp,
                                         &ctx->io_buffers_cache);
                        spin_unlock(&ctx->completion_lock);
                        return 0;
                }
                spin_unlock(&ctx->completion_lock);
        }

        /*
         * No free buffers and no completion entries either. Allocate a new
         * batch of buffer entries and add those to our freelist.
         */

        allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
                                          ARRAY_SIZE(bufs), (void **) bufs);
        if (unlikely(!allocated)) {
                /*
                 * Bulk alloc is all-or-nothing. If we fail to get a batch,
                 * retry single alloc to be on the safe side.
                 */
                bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
                if (!bufs[0])
                        return -ENOMEM;
                allocated = 1;
        }

        while (allocated)
                list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);

        return 0;
}

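/*
 * Carve the user-provided region into nbufs buffers of pbuf->len bytes each
 * and append them to the group list, assigning consecutive buffer IDs.
 */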
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
                          struct io_buffer_list *bl)
{
        struct io_buffer *buf;
        u64 addr = pbuf->addr;
        int i, bid = pbuf->bid;

        for (i = 0; i < pbuf->nbufs; i++) {
                if (list_empty(&ctx->io_buffers_cache) &&
                    io_refill_buffer_cache(ctx))
                        break;
                buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
                                       list);
                list_move_tail(&buf->list, &bl->buf_list);
                buf->addr = addr;
                buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
                buf->bid = bid;
                buf->bgid = pbuf->bgid;
                addr += pbuf->len;
                bid++;
                cond_resched();
        }

        return i ? 0 : -ENOMEM;
}

int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_buffer_list *bl;
        int ret = 0;

        io_ring_submit_lock(ctx, issue_flags);

        bl = io_buffer_get_list(ctx, p->bgid);
        if (unlikely(!bl)) {
                bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
                if (!bl) {
                        ret = -ENOMEM;
                        goto err;
                }
                INIT_LIST_HEAD(&bl->buf_list);
                ret = io_buffer_add_list(ctx, bl, p->bgid);
                if (ret) {
                        /*
                         * Doesn't need rcu free as it was never visible, but
                         * let's keep it consistent throughout.
                         */
                        kfree_rcu(bl, rcu);
                        goto err;
                }
        }
        /* can't add buffers via this command for a mapped buffer ring */
        if (bl->is_buf_ring) {
                ret = -EINVAL;
                goto err;
        }

        ret = io_add_buffers(ctx, p, bl);
err:
        io_ring_submit_unlock(ctx, issue_flags);

        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

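/*
 * Map a user-allocated buffer ring: pin the user pages backing the ring and
 * vmap them so the kernel has a contiguous view. On platforms with cache
 * aliasing constraints (SHM_COLOUR) the kernel and user mappings must also
 * line up, otherwise registration fails.
 */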
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
                            struct io_buffer_list *bl)
{
        struct io_uring_buf_ring *br = NULL;
        struct page **pages;
        int nr_pages, ret;

        pages = io_pin_pages(reg->ring_addr,
                             flex_array_size(br, bufs, reg->ring_entries),
                             &nr_pages);
        if (IS_ERR(pages))
                return PTR_ERR(pages);

        br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
        if (!br) {
                ret = -ENOMEM;
                goto error_unpin;
        }

#ifdef SHM_COLOUR
        /*
         * On platforms that have specific aliasing requirements, SHM_COLOUR
         * is set and we must guarantee that the kernel and user side align
         * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
         * the application mmap's the provided ring buffer. Fail the request
         * if we, by chance, don't end up with aligned addresses. The app
         * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
         * this transparently.
         */
        if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
                ret = -EINVAL;
                goto error_unpin;
        }
#endif
        bl->buf_pages = pages;
        bl->buf_nr_pages = nr_pages;
        bl->buf_ring = br;
        bl->is_buf_ring = 1;
        bl->is_mmap = 0;
        return 0;
error_unpin:
        unpin_user_pages(pages, nr_pages);
        kvfree(pages);
        vunmap(br);
        return ret;
}

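/*
 * Allocate the buffer ring in the kernel (IOU_PBUF_RING_MMAP); the
 * application maps it afterwards with mmap() on the io_uring fd
 * (see io_pbuf_mmap()).
 */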
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
                              struct io_uring_buf_reg *reg,
                              struct io_buffer_list *bl)
{
        size_t ring_size;

        ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);

        bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
        if (IS_ERR(bl->buf_ring)) {
                bl->buf_ring = NULL;
                return -ENOMEM;
        }

        bl->is_buf_ring = 1;
        bl->is_mmap = 1;
        return 0;
}

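/*
 * Register a provided buffer ring (IORING_REGISTER_PBUF_RING). The ring is
 * either pinned from user memory or, with IOU_PBUF_RING_MMAP set, allocated
 * by the kernel. ring_entries must be a power of two below 64K, and the
 * group must not already have a ring or classic provided buffers.
 */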
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl, *free_bl = NULL;
        int ret;

        lockdep_assert_held(&ctx->uring_lock);

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;

        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
        if (reg.flags & ~IOU_PBUF_RING_MMAP)
                return -EINVAL;
        if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
                if (!reg.ring_addr)
                        return -EFAULT;
                if (reg.ring_addr & ~PAGE_MASK)
                        return -EINVAL;
        } else {
                if (reg.ring_addr)
                        return -EINVAL;
        }

        if (!is_power_of_2(reg.ring_entries))
                return -EINVAL;

        /* cannot disambiguate full vs empty due to head/tail size */
        if (reg.ring_entries >= 65536)
                return -EINVAL;

        bl = io_buffer_get_list(ctx, reg.bgid);
        if (bl) {
                /* if mapped buffer ring OR classic exists, don't allow */
                if (bl->is_buf_ring || !list_empty(&bl->buf_list))
                        return -EEXIST;
        } else {
                free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
                if (!bl)
                        return -ENOMEM;
        }

        if (!(reg.flags & IOU_PBUF_RING_MMAP))
                ret = io_pin_pbuf_ring(&reg, bl);
        else
                ret = io_alloc_pbuf_ring(ctx, &reg, bl);

        if (!ret) {
                bl->nr_entries = reg.ring_entries;
                bl->mask = reg.ring_entries - 1;

                io_buffer_add_list(ctx, bl, reg.bgid);
                return 0;
        }

        kfree_rcu(free_bl, rcu);
        return ret;
}

int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_reg reg;
        struct io_buffer_list *bl;

        lockdep_assert_held(&ctx->uring_lock);

        if (copy_from_user(&reg, arg, sizeof(reg)))
                return -EFAULT;
        if (reg.resv[0] || reg.resv[1] || reg.resv[2])
                return -EINVAL;
        if (reg.flags)
                return -EINVAL;

        bl = io_buffer_get_list(ctx, reg.bgid);
        if (!bl)
                return -ENOENT;
        if (!bl->is_buf_ring)
                return -EINVAL;

        xa_erase(&ctx->io_bl_xa, bl->bgid);
        io_put_bl(ctx, bl);
        return 0;
}

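/*
 * IORING_REGISTER_PBUF_STATUS: report the current head of a registered
 * buffer ring back to the application.
 */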
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
        struct io_uring_buf_status buf_status;
        struct io_buffer_list *bl;
        int i;

        if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
                return -EFAULT;

        for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
                if (buf_status.resv[i])
                        return -EINVAL;

        bl = io_buffer_get_list(ctx, buf_status.buf_group);
        if (!bl)
                return -ENOENT;
        if (!bl->is_buf_ring)
                return -EINVAL;

        buf_status.head = bl->head;
        if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
                return -EFAULT;

        return 0;
}

struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
                                      unsigned long bgid)
{
        struct io_buffer_list *bl;
        bool ret;

        /*
         * We have to be a bit careful here - we're inside mmap and cannot grab
         * the uring_lock. This means the buffer_list could be simultaneously
         * going away, if someone is trying to be sneaky. Look it up under rcu
         * so we know it's not going away, and attempt to grab a reference to
         * it. If the ref is already zero, then fail the mapping. If successful,
         * the caller will call io_put_bl() to drop the reference at the end.
         * This may then safely free the buffer_list (and drop the pages) at
         * that point, vm_insert_pages() would've already grabbed the
         * necessary vma references.
         */
        rcu_read_lock();
        bl = xa_load(&ctx->io_bl_xa, bgid);
        /* must be a mmap'able buffer ring and have pages */
        ret = false;
        if (bl && bl->is_mmap)
                ret = atomic_inc_not_zero(&bl->refs);
        rcu_read_unlock();

        if (ret)
                return bl;

        return ERR_PTR(-EINVAL);
}

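/*
 * mmap() handler for IOU_PBUF_RING_MMAP rings: decode the buffer group ID
 * from the mmap offset, look up the ring under RCU and insert its pages
 * into the vma.
 */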
int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct io_ring_ctx *ctx = file->private_data;
        loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
        struct io_buffer_list *bl;
        int bgid, ret;

        bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
        bl = io_pbuf_get_bl(ctx, bgid);
        if (IS_ERR(bl))
                return PTR_ERR(bl);

        ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
        io_put_bl(ctx, bl);
        return ret;
}