io_uring/kbuf.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/kernel.h>
   3 #include <linux/errno.h>
   4 #include <linux/fs.h>
   5 #include <linux/file.h>
   6 #include <linux/mm.h>
   7 #include <linux/slab.h>
   8 #include <linux/namei.h>
   9 #include <linux/poll.h>
  10 #include <linux/vmalloc.h>
  11 #include <linux/io_uring.h>
  12
  13 #include <uapi/linux/io_uring.h>
  14
  15 #include "io_uring.h"
  16 #include "opdef.h"
  17 #include "kbuf.h"
  18 #include "memmap.h"
  19
  20 /* BIDs are addressed by a 16-bit field in a CQE */
  21 #define MAX_BIDS_PER_BGID (1 << 16)
  22
  23 struct kmem_cache *io_buf_cachep;
  24
  25 struct io_provide_buf {
  26         struct file                     *file;
  27         __u64                           addr;
  28         __u32                           len;
  29         __u32                           bgid;
  30         __u32                           nbufs;
  31         __u16                           bid;
  32 };
  33
  34 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
  35                                                         unsigned int bgid)
  36 {
  37         lockdep_assert_held(&ctx->uring_lock);
  38
  39         return xa_load(&ctx->io_bl_xa, bgid);
  40 }
  41
  42 static int io_buffer_add_list(struct io_ring_ctx *ctx,
  43                               struct io_buffer_list *bl, unsigned int bgid)
  44 {
  45         /*
  46          * Store buffer group ID and finally mark the list as visible.
  47          * The normal lookup doesn't care about the visibility as we're
  48          * always under the ->uring_lock, but the RCU lookup from mmap does.
  49          */
  50         bl->bgid = bgid;
  51         atomic_set(&bl->refs, 1);
  52         return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
  53 }
  54
  55 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
  56 {
  57         struct io_ring_ctx *ctx = req->ctx;
  58         struct io_buffer_list *bl;
  59         struct io_buffer *buf;
  60
  61         io_ring_submit_lock(ctx, issue_flags);
  62
  63         buf = req->kbuf;
  64         bl = io_buffer_get_list(ctx, buf->bgid);
  65         list_add(&buf->list, &bl->buf_list);
  66         req->flags &= ~REQ_F_BUFFER_SELECTED;
  67         req->buf_index = buf->bgid;
  68
  69         io_ring_submit_unlock(ctx, issue_flags);
  70         return true;
  71 }
  72
  73 void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags)
  74 {
  75         /*
  76          * We can add this buffer back to two lists:
  77          *
  78          * 1) The io_buffers_cache list. This one is protected by the
  79          *    ctx->uring_lock. If we already hold this lock, add back to this
  80          *    list as we can grab it from issue as well.
  81          * 2) The io_buffers_comp list. This one is protected by the
  82          *    ctx->completion_lock.
  83          *
  84          * We migrate buffers from the comp_list to the issue cache list
  85          * when we need one.
  86          */
  87         if (issue_flags & IO_URING_F_UNLOCKED) {
  88                 struct io_ring_ctx *ctx = req->ctx;
  89
  90                 spin_lock(&ctx->completion_lock);
  91                 __io_put_kbuf_list(req, len, &ctx->io_buffers_comp);
  92                 spin_unlock(&ctx->completion_lock);
  93         } else {
  94                 lockdep_assert_held(&req->ctx->uring_lock);
  95
  96                 __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache);
  97         }
  98 }
  99
 100 static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
 101                                               struct io_buffer_list *bl)
 102 {
 103         if (!list_empty(&bl->buf_list)) {
 104                 struct io_buffer *kbuf;
 105
 106                 kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
 107                 list_del(&kbuf->list);
 108                 if (*len == 0 || *len > kbuf->len)
 109                         *len = kbuf->len;
 110                 if (list_empty(&bl->buf_list))
 111                         req->flags |= REQ_F_BL_EMPTY;
 112                 req->flags |= REQ_F_BUFFER_SELECTED;
 113                 req->kbuf = kbuf;
 114                 req->buf_index = kbuf->bid;
 115                 return u64_to_user_ptr(kbuf->addr);
 116         }
 117         return NULL;
 118 }
 119
 120 static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
 121                                       struct io_buffer_list *bl,
 122                                       struct iovec *iov)
 123 {
 124         void __user *buf;
 125
 126         buf = io_provided_buffer_select(req, len, bl);
 127         if (unlikely(!buf))
 128                 return -ENOBUFS;
 129
 130         iov[0].iov_base = buf;
 131         iov[0].iov_len = *len;
 132         return 1;
 133 }
 134
 135 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 136                                           struct io_buffer_list *bl,
 137                                           unsigned int issue_flags)
 138 {
 139         struct io_uring_buf_ring *br = bl->buf_ring;
 140         __u16 tail, head = bl->head;
 141         struct io_uring_buf *buf;
 142         void __user *ret;
 143
 144         tail = smp_load_acquire(&br->tail);
 145         if (unlikely(tail == head))
 146                 return NULL;
 147
 148         if (head + 1 == tail)
 149                 req->flags |= REQ_F_BL_EMPTY;
 150
 151         buf = io_ring_head_to_buf(br, head, bl->mask);
 152         if (*len == 0 || *len > buf->len)
 153                 *len = buf->len;
 154         req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
 155         req->buf_list = bl;
 156         req->buf_index = buf->bid;
 157         ret = u64_to_user_ptr(buf->addr);
 158
 159         if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
 160                 /*
 161                  * If we came in unlocked, we have no choice but to consume the
 162                  * buffer here, otherwise nothing ensures that the buffer won't
 163                  * get used by others. This does mean it'll be pinned until the
 164                  * IO completes, coming in unlocked means we're being called from
 165                  * io-wq context and there may be further retries in async hybrid
 166                  * mode. For the locked case, the caller must call commit when
 167                  * the transfer completes (or if we get -EAGAIN and must poll of
 168                  * retry).
 169                  */
 170                 io_kbuf_commit(req, bl, *len, 1);
 171                 req->buf_list = NULL;
 172         }
 173         return ret;
 174 }
 175
 176 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 177                               unsigned int issue_flags)
 178 {
 179         struct io_ring_ctx *ctx = req->ctx;
 180         struct io_buffer_list *bl;
 181         void __user *ret = NULL;
 182
 183         io_ring_submit_lock(req->ctx, issue_flags);
 184
 185         bl = io_buffer_get_list(ctx, req->buf_index);
 186         if (likely(bl)) {
 187                 if (bl->flags & IOBL_BUF_RING)
 188                         ret = io_ring_buffer_select(req, len, bl, issue_flags);
 189                 else
 190                         ret = io_provided_buffer_select(req, len, bl);
 191         }
 192         io_ring_submit_unlock(req->ctx, issue_flags);
 193         return ret;
 194 }
 195
 196 /* cap it at a reasonable 256, will be one page even for 4K */
 197 #define PEEK_MAX_IMPORT         256
 198
 199 static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 200                                 struct io_buffer_list *bl)
 201 {
 202         struct io_uring_buf_ring *br = bl->buf_ring;
 203         struct iovec *iov = arg->iovs;
 204         int nr_iovs = arg->nr_iovs;
 205         __u16 nr_avail, tail, head;
 206         struct io_uring_buf *buf;
 207
 208         tail = smp_load_acquire(&br->tail);
 209         head = bl->head;
 210         nr_avail = min_t(__u16, tail - head, UIO_MAXIOV);
 211         if (unlikely(!nr_avail))
 212                 return -ENOBUFS;
 213
 214         buf = io_ring_head_to_buf(br, head, bl->mask);
 215         if (arg->max_len) {
 216                 u32 len = READ_ONCE(buf->len);
 217
 218                 if (unlikely(!len))
 219                         return -ENOBUFS;
 220                 /*
 221                  * Limit incremental buffers to 1 segment. No point trying
 222                  * to peek ahead and map more than we need, when the buffers
 223                  * themselves should be large when setup with
 224                  * IOU_PBUF_RING_INC.
 225                  */
 226                 if (bl->flags & IOBL_INC) {
 227                         nr_avail = 1;
 228                 } else {
 229                         size_t needed;
 230
 231                         needed = (arg->max_len + len - 1) / len;
 232                         needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
 233                         if (nr_avail > needed)
 234                                 nr_avail = needed;
 235                 }
 236         }
 237
 238         /*
 239          * only alloc a bigger array if we know we have data to map, eg not
 240          * a speculative peek operation.
 241          */
 242         if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
 243                 iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
 244                 if (unlikely(!iov))
 245                         return -ENOMEM;
 246                 if (arg->mode & KBUF_MODE_FREE)
 247                         kfree(arg->iovs);
 248                 arg->iovs = iov;
 249                 nr_iovs = nr_avail;
 250         } else if (nr_avail < nr_iovs) {
 251                 nr_iovs = nr_avail;
 252         }
 253
 254         /* set it to max, if not set, so we can use it unconditionally */
 255         if (!arg->max_len)
 256                 arg->max_len = INT_MAX;
 257
 258         req->buf_index = buf->bid;
 259         do {
 260                 u32 len = buf->len;
 261
 262                 /* truncate end piece, if needed, for non partial buffers */
 263                 if (len > arg->max_len) {
 264                         len = arg->max_len;
 265                         if (!(bl->flags & IOBL_INC))
 266                                 buf->len = len;
 267                 }
 268
 269                 iov->iov_base = u64_to_user_ptr(buf->addr);
 270                 iov->iov_len = len;
 271                 iov++;
 272
 273                 arg->out_len += len;
 274                 arg->max_len -= len;
 275                 if (!arg->max_len)
 276                         break;
 277
 278                 buf = io_ring_head_to_buf(br, ++head, bl->mask);
 279         } while (--nr_iovs);
 280
 281         if (head == tail)
 282                 req->flags |= REQ_F_BL_EMPTY;
 283
 284         req->flags |= REQ_F_BUFFER_RING;
 285         req->buf_list = bl;
 286         return iov - arg->iovs;
 287 }
 288
 289 int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
 290                       unsigned int issue_flags)
 291 {
 292         struct io_ring_ctx *ctx = req->ctx;
 293         struct io_buffer_list *bl;
 294         int ret = -ENOENT;
 295
 296         io_ring_submit_lock(ctx, issue_flags);
 297         bl = io_buffer_get_list(ctx, req->buf_index);
 298         if (unlikely(!bl))
 299                 goto out_unlock;
 300
 301         if (bl->flags & IOBL_BUF_RING) {
 302                 ret = io_ring_buffers_peek(req, arg, bl);
 303                 /*
 304                  * Don't recycle these buffers if we need to go through poll.
 305                  * Nobody else can use them anyway, and holding on to provided
 306                  * buffers for a send/write operation would happen on the app
 307                  * side anyway with normal buffers. Besides, we already
 308                  * committed them, they cannot be put back in the queue.
 309                  */
 310                 if (ret > 0) {
 311                         req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
 312                         io_kbuf_commit(req, bl, arg->out_len, ret);
 313                 }
 314         } else {
 315                 ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
 316         }
 317 out_unlock:
 318         io_ring_submit_unlock(ctx, issue_flags);
 319         return ret;
 320 }
 321
 322 int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
 323 {
 324         struct io_ring_ctx *ctx = req->ctx;
 325         struct io_buffer_list *bl;
 326         int ret;
 327
 328         lockdep_assert_held(&ctx->uring_lock);
 329
 330         bl = io_buffer_get_list(ctx, req->buf_index);
 331         if (unlikely(!bl))
 332                 return -ENOENT;
 333
 334         if (bl->flags & IOBL_BUF_RING) {
 335                 ret = io_ring_buffers_peek(req, arg, bl);
 336                 if (ret > 0)
 337                         req->flags |= REQ_F_BUFFERS_COMMIT;
 338                 return ret;
 339         }
 340
 341         /* don't support multiple buffer selections for legacy */
 342         return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
 343 }
 344
 345 static int __io_remove_buffers(struct io_ring_ctx *ctx,
 346                                struct io_buffer_list *bl, unsigned nbufs)
 347 {
 348         unsigned i = 0;
 349
 350         /* shouldn't happen */
 351         if (!nbufs)
 352                 return 0;
 353
 354         if (bl->flags & IOBL_BUF_RING) {
 355                 i = bl->buf_ring->tail - bl->head;
 356                 if (bl->buf_nr_pages) {
 357                         int j;
 358
 359                         if (!(bl->flags & IOBL_MMAP)) {
 360                                 for (j = 0; j < bl->buf_nr_pages; j++)
 361                                         unpin_user_page(bl->buf_pages[j]);
 362                         }
 363                         io_pages_unmap(bl->buf_ring, &bl->buf_pages,
 364                                         &bl->buf_nr_pages, bl->flags & IOBL_MMAP);
 365                         bl->flags &= ~IOBL_MMAP;
 366                 }
 367                 /* make sure it's seen as empty */
 368                 INIT_LIST_HEAD(&bl->buf_list);
 369                 bl->flags &= ~IOBL_BUF_RING;
 370                 return i;
 371         }
 372
 373         /* protects io_buffers_cache */
 374         lockdep_assert_held(&ctx->uring_lock);
 375
 376         while (!list_empty(&bl->buf_list)) {
 377                 struct io_buffer *nxt;
 378
 379                 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
 380                 list_move(&nxt->list, &ctx->io_buffers_cache);
 381                 if (++i == nbufs)
 382                         return i;
 383                 cond_resched();
 384         }
 385
 386         return i;
 387 }
 388
 389 void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
 390 {
 391         if (atomic_dec_and_test(&bl->refs)) {
 392                 __io_remove_buffers(ctx, bl, -1U);
 393                 kfree_rcu(bl, rcu);
 394         }
 395 }
 396
 397 void io_destroy_buffers(struct io_ring_ctx *ctx)
 398 {
 399         struct io_buffer_list *bl;
 400         struct list_head *item, *tmp;
 401         struct io_buffer *buf;
 402         unsigned long index;
 403
 404         xa_for_each(&ctx->io_bl_xa, index, bl) {
 405                 xa_erase(&ctx->io_bl_xa, bl->bgid);
 406                 io_put_bl(ctx, bl);
 407         }
 408
 409         /*
 410          * Move deferred locked entries to cache before pruning
 411          */
 412         spin_lock(&ctx->completion_lock);
 413         if (!list_empty(&ctx->io_buffers_comp))
 414                 list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
 415         spin_unlock(&ctx->completion_lock);
 416
 417         list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
 418                 buf = list_entry(item, struct io_buffer, list);
 419                 kmem_cache_free(io_buf_cachep, buf);
 420         }
 421 }
 422
 423 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 424 {
 425         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 426         u64 tmp;
 427
 428         if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
 429             sqe->splice_fd_in)
 430                 return -EINVAL;
 431
 432         tmp = READ_ONCE(sqe->fd);
 433         if (!tmp || tmp > MAX_BIDS_PER_BGID)
 434                 return -EINVAL;
 435
 436         memset(p, 0, sizeof(*p));
 437         p->nbufs = tmp;
 438         p->bgid = READ_ONCE(sqe->buf_group);
 439         return 0;
 440 }
 441
 442 int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 443 {
 444         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 445         struct io_ring_ctx *ctx = req->ctx;
 446         struct io_buffer_list *bl;
 447         int ret = 0;
 448
 449         io_ring_submit_lock(ctx, issue_flags);
 450
 451         ret = -ENOENT;
 452         bl = io_buffer_get_list(ctx, p->bgid);
 453         if (bl) {
 454                 ret = -EINVAL;
 455                 /* can't use provide/remove buffers command on mapped buffers */
 456                 if (!(bl->flags & IOBL_BUF_RING))
 457                         ret = __io_remove_buffers(ctx, bl, p->nbufs);
 458         }
 459         io_ring_submit_unlock(ctx, issue_flags);
 460         if (ret < 0)
 461                 req_set_fail(req);
 462         io_req_set_res(req, ret, 0);
 463         return IOU_OK;
 464 }
 465
 466 int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 467 {
 468         unsigned long size, tmp_check;
 469         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 470         u64 tmp;
 471
 472         if (sqe->rw_flags || sqe->splice_fd_in)
 473                 return -EINVAL;
 474
 475         tmp = READ_ONCE(sqe->fd);
 476         if (!tmp || tmp > MAX_BIDS_PER_BGID)
 477                 return -E2BIG;
 478         p->nbufs = tmp;
 479         p->addr = READ_ONCE(sqe->addr);
 480         p->len = READ_ONCE(sqe->len);
 481
 482         if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
 483                                 &size))
 484                 return -EOVERFLOW;
 485         if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
 486                 return -EOVERFLOW;
 487
 488         size = (unsigned long)p->len * p->nbufs;
 489         if (!access_ok(u64_to_user_ptr(p->addr), size))
 490                 return -EFAULT;
 491
 492         p->bgid = READ_ONCE(sqe->buf_group);
 493         tmp = READ_ONCE(sqe->off);
 494         if (tmp > USHRT_MAX)
 495                 return -E2BIG;
 496         if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
 497                 return -EINVAL;
 498         p->bid = tmp;
 499         return 0;
 500 }
 501
 502 #define IO_BUFFER_ALLOC_BATCH 64
 503
 504 static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
 505 {
 506         struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
 507         int allocated;
 508
 509         /*
 510          * Completions that don't happen inline (eg not under uring_lock) will
 511          * add to ->io_buffers_comp. If we don't have any free buffers, check
 512          * the completion list and splice those entries first.
 513          */
 514         if (!list_empty_careful(&ctx->io_buffers_comp)) {
 515                 spin_lock(&ctx->completion_lock);
 516                 if (!list_empty(&ctx->io_buffers_comp)) {
 517                         list_splice_init(&ctx->io_buffers_comp,
 518                                                 &ctx->io_buffers_cache);
 519                         spin_unlock(&ctx->completion_lock);
 520                         return 0;
 521                 }
 522                 spin_unlock(&ctx->completion_lock);
 523         }
 524
 525         /*
 526          * No free buffers and no completion entries either. Allocate a new
 527          * batch of buffer entries and add those to our freelist.
 528          */
 529
 530         allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
 531                                           ARRAY_SIZE(bufs), (void **) bufs);
 532         if (unlikely(!allocated)) {
 533                 /*
 534                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 535                  * retry single alloc to be on the safe side.
 536                  */
 537                 bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
 538                 if (!bufs[0])
 539                         return -ENOMEM;
 540                 allocated = 1;
 541         }
 542
 543         while (allocated)
 544                 list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
 545
 546         return 0;
 547 }
 548
 549 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
 550                           struct io_buffer_list *bl)
 551 {
 552         struct io_buffer *buf;
 553         u64 addr = pbuf->addr;
 554         int i, bid = pbuf->bid;
 555
 556         for (i = 0; i < pbuf->nbufs; i++) {
 557                 if (list_empty(&ctx->io_buffers_cache) &&
 558                     io_refill_buffer_cache(ctx))
 559                         break;
 560                 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
 561                                         list);
 562                 list_move_tail(&buf->list, &bl->buf_list);
 563                 buf->addr = addr;
 564                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
 565                 buf->bid = bid;
 566                 buf->bgid = pbuf->bgid;
 567                 addr += pbuf->len;
 568                 bid++;
 569                 cond_resched();
 570         }
 571
 572         return i ? 0 : -ENOMEM;
 573 }
 574
 575 int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 576 {
 577         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 578         struct io_ring_ctx *ctx = req->ctx;
 579         struct io_buffer_list *bl;
 580         int ret = 0;
 581
 582         io_ring_submit_lock(ctx, issue_flags);
 583
 584         bl = io_buffer_get_list(ctx, p->bgid);
 585         if (unlikely(!bl)) {
 586                 bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
 587                 if (!bl) {
 588                         ret = -ENOMEM;
 589                         goto err;
 590                 }
 591                 INIT_LIST_HEAD(&bl->buf_list);
 592                 ret = io_buffer_add_list(ctx, bl, p->bgid);
 593                 if (ret) {
 594                         /*
 595                          * Doesn't need rcu free as it was never visible, but
 596                          * let's keep it consistent throughout.
 597                          */
 598                         kfree_rcu(bl, rcu);
 599                         goto err;
 600                 }
 601         }
 602         /* can't add buffers via this command for a mapped buffer ring */
 603         if (bl->flags & IOBL_BUF_RING) {
 604                 ret = -EINVAL;
 605                 goto err;
 606         }
 607
 608         ret = io_add_buffers(ctx, p, bl);
 609 err:
 610         io_ring_submit_unlock(ctx, issue_flags);
 611
 612         if (ret < 0)
 613                 req_set_fail(req);
 614         io_req_set_res(req, ret, 0);
 615         return IOU_OK;
 616 }
 617
 618 static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 619                             struct io_buffer_list *bl)
 620 {
 621         struct io_uring_buf_ring *br = NULL;
 622         struct page **pages;
 623         int nr_pages, ret;
 624
 625         pages = io_pin_pages(reg->ring_addr,
 626                              flex_array_size(br, bufs, reg->ring_entries),
 627                              &nr_pages);
 628         if (IS_ERR(pages))
 629                 return PTR_ERR(pages);
 630
 631         br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
 632         if (!br) {
 633                 ret = -ENOMEM;
 634                 goto error_unpin;
 635         }
 636
 637 #ifdef SHM_COLOUR
 638         /*
 639          * On platforms that have specific aliasing requirements, SHM_COLOUR
 640          * is set and we must guarantee that the kernel and user side align
 641          * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
 642          * the application mmap's the provided ring buffer. Fail the request
 643          * if we, by chance, don't end up with aligned addresses. The app
 644          * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
 645          * this transparently.
 646          */
 647         if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
 648                 ret = -EINVAL;
 649                 goto error_unpin;
 650         }
 651 #endif
 652         bl->buf_pages = pages;
 653         bl->buf_nr_pages = nr_pages;
 654         bl->buf_ring = br;
 655         bl->flags |= IOBL_BUF_RING;
 656         bl->flags &= ~IOBL_MMAP;
 657         return 0;
 658 error_unpin:
 659         unpin_user_pages(pages, nr_pages);
 660         kvfree(pages);
 661         vunmap(br);
 662         return ret;
 663 }
 664
 665 static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
 666                               struct io_uring_buf_reg *reg,
 667                               struct io_buffer_list *bl)
 668 {
 669         size_t ring_size;
 670
 671         ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
 672
 673         bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
 674         if (IS_ERR(bl->buf_ring)) {
 675                 bl->buf_ring = NULL;
 676                 return -ENOMEM;
 677         }
 678
 679         bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
 680         return 0;
 681 }
 682
 683 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 684 {
 685         struct io_uring_buf_reg reg;
 686         struct io_buffer_list *bl, *free_bl = NULL;
 687         int ret;
 688
 689         lockdep_assert_held(&ctx->uring_lock);
 690
 691         if (copy_from_user(&reg, arg, sizeof(reg)))
 692                 return -EFAULT;
 693
 694         if (reg.resv[0] || reg.resv[1] || reg.resv[2])
 695                 return -EINVAL;
 696         if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
 697                 return -EINVAL;
 698         if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
 699                 if (!reg.ring_addr)
 700                         return -EFAULT;
 701                 if (reg.ring_addr & ~PAGE_MASK)
 702                         return -EINVAL;
 703         } else {
 704                 if (reg.ring_addr)
 705                         return -EINVAL;
 706         }
 707
 708         if (!is_power_of_2(reg.ring_entries))
 709                 return -EINVAL;
 710
 711         /* cannot disambiguate full vs empty due to head/tail size */
 712         if (reg.ring_entries >= 65536)
 713                 return -EINVAL;
 714
 715         bl = io_buffer_get_list(ctx, reg.bgid);
 716         if (bl) {
 717                 /* if mapped buffer ring OR classic exists, don't allow */
 718                 if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
 719                         return -EEXIST;
 720         } else {
 721                 free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
 722                 if (!bl)
 723                         return -ENOMEM;
 724         }
 725
 726         if (!(reg.flags & IOU_PBUF_RING_MMAP))
 727                 ret = io_pin_pbuf_ring(&reg, bl);
 728         else
 729                 ret = io_alloc_pbuf_ring(ctx, &reg, bl);
 730
 731         if (!ret) {
 732                 bl->nr_entries = reg.ring_entries;
 733                 bl->mask = reg.ring_entries - 1;
 734                 if (reg.flags & IOU_PBUF_RING_INC)
 735                         bl->flags |= IOBL_INC;
 736
 737                 io_buffer_add_list(ctx, bl, reg.bgid);
 738                 return 0;
 739         }
 740
 741         kfree_rcu(free_bl, rcu);
 742         return ret;
 743 }
 744
 745 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 746 {
 747         struct io_uring_buf_reg reg;
 748         struct io_buffer_list *bl;
 749
 750         lockdep_assert_held(&ctx->uring_lock);
 751
 752         if (copy_from_user(&reg, arg, sizeof(reg)))
 753                 return -EFAULT;
 754         if (reg.resv[0] || reg.resv[1] || reg.resv[2])
 755                 return -EINVAL;
 756         if (reg.flags)
 757                 return -EINVAL;
 758
 759         bl = io_buffer_get_list(ctx, reg.bgid);
 760         if (!bl)
 761                 return -ENOENT;
 762         if (!(bl->flags & IOBL_BUF_RING))
 763                 return -EINVAL;
 764
 765         xa_erase(&ctx->io_bl_xa, bl->bgid);
 766         io_put_bl(ctx, bl);
 767         return 0;
 768 }
 769
 770 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
 771 {
 772         struct io_uring_buf_status buf_status;
 773         struct io_buffer_list *bl;
 774         int i;
 775
 776         if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
 777                 return -EFAULT;
 778
 779         for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
 780                 if (buf_status.resv[i])
 781                         return -EINVAL;
 782
 783         bl = io_buffer_get_list(ctx, buf_status.buf_group);
 784         if (!bl)
 785                 return -ENOENT;
 786         if (!(bl->flags & IOBL_BUF_RING))
 787                 return -EINVAL;
 788
 789         buf_status.head = bl->head;
 790         if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
 791                 return -EFAULT;
 792
 793         return 0;
 794 }
 795
 796 struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
 797                                       unsigned long bgid)
 798 {
 799         struct io_buffer_list *bl;
 800         bool ret;
 801
 802         /*
 803          * We have to be a bit careful here - we're inside mmap and cannot grab
 804          * the uring_lock. This means the buffer_list could be simultaneously
 805          * going away, if someone is trying to be sneaky. Look it up under rcu
 806          * so we know it's not going away, and attempt to grab a reference to
 807          * it. If the ref is already zero, then fail the mapping. If successful,
 808          * the caller will call io_put_bl() to drop the the reference at at the
 809          * end. This may then safely free the buffer_list (and drop the pages)
 810          * at that point, vm_insert_pages() would've already grabbed the
 811          * necessary vma references.
 812          */
 813         rcu_read_lock();
 814         bl = xa_load(&ctx->io_bl_xa, bgid);
 815         /* must be a mmap'able buffer ring and have pages */
 816         ret = false;
 817         if (bl && bl->flags & IOBL_MMAP)
 818                 ret = atomic_inc_not_zero(&bl->refs);
 819         rcu_read_unlock();
 820
 821         if (ret)
 822                 return bl;
 823
 824         return ERR_PTR(-EINVAL);
 825 }
 826
 827 int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
 828 {
 829         struct io_ring_ctx *ctx = file->private_data;
 830         loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
 831         struct io_buffer_list *bl;
 832         int bgid, ret;
 833
 834         bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
 835         bl = io_pbuf_get_bl(ctx, bgid);
 836         if (IS_ERR(bl))
 837                 return PTR_ERR(bl);
 838
 839         ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
 840         io_put_bl(ctx, bl);
 841         return ret;
 842 }