linux.git / io_uring / rw.c (blame at "io_uring: opcode independent fixed buf import")
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/fsnotify.h>
#include <linux/poll.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#include "rsrc.h"
#include "rw.h"

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb kiocb;
	u64 addr;
	u32 len;
	rwf_t flags;
};

static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
	return req->flags & REQ_F_SUPPORT_NOWAIT;
}

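/*
 * Prepare a read/write request from the SQE: stash the offset, buffer
 * index, address, length and rw flags, validate any requested ioprio, and
 * for READ_FIXED/WRITE_FIXED resolve the registered buffer up front.
 */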
int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);
	unsigned ioprio;
	int ret;

	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);

	if (req->opcode == IORING_OP_READ_FIXED ||
	    req->opcode == IORING_OP_WRITE_FIXED) {
		struct io_ring_ctx *ctx = req->ctx;
		u16 index;

		if (unlikely(req->buf_index >= ctx->nr_user_bufs))
			return -EFAULT;
		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
		req->imu = ctx->user_bufs[index];
		io_req_set_rsrc_node(req, ctx, 0);
	}

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		rw->kiocb.ki_ioprio = ioprio;
	} else {
		rw->kiocb.ki_ioprio = get_current_ioprio();
	}

	rw->addr = READ_ONCE(sqe->addr);
	rw->len = READ_ONCE(sqe->len);
	rw->flags = READ_ONCE(sqe->rw_flags);
	return 0;
}

void io_readv_writev_cleanup(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	kfree(io->free_iovec);
}

static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * We can't just restart the syscall, since previously
		 * submitted sqes may already be in progress. Just fail this
		 * IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		kiocb->ki_complete(kiocb, ret);
	}
}

static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);

	if (rw->kiocb.ki_pos != -1)
		return &rw->kiocb.ki_pos;

	if (!(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		rw->kiocb.ki_pos = req->file->f_pos;
		return &rw->kiocb.ki_pos;
	}

	rw->kiocb.ki_pos = 0;
	return NULL;
}

static void io_req_task_queue_reissue(struct io_kiocb *req)
{
	req->io_task_work.func = io_queue_iowq;
	io_req_task_work_add(req);
}

#ifdef CONFIG_BLOCK
static bool io_resubmit_prep(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	if (!req_has_async_data(req))
		return !io_req_prep_async(req);
	iov_iter_restore(&io->s.iter, &io->s.iter_state);
	return true;
}

static bool io_rw_should_reissue(struct io_kiocb *req)
{
	umode_t mode = file_inode(req->file)->i_mode;
	struct io_ring_ctx *ctx = req->ctx;

	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
		return false;
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;
	/*
	 * Play it safe and assume not safe to re-import and reissue if we're
	 * not in the original thread group (or in task context).
	 */
	if (!same_thread_group(req->task, current) || !in_task())
		return false;
	return true;
}
#else
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
#endif

static void kiocb_end_write(struct io_kiocb *req)
{
	/*
	 * Tell lockdep we inherited freeze protection from submission
	 * thread.
	 */
	if (req->flags & REQ_F_ISREG) {
		struct super_block *sb = file_inode(req->file)->i_sb;

		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
		sb_end_write(sb);
	}
}

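/*
 * Common completion handling for the non-iopoll path: release write freeze
 * protection, raise fsnotify events, and handle results that don't match
 * what was requested by either marking the request for reissue or failing it.
 */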
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);

	if (rw->kiocb.ki_flags & IOCB_WRITE) {
		kiocb_end_write(req);
		fsnotify_modify(req->file);
	} else {
		fsnotify_access(req->file);
	}
	if (unlikely(res != req->cqe.res)) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
			return true;
		}
		req_set_fail(req);
		req->cqe.res = res;
	}
	return false;
}

static void io_complete_rw(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (__io_complete_rw_common(req, res))
		return;
	io_req_set_res(req, res, 0);
	req->io_task_work.func = io_req_task_complete;
	io_req_task_prio_work_add(req);
}

static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
	if (unlikely(res != req->cqe.res)) {
		if (res == -EAGAIN && io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
			return;
		}
		req->cqe.res = res;
	}

	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
}

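/*
 * Finish off a read/write attempt: fold in bytes completed by earlier
 * partial attempts, update f_pos for REQ_F_CUR_POS requests, then either
 * post the completion inline or hand off to io_rw_done(). Requests flagged
 * for reissue are re-queued to io-wq instead.
 */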
static int kiocb_done(struct io_kiocb *req, ssize_t ret,
		      unsigned int issue_flags)
{
	struct io_async_rw *io = req->async_data;
	struct io_rw *rw = io_kiocb_to_cmd(req);

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		if (ret < 0)
			ret = io->bytes_done;
		else
			ret += io->bytes_done;
	}

	if (req->flags & REQ_F_CUR_POS)
		req->file->f_pos = rw->kiocb.ki_pos;
	if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
		if (!__io_complete_rw_common(req, ret)) {
			io_req_set_res(req, req->cqe.res,
				       io_put_kbuf(req, issue_flags));
			return IOU_OK;
		}
	} else {
		io_rw_done(&rw->kiocb, ret);
	}

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req))
			io_req_task_queue_reissue(req);
		else
			io_req_task_queue_fail(req, ret);
	}
	return IOU_ISSUE_SKIP_COMPLETE;
}

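/*
 * Set up a bvec iterator over a registered (fixed) buffer. The requested
 * range must lie entirely within the mapped region; offsets past the first
 * bvec are applied by skipping whole pages instead of walking the iterator.
 */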
static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}

#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
				unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct compat_iovec __user *uiov;
	compat_ssize_t clen;
	void __user *buf;
	size_t len;

	uiov = u64_to_user_ptr(rw->addr);
	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	len = clen;
	buf = io_buffer_select(req, &len, issue_flags);
	if (!buf)
		return -ENOBUFS;
	rw->addr = (unsigned long) buf;
	iov[0].iov_base = buf;
	rw->len = iov[0].iov_len = (compat_size_t) len;
	return 0;
}
#endif

static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
				      unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct iovec __user *uiov = u64_to_user_ptr(rw->addr);
	void __user *buf;
	ssize_t len;

	if (copy_from_user(iov, uiov, sizeof(*uiov)))
		return -EFAULT;

	len = iov[0].iov_len;
	if (len < 0)
		return -EINVAL;
	buf = io_buffer_select(req, &len, issue_flags);
	if (!buf)
		return -ENOBUFS;
	rw->addr = (unsigned long) buf;
	iov[0].iov_base = buf;
	rw->len = iov[0].iov_len = len;
	return 0;
}

static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
				    unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);

	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
		iov[0].iov_base = u64_to_user_ptr(rw->addr);
		iov[0].iov_len = rw->len;
		return 0;
	}
	if (rw->len != 1)
		return -EINVAL;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return io_compat_import(req, iov, issue_flags);
#endif

	return __io_iov_buffer_select(req, iov, issue_flags);
}

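/*
 * Import the user memory for this request into an iov_iter: fixed opcodes
 * map the registered buffer, plain READ/WRITE (optionally with a selected
 * buffer) import a single range, and readv/writev import a full iovec,
 * returning any heap-allocated iovec for the caller to free.
 */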
static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
				       struct io_rw_state *s,
				       unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct iov_iter *iter = &s->iter;
	u8 opcode = req->opcode;
	struct iovec *iovec;
	void __user *buf;
	size_t sqe_len;
	ssize_t ret;

	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	buf = u64_to_user_ptr(rw->addr);
	sqe_len = rw->len;

	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
		if (io_do_buffer_select(req)) {
			buf = io_buffer_select(req, &sqe_len, issue_flags);
			if (!buf)
				return ERR_PTR(-ENOBUFS);
			rw->addr = (unsigned long) buf;
			rw->len = sqe_len;
		}

		ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	iovec = s->fast_iov;
	if (req->flags & REQ_F_BUFFER_SELECT) {
		ret = io_iov_buffer_select(req, iovec, issue_flags);
		if (ret)
			return ERR_PTR(ret);
		iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len);
		return NULL;
	}

	ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
			     req->ctx->compat);
	if (unlikely(ret < 0))
		return ERR_PTR(ret);
	return iovec;
}

static inline int io_import_iovec(int rw, struct io_kiocb *req,
				  struct iovec **iovec, struct io_rw_state *s,
				  unsigned int issue_flags)
{
	*iovec = __io_import_iovec(rw, req, s, issue_flags);
	if (unlikely(IS_ERR(*iovec)))
		return PTR_ERR(*iovec);

	iov_iter_save_state(&s->iter, &s->iter_state);
	return 0;
}

static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
}

/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
	loff_t *ppos;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;

	ppos = io_kiocb_ppos(kiocb);

	while (iov_iter_count(iter)) {
		struct iovec iovec;
		ssize_t nr;

		if (!iov_iter_is_bvec(iter)) {
			iovec = iov_iter_iovec(iter);
		} else {
			iovec.iov_base = u64_to_user_ptr(rw->addr);
			iovec.iov_len = rw->len;
		}

		if (ddir == READ) {
			nr = file->f_op->read(file, iovec.iov_base,
					      iovec.iov_len, ppos);
		} else {
			nr = file->f_op->write(file, iovec.iov_base,
					       iovec.iov_len, ppos);
		}

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
				break;
		}
		if (nr != iovec.iov_len)
			break;
	}

	return ret;
}

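/*
 * Copy the current iterator (and, for user iovecs, the vector itself) into
 * the request's async data so the I/O can be retried later from a
 * different context.
 */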
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
{
	struct io_async_rw *io = req->async_data;

	memcpy(&io->s.iter, iter, sizeof(*iter));
	io->free_iovec = iovec;
	io->bytes_done = 0;
	/* can only be fixed buffers, no need to do anything */
	if (iov_iter_is_bvec(iter))
		return;
	if (!iovec) {
		unsigned iov_off = 0;

		io->s.iter.iov = io->s.fast_iov;
		if (iter->iov != fast_iov) {
			iov_off = iter->iov - fast_iov;
			io->s.iter.iov += iov_off;
		}
		if (io->s.fast_iov != fast_iov)
			memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
			       sizeof(struct iovec) * iter->nr_segs);
	} else {
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}

static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     struct io_rw_state *s, bool force)
{
	if (!force && !io_op_defs[req->opcode].prep_async)
		return 0;
	if (!req_has_async_data(req)) {
		struct io_async_rw *iorw;

		if (io_alloc_async_data(req)) {
			kfree(iovec);
			return -ENOMEM;
		}

		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
		iorw = req->async_data;
		/* we've copied and mapped the iter, ensure state is saved */
		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
	}
	return 0;
}

static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
	struct io_async_rw *iorw = req->async_data;
	struct iovec *iov;
	int ret;

	/* submission path, ->uring_lock should already be taken */
	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
	if (unlikely(ret < 0))
		return ret;

	iorw->bytes_done = 0;
	iorw->free_iovec = iov;
	if (iov)
		req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

int io_readv_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, READ);
}

int io_writev_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, WRITE);
}

/*
 * This is our waitqueue callback handler, registered through __folio_lock_async()
 * when we initially tried to do the IO with the iocb and armed our waitqueue.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
 * queue a task_work based retry of the operation, attempting to copy the data
 * again. If the latter fails because the page was NOT uptodate, then we will
 * do a thread based blocking retry of the operation. That's the unexpected
 * slow path.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wpq, key))
		return 0;

	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);
	io_req_task_queue(req);
	return 1;
}

/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
 */
static bool io_rw_should_retry(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;
	struct wait_page_queue *wait = &io->wpq;
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct kiocb *kiocb = &rw->kiocb;

	/* never retry for NOWAIT, we just complete with -EAGAIN */
	if (req->flags & REQ_F_NOWAIT)
		return false;

	/* Only for buffered IO */
	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
		return false;

	/*
	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
	 */
	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
		return false;

	wait->wait.func = io_async_buf_func;
	wait->wait.private = req;
	wait->wait.flags = 0;
	INIT_LIST_HEAD(&wait->wait.entry);
	kiocb->ki_flags |= IOCB_WAITQ;
	kiocb->ki_flags &= ~IOCB_NOWAIT;
	kiocb->ki_waitq = wait;
	return true;
}

static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
{
	struct file *file = rw->kiocb.ki_filp;

	if (likely(file->f_op->read_iter))
		return call_read_iter(file, &rw->kiocb, iter);
	else if (file->f_op->read)
		return loop_rw_iter(READ, rw, iter);
	else
		return -EINVAL;
}

static bool need_read_all(struct io_kiocb *req)
{
	return req->flags & REQ_F_ISREG ||
		S_ISBLK(file_inode(req->file)->i_mode);
}

static inline bool io_req_ffs_set(struct io_kiocb *req)
{
	return req->flags & REQ_F_FIXED_FILE;
}

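/*
 * Per-issue kiocb setup: validate the file mode, apply the rw flags,
 * decide whether non-blocking issue can be relied upon, and select the
 * completion handler depending on whether the ring uses IOPOLL.
 */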
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct kiocb *kiocb = &rw->kiocb;
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	if (unlikely(!file || !(file->f_mode & mode)))
		return -EBADF;

	if (!io_req_ffs_set(req))
		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;

	kiocb->ki_flags = iocb_flags(file);
	ret = kiocb_set_rw_flags(kiocb, rw->flags);
	if (unlikely(ret))
		return ret;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->private = NULL;
		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	return 0;
}

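/*
 * Issue a read. The fast path attempts a non-blocking read; on -EAGAIN the
 * request is punted to io-wq or, for buffered reads on files supporting
 * FMODE_BUF_RASYNC, re-armed on the page waitqueue and retried from
 * task_work. Partial reads resume from the saved iterator state.
 */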
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &rw->kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_async_rw *io;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		io = req->async_data;
		s = &io->s;

		/*
		 * Safe and required to re-import if we're using provided
		 * buffers, as we dropped the selected one before retry.
		 */
		if (io_do_buffer_select(req)) {
			ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
			if (unlikely(ret < 0))
				return ret;
		}

		/*
		 * We come here from an earlier attempt, restore our state to
		 * match in case it doesn't. It's cheap enough that we don't
		 * need to make this conditional.
		 */
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	ret = io_rw_init_file(req, FMODE_READ);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
	req->cqe.res = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req))) {
			ret = io_setup_async_rw(req, iovec, s, true);
			return ret ?: -EAGAIN;
		}
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	ret = io_iter_do_read(rw, &s->iter);

	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/* if we can poll, just do that */
		if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
			return -EAGAIN;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		if (iovec)
			kfree(iovec);
		return IOU_ISSUE_SKIP_COMPLETE;
	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&s->iter, &s->iter_state);

	ret2 = io_setup_async_rw(req, iovec, s, true);
	if (ret2)
		return ret2;

	iovec = NULL;
	io = req->async_data;
	s = &io->s;
	/*
	 * Now use our persistent iterator and state, if we aren't already.
	 * We've restored and mapped the iter to match.
	 */

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&s->iter, ret);
		if (!iov_iter_count(&s->iter))
			break;
		io->bytes_done += ret;
		iov_iter_save_state(&s->iter, &s->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(rw, &s->iter);
		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&s->iter, &s->iter_state);
	} while (ret > 0);
done:
	/* it's faster to check here than delegate to kfree */
	if (iovec)
		kfree(iovec);
	return kiocb_done(req, ret, issue_flags);
}

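/*
 * Issue a write. Attempts that can't proceed without blocking are copied
 * to async data and punted (copy_iov). Regular-file writes take freeze
 * protection here; it is dropped again in kiocb_end_write() at completion.
 */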
int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req);
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &rw->kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		struct io_async_rw *io = req->async_data;

		s = &io->s;
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	ret = io_rw_init_file(req, FMODE_WRITE);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
	req->cqe.res = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req)))
			goto copy_iov;

		/* file path doesn't support NOWAIT for non-direct_IO */
		if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
		    (req->flags & REQ_F_ISREG))
			goto copy_iov;

		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	/*
	 * Open-code file_start_write here to grab freeze protection,
	 * which will be released by another thread in
	 * io_complete_rw(). Fool lockdep by telling it the lock got
	 * released so that it doesn't complain about the held lock when
	 * we return to userspace.
	 */
	if (req->flags & REQ_F_ISREG) {
		sb_start_write(file_inode(req->file)->i_sb);
		__sb_writers_release(file_inode(req->file)->i_sb,
				     SB_FREEZE_WRITE);
	}
	kiocb->ki_flags |= IOCB_WRITE;

	if (likely(req->file->f_op->write_iter))
		ret2 = call_write_iter(req->file, kiocb, &s->iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, rw, &s->iter);
	else
		ret2 = -EINVAL;

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		ret2 = -EAGAIN;
	}

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
			goto copy_iov;
done:
		ret = kiocb_done(req, ret2, issue_flags);
	} else {
copy_iov:
		iov_iter_restore(&s->iter, &s->iter_state);
		ret = io_setup_async_rw(req, iovec, s, false);
		return ret ?: -EAGAIN;
	}
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	return ret;
}

static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
	io_commit_cqring_flush(ctx);
	if (ctx->flags & IORING_SETUP_SQPOLL)
		io_cqring_wake(ctx);
}

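/*
 * Reap completions for an IOPOLL ring: poll the driver for entries on the
 * iopoll list until at least one request has completed, then post CQEs for
 * the completed prefix of the list and free those requests.
 */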
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = BLK_POLL_NOSLEEP;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct io_rw *rw = io_kiocb_to_cmd(req);
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
		if (unlikely(req->flags & REQ_F_CQE_SKIP))
			continue;

		req->cqe.flags = io_put_kbuf(req, 0);
		__io_fill_cqe_req(req->ctx, req);
	}

	if (unlikely(!nr_events))
		return 0;

	io_commit_cqring(ctx);
	io_cqring_ev_posted_iopoll(ctx);
	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);
	io_free_batch_list(ctx, pos);
	return nr_events;
}