// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest result collection, assessment and
 * retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/* Notes made in the collector */
#define HIT_PENDING		0x01	/* A front op was still pending */
#define MADE_PROGRESS		0x04	/* Made progress cleaning up a stream or the folio set */
#define BUFFERED		0x08	/* The pagecache needs cleaning up */
#define NEED_RETRY		0x10	/* A front op requests retrying */
#define COPY_TO_CACHE		0x40	/* Need to copy subrequest to cache */
#define ABANDON_SREQ		0x80	/* Need to abandon untransferred part of subrequest */

/*
 * Clear the unread part of an I/O request.
 */
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
	netfs_reset_iter(subreq);
	WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
	iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
	if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
}

/*
 * Flush, mark and unlock a folio that's now completely read.  If we want to
 * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
 * dirty and let writeback handle it.
 */
static void netfs_unlock_read_folio(struct netfs_io_request *rreq,
				    struct folio_queue *folioq,
				    int slot)
{
	struct netfs_folio *finfo;
	struct folio *folio = folioq_folio(folioq, slot);

	if (unlikely(folio_pos(folio) < rreq->abandon_to)) {
		trace_netfs_folio(folio, netfs_folio_trace_abandon);
		goto just_unlock;
	}

	flush_dcache_folio(folio);
	folio_mark_uptodate(folio);

	if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
		finfo = netfs_folio_info(folio);
		if (finfo) {
			trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
			if (finfo->netfs_group)
				folio_change_private(folio, finfo->netfs_group);
			else
				folio_detach_private(folio);
			kfree(finfo);
		}

		if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags)) {
			if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
				folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
				folio_mark_dirty(folio);
			}
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_done);
		}

		folioq_clear(folioq, slot);
	} else {
		// TODO: Use of PG_private_2 is deprecated.
		if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags))
			netfs_pgpriv2_copy_to_cache(rreq, folio);
	}

just_unlock:
	if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
		if (folio->index == rreq->no_unlock_folio &&
		    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
			_debug("no unlock");
		} else {
			trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
			folio_unlock(folio);
		}
	}

	folioq_clear(folioq, slot);
}

/*
 * Unlock any folios we've finished with.
 */
static void netfs_read_unlock_folios(struct netfs_io_request *rreq,
				     unsigned int *notes)
{
	struct folio_queue *folioq = rreq->buffer.tail;
	unsigned long long collected_to = rreq->collected_to;
	unsigned int slot = rreq->buffer.first_tail_slot;

	if (rreq->cleaned_to >= rreq->collected_to)
		return;

	// TODO: Begin decryption

	if (slot >= folioq_nr_slots(folioq)) {
		folioq = rolling_buffer_delete_spent(&rreq->buffer);
		if (!folioq) {
			rreq->front_folio_order = 0;
			return;
		}
		slot = 0;
	}

	for (;;) {
		struct folio *folio;
		unsigned long long fpos, fend;
		unsigned int order;
		size_t fsize;

		if (*notes & COPY_TO_CACHE)
			set_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);

		folio = folioq_folio(folioq, slot);
		if (WARN_ONCE(!folio_test_locked(folio),
			      "R=%08x: folio %lx is not locked\n",
			      rreq->debug_id, folio->index))
			trace_netfs_folio(folio, netfs_folio_trace_not_locked);

		order = folioq_folio_order(folioq, slot);
		rreq->front_folio_order = order;
		fsize = PAGE_SIZE << order;
		fpos = folio_pos(folio);
		fend = umin(fpos + fsize, rreq->i_size);

		trace_netfs_collect_folio(rreq, folio, fend, collected_to);

		/* Unlock any folio we've transferred all of. */
		if (collected_to < fend)
			break;

		netfs_unlock_read_folio(rreq, folioq, slot);
		WRITE_ONCE(rreq->cleaned_to, fpos + fsize);
		*notes |= MADE_PROGRESS;

		clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);

		/* Clean up the head folioq.  If we clear an entire folioq, then
		 * we can get rid of it provided it's not also the tail folioq
		 * being filled by the issuer.
		 */
		folioq_clear(folioq, slot);
		slot++;
		if (slot >= folioq_nr_slots(folioq)) {
			folioq = rolling_buffer_delete_spent(&rreq->buffer);
			if (!folioq)
				goto done;
			slot = 0;
			trace_netfs_folioq(folioq, netfs_trace_folioq_read_progress);
		}

		if (fpos + fsize >= collected_to)
			break;
	}

	rreq->buffer.tail = folioq;
done:
	rreq->buffer.first_tail_slot = slot;
}

/*
 * Collect and assess the results of various read subrequests.  We may need to
 * retry some of the results.
 *
 * Note that we have a sequence of subrequests, which may be drawing on
 * different sources and may or may not be the same size or starting position
 * and may not even correspond in boundary alignment.
 */
static void netfs_collect_read_results(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *front, *remove;
	struct netfs_io_stream *stream = &rreq->io_streams[0];
	unsigned int notes;

	_enter("%llx-%llx", rreq->start, rreq->start + rreq->len);
	trace_netfs_rreq(rreq, netfs_rreq_trace_collect);
	trace_netfs_collect(rreq);

reassess:
	if (rreq->origin == NETFS_READAHEAD ||
	    rreq->origin == NETFS_READPAGE ||
	    rreq->origin == NETFS_READ_FOR_WRITE)
		notes = BUFFERED;
	else
		notes = 0;

	/* Remove completed subrequests from the front of the stream and
	 * advance the completion point.  We stop when we hit something that's
	 * in progress.  The issuer thread may be adding stuff to the tail
	 * whilst we're doing this.
	 */
	front = READ_ONCE(stream->front);
	while (front) {
		size_t transferred;

		trace_netfs_collect_sreq(rreq, front);
		_debug("sreq [%x] %llx %zx/%zx",
		       front->debug_index, front->start, front->transferred, front->len);

		if (stream->collected_to < front->start) {
			trace_netfs_collect_gap(rreq, stream, front->start, 'F');
			stream->collected_to = front->start;
		}

		if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags))
			notes |= HIT_PENDING;
		smp_rmb(); /* Read counters after IN_PROGRESS flag. */
		transferred = READ_ONCE(front->transferred);

		/* If we can now collect the next folio, do so.  We don't want
		 * to defer this as we have to decide whether we need to copy
		 * to the cache or not, and that may differ between adjacent
		 * subreqs.
		 */
		if (notes & BUFFERED) {
			size_t fsize = PAGE_SIZE << rreq->front_folio_order;

			/* Clear the tail of a short read. */
			if (!(notes & HIT_PENDING) &&
			    front->error == 0 &&
			    transferred < front->len &&
			    (test_bit(NETFS_SREQ_HIT_EOF, &front->flags) ||
			     test_bit(NETFS_SREQ_CLEAR_TAIL, &front->flags))) {
				netfs_clear_unread(front);
				transferred = front->transferred = front->len;
				trace_netfs_sreq(front, netfs_sreq_trace_clear);
			}

			stream->collected_to = front->start + transferred;
			rreq->collected_to = stream->collected_to;

			if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &front->flags))
				notes |= COPY_TO_CACHE;

			if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
				rreq->abandon_to = front->start + front->len;
				front->transferred = front->len;
				transferred = front->len;
				trace_netfs_rreq(rreq, netfs_rreq_trace_set_abandon);
			}
			if (front->start + transferred >= rreq->cleaned_to + fsize ||
			    test_bit(NETFS_SREQ_HIT_EOF, &front->flags))
				netfs_read_unlock_folios(rreq, &notes);
		} else {
			stream->collected_to = front->start + transferred;
			rreq->collected_to = stream->collected_to;
		}

		/* Stall if the front is still undergoing I/O. */
		if (notes & HIT_PENDING)
			break;

		if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
			if (!stream->failed) {
				stream->error = front->error;
				rreq->error = front->error;
				set_bit(NETFS_RREQ_FAILED, &rreq->flags);
				stream->failed = true;
			}
			notes |= MADE_PROGRESS | ABANDON_SREQ;
		} else if (test_bit(NETFS_SREQ_NEED_RETRY, &front->flags)) {
			stream->need_retry = true;
			notes |= NEED_RETRY | MADE_PROGRESS;
			break;
		} else {
			if (!stream->failed)
				stream->transferred = stream->collected_to - rreq->start;
			notes |= MADE_PROGRESS;
		}

		/* Remove if completely consumed. */
		stream->source = front->source;
		spin_lock(&rreq->lock);

		remove = front;
		trace_netfs_sreq(front, netfs_sreq_trace_discard);
		list_del_init(&front->rreq_link);
		front = list_first_entry_or_null(&stream->subrequests,
						 struct netfs_io_subrequest, rreq_link);
		stream->front = front;
		spin_unlock(&rreq->lock);
		netfs_put_subrequest(remove, false,
				     notes & ABANDON_SREQ ?
				     netfs_sreq_trace_put_abandon :
				     netfs_sreq_trace_put_done);
	}

	trace_netfs_collect_stream(rreq, stream);
	trace_netfs_collect_state(rreq, rreq->collected_to, notes);

	if (!(notes & BUFFERED))
		rreq->cleaned_to = rreq->collected_to;

	if (notes & NEED_RETRY)
		goto need_retry;
	if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) {
		trace_netfs_rreq(rreq, netfs_rreq_trace_unpause);
		clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags);
		smp_mb__after_atomic(); /* Clear PAUSE before task state */
		wake_up(&rreq->waitq);
	}

	if (notes & MADE_PROGRESS) {
		goto reassess;
	}

out:
	_leave(" = %x", notes);
	return;

need_retry:
	/* Okay...  We're going to have to retry parts of the stream.  Note
	 * that any partially completed op will have had any wholly transferred
	 * folios removed from it.
	 */
	netfs_retry_reads(rreq);
	goto out;
}

/*
 * Do page flushing and suchlike after DIO.
 */
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct netfs_io_stream *stream = &rreq->io_streams[0];
	unsigned int i;

	/* Collect unbuffered reads and direct reads, adding up the transfer
	 * sizes until we find the first short or failed subrequest.
	 */
	list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
		rreq->transferred += subreq->transferred;

		if (subreq->transferred < subreq->len ||
		    test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
			rreq->error = subreq->error;
			break;
		}
	}

	if (rreq->origin == NETFS_DIO_READ) {
		for (i = 0; i < rreq->direct_bv_count; i++) {
			flush_dcache_page(rreq->direct_bv[i].bv_page);
			// TODO: cifs marks pages in the destination buffer
			// dirty under some circumstances after a read.  Do we
			// need to do that too?
			set_page_dirty(rreq->direct_bv[i].bv_page);
		}
	}

	if (rreq->iocb) {
		rreq->iocb->ki_pos += rreq->transferred;
		if (rreq->iocb->ki_complete)
			rreq->iocb->ki_complete(
				rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
	}
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
	if (rreq->origin == NETFS_DIO_READ)
		inode_dio_end(rreq->inode);
}

/*
 * Do processing after reading a monolithic single object.
 */
static void netfs_rreq_assess_single(struct netfs_io_request *rreq)
{
	struct netfs_io_stream *stream = &rreq->io_streams[0];

	if (!rreq->error && stream->source == NETFS_DOWNLOAD_FROM_SERVER &&
	    fscache_resources_valid(&rreq->cache_resources)) {
		trace_netfs_rreq(rreq, netfs_rreq_trace_dirty);
		netfs_single_mark_inode_dirty(rreq->inode);
	}

	if (rreq->iocb) {
		rreq->iocb->ki_pos += rreq->transferred;
		if (rreq->iocb->ki_complete)
			rreq->iocb->ki_complete(
				rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
	}
	if (rreq->netfs_ops->done)
		rreq->netfs_ops->done(rreq);
}

/*
 * Perform the collection of subrequests and folios.
 *
 * Note that we're in normal kernel thread context at this point, possibly
 * running on a workqueue.
 */
static void netfs_read_collection(struct netfs_io_request *rreq)
{
	struct netfs_io_stream *stream = &rreq->io_streams[0];

	netfs_collect_read_results(rreq);

	/* We're done when the app thread has finished posting subreqs and the
	 * queue of subrequests has been entirely consumed.
	 */
	if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags))
		return;
	smp_rmb(); /* Read ALL_QUEUED before subreq lists. */

	if (!list_empty(&stream->subrequests))
		return;

	/* Okay, declare that all I/O is complete. */
	rreq->transferred = stream->transferred;
	trace_netfs_rreq(rreq, netfs_rreq_trace_complete);

	//netfs_rreq_is_still_valid(rreq);

	switch (rreq->origin) {
	case NETFS_DIO_READ:
	case NETFS_READ_GAPS:
		netfs_rreq_assess_dio(rreq);
		break;
	case NETFS_READ_SINGLE:
		netfs_rreq_assess_single(rreq);
		break;
	default:
		break;
	}
	task_io_account_read(rreq->transferred);

	trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
	clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);

	trace_netfs_rreq(rreq, netfs_rreq_trace_done);
	netfs_clear_subrequests(rreq, false);
	netfs_unlock_abandoned_read_pages(rreq);
	if (unlikely(rreq->copy_to_cache))
		netfs_pgpriv2_end_copy_to_cache(rreq);
}

void netfs_read_collection_worker(struct work_struct *work)
{
	struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);

	netfs_see_request(rreq, netfs_rreq_trace_see_work);
	if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
		netfs_read_collection(rreq);
	netfs_put_request(rreq, false, netfs_rreq_trace_put_work);
}

/*
 * Wake the collection work item.
 */
void netfs_wake_read_collector(struct netfs_io_request *rreq)
{
	if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
		if (!work_pending(&rreq->work)) {
			netfs_get_request(rreq, netfs_rreq_trace_get_work);
			if (!queue_work(system_unbound_wq, &rreq->work))
				netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq);
		}
	} else {
		trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
		wake_up(&rreq->waitq);
	}
}

/**
 * netfs_read_subreq_progress - Note progress of a read operation.
 * @subreq: The read subrequest that has made progress.
 *
 * This tells the read side of netfs lib that a contributory I/O operation has
 * made some progress and that it may be possible to unlock some folios.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 */
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct netfs_io_stream *stream = &rreq->io_streams[0];
	size_t fsize = PAGE_SIZE << rreq->front_folio_order;

	trace_netfs_sreq(subreq, netfs_sreq_trace_progress);

	/* If we are at the head of the queue, wake up the collector,
	 * getting a ref to it if we were the ones to do so.
	 */
	if (subreq->start + subreq->transferred > rreq->cleaned_to + fsize &&
	    (rreq->origin == NETFS_READAHEAD ||
	     rreq->origin == NETFS_READPAGE ||
	     rreq->origin == NETFS_READ_FOR_WRITE) &&
	    list_is_first(&subreq->rreq_link, &stream->subrequests)) {
		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
		netfs_wake_read_collector(rreq);
	}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);
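
/*
 * Illustrative usage sketch (not part of this file; the helper name and the
 * "nread" parameter are hypothetical): a filesystem's receive path would
 * account the data it has just copied into the buffer and then nudge the
 * collector, e.g.:
 *
 *	static void my_fs_received_data(struct netfs_io_subrequest *subreq,
 *					size_t nread)
 *	{
 *		subreq->transferred += nread;
 *		netfs_read_subreq_progress(subreq);
 *	}
 */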

/**
 * netfs_read_subreq_terminated - Note the termination of an I/O operation.
 * @subreq: The I/O request that has terminated.
 *
 * This tells the read helper that a contributory I/O operation has terminated,
 * one way or another, and that it should integrate the results.
 *
 * The caller indicates the outcome of the operation through @subreq->error,
 * supplying 0 to indicate a successful or retryable transfer (if
 * NETFS_SREQ_NEED_RETRY is set) or a negative error code.  The helper will
 * look after reissuing I/O operations as appropriate and writing downloaded
 * data to the cache.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 */
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct netfs_io_stream *stream = &rreq->io_streams[0];

	switch (subreq->source) {
	case NETFS_READ_FROM_CACHE:
		netfs_stat(&netfs_n_rh_read_done);
		break;
	case NETFS_DOWNLOAD_FROM_SERVER:
		netfs_stat(&netfs_n_rh_download_done);
		break;
	default:
		break;
	}

	/* Deal with retry requests, short reads and errors.  If we retry
	 * but don't make progress, we abandon the attempt.
	 */
	if (!subreq->error && subreq->transferred < subreq->len) {
		if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
			trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
		} else if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) {
			trace_netfs_sreq(subreq, netfs_sreq_trace_need_clear);
		} else if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
			trace_netfs_sreq(subreq, netfs_sreq_trace_need_retry);
		} else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
			__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
			trace_netfs_sreq(subreq, netfs_sreq_trace_partial_read);
		} else {
			__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
			subreq->error = -ENODATA;
			trace_netfs_sreq(subreq, netfs_sreq_trace_short);
		}
	}

	if (unlikely(subreq->error < 0)) {
		trace_netfs_failure(rreq, subreq, subreq->error, netfs_fail_read);
		if (subreq->source == NETFS_READ_FROM_CACHE) {
			netfs_stat(&netfs_n_rh_read_failed);
			__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
		} else {
			netfs_stat(&netfs_n_rh_download_failed);
			__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
		}
		trace_netfs_rreq(rreq, netfs_rreq_trace_set_pause);
		set_bit(NETFS_RREQ_PAUSE, &rreq->flags);
	}

	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);

	clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
	smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */

	/* If we are at the head of the queue, wake up the collector. */
	if (list_is_first(&subreq->rreq_link, &stream->subrequests))
		netfs_wake_read_collector(rreq);

	netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
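
/*
 * Illustrative usage sketch (not part of this file; the helper name and the
 * "result" parameter are hypothetical): when a server RPC completes, a
 * filesystem would record the outcome on the subrequest and hand it back to
 * the collector, e.g.:
 *
 *	static void my_fs_read_done(struct netfs_io_subrequest *subreq,
 *				    ssize_t result)
 *	{
 *		if (result > 0) {
 *			subreq->transferred += result;
 *			__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
 *		} else {
 *			subreq->error = result;
 *		}
 *		netfs_read_subreq_terminated(subreq);
 *	}
 */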

/*
 * Handle termination of a read from the cache.
 */
void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async)
{
	struct netfs_io_subrequest *subreq = priv;

	if (transferred_or_error > 0) {
		subreq->error = 0;
		subreq->transferred += transferred_or_error;
		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
	} else {
		subreq->error = transferred_or_error;
	}
	netfs_read_subreq_terminated(subreq);
}
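
/*
 * Illustrative sketch (an assumption about how the cache ops are driven, not
 * code from this file): a cache read is typically issued with this function
 * as the termination handler and the subrequest as its private pointer,
 * along the lines of:
 *
 *	cres->ops->read(cres, subreq->start, &subreq->io_iter,
 *			NETFS_READ_HOLE_IGNORE,
 *			netfs_cache_read_terminated, subreq);
 */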

/*
 * Wait for the read operation to complete, successfully or otherwise.
 */
ssize_t netfs_wait_for_read(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct netfs_io_stream *stream = &rreq->io_streams[0];
	DEFINE_WAIT(myself);
	ssize_t ret;

	for (;;) {
		trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
		prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);

		subreq = list_first_entry_or_null(&stream->subrequests,
						  struct netfs_io_subrequest, rreq_link);
		if (subreq &&
		    (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
		     test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
			__set_current_state(TASK_RUNNING);
			netfs_read_collection(rreq);
			continue;
		}

		if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
			break;

		schedule();
		trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
	}

	finish_wait(&rreq->waitq, &myself);

	ret = rreq->error;
	if (ret == 0) {
		ret = rreq->transferred;
		switch (rreq->origin) {
		case NETFS_DIO_READ:
		case NETFS_READ_SINGLE:
			ret = rreq->transferred;
			break;
		default:
			if (rreq->submitted < rreq->len) {
				trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
				ret = -EIO;
			}
			break;
		}
	}

	return ret;
}

/*
 * Wait for a paused read operation to unpause or complete in some manner.
 */
void netfs_wait_for_pause(struct netfs_io_request *rreq)
{
	struct netfs_io_subrequest *subreq;
	struct netfs_io_stream *stream = &rreq->io_streams[0];
	DEFINE_WAIT(myself);

	trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);

	for (;;) {
		trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
		prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);

		subreq = list_first_entry_or_null(&stream->subrequests,
						  struct netfs_io_subrequest, rreq_link);
		if (subreq &&
		    (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
		     test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
			__set_current_state(TASK_RUNNING);
			netfs_read_collection(rreq);
			continue;
		}

		if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) ||
		    !test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
			break;

		schedule();
		trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
	}

	finish_wait(&rreq->waitq, &myself);
}