// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest result collection, assessment and
 * retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Clear the unread part of an I/O request.
 */
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
        netfs_reset_iter(subreq);
        WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
        iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
        if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
                __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
}

/*
 * Flush, mark and unlock a folio that's now completely read.  If we want to
 * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
 * dirty and let writeback handle it.
 */
static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
                                    struct netfs_io_request *rreq,
                                    struct folio_queue *folioq,
                                    int slot)
{
        struct netfs_folio *finfo;
        struct folio *folio = folioq_folio(folioq, slot);

        flush_dcache_folio(folio);
        folio_mark_uptodate(folio);

        if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
                finfo = netfs_folio_info(folio);
                if (finfo) {
                        trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
                        if (finfo->netfs_group)
                                folio_change_private(folio, finfo->netfs_group);
                        else
                                folio_detach_private(folio);
                        kfree(finfo);
                }

                if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
                        if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
                                trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
                                folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
                                folio_mark_dirty(folio);
                        }
                } else {
                        trace_netfs_folio(folio, netfs_folio_trace_read_done);
                }

                folioq_clear(folioq, slot);
        } else {
                // TODO: Use of PG_private_2 is deprecated.
                if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
                        netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
                else
                        folioq_clear(folioq, slot);
        }

        if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
                if (folio->index == rreq->no_unlock_folio &&
                    test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
                        _debug("no unlock");
                } else {
                        trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
                        folio_unlock(folio);
                }
        }
}

/*
 * Unlock any folios that are now completely read.  Returns true if the
 * subrequest is removed from the list.
 */
static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
{
        struct netfs_io_subrequest *prev, *next;
        struct netfs_io_request *rreq = subreq->rreq;
        struct folio_queue *folioq = subreq->curr_folioq;
        size_t avail, prev_donated, next_donated, fsize, part, excess;
        loff_t fpos, start;
        loff_t fend;
        int slot = subreq->curr_folioq_slot;

        if (WARN(subreq->transferred > subreq->len,
                 "Subreq overread: R%x[%x] %zu > %zu",
                 rreq->debug_id, subreq->debug_index,
                 subreq->transferred, subreq->len))
                subreq->transferred = subreq->len;

next_folio:
        fsize = PAGE_SIZE << subreq->curr_folio_order;
        fpos = round_down(subreq->start + subreq->consumed, fsize);
        fend = fpos + fsize;

        if (WARN_ON_ONCE(!folioq) ||
            WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
            WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
                pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
                       rreq->debug_id, subreq->debug_index,
                       subreq->start, subreq->start + subreq->transferred - 1,
                       subreq->consumed, subreq->transferred, subreq->len,
                       slot);
                if (folioq) {
                        struct folio *folio = folioq_folio(folioq, slot);

                        pr_err("folioq: orders=%02x%02x%02x%02x\n",
                               folioq->orders[0], folioq->orders[1],
                               folioq->orders[2], folioq->orders[3]);
                        if (folio)
                                pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
                                       fpos, fend - 1, folio_pos(folio), folio_order(folio),
                                       folioq_folio_order(folioq, slot));
                }
        }

donation_changed:
        /* Try to consume the current folio if we've hit or passed the end of
         * it.  There's a possibility that this subreq doesn't start at the
         * beginning of the folio, in which case we need to donate to/from the
         * preceding subreq.
         *
         * We also need to include any potential donation back from the
         * following subreq.  (A worked example of the donation bookkeeping
         * follows this function.)
         */
        prev_donated = READ_ONCE(subreq->prev_donated);
        next_donated = READ_ONCE(subreq->next_donated);
        if (prev_donated || next_donated) {
                spin_lock_bh(&rreq->lock);
                prev_donated = subreq->prev_donated;
                next_donated = subreq->next_donated;
                subreq->start -= prev_donated;
                subreq->len += prev_donated;
                subreq->transferred += prev_donated;
                prev_donated = subreq->prev_donated = 0;
                if (subreq->transferred == subreq->len) {
                        subreq->len += next_donated;
                        subreq->transferred += next_donated;
                        next_donated = subreq->next_donated = 0;
                }
                trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
                spin_unlock_bh(&rreq->lock);
        }

        avail = subreq->transferred;
        if (avail == subreq->len)
                avail += next_donated;
        start = subreq->start;
        if (subreq->consumed == 0) {
                start -= prev_donated;
                avail += prev_donated;
        } else {
                start += subreq->consumed;
                avail -= subreq->consumed;
        }
        part = umin(avail, fsize);

        trace_netfs_progress(subreq, start, avail, part);

        if (start + avail >= fend) {
                if (fpos == start) {
                        /* Flush, unlock and mark for caching any folio we've just read. */
                        subreq->consumed = fend - subreq->start;
                        netfs_unlock_read_folio(subreq, rreq, folioq, slot);
                        folioq_mark2(folioq, slot);
                        if (subreq->consumed >= subreq->len)
                                goto remove_subreq;
                } else if (fpos < start) {
                        excess = fend - subreq->start;

                        spin_lock_bh(&rreq->lock);
                        /* If we complete first on a folio split with the
                         * preceding subreq, donate to that subreq - otherwise
                         * we get the responsibility.
                         */
                        if (subreq->prev_donated != prev_donated) {
                                spin_unlock_bh(&rreq->lock);
                                goto donation_changed;
                        }

                        if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
                                spin_unlock_bh(&rreq->lock);
                                pr_err("Can't donate prior to front\n");
                                goto bad;
                        }

                        prev = list_prev_entry(subreq, rreq_link);
                        WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
                        subreq->start += excess;
                        subreq->len -= excess;
                        subreq->transferred -= excess;
                        trace_netfs_donate(rreq, subreq, prev, excess,
                                           netfs_trace_donate_tail_to_prev);
                        trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);

                        if (subreq->consumed >= subreq->len)
                                goto remove_subreq_locked;
                        spin_unlock_bh(&rreq->lock);
                } else {
                        pr_err("fpos > start\n");
                        goto bad;
                }

                /* Advance the rolling buffer to the next folio. */
                slot++;
                if (slot >= folioq_nr_slots(folioq)) {
                        slot = 0;
                        folioq = folioq->next;
                        subreq->curr_folioq = folioq;
                }
                subreq->curr_folioq_slot = slot;
                if (folioq && folioq_folio(folioq, slot))
                        subreq->curr_folio_order = folioq->orders[slot];
                if (!was_async)
                        cond_resched();
                goto next_folio;
        }

        /* Deal with partial progress. */
        if (subreq->transferred < subreq->len)
                return false;

        /* Donate the remaining downloaded data to one of the neighbouring
         * subrequests.  Note that we may race with them doing the same thing.
         */
        spin_lock_bh(&rreq->lock);

        if (subreq->prev_donated != prev_donated ||
            subreq->next_donated != next_donated) {
                spin_unlock_bh(&rreq->lock);
                cond_resched();
                goto donation_changed;
        }

        /* Deal with the trickiest case: that this subreq is in the middle of a
         * folio, not touching either edge, but finishes first.  In such a
         * case, we donate to the previous subreq, if there is one and if it is
         * contiguous, so that the donation is only handled when that completes
         * - and remove this subreq from the list.
         *
         * If the previous subreq finished first, we will have acquired their
         * donation and should be able to unlock folios and/or donate nextwards.
         */
        prev = list_prev_entry(subreq, rreq_link);
        if (!subreq->consumed &&
            !prev_donated &&
            !list_is_first(&subreq->rreq_link, &rreq->subrequests) &&
            subreq->start == prev->start + prev->len) {
                WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
                subreq->start += subreq->len;
                subreq->len = 0;
                subreq->transferred = 0;
                trace_netfs_donate(rreq, subreq, prev, subreq->len,
                                   netfs_trace_donate_to_prev);
                trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
                goto remove_subreq_locked;
        }

        /* If we can't donate down the chain, donate up the chain instead. */
        excess = subreq->len - subreq->consumed + next_donated;

        if (!subreq->consumed)
                excess += prev_donated;

        if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
                rreq->prev_donated = excess;
                trace_netfs_donate(rreq, subreq, NULL, excess,
                                   netfs_trace_donate_to_deferred_next);
        } else {
                next = list_next_entry(subreq, rreq_link);
                WRITE_ONCE(next->prev_donated, excess);
                trace_netfs_donate(rreq, subreq, next, excess,
                                   netfs_trace_donate_to_next);
        }
        trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
        subreq->len = subreq->consumed;
        subreq->transferred = subreq->consumed;
        goto remove_subreq_locked;

remove_subreq:
        spin_lock_bh(&rreq->lock);
remove_subreq_locked:
        subreq->consumed = subreq->len;
        list_del(&subreq->rreq_link);
        spin_unlock_bh(&rreq->lock);
        netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
        return true;

bad:
        /* Errr... prev and next both donated to us, but insufficient to finish
         * the folio.
         */
        printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
               rreq->debug_id, subreq->debug_index,
               subreq->start, subreq->start + subreq->transferred - 1,
               subreq->consumed, subreq->transferred, subreq->len);
        printk("folio: %llx-%llx\n", fpos, fend - 1);
        printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
        printk("s=%llx av=%zx part=%zx\n", start, avail, part);
        BUG();
}

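/*
 * Worked example of the donation scheme above (editorial illustration, not
 * part of the original source): with 16KiB folios (fsize = 0x4000), suppose
 * folio F spans file offsets 0x0-0x3fff and is covered by subreq A
 * (0x0-0x2fff) and subreq B (0x3000-0x3fff).  If B completes first, it cannot
 * unlock F because it starts mid-folio (fpos < start), so it donates its
 * 0x1000 bytes backwards by adding to A->next_donated and is removed; when A
 * later completes, it absorbs that donation, sees start + avail >= fend and
 * unlocks F.  If instead A completes first, it stops mid-folio with nothing
 * consumed, so it donates everything it read forwards via B->prev_donated and
 * B does the unlocking once it finishes.
 */
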
/*
 * Do page flushing and suchlike after DIO.
 */
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
        struct netfs_io_subrequest *subreq;
        unsigned int i;

        /* Collect unbuffered reads and direct reads, adding up the transfer
         * sizes until we find the first short or failed subrequest.
         */
        list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
                rreq->transferred += subreq->transferred;

                if (subreq->transferred < subreq->len ||
                    test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
                        rreq->error = subreq->error;
                        break;
                }
        }

        if (rreq->origin == NETFS_DIO_READ) {
                for (i = 0; i < rreq->direct_bv_count; i++) {
                        flush_dcache_page(rreq->direct_bv[i].bv_page);
                        // TODO: cifs marks pages in the destination buffer
                        // dirty under some circumstances after a read.  Do we
                        // need to do that too?
                        set_page_dirty(rreq->direct_bv[i].bv_page);
                }
        }

        if (rreq->iocb) {
                rreq->iocb->ki_pos += rreq->transferred;
                if (rreq->iocb->ki_complete)
                        rreq->iocb->ki_complete(
                                rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
        }
        if (rreq->netfs_ops->done)
                rreq->netfs_ops->done(rreq);
        if (rreq->origin == NETFS_DIO_READ)
                inode_dio_end(rreq->inode);
}

/*
 * Assess the state of a read request and decide what to do next.
 *
 * Note that we're in normal kernel thread context at this point, possibly
 * running on a workqueue.
 */
static void netfs_rreq_assess(struct netfs_io_request *rreq)
{
        trace_netfs_rreq(rreq, netfs_rreq_trace_assess);

        //netfs_rreq_is_still_valid(rreq);

        if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
                netfs_retry_reads(rreq);
                return;
        }

        if (rreq->origin == NETFS_DIO_READ ||
            rreq->origin == NETFS_READ_GAPS)
                netfs_rreq_assess_dio(rreq);
        task_io_account_read(rreq->transferred);

        trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
        clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);

        trace_netfs_rreq(rreq, netfs_rreq_trace_done);
        netfs_clear_subrequests(rreq, false);
        netfs_unlock_abandoned_read_pages(rreq);
        if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
                netfs_pgpriv2_write_to_the_cache(rreq);
}

void netfs_read_termination_worker(struct work_struct *work)
{
        struct netfs_io_request *rreq =
                container_of(work, struct netfs_io_request, work);

        netfs_see_request(rreq, netfs_rreq_trace_see_work);
        netfs_rreq_assess(rreq);
        netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
}

/*
 * Handle the completion of all outstanding I/O operations on a read request.
 * We inherit a ref from the caller.
 */
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
{
        if (!was_async)
                return netfs_rreq_assess(rreq);
        if (!work_pending(&rreq->work)) {
                netfs_get_request(rreq, netfs_rreq_trace_get_work);
                if (!queue_work(system_unbound_wq, &rreq->work))
                        netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
        }
}

/**
 * netfs_read_subreq_progress - Note progress of a read operation.
 * @subreq: The read request that has terminated.
 * @was_async: True if we're in an asynchronous context.
 *
 * This tells the read side of netfs lib that a contributory I/O operation has
 * made some progress and that it may be possible to unlock some folios.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
                                bool was_async)
{
        struct netfs_io_request *rreq = subreq->rreq;

        trace_netfs_sreq(subreq, netfs_sreq_trace_progress);

        if (subreq->transferred > subreq->consumed &&
            (rreq->origin == NETFS_READAHEAD ||
             rreq->origin == NETFS_READPAGE ||
             rreq->origin == NETFS_READ_FOR_WRITE)) {
                netfs_consume_read_data(subreq, was_async);
                __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
        }
}
EXPORT_SYMBOL(netfs_read_subreq_progress);

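/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * a filesystem that receives its read data in chunks might report interim
 * progress roughly like this, so that folios that are already completely
 * read can be unlocked before the whole subrequest finishes.  The
 * netfs_example_* name is hypothetical.
 */
static void __maybe_unused netfs_example_rx_chunk(struct netfs_io_subrequest *subreq,
                                                  size_t bytes_copied,
                                                  bool was_async)
{
        /* Account the data copied into the output buffer first, as the API
         * above requires, then let the core unlock whatever it can.
         */
        subreq->transferred += bytes_copied;
        netfs_read_subreq_progress(subreq, was_async);
}
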
/**
 * netfs_read_subreq_terminated - Note the termination of an I/O operation.
 * @subreq: The I/O request that has terminated.
 * @error: Error code indicating type of completion.
 * @was_async: The termination was asynchronous
 *
 * This tells the read helper that a contributory I/O operation has terminated,
 * one way or another, and that it should integrate the results.
 *
 * The caller indicates the outcome of the operation through @error, supplying
 * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
 * is set) or a negative error code.  The helper will look after reissuing I/O
 * operations as appropriate and writing downloaded data to the cache.
 *
 * Before calling, the filesystem should update subreq->transferred to track
 * the amount of data copied into the output buffer.
 *
 * If @was_async is true, the caller might be running in softirq or interrupt
 * context and we can't sleep.
 */
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
                                  int error, bool was_async)
{
        struct netfs_io_request *rreq = subreq->rreq;

        switch (subreq->source) {
        case NETFS_READ_FROM_CACHE:
                netfs_stat(&netfs_n_rh_read_done);
                break;
        case NETFS_DOWNLOAD_FROM_SERVER:
                netfs_stat(&netfs_n_rh_download_done);
                break;
        default:
                break;
        }

        if (rreq->origin != NETFS_DIO_READ) {
                /* Collect buffered reads.
                 *
                 * If the read completed validly short, then we can clear the
                 * tail before going on to unlock the folios.
                 */
                if (error == 0 && subreq->transferred < subreq->len &&
                    (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
                     test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
                        netfs_clear_unread(subreq);
                        subreq->transferred = subreq->len;
                        trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
                }
                if (subreq->transferred > subreq->consumed &&
                    (rreq->origin == NETFS_READAHEAD ||
                     rreq->origin == NETFS_READPAGE ||
                     rreq->origin == NETFS_READ_FOR_WRITE)) {
                        netfs_consume_read_data(subreq, was_async);
                        __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
                }
                rreq->transferred += subreq->transferred;
        }

        /* Deal with retry requests, short reads and errors.  If we retry
         * but don't make progress, we abandon the attempt.
         */
        if (!error && subreq->transferred < subreq->len) {
                if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
                        trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
                } else {
                        trace_netfs_sreq(subreq, netfs_sreq_trace_short);
                        if (subreq->transferred > subreq->consumed) {
                                /* If we didn't read new data, abandon retry. */
                                if (subreq->retry_count &&
                                    test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
                                        __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
                                        set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
                                }
                        } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
                                __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
                                set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
                        } else {
                                __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
                                error = -ENODATA;
                        }
                }
        }

        subreq->error = error;
        trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);

        if (unlikely(error < 0)) {
                trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
                if (subreq->source == NETFS_READ_FROM_CACHE) {
                        netfs_stat(&netfs_n_rh_read_failed);
                } else {
                        netfs_stat(&netfs_n_rh_download_failed);
                        set_bit(NETFS_RREQ_FAILED, &rreq->flags);
                        rreq->error = subreq->error;
                }
        }

        if (atomic_dec_and_test(&rreq->nr_outstanding))
                netfs_rreq_terminated(rreq, was_async);

        netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
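
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a filesystem's asynchronous read completion path might map its own
 * result convention onto netfs_read_subreq_terminated().  The
 * netfs_example_* name and the signed-result convention are assumptions.
 */
static void __maybe_unused netfs_example_read_done(struct netfs_io_subrequest *subreq,
                                                   ssize_t result,
                                                   bool was_async)
{
        if (result > 0) {
                /* Record what actually landed in the output buffer before
                 * handing the subrequest back for collection.
                 */
                subreq->transferred += result;
        } else if (result == 0) {
                /* In this hypothetical convention, a zero-byte read means we
                 * ran into the end of file.
                 */
                __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
        }

        /* Pass 0 for a successful or retryable transfer, or a negative errno. */
        netfs_read_subreq_terminated(subreq, result < 0 ? (int)result : 0, was_async);
}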