// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/page_io.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95,
 *  Asynchronous swapping added 30.12.95. Stephen Tweedie
 *  Removed race in async swapping. 14.4.1996. Bruno Haible
 *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
 *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
 */
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
#include "swap.h"
static void end_swap_bio_write(struct bio *bio)
{
	struct page *page = bio_first_page_all(bio);

	if (bio->bi_status) {
		SetPageError(page);
		/*
		 * We failed to write the page out to swap-space.
		 * Re-dirty the page in order to avoid it being reclaimed.
		 * Also print a dire warning that things will go BAD (tm)
		 * very quickly.
		 *
		 * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
		 */
		set_page_dirty(page);
		pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
				     (unsigned long long)bio->bi_iter.bi_sector);
		ClearPageReclaim(page);
	}
	end_page_writeback(page);
	bio_put(bio);
}
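/*
 * Note: as a bi_end_io handler this runs from bio completion context
 * (typically interrupt/softirq), so it sticks to non-sleeping page-flag
 * updates and a ratelimited printk.
 */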
static void end_swap_bio_read(struct bio *bio)
{
	struct page *page = bio_first_page_all(bio);
	struct task_struct *waiter = bio->bi_private;

	if (bio->bi_status) {
		SetPageError(page);
		ClearPageUptodate(page);
		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
				     (unsigned long long)bio->bi_iter.bi_sector);
	} else {
		SetPageUptodate(page);
	}

	unlock_page(page);
	WRITE_ONCE(bio->bi_private, NULL);
	bio_put(bio);
	if (waiter) {
		blk_wake_io_task(waiter);
		put_task_struct(waiter);
	}
}
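/*
 * The WRITE_ONCE(bio->bi_private, NULL) above pairs with the READ_ONCE()
 * polling loop in swap_readpage(): a synchronous reader sleeps until the
 * completion handler clears bi_private, and is only woken (and its task
 * reference dropped) after that store is visible.
 */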
int generic_swapfile_activate(struct swap_info_struct *sis,
				struct file *swap_file,
				sector_t *span)
{
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map all the blocks into the extent tree. This code doesn't try
	 * to be very smart.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		cond_resched();

		first_block = probe_block;
		ret = bmap(inode, &first_block);
		if (ret || !first_block)
			goto bad_bmap;

		/*
		 * It must be PAGE_SIZE aligned on-disk
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = probe_block + block_in_page;
			ret = bmap(inode, &block);
			if (ret || !block)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* exclude the header page */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/*
		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
		 */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* force Empty message */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	pr_err("swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}
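/*
 * Worked example: with a 1KiB-block filesystem and 4KiB pages,
 * blocks_per_page is 4, so a swapfile page is usable only if bmap()
 * reports four contiguous on-disk blocks starting on a 4-block
 * boundary; any hole or misalignment simply advances probe_block.
 */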
/*
 * We may have stale swap cache pages in memory: notice
 * them here and get rid of the unnecessary final write.
 */
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);
	int ret;

	if (folio_free_swap(folio)) {
		folio_unlock(folio);
		ret = 0;
		goto out;
	}
	/*
	 * Arch code may have to preserve more data than just the page
	 * contents, e.g. memory tags.
	 */
	ret = arch_prepare_to_swap(&folio->page);
	if (ret) {
		folio_mark_dirty(folio);
		folio_unlock(folio);
		goto out;
	}
	if (frontswap_store(&folio->page) == 0) {
		folio_start_writeback(folio);
		folio_unlock(folio);
		folio_end_writeback(folio);
		ret = 0;
		goto out;
	}
	ret = __swap_writepage(&folio->page, wbc);
out:
	return ret;
}
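/*
 * Three early exits above avoid block I/O entirely: the swap entry may
 * already hold a good copy (folio_free_swap()), the architecture may
 * refuse to swap the page (arch_prepare_to_swap() failing), or
 * frontswap may have accepted the page into an in-kernel backend such
 * as zswap, in which case writeback is started and immediately ended.
 */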
static inline void count_swpout_vm_event(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (unlikely(PageTransHuge(page)))
		count_vm_event(THP_SWPOUT);
#endif
	count_vm_events(PSWPOUT, thp_nr_pages(page));
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *memcg;

	memcg = page_memcg(page);
	if (!memcg)
		return;

	rcu_read_lock();
	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
	bio_associate_blkg_from_css(bio, css);
	rcu_read_unlock();
}
#else
#define bio_associate_blkg_from_page(bio, page)	do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
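/*
 * Associating the bio with the effective io controller of the page's
 * memcg means swap-out I/O is throttled against the cgroup that owns
 * the page, not whichever task happens to be running reclaim.
 */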
struct swap_iocb {
	struct kiocb		iocb;
	struct bio_vec		bvec[SWAP_CLUSTER_MAX];
	int			pages;
	int			len;
};
static mempool_t *sio_pool;

int sio_pool_init(void)
{
	if (!sio_pool) {
		mempool_t *pool = mempool_create_kmalloc_pool(
			SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
		if (cmpxchg(&sio_pool, NULL, pool))
			mempool_destroy(pool);
	}
	if (!sio_pool)
		return -ENOMEM;
	return 0;
}
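/*
 * Lockless one-time init: racing callers may each allocate a pool, but
 * only the cmpxchg() winner installs it; losers destroy their copy.
 * sio_pool still being NULL afterwards means allocation itself failed.
 */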
static void sio_write_complete(struct kiocb *iocb, long ret)
{
	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
	struct page *page = sio->bvec[0].bv_page;
	int p;

	if (ret != sio->len) {
		/*
		 * In the case of swap-over-nfs, this can be a
		 * temporary failure if the system has limited
		 * memory for allocating transmit buffers.
		 * Mark the page dirty and avoid
		 * folio_rotate_reclaimable(); rate-limit the
		 * messages, but do not flag PageError as in the
		 * normal direct-to-bio case, since the failure
		 * could be temporary.
		 */
		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
				   ret, page_file_offset(page));
		for (p = 0; p < sio->pages; p++) {
			page = sio->bvec[p].bv_page;
			set_page_dirty(page);
			ClearPageReclaim(page);
		}
	} else {
		for (p = 0; p < sio->pages; p++)
			count_swpout_vm_event(sio->bvec[p].bv_page);
	}

	for (p = 0; p < sio->pages; p++)
		end_page_writeback(sio->bvec[p].bv_page);

	mempool_free(sio, sio_pool);
}
static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
{
	struct swap_iocb *sio = NULL;
	struct swap_info_struct *sis = page_swap_info(page);
	struct file *swap_file = sis->swap_file;
	loff_t pos = page_file_offset(page);

	set_page_writeback(page);
	unlock_page(page);
	if (wbc->swap_plug)
		sio = *wbc->swap_plug;
	if (sio) {
		if (sio->iocb.ki_filp != swap_file ||
		    sio->iocb.ki_pos + sio->len != pos) {
			swap_write_unplug(sio);
			sio = NULL;
		}
	}
	if (!sio) {
		sio = mempool_alloc(sio_pool, GFP_NOIO);
		init_sync_kiocb(&sio->iocb, swap_file);
		sio->iocb.ki_complete = sio_write_complete;
		sio->iocb.ki_pos = pos;
		sio->pages = 0;
		sio->len = 0;
	}
	sio->bvec[sio->pages].bv_page = page;
	sio->bvec[sio->pages].bv_len = thp_size(page);
	sio->bvec[sio->pages].bv_offset = 0;
	sio->len += thp_size(page);
	sio->pages += 1;
	if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
		swap_write_unplug(sio);
		sio = NULL;
	}
	if (wbc->swap_plug)
		*wbc->swap_plug = sio;

	return 0;
}
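/*
 * "Plugging" batches file-backed swap-out: as long as successive pages
 * target the same swapfile at consecutive offsets (ki_pos + len == pos),
 * they are appended to one swap_iocb and submitted as a single
 * ->swap_rw() call once the bvec array fills or the caller unplugs.
 */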
int __swap_writepage(struct page *page, struct writeback_control *wbc)
{
	struct bio *bio;
	int ret;
	struct swap_info_struct *sis = page_swap_info(page);

	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	/*
	 * ->flags can be updated non-atomically (scan_swap_map_slots),
	 * but that will never affect SWP_FS_OPS, so the data_race
	 * is safe.
	 */
	if (data_race(sis->flags & SWP_FS_OPS))
		return swap_writepage_fs(page, wbc);

	ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
	if (!ret) {
		count_swpout_vm_event(page);
		return 0;
	}

	bio = bio_alloc(sis->bdev, 1,
			REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
			GFP_NOIO);
	bio->bi_iter.bi_sector = swap_page_sector(page);
	bio->bi_end_io = end_swap_bio_write;
	bio_add_page(bio, page, thp_size(page), 0);

	bio_associate_blkg_from_page(bio, page);
	count_swpout_vm_event(page);
	set_page_writeback(page);
	unlock_page(page);
	submit_bio(bio);

	return 0;
}
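/*
 * bdev_write_page() above is a fast path that avoids bio allocation on
 * block drivers implementing ->rw_page (e.g. zram, brd, pmem); the
 * bio-based path is the fallback for everything else.
 */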
void swap_write_unplug(struct swap_iocb *sio)
{
	struct iov_iter from;
	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	int ret;

	iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	if (ret != -EIOCBQUEUED)
		sio_write_complete(&sio->iocb, ret);
}
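/*
 * A ->swap_rw() return of -EIOCBQUEUED means the I/O was queued and
 * ki_complete will fire asynchronously; any other value (success or
 * error) means it completed here, so invoke the completion by hand.
 */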
static void sio_read_complete(struct kiocb *iocb, long ret)
{
	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
	int p;

	if (ret == sio->len) {
		for (p = 0; p < sio->pages; p++) {
			struct page *page = sio->bvec[p].bv_page;

			SetPageUptodate(page);
			unlock_page(page);
		}
		count_vm_events(PSWPIN, sio->pages);
	} else {
		for (p = 0; p < sio->pages; p++) {
			struct page *page = sio->bvec[p].bv_page;

			SetPageError(page);
			ClearPageUptodate(page);
			unlock_page(page);
		}
		pr_alert_ratelimited("Read-error on swap-device\n");
	}
	mempool_free(sio, sio_pool);
}
static void swap_readpage_fs(struct page *page,
			     struct swap_iocb **plug)
{
	struct swap_info_struct *sis = page_swap_info(page);
	struct swap_iocb *sio = NULL;
	loff_t pos = page_file_offset(page);

	if (plug)
		sio = *plug;
	if (sio) {
		if (sio->iocb.ki_filp != sis->swap_file ||
		    sio->iocb.ki_pos + sio->len != pos) {
			swap_read_unplug(sio);
			sio = NULL;
		}
	}
	if (!sio) {
		sio = mempool_alloc(sio_pool, GFP_KERNEL);
		init_sync_kiocb(&sio->iocb, sis->swap_file);
		sio->iocb.ki_pos = pos;
		sio->iocb.ki_complete = sio_read_complete;
		sio->pages = 0;
		sio->len = 0;
	}
	sio->bvec[sio->pages].bv_page = page;
	sio->bvec[sio->pages].bv_len = thp_size(page);
	sio->bvec[sio->pages].bv_offset = 0;
	sio->len += thp_size(page);
	sio->pages += 1;
	if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
		swap_read_unplug(sio);
		sio = NULL;
	}
	if (plug)
		*plug = sio;
}
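/*
 * The read side mirrors swap_writepage_fs(): swap readahead batches
 * consecutive swapfile pages into one swap_iocb via *plug, issued by a
 * later swap_read_unplug(). GFP_KERNEL is usable here because swap-in
 * runs from the fault path, not from memory reclaim.
 */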
int swap_readpage(struct page *page, bool synchronous,
		  struct swap_iocb **plug)
{
	struct bio *bio;
	int ret = 0;
	struct swap_info_struct *sis = page_swap_info(page);
	bool workingset = PageWorkingset(page);
	unsigned long pflags;
	bool in_thrashing;

	VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageUptodate(page), page);

	/*
	 * Count submission time as memory stall and delay. When the device
	 * is congested, or the submitting cgroup IO-throttled, submission
	 * can be a significant part of overall IO time.
	 */
	if (workingset) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
	}
	delayacct_swapin_start();

	if (frontswap_load(page) == 0) {
		SetPageUptodate(page);
		unlock_page(page);
		goto out;
	}

	if (data_race(sis->flags & SWP_FS_OPS)) {
		swap_readpage_fs(page, plug);
		goto out;
	}

	if (sis->flags & SWP_SYNCHRONOUS_IO) {
		ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
		if (!ret) {
			count_vm_event(PSWPIN);
			goto out;
		}
	}

	ret = 0;
	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = swap_page_sector(page);
	bio->bi_end_io = end_swap_bio_read;
	bio_add_page(bio, page, thp_size(page), 0);
	/*
	 * Keep this task valid during swap readpage because the oom killer may
	 * attempt to access it in the page fault retry time check.
	 */
	if (synchronous) {
		get_task_struct(current);
		bio->bi_private = current;
	}
	count_vm_event(PSWPIN);
	bio_get(bio);
	submit_bio(bio);
	while (synchronous) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!READ_ONCE(bio->bi_private))
			break;

		blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);
	bio_put(bio);

out:
	if (workingset) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
	delayacct_swapin_end();
	return ret;
}
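/*
 * The synchronous wait polls bio->bi_private instead of a page-flag
 * waitqueue: the task parks in TASK_UNINTERRUPTIBLE and calls
 * blk_io_schedule() until end_swap_bio_read() clears bi_private and
 * wakes it via blk_wake_io_task(). The bio_get()/bio_put() pair keeps
 * the bio alive for that final READ_ONCE() check.
 */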
void __swap_read_unplug(struct swap_iocb *sio)
{
	struct iov_iter from;
	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	int ret;

	iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	if (ret != -EIOCBQUEUED)
		sio_read_complete(&sio->iocb, ret);
}