fs/xfs/xfs_aops.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   4  * Copyright (c) 2016-2018 Christoph Hellwig.
   5  * All Rights Reserved.
   6  */
   7 #include "xfs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_mount.h"
  13 #include "xfs_inode.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_inode_item.h"
  16 #include "xfs_alloc.h"
  17 #include "xfs_error.h"
  18 #include "xfs_iomap.h"
  19 #include "xfs_trace.h"
  20 #include "xfs_bmap.h"
  21 #include "xfs_bmap_util.h"
  22 #include "xfs_bmap_btree.h"
  23 #include "xfs_reflink.h"
  24 #include <linux/writeback.h>
  25
  26 /*
  27  * structure owned by writepages passed to individual writepage calls
  28  */
  29 struct xfs_writepage_ctx {
  30         struct xfs_bmbt_irec    imap;
  31         unsigned int            io_type;
  32         struct xfs_ioend        *ioend;
  33 };
  34
  35 struct block_device *
  36 xfs_find_bdev_for_inode(
  37         struct inode            *inode)
  38 {
  39         struct xfs_inode        *ip = XFS_I(inode);
  40         struct xfs_mount        *mp = ip->i_mount;
  41
  42         if (XFS_IS_REALTIME_INODE(ip))
  43                 return mp->m_rtdev_targp->bt_bdev;
  44         else
  45                 return mp->m_ddev_targp->bt_bdev;
  46 }
  47
  48 struct dax_device *
  49 xfs_find_daxdev_for_inode(
  50         struct inode            *inode)
  51 {
  52         struct xfs_inode        *ip = XFS_I(inode);
  53         struct xfs_mount        *mp = ip->i_mount;
  54
  55         if (XFS_IS_REALTIME_INODE(ip))
  56                 return mp->m_rtdev_targp->bt_daxdev;
  57         else
  58                 return mp->m_ddev_targp->bt_daxdev;
  59 }
  60
  61 static void
  62 xfs_finish_page_writeback(
  63         struct inode            *inode,
  64         struct bio_vec          *bvec,
  65         int                     error)
  66 {
  67         struct iomap_page       *iop = to_iomap_page(bvec->bv_page);
  68
  69         if (error) {
  70                 SetPageError(bvec->bv_page);
  71                 mapping_set_error(inode->i_mapping, -EIO);
  72         }
  73
  74         ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
  75         ASSERT(!iop || atomic_read(&iop->write_count) > 0);
  76
  77         if (!iop || atomic_dec_and_test(&iop->write_count))
  78                 end_page_writeback(bvec->bv_page);
  79 }
  80
  81 /*
  82  * We're now finished for good with this ioend structure.  Update the page
  83  * state, release holds on bios, and finally free up memory.  Do not use the
  84  * ioend after this.
  85  */
  86 STATIC void
  87 xfs_destroy_ioend(
  88         struct xfs_ioend        *ioend,
  89         int                     error)
  90 {
  91         struct inode            *inode = ioend->io_inode;
  92         struct bio              *bio = &ioend->io_inline_bio;
  93         struct bio              *last = ioend->io_bio, *next;
  94         u64                     start = bio->bi_iter.bi_sector;
  95         bool                    quiet = bio_flagged(bio, BIO_QUIET);
  96
  97         for (bio = &ioend->io_inline_bio; bio; bio = next) {
  98                 struct bio_vec  *bvec;
  99                 int             i;
 100
 101                 /*
 102                  * For the last bio, bi_private points to the ioend, so we
 103                  * need to explicitly end the iteration here.
 104                  */
 105                 if (bio == last)
 106                         next = NULL;
 107                 else
 108                         next = bio->bi_private;
 109
 110                 /* walk each page on bio, ending page IO on them */
 111                 bio_for_each_segment_all(bvec, bio, i)
 112                         xfs_finish_page_writeback(inode, bvec, error);
 113                 bio_put(bio);
 114         }
 115
 116         if (unlikely(error && !quiet)) {
 117                 xfs_err_ratelimited(XFS_I(inode)->i_mount,
 118                         "writeback error on sector %llu", start);
 119         }
 120 }
 121
 122 /*
 123  * Fast and loose check if this write could update the on-disk inode size.
 124  */
 125 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 126 {
 127         return ioend->io_offset + ioend->io_size >
 128                 XFS_I(ioend->io_inode)->i_d.di_size;
 129 }
 130
 131 STATIC int
 132 xfs_setfilesize_trans_alloc(
 133         struct xfs_ioend        *ioend)
 134 {
 135         struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 136         struct xfs_trans        *tp;
 137         int                     error;
 138
 139         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
 140                                 XFS_TRANS_NOFS, &tp);
 141         if (error)
 142                 return error;
 143
 144         ioend->io_append_trans = tp;
 145
 146         /*
 147          * We may pass freeze protection with a transaction.  So tell lockdep
 148          * we released it.
 149          */
 150         __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
 151         /*
 152          * We hand off the transaction to the completion thread now, so
 153          * clear the flag here.
 154          */
 155         current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 156         return 0;
 157 }
 158
 159 /*
 160  * Update on-disk file size now that data has been written to disk.
 161  */
 162 STATIC int
 163 __xfs_setfilesize(
 164         struct xfs_inode        *ip,
 165         struct xfs_trans        *tp,
 166         xfs_off_t               offset,
 167         size_t                  size)
 168 {
 169         xfs_fsize_t             isize;
 170
 171         xfs_ilock(ip, XFS_ILOCK_EXCL);
 172         isize = xfs_new_eof(ip, offset + size);
 173         if (!isize) {
 174                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 175                 xfs_trans_cancel(tp);
 176                 return 0;
 177         }
 178
 179         trace_xfs_setfilesize(ip, offset, size);
 180
 181         ip->i_d.di_size = isize;
 182         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 183         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 184
 185         return xfs_trans_commit(tp);
 186 }
 187
 188 int
 189 xfs_setfilesize(
 190         struct xfs_inode        *ip,
 191         xfs_off_t               offset,
 192         size_t                  size)
 193 {
 194         struct xfs_mount        *mp = ip->i_mount;
 195         struct xfs_trans        *tp;
 196         int                     error;
 197
 198         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 199         if (error)
 200                 return error;
 201
 202         return __xfs_setfilesize(ip, tp, offset, size);
 203 }
 204
 205 STATIC int
 206 xfs_setfilesize_ioend(
 207         struct xfs_ioend        *ioend,
 208         int                     error)
 209 {
 210         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 211         struct xfs_trans        *tp = ioend->io_append_trans;
 212
 213         /*
 214          * The transaction may have been allocated in the I/O submission thread,
 215          * thus we need to mark ourselves as being in a transaction manually.
 216          * Similarly for freeze protection.
 217          */
 218         current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 219         __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 220
 221         /* we abort the update if there was an IO error */
 222         if (error) {
 223                 xfs_trans_cancel(tp);
 224                 return error;
 225         }
 226
 227         return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 228 }
 229
 230 /*
 231  * IO write completion.
 232  */
 233 STATIC void
 234 xfs_end_io(
 235         struct work_struct *work)
 236 {
 237         struct xfs_ioend        *ioend =
 238                 container_of(work, struct xfs_ioend, io_work);
 239         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 240         xfs_off_t               offset = ioend->io_offset;
 241         size_t                  size = ioend->io_size;
 242         int                     error;
 243
 244         /*
 245          * Just clean up the in-memory strutures if the fs has been shut down.
 246          */
 247         if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 248                 error = -EIO;
 249                 goto done;
 250         }
 251
 252         /*
 253          * Clean up any COW blocks on an I/O error.
 254          */
 255         error = blk_status_to_errno(ioend->io_bio->bi_status);
 256         if (unlikely(error)) {
 257                 switch (ioend->io_type) {
 258                 case XFS_IO_COW:
 259                         xfs_reflink_cancel_cow_range(ip, offset, size, true);
 260                         break;
 261                 }
 262
 263                 goto done;
 264         }
 265
 266         /*
 267          * Success:  commit the COW or unwritten blocks if needed.
 268          */
 269         switch (ioend->io_type) {
 270         case XFS_IO_COW:
 271                 error = xfs_reflink_end_cow(ip, offset, size);
 272                 break;
 273         case XFS_IO_UNWRITTEN:
 274                 /* writeback should never update isize */
 275                 error = xfs_iomap_write_unwritten(ip, offset, size, false);
 276                 break;
 277         default:
 278                 ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
 279                 break;
 280         }
 281
 282 done:
 283         if (ioend->io_append_trans)
 284                 error = xfs_setfilesize_ioend(ioend, error);
 285         xfs_destroy_ioend(ioend, error);
 286 }
 287
 288 STATIC void
 289 xfs_end_bio(
 290         struct bio              *bio)
 291 {
 292         struct xfs_ioend        *ioend = bio->bi_private;
 293         struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 294
 295         if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
 296                 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 297         else if (ioend->io_append_trans)
 298                 queue_work(mp->m_data_workqueue, &ioend->io_work);
 299         else
 300                 xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 301 }
 302
 303 STATIC int
 304 xfs_map_blocks(
 305         struct xfs_writepage_ctx *wpc,
 306         struct inode            *inode,
 307         loff_t                  offset)
 308 {
 309         struct xfs_inode        *ip = XFS_I(inode);
 310         struct xfs_mount        *mp = ip->i_mount;
 311         ssize_t                 count = i_blocksize(inode);
 312         xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
 313         struct xfs_bmbt_irec    imap;
 314         int                     whichfork = XFS_DATA_FORK;
 315         struct xfs_iext_cursor  icur;
 316         bool                    imap_valid;
 317         int                     error = 0;
 318
 319         /*
 320          * We have to make sure the cached mapping is within EOF to protect
 321          * against eofblocks trimming on file release leaving us with a stale
 322          * mapping. Otherwise, a page for a subsequent file extending buffered
 323          * write could get picked up by this writeback cycle and written to the
 324          * wrong blocks.
 325          *
 326          * Note that what we really want here is a generic mapping invalidation
 327          * mechanism to protect us from arbitrary extent modifying contexts, not
 328          * just eofblocks.
 329          */
 330         xfs_trim_extent_eof(&wpc->imap, ip);
 331
 332         /*
 333          * COW fork blocks can overlap data fork blocks even if the blocks
 334          * aren't shared.  COW I/O always takes precedent, so we must always
 335          * check for overlap on reflink inodes unless the mapping is already a
 336          * COW one.
 337          */
 338         imap_valid = offset_fsb >= wpc->imap.br_startoff &&
 339                      offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
 340         if (imap_valid &&
 341             (!xfs_inode_has_cow_data(ip) || wpc->io_type == XFS_IO_COW))
 342                 return 0;
 343
 344         if (XFS_FORCED_SHUTDOWN(mp))
 345                 return -EIO;
 346
 347         /*
 348          * If we don't have a valid map, now it's time to get a new one for this
 349          * offset.  This will convert delayed allocations (including COW ones)
 350          * into real extents.  If we return without a valid map, it means we
 351          * landed in a hole and we skip the block.
 352          */
 353         xfs_ilock(ip, XFS_ILOCK_SHARED);
 354         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 355                (ip->i_df.if_flags & XFS_IFEXTENTS));
 356         ASSERT(offset <= mp->m_super->s_maxbytes);
 357
 358         if (offset > mp->m_super->s_maxbytes - count)
 359                 count = mp->m_super->s_maxbytes - offset;
 360         end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 361
 362         /*
 363          * Check if this is offset is covered by a COW extents, and if yes use
 364          * it directly instead of looking up anything in the data fork.
 365          */
 366         if (xfs_inode_has_cow_data(ip) &&
 367             xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) &&
 368             imap.br_startoff <= offset_fsb) {
 369                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 370                 /*
 371                  * Truncate can race with writeback since writeback doesn't
 372                  * take the iolock and truncate decreases the file size before
 373                  * it starts truncating the pages between new_size and old_size.
 374                  * Therefore, we can end up in the situation where writeback
 375                  * gets a CoW fork mapping but the truncate makes the mapping
 376                  * invalid and we end up in here trying to get a new mapping.
 377                  * bail out here so that we simply never get a valid mapping
 378                  * and so we drop the write altogether.  The page truncation
 379                  * will kill the contents anyway.
 380                  */
 381                 if (offset > i_size_read(inode)) {
 382                         wpc->io_type = XFS_IO_HOLE;
 383                         return 0;
 384                 }
 385                 whichfork = XFS_COW_FORK;
 386                 wpc->io_type = XFS_IO_COW;
 387                 goto allocate_blocks;
 388         }
 389
 390         /*
 391          * Map valid and no COW extent in the way?  We're done.
 392          */
 393         if (imap_valid) {
 394                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 395                 return 0;
 396         }
 397
 398         /*
 399          * If we don't have a valid map, now it's time to get a new one for this
 400          * offset.  This will convert delayed allocations (including COW ones)
 401          * into real extents.
 402          */
 403         if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
 404                 imap.br_startoff = end_fsb;     /* fake a hole past EOF */
 405         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 406
 407         if (imap.br_startoff > offset_fsb) {
 408                 /* landed in a hole or beyond EOF */
 409                 imap.br_blockcount = imap.br_startoff - offset_fsb;
 410                 imap.br_startoff = offset_fsb;
 411                 imap.br_startblock = HOLESTARTBLOCK;
 412                 wpc->io_type = XFS_IO_HOLE;
 413         } else {
 414                 if (isnullstartblock(imap.br_startblock)) {
 415                         /* got a delalloc extent */
 416                         wpc->io_type = XFS_IO_DELALLOC;
 417                         goto allocate_blocks;
 418                 }
 419
 420                 if (imap.br_state == XFS_EXT_UNWRITTEN)
 421                         wpc->io_type = XFS_IO_UNWRITTEN;
 422                 else
 423                         wpc->io_type = XFS_IO_OVERWRITE;
 424         }
 425
 426         wpc->imap = imap;
 427         trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
 428         return 0;
 429 allocate_blocks:
 430         error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap);
 431         if (error)
 432                 return error;
 433         wpc->imap = imap;
 434         trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
 435         return 0;
 436 }
 437
 438 /*
 439  * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 440  * it, and we submit that bio. The ioend may be used for multiple bio
 441  * submissions, so we only want to allocate an append transaction for the ioend
 442  * once. In the case of multiple bio submission, each bio will take an IO
 443  * reference to the ioend to ensure that the ioend completion is only done once
 444  * all bios have been submitted and the ioend is really done.
 445  *
 446  * If @fail is non-zero, it means that we have a situation where some part of
 447  * the submission process has failed after we have marked paged for writeback
 448  * and unlocked them. In this situation, we need to fail the bio and ioend
 449  * rather than submit it to IO. This typically only happens on a filesystem
 450  * shutdown.
 451  */
 452 STATIC int
 453 xfs_submit_ioend(
 454         struct writeback_control *wbc,
 455         struct xfs_ioend        *ioend,
 456         int                     status)
 457 {
 458         /* Convert CoW extents to regular */
 459         if (!status && ioend->io_type == XFS_IO_COW) {
 460                 /*
 461                  * Yuk. This can do memory allocation, but is not a
 462                  * transactional operation so everything is done in GFP_KERNEL
 463                  * context. That can deadlock, because we hold pages in
 464                  * writeback state and GFP_KERNEL allocations can block on them.
 465                  * Hence we must operate in nofs conditions here.
 466                  */
 467                 unsigned nofs_flag;
 468
 469                 nofs_flag = memalloc_nofs_save();
 470                 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
 471                                 ioend->io_offset, ioend->io_size);
 472                 memalloc_nofs_restore(nofs_flag);
 473         }
 474
 475         /* Reserve log space if we might write beyond the on-disk inode size. */
 476         if (!status &&
 477             ioend->io_type != XFS_IO_UNWRITTEN &&
 478             xfs_ioend_is_append(ioend) &&
 479             !ioend->io_append_trans)
 480                 status = xfs_setfilesize_trans_alloc(ioend);
 481
 482         ioend->io_bio->bi_private = ioend;
 483         ioend->io_bio->bi_end_io = xfs_end_bio;
 484         ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 485
 486         /*
 487          * If we are failing the IO now, just mark the ioend with an
 488          * error and finish it. This will run IO completion immediately
 489          * as there is only one reference to the ioend at this point in
 490          * time.
 491          */
 492         if (status) {
 493                 ioend->io_bio->bi_status = errno_to_blk_status(status);
 494                 bio_endio(ioend->io_bio);
 495                 return status;
 496         }
 497
 498         ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
 499         submit_bio(ioend->io_bio);
 500         return 0;
 501 }
 502
 503 static struct xfs_ioend *
 504 xfs_alloc_ioend(
 505         struct inode            *inode,
 506         unsigned int            type,
 507         xfs_off_t               offset,
 508         struct block_device     *bdev,
 509         sector_t                sector)
 510 {
 511         struct xfs_ioend        *ioend;
 512         struct bio              *bio;
 513
 514         bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
 515         bio_set_dev(bio, bdev);
 516         bio->bi_iter.bi_sector = sector;
 517
 518         ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 519         INIT_LIST_HEAD(&ioend->io_list);
 520         ioend->io_type = type;
 521         ioend->io_inode = inode;
 522         ioend->io_size = 0;
 523         ioend->io_offset = offset;
 524         INIT_WORK(&ioend->io_work, xfs_end_io);
 525         ioend->io_append_trans = NULL;
 526         ioend->io_bio = bio;
 527         return ioend;
 528 }
 529
 530 /*
 531  * Allocate a new bio, and chain the old bio to the new one.
 532  *
 533  * Note that we have to do perform the chaining in this unintuitive order
 534  * so that the bi_private linkage is set up in the right direction for the
 535  * traversal in xfs_destroy_ioend().
 536  */
 537 static void
 538 xfs_chain_bio(
 539         struct xfs_ioend        *ioend,
 540         struct writeback_control *wbc,
 541         struct block_device     *bdev,
 542         sector_t                sector)
 543 {
 544         struct bio *new;
 545
 546         new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
 547         bio_set_dev(new, bdev);
 548         new->bi_iter.bi_sector = sector;
 549         bio_chain(ioend->io_bio, new);
 550         bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
 551         ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 552         ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
 553         submit_bio(ioend->io_bio);
 554         ioend->io_bio = new;
 555 }
 556
 557 /*
 558  * Test to see if we have an existing ioend structure that we could append to
 559  * first, otherwise finish off the current ioend and start another.
 560  */
 561 STATIC void
 562 xfs_add_to_ioend(
 563         struct inode            *inode,
 564         xfs_off_t               offset,
 565         struct page             *page,
 566         struct iomap_page       *iop,
 567         struct xfs_writepage_ctx *wpc,
 568         struct writeback_control *wbc,
 569         struct list_head        *iolist)
 570 {
 571         struct xfs_inode        *ip = XFS_I(inode);
 572         struct xfs_mount        *mp = ip->i_mount;
 573         struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
 574         unsigned                len = i_blocksize(inode);
 575         unsigned                poff = offset & (PAGE_SIZE - 1);
 576         sector_t                sector;
 577
 578         sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
 579                 ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
 580
 581         if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 582             sector != bio_end_sector(wpc->ioend->io_bio) ||
 583             offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 584                 if (wpc->ioend)
 585                         list_add(&wpc->ioend->io_list, iolist);
 586                 wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
 587                                 bdev, sector);
 588         }
 589
 590         if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
 591                 if (iop)
 592                         atomic_inc(&iop->write_count);
 593                 if (bio_full(wpc->ioend->io_bio))
 594                         xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
 595                 __bio_add_page(wpc->ioend->io_bio, page, len, poff);
 596         }
 597
 598         wpc->ioend->io_size += len;
 599 }
 600
 601 STATIC void
 602 xfs_vm_invalidatepage(
 603         struct page             *page,
 604         unsigned int            offset,
 605         unsigned int            length)
 606 {
 607         trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
 608         iomap_invalidatepage(page, offset, length);
 609 }
 610
 611 /*
 612  * If the page has delalloc blocks on it, we need to punch them out before we
 613  * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 614  * inode that can trip up a later direct I/O read operation on the same region.
 615  *
 616  * We prevent this by truncating away the delalloc regions on the page.  Because
 617  * they are delalloc, we can do this without needing a transaction. Indeed - if
 618  * we get ENOSPC errors, we have to be able to do this truncation without a
 619  * transaction as there is no space left for block reservation (typically why we
 620  * see a ENOSPC in writeback).
 621  */
 622 STATIC void
 623 xfs_aops_discard_page(
 624         struct page             *page)
 625 {
 626         struct inode            *inode = page->mapping->host;
 627         struct xfs_inode        *ip = XFS_I(inode);
 628         struct xfs_mount        *mp = ip->i_mount;
 629         loff_t                  offset = page_offset(page);
 630         xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, offset);
 631         int                     error;
 632
 633         if (XFS_FORCED_SHUTDOWN(mp))
 634                 goto out_invalidate;
 635
 636         xfs_alert(mp,
 637                 "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
 638                         page, ip->i_ino, offset);
 639
 640         error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
 641                         PAGE_SIZE / i_blocksize(inode));
 642         if (error && !XFS_FORCED_SHUTDOWN(mp))
 643                 xfs_alert(mp, "page discard unable to remove delalloc mapping.");
 644 out_invalidate:
 645         xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
 646 }
 647
 648 /*
 649  * We implement an immediate ioend submission policy here to avoid needing to
 650  * chain multiple ioends and hence nest mempool allocations which can violate
 651  * forward progress guarantees we need to provide. The current ioend we are
 652  * adding blocks to is cached on the writepage context, and if the new block
 653  * does not append to the cached ioend it will create a new ioend and cache that
 654  * instead.
 655  *
 656  * If a new ioend is created and cached, the old ioend is returned and queued
 657  * locally for submission once the entire page is processed or an error has been
 658  * detected.  While ioends are submitted immediately after they are completed,
 659  * batching optimisations are provided by higher level block plugging.
 660  *
 661  * At the end of a writeback pass, there will be a cached ioend remaining on the
 662  * writepage context that the caller will need to submit.
 663  */
 664 static int
 665 xfs_writepage_map(
 666         struct xfs_writepage_ctx *wpc,
 667         struct writeback_control *wbc,
 668         struct inode            *inode,
 669         struct page             *page,
 670         uint64_t                end_offset)
 671 {
 672         LIST_HEAD(submit_list);
 673         struct iomap_page       *iop = to_iomap_page(page);
 674         unsigned                len = i_blocksize(inode);
 675         struct xfs_ioend        *ioend, *next;
 676         uint64_t                file_offset;    /* file offset of page */
 677         int                     error = 0, count = 0, i;
 678
 679         ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
 680         ASSERT(!iop || atomic_read(&iop->write_count) == 0);
 681
 682         /*
 683          * Walk through the page to find areas to write back. If we run off the
 684          * end of the current map or find the current map invalid, grab a new
 685          * one.
 686          */
 687         for (i = 0, file_offset = page_offset(page);
 688              i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
 689              i++, file_offset += len) {
 690                 if (iop && !test_bit(i, iop->uptodate))
 691                         continue;
 692
 693                 error = xfs_map_blocks(wpc, inode, file_offset);
 694                 if (error)
 695                         break;
 696                 if (wpc->io_type == XFS_IO_HOLE)
 697                         continue;
 698                 xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
 699                                  &submit_list);
 700                 count++;
 701         }
 702
 703         ASSERT(wpc->ioend || list_empty(&submit_list));
 704         ASSERT(PageLocked(page));
 705         ASSERT(!PageWriteback(page));
 706
 707         /*
 708          * On error, we have to fail the ioend here because we may have set
 709          * pages under writeback, we have to make sure we run IO completion to
 710          * mark the error state of the IO appropriately, so we can't cancel the
 711          * ioend directly here.  That means we have to mark this page as under
 712          * writeback if we included any blocks from it in the ioend chain so
 713          * that completion treats it correctly.
 714          *
 715          * If we didn't include the page in the ioend, the on error we can
 716          * simply discard and unlock it as there are no other users of the page
 717          * now.  The caller will still need to trigger submission of outstanding
 718          * ioends on the writepage context so they are treated correctly on
 719          * error.
 720          */
 721         if (unlikely(error)) {
 722                 if (!count) {
 723                         xfs_aops_discard_page(page);
 724                         ClearPageUptodate(page);
 725                         unlock_page(page);
 726                         goto done;
 727                 }
 728
 729                 /*
 730                  * If the page was not fully cleaned, we need to ensure that the
 731                  * higher layers come back to it correctly.  That means we need
 732                  * to keep the page dirty, and for WB_SYNC_ALL writeback we need
 733                  * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
 734                  * so another attempt to write this page in this writeback sweep
 735                  * will be made.
 736                  */
 737                 set_page_writeback_keepwrite(page);
 738         } else {
 739                 clear_page_dirty_for_io(page);
 740                 set_page_writeback(page);
 741         }
 742
 743         unlock_page(page);
 744
 745         /*
 746          * Preserve the original error if there was one, otherwise catch
 747          * submission errors here and propagate into subsequent ioend
 748          * submissions.
 749          */
 750         list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
 751                 int error2;
 752
 753                 list_del_init(&ioend->io_list);
 754                 error2 = xfs_submit_ioend(wbc, ioend, error);
 755                 if (error2 && !error)
 756                         error = error2;
 757         }
 758
 759         /*
 760          * We can end up here with no error and nothing to write only if we race
 761          * with a partial page truncate on a sub-page block sized filesystem.
 762          */
 763         if (!count)
 764                 end_page_writeback(page);
 765 done:
 766         mapping_set_error(page->mapping, error);
 767         return error;
 768 }
 769
 770 /*
 771  * Write out a dirty page.
 772  *
 773  * For delalloc space on the page we need to allocate space and flush it.
 774  * For unwritten space on the page we need to start the conversion to
 775  * regular allocated space.
 776  */
 777 STATIC int
 778 xfs_do_writepage(
 779         struct page             *page,
 780         struct writeback_control *wbc,
 781         void                    *data)
 782 {
 783         struct xfs_writepage_ctx *wpc = data;
 784         struct inode            *inode = page->mapping->host;
 785         loff_t                  offset;
 786         uint64_t              end_offset;
 787         pgoff_t                 end_index;
 788
 789         trace_xfs_writepage(inode, page, 0, 0);
 790
 791         /*
 792          * Refuse to write the page out if we are called from reclaim context.
 793          *
 794          * This avoids stack overflows when called from deeply used stacks in
 795          * random callers for direct reclaim or memcg reclaim.  We explicitly
 796          * allow reclaim from kswapd as the stack usage there is relatively low.
 797          *
 798          * This should never happen except in the case of a VM regression so
 799          * warn about it.
 800          */
 801         if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
 802                         PF_MEMALLOC))
 803                 goto redirty;
 804
 805         /*
 806          * Given that we do not allow direct reclaim to call us, we should
 807          * never be called while in a filesystem transaction.
 808          */
 809         if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
 810                 goto redirty;
 811
 812         /*
 813          * Is this page beyond the end of the file?
 814          *
 815          * The page index is less than the end_index, adjust the end_offset
 816          * to the highest offset that this page should represent.
 817          * -----------------------------------------------------
 818          * |                    file mapping           | <EOF> |
 819          * -----------------------------------------------------
 820          * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
 821          * ^--------------------------------^----------|--------
 822          * |     desired writeback range    |      see else    |
 823          * ---------------------------------^------------------|
 824          */
 825         offset = i_size_read(inode);
 826         end_index = offset >> PAGE_SHIFT;
 827         if (page->index < end_index)
 828                 end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
 829         else {
 830                 /*
 831                  * Check whether the page to write out is beyond or straddles
 832                  * i_size or not.
 833                  * -------------------------------------------------------
 834                  * |            file mapping                    | <EOF>  |
 835                  * -------------------------------------------------------
 836                  * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
 837                  * ^--------------------------------^-----------|---------
 838                  * |                                |      Straddles     |
 839                  * ---------------------------------^-----------|--------|
 840                  */
 841                 unsigned offset_into_page = offset & (PAGE_SIZE - 1);
 842
 843                 /*
 844                  * Skip the page if it is fully outside i_size, e.g. due to a
 845                  * truncate operation that is in progress. We must redirty the
 846                  * page so that reclaim stops reclaiming it. Otherwise
 847                  * xfs_vm_releasepage() is called on it and gets confused.
 848                  *
 849                  * Note that the end_index is unsigned long, it would overflow
 850                  * if the given offset is greater than 16TB on 32-bit system
 851                  * and if we do check the page is fully outside i_size or not
 852                  * via "if (page->index >= end_index + 1)" as "end_index + 1"
 853                  * will be evaluated to 0.  Hence this page will be redirtied
 854                  * and be written out repeatedly which would result in an
 855                  * infinite loop, the user program that perform this operation
 856                  * will hang.  Instead, we can verify this situation by checking
 857                  * if the page to write is totally beyond the i_size or if it's
 858                  * offset is just equal to the EOF.
 859                  */
 860                 if (page->index > end_index ||
 861                     (page->index == end_index && offset_into_page == 0))
 862                         goto redirty;
 863
 864                 /*
 865                  * The page straddles i_size.  It must be zeroed out on each
 866                  * and every writepage invocation because it may be mmapped.
 867                  * "A file is mapped in multiples of the page size.  For a file
 868                  * that is not a multiple of the page size, the remaining
 869                  * memory is zeroed when mapped, and writes to that region are
 870                  * not written out to the file."
 871                  */
 872                 zero_user_segment(page, offset_into_page, PAGE_SIZE);
 873
 874                 /* Adjust the end_offset to the end of file */
 875                 end_offset = offset;
 876         }
 877
 878         return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
 879
 880 redirty:
 881         redirty_page_for_writepage(wbc, page);
 882         unlock_page(page);
 883         return 0;
 884 }
 885
 886 STATIC int
 887 xfs_vm_writepage(
 888         struct page             *page,
 889         struct writeback_control *wbc)
 890 {
 891         struct xfs_writepage_ctx wpc = {
 892                 .io_type = XFS_IO_INVALID,
 893         };
 894         int                     ret;
 895
 896         ret = xfs_do_writepage(page, wbc, &wpc);
 897         if (wpc.ioend)
 898                 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
 899         return ret;
 900 }
 901
 902 STATIC int
 903 xfs_vm_writepages(
 904         struct address_space    *mapping,
 905         struct writeback_control *wbc)
 906 {
 907         struct xfs_writepage_ctx wpc = {
 908                 .io_type = XFS_IO_INVALID,
 909         };
 910         int                     ret;
 911
 912         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 913         ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
 914         if (wpc.ioend)
 915                 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
 916         return ret;
 917 }
 918
 919 STATIC int
 920 xfs_dax_writepages(
 921         struct address_space    *mapping,
 922         struct writeback_control *wbc)
 923 {
 924         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 925         return dax_writeback_mapping_range(mapping,
 926                         xfs_find_bdev_for_inode(mapping->host), wbc);
 927 }
 928
 929 STATIC int
 930 xfs_vm_releasepage(
 931         struct page             *page,
 932         gfp_t                   gfp_mask)
 933 {
 934         trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 935         return iomap_releasepage(page, gfp_mask);
 936 }
 937
 938 STATIC sector_t
 939 xfs_vm_bmap(
 940         struct address_space    *mapping,
 941         sector_t                block)
 942 {
 943         struct xfs_inode        *ip = XFS_I(mapping->host);
 944
 945         trace_xfs_vm_bmap(ip);
 946
 947         /*
 948          * The swap code (ab-)uses ->bmap to get a block mapping and then
 949          * bypasses the file system for actual I/O.  We really can't allow
 950          * that on reflinks inodes, so we have to skip out here.  And yes,
 951          * 0 is the magic code for a bmap error.
 952          *
 953          * Since we don't pass back blockdev info, we can't return bmap
 954          * information for rt files either.
 955          */
 956         if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 957                 return 0;
 958         return iomap_bmap(mapping, block, &xfs_iomap_ops);
 959 }
 960
 961 STATIC int
 962 xfs_vm_readpage(
 963         struct file             *unused,
 964         struct page             *page)
 965 {
 966         trace_xfs_vm_readpage(page->mapping->host, 1);
 967         return iomap_readpage(page, &xfs_iomap_ops);
 968 }
 969
 970 STATIC int
 971 xfs_vm_readpages(
 972         struct file             *unused,
 973         struct address_space    *mapping,
 974         struct list_head        *pages,
 975         unsigned                nr_pages)
 976 {
 977         trace_xfs_vm_readpages(mapping->host, nr_pages);
 978         return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
 979 }
 980
 981 static int
 982 xfs_iomap_swapfile_activate(
 983         struct swap_info_struct         *sis,
 984         struct file                     *swap_file,
 985         sector_t                        *span)
 986 {
 987         sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
 988         return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
 989 }
 990
 991 const struct address_space_operations xfs_address_space_operations = {
 992         .readpage               = xfs_vm_readpage,
 993         .readpages              = xfs_vm_readpages,
 994         .writepage              = xfs_vm_writepage,
 995         .writepages             = xfs_vm_writepages,
 996         .set_page_dirty         = iomap_set_page_dirty,
 997         .releasepage            = xfs_vm_releasepage,
 998         .invalidatepage         = xfs_vm_invalidatepage,
 999         .bmap                   = xfs_vm_bmap,
1000         .direct_IO              = noop_direct_IO,
1001         .migratepage            = iomap_migrate_page,
1002         .is_partially_uptodate  = iomap_is_partially_uptodate,
1003         .error_remove_page      = generic_error_remove_page,
1004         .swap_activate          = xfs_iomap_swapfile_activate,
1005 };
1006
1007 const struct address_space_operations xfs_dax_aops = {
1008         .writepages             = xfs_dax_writepages,
1009         .direct_IO              = noop_direct_IO,
1010         .set_page_dirty         = noop_set_page_dirty,
1011         .invalidatepage         = noop_invalidatepage,
1012         .swap_activate          = xfs_iomap_swapfile_activate,
1013 };