fs/xfs/xfs_exchrange.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <[email protected]>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_shared.h"
   8 #include "xfs_format.h"
   9 #include "xfs_log_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_defer.h"
  13 #include "xfs_inode.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_quota.h"
  16 #include "xfs_bmap_util.h"
  17 #include "xfs_reflink.h"
  18 #include "xfs_trace.h"
  19 #include "xfs_exchrange.h"
  20 #include "xfs_exchmaps.h"
  21 #include "xfs_sb.h"
  22 #include "xfs_icache.h"
  23 #include "xfs_log.h"
  24 #include "xfs_rtbitmap.h"
  25 #include <linux/fsnotify.h>
  26
  27 /* Lock (and optionally join) two inodes for a file range exchange. */
  28 void
  29 xfs_exchrange_ilock(
  30         struct xfs_trans        *tp,
  31         struct xfs_inode        *ip1,
  32         struct xfs_inode        *ip2)
  33 {
  34         if (ip1 != ip2)
  35                 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
  36                                     ip2, XFS_ILOCK_EXCL);
  37         else
  38                 xfs_ilock(ip1, XFS_ILOCK_EXCL);
  39         if (tp) {
  40                 xfs_trans_ijoin(tp, ip1, 0);
  41                 if (ip2 != ip1)
  42                         xfs_trans_ijoin(tp, ip2, 0);
  43         }
  44
  45 }
  46
  47 /* Unlock two inodes after a file range exchange operation. */
  48 void
  49 xfs_exchrange_iunlock(
  50         struct xfs_inode        *ip1,
  51         struct xfs_inode        *ip2)
  52 {
  53         if (ip2 != ip1)
  54                 xfs_iunlock(ip2, XFS_ILOCK_EXCL);
  55         xfs_iunlock(ip1, XFS_ILOCK_EXCL);
  56 }
  57
  58 /*
  59  * Estimate the resource requirements to exchange file contents between the two
  60  * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
  61  * have flushed both inodes' pagecache and active direct-ios.
  62  */
  63 int
  64 xfs_exchrange_estimate(
  65         struct xfs_exchmaps_req *req)
  66 {
  67         int                     error;
  68
  69         xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
  70         error = xfs_exchmaps_estimate(req);
  71         xfs_exchrange_iunlock(req->ip1, req->ip2);
  72         return error;
  73 }
  74
  75 #define QRETRY_IP1      (0x1)
  76 #define QRETRY_IP2      (0x2)
  77
  78 /*
  79  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
  80  * this if quota enforcement is disabled or if both inodes' dquots are the
  81  * same.  The qretry structure must be initialized to zeroes before the first
  82  * call to this function.
  83  */
  84 STATIC int
  85 xfs_exchrange_reserve_quota(
  86         struct xfs_trans                *tp,
  87         const struct xfs_exchmaps_req   *req,
  88         unsigned int                    *qretry)
  89 {
  90         int64_t                         ddelta, rdelta;
  91         int                             ip1_error = 0;
  92         int                             error;
  93
  94         /*
  95          * Don't bother with a quota reservation if we're not enforcing them
  96          * or the two inodes have the same dquots.
  97          */
  98         if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
  99             (req->ip1->i_udquot == req->ip2->i_udquot &&
 100              req->ip1->i_gdquot == req->ip2->i_gdquot &&
 101              req->ip1->i_pdquot == req->ip2->i_pdquot))
 102                 return 0;
 103
 104         *qretry = 0;
 105
 106         /*
 107          * For each file, compute the net gain in the number of regular blocks
 108          * that will be mapped into that file and reserve that much quota.  The
 109          * quota counts must be able to absorb at least that much space.
 110          */
 111         ddelta = req->ip2_bcount - req->ip1_bcount;
 112         rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
 113         if (ddelta > 0 || rdelta > 0) {
 114                 error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
 115                                 ddelta > 0 ? ddelta : 0,
 116                                 rdelta > 0 ? rdelta : 0,
 117                                 false);
 118                 if (error == -EDQUOT || error == -ENOSPC) {
 119                         /*
 120                          * Save this error and see what happens if we try to
 121                          * reserve quota for ip2.  Then report both.
 122                          */
 123                         *qretry |= QRETRY_IP1;
 124                         ip1_error = error;
 125                         error = 0;
 126                 }
 127                 if (error)
 128                         return error;
 129         }
 130         if (ddelta < 0 || rdelta < 0) {
 131                 error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
 132                                 ddelta < 0 ? -ddelta : 0,
 133                                 rdelta < 0 ? -rdelta : 0,
 134                                 false);
 135                 if (error == -EDQUOT || error == -ENOSPC)
 136                         *qretry |= QRETRY_IP2;
 137                 if (error)
 138                         return error;
 139         }
 140         if (ip1_error)
 141                 return ip1_error;
 142
 143         /*
 144          * For each file, forcibly reserve the gross gain in mapped blocks so
 145          * that we don't trip over any quota block reservation assertions.
 146          * We must reserve the gross gain because the quota code subtracts from
 147          * bcount the number of blocks that we unmap; it does not add that
 148          * quantity back to the quota block reservation.
 149          */
 150         error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
 151                         req->ip1_rtbcount, true);
 152         if (error)
 153                 return error;
 154
 155         return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
 156                         req->ip2_rtbcount, true);
 157 }
 158
 159 /* Exchange the mappings (and hence the contents) of two files' forks. */
 160 STATIC int
 161 xfs_exchrange_mappings(
 162         const struct xfs_exchrange      *fxr,
 163         struct xfs_inode                *ip1,
 164         struct xfs_inode                *ip2)
 165 {
 166         struct xfs_mount                *mp = ip1->i_mount;
 167         struct xfs_exchmaps_req         req = {
 168                 .ip1                    = ip1,
 169                 .ip2                    = ip2,
 170                 .startoff1              = XFS_B_TO_FSBT(mp, fxr->file1_offset),
 171                 .startoff2              = XFS_B_TO_FSBT(mp, fxr->file2_offset),
 172                 .blockcount             = XFS_B_TO_FSB(mp, fxr->length),
 173         };
 174         struct xfs_trans                *tp;
 175         unsigned int                    qretry;
 176         bool                            retried = false;
 177         int                             error;
 178
 179         trace_xfs_exchrange_mappings(fxr, ip1, ip2);
 180
 181         if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
 182                 req.flags |= XFS_EXCHMAPS_SET_SIZES;
 183         if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
 184                 req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
 185
 186         /*
 187          * Round the request length up to the nearest file allocation unit.
 188          * The prep function already checked that the request offsets and
 189          * length in @fxr are safe to round up.
 190          */
 191         if (xfs_inode_has_bigrtalloc(ip2))
 192                 req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
 193
 194         error = xfs_exchrange_estimate(&req);
 195         if (error)
 196                 return error;
 197
 198 retry:
 199         /* Allocate the transaction, lock the inodes, and join them. */
 200         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
 201                         XFS_TRANS_RES_FDBLKS, &tp);
 202         if (error)
 203                 return error;
 204
 205         xfs_exchrange_ilock(tp, ip1, ip2);
 206
 207         trace_xfs_exchrange_before(ip2, 2);
 208         trace_xfs_exchrange_before(ip1, 1);
 209
 210         error = xfs_exchmaps_check_forks(mp, &req);
 211         if (error)
 212                 goto out_trans_cancel;
 213
 214         /*
 215          * Reserve ourselves some quota if any of them are in enforcing mode.
 216          * In theory we only need enough to satisfy the change in the number
 217          * of blocks between the two ranges being remapped.
 218          */
 219         error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
 220         if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
 221                 xfs_trans_cancel(tp);
 222                 xfs_exchrange_iunlock(ip1, ip2);
 223                 if (qretry & QRETRY_IP1)
 224                         xfs_blockgc_free_quota(ip1, 0);
 225                 if (qretry & QRETRY_IP2)
 226                         xfs_blockgc_free_quota(ip2, 0);
 227                 retried = true;
 228                 goto retry;
 229         }
 230         if (error)
 231                 goto out_trans_cancel;
 232
 233         /* If we got this far on a dry run, all parameters are ok. */
 234         if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
 235                 goto out_trans_cancel;
 236
 237         /* Update the mtime and ctime of both files. */
 238         if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
 239                 xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 240         if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
 241                 xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 242
 243         xfs_exchange_mappings(tp, &req);
 244
 245         /*
 246          * Force the log to persist metadata updates if the caller or the
 247          * administrator requires this.  The generic prep function already
 248          * flushed the relevant parts of the page cache.
 249          */
 250         if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
 251                 xfs_trans_set_sync(tp);
 252
 253         error = xfs_trans_commit(tp);
 254
 255         trace_xfs_exchrange_after(ip2, 2);
 256         trace_xfs_exchrange_after(ip1, 1);
 257
 258         if (error)
 259                 goto out_unlock;
 260
 261         /*
 262          * If the caller wanted us to exchange the contents of two complete
 263          * files of unequal length, exchange the incore sizes now.  This should
 264          * be safe because we flushed both files' page caches, exchanged all
 265          * the mappings, and updated the ondisk sizes.
 266          */
 267         if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
 268                 loff_t  temp;
 269
 270                 temp = i_size_read(VFS_I(ip2));
 271                 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
 272                 i_size_write(VFS_I(ip1), temp);
 273         }
 274
 275 out_unlock:
 276         xfs_exchrange_iunlock(ip1, ip2);
 277         return error;
 278
 279 out_trans_cancel:
 280         xfs_trans_cancel(tp);
 281         goto out_unlock;
 282 }
 283
 284 /*
 285  * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
 286  * This part deals with struct file objects and byte ranges and does not deal
 287  * with XFS-specific data structures such as xfs_inodes and block ranges.  This
 288  * separation may some day facilitate porting to another filesystem.
 289  *
 290  * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
 291  * file1 with the same number of bytes starting at fxr.file2_offset in file2.
 292  * Implementations must call xfs_exchange_range_prep to prepare the two
 293  * files prior to taking locks; and they must update the inode change and mod
 294  * times of both files as part of the metadata update.  The timestamp update
 295  * and freshness checks must be done atomically as part of the data exchange
 296  * operation to ensure correctness of the freshness check.
 297  * xfs_exchange_range_finish must be called after the operation completes
 298  * successfully but before locks are dropped.
 299  */
 300
 301 /* Verify that we have security clearance to perform this operation. */
 302 static int
 303 xfs_exchange_range_verify_area(
 304         struct xfs_exchrange    *fxr)
 305 {
 306         int                     ret;
 307
 308         ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
 309                         true);
 310         if (ret)
 311                 return ret;
 312
 313         return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
 314                         true);
 315 }
 316
 317 /*
 318  * Performs necessary checks before doing a range exchange, having stabilized
 319  * mutable inode attributes via i_rwsem.
 320  */
 321 static inline int
 322 xfs_exchange_range_checks(
 323         struct xfs_exchrange    *fxr,
 324         unsigned int            alloc_unit)
 325 {
 326         struct inode            *inode1 = file_inode(fxr->file1);
 327         struct inode            *inode2 = file_inode(fxr->file2);
 328         uint64_t                allocmask = alloc_unit - 1;
 329         int64_t                 test_len;
 330         uint64_t                blen;
 331         loff_t                  size1, size2, tmp;
 332         int                     error;
 333
 334         /* Don't touch certain kinds of inodes */
 335         if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
 336                 return -EPERM;
 337         if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
 338                 return -ETXTBSY;
 339
 340         size1 = i_size_read(inode1);
 341         size2 = i_size_read(inode2);
 342
 343         /* Ranges cannot start after EOF. */
 344         if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
 345                 return -EINVAL;
 346
 347         /*
 348          * If the caller said to exchange to EOF, we set the length of the
 349          * request large enough to cover everything to the end of both files.
 350          */
 351         if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
 352                 fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
 353                                              size2 - fxr->file2_offset);
 354
 355                 error = xfs_exchange_range_verify_area(fxr);
 356                 if (error)
 357                         return error;
 358         }
 359
 360         /*
 361          * The start of both ranges must be aligned to the file allocation
 362          * unit.
 363          */
 364         if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
 365             !IS_ALIGNED(fxr->file2_offset, alloc_unit))
 366                 return -EINVAL;
 367
 368         /* Ensure offsets don't wrap. */
 369         if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
 370             check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
 371                 return -EINVAL;
 372
 373         /*
 374          * We require both ranges to end within EOF, unless we're exchanging
 375          * to EOF.
 376          */
 377         if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
 378             (fxr->file1_offset + fxr->length > size1 ||
 379              fxr->file2_offset + fxr->length > size2))
 380                 return -EINVAL;
 381
 382         /*
 383          * Make sure we don't hit any file size limits.  If we hit any size
 384          * limits such that test_length was adjusted, we abort the whole
 385          * operation.
 386          */
 387         test_len = fxr->length;
 388         error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
 389                         &test_len);
 390         if (error)
 391                 return error;
 392         error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
 393                         &test_len);
 394         if (error)
 395                 return error;
 396         if (test_len != fxr->length)
 397                 return -EINVAL;
 398
 399         /*
 400          * If the user wanted us to exchange up to the infile's EOF, round up
 401          * to the next allocation unit boundary for this check.  Do the same
 402          * for the outfile.
 403          *
 404          * Otherwise, reject the range length if it's not aligned to an
 405          * allocation unit.
 406          */
 407         if (fxr->file1_offset + fxr->length == size1)
 408                 blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
 409         else if (fxr->file2_offset + fxr->length == size2)
 410                 blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
 411         else if (!IS_ALIGNED(fxr->length, alloc_unit))
 412                 return -EINVAL;
 413         else
 414                 blen = fxr->length;
 415
 416         /* Don't allow overlapped exchanges within the same file. */
 417         if (inode1 == inode2 &&
 418             fxr->file2_offset + blen > fxr->file1_offset &&
 419             fxr->file1_offset + blen > fxr->file2_offset)
 420                 return -EINVAL;
 421
 422         /*
 423          * Ensure that we don't exchange a partial EOF block into the middle of
 424          * another file.
 425          */
 426         if ((fxr->length & allocmask) == 0)
 427                 return 0;
 428
 429         blen = fxr->length;
 430         if (fxr->file2_offset + blen < size2)
 431                 blen &= ~allocmask;
 432
 433         if (fxr->file1_offset + blen < size1)
 434                 blen &= ~allocmask;
 435
 436         return blen == fxr->length ? 0 : -EINVAL;
 437 }
 438
 439 /*
 440  * Check that the two inodes are eligible for range exchanges, the ranges make
 441  * sense, and then flush all dirty data.  Caller must ensure that the inodes
 442  * have been locked against any other modifications.
 443  */
 444 static inline int
 445 xfs_exchange_range_prep(
 446         struct xfs_exchrange    *fxr,
 447         unsigned int            alloc_unit)
 448 {
 449         struct inode            *inode1 = file_inode(fxr->file1);
 450         struct inode            *inode2 = file_inode(fxr->file2);
 451         bool                    same_inode = (inode1 == inode2);
 452         int                     error;
 453
 454         /* Check that we don't violate system file offset limits. */
 455         error = xfs_exchange_range_checks(fxr, alloc_unit);
 456         if (error || fxr->length == 0)
 457                 return error;
 458
 459         /* Wait for the completion of any pending IOs on both files */
 460         inode_dio_wait(inode1);
 461         if (!same_inode)
 462                 inode_dio_wait(inode2);
 463
 464         error = filemap_write_and_wait_range(inode1->i_mapping,
 465                         fxr->file1_offset,
 466                         fxr->file1_offset + fxr->length - 1);
 467         if (error)
 468                 return error;
 469
 470         error = filemap_write_and_wait_range(inode2->i_mapping,
 471                         fxr->file2_offset,
 472                         fxr->file2_offset + fxr->length - 1);
 473         if (error)
 474                 return error;
 475
 476         /*
 477          * If the files or inodes involved require synchronous writes, amend
 478          * the request to force the filesystem to flush all data and metadata
 479          * to disk after the operation completes.
 480          */
 481         if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
 482             IS_SYNC(inode1) || IS_SYNC(inode2))
 483                 fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
 484
 485         return 0;
 486 }
 487
 488 /*
 489  * Finish a range exchange operation, if it was successful.  Caller must ensure
 490  * that the inodes are still locked against any other modifications.
 491  */
 492 static inline int
 493 xfs_exchange_range_finish(
 494         struct xfs_exchrange    *fxr)
 495 {
 496         int                     error;
 497
 498         error = file_remove_privs(fxr->file1);
 499         if (error)
 500                 return error;
 501         if (file_inode(fxr->file1) == file_inode(fxr->file2))
 502                 return 0;
 503
 504         return file_remove_privs(fxr->file2);
 505 }
 506
 507 /*
 508  * Check the alignment of an exchange request when the allocation unit size
 509  * isn't a power of two.  The generic file-level helpers use (fast)
 510  * bitmask-based alignment checks, but here we have to use slow long division.
 511  */
 512 static int
 513 xfs_exchrange_check_rtalign(
 514         const struct xfs_exchrange      *fxr,
 515         struct xfs_inode                *ip1,
 516         struct xfs_inode                *ip2,
 517         unsigned int                    alloc_unit)
 518 {
 519         uint64_t                        length = fxr->length;
 520         uint64_t                        blen;
 521         loff_t                          size1, size2;
 522
 523         size1 = i_size_read(VFS_I(ip1));
 524         size2 = i_size_read(VFS_I(ip2));
 525
 526         /* The start of both ranges must be aligned to a rt extent. */
 527         if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
 528             !isaligned_64(fxr->file2_offset, alloc_unit))
 529                 return -EINVAL;
 530
 531         if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
 532                 length = max_t(int64_t, size1 - fxr->file1_offset,
 533                                         size2 - fxr->file2_offset);
 534
 535         /*
 536          * If the user wanted us to exchange up to the infile's EOF, round up
 537          * to the next rt extent boundary for this check.  Do the same for the
 538          * outfile.
 539          *
 540          * Otherwise, reject the range length if it's not rt extent aligned.
 541          * We already confirmed the starting offsets' rt extent block
 542          * alignment.
 543          */
 544         if (fxr->file1_offset + length == size1)
 545                 blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
 546         else if (fxr->file2_offset + length == size2)
 547                 blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
 548         else if (!isaligned_64(length, alloc_unit))
 549                 return -EINVAL;
 550         else
 551                 blen = length;
 552
 553         /* Don't allow overlapped exchanges within the same file. */
 554         if (ip1 == ip2 &&
 555             fxr->file2_offset + blen > fxr->file1_offset &&
 556             fxr->file1_offset + blen > fxr->file2_offset)
 557                 return -EINVAL;
 558
 559         /*
 560          * Ensure that we don't exchange a partial EOF rt extent into the
 561          * middle of another file.
 562          */
 563         if (isaligned_64(length, alloc_unit))
 564                 return 0;
 565
 566         blen = length;
 567         if (fxr->file2_offset + length < size2)
 568                 blen = rounddown_64(blen, alloc_unit);
 569
 570         if (fxr->file1_offset + blen < size1)
 571                 blen = rounddown_64(blen, alloc_unit);
 572
 573         return blen == length ? 0 : -EINVAL;
 574 }
 575
 576 /* Prepare two files to have their data exchanged. */
 577 STATIC int
 578 xfs_exchrange_prep(
 579         struct xfs_exchrange    *fxr,
 580         struct xfs_inode        *ip1,
 581         struct xfs_inode        *ip2)
 582 {
 583         struct xfs_mount        *mp = ip2->i_mount;
 584         unsigned int            alloc_unit = xfs_inode_alloc_unitsize(ip2);
 585         int                     error;
 586
 587         trace_xfs_exchrange_prep(fxr, ip1, ip2);
 588
 589         /* Verify both files are either real-time or non-realtime */
 590         if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
 591                 return -EINVAL;
 592
 593         /* Check non-power of two alignment issues, if necessary. */
 594         if (!is_power_of_2(alloc_unit)) {
 595                 error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
 596                 if (error)
 597                         return error;
 598
 599                 /*
 600                  * Do the generic file-level checks with the regular block
 601                  * alignment.
 602                  */
 603                 alloc_unit = mp->m_sb.sb_blocksize;
 604         }
 605
 606         error = xfs_exchange_range_prep(fxr, alloc_unit);
 607         if (error || fxr->length == 0)
 608                 return error;
 609
 610         /* Attach dquots to both inodes before changing block maps. */
 611         error = xfs_qm_dqattach(ip2);
 612         if (error)
 613                 return error;
 614         error = xfs_qm_dqattach(ip1);
 615         if (error)
 616                 return error;
 617
 618         trace_xfs_exchrange_flush(fxr, ip1, ip2);
 619
 620         /* Flush the relevant ranges of both files. */
 621         error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
 622         if (error)
 623                 return error;
 624         error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
 625         if (error)
 626                 return error;
 627
 628         /*
 629          * Cancel CoW fork preallocations for the ranges of both files.  The
 630          * prep function should have flushed all the dirty data, so the only
 631          * CoW mappings remaining should be speculative.
 632          */
 633         if (xfs_inode_has_cow_data(ip1)) {
 634                 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
 635                                 fxr->length, true);
 636                 if (error)
 637                         return error;
 638         }
 639
 640         if (xfs_inode_has_cow_data(ip2)) {
 641                 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
 642                                 fxr->length, true);
 643                 if (error)
 644                         return error;
 645         }
 646
 647         return 0;
 648 }
 649
 650 /*
 651  * Exchange contents of files.  This is the binding between the generic
 652  * file-level concepts and the XFS inode-specific implementation.
 653  */
 654 STATIC int
 655 xfs_exchrange_contents(
 656         struct xfs_exchrange    *fxr)
 657 {
 658         struct inode            *inode1 = file_inode(fxr->file1);
 659         struct inode            *inode2 = file_inode(fxr->file2);
 660         struct xfs_inode        *ip1 = XFS_I(inode1);
 661         struct xfs_inode        *ip2 = XFS_I(inode2);
 662         struct xfs_mount        *mp = ip1->i_mount;
 663         int                     error;
 664
 665         if (!xfs_has_exchange_range(mp))
 666                 return -EOPNOTSUPP;
 667
 668         if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
 669                            XFS_EXCHANGE_RANGE_PRIV_FLAGS))
 670                 return -EINVAL;
 671
 672         if (xfs_is_shutdown(mp))
 673                 return -EIO;
 674
 675         /* Lock both files against IO */
 676         error = xfs_ilock2_io_mmap(ip1, ip2);
 677         if (error)
 678                 goto out_err;
 679
 680         /* Prepare and then exchange file contents. */
 681         error = xfs_exchrange_prep(fxr, ip1, ip2);
 682         if (error)
 683                 goto out_unlock;
 684
 685         error = xfs_exchrange_mappings(fxr, ip1, ip2);
 686         if (error)
 687                 goto out_unlock;
 688
 689         /*
 690          * Finish the exchange by removing special file privileges like any
 691          * other file write would do.  This may involve turning on support for
 692          * logged xattrs if either file has security capabilities.
 693          */
 694         error = xfs_exchange_range_finish(fxr);
 695         if (error)
 696                 goto out_unlock;
 697
 698 out_unlock:
 699         xfs_iunlock2_io_mmap(ip1, ip2);
 700 out_err:
 701         if (error)
 702                 trace_xfs_exchrange_error(ip2, error, _RET_IP_);
 703         return error;
 704 }
 705
 706 /* Exchange parts of two files. */
 707 static int
 708 xfs_exchange_range(
 709         struct xfs_exchrange    *fxr)
 710 {
 711         struct inode            *inode1 = file_inode(fxr->file1);
 712         struct inode            *inode2 = file_inode(fxr->file2);
 713         int                     ret;
 714
 715         BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
 716                      XFS_EXCHANGE_RANGE_PRIV_FLAGS);
 717
 718         /* Both files must be on the same mount/filesystem. */
 719         if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
 720                 return -EXDEV;
 721
 722         if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
 723                 return -EINVAL;
 724
 725         /* Userspace requests only honored for regular files. */
 726         if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
 727                 return -EISDIR;
 728         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 729                 return -EINVAL;
 730
 731         /* Both files must be opened for read and write. */
 732         if (!(fxr->file1->f_mode & FMODE_READ) ||
 733             !(fxr->file1->f_mode & FMODE_WRITE) ||
 734             !(fxr->file2->f_mode & FMODE_READ) ||
 735             !(fxr->file2->f_mode & FMODE_WRITE))
 736                 return -EBADF;
 737
 738         /* Neither file can be opened append-only. */
 739         if ((fxr->file1->f_flags & O_APPEND) ||
 740             (fxr->file2->f_flags & O_APPEND))
 741                 return -EBADF;
 742
 743         /*
 744          * If we're not exchanging to EOF, we can check the areas before
 745          * stabilizing both files' i_size.
 746          */
 747         if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
 748                 ret = xfs_exchange_range_verify_area(fxr);
 749                 if (ret)
 750                         return ret;
 751         }
 752
 753         /* Update cmtime if the fd/inode don't forbid it. */
 754         if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
 755                 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
 756         if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
 757                 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
 758
 759         file_start_write(fxr->file2);
 760         ret = xfs_exchrange_contents(fxr);
 761         file_end_write(fxr->file2);
 762         if (ret)
 763                 return ret;
 764
 765         fsnotify_modify(fxr->file1);
 766         if (fxr->file2 != fxr->file1)
 767                 fsnotify_modify(fxr->file2);
 768         return 0;
 769 }
 770
 771 /* Collect exchange-range arguments from userspace. */
 772 long
 773 xfs_ioc_exchange_range(
 774         struct file                     *file,
 775         struct xfs_exchange_range __user *argp)
 776 {
 777         struct xfs_exchrange            fxr = {
 778                 .file2                  = file,
 779         };
 780         struct xfs_exchange_range       args;
 781         struct fd                       file1;
 782         int                             error;
 783
 784         if (copy_from_user(&args, argp, sizeof(args)))
 785                 return -EFAULT;
 786         if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
 787                 return -EINVAL;
 788         if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
 789                 return -EINVAL;
 790
 791         fxr.file1_offset        = args.file1_offset;
 792         fxr.file2_offset        = args.file2_offset;
 793         fxr.length              = args.length;
 794         fxr.flags               = args.flags;
 795
 796         file1 = fdget(args.file1_fd);
 797         if (!file1.file)
 798                 return -EBADF;
 799         fxr.file1 = file1.file;
 800
 801         error = xfs_exchange_range(&fxr);
 802         fdput(file1);
 803         return error;
 804 }