fs/xfs/xfs_exchrange.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <[email protected]>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_shared.h"
   8 #include "xfs_format.h"
   9 #include "xfs_log_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_defer.h"
  13 #include "xfs_inode.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_quota.h"
  16 #include "xfs_bmap_util.h"
  17 #include "xfs_reflink.h"
  18 #include "xfs_trace.h"
  19 #include "xfs_exchrange.h"
  20 #include "xfs_exchmaps.h"
  21 #include "xfs_sb.h"
  22 #include "xfs_icache.h"
  23 #include "xfs_log.h"
  24 #include "xfs_rtbitmap.h"
  25 #include <linux/fsnotify.h>
  26
  27 /* Lock (and optionally join) two inodes for a file range exchange. */
  28 void
  29 xfs_exchrange_ilock(
  30         struct xfs_trans        *tp,
  31         struct xfs_inode        *ip1,
  32         struct xfs_inode        *ip2)
  33 {
  34         if (ip1 != ip2)
  35                 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
  36                                     ip2, XFS_ILOCK_EXCL);
  37         else
  38                 xfs_ilock(ip1, XFS_ILOCK_EXCL);
  39         if (tp) {
  40                 xfs_trans_ijoin(tp, ip1, 0);
  41                 if (ip2 != ip1)
  42                         xfs_trans_ijoin(tp, ip2, 0);
  43         }
  44
  45 }
  46
  47 /* Unlock two inodes after a file range exchange operation. */
  48 void
  49 xfs_exchrange_iunlock(
  50         struct xfs_inode        *ip1,
  51         struct xfs_inode        *ip2)
  52 {
  53         if (ip2 != ip1)
  54                 xfs_iunlock(ip2, XFS_ILOCK_EXCL);
  55         xfs_iunlock(ip1, XFS_ILOCK_EXCL);
  56 }
  57
  58 /*
  59  * Estimate the resource requirements to exchange file contents between the two
  60  * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
  61  * have flushed both inodes' pagecache and active direct-ios.
  62  */
  63 int
  64 xfs_exchrange_estimate(
  65         struct xfs_exchmaps_req *req)
  66 {
  67         int                     error;
  68
  69         xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
  70         error = xfs_exchmaps_estimate(req);
  71         xfs_exchrange_iunlock(req->ip1, req->ip2);
  72         return error;
  73 }
  74
  75 /*
  76  * Check that file2's metadata agree with the snapshot that we took for the
  77  * range commit request.
  78  *
  79  * This should be called after the filesystem has locked /all/ inode metadata
  80  * against modification.
  81  */
  82 STATIC int
  83 xfs_exchrange_check_freshness(
  84         const struct xfs_exchrange      *fxr,
  85         struct xfs_inode                *ip2)
  86 {
  87         struct inode                    *inode2 = VFS_I(ip2);
  88         struct timespec64               ctime = inode_get_ctime(inode2);
  89         struct timespec64               mtime = inode_get_mtime(inode2);
  90
  91         trace_xfs_exchrange_freshness(fxr, ip2);
  92
  93         /* Check that file2 hasn't otherwise been modified. */
  94         if (fxr->file2_ino != ip2->i_ino ||
  95             fxr->file2_gen != inode2->i_generation ||
  96             !timespec64_equal(&fxr->file2_ctime, &ctime) ||
  97             !timespec64_equal(&fxr->file2_mtime, &mtime))
  98                 return -EBUSY;
  99
 100         return 0;
 101 }
 102
 103 #define QRETRY_IP1      (0x1)
 104 #define QRETRY_IP2      (0x2)
 105
 106 /*
 107  * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
 108  * this if quota enforcement is disabled or if both inodes' dquots are the
 109  * same.  The qretry structure must be initialized to zeroes before the first
 110  * call to this function.
 111  */
 112 STATIC int
 113 xfs_exchrange_reserve_quota(
 114         struct xfs_trans                *tp,
 115         const struct xfs_exchmaps_req   *req,
 116         unsigned int                    *qretry)
 117 {
 118         int64_t                         ddelta, rdelta;
 119         int                             ip1_error = 0;
 120         int                             error;
 121
 122         /*
 123          * Don't bother with a quota reservation if we're not enforcing them
 124          * or the two inodes have the same dquots.
 125          */
 126         if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
 127             (req->ip1->i_udquot == req->ip2->i_udquot &&
 128              req->ip1->i_gdquot == req->ip2->i_gdquot &&
 129              req->ip1->i_pdquot == req->ip2->i_pdquot))
 130                 return 0;
 131
 132         *qretry = 0;
 133
 134         /*
 135          * For each file, compute the net gain in the number of regular blocks
 136          * that will be mapped into that file and reserve that much quota.  The
 137          * quota counts must be able to absorb at least that much space.
 138          */
 139         ddelta = req->ip2_bcount - req->ip1_bcount;
 140         rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
 141         if (ddelta > 0 || rdelta > 0) {
 142                 error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
 143                                 ddelta > 0 ? ddelta : 0,
 144                                 rdelta > 0 ? rdelta : 0,
 145                                 false);
 146                 if (error == -EDQUOT || error == -ENOSPC) {
 147                         /*
 148                          * Save this error and see what happens if we try to
 149                          * reserve quota for ip2.  Then report both.
 150                          */
 151                         *qretry |= QRETRY_IP1;
 152                         ip1_error = error;
 153                         error = 0;
 154                 }
 155                 if (error)
 156                         return error;
 157         }
 158         if (ddelta < 0 || rdelta < 0) {
 159                 error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
 160                                 ddelta < 0 ? -ddelta : 0,
 161                                 rdelta < 0 ? -rdelta : 0,
 162                                 false);
 163                 if (error == -EDQUOT || error == -ENOSPC)
 164                         *qretry |= QRETRY_IP2;
 165                 if (error)
 166                         return error;
 167         }
 168         if (ip1_error)
 169                 return ip1_error;
 170
 171         /*
 172          * For each file, forcibly reserve the gross gain in mapped blocks so
 173          * that we don't trip over any quota block reservation assertions.
 174          * We must reserve the gross gain because the quota code subtracts from
 175          * bcount the number of blocks that we unmap; it does not add that
 176          * quantity back to the quota block reservation.
 177          */
 178         error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
 179                         req->ip1_rtbcount, true);
 180         if (error)
 181                 return error;
 182
 183         return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
 184                         req->ip2_rtbcount, true);
 185 }
 186
 187 /* Exchange the mappings (and hence the contents) of two files' forks. */
 188 STATIC int
 189 xfs_exchrange_mappings(
 190         const struct xfs_exchrange      *fxr,
 191         struct xfs_inode                *ip1,
 192         struct xfs_inode                *ip2)
 193 {
 194         struct xfs_mount                *mp = ip1->i_mount;
 195         struct xfs_exchmaps_req         req = {
 196                 .ip1                    = ip1,
 197                 .ip2                    = ip2,
 198                 .startoff1              = XFS_B_TO_FSBT(mp, fxr->file1_offset),
 199                 .startoff2              = XFS_B_TO_FSBT(mp, fxr->file2_offset),
 200                 .blockcount             = XFS_B_TO_FSB(mp, fxr->length),
 201         };
 202         struct xfs_trans                *tp;
 203         unsigned int                    qretry;
 204         bool                            retried = false;
 205         int                             error;
 206
 207         trace_xfs_exchrange_mappings(fxr, ip1, ip2);
 208
 209         if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
 210                 req.flags |= XFS_EXCHMAPS_SET_SIZES;
 211         if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
 212                 req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
 213
 214         /*
 215          * Round the request length up to the nearest file allocation unit.
 216          * The prep function already checked that the request offsets and
 217          * length in @fxr are safe to round up.
 218          */
 219         if (xfs_inode_has_bigrtalloc(ip2))
 220                 req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount);
 221
 222         error = xfs_exchrange_estimate(&req);
 223         if (error)
 224                 return error;
 225
 226 retry:
 227         /* Allocate the transaction, lock the inodes, and join them. */
 228         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
 229                         XFS_TRANS_RES_FDBLKS, &tp);
 230         if (error)
 231                 return error;
 232
 233         xfs_exchrange_ilock(tp, ip1, ip2);
 234
 235         trace_xfs_exchrange_before(ip2, 2);
 236         trace_xfs_exchrange_before(ip1, 1);
 237
 238         error = xfs_exchmaps_check_forks(mp, &req);
 239         if (error)
 240                 goto out_trans_cancel;
 241
 242         /*
 243          * Reserve ourselves some quota if any of them are in enforcing mode.
 244          * In theory we only need enough to satisfy the change in the number
 245          * of blocks between the two ranges being remapped.
 246          */
 247         error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
 248         if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
 249                 xfs_trans_cancel(tp);
 250                 xfs_exchrange_iunlock(ip1, ip2);
 251                 if (qretry & QRETRY_IP1)
 252                         xfs_blockgc_free_quota(ip1, 0);
 253                 if (qretry & QRETRY_IP2)
 254                         xfs_blockgc_free_quota(ip2, 0);
 255                 retried = true;
 256                 goto retry;
 257         }
 258         if (error)
 259                 goto out_trans_cancel;
 260
 261         /* If we got this far on a dry run, all parameters are ok. */
 262         if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
 263                 goto out_trans_cancel;
 264
 265         /* Update the mtime and ctime of both files. */
 266         if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
 267                 xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 268         if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
 269                 xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 270
 271         xfs_exchange_mappings(tp, &req);
 272
 273         /*
 274          * Force the log to persist metadata updates if the caller or the
 275          * administrator requires this.  The generic prep function already
 276          * flushed the relevant parts of the page cache.
 277          */
 278         if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
 279                 xfs_trans_set_sync(tp);
 280
 281         error = xfs_trans_commit(tp);
 282
 283         trace_xfs_exchrange_after(ip2, 2);
 284         trace_xfs_exchrange_after(ip1, 1);
 285
 286         if (error)
 287                 goto out_unlock;
 288
 289         /*
 290          * If the caller wanted us to exchange the contents of two complete
 291          * files of unequal length, exchange the incore sizes now.  This should
 292          * be safe because we flushed both files' page caches, exchanged all
 293          * the mappings, and updated the ondisk sizes.
 294          */
 295         if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
 296                 loff_t  temp;
 297
 298                 temp = i_size_read(VFS_I(ip2));
 299                 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
 300                 i_size_write(VFS_I(ip1), temp);
 301         }
 302
 303 out_unlock:
 304         xfs_exchrange_iunlock(ip1, ip2);
 305         return error;
 306
 307 out_trans_cancel:
 308         xfs_trans_cancel(tp);
 309         goto out_unlock;
 310 }
 311
 312 /*
 313  * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
 314  * This part deals with struct file objects and byte ranges and does not deal
 315  * with XFS-specific data structures such as xfs_inodes and block ranges.  This
 316  * separation may some day facilitate porting to another filesystem.
 317  *
 318  * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
 319  * file1 with the same number of bytes starting at fxr.file2_offset in file2.
 320  * Implementations must call xfs_exchange_range_prep to prepare the two
 321  * files prior to taking locks; and they must update the inode change and mod
 322  * times of both files as part of the metadata update.  The timestamp update
 323  * and freshness checks must be done atomically as part of the data exchange
 324  * operation to ensure correctness of the freshness check.
 325  * xfs_exchange_range_finish must be called after the operation completes
 326  * successfully but before locks are dropped.
 327  */
 328
 329 /* Verify that we have security clearance to perform this operation. */
 330 static int
 331 xfs_exchange_range_verify_area(
 332         struct xfs_exchrange    *fxr)
 333 {
 334         int                     ret;
 335
 336         ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
 337                         true);
 338         if (ret)
 339                 return ret;
 340
 341         return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
 342                         true);
 343 }
 344
 345 /*
 346  * Performs necessary checks before doing a range exchange, having stabilized
 347  * mutable inode attributes via i_rwsem.
 348  */
 349 static inline int
 350 xfs_exchange_range_checks(
 351         struct xfs_exchrange    *fxr,
 352         unsigned int            alloc_unit)
 353 {
 354         struct inode            *inode1 = file_inode(fxr->file1);
 355         struct inode            *inode2 = file_inode(fxr->file2);
 356         uint64_t                allocmask = alloc_unit - 1;
 357         int64_t                 test_len;
 358         uint64_t                blen;
 359         loff_t                  size1, size2, tmp;
 360         int                     error;
 361
 362         /* Don't touch certain kinds of inodes */
 363         if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
 364                 return -EPERM;
 365         if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
 366                 return -ETXTBSY;
 367
 368         size1 = i_size_read(inode1);
 369         size2 = i_size_read(inode2);
 370
 371         /* Ranges cannot start after EOF. */
 372         if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
 373                 return -EINVAL;
 374
 375         /*
 376          * If the caller said to exchange to EOF, we set the length of the
 377          * request large enough to cover everything to the end of both files.
 378          */
 379         if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
 380                 fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
 381                                              size2 - fxr->file2_offset);
 382
 383                 error = xfs_exchange_range_verify_area(fxr);
 384                 if (error)
 385                         return error;
 386         }
 387
 388         /*
 389          * The start of both ranges must be aligned to the file allocation
 390          * unit.
 391          */
 392         if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
 393             !IS_ALIGNED(fxr->file2_offset, alloc_unit))
 394                 return -EINVAL;
 395
 396         /* Ensure offsets don't wrap. */
 397         if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
 398             check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
 399                 return -EINVAL;
 400
 401         /*
 402          * We require both ranges to end within EOF, unless we're exchanging
 403          * to EOF.
 404          */
 405         if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
 406             (fxr->file1_offset + fxr->length > size1 ||
 407              fxr->file2_offset + fxr->length > size2))
 408                 return -EINVAL;
 409
 410         /*
 411          * Make sure we don't hit any file size limits.  If we hit any size
 412          * limits such that test_length was adjusted, we abort the whole
 413          * operation.
 414          */
 415         test_len = fxr->length;
 416         error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
 417                         &test_len);
 418         if (error)
 419                 return error;
 420         error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
 421                         &test_len);
 422         if (error)
 423                 return error;
 424         if (test_len != fxr->length)
 425                 return -EINVAL;
 426
 427         /*
 428          * If the user wanted us to exchange up to the infile's EOF, round up
 429          * to the next allocation unit boundary for this check.  Do the same
 430          * for the outfile.
 431          *
 432          * Otherwise, reject the range length if it's not aligned to an
 433          * allocation unit.
 434          */
 435         if (fxr->file1_offset + fxr->length == size1)
 436                 blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
 437         else if (fxr->file2_offset + fxr->length == size2)
 438                 blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
 439         else if (!IS_ALIGNED(fxr->length, alloc_unit))
 440                 return -EINVAL;
 441         else
 442                 blen = fxr->length;
 443
 444         /* Don't allow overlapped exchanges within the same file. */
 445         if (inode1 == inode2 &&
 446             fxr->file2_offset + blen > fxr->file1_offset &&
 447             fxr->file1_offset + blen > fxr->file2_offset)
 448                 return -EINVAL;
 449
 450         /*
 451          * Ensure that we don't exchange a partial EOF block into the middle of
 452          * another file.
 453          */
 454         if ((fxr->length & allocmask) == 0)
 455                 return 0;
 456
 457         blen = fxr->length;
 458         if (fxr->file2_offset + blen < size2)
 459                 blen &= ~allocmask;
 460
 461         if (fxr->file1_offset + blen < size1)
 462                 blen &= ~allocmask;
 463
 464         return blen == fxr->length ? 0 : -EINVAL;
 465 }
 466
 467 /*
 468  * Check that the two inodes are eligible for range exchanges, the ranges make
 469  * sense, and then flush all dirty data.  Caller must ensure that the inodes
 470  * have been locked against any other modifications.
 471  */
 472 static inline int
 473 xfs_exchange_range_prep(
 474         struct xfs_exchrange    *fxr,
 475         unsigned int            alloc_unit)
 476 {
 477         struct inode            *inode1 = file_inode(fxr->file1);
 478         struct inode            *inode2 = file_inode(fxr->file2);
 479         bool                    same_inode = (inode1 == inode2);
 480         int                     error;
 481
 482         /* Check that we don't violate system file offset limits. */
 483         error = xfs_exchange_range_checks(fxr, alloc_unit);
 484         if (error || fxr->length == 0)
 485                 return error;
 486
 487         /* Wait for the completion of any pending IOs on both files */
 488         inode_dio_wait(inode1);
 489         if (!same_inode)
 490                 inode_dio_wait(inode2);
 491
 492         error = filemap_write_and_wait_range(inode1->i_mapping,
 493                         fxr->file1_offset,
 494                         fxr->file1_offset + fxr->length - 1);
 495         if (error)
 496                 return error;
 497
 498         error = filemap_write_and_wait_range(inode2->i_mapping,
 499                         fxr->file2_offset,
 500                         fxr->file2_offset + fxr->length - 1);
 501         if (error)
 502                 return error;
 503
 504         /*
 505          * If the files or inodes involved require synchronous writes, amend
 506          * the request to force the filesystem to flush all data and metadata
 507          * to disk after the operation completes.
 508          */
 509         if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
 510             IS_SYNC(inode1) || IS_SYNC(inode2))
 511                 fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
 512
 513         return 0;
 514 }
 515
 516 /*
 517  * Finish a range exchange operation, if it was successful.  Caller must ensure
 518  * that the inodes are still locked against any other modifications.
 519  */
 520 static inline int
 521 xfs_exchange_range_finish(
 522         struct xfs_exchrange    *fxr)
 523 {
 524         int                     error;
 525
 526         error = file_remove_privs(fxr->file1);
 527         if (error)
 528                 return error;
 529         if (file_inode(fxr->file1) == file_inode(fxr->file2))
 530                 return 0;
 531
 532         return file_remove_privs(fxr->file2);
 533 }
 534
 535 /*
 536  * Check the alignment of an exchange request when the allocation unit size
 537  * isn't a power of two.  The generic file-level helpers use (fast)
 538  * bitmask-based alignment checks, but here we have to use slow long division.
 539  */
 540 static int
 541 xfs_exchrange_check_rtalign(
 542         const struct xfs_exchrange      *fxr,
 543         struct xfs_inode                *ip1,
 544         struct xfs_inode                *ip2,
 545         unsigned int                    alloc_unit)
 546 {
 547         uint64_t                        length = fxr->length;
 548         uint64_t                        blen;
 549         loff_t                          size1, size2;
 550
 551         size1 = i_size_read(VFS_I(ip1));
 552         size2 = i_size_read(VFS_I(ip2));
 553
 554         /* The start of both ranges must be aligned to a rt extent. */
 555         if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
 556             !isaligned_64(fxr->file2_offset, alloc_unit))
 557                 return -EINVAL;
 558
 559         if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
 560                 length = max_t(int64_t, size1 - fxr->file1_offset,
 561                                         size2 - fxr->file2_offset);
 562
 563         /*
 564          * If the user wanted us to exchange up to the infile's EOF, round up
 565          * to the next rt extent boundary for this check.  Do the same for the
 566          * outfile.
 567          *
 568          * Otherwise, reject the range length if it's not rt extent aligned.
 569          * We already confirmed the starting offsets' rt extent block
 570          * alignment.
 571          */
 572         if (fxr->file1_offset + length == size1)
 573                 blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
 574         else if (fxr->file2_offset + length == size2)
 575                 blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
 576         else if (!isaligned_64(length, alloc_unit))
 577                 return -EINVAL;
 578         else
 579                 blen = length;
 580
 581         /* Don't allow overlapped exchanges within the same file. */
 582         if (ip1 == ip2 &&
 583             fxr->file2_offset + blen > fxr->file1_offset &&
 584             fxr->file1_offset + blen > fxr->file2_offset)
 585                 return -EINVAL;
 586
 587         /*
 588          * Ensure that we don't exchange a partial EOF rt extent into the
 589          * middle of another file.
 590          */
 591         if (isaligned_64(length, alloc_unit))
 592                 return 0;
 593
 594         blen = length;
 595         if (fxr->file2_offset + length < size2)
 596                 blen = rounddown_64(blen, alloc_unit);
 597
 598         if (fxr->file1_offset + blen < size1)
 599                 blen = rounddown_64(blen, alloc_unit);
 600
 601         return blen == length ? 0 : -EINVAL;
 602 }
 603
 604 /* Prepare two files to have their data exchanged. */
 605 STATIC int
 606 xfs_exchrange_prep(
 607         struct xfs_exchrange    *fxr,
 608         struct xfs_inode        *ip1,
 609         struct xfs_inode        *ip2)
 610 {
 611         struct xfs_mount        *mp = ip2->i_mount;
 612         unsigned int            alloc_unit = xfs_inode_alloc_unitsize(ip2);
 613         int                     error;
 614
 615         trace_xfs_exchrange_prep(fxr, ip1, ip2);
 616
 617         /* Verify both files are either real-time or non-realtime */
 618         if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
 619                 return -EINVAL;
 620
 621         /* Check non-power of two alignment issues, if necessary. */
 622         if (!is_power_of_2(alloc_unit)) {
 623                 error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
 624                 if (error)
 625                         return error;
 626
 627                 /*
 628                  * Do the generic file-level checks with the regular block
 629                  * alignment.
 630                  */
 631                 alloc_unit = mp->m_sb.sb_blocksize;
 632         }
 633
 634         error = xfs_exchange_range_prep(fxr, alloc_unit);
 635         if (error || fxr->length == 0)
 636                 return error;
 637
 638         if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
 639                 error = xfs_exchrange_check_freshness(fxr, ip2);
 640                 if (error)
 641                         return error;
 642         }
 643
 644         /* Attach dquots to both inodes before changing block maps. */
 645         error = xfs_qm_dqattach(ip2);
 646         if (error)
 647                 return error;
 648         error = xfs_qm_dqattach(ip1);
 649         if (error)
 650                 return error;
 651
 652         trace_xfs_exchrange_flush(fxr, ip1, ip2);
 653
 654         /* Flush the relevant ranges of both files. */
 655         error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
 656         if (error)
 657                 return error;
 658         error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
 659         if (error)
 660                 return error;
 661
 662         /*
 663          * Cancel CoW fork preallocations for the ranges of both files.  The
 664          * prep function should have flushed all the dirty data, so the only
 665          * CoW mappings remaining should be speculative.
 666          */
 667         if (xfs_inode_has_cow_data(ip1)) {
 668                 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
 669                                 fxr->length, true);
 670                 if (error)
 671                         return error;
 672         }
 673
 674         if (xfs_inode_has_cow_data(ip2)) {
 675                 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
 676                                 fxr->length, true);
 677                 if (error)
 678                         return error;
 679         }
 680
 681         return 0;
 682 }
 683
 684 /*
 685  * Exchange contents of files.  This is the binding between the generic
 686  * file-level concepts and the XFS inode-specific implementation.
 687  */
 688 STATIC int
 689 xfs_exchrange_contents(
 690         struct xfs_exchrange    *fxr)
 691 {
 692         struct inode            *inode1 = file_inode(fxr->file1);
 693         struct inode            *inode2 = file_inode(fxr->file2);
 694         struct xfs_inode        *ip1 = XFS_I(inode1);
 695         struct xfs_inode        *ip2 = XFS_I(inode2);
 696         struct xfs_mount        *mp = ip1->i_mount;
 697         int                     error;
 698
 699         if (!xfs_has_exchange_range(mp))
 700                 return -EOPNOTSUPP;
 701
 702         if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
 703                            XFS_EXCHANGE_RANGE_PRIV_FLAGS))
 704                 return -EINVAL;
 705
 706         if (xfs_is_shutdown(mp))
 707                 return -EIO;
 708
 709         /* Lock both files against IO */
 710         error = xfs_ilock2_io_mmap(ip1, ip2);
 711         if (error)
 712                 goto out_err;
 713
 714         /* Prepare and then exchange file contents. */
 715         error = xfs_exchrange_prep(fxr, ip1, ip2);
 716         if (error)
 717                 goto out_unlock;
 718
 719         error = xfs_exchrange_mappings(fxr, ip1, ip2);
 720         if (error)
 721                 goto out_unlock;
 722
 723         /*
 724          * Finish the exchange by removing special file privileges like any
 725          * other file write would do.  This may involve turning on support for
 726          * logged xattrs if either file has security capabilities.
 727          */
 728         error = xfs_exchange_range_finish(fxr);
 729         if (error)
 730                 goto out_unlock;
 731
 732 out_unlock:
 733         xfs_iunlock2_io_mmap(ip1, ip2);
 734 out_err:
 735         if (error)
 736                 trace_xfs_exchrange_error(ip2, error, _RET_IP_);
 737         return error;
 738 }
 739
 740 /* Exchange parts of two files. */
 741 static int
 742 xfs_exchange_range(
 743         struct xfs_exchrange    *fxr)
 744 {
 745         struct inode            *inode1 = file_inode(fxr->file1);
 746         struct inode            *inode2 = file_inode(fxr->file2);
 747         int                     ret;
 748
 749         BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
 750                      XFS_EXCHANGE_RANGE_PRIV_FLAGS);
 751
 752         /* Both files must be on the same mount/filesystem. */
 753         if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
 754                 return -EXDEV;
 755
 756         if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
 757                          __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
 758                 return -EINVAL;
 759
 760         /* Userspace requests only honored for regular files. */
 761         if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
 762                 return -EISDIR;
 763         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 764                 return -EINVAL;
 765
 766         /* Both files must be opened for read and write. */
 767         if (!(fxr->file1->f_mode & FMODE_READ) ||
 768             !(fxr->file1->f_mode & FMODE_WRITE) ||
 769             !(fxr->file2->f_mode & FMODE_READ) ||
 770             !(fxr->file2->f_mode & FMODE_WRITE))
 771                 return -EBADF;
 772
 773         /* Neither file can be opened append-only. */
 774         if ((fxr->file1->f_flags & O_APPEND) ||
 775             (fxr->file2->f_flags & O_APPEND))
 776                 return -EBADF;
 777
 778         /*
 779          * If we're not exchanging to EOF, we can check the areas before
 780          * stabilizing both files' i_size.
 781          */
 782         if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
 783                 ret = xfs_exchange_range_verify_area(fxr);
 784                 if (ret)
 785                         return ret;
 786         }
 787
 788         /* Update cmtime if the fd/inode don't forbid it. */
 789         if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
 790                 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
 791         if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
 792                 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
 793
 794         file_start_write(fxr->file2);
 795         ret = xfs_exchrange_contents(fxr);
 796         file_end_write(fxr->file2);
 797         if (ret)
 798                 return ret;
 799
 800         fsnotify_modify(fxr->file1);
 801         if (fxr->file2 != fxr->file1)
 802                 fsnotify_modify(fxr->file2);
 803         return 0;
 804 }
 805
 806 /* Collect exchange-range arguments from userspace. */
 807 long
 808 xfs_ioc_exchange_range(
 809         struct file                     *file,
 810         struct xfs_exchange_range __user *argp)
 811 {
 812         struct xfs_exchrange            fxr = {
 813                 .file2                  = file,
 814         };
 815         struct xfs_exchange_range       args;
 816
 817         if (copy_from_user(&args, argp, sizeof(args)))
 818                 return -EFAULT;
 819         if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
 820                 return -EINVAL;
 821         if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
 822                 return -EINVAL;
 823
 824         fxr.file1_offset        = args.file1_offset;
 825         fxr.file2_offset        = args.file2_offset;
 826         fxr.length              = args.length;
 827         fxr.flags               = args.flags;
 828
 829         CLASS(fd, file1)(args.file1_fd);
 830         if (fd_empty(file1))
 831                 return -EBADF;
 832         fxr.file1 = fd_file(file1);
 833
 834         return xfs_exchange_range(&fxr);
 835 }
 836
 837 /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
 838 struct xfs_commit_range_fresh {
 839         xfs_fsid_t      fsid;           /* m_fixedfsid */
 840         __u64           file2_ino;      /* inode number */
 841         __s64           file2_mtime;    /* modification time */
 842         __s64           file2_ctime;    /* change time */
 843         __s32           file2_mtime_nsec; /* mod time, nsec */
 844         __s32           file2_ctime_nsec; /* change time, nsec */
 845         __u32           file2_gen;      /* inode generation */
 846         __u32           magic;          /* zero */
 847 };
 848 #define XCR_FRESH_MAGIC 0x444F524B      /* DORK */
 849
 850 /* Set up a commitrange operation by sampling file2's write-related attrs */
 851 long
 852 xfs_ioc_start_commit(
 853         struct file                     *file,
 854         struct xfs_commit_range __user  *argp)
 855 {
 856         struct xfs_commit_range         args = { };
 857         struct kstat                    kstat = { };
 858         struct xfs_commit_range_fresh   *kern_f;
 859         struct xfs_commit_range_fresh   __user *user_f;
 860         struct inode                    *inode2 = file_inode(file);
 861         struct xfs_inode                *ip2 = XFS_I(inode2);
 862         const unsigned int              lockflags = XFS_IOLOCK_SHARED |
 863                                                     XFS_MMAPLOCK_SHARED |
 864                                                     XFS_ILOCK_SHARED;
 865
 866         BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
 867                      sizeof(args.file2_freshness));
 868
 869         kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
 870
 871         memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
 872
 873         xfs_ilock(ip2, lockflags);
 874         /* Force writing of a distinct ctime if any writes happen. */
 875         fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2);
 876         kern_f->file2_ctime             = kstat.ctime.tv_sec;
 877         kern_f->file2_ctime_nsec        = kstat.ctime.tv_nsec;
 878         kern_f->file2_mtime             = kstat.mtime.tv_sec;
 879         kern_f->file2_mtime_nsec        = kstat.mtime.tv_nsec;
 880         kern_f->file2_ino               = ip2->i_ino;
 881         kern_f->file2_gen               = inode2->i_generation;
 882         kern_f->magic                   = XCR_FRESH_MAGIC;
 883         xfs_iunlock(ip2, lockflags);
 884
 885         user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
 886         if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
 887                 return -EFAULT;
 888
 889         return 0;
 890 }
 891
 892 /*
 893  * Exchange file1 and file2 contents if file2 has not been written since the
 894  * start commit operation.
 895  */
 896 long
 897 xfs_ioc_commit_range(
 898         struct file                     *file,
 899         struct xfs_commit_range __user  *argp)
 900 {
 901         struct xfs_exchrange            fxr = {
 902                 .file2                  = file,
 903         };
 904         struct xfs_commit_range         args;
 905         struct xfs_commit_range_fresh   *kern_f;
 906         struct xfs_inode                *ip2 = XFS_I(file_inode(file));
 907         struct xfs_mount                *mp = ip2->i_mount;
 908
 909         kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
 910
 911         if (copy_from_user(&args, argp, sizeof(args)))
 912                 return -EFAULT;
 913         if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
 914                 return -EINVAL;
 915         if (kern_f->magic != XCR_FRESH_MAGIC)
 916                 return -EBUSY;
 917         if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
 918                 return -EBUSY;
 919
 920         fxr.file1_offset        = args.file1_offset;
 921         fxr.file2_offset        = args.file2_offset;
 922         fxr.length              = args.length;
 923         fxr.flags               = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
 924         fxr.file2_ino           = kern_f->file2_ino;
 925         fxr.file2_gen           = kern_f->file2_gen;
 926         fxr.file2_mtime.tv_sec  = kern_f->file2_mtime;
 927         fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
 928         fxr.file2_ctime.tv_sec  = kern_f->file2_ctime;
 929         fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
 930
 931         CLASS(fd, file1)(args.file1_fd);
 932         if (fd_empty(file1))
 933                 return -EBADF;
 934         fxr.file1 = fd_file(file1);
 935
 936         return xfs_exchange_range(&fxr);
 937 }