1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_quota.h"
16 #include "xfs_bmap_util.h"
17 #include "xfs_reflink.h"
18 #include "xfs_trace.h"
19 #include "xfs_exchrange.h"
20 #include "xfs_exchmaps.h"
22 #include "xfs_icache.h"
24 #include "xfs_rtbitmap.h"
25 #include <linux/fsnotify.h>
27 /* Lock (and optionally join) two inodes for a file range exchange. */
31 struct xfs_inode *ip1,
32 struct xfs_inode *ip2)
35 xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
38 xfs_ilock(ip1, XFS_ILOCK_EXCL);
40 xfs_trans_ijoin(tp, ip1, 0);
42 xfs_trans_ijoin(tp, ip2, 0);
47 /* Unlock two inodes after a file range exchange operation. */
49 xfs_exchrange_iunlock(
50 struct xfs_inode *ip1,
51 struct xfs_inode *ip2)
54 xfs_iunlock(ip2, XFS_ILOCK_EXCL);
55 xfs_iunlock(ip1, XFS_ILOCK_EXCL);
59 * Estimate the resource requirements to exchange file contents between the two
60 * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to
61 * have flushed both inodes' pagecache and active direct-ios.
64 xfs_exchrange_estimate(
65 struct xfs_exchmaps_req *req)
69 xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
70 error = xfs_exchmaps_estimate(req);
71 xfs_exchrange_iunlock(req->ip1, req->ip2);
75 #define QRETRY_IP1 (0x1)
76 #define QRETRY_IP2 (0x2)
79 * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip
80 * this if quota enforcement is disabled or if both inodes' dquots are the
81 * same. The qretry flag word must be initialized to zero before the first
82 * call to this function.
85 xfs_exchrange_reserve_quota(
87 const struct xfs_exchmaps_req *req,
90 int64_t ddelta, rdelta;
95 * Don't bother with a quota reservation if we're not enforcing them
96 * or the two inodes have the same dquots.
98 if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
99 (req->ip1->i_udquot == req->ip2->i_udquot &&
100 req->ip1->i_gdquot == req->ip2->i_gdquot &&
101 req->ip1->i_pdquot == req->ip2->i_pdquot))
107 * For each file, compute the net gain in the number of regular blocks
108 * that will be mapped into that file and reserve that much quota. The
109 * quota counts must be able to absorb at least that much space.
111 ddelta = req->ip2_bcount - req->ip1_bcount;
112 rdelta = req->ip2_rtbcount - req->ip1_rtbcount;
113 if (ddelta > 0 || rdelta > 0) {
114 error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
115 ddelta > 0 ? ddelta : 0,
116 rdelta > 0 ? rdelta : 0,
118 if (error == -EDQUOT || error == -ENOSPC) {
120 * Save this error and see what happens if we try to
121 * reserve quota for ip2. Then report both.
123 *qretry |= QRETRY_IP1;
130 if (ddelta < 0 || rdelta < 0) {
131 error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
132 ddelta < 0 ? -ddelta : 0,
133 rdelta < 0 ? -rdelta : 0,
135 if (error == -EDQUOT || error == -ENOSPC)
136 *qretry |= QRETRY_IP2;
144 * For each file, forcibly reserve the gross gain in mapped blocks so
145 * that we don't trip over any quota block reservation assertions.
146 * We must reserve the gross gain because the quota code subtracts from
147 * bcount the number of blocks that we unmap; it does not add that
148 * quantity back to the quota block reservation.
150 error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
151 req->ip1_rtbcount, true);
155 return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount,
156 req->ip2_rtbcount, true);
159 /* Exchange the mappings (and hence the contents) of two files' forks. */
161 xfs_exchrange_mappings(
162 const struct xfs_exchrange *fxr,
163 struct xfs_inode *ip1,
164 struct xfs_inode *ip2)
166 struct xfs_mount *mp = ip1->i_mount;
167 struct xfs_exchmaps_req req = {
170 .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset),
171 .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset),
172 .blockcount = XFS_B_TO_FSB(mp, fxr->length),
174 struct xfs_trans *tp;
176 bool retried = false;
179 trace_xfs_exchrange_mappings(fxr, ip1, ip2);
181 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
182 req.flags |= XFS_EXCHMAPS_SET_SIZES;
183 if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
184 req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
187 * Round the request length up to the nearest file allocation unit.
188 * The prep function already checked that the request offsets and
189 * length in @fxr are safe to round up.
191 if (xfs_inode_has_bigrtalloc(ip2))
192 req.blockcount = xfs_rtb_roundup_rtx(mp, req.blockcount);
194 error = xfs_exchrange_estimate(&req);
199 /* Allocate the transaction, lock the inodes, and join them. */
200 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
201 XFS_TRANS_RES_FDBLKS, &tp);
205 xfs_exchrange_ilock(tp, ip1, ip2);
207 trace_xfs_exchrange_before(ip2, 2);
208 trace_xfs_exchrange_before(ip1, 1);
210 error = xfs_exchmaps_check_forks(mp, &req);
212 goto out_trans_cancel;
215 * Reserve ourselves some quota if any of them are in enforcing mode.
216 * In theory we only need enough to satisfy the change in the number
217 * of blocks between the two ranges being remapped.
219 error = xfs_exchrange_reserve_quota(tp, &req, &qretry);
220 if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
221 xfs_trans_cancel(tp);
222 xfs_exchrange_iunlock(ip1, ip2);
223 if (qretry & QRETRY_IP1)
224 xfs_blockgc_free_quota(ip1, 0);
225 if (qretry & QRETRY_IP2)
226 xfs_blockgc_free_quota(ip2, 0);
231 goto out_trans_cancel;
233 /* If we got this far on a dry run, all parameters are ok. */
234 if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN)
235 goto out_trans_cancel;
237 /* Update the mtime and ctime of both files. */
238 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
239 xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
240 if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
241 xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
243 xfs_exchange_mappings(tp, &req);
246 * Force the log to persist metadata updates if the caller or the
247 * administrator requires this. The generic prep function already
248 * flushed the relevant parts of the page cache.
250 if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
251 xfs_trans_set_sync(tp);
253 error = xfs_trans_commit(tp);
255 trace_xfs_exchrange_after(ip2, 2);
256 trace_xfs_exchrange_after(ip1, 1);
262 * If the caller wanted us to exchange the contents of two complete
263 * files of unequal length, exchange the incore sizes now. This should
264 * be safe because we flushed both files' page caches, exchanged all
265 * the mappings, and updated the ondisk sizes.
267 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
270 temp = i_size_read(VFS_I(ip2));
271 i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1)));
272 i_size_write(VFS_I(ip1), temp);
276 xfs_exchrange_iunlock(ip1, ip2);
280 xfs_trans_cancel(tp);
285 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
286 * This part deals with struct file objects and byte ranges and does not deal
287 * with XFS-specific data structures such as xfs_inodes and block ranges. This
288 * separation may some day facilitate porting to another filesystem.
290 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
291 * file1 with the same number of bytes starting at fxr.file2_offset in file2.
292 * Implementations must call xfs_exchange_range_prep to prepare the two
293 * files prior to taking locks; and they must update the inode change and mod
294 * times of both files as part of the metadata update. The timestamp update
295 * and freshness checks must be done atomically as part of the data exchange
296 * operation to ensure correctness of the freshness check.
297 * xfs_exchange_range_finish must be called after the operation completes
298 * successfully but before locks are dropped.
301 /* Verify that we have security clearance to perform this operation. */
303 xfs_exchange_range_verify_area(
304 struct xfs_exchrange *fxr)
308 ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
313 return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
318 * Performs necessary checks before doing a range exchange, having stabilized
319 * mutable inode attributes via i_rwsem.
322 xfs_exchange_range_checks(
323 struct xfs_exchrange *fxr,
324 unsigned int alloc_unit)
326 struct inode *inode1 = file_inode(fxr->file1);
327 struct inode *inode2 = file_inode(fxr->file2);
328 uint64_t allocmask = alloc_unit - 1;
331 loff_t size1, size2, tmp;
334 /* Don't touch certain kinds of inodes */
335 if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
337 if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
340 size1 = i_size_read(inode1);
341 size2 = i_size_read(inode2);
343 /* Ranges cannot start after EOF. */
344 if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
348 * If the caller said to exchange to EOF, we set the length of the
349 * request large enough to cover everything to the end of both files.
351 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
352 fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
353 size2 - fxr->file2_offset);
355 error = xfs_exchange_range_verify_area(fxr);
361 * The start of both ranges must be aligned to the file allocation
364 if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
365 !IS_ALIGNED(fxr->file2_offset, alloc_unit))
368 /* Ensure offsets don't wrap. */
369 if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
370 check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
374 * We require both ranges to end within EOF, unless we're exchanging
377 if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
378 (fxr->file1_offset + fxr->length > size1 ||
379 fxr->file2_offset + fxr->length > size2))
383 * Make sure we don't hit any file size limits. If we hit any size
384 * limits such that test_len was adjusted, we abort the whole
387 test_len = fxr->length;
388 error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
392 error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
396 if (test_len != fxr->length)
400 * If the user wanted us to exchange up to the infile's EOF, round up
401 * to the next allocation unit boundary for this check. Do the same
404 * Otherwise, reject the range length if it's not aligned to an
407 if (fxr->file1_offset + fxr->length == size1)
408 blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
409 else if (fxr->file2_offset + fxr->length == size2)
410 blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
411 else if (!IS_ALIGNED(fxr->length, alloc_unit))
416 /* Don't allow overlapped exchanges within the same file. */
417 if (inode1 == inode2 &&
418 fxr->file2_offset + blen > fxr->file1_offset &&
419 fxr->file1_offset + blen > fxr->file2_offset)
423 * Ensure that we don't exchange a partial EOF block into the middle of
426 if ((fxr->length & allocmask) == 0)
430 if (fxr->file2_offset + blen < size2)
433 if (fxr->file1_offset + blen < size1)
436 return blen == fxr->length ? 0 : -EINVAL;
440 * Check that the two inodes are eligible for range exchanges, the ranges make
441 * sense, and then flush all dirty data. Caller must ensure that the inodes
442 * have been locked against any other modifications.
445 xfs_exchange_range_prep(
446 struct xfs_exchrange *fxr,
447 unsigned int alloc_unit)
449 struct inode *inode1 = file_inode(fxr->file1);
450 struct inode *inode2 = file_inode(fxr->file2);
451 bool same_inode = (inode1 == inode2);
454 /* Check that we don't violate system file offset limits. */
455 error = xfs_exchange_range_checks(fxr, alloc_unit);
456 if (error || fxr->length == 0)
459 /* Wait for the completion of any pending IOs on both files */
460 inode_dio_wait(inode1);
462 inode_dio_wait(inode2);
464 error = filemap_write_and_wait_range(inode1->i_mapping,
466 fxr->file1_offset + fxr->length - 1);
470 error = filemap_write_and_wait_range(inode2->i_mapping,
472 fxr->file2_offset + fxr->length - 1);
477 * If the files or inodes involved require synchronous writes, amend
478 * the request to force the filesystem to flush all data and metadata
479 * to disk after the operation completes.
481 if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
482 IS_SYNC(inode1) || IS_SYNC(inode2))
483 fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
489 * Finish a range exchange operation, if it was successful. Caller must ensure
490 * that the inodes are still locked against any other modifications.
493 xfs_exchange_range_finish(
494 struct xfs_exchrange *fxr)
498 error = file_remove_privs(fxr->file1);
501 if (file_inode(fxr->file1) == file_inode(fxr->file2))
504 return file_remove_privs(fxr->file2);
508 * Check the alignment of an exchange request when the allocation unit size
509 * isn't a power of two. The generic file-level helpers use (fast)
510 * bitmask-based alignment checks, but here we have to use slow long division.
513 xfs_exchrange_check_rtalign(
514 const struct xfs_exchrange *fxr,
515 struct xfs_inode *ip1,
516 struct xfs_inode *ip2,
517 unsigned int alloc_unit)
519 uint64_t length = fxr->length;
523 size1 = i_size_read(VFS_I(ip1));
524 size2 = i_size_read(VFS_I(ip2));
526 /* The start of both ranges must be aligned to a rt extent. */
527 if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
528 !isaligned_64(fxr->file2_offset, alloc_unit))
531 if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
532 length = max_t(int64_t, size1 - fxr->file1_offset,
533 size2 - fxr->file2_offset);
536 * If the user wanted us to exchange up to the infile's EOF, round up
537 * to the next rt extent boundary for this check. Do the same for the
540 * Otherwise, reject the range length if it's not rt extent aligned.
541 * We already confirmed the starting offsets' rt extent block
544 if (fxr->file1_offset + length == size1)
545 blen = roundup_64(size1, alloc_unit) - fxr->file1_offset;
546 else if (fxr->file2_offset + length == size2)
547 blen = roundup_64(size2, alloc_unit) - fxr->file2_offset;
548 else if (!isaligned_64(length, alloc_unit))
553 /* Don't allow overlapped exchanges within the same file. */
555 fxr->file2_offset + blen > fxr->file1_offset &&
556 fxr->file1_offset + blen > fxr->file2_offset)
560 * Ensure that we don't exchange a partial EOF rt extent into the
561 * middle of another file.
563 if (isaligned_64(length, alloc_unit))
567 if (fxr->file2_offset + length < size2)
568 blen = rounddown_64(blen, alloc_unit);
570 if (fxr->file1_offset + blen < size1)
571 blen = rounddown_64(blen, alloc_unit);
573 return blen == length ? 0 : -EINVAL;
576 /* Prepare two files to have their data exchanged. */
579 struct xfs_exchrange *fxr,
580 struct xfs_inode *ip1,
581 struct xfs_inode *ip2)
583 struct xfs_mount *mp = ip2->i_mount;
584 unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2);
587 trace_xfs_exchrange_prep(fxr, ip1, ip2);
589 /* Verify that both files are realtime or that neither is. */
590 if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2))
593 /* Check non-power of two alignment issues, if necessary. */
594 if (!is_power_of_2(alloc_unit)) {
595 error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit);
600 * Do the generic file-level checks with the regular block
603 alloc_unit = mp->m_sb.sb_blocksize;
606 error = xfs_exchange_range_prep(fxr, alloc_unit);
607 if (error || fxr->length == 0)
610 /* Attach dquots to both inodes before changing block maps. */
611 error = xfs_qm_dqattach(ip2);
614 error = xfs_qm_dqattach(ip1);
618 trace_xfs_exchrange_flush(fxr, ip1, ip2);
620 /* Flush the relevant ranges of both files. */
621 error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length);
624 error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length);
629 * Cancel CoW fork preallocations for the ranges of both files. The
630 * prep function should have flushed all the dirty data, so the only
631 * CoW mappings remaining should be speculative.
633 if (xfs_inode_has_cow_data(ip1)) {
634 error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
640 if (xfs_inode_has_cow_data(ip2)) {
641 error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
651 * Exchange contents of files. This is the binding between the generic
652 * file-level concepts and the XFS inode-specific implementation.
655 xfs_exchrange_contents(
656 struct xfs_exchrange *fxr)
658 struct inode *inode1 = file_inode(fxr->file1);
659 struct inode *inode2 = file_inode(fxr->file2);
660 struct xfs_inode *ip1 = XFS_I(inode1);
661 struct xfs_inode *ip2 = XFS_I(inode2);
662 struct xfs_mount *mp = ip1->i_mount;
665 if (!xfs_has_exchange_range(mp))
668 if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
669 XFS_EXCHANGE_RANGE_PRIV_FLAGS))
672 if (xfs_is_shutdown(mp))
675 /* Lock both files against IO */
676 error = xfs_ilock2_io_mmap(ip1, ip2);
680 /* Prepare and then exchange file contents. */
681 error = xfs_exchrange_prep(fxr, ip1, ip2);
685 error = xfs_exchrange_mappings(fxr, ip1, ip2);
690 * Finish the exchange by removing special file privileges like any
691 * other file write would do. This may involve turning on support for
692 * logged xattrs if either file has security capabilities.
694 error = xfs_exchange_range_finish(fxr);
699 xfs_iunlock2_io_mmap(ip1, ip2);
702 trace_xfs_exchrange_error(ip2, error, _RET_IP_);
706 /* Exchange parts of two files. */
709 struct xfs_exchrange *fxr)
711 struct inode *inode1 = file_inode(fxr->file1);
712 struct inode *inode2 = file_inode(fxr->file2);
715 BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
716 XFS_EXCHANGE_RANGE_PRIV_FLAGS);
718 /* Both files must be on the same mount/filesystem. */
719 if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
722 if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
725 /* Userspace requests are only honored for regular files. */
726 if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
728 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
731 /* Both files must be opened for read and write. */
732 if (!(fxr->file1->f_mode & FMODE_READ) ||
733 !(fxr->file1->f_mode & FMODE_WRITE) ||
734 !(fxr->file2->f_mode & FMODE_READ) ||
735 !(fxr->file2->f_mode & FMODE_WRITE))
738 /* Neither file can be opened append-only. */
739 if ((fxr->file1->f_flags & O_APPEND) ||
740 (fxr->file2->f_flags & O_APPEND))
744 * If we're not exchanging to EOF, we can check the areas before
745 * stabilizing both files' i_size.
747 if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
748 ret = xfs_exchange_range_verify_area(fxr);
753 /* Update cmtime if the fd/inode don't forbid it. */
754 if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
755 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
756 if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
757 fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
759 file_start_write(fxr->file2);
760 ret = xfs_exchrange_contents(fxr);
761 file_end_write(fxr->file2);
765 fsnotify_modify(fxr->file1);
766 if (fxr->file2 != fxr->file1)
767 fsnotify_modify(fxr->file2);
771 /* Collect exchange-range arguments from userspace. */
773 xfs_ioc_exchange_range(
775 struct xfs_exchange_range __user *argp)
777 struct xfs_exchrange fxr = {
780 struct xfs_exchange_range args;
784 if (copy_from_user(&args, argp, sizeof(args)))
786 if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
788 if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
791 fxr.file1_offset = args.file1_offset;
792 fxr.file2_offset = args.file2_offset;
793 fxr.length = args.length;
794 fxr.flags = args.flags;
796 file1 = fdget(args.file1_fd);
799 fxr.file1 = file1.file;
801 error = xfs_exchange_range(&fxr);