1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2010, 2023 Red Hat, Inc.
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_trans.h"
12 #include "xfs_mount.h"
13 #include "xfs_btree.h"
14 #include "xfs_alloc_btree.h"
15 #include "xfs_alloc.h"
16 #include "xfs_discard.h"
17 #include "xfs_error.h"
18 #include "xfs_extent_busy.h"
19 #include "xfs_trace.h"
22 #include "xfs_health.h"
23 #include "xfs_rtbitmap.h"
24 #include "xfs_rtgroup.h"
27 * Notes on an efficient, low latency fstrim algorithm
29 * We need to walk the filesystem free space and issue discards on the free
30 * space that meet the search criteria (size and location). We cannot issue
31 * discards on extents that might be in use, or are so recently in use they are
32 * still marked as busy. To serialise against extent state changes whilst we are
33 * gathering extents to trim, we must hold the AGF lock to lock out other
34 * allocations and extent free operations that might change extent state.
36 * However, we cannot just hold the AGF for the entire AG free space walk whilst
37 * we issue discards on each free space that is found. Storage devices can have
38 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
39 * couple of million free extents and issuing synchronous discards on each
40 * extent can take a *long* time. Whilst we are doing this walk, nothing else
41 * can access the AGF, and we can stall transactions and hence the log whilst
42 * modifications wait for the AGF lock to be released. This can lead hung tasks
43 * kicking the hung task timer and rebooting the system. This is bad.
45 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
46 * lock, gathers a range of inode cluster buffers that are allocated, drops the
47 * AGI lock and then reads all the inode cluster buffers and processes them. It
48 * loops doing this, using a cursor to keep track of where it is up to in the AG
49 * for each iteration to restart the INOBT lookup from.
51 * We can't do this exactly with free space - once we drop the AGF lock, the
52 * state of the free extent is out of our control and we cannot run a discard
53 * safely on it in this situation. Unless, of course, we've marked the free
54 * extent as busy and undergoing a discard operation whilst we held the AGF
57 * This is exactly how online discard works - free extents are marked busy when
58 * they are freed, and once the extent free has been committed to the journal,
59 * the busy extent record is marked as "undergoing discard" and the discard is
60 * then issued on the free extent. Once the discard completes, the busy extent
61 * record is removed and the extent is able to be allocated again.
63 * In the context of fstrim, if we find a free extent we need to discard, we
64 * don't have to discard it immediately. All we need to do it record that free
65 * extent as being busy and under discard, and all the allocation routines will
66 * now avoid trying to allocate it. Hence if we mark the extent as busy under
67 * the AGF lock, we can safely discard it without holding the AGF lock because
68 * nothing will attempt to allocate that free space until the discard completes.
70 * This also allows us to issue discards asynchronously like we do with online
71 * discard, and so for fast devices fstrim will run much faster as we can have
72 * multiple discard operations in flight at once, as well as pipeline the free
73 * extent search so that it overlaps in flight discard IO.
76 #define XFS_DISCARD_MAX_EXAMINE (100)
78 struct workqueue_struct *xfs_discard_wq;
81 xfs_discard_endio_work(
82 struct work_struct *work)
84 struct xfs_busy_extents *extents =
85 container_of(work, struct xfs_busy_extents, endio_work);
87 xfs_extent_busy_clear(&extents->extent_list, false);
88 kfree(extents->owner);
92 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
99 struct xfs_busy_extents *extents = bio->bi_private;
101 INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
102 queue_work(xfs_discard_wq, &extents->endio_work);
106 static inline struct block_device *
108 const struct xfs_group *xg)
110 struct xfs_mount *mp = xg->xg_mount;
112 switch (xg->xg_type) {
114 return mp->m_ddev_targp->bt_bdev;
116 return mp->m_rtdev_targp->bt_bdev;
125 * Walk the discard list and issue discards on all the busy extents in the
126 * list. We plug and chain the bios so that we only need a single completion
127 * call to clear all the busy extents once the discards are complete.
131 struct xfs_mount *mp,
132 struct xfs_busy_extents *extents)
134 struct xfs_extent_busy *busyp;
135 struct bio *bio = NULL;
136 struct blk_plug plug;
139 blk_start_plug(&plug);
140 list_for_each_entry(busyp, &extents->extent_list, list) {
141 trace_xfs_discard_extent(busyp->group, busyp->bno,
144 error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
145 xfs_gbno_to_daddr(busyp->group, busyp->bno),
146 XFS_FSB_TO_BB(mp, busyp->length),
148 if (error && error != -EOPNOTSUPP) {
150 "discard failed for extent [0x%llx,%u], error %d",
151 (unsigned long long)busyp->bno,
159 bio->bi_private = extents;
160 bio->bi_end_io = xfs_discard_endio;
163 xfs_discard_endio_work(&extents->endio_work);
165 blk_finish_plug(&plug);
170 struct xfs_trim_cur {
179 xfs_trim_gather_extents(
180 struct xfs_perag *pag,
181 struct xfs_trim_cur *tcur,
182 struct xfs_busy_extents *extents)
184 struct xfs_mount *mp = pag_mount(pag);
185 struct xfs_trans *tp;
186 struct xfs_btree_cur *cur;
187 struct xfs_buf *agbp;
190 int batch = XFS_DISCARD_MAX_EXAMINE;
193 * Force out the log. This means any transactions that might have freed
194 * space before we take the AGF buffer lock are now on disk, and the
195 * volatile disk cache is flushed.
197 xfs_log_force(mp, XFS_LOG_SYNC);
199 error = xfs_trans_alloc_empty(mp, &tp);
203 error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
205 goto out_trans_cancel;
208 /* sub-AG discard request always starts at tcur->start */
209 cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
210 error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
212 error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
213 } else if (tcur->start == 0) {
214 /* first time through a by-len starts with max length */
215 cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
216 error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
218 /* nth time through a by-len starts where we left off */
219 cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
220 error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
225 /* nothing of that length left in the AG, we are done */
231 * Loop until we are done with all extents that are large
232 * enough to be worth discarding or we hit batch limits.
238 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
241 if (XFS_IS_CORRUPT(mp, i != 1)) {
242 xfs_btree_mark_sick(cur);
243 error = -EFSCORRUPTED;
249 * Update the cursor to point at this extent so we
250 * restart the next batch from this extent.
258 * If the extent is entirely outside of the range we are
259 * supposed to skip it. Do not bother to trim down partially
260 * overlapping ranges for now.
262 if (fbno + flen < tcur->start) {
263 trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
266 if (fbno > tcur->end) {
267 trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
275 /* Trim the extent returned to the range we want. */
276 if (fbno < tcur->start) {
277 flen -= tcur->start - fbno;
280 if (fbno + flen > tcur->end + 1)
281 flen = tcur->end - fbno + 1;
283 /* Too small? Give up. */
284 if (flen < tcur->minlen) {
285 trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
293 * If any blocks in the range are still busy, skip the
294 * discard and try again the next time.
296 if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
297 trace_xfs_discard_busy(pag_group(pag), fbno, flen);
301 xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
302 &extents->extent_list);
305 error = xfs_btree_increment(cur, 0, &i);
307 error = xfs_btree_decrement(cur, 0, &i);
312 * If there's no more records in the tree, we are done. Set the
313 * cursor block count to 0 to indicate to the caller that there
314 * is no more extents to search.
321 * If there was an error, release all the gathered busy extents because
322 * we aren't going to issue a discard on them any more.
325 xfs_extent_busy_clear(&extents->extent_list, false);
327 xfs_btree_del_cursor(cur, error);
329 xfs_trans_cancel(tp);
334 xfs_trim_should_stop(void)
336 return fatal_signal_pending(current) || freezing(current);
340 * Iterate the free list gathering extents and discarding them. We need a cursor
341 * for the repeated iteration of gather/discard loop, so use the longest extent
342 * we found in the last batch as the key to start the next.
345 xfs_trim_perag_extents(
346 struct xfs_perag *pag,
351 struct xfs_trim_cur tcur = {
353 .count = pag->pagf_longest,
359 if (start != 0 || end != pag_group(pag)->xg_block_count)
363 struct xfs_busy_extents *extents;
365 extents = kzalloc(sizeof(*extents), GFP_KERNEL);
371 extents->owner = extents;
372 INIT_LIST_HEAD(&extents->extent_list);
374 error = xfs_trim_gather_extents(pag, &tcur, extents);
381 * We hand the extent list to the discard function here so the
382 * discarded extents can be removed from the busy extent list.
383 * This allows the discards to run asynchronously with gathering
384 * the next round of extents to discard.
386 * However, we must ensure that we do not reference the extent
387 * list after this function call, as it may have been freed by
388 * the time control returns to us.
390 error = xfs_discard_extents(pag_mount(pag), extents);
394 if (xfs_trim_should_stop())
397 } while (tcur.count != 0);
404 xfs_trim_datadev_extents(
405 struct xfs_mount *mp,
410 xfs_agnumber_t start_agno, end_agno;
411 xfs_agblock_t start_agbno, end_agbno;
412 struct xfs_perag *pag = NULL;
413 xfs_daddr_t ddev_end;
414 int last_error = 0, error;
416 ddev_end = min_t(xfs_daddr_t, end,
417 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
419 start_agno = xfs_daddr_to_agno(mp, start);
420 start_agbno = xfs_daddr_to_agbno(mp, start);
421 end_agno = xfs_daddr_to_agno(mp, ddev_end);
422 end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
424 while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
425 xfs_agblock_t agend = pag_group(pag)->xg_block_count;
427 if (pag_agno(pag) == end_agno)
429 error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
433 if (xfs_trim_should_stop()) {
444 struct xfs_trim_rtdev {
445 /* list of rt extents to free */
446 struct list_head extent_list;
448 /* minimum length that caller allows us to trim */
449 xfs_rtblock_t minlen_fsb;
451 /* restart point for the rtbitmap walk */
452 xfs_rtxnum_t restart_rtx;
454 /* stopping point for the current rtbitmap walk */
455 xfs_rtxnum_t stop_rtx;
458 struct xfs_rtx_busy {
459 struct list_head list;
461 xfs_rtblock_t length;
465 xfs_discard_free_rtdev_extents(
466 struct xfs_trim_rtdev *tr)
468 struct xfs_rtx_busy *busyp, *n;
470 list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
471 list_del_init(&busyp->list);
477 * Walk the discard list and issue discards on all the busy extents in the
478 * list. We plug and chain the bios so that we only need a single completion
479 * call to clear all the busy extents once the discards are complete.
482 xfs_discard_rtdev_extents(
483 struct xfs_mount *mp,
484 struct xfs_trim_rtdev *tr)
486 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
487 struct xfs_rtx_busy *busyp;
488 struct bio *bio = NULL;
489 struct blk_plug plug;
490 xfs_rtblock_t start = NULLRTBLOCK, length = 0;
493 blk_start_plug(&plug);
494 list_for_each_entry(busyp, &tr->extent_list, list) {
495 if (start == NULLRTBLOCK)
497 length += busyp->length;
499 trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
501 error = __blkdev_issue_discard(bdev,
502 xfs_rtb_to_daddr(mp, busyp->bno),
503 XFS_FSB_TO_BB(mp, busyp->length),
508 xfs_discard_free_rtdev_extents(tr);
511 error = submit_bio_wait(bio);
512 if (error == -EOPNOTSUPP)
516 "discard failed for rtextent [0x%llx,%llu], error %d",
517 (unsigned long long)start,
518 (unsigned long long)length,
522 blk_finish_plug(&plug);
528 xfs_trim_gather_rtextent(
529 struct xfs_rtgroup *rtg,
530 struct xfs_trans *tp,
531 const struct xfs_rtalloc_rec *rec,
534 struct xfs_trim_rtdev *tr = priv;
535 struct xfs_rtx_busy *busyp;
536 xfs_rtblock_t rbno, rlen;
538 if (rec->ar_startext > tr->stop_rtx) {
540 * If we've scanned a large number of rtbitmap blocks, update
541 * the cursor to point at this extent so we restart the next
542 * batch from this extent.
544 tr->restart_rtx = rec->ar_startext;
548 rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
549 rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);
551 /* Ignore too small. */
552 if (rlen < tr->minlen_fsb) {
553 trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
557 busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
562 busyp->length = rlen;
563 INIT_LIST_HEAD(&busyp->list);
564 list_add_tail(&busyp->list, &tr->extent_list);
566 tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
570 /* Trim extents on an !rtgroups realtime device */
573 struct xfs_rtgroup *rtg,
578 struct xfs_mount *mp = rtg_mount(rtg);
579 struct xfs_trim_rtdev tr = {
580 .minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
581 .extent_list = LIST_HEAD_INIT(tr.extent_list),
583 struct xfs_trans *tp;
586 error = xfs_trans_alloc_empty(mp, &tp);
591 * Walk the free ranges between low and high. The query_range function
592 * trims the extents returned.
595 tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
596 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
597 error = xfs_rtalloc_query_range(rtg, tp, low, high,
598 xfs_trim_gather_rtextent, &tr);
600 if (error == -ECANCELED)
603 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
604 xfs_discard_free_rtdev_extents(&tr);
608 if (list_empty(&tr.extent_list)) {
609 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
613 error = xfs_discard_rtdev_extents(mp, &tr);
614 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
618 low = tr.restart_rtx;
619 } while (!xfs_trim_should_stop() && low <= high);
621 xfs_trans_cancel(tp);
625 struct xfs_trim_rtgroup {
626 /* list of rtgroup extents to free */
627 struct xfs_busy_extents *extents;
629 /* minimum length that caller allows us to trim */
630 xfs_rtblock_t minlen_fsb;
632 /* restart point for the rtbitmap walk */
633 xfs_rtxnum_t restart_rtx;
635 /* number of extents to examine before stopping to issue discard ios */
638 /* number of extents queued for discard */
643 xfs_trim_gather_rtgroup_extent(
644 struct xfs_rtgroup *rtg,
645 struct xfs_trans *tp,
646 const struct xfs_rtalloc_rec *rec,
649 struct xfs_trim_rtgroup *tr = priv;
653 if (--tr->batch <= 0) {
655 * If we've checked a large number of extents, update the
656 * cursor to point at this extent so we restart the next batch
659 tr->restart_rtx = rec->ar_startext;
663 rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
664 len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);
666 /* Ignore too small. */
667 if (len < tr->minlen_fsb) {
668 trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
673 * If any blocks in the range are still busy, skip the discard and try
674 * again the next time.
676 if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
677 trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
681 xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
682 &tr->extents->extent_list);
685 tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
689 /* Trim extents in this rtgroup using the busy extent machinery. */
691 xfs_trim_rtgroup_extents(
692 struct xfs_rtgroup *rtg,
697 struct xfs_mount *mp = rtg_mount(rtg);
698 struct xfs_trim_rtgroup tr = {
699 .minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
701 struct xfs_trans *tp;
704 error = xfs_trans_alloc_empty(mp, &tp);
709 * Walk the free ranges between low and high. The query_range function
710 * trims the extents returned.
713 tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
720 tr.batch = XFS_DISCARD_MAX_EXAMINE;
721 tr.extents->owner = tr.extents;
722 INIT_LIST_HEAD(&tr.extents->extent_list);
724 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
725 error = xfs_rtalloc_query_range(rtg, tp, low, high,
726 xfs_trim_gather_rtgroup_extent, &tr);
727 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
728 if (error == -ECANCELED)
739 * We hand the extent list to the discard function here so the
740 * discarded extents can be removed from the busy extent list.
741 * This allows the discards to run asynchronously with
742 * gathering the next round of extents to discard.
744 * However, we must ensure that we do not reference the extent
745 * list after this function call, as it may have been freed by
746 * the time control returns to us.
748 error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
752 low = tr.restart_rtx;
753 } while (!xfs_trim_should_stop() && low <= high);
755 xfs_trans_cancel(tp);
760 xfs_trim_rtdev_extents(
761 struct xfs_mount *mp,
766 xfs_rtblock_t start_rtbno, end_rtbno;
767 xfs_rtxnum_t start_rtx, end_rtx;
768 xfs_rgnumber_t start_rgno, end_rgno;
769 xfs_daddr_t daddr_offset;
770 int last_error = 0, error;
771 struct xfs_rtgroup *rtg = NULL;
773 /* Shift the start and end downwards to match the rt device. */
774 daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
775 if (start > daddr_offset)
776 start -= daddr_offset;
779 start_rtbno = xfs_daddr_to_rtb(mp, start);
780 start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
781 start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);
783 if (end <= daddr_offset)
787 end_rtbno = xfs_daddr_to_rtb(mp, end);
788 end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
789 end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);
791 while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
792 xfs_rtxnum_t rtg_end = rtg->rtg_extents;
794 if (rtg_rgno(rtg) == end_rgno)
795 rtg_end = min(rtg_end, end_rtx);
797 if (xfs_has_rtgroups(mp))
798 error = xfs_trim_rtgroup_extents(rtg, start_rtx,
801 error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
806 if (xfs_trim_should_stop()) {
807 xfs_rtgroup_rele(rtg);
816 # define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP)
817 #endif /* CONFIG_XFS_RT */
820 * trim a range of the filesystem.
822 * Note: the parameters passed from userspace are byte ranges into the
823 * filesystem which does not match to the format we use for filesystem block
824 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
825 * is a linear address range. Hence we need to use DADDR based conversions and
826 * comparisons for determining the correct offset and regions to trim.
828 * The realtime device is mapped into the FITRIM "address space" immediately
829 * after the data device.
833 struct xfs_mount *mp,
834 struct fstrim_range __user *urange)
836 unsigned int granularity =
837 bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
838 struct block_device *rt_bdev = NULL;
839 struct fstrim_range range;
840 xfs_daddr_t start, end;
842 xfs_rfsblock_t max_blocks;
843 int error, last_error = 0;
845 if (!capable(CAP_SYS_ADMIN))
847 if (mp->m_rtdev_targp &&
848 bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
849 rt_bdev = mp->m_rtdev_targp->bt_bdev;
850 if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
854 granularity = max(granularity,
855 bdev_discard_granularity(rt_bdev));
858 * We haven't recovered the log, so we cannot use our bnobt-guided
859 * storage zapping commands.
861 if (xfs_has_norecovery(mp))
864 if (copy_from_user(&range, urange, sizeof(range)))
867 range.minlen = max_t(u64, granularity, range.minlen);
868 minlen = XFS_B_TO_FSB(mp, range.minlen);
871 * Truncating down the len isn't actually quite correct, but using
872 * BBTOB would mean we trivially get overflows for values
873 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
874 * used by the fstrim application. In the end it really doesn't
875 * matter as trimming blocks is an advisory interface.
877 max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
878 if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
879 range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
880 range.len < mp->m_sb.sb_blocksize)
883 start = BTOBB(range.start);
884 end = start + BTOBBT(range.len) - 1;
886 if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
887 error = xfs_trim_datadev_extents(mp, start, end, minlen);
892 if (rt_bdev && !xfs_trim_should_stop()) {
893 error = xfs_trim_rtdev_extents(mp, start, end, minlen);
901 range.len = min_t(unsigned long long, range.len,
902 XFS_FSB_TO_B(mp, max_blocks) - range.start);
903 if (copy_to_user(urange, &range, sizeof(range)))