fs/xfs/scrub/reap.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <[email protected]>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_btree.h"
  13 #include "xfs_log_format.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_sb.h"
  16 #include "xfs_inode.h"
  17 #include "xfs_alloc.h"
  18 #include "xfs_alloc_btree.h"
  19 #include "xfs_ialloc.h"
  20 #include "xfs_ialloc_btree.h"
  21 #include "xfs_rmap.h"
  22 #include "xfs_rmap_btree.h"
  23 #include "xfs_refcount_btree.h"
  24 #include "xfs_extent_busy.h"
  25 #include "xfs_ag.h"
  26 #include "xfs_ag_resv.h"
  27 #include "xfs_quota.h"
  28 #include "xfs_qm.h"
  29 #include "xfs_bmap.h"
  30 #include "xfs_da_format.h"
  31 #include "xfs_da_btree.h"
  32 #include "xfs_attr.h"
  33 #include "xfs_attr_remote.h"
  34 #include "scrub/scrub.h"
  35 #include "scrub/common.h"
  36 #include "scrub/trace.h"
  37 #include "scrub/repair.h"
  38 #include "scrub/bitmap.h"
  39 #include "scrub/reap.h"
  40
  41 /*
  42  * Disposal of Blocks from Old Metadata
  43  *
  44  * Now that we've constructed a new btree to replace the damaged one, we want
  45  * to dispose of the blocks that (we think) the old btree was using.
  46  * Previously, we used the rmapbt to collect the extents (bitmap) with the
  47  * rmap owner corresponding to the tree we rebuilt, collected extents for any
  48  * blocks with the same rmap owner that are owned by another data structure
  49  * (sublist), and subtracted sublist from bitmap.  In theory the extents
  50  * remaining in bitmap are the old btree's blocks.
  51  *
  52  * Unfortunately, it's possible that the btree was crosslinked with other
  53  * blocks on disk.  The rmap data can tell us if there are multiple owners, so
  54  * if the rmapbt says there is an owner of this block other than @oinfo, then
  55  * the block is crosslinked.  Remove the reverse mapping and continue.
  56  *
  57  * If there is one rmap record, we can free the block, which removes the
  58  * reverse mapping but doesn't add the block to the free space.  Our repair
  59  * strategy is to hope the other metadata objects crosslinked on this block
  60  * will be rebuilt (atop different blocks), thereby removing all the cross
  61  * links.
  62  *
  63  * If there are no rmap records at all, we also free the block.  If the btree
  64  * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
  65  * supposed to be a rmap record and everything is ok.  For other btrees there
  66  * had to have been an rmap entry for the block to have ended up on @bitmap,
  67  * so if it's gone now there's something wrong and the fs will shut down.
  68  *
  69  * Note: If there are multiple rmap records with only the same rmap owner as
  70  * the btree we're trying to rebuild and the block is indeed owned by another
  71  * data structure with the same rmap owner, then the block will be in sublist
  72  * and therefore doesn't need disposal.  If there are multiple rmap records
  73  * with only the same rmap owner but the block is not owned by something with
  74  * the same rmap owner, the block will be freed.
  75  *
  76  * The caller is responsible for locking the AG headers for the entire rebuild
  77  * operation so that nothing else can sneak in and change the AG state while
  78  * we're not looking.  We must also invalidate any buffers associated with
  79  * @bitmap.
  80  */
  81
  82 /* Information about reaping extents after a repair. */
  83 struct xreap_state {
  84         struct xfs_scrub                *sc;
  85
  86         /* Reverse mapping owner and metadata reservation type. */
  87         const struct xfs_owner_info     *oinfo;
  88         enum xfs_ag_resv_type           resv;
  89
  90         /* If true, roll the transaction before reaping the next extent. */
  91         bool                            force_roll;
  92
  93         /* Number of deferred reaps attached to the current transaction. */
  94         unsigned int                    deferred;
  95
  96         /* Number of invalidated buffers logged to the current transaction. */
  97         unsigned int                    invalidated;
  98
  99         /* Number of deferred reaps queued during the whole reap sequence. */
 100         unsigned long long              total_deferred;
 101 };
 102
 103 /* Put a block back on the AGFL. */
 104 STATIC int
 105 xreap_put_freelist(
 106         struct xfs_scrub        *sc,
 107         xfs_agblock_t           agbno)
 108 {
 109         struct xfs_buf          *agfl_bp;
 110         int                     error;
 111
 112         /* Make sure there's space on the freelist. */
 113         error = xrep_fix_freelist(sc, true);
 114         if (error)
 115                 return error;
 116
 117         /*
 118          * Since we're "freeing" a lost block onto the AGFL, we have to
 119          * create an rmap for the block prior to merging it or else other
 120          * parts will break.
 121          */
 122         error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
 123                         &XFS_RMAP_OINFO_AG);
 124         if (error)
 125                 return error;
 126
 127         /* Put the block on the AGFL. */
 128         error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
 129         if (error)
 130                 return error;
 131
 132         error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
 133                         agfl_bp, agbno, 0);
 134         if (error)
 135                 return error;
 136         xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
 137                         XFS_EXTENT_BUSY_SKIP_DISCARD);
 138
 139         return 0;
 140 }
 141
 142 /* Are there any uncommitted reap operations? */
 143 static inline bool xreap_dirty(const struct xreap_state *rs)
 144 {
 145         if (rs->force_roll)
 146                 return true;
 147         if (rs->deferred)
 148                 return true;
 149         if (rs->invalidated)
 150                 return true;
 151         if (rs->total_deferred)
 152                 return true;
 153         return false;
 154 }
 155
 156 #define XREAP_MAX_BINVAL        (2048)
 157
 158 /*
 159  * Decide if we want to roll the transaction after reaping an extent.  We don't
 160  * want to overrun the transaction reservation, so we prohibit more than
 161  * 128 EFIs per transaction.  For the same reason, we limit the number
 162  * of buffer invalidations to 2048.
 163  */
 164 static inline bool xreap_want_roll(const struct xreap_state *rs)
 165 {
 166         if (rs->force_roll)
 167                 return true;
 168         if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
 169                 return true;
 170         if (rs->invalidated > XREAP_MAX_BINVAL)
 171                 return true;
 172         return false;
 173 }
 174
 175 static inline void xreap_reset(struct xreap_state *rs)
 176 {
 177         rs->total_deferred += rs->deferred;
 178         rs->deferred = 0;
 179         rs->invalidated = 0;
 180         rs->force_roll = false;
 181 }
 182
 183 #define XREAP_MAX_DEFER_CHAIN           (2048)
 184
 185 /*
 186  * Decide if we want to finish the deferred ops that are attached to the scrub
 187  * transaction.  We don't want to queue huge chains of deferred ops because
 188  * that can consume a lot of log space and kernel memory.  Hence we trigger a
 189  * xfs_defer_finish if there are more than 2048 deferred reap operations or the
 190  * caller did some real work.
 191  */
 192 static inline bool
 193 xreap_want_defer_finish(const struct xreap_state *rs)
 194 {
 195         if (rs->force_roll)
 196                 return true;
 197         if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
 198                 return true;
 199         return false;
 200 }
 201
 202 static inline void xreap_defer_finish_reset(struct xreap_state *rs)
 203 {
 204         rs->total_deferred = 0;
 205         rs->deferred = 0;
 206         rs->invalidated = 0;
 207         rs->force_roll = false;
 208 }
 209
 210 /* Try to invalidate the incore buffers for an extent that we're freeing. */
 211 STATIC void
 212 xreap_agextent_binval(
 213         struct xreap_state      *rs,
 214         xfs_agblock_t           agbno,
 215         xfs_extlen_t            *aglenp)
 216 {
 217         struct xfs_scrub        *sc = rs->sc;
 218         struct xfs_perag        *pag = sc->sa.pag;
 219         struct xfs_mount        *mp = sc->mp;
 220         xfs_agnumber_t          agno = sc->sa.pag->pag_agno;
 221         xfs_agblock_t           agbno_next = agbno + *aglenp;
 222         xfs_agblock_t           bno = agbno;
 223
 224         /*
 225          * Avoid invalidating AG headers and post-EOFS blocks because we never
 226          * own those.
 227          */
 228         if (!xfs_verify_agbno(pag, agbno) ||
 229             !xfs_verify_agbno(pag, agbno_next - 1))
 230                 return;
 231
 232         /*
 233          * If there are incore buffers for these blocks, invalidate them.  We
 234          * assume that the lack of any other known owners means that the buffer
 235          * can be locked without risk of deadlocking.  The buffer cache cannot
 236          * detect aliasing, so employ nested loops to scan for incore buffers
 237          * of any plausible size.
 238          */
 239         while (bno < agbno_next) {
 240                 xfs_agblock_t   fsbcount;
 241                 xfs_agblock_t   max_fsbs;
 242
 243                 /*
 244                  * Max buffer size is the max remote xattr buffer size, which
 245                  * is one fs block larger than 64k.
 246                  */
 247                 max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
 248                                 xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
 249
 250                 for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) {
 251                         struct xfs_buf  *bp = NULL;
 252                         xfs_daddr_t     daddr;
 253                         int             error;
 254
 255                         daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
 256                         error = xfs_buf_incore(mp->m_ddev_targp, daddr,
 257                                         XFS_FSB_TO_BB(mp, fsbcount),
 258                                         XBF_LIVESCAN, &bp);
 259                         if (error)
 260                                 continue;
 261
 262                         xfs_trans_bjoin(sc->tp, bp);
 263                         xfs_trans_binval(sc->tp, bp);
 264                         rs->invalidated++;
 265
 266                         /*
 267                          * Stop invalidating if we've hit the limit; we should
 268                          * still have enough reservation left to free however
 269                          * far we've gotten.
 270                          */
 271                         if (rs->invalidated > XREAP_MAX_BINVAL) {
 272                                 *aglenp -= agbno_next - bno;
 273                                 goto out;
 274                         }
 275                 }
 276
 277                 bno++;
 278         }
 279
 280 out:
 281         trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
 282 }
 283
 284 /*
 285  * Figure out the longest run of blocks that we can dispose of with a single
 286  * call.  Cross-linked blocks should have their reverse mappings removed, but
 287  * single-owner extents can be freed.  AGFL blocks can only be put back one at
 288  * a time.
 289  */
 290 STATIC int
 291 xreap_agextent_select(
 292         struct xreap_state      *rs,
 293         xfs_agblock_t           agbno,
 294         xfs_agblock_t           agbno_next,
 295         bool                    *crosslinked,
 296         xfs_extlen_t            *aglenp)
 297 {
 298         struct xfs_scrub        *sc = rs->sc;
 299         struct xfs_btree_cur    *cur;
 300         xfs_agblock_t           bno = agbno + 1;
 301         xfs_extlen_t            len = 1;
 302         int                     error;
 303
 304         /*
 305          * Determine if there are any other rmap records covering the first
 306          * block of this extent.  If so, the block is crosslinked.
 307          */
 308         cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
 309                         sc->sa.pag);
 310         error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
 311                         crosslinked);
 312         if (error)
 313                 goto out_cur;
 314
 315         /* AGFL blocks can only be deal with one at a time. */
 316         if (rs->resv == XFS_AG_RESV_AGFL)
 317                 goto out_found;
 318
 319         /*
 320          * Figure out how many of the subsequent blocks have the same crosslink
 321          * status.
 322          */
 323         while (bno < agbno_next) {
 324                 bool            also_crosslinked;
 325
 326                 error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
 327                                 &also_crosslinked);
 328                 if (error)
 329                         goto out_cur;
 330
 331                 if (*crosslinked != also_crosslinked)
 332                         break;
 333
 334                 len++;
 335                 bno++;
 336         }
 337
 338 out_found:
 339         *aglenp = len;
 340         trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
 341 out_cur:
 342         xfs_btree_del_cursor(cur, error);
 343         return error;
 344 }
 345
 346 /*
 347  * Dispose of as much of the beginning of this AG extent as possible.  The
 348  * number of blocks disposed of will be returned in @aglenp.
 349  */
 350 STATIC int
 351 xreap_agextent_iter(
 352         struct xreap_state      *rs,
 353         xfs_agblock_t           agbno,
 354         xfs_extlen_t            *aglenp,
 355         bool                    crosslinked)
 356 {
 357         struct xfs_scrub        *sc = rs->sc;
 358         xfs_fsblock_t           fsbno;
 359         int                     error = 0;
 360
 361         fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
 362
 363         /*
 364          * If there are other rmappings, this block is cross linked and must
 365          * not be freed.  Remove the reverse mapping and move on.  Otherwise,
 366          * we were the only owner of the block, so free the extent, which will
 367          * also remove the rmap.
 368          *
 369          * XXX: XFS doesn't support detecting the case where a single block
 370          * metadata structure is crosslinked with a multi-block structure
 371          * because the buffer cache doesn't detect aliasing problems, so we
 372          * can't fix 100% of crosslinking problems (yet).  The verifiers will
 373          * blow on writeout, the filesystem will shut down, and the admin gets
 374          * to run xfs_repair.
 375          */
 376         if (crosslinked) {
 377                 trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
 378
 379                 rs->force_roll = true;
 380                 return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
 381                                 *aglenp, rs->oinfo);
 382         }
 383
 384         trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
 385
 386         /*
 387          * Invalidate as many buffers as we can, starting at agbno.  If this
 388          * function sets *aglenp to zero, the transaction is full of logged
 389          * buffer invalidations, so we need to return early so that we can
 390          * roll and retry.
 391          */
 392         xreap_agextent_binval(rs, agbno, aglenp);
 393         if (*aglenp == 0) {
 394                 ASSERT(xreap_want_roll(rs));
 395                 return 0;
 396         }
 397
 398         /* Put blocks back on the AGFL one at a time. */
 399         if (rs->resv == XFS_AG_RESV_AGFL) {
 400                 ASSERT(*aglenp == 1);
 401                 error = xreap_put_freelist(sc, agbno);
 402                 if (error)
 403                         return error;
 404
 405                 rs->force_roll = true;
 406                 return 0;
 407         }
 408
 409         /*
 410          * Use deferred frees to get rid of the old btree blocks to try to
 411          * minimize the window in which we could crash and lose the old blocks.
 412          */
 413         error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
 414                         rs->resv, true);
 415         if (error)
 416                 return error;
 417
 418         rs->deferred++;
 419         return 0;
 420 }
 421
 422 /*
 423  * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 424  * crosslinked), and dispose of each sub-extent separately.
 425  */
 426 STATIC int
 427 xreap_agmeta_extent(
 428         uint64_t                fsbno,
 429         uint64_t                len,
 430         void                    *priv)
 431 {
 432         struct xreap_state      *rs = priv;
 433         struct xfs_scrub        *sc = rs->sc;
 434         xfs_agblock_t           agbno = fsbno;
 435         xfs_agblock_t           agbno_next = agbno + len;
 436         int                     error = 0;
 437
 438         ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
 439         ASSERT(sc->ip == NULL);
 440
 441         while (agbno < agbno_next) {
 442                 xfs_extlen_t    aglen;
 443                 bool            crosslinked;
 444
 445                 error = xreap_agextent_select(rs, agbno, agbno_next,
 446                                 &crosslinked, &aglen);
 447                 if (error)
 448                         return error;
 449
 450                 error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
 451                 if (error)
 452                         return error;
 453
 454                 if (xreap_want_defer_finish(rs)) {
 455                         error = xrep_defer_finish(sc);
 456                         if (error)
 457                                 return error;
 458                         xreap_defer_finish_reset(rs);
 459                 } else if (xreap_want_roll(rs)) {
 460                         error = xrep_roll_ag_trans(sc);
 461                         if (error)
 462                                 return error;
 463                         xreap_reset(rs);
 464                 }
 465
 466                 agbno += aglen;
 467         }
 468
 469         return 0;
 470 }
 471
 472 /* Dispose of every block of every AG metadata extent in the bitmap. */
 473 int
 474 xrep_reap_agblocks(
 475         struct xfs_scrub                *sc,
 476         struct xagb_bitmap              *bitmap,
 477         const struct xfs_owner_info     *oinfo,
 478         enum xfs_ag_resv_type           type)
 479 {
 480         struct xreap_state              rs = {
 481                 .sc                     = sc,
 482                 .oinfo                  = oinfo,
 483                 .resv                   = type,
 484         };
 485         int                             error;
 486
 487         ASSERT(xfs_has_rmapbt(sc->mp));
 488         ASSERT(sc->ip == NULL);
 489
 490         error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
 491         if (error)
 492                 return error;
 493
 494         if (xreap_dirty(&rs))
 495                 return xrep_defer_finish(sc);
 496
 497         return 0;
 498 }