// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_btree.h"
13 #include "xfs_btree_staging.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
17 #include "xfs_inode.h"
18 #include "xfs_alloc.h"
21 #include "xfs_defer.h"
22 #include "scrub/scrub.h"
23 #include "scrub/common.h"
24 #include "scrub/trace.h"
25 #include "scrub/repair.h"
26 #include "scrub/newbt.h"
/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.
 *
 * In the latter two cases, format the new btree blocks almost completely
 * full to minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}
/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}
/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}
/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}
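
/*
 * Example (hypothetical, for illustration only): the usual lifecycle of an
 * xrep_newbt.  A repair function asks the bulk loader how big the new btree
 * will be, reserves that much space, bulk-loads the staged btree, and then
 * commits or cancels the reservation.  The function name and error policy
 * here are invented; the xrep_newbt and xfs_btree_bload calls are the real
 * interfaces used by the repair code.
 */
static int
xrep_example_stage_btree(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*bt_cur,
	uint64_t		nr_records)
{
	struct xrep_newbt	xnr;
	int			error;

	xrep_newbt_init_ag(&xnr, sc, &XFS_RMAP_OINFO_AG, NULLFSBLOCK,
			XFS_AG_RESV_NONE);

	/* Compute the geometry (and hence block count) of the new btree. */
	error = xfs_btree_bload_compute_geometry(bt_cur, &xnr.bload,
			nr_records);
	if (error)
		goto out_cancel;

	/* Reserve enough disk space to hold the fully loaded btree. */
	error = xrep_newbt_alloc_blocks(&xnr, xnr.bload.nr_blocks);
	if (error)
		goto out_cancel;

	/*
	 * A real caller would now set xnr.bload.get_records and
	 * xnr.bload.claim_block and call xfs_btree_bload() to write out the
	 * new btree; see the claim_block example further down.
	 */

	/* Success; free only the unused part of the reservation. */
	return xrep_newbt_commit(&xnr);
out_cancel:
	xrep_newbt_cancel(&xnr);
	return error;
}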
/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	const struct xfs_alloc_arg *args)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xrep_newbt_resv	*resv;
	int			error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args, true,
				&resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}
/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}
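
/*
 * Example (hypothetical): a caller that found its own blocks (here an
 * agbno/len pair obtained elsewhere) can seed the reservation pool directly
 * instead of allocating.  Per the comments above, "bare" callers must reap
 * these reservations themselves if the repair is cancelled.
 */
static int
xrep_example_use_own_blocks(
	struct xfs_scrub	*sc,
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	xrep_newbt_init_bare(xnr, sc);
	return xrep_newbt_add_extent(xnr, pag, agbno, len);
}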
/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == sc->sa.pag->pag_agno &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
					 XFS_AGFL_BLOCK(sc->mp) + 1);
}
/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}
/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}
/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}
/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	xfs_fsblock_t		fsbno;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
			xnr->resv, true);
	if (error)
		return error;

	return 1;
}
/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int	ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}
/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}
/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit.  We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}
/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
			xnr->oinfo.oi_owner);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
								agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}
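
/*
 * Example (hypothetical): repair code typically feeds reserved blocks to
 * xfs_btree_bload() by pointing the bload's claim_block callback at a thin
 * wrapper like this one.  struct xrep_example_repair and the priv plumbing
 * are invented for illustration; the callback signature matches
 * xfs_btree_bload_claim_block_fn.
 */
struct xrep_example_repair {
	struct xrep_newbt	new_btree;
};

static int
xrep_example_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_example_repair	*rr = priv;

	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
}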
/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}