fs/xfs/scrub/common.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <[email protected]>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_btree.h"
  13 #include "xfs_log_format.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_inode.h"
  16 #include "xfs_icache.h"
  17 #include "xfs_alloc.h"
  18 #include "xfs_alloc_btree.h"
  19 #include "xfs_ialloc.h"
  20 #include "xfs_ialloc_btree.h"
  21 #include "xfs_refcount_btree.h"
  22 #include "xfs_rmap.h"
  23 #include "xfs_rmap_btree.h"
  24 #include "xfs_log.h"
  25 #include "xfs_trans_priv.h"
  26 #include "xfs_da_format.h"
  27 #include "xfs_da_btree.h"
  28 #include "xfs_dir2_priv.h"
  29 #include "xfs_dir2.h"
  30 #include "xfs_attr.h"
  31 #include "xfs_reflink.h"
  32 #include "xfs_ag.h"
  33 #include "xfs_error.h"
  34 #include "xfs_quota.h"
  35 #include "xfs_exchmaps.h"
  36 #include "xfs_rtbitmap.h"
  37 #include "xfs_rtgroup.h"
  38 #include "scrub/scrub.h"
  39 #include "scrub/common.h"
  40 #include "scrub/trace.h"
  41 #include "scrub/repair.h"
  42 #include "scrub/health.h"
  43 #include "scrub/tempfile.h"
  44
  45 /* Common code for the metadata scrubbers. */
  46
  47 /*
  48  * Handling operational errors.
  49  *
  50  * The *_process_error() family of functions are used to process error return
  51  * codes from functions called as part of a scrub operation.
  52  *
  53  * If there's no error, we return true to tell the caller that it's ok
  54  * to move on to the next check in its list.
  55  *
  56  * For non-verifier errors (e.g. ENOMEM) we return false to tell the
  57  * caller that something bad happened, and we preserve *error so that
  58  * the caller can return the *error up the stack to userspace.
  59  *
  60  * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
  61  * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
  62  * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
  63  * not via return codes.  We return false to tell the caller that
  64  * something bad happened.  Since the error has been cleared, the caller
  65  * will (presumably) return that zero and scrubbing will move on to
  66  * whatever's next.
  67  *
  68  * ftrace can be used to record the precise metadata location and the
  69  * approximate code location of the failed operation.
  70  */
  71
  72 /* Check for operational errors. */
  73 static bool
  74 __xchk_process_error(
  75         struct xfs_scrub        *sc,
  76         xfs_agnumber_t          agno,
  77         xfs_agblock_t           bno,
  78         int                     *error,
  79         __u32                   errflag,
  80         void                    *ret_ip)
  81 {
  82         switch (*error) {
  83         case 0:
  84                 return true;
  85         case -EDEADLOCK:
  86         case -ECHRNG:
  87                 /* Used to restart an op with deadlock avoidance. */
  88                 trace_xchk_deadlock_retry(
  89                                 sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
  90                                 sc->sm, *error);
  91                 break;
  92         case -ECANCELED:
  93                 /*
  94                  * ECANCELED here means that the caller set one of the scrub
  95                  * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
  96                  * quickly.  Set error to zero and do not continue.
  97                  */
  98                 trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
  99                 *error = 0;
 100                 break;
 101         case -EFSBADCRC:
 102         case -EFSCORRUPTED:
 103                 /* Note the badness but don't abort. */
 104                 sc->sm->sm_flags |= errflag;
 105                 *error = 0;
 106                 fallthrough;
 107         default:
 108                 trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
 109                 break;
 110         }
 111         return false;
 112 }
 113
 114 bool
 115 xchk_process_error(
 116         struct xfs_scrub        *sc,
 117         xfs_agnumber_t          agno,
 118         xfs_agblock_t           bno,
 119         int                     *error)
 120 {
 121         return __xchk_process_error(sc, agno, bno, error,
 122                         XFS_SCRUB_OFLAG_CORRUPT, __return_address);
 123 }
 124
 125 bool
 126 xchk_process_rt_error(
 127         struct xfs_scrub        *sc,
 128         xfs_rgnumber_t          rgno,
 129         xfs_rgblock_t           rgbno,
 130         int                     *error)
 131 {
 132         return __xchk_process_error(sc, rgno, rgbno, error,
 133                         XFS_SCRUB_OFLAG_CORRUPT, __return_address);
 134 }
 135
 136 bool
 137 xchk_xref_process_error(
 138         struct xfs_scrub        *sc,
 139         xfs_agnumber_t          agno,
 140         xfs_agblock_t           bno,
 141         int                     *error)
 142 {
 143         return __xchk_process_error(sc, agno, bno, error,
 144                         XFS_SCRUB_OFLAG_XFAIL, __return_address);
 145 }
 146
 147 /* Check for operational errors for a file offset. */
 148 static bool
 149 __xchk_fblock_process_error(
 150         struct xfs_scrub        *sc,
 151         int                     whichfork,
 152         xfs_fileoff_t           offset,
 153         int                     *error,
 154         __u32                   errflag,
 155         void                    *ret_ip)
 156 {
 157         switch (*error) {
 158         case 0:
 159                 return true;
 160         case -EDEADLOCK:
 161         case -ECHRNG:
 162                 /* Used to restart an op with deadlock avoidance. */
 163                 trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
 164                 break;
 165         case -ECANCELED:
 166                 /*
 167                  * ECANCELED here means that the caller set one of the scrub
 168                  * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
 169                  * quickly.  Set error to zero and do not continue.
 170                  */
 171                 trace_xchk_file_op_error(sc, whichfork, offset, *error,
 172                                 ret_ip);
 173                 *error = 0;
 174                 break;
 175         case -EFSBADCRC:
 176         case -EFSCORRUPTED:
 177                 /* Note the badness but don't abort. */
 178                 sc->sm->sm_flags |= errflag;
 179                 *error = 0;
 180                 fallthrough;
 181         default:
 182                 trace_xchk_file_op_error(sc, whichfork, offset, *error,
 183                                 ret_ip);
 184                 break;
 185         }
 186         return false;
 187 }
 188
 189 bool
 190 xchk_fblock_process_error(
 191         struct xfs_scrub        *sc,
 192         int                     whichfork,
 193         xfs_fileoff_t           offset,
 194         int                     *error)
 195 {
 196         return __xchk_fblock_process_error(sc, whichfork, offset, error,
 197                         XFS_SCRUB_OFLAG_CORRUPT, __return_address);
 198 }
 199
 200 bool
 201 xchk_fblock_xref_process_error(
 202         struct xfs_scrub        *sc,
 203         int                     whichfork,
 204         xfs_fileoff_t           offset,
 205         int                     *error)
 206 {
 207         return __xchk_fblock_process_error(sc, whichfork, offset, error,
 208                         XFS_SCRUB_OFLAG_XFAIL, __return_address);
 209 }
 210
 211 /*
 212  * Handling scrub corruption/optimization/warning checks.
 213  *
 214  * The *_set_{corrupt,preen,warning}() family of functions are used to
 215  * record the presence of metadata that is incorrect (corrupt), could be
 216  * optimized somehow (preen), or should be flagged for administrative
 217  * review but is not incorrect (warn).
 218  *
 219  * ftrace can be used to record the precise metadata location and
 220  * approximate code location of the failed check.
 221  */
 222
 223 /* Record a block which could be optimized. */
 224 void
 225 xchk_block_set_preen(
 226         struct xfs_scrub        *sc,
 227         struct xfs_buf          *bp)
 228 {
 229         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
 230         trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
 231 }
 232
 233 /*
 234  * Record an inode which could be optimized.  The trace data will
 235  * include the block given by bp if bp is given; otherwise it will use
 236  * the block location of the inode record itself.
 237  */
 238 void
 239 xchk_ino_set_preen(
 240         struct xfs_scrub        *sc,
 241         xfs_ino_t               ino)
 242 {
 243         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
 244         trace_xchk_ino_preen(sc, ino, __return_address);
 245 }
 246
 247 /* Record something being wrong with the filesystem primary superblock. */
 248 void
 249 xchk_set_corrupt(
 250         struct xfs_scrub        *sc)
 251 {
 252         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 253         trace_xchk_fs_error(sc, 0, __return_address);
 254 }
 255
 256 /* Record a corrupt block. */
 257 void
 258 xchk_block_set_corrupt(
 259         struct xfs_scrub        *sc,
 260         struct xfs_buf          *bp)
 261 {
 262         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 263         trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
 264 }
 265
 266 #ifdef CONFIG_XFS_QUOTA
 267 /* Record a corrupt quota counter. */
 268 void
 269 xchk_qcheck_set_corrupt(
 270         struct xfs_scrub        *sc,
 271         unsigned int            dqtype,
 272         xfs_dqid_t              id)
 273 {
 274         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 275         trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
 276 }
 277 #endif
 278
 279 /* Record a corruption while cross-referencing. */
 280 void
 281 xchk_block_xref_set_corrupt(
 282         struct xfs_scrub        *sc,
 283         struct xfs_buf          *bp)
 284 {
 285         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 286         trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
 287 }
 288
 289 /*
 290  * Record a corrupt inode.  The trace data will include the block given
 291  * by bp if bp is given; otherwise it will use the block location of the
 292  * inode record itself.
 293  */
 294 void
 295 xchk_ino_set_corrupt(
 296         struct xfs_scrub        *sc,
 297         xfs_ino_t               ino)
 298 {
 299         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 300         trace_xchk_ino_error(sc, ino, __return_address);
 301 }
 302
 303 /* Record a corruption while cross-referencing with an inode. */
 304 void
 305 xchk_ino_xref_set_corrupt(
 306         struct xfs_scrub        *sc,
 307         xfs_ino_t               ino)
 308 {
 309         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 310         trace_xchk_ino_error(sc, ino, __return_address);
 311 }
 312
 313 /* Record corruption in a block indexed by a file fork. */
 314 void
 315 xchk_fblock_set_corrupt(
 316         struct xfs_scrub        *sc,
 317         int                     whichfork,
 318         xfs_fileoff_t           offset)
 319 {
 320         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 321         trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
 322 }
 323
 324 /* Record a corruption while cross-referencing a fork block. */
 325 void
 326 xchk_fblock_xref_set_corrupt(
 327         struct xfs_scrub        *sc,
 328         int                     whichfork,
 329         xfs_fileoff_t           offset)
 330 {
 331         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 332         trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
 333 }
 334
 335 /*
 336  * Warn about inodes that need administrative review but is not
 337  * incorrect.
 338  */
 339 void
 340 xchk_ino_set_warning(
 341         struct xfs_scrub        *sc,
 342         xfs_ino_t               ino)
 343 {
 344         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
 345         trace_xchk_ino_warning(sc, ino, __return_address);
 346 }
 347
 348 /* Warn about a block indexed by a file fork that needs review. */
 349 void
 350 xchk_fblock_set_warning(
 351         struct xfs_scrub        *sc,
 352         int                     whichfork,
 353         xfs_fileoff_t           offset)
 354 {
 355         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
 356         trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
 357 }
 358
 359 /* Signal an incomplete scrub. */
 360 void
 361 xchk_set_incomplete(
 362         struct xfs_scrub        *sc)
 363 {
 364         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
 365         trace_xchk_incomplete(sc, __return_address);
 366 }
 367
 368 /*
 369  * rmap scrubbing -- compute the number of blocks with a given owner,
 370  * at least according to the reverse mapping data.
 371  */
 372
 373 struct xchk_rmap_ownedby_info {
 374         const struct xfs_owner_info     *oinfo;
 375         xfs_filblks_t                   *blocks;
 376 };
 377
 378 STATIC int
 379 xchk_count_rmap_ownedby_irec(
 380         struct xfs_btree_cur            *cur,
 381         const struct xfs_rmap_irec      *rec,
 382         void                            *priv)
 383 {
 384         struct xchk_rmap_ownedby_info   *sroi = priv;
 385         bool                            irec_attr;
 386         bool                            oinfo_attr;
 387
 388         irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
 389         oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
 390
 391         if (rec->rm_owner != sroi->oinfo->oi_owner)
 392                 return 0;
 393
 394         if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
 395                 (*sroi->blocks) += rec->rm_blockcount;
 396
 397         return 0;
 398 }
 399
 400 /*
 401  * Calculate the number of blocks the rmap thinks are owned by something.
 402  * The caller should pass us an rmapbt cursor.
 403  */
 404 int
 405 xchk_count_rmap_ownedby_ag(
 406         struct xfs_scrub                *sc,
 407         struct xfs_btree_cur            *cur,
 408         const struct xfs_owner_info     *oinfo,
 409         xfs_filblks_t                   *blocks)
 410 {
 411         struct xchk_rmap_ownedby_info   sroi = {
 412                 .oinfo                  = oinfo,
 413                 .blocks                 = blocks,
 414         };
 415
 416         *blocks = 0;
 417         return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
 418                         &sroi);
 419 }
 420
 421 /*
 422  * AG scrubbing
 423  *
 424  * These helpers facilitate locking an allocation group's header
 425  * buffers, setting up cursors for all btrees that are present, and
 426  * cleaning everything up once we're through.
 427  */
 428
 429 /* Decide if we want to return an AG header read failure. */
 430 static inline bool
 431 want_ag_read_header_failure(
 432         struct xfs_scrub        *sc,
 433         unsigned int            type)
 434 {
 435         /* Return all AG header read failures when scanning btrees. */
 436         if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
 437             sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
 438             sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
 439                 return true;
 440         /*
 441          * If we're scanning a given type of AG header, we only want to
 442          * see read failures from that specific header.  We'd like the
 443          * other headers to cross-check them, but this isn't required.
 444          */
 445         if (sc->sm->sm_type == type)
 446                 return true;
 447         return false;
 448 }
 449
 450 /*
 451  * Grab the AG header buffers for the attached perag structure.
 452  *
 453  * The headers should be released by xchk_ag_free, but as a fail safe we attach
 454  * all the buffers we grab to the scrub transaction so they'll all be freed
 455  * when we cancel it.
 456  */
 457 static inline int
 458 xchk_perag_read_headers(
 459         struct xfs_scrub        *sc,
 460         struct xchk_ag          *sa)
 461 {
 462         int                     error;
 463
 464         error = xfs_ialloc_read_agi(sa->pag, sc->tp, 0, &sa->agi_bp);
 465         if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
 466                 return error;
 467
 468         error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
 469         if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
 470                 return error;
 471
 472         return 0;
 473 }
 474
 475 /*
 476  * Grab the AG headers for the attached perag structure and wait for pending
 477  * intents to drain.
 478  */
 479 int
 480 xchk_perag_drain_and_lock(
 481         struct xfs_scrub        *sc)
 482 {
 483         struct xchk_ag          *sa = &sc->sa;
 484         int                     error = 0;
 485
 486         ASSERT(sa->pag != NULL);
 487         ASSERT(sa->agi_bp == NULL);
 488         ASSERT(sa->agf_bp == NULL);
 489
 490         do {
 491                 if (xchk_should_terminate(sc, &error))
 492                         return error;
 493
 494                 error = xchk_perag_read_headers(sc, sa);
 495                 if (error)
 496                         return error;
 497
 498                 /*
 499                  * If we've grabbed an inode for scrubbing then we assume that
 500                  * holding its ILOCK will suffice to coordinate with any intent
 501                  * chains involving this inode.
 502                  */
 503                 if (sc->ip)
 504                         return 0;
 505
 506                 /*
 507                  * Decide if this AG is quiet enough for all metadata to be
 508                  * consistent with each other.  XFS allows the AG header buffer
 509                  * locks to cycle across transaction rolls while processing
 510                  * chains of deferred ops, which means that there could be
 511                  * other threads in the middle of processing a chain of
 512                  * deferred ops.  For regular operations we are careful about
 513                  * ordering operations to prevent collisions between threads
 514                  * (which is why we don't need a per-AG lock), but scrub and
 515                  * repair have to serialize against chained operations.
 516                  *
 517                  * We just locked all the AG headers buffers; now take a look
 518                  * to see if there are any intents in progress.  If there are,
 519                  * drop the AG headers and wait for the intents to drain.
 520                  * Since we hold all the AG header locks for the duration of
 521                  * the scrub, this is the only time we have to sample the
 522                  * intents counter; any threads increasing it after this point
 523                  * can't possibly be in the middle of a chain of AG metadata
 524                  * updates.
 525                  *
 526                  * Obviously, this should be slanted against scrub and in favor
 527                  * of runtime threads.
 528                  */
 529                 if (!xfs_group_intent_busy(pag_group(sa->pag)))
 530                         return 0;
 531
 532                 if (sa->agf_bp) {
 533                         xfs_trans_brelse(sc->tp, sa->agf_bp);
 534                         sa->agf_bp = NULL;
 535                 }
 536
 537                 if (sa->agi_bp) {
 538                         xfs_trans_brelse(sc->tp, sa->agi_bp);
 539                         sa->agi_bp = NULL;
 540                 }
 541
 542                 if (!(sc->flags & XCHK_FSGATES_DRAIN))
 543                         return -ECHRNG;
 544                 error = xfs_group_intent_drain(pag_group(sa->pag));
 545                 if (error == -ERESTARTSYS)
 546                         error = -EINTR;
 547         } while (!error);
 548
 549         return error;
 550 }
 551
 552 /*
 553  * Grab the per-AG structure, grab all AG header buffers, and wait until there
 554  * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
 555  * structure.
 556  */
 557 int
 558 xchk_ag_read_headers(
 559         struct xfs_scrub        *sc,
 560         xfs_agnumber_t          agno,
 561         struct xchk_ag          *sa)
 562 {
 563         struct xfs_mount        *mp = sc->mp;
 564
 565         ASSERT(!sa->pag);
 566         sa->pag = xfs_perag_get(mp, agno);
 567         if (!sa->pag)
 568                 return -ENOENT;
 569
 570         return xchk_perag_drain_and_lock(sc);
 571 }
 572
 573 /* Release all the AG btree cursors. */
 574 void
 575 xchk_ag_btcur_free(
 576         struct xchk_ag          *sa)
 577 {
 578         if (sa->refc_cur)
 579                 xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
 580         if (sa->rmap_cur)
 581                 xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
 582         if (sa->fino_cur)
 583                 xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
 584         if (sa->ino_cur)
 585                 xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
 586         if (sa->cnt_cur)
 587                 xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
 588         if (sa->bno_cur)
 589                 xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
 590
 591         sa->refc_cur = NULL;
 592         sa->rmap_cur = NULL;
 593         sa->fino_cur = NULL;
 594         sa->ino_cur = NULL;
 595         sa->bno_cur = NULL;
 596         sa->cnt_cur = NULL;
 597 }
 598
 599 /* Initialize all the btree cursors for an AG. */
 600 void
 601 xchk_ag_btcur_init(
 602         struct xfs_scrub        *sc,
 603         struct xchk_ag          *sa)
 604 {
 605         struct xfs_mount        *mp = sc->mp;
 606
 607         if (sa->agf_bp) {
 608                 /* Set up a bnobt cursor for cross-referencing. */
 609                 sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
 610                                 sa->pag);
 611                 xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
 612                                 XFS_SCRUB_TYPE_BNOBT);
 613
 614                 /* Set up a cntbt cursor for cross-referencing. */
 615                 sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
 616                                 sa->pag);
 617                 xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
 618                                 XFS_SCRUB_TYPE_CNTBT);
 619
 620                 /* Set up a rmapbt cursor for cross-referencing. */
 621                 if (xfs_has_rmapbt(mp)) {
 622                         sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
 623                                         sa->agf_bp, sa->pag);
 624                         xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
 625                                         XFS_SCRUB_TYPE_RMAPBT);
 626                 }
 627
 628                 /* Set up a refcountbt cursor for cross-referencing. */
 629                 if (xfs_has_reflink(mp)) {
 630                         sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
 631                                         sa->agf_bp, sa->pag);
 632                         xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
 633                                         XFS_SCRUB_TYPE_REFCNTBT);
 634                 }
 635         }
 636
 637         if (sa->agi_bp) {
 638                 /* Set up a inobt cursor for cross-referencing. */
 639                 sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
 640                                 sa->agi_bp);
 641                 xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
 642                                 XFS_SCRUB_TYPE_INOBT);
 643
 644                 /* Set up a finobt cursor for cross-referencing. */
 645                 if (xfs_has_finobt(mp)) {
 646                         sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
 647                                         sa->agi_bp);
 648                         xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
 649                                         XFS_SCRUB_TYPE_FINOBT);
 650                 }
 651         }
 652 }
 653
 654 /* Release the AG header context and btree cursors. */
 655 void
 656 xchk_ag_free(
 657         struct xfs_scrub        *sc,
 658         struct xchk_ag          *sa)
 659 {
 660         xchk_ag_btcur_free(sa);
 661         xrep_reset_perag_resv(sc);
 662         if (sa->agf_bp) {
 663                 xfs_trans_brelse(sc->tp, sa->agf_bp);
 664                 sa->agf_bp = NULL;
 665         }
 666         if (sa->agi_bp) {
 667                 xfs_trans_brelse(sc->tp, sa->agi_bp);
 668                 sa->agi_bp = NULL;
 669         }
 670         if (sa->pag) {
 671                 xfs_perag_put(sa->pag);
 672                 sa->pag = NULL;
 673         }
 674 }
 675
 676 /*
 677  * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
 678  * order.  Locking order requires us to get the AGI before the AGF.  We use the
 679  * transaction to avoid deadlocking on crosslinked metadata buffers; either the
 680  * caller passes one in (bmap scrub) or we have to create a transaction
 681  * ourselves.  Returns ENOENT if the perag struct cannot be grabbed.
 682  */
 683 int
 684 xchk_ag_init(
 685         struct xfs_scrub        *sc,
 686         xfs_agnumber_t          agno,
 687         struct xchk_ag          *sa)
 688 {
 689         int                     error;
 690
 691         error = xchk_ag_read_headers(sc, agno, sa);
 692         if (error)
 693                 return error;
 694
 695         xchk_ag_btcur_init(sc, sa);
 696         return 0;
 697 }
 698
 699 #ifdef CONFIG_XFS_RT
 700 /*
 701  * For scrubbing a realtime group, grab all the in-core resources we'll need to
 702  * check the metadata, which means taking the ILOCK of the realtime group's
 703  * metadata inodes.  Callers must not join these inodes to the transaction with
 704  * non-zero lockflags or concurrency problems will result.  The @rtglock_flags
 705  * argument takes XFS_RTGLOCK_* flags.
 706  */
 707 int
 708 xchk_rtgroup_init(
 709         struct xfs_scrub        *sc,
 710         xfs_rgnumber_t          rgno,
 711         struct xchk_rt          *sr)
 712 {
 713         ASSERT(sr->rtg == NULL);
 714         ASSERT(sr->rtlock_flags == 0);
 715
 716         sr->rtg = xfs_rtgroup_get(sc->mp, rgno);
 717         if (!sr->rtg)
 718                 return -ENOENT;
 719         return 0;
 720 }
 721
 722 void
 723 xchk_rtgroup_lock(
 724         struct xchk_rt          *sr,
 725         unsigned int            rtglock_flags)
 726 {
 727         xfs_rtgroup_lock(sr->rtg, rtglock_flags);
 728         sr->rtlock_flags = rtglock_flags;
 729 }
 730
 731 /*
 732  * Unlock the realtime group.  This must be done /after/ committing (or
 733  * cancelling) the scrub transaction.
 734  */
 735 static void
 736 xchk_rtgroup_unlock(
 737         struct xchk_rt          *sr)
 738 {
 739         ASSERT(sr->rtg != NULL);
 740
 741         if (sr->rtlock_flags) {
 742                 xfs_rtgroup_unlock(sr->rtg, sr->rtlock_flags);
 743                 sr->rtlock_flags = 0;
 744         }
 745 }
 746
 747 /*
 748  * Unlock the realtime group and release its resources.  This must be done
 749  * /after/ committing (or cancelling) the scrub transaction.
 750  */
 751 void
 752 xchk_rtgroup_free(
 753         struct xfs_scrub        *sc,
 754         struct xchk_rt          *sr)
 755 {
 756         ASSERT(sr->rtg != NULL);
 757
 758         xchk_rtgroup_unlock(sr);
 759
 760         xfs_rtgroup_put(sr->rtg);
 761         sr->rtg = NULL;
 762 }
 763 #endif /* CONFIG_XFS_RT */
 764
 765 /* Per-scrubber setup functions */
 766
 767 void
 768 xchk_trans_cancel(
 769         struct xfs_scrub        *sc)
 770 {
 771         xfs_trans_cancel(sc->tp);
 772         sc->tp = NULL;
 773 }
 774
 775 int
 776 xchk_trans_alloc_empty(
 777         struct xfs_scrub        *sc)
 778 {
 779         return xfs_trans_alloc_empty(sc->mp, &sc->tp);
 780 }
 781
 782 /*
 783  * Grab an empty transaction so that we can re-grab locked buffers if
 784  * one of our btrees turns out to be cyclic.
 785  *
 786  * If we're going to repair something, we need to ask for the largest possible
 787  * log reservation so that we can handle the worst case scenario for metadata
 788  * updates while rebuilding a metadata item.  We also need to reserve as many
 789  * blocks in the head transaction as we think we're going to need to rebuild
 790  * the metadata object.
 791  */
 792 int
 793 xchk_trans_alloc(
 794         struct xfs_scrub        *sc,
 795         uint                    resblks)
 796 {
 797         if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
 798                 return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
 799                                 resblks, 0, 0, &sc->tp);
 800
 801         return xchk_trans_alloc_empty(sc);
 802 }
 803
 804 /* Set us up with a transaction and an empty context. */
 805 int
 806 xchk_setup_fs(
 807         struct xfs_scrub        *sc)
 808 {
 809         uint                    resblks;
 810
 811         resblks = xrep_calc_ag_resblks(sc);
 812         return xchk_trans_alloc(sc, resblks);
 813 }
 814
 815 /* Set us up with AG headers and btree cursors. */
 816 int
 817 xchk_setup_ag_btree(
 818         struct xfs_scrub        *sc,
 819         bool                    force_log)
 820 {
 821         struct xfs_mount        *mp = sc->mp;
 822         int                     error;
 823
 824         /*
 825          * If the caller asks us to checkpont the log, do so.  This
 826          * expensive operation should be performed infrequently and only
 827          * as a last resort.  Any caller that sets force_log should
 828          * document why they need to do so.
 829          */
 830         if (force_log) {
 831                 error = xchk_checkpoint_log(mp);
 832                 if (error)
 833                         return error;
 834         }
 835
 836         error = xchk_setup_fs(sc);
 837         if (error)
 838                 return error;
 839
 840         return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
 841 }
 842
 843 /* Push everything out of the log onto disk. */
 844 int
 845 xchk_checkpoint_log(
 846         struct xfs_mount        *mp)
 847 {
 848         int                     error;
 849
 850         error = xfs_log_force(mp, XFS_LOG_SYNC);
 851         if (error)
 852                 return error;
 853         xfs_ail_push_all_sync(mp->m_ail);
 854         return 0;
 855 }
 856
 857 /* Verify that an inode is allocated ondisk, then return its cached inode. */
 858 int
 859 xchk_iget(
 860         struct xfs_scrub        *sc,
 861         xfs_ino_t               inum,
 862         struct xfs_inode        **ipp)
 863 {
 864         ASSERT(sc->tp != NULL);
 865
 866         return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
 867 }
 868
 869 /*
 870  * Try to grab an inode in a manner that avoids races with physical inode
 871  * allocation.  If we can't, return the locked AGI buffer so that the caller
 872  * can single-step the loading process to see where things went wrong.
 873  * Callers must have a valid scrub transaction.
 874  *
 875  * If the iget succeeds, return 0, a NULL AGI, and the inode.
 876  *
 877  * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
 878  * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
 879  * no longer allocated; or any other corruption or runtime error.
 880  *
 881  * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
 882  *
 883  * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
 884  */
 885 int
 886 xchk_iget_agi(
 887         struct xfs_scrub        *sc,
 888         xfs_ino_t               inum,
 889         struct xfs_buf          **agi_bpp,
 890         struct xfs_inode        **ipp)
 891 {
 892         struct xfs_mount        *mp = sc->mp;
 893         struct xfs_trans        *tp = sc->tp;
 894         struct xfs_perag        *pag;
 895         int                     error;
 896
 897         ASSERT(sc->tp != NULL);
 898
 899 again:
 900         *agi_bpp = NULL;
 901         *ipp = NULL;
 902         error = 0;
 903
 904         if (xchk_should_terminate(sc, &error))
 905                 return error;
 906
 907         /*
 908          * Attach the AGI buffer to the scrub transaction to avoid deadlocks
 909          * in the iget cache miss path.
 910          */
 911         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 912         error = xfs_ialloc_read_agi(pag, tp, 0, agi_bpp);
 913         xfs_perag_put(pag);
 914         if (error)
 915                 return error;
 916
 917         error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
 918                         ipp);
 919         if (error == -EAGAIN) {
 920                 /*
 921                  * The inode may be in core but temporarily unavailable and may
 922                  * require the AGI buffer before it can be returned.  Drop the
 923                  * AGI buffer and retry the lookup.
 924                  *
 925                  * Incore lookup will fail with EAGAIN on a cache hit if the
 926                  * inode is queued to the inactivation list.  The inactivation
 927                  * worker may remove the inode from the unlinked list and hence
 928                  * needs the AGI.
 929                  *
 930                  * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
 931                  * to allow inodegc to make progress and move the inode to
 932                  * IRECLAIMABLE state where xfs_iget will be able to return it
 933                  * again if it can lock the inode.
 934                  */
 935                 xfs_trans_brelse(tp, *agi_bpp);
 936                 delay(1);
 937                 goto again;
 938         }
 939         if (error)
 940                 return error;
 941
 942         /* We got the inode, so we can release the AGI. */
 943         ASSERT(*ipp != NULL);
 944         xfs_trans_brelse(tp, *agi_bpp);
 945         *agi_bpp = NULL;
 946         return 0;
 947 }
 948
 949 #ifdef CONFIG_XFS_QUOTA
 950 /*
 951  * Try to attach dquots to this inode if we think we might want to repair it.
 952  * Callers must not hold any ILOCKs.  If the dquots are broken and cannot be
 953  * attached, a quotacheck will be scheduled.
 954  */
 955 int
 956 xchk_ino_dqattach(
 957         struct xfs_scrub        *sc)
 958 {
 959         ASSERT(sc->tp != NULL);
 960         ASSERT(sc->ip != NULL);
 961
 962         if (!xchk_could_repair(sc))
 963                 return 0;
 964
 965         return xrep_ino_dqattach(sc);
 966 }
 967 #endif
 968
 969 /* Install an inode that we opened by handle for scrubbing. */
 970 int
 971 xchk_install_handle_inode(
 972         struct xfs_scrub        *sc,
 973         struct xfs_inode        *ip)
 974 {
 975         if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
 976                 xchk_irele(sc, ip);
 977                 return -ENOENT;
 978         }
 979
 980         sc->ip = ip;
 981         return 0;
 982 }
 983
 984 /*
 985  * Install an already-referenced inode for scrubbing.  Get our own reference to
 986  * the inode to make disposal simpler.  The inode must not be in I_FREEING or
 987  * I_WILL_FREE state!
 988  */
 989 int
 990 xchk_install_live_inode(
 991         struct xfs_scrub        *sc,
 992         struct xfs_inode        *ip)
 993 {
 994         if (!igrab(VFS_I(ip))) {
 995                 xchk_ino_set_corrupt(sc, ip->i_ino);
 996                 return -EFSCORRUPTED;
 997         }
 998
 999         sc->ip = ip;
1000         return 0;
1001 }
1002
1003 /*
1004  * In preparation to scrub metadata structures that hang off of an inode,
1005  * grab either the inode referenced in the scrub control structure or the
1006  * inode passed in.  If the inumber does not reference an allocated inode
1007  * record, the function returns ENOENT to end the scrub early.  The inode
1008  * is not locked.
1009  */
1010 int
1011 xchk_iget_for_scrubbing(
1012         struct xfs_scrub        *sc)
1013 {
1014         struct xfs_imap         imap;
1015         struct xfs_mount        *mp = sc->mp;
1016         struct xfs_perag        *pag;
1017         struct xfs_buf          *agi_bp;
1018         struct xfs_inode        *ip_in = XFS_I(file_inode(sc->file));
1019         struct xfs_inode        *ip = NULL;
1020         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
1021         int                     error;
1022
1023         ASSERT(sc->tp == NULL);
1024
1025         /* We want to scan the inode we already had opened. */
1026         if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
1027                 return xchk_install_live_inode(sc, ip_in);
1028
1029         /*
1030          * On pre-metadir filesystems, reject internal metadata files.  For
1031          * metadir filesystems, limited scrubbing of any file in the metadata
1032          * directory tree by handle is allowed, because that is the only way to
1033          * validate the lack of parent pointers in the sb-root metadata inodes.
1034          */
1035         if (!xfs_has_metadir(mp) && xfs_is_sb_inum(mp, sc->sm->sm_ino))
1036                 return -ENOENT;
1037         /* Reject obviously bad inode numbers. */
1038         if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
1039                 return -ENOENT;
1040
1041         /* Try a safe untrusted iget. */
1042         error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
1043         if (!error)
1044                 return xchk_install_handle_inode(sc, ip);
1045         if (error == -ENOENT)
1046                 return error;
1047         if (error != -EINVAL)
1048                 goto out_error;
1049
1050         /*
1051          * EINVAL with IGET_UNTRUSTED probably means one of several things:
1052          * userspace gave us an inode number that doesn't correspond to fs
1053          * space; the inode btree lacks a record for this inode; or there is a
1054          * record, and it says this inode is free.
1055          *
1056          * We want to look up this inode in the inobt to distinguish two
1057          * scenarios: (1) the inobt says the inode is free, in which case
1058          * there's nothing to do; and (2) the inobt says the inode is
1059          * allocated, but loading it failed due to corruption.
1060          *
1061          * Allocate a transaction and grab the AGI to prevent inobt activity
1062          * in this AG.  Retry the iget in case someone allocated a new inode
1063          * after the first iget failed.
1064          */
1065         error = xchk_trans_alloc(sc, 0);
1066         if (error)
1067                 goto out_error;
1068
1069         error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
1070         if (error == 0) {
1071                 /* Actually got the inode, so install it. */
1072                 xchk_trans_cancel(sc);
1073                 return xchk_install_handle_inode(sc, ip);
1074         }
1075         if (error == -ENOENT)
1076                 goto out_gone;
1077         if (error != -EINVAL)
1078                 goto out_cancel;
1079
1080         /* Ensure that we have protected against inode allocation/freeing. */
1081         if (agi_bp == NULL) {
1082                 ASSERT(agi_bp != NULL);
1083                 error = -ECANCELED;
1084                 goto out_cancel;
1085         }
1086
1087         /*
1088          * Untrusted iget failed a second time.  Let's try an inobt lookup.
1089          * If the inobt thinks this the inode neither can exist inside the
1090          * filesystem nor is allocated, return ENOENT to signal that the check
1091          * can be skipped.
1092          *
1093          * If the lookup returns corruption, we'll mark this inode corrupt and
1094          * exit to userspace.  There's little chance of fixing anything until
1095          * the inobt is straightened out, but there's nothing we can do here.
1096          *
1097          * If the lookup encounters any other error, exit to userspace.
1098          *
1099          * If the lookup succeeds, something else must be very wrong in the fs
1100          * such that setting up the incore inode failed in some strange way.
1101          * Treat those as corruptions.
1102          */
1103         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
1104         if (!pag) {
1105                 error = -EFSCORRUPTED;
1106                 goto out_cancel;
1107         }
1108
1109         error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
1110                         XFS_IGET_UNTRUSTED);
1111         xfs_perag_put(pag);
1112         if (error == -EINVAL || error == -ENOENT)
1113                 goto out_gone;
1114         if (!error)
1115                 error = -EFSCORRUPTED;
1116
1117 out_cancel:
1118         xchk_trans_cancel(sc);
1119 out_error:
1120         trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
1121                         error, __return_address);
1122         return error;
1123 out_gone:
1124         /* The file is gone, so there's nothing to check. */
1125         xchk_trans_cancel(sc);
1126         return -ENOENT;
1127 }
1128
1129 /* Release an inode, possibly dropping it in the process. */
1130 void
1131 xchk_irele(
1132         struct xfs_scrub        *sc,
1133         struct xfs_inode        *ip)
1134 {
1135         if (sc->tp) {
1136                 /*
1137                  * If we are in a transaction, we /cannot/ drop the inode
1138                  * ourselves, because the VFS will trigger writeback, which
1139                  * can require a transaction.  Clear DONTCACHE to force the
1140                  * inode to the LRU, where someone else can take care of
1141                  * dropping it.
1142                  *
1143                  * Note that when we grabbed our reference to the inode, it
1144                  * could have had an active ref and DONTCACHE set if a sysadmin
1145                  * is trying to coerce a change in file access mode.  icache
1146                  * hits do not clear DONTCACHE, so we must do it here.
1147                  */
1148                 spin_lock(&VFS_I(ip)->i_lock);
1149                 VFS_I(ip)->i_state &= ~I_DONTCACHE;
1150                 spin_unlock(&VFS_I(ip)->i_lock);
1151         }
1152
1153         xfs_irele(ip);
1154 }
1155
1156 /*
1157  * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
1158  * this to operate on user-accessible regular file data because the MMAPLOCK is
1159  * not taken.
1160  */
1161 int
1162 xchk_setup_inode_contents(
1163         struct xfs_scrub        *sc,
1164         unsigned int            resblks)
1165 {
1166         int                     error;
1167
1168         error = xchk_iget_for_scrubbing(sc);
1169         if (error)
1170                 return error;
1171
1172         error = xrep_tempfile_adjust_directory_tree(sc);
1173         if (error)
1174                 return error;
1175
1176         /* Lock the inode so the VFS cannot touch this file. */
1177         xchk_ilock(sc, XFS_IOLOCK_EXCL);
1178
1179         error = xchk_trans_alloc(sc, resblks);
1180         if (error)
1181                 goto out;
1182
1183         error = xchk_ino_dqattach(sc);
1184         if (error)
1185                 goto out;
1186
1187         xchk_ilock(sc, XFS_ILOCK_EXCL);
1188 out:
1189         /* scrub teardown will unlock and release the inode for us */
1190         return error;
1191 }
1192
1193 void
1194 xchk_ilock(
1195         struct xfs_scrub        *sc,
1196         unsigned int            ilock_flags)
1197 {
1198         xfs_ilock(sc->ip, ilock_flags);
1199         sc->ilock_flags |= ilock_flags;
1200 }
1201
1202 bool
1203 xchk_ilock_nowait(
1204         struct xfs_scrub        *sc,
1205         unsigned int            ilock_flags)
1206 {
1207         if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1208                 sc->ilock_flags |= ilock_flags;
1209                 return true;
1210         }
1211
1212         return false;
1213 }
1214
1215 void
1216 xchk_iunlock(
1217         struct xfs_scrub        *sc,
1218         unsigned int            ilock_flags)
1219 {
1220         sc->ilock_flags &= ~ilock_flags;
1221         xfs_iunlock(sc->ip, ilock_flags);
1222 }
1223
1224 /*
1225  * Predicate that decides if we need to evaluate the cross-reference check.
1226  * If there was an error accessing the cross-reference btree, just delete
1227  * the cursor and skip the check.
1228  */
1229 bool
1230 xchk_should_check_xref(
1231         struct xfs_scrub        *sc,
1232         int                     *error,
1233         struct xfs_btree_cur    **curpp)
1234 {
1235         /* No point in xref if we already know we're corrupt. */
1236         if (xchk_skip_xref(sc->sm))
1237                 return false;
1238
1239         if (*error == 0)
1240                 return true;
1241
1242         if (curpp) {
1243                 /* If we've already given up on xref, just bail out. */
1244                 if (!*curpp)
1245                         return false;
1246
1247                 /* xref error, delete cursor and bail out. */
1248                 xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1249                 *curpp = NULL;
1250         }
1251
1252         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1253         trace_xchk_xref_error(sc, *error, __return_address);
1254
1255         /*
1256          * Errors encountered during cross-referencing with another
1257          * data structure should not cause this scrubber to abort.
1258          */
1259         *error = 0;
1260         return false;
1261 }
1262
1263 /* Run the structure verifiers on in-memory buffers to detect bad memory. */
1264 void
1265 xchk_buffer_recheck(
1266         struct xfs_scrub        *sc,
1267         struct xfs_buf          *bp)
1268 {
1269         xfs_failaddr_t          fa;
1270
1271         if (bp->b_ops == NULL) {
1272                 xchk_block_set_corrupt(sc, bp);
1273                 return;
1274         }
1275         if (bp->b_ops->verify_struct == NULL) {
1276                 xchk_set_incomplete(sc);
1277                 return;
1278         }
1279         fa = bp->b_ops->verify_struct(bp);
1280         if (!fa)
1281                 return;
1282         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1283         trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1284 }
1285
1286 static inline int
1287 xchk_metadata_inode_subtype(
1288         struct xfs_scrub        *sc,
1289         unsigned int            scrub_type)
1290 {
1291         struct xfs_scrub_subord *sub;
1292         int                     error;
1293
1294         sub = xchk_scrub_create_subord(sc, scrub_type);
1295         error = sub->sc.ops->scrub(&sub->sc);
1296         xchk_scrub_free_subord(sub);
1297         return error;
1298 }
1299
1300 /*
1301  * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
1302  * pointed to by sc->ip and the ILOCK must be held.
1303  */
1304 int
1305 xchk_metadata_inode_forks(
1306         struct xfs_scrub        *sc)
1307 {
1308         bool                    shared;
1309         int                     error;
1310
1311         if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1312                 return 0;
1313
1314         /* Check the inode record. */
1315         error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1316         if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1317                 return error;
1318
1319         /* Metadata inodes don't live on the rt device. */
1320         if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1321                 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1322                 return 0;
1323         }
1324
1325         /* They should never participate in reflink. */
1326         if (xfs_is_reflink_inode(sc->ip)) {
1327                 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1328                 return 0;
1329         }
1330
1331         /* Invoke the data fork scrubber. */
1332         error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1333         if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1334                 return error;
1335
1336         /* Look for incorrect shared blocks. */
1337         if (xfs_has_reflink(sc->mp)) {
1338                 error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1339                                 &shared);
1340                 if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1341                                 &error))
1342                         return error;
1343                 if (shared)
1344                         xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1345         }
1346
1347         /*
1348          * Metadata files can only have extended attributes on metadir
1349          * filesystems, either for parent pointers or for actual xattr data.
1350          */
1351         if (xfs_inode_hasattr(sc->ip)) {
1352                 if (!xfs_has_metadir(sc->mp)) {
1353                         xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1354                         return 0;
1355                 }
1356
1357                 error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
1358                 if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1359                         return error;
1360         }
1361
1362         return 0;
1363 }
1364
1365 /*
1366  * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1367  * operation.  Callers must not hold any locks that intersect with the CPU
1368  * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1369  * to change kernel code.
1370  */
1371 void
1372 xchk_fsgates_enable(
1373         struct xfs_scrub        *sc,
1374         unsigned int            scrub_fsgates)
1375 {
1376         ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1377         ASSERT(!(sc->flags & scrub_fsgates));
1378
1379         trace_xchk_fsgates_enable(sc, scrub_fsgates);
1380
1381         if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1382                 xfs_drain_wait_enable();
1383
1384         if (scrub_fsgates & XCHK_FSGATES_QUOTA)
1385                 xfs_dqtrx_hook_enable();
1386
1387         if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
1388                 xfs_dir_hook_enable();
1389
1390         if (scrub_fsgates & XCHK_FSGATES_RMAP)
1391                 xfs_rmap_hook_enable();
1392
1393         sc->flags |= scrub_fsgates;
1394 }
1395
1396 /*
1397  * Decide if this is this a cached inode that's also allocated.  The caller
1398  * must hold a reference to an AG and the AGI buffer lock to prevent inodes
1399  * from being allocated or freed.
1400  *
1401  * Look up an inode by number in the given file system.  If the inode number
1402  * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
1403  * If the inode is being reclaimed, return -ENODATA because we know the inode
1404  * cache cannot be updating the ondisk metadata.
1405  *
1406  * Otherwise, the incore inode is the one we want, and it is either live,
1407  * somewhere in the inactivation machinery, or reclaimable.  The inode is
1408  * allocated if i_mode is nonzero.  In all three cases, the cached inode will
1409  * be more up to date than the ondisk inode buffer, so we must use the incore
1410  * i_mode.
1411  */
1412 int
1413 xchk_inode_is_allocated(
1414         struct xfs_scrub        *sc,
1415         xfs_agino_t             agino,
1416         bool                    *inuse)
1417 {
1418         struct xfs_mount        *mp = sc->mp;
1419         struct xfs_perag        *pag = sc->sa.pag;
1420         xfs_ino_t               ino;
1421         struct xfs_inode        *ip;
1422         int                     error;
1423
1424         /* caller must hold perag reference */
1425         if (pag == NULL) {
1426                 ASSERT(pag != NULL);
1427                 return -EINVAL;
1428         }
1429
1430         /* caller must have AGI buffer */
1431         if (sc->sa.agi_bp == NULL) {
1432                 ASSERT(sc->sa.agi_bp != NULL);
1433                 return -EINVAL;
1434         }
1435
1436         /* reject inode numbers outside existing AGs */
1437         ino = xfs_agino_to_ino(pag, agino);
1438         if (!xfs_verify_ino(mp, ino))
1439                 return -EINVAL;
1440
1441         error = -ENODATA;
1442         rcu_read_lock();
1443         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1444         if (!ip) {
1445                 /* cache miss */
1446                 goto out_rcu;
1447         }
1448
1449         /*
1450          * If the inode number doesn't match, the incore inode got reused
1451          * during an RCU grace period and the radix tree hasn't been updated.
1452          * This isn't the inode we want.
1453          */
1454         spin_lock(&ip->i_flags_lock);
1455         if (ip->i_ino != ino)
1456                 goto out_skip;
1457
1458         trace_xchk_inode_is_allocated(ip);
1459
1460         /*
1461          * We have an incore inode that matches the inode we want, and the
1462          * caller holds the perag structure and the AGI buffer.  Let's check
1463          * our assumptions below:
1464          */
1465
1466 #ifdef DEBUG
1467         /*
1468          * (1) If the incore inode is live (i.e. referenced from the dcache),
1469          * it will not be INEW, nor will it be in the inactivation or reclaim
1470          * machinery.  The ondisk inode had better be allocated.  This is the
1471          * most trivial case.
1472          */
1473         if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
1474                              XFS_INACTIVATING))) {
1475                 /* live inode */
1476                 ASSERT(VFS_I(ip)->i_mode != 0);
1477         }
1478
1479         /*
1480          * If the incore inode is INEW, there are several possibilities:
1481          *
1482          * (2) For a file that is being created, note that we allocate the
1483          * ondisk inode before allocating, initializing, and adding the incore
1484          * inode to the radix tree.
1485          *
1486          * (3) If the incore inode is being recycled, the inode has to be
1487          * allocated because we don't allow freed inodes to be recycled.
1488          * Recycling doesn't touch i_mode.
1489          */
1490         if (ip->i_flags & XFS_INEW) {
1491                 /* created on disk already or recycling */
1492                 ASSERT(VFS_I(ip)->i_mode != 0);
1493         }
1494
1495         /*
1496          * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
1497          * inactivation has not started (!INACTIVATING), it is still allocated.
1498          */
1499         if ((ip->i_flags & XFS_NEED_INACTIVE) &&
1500             !(ip->i_flags & XFS_INACTIVATING)) {
1501                 /* definitely before difree */
1502                 ASSERT(VFS_I(ip)->i_mode != 0);
1503         }
1504 #endif
1505
1506         /*
1507          * If the incore inode is undergoing inactivation (INACTIVATING), there
1508          * are two possibilities:
1509          *
1510          * (5) It is before the point where it would get freed ondisk, in which
1511          * case i_mode is still nonzero.
1512          *
1513          * (6) It has already been freed, in which case i_mode is zero.
1514          *
1515          * We don't take the ILOCK here, but difree and dialloc update the AGI,
1516          * and we've taken the AGI buffer lock, which prevents that from
1517          * happening.
1518          */
1519
1520         /*
1521          * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
1522          * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
1523          * reflects the ondisk state.
1524          */
1525
1526         /*
1527          * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
1528          * the flush code uses i_mode to format the ondisk inode.
1529          */
1530
1531         /*
1532          * (9) If the inode is in IRECLAIM and was reachable via the radix
1533          * tree, it still has the same i_mode as it did before it entered
1534          * reclaim.  The inode object is still alive because we hold the RCU
1535          * read lock.
1536          */
1537
1538         *inuse = VFS_I(ip)->i_mode != 0;
1539         error = 0;
1540
1541 out_skip:
1542         spin_unlock(&ip->i_flags_lock);
1543 out_rcu:
1544         rcu_read_unlock();
1545         return error;
1546 }
1547
1548 /* Is this inode a root directory for either tree? */
1549 bool
1550 xchk_inode_is_dirtree_root(const struct xfs_inode *ip)
1551 {
1552         struct xfs_mount        *mp = ip->i_mount;
1553
1554         return ip == mp->m_rootip ||
1555                 (xfs_has_metadir(mp) && ip == mp->m_metadirip);
1556 }
1557
1558 /* Does the superblock point down to this inode? */
1559 bool
1560 xchk_inode_is_sb_rooted(const struct xfs_inode *ip)
1561 {
1562         return xchk_inode_is_dirtree_root(ip) ||
1563                xfs_is_sb_inum(ip->i_mount, ip->i_ino);
1564 }
1565
1566 /* What is the root directory inumber for this inode? */
1567 xfs_ino_t
1568 xchk_inode_rootdir_inum(const struct xfs_inode *ip)
1569 {
1570         struct xfs_mount        *mp = ip->i_mount;
1571
1572         if (xfs_is_metadir_inode(ip))
1573                 return mp->m_metadirip->i_ino;
1574         return mp->m_rootip->i_ino;
1575 }