fs/xfs/scrub/common.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <[email protected]>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_btree.h"
  13 #include "xfs_log_format.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_inode.h"
  16 #include "xfs_icache.h"
  17 #include "xfs_alloc.h"
  18 #include "xfs_alloc_btree.h"
  19 #include "xfs_ialloc.h"
  20 #include "xfs_ialloc_btree.h"
  21 #include "xfs_refcount_btree.h"
  22 #include "xfs_rmap.h"
  23 #include "xfs_rmap_btree.h"
  24 #include "xfs_log.h"
  25 #include "xfs_trans_priv.h"
  26 #include "xfs_da_format.h"
  27 #include "xfs_da_btree.h"
  28 #include "xfs_attr.h"
  29 #include "xfs_reflink.h"
  30 #include "xfs_ag.h"
  31 #include "scrub/scrub.h"
  32 #include "scrub/common.h"
  33 #include "scrub/trace.h"
  34 #include "scrub/repair.h"
  35 #include "scrub/health.h"
  36
  37 /* Common code for the metadata scrubbers. */
  38
  39 /*
  40  * Handling operational errors.
  41  *
  42  * The *_process_error() family of functions are used to process error return
  43  * codes from functions called as part of a scrub operation.
  44  *
  45  * If there's no error, we return true to tell the caller that it's ok
  46  * to move on to the next check in its list.
  47  *
  48  * For non-verifier errors (e.g. ENOMEM) we return false to tell the
  49  * caller that something bad happened, and we preserve *error so that
  50  * the caller can return the *error up the stack to userspace.
  51  *
  52  * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
  53  * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
  54  * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
  55  * not via return codes.  We return false to tell the caller that
  56  * something bad happened.  Since the error has been cleared, the caller
  57  * will (presumably) return that zero and scrubbing will move on to
  58  * whatever's next.
  59  *
  60  * ftrace can be used to record the precise metadata location and the
  61  * approximate code location of the failed operation.
  62  */
  63
  64 /* Check for operational errors. */
  65 static bool
  66 __xchk_process_error(
  67         struct xfs_scrub        *sc,
  68         xfs_agnumber_t          agno,
  69         xfs_agblock_t           bno,
  70         int                     *error,
  71         __u32                   errflag,
  72         void                    *ret_ip)
  73 {
  74         switch (*error) {
  75         case 0:
  76                 return true;
  77         case -EDEADLOCK:
  78         case -ECHRNG:
  79                 /* Used to restart an op with deadlock avoidance. */
  80                 trace_xchk_deadlock_retry(
  81                                 sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
  82                                 sc->sm, *error);
  83                 break;
  84         case -EFSBADCRC:
  85         case -EFSCORRUPTED:
  86                 /* Note the badness but don't abort. */
  87                 sc->sm->sm_flags |= errflag;
  88                 *error = 0;
  89                 fallthrough;
  90         default:
  91                 trace_xchk_op_error(sc, agno, bno, *error,
  92                                 ret_ip);
  93                 break;
  94         }
  95         return false;
  96 }
  97
  98 bool
  99 xchk_process_error(
 100         struct xfs_scrub        *sc,
 101         xfs_agnumber_t          agno,
 102         xfs_agblock_t           bno,
 103         int                     *error)
 104 {
 105         return __xchk_process_error(sc, agno, bno, error,
 106                         XFS_SCRUB_OFLAG_CORRUPT, __return_address);
 107 }
 108
 109 bool
 110 xchk_xref_process_error(
 111         struct xfs_scrub        *sc,
 112         xfs_agnumber_t          agno,
 113         xfs_agblock_t           bno,
 114         int                     *error)
 115 {
 116         return __xchk_process_error(sc, agno, bno, error,
 117                         XFS_SCRUB_OFLAG_XFAIL, __return_address);
 118 }
 119
 120 /* Check for operational errors for a file offset. */
 121 static bool
 122 __xchk_fblock_process_error(
 123         struct xfs_scrub        *sc,
 124         int                     whichfork,
 125         xfs_fileoff_t           offset,
 126         int                     *error,
 127         __u32                   errflag,
 128         void                    *ret_ip)
 129 {
 130         switch (*error) {
 131         case 0:
 132                 return true;
 133         case -EDEADLOCK:
 134         case -ECHRNG:
 135                 /* Used to restart an op with deadlock avoidance. */
 136                 trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
 137                 break;
 138         case -EFSBADCRC:
 139         case -EFSCORRUPTED:
 140                 /* Note the badness but don't abort. */
 141                 sc->sm->sm_flags |= errflag;
 142                 *error = 0;
 143                 fallthrough;
 144         default:
 145                 trace_xchk_file_op_error(sc, whichfork, offset, *error,
 146                                 ret_ip);
 147                 break;
 148         }
 149         return false;
 150 }
 151
 152 bool
 153 xchk_fblock_process_error(
 154         struct xfs_scrub        *sc,
 155         int                     whichfork,
 156         xfs_fileoff_t           offset,
 157         int                     *error)
 158 {
 159         return __xchk_fblock_process_error(sc, whichfork, offset, error,
 160                         XFS_SCRUB_OFLAG_CORRUPT, __return_address);
 161 }
 162
 163 bool
 164 xchk_fblock_xref_process_error(
 165         struct xfs_scrub        *sc,
 166         int                     whichfork,
 167         xfs_fileoff_t           offset,
 168         int                     *error)
 169 {
 170         return __xchk_fblock_process_error(sc, whichfork, offset, error,
 171                         XFS_SCRUB_OFLAG_XFAIL, __return_address);
 172 }
 173
 174 /*
 175  * Handling scrub corruption/optimization/warning checks.
 176  *
 177  * The *_set_{corrupt,preen,warning}() family of functions are used to
 178  * record the presence of metadata that is incorrect (corrupt), could be
 179  * optimized somehow (preen), or should be flagged for administrative
 180  * review but is not incorrect (warn).
 181  *
 182  * ftrace can be used to record the precise metadata location and
 183  * approximate code location of the failed check.
 184  */
 185
 186 /* Record a block which could be optimized. */
 187 void
 188 xchk_block_set_preen(
 189         struct xfs_scrub        *sc,
 190         struct xfs_buf          *bp)
 191 {
 192         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
 193         trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
 194 }
 195
 196 /*
 197  * Record an inode which could be optimized.  The trace data will
 198  * include the block given by bp if bp is given; otherwise it will use
 199  * the block location of the inode record itself.
 200  */
 201 void
 202 xchk_ino_set_preen(
 203         struct xfs_scrub        *sc,
 204         xfs_ino_t               ino)
 205 {
 206         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
 207         trace_xchk_ino_preen(sc, ino, __return_address);
 208 }
 209
 210 /* Record something being wrong with the filesystem primary superblock. */
 211 void
 212 xchk_set_corrupt(
 213         struct xfs_scrub        *sc)
 214 {
 215         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 216         trace_xchk_fs_error(sc, 0, __return_address);
 217 }
 218
 219 /* Record a corrupt block. */
 220 void
 221 xchk_block_set_corrupt(
 222         struct xfs_scrub        *sc,
 223         struct xfs_buf          *bp)
 224 {
 225         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 226         trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
 227 }
 228
 229 /* Record a corruption while cross-referencing. */
 230 void
 231 xchk_block_xref_set_corrupt(
 232         struct xfs_scrub        *sc,
 233         struct xfs_buf          *bp)
 234 {
 235         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 236         trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
 237 }
 238
 239 /*
 240  * Record a corrupt inode.  The trace data will include the block given
 241  * by bp if bp is given; otherwise it will use the block location of the
 242  * inode record itself.
 243  */
 244 void
 245 xchk_ino_set_corrupt(
 246         struct xfs_scrub        *sc,
 247         xfs_ino_t               ino)
 248 {
 249         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 250         trace_xchk_ino_error(sc, ino, __return_address);
 251 }
 252
 253 /* Record a corruption while cross-referencing with an inode. */
 254 void
 255 xchk_ino_xref_set_corrupt(
 256         struct xfs_scrub        *sc,
 257         xfs_ino_t               ino)
 258 {
 259         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 260         trace_xchk_ino_error(sc, ino, __return_address);
 261 }
 262
 263 /* Record corruption in a block indexed by a file fork. */
 264 void
 265 xchk_fblock_set_corrupt(
 266         struct xfs_scrub        *sc,
 267         int                     whichfork,
 268         xfs_fileoff_t           offset)
 269 {
 270         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 271         trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
 272 }
 273
 274 /* Record a corruption while cross-referencing a fork block. */
 275 void
 276 xchk_fblock_xref_set_corrupt(
 277         struct xfs_scrub        *sc,
 278         int                     whichfork,
 279         xfs_fileoff_t           offset)
 280 {
 281         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 282         trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
 283 }
 284
 285 /*
 286  * Warn about inodes that need administrative review but is not
 287  * incorrect.
 288  */
 289 void
 290 xchk_ino_set_warning(
 291         struct xfs_scrub        *sc,
 292         xfs_ino_t               ino)
 293 {
 294         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
 295         trace_xchk_ino_warning(sc, ino, __return_address);
 296 }
 297
 298 /* Warn about a block indexed by a file fork that needs review. */
 299 void
 300 xchk_fblock_set_warning(
 301         struct xfs_scrub        *sc,
 302         int                     whichfork,
 303         xfs_fileoff_t           offset)
 304 {
 305         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
 306         trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
 307 }
 308
 309 /* Signal an incomplete scrub. */
 310 void
 311 xchk_set_incomplete(
 312         struct xfs_scrub        *sc)
 313 {
 314         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
 315         trace_xchk_incomplete(sc, __return_address);
 316 }
 317
 318 /*
 319  * rmap scrubbing -- compute the number of blocks with a given owner,
 320  * at least according to the reverse mapping data.
 321  */
 322
/* Context handed to the rmap query callback that counts owned blocks. */
struct xchk_rmap_ownedby_info {
	/* Owner (and fork flags) whose rmap records should be counted. */
	const struct xfs_owner_info	*oinfo;
	/* Running total of matching blocks; points at the caller's counter. */
	xfs_filblks_t			*blocks;
};
 327
 328 STATIC int
 329 xchk_count_rmap_ownedby_irec(
 330         struct xfs_btree_cur            *cur,
 331         const struct xfs_rmap_irec      *rec,
 332         void                            *priv)
 333 {
 334         struct xchk_rmap_ownedby_info   *sroi = priv;
 335         bool                            irec_attr;
 336         bool                            oinfo_attr;
 337
 338         irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
 339         oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
 340
 341         if (rec->rm_owner != sroi->oinfo->oi_owner)
 342                 return 0;
 343
 344         if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
 345                 (*sroi->blocks) += rec->rm_blockcount;
 346
 347         return 0;
 348 }
 349
 350 /*
 351  * Calculate the number of blocks the rmap thinks are owned by something.
 352  * The caller should pass us an rmapbt cursor.
 353  */
 354 int
 355 xchk_count_rmap_ownedby_ag(
 356         struct xfs_scrub                *sc,
 357         struct xfs_btree_cur            *cur,
 358         const struct xfs_owner_info     *oinfo,
 359         xfs_filblks_t                   *blocks)
 360 {
 361         struct xchk_rmap_ownedby_info   sroi = {
 362                 .oinfo                  = oinfo,
 363                 .blocks                 = blocks,
 364         };
 365
 366         *blocks = 0;
 367         return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
 368                         &sroi);
 369 }
 370
 371 /*
 372  * AG scrubbing
 373  *
 374  * These helpers facilitate locking an allocation group's header
 375  * buffers, setting up cursors for all btrees that are present, and
 376  * cleaning everything up once we're through.
 377  */
 378
 379 /* Decide if we want to return an AG header read failure. */
 380 static inline bool
 381 want_ag_read_header_failure(
 382         struct xfs_scrub        *sc,
 383         unsigned int            type)
 384 {
 385         /* Return all AG header read failures when scanning btrees. */
 386         if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
 387             sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
 388             sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
 389                 return true;
 390         /*
 391          * If we're scanning a given type of AG header, we only want to
 392          * see read failures from that specific header.  We'd like the
 393          * other headers to cross-check them, but this isn't required.
 394          */
 395         if (sc->sm->sm_type == type)
 396                 return true;
 397         return false;
 398 }
 399
 400 /*
 401  * Grab the AG header buffers for the attached perag structure.
 402  *
 403  * The headers should be released by xchk_ag_free, but as a fail safe we attach
 404  * all the buffers we grab to the scrub transaction so they'll all be freed
 405  * when we cancel it.
 406  */
 407 static inline int
 408 xchk_perag_read_headers(
 409         struct xfs_scrub        *sc,
 410         struct xchk_ag          *sa)
 411 {
 412         int                     error;
 413
 414         error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
 415         if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
 416                 return error;
 417
 418         error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
 419         if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
 420                 return error;
 421
 422         return 0;
 423 }
 424
 425 /*
 426  * Grab the AG headers for the attached perag structure and wait for pending
 427  * intents to drain.
 428  */
 429 static int
 430 xchk_perag_drain_and_lock(
 431         struct xfs_scrub        *sc)
 432 {
 433         struct xchk_ag          *sa = &sc->sa;
 434         int                     error = 0;
 435
 436         ASSERT(sa->pag != NULL);
 437         ASSERT(sa->agi_bp == NULL);
 438         ASSERT(sa->agf_bp == NULL);
 439
 440         do {
 441                 if (xchk_should_terminate(sc, &error))
 442                         return error;
 443
 444                 error = xchk_perag_read_headers(sc, sa);
 445                 if (error)
 446                         return error;
 447
 448                 /*
 449                  * If we've grabbed an inode for scrubbing then we assume that
 450                  * holding its ILOCK will suffice to coordinate with any intent
 451                  * chains involving this inode.
 452                  */
 453                 if (sc->ip)
 454                         return 0;
 455
 456                 /*
 457                  * Decide if this AG is quiet enough for all metadata to be
 458                  * consistent with each other.  XFS allows the AG header buffer
 459                  * locks to cycle across transaction rolls while processing
 460                  * chains of deferred ops, which means that there could be
 461                  * other threads in the middle of processing a chain of
 462                  * deferred ops.  For regular operations we are careful about
 463                  * ordering operations to prevent collisions between threads
 464                  * (which is why we don't need a per-AG lock), but scrub and
 465                  * repair have to serialize against chained operations.
 466                  *
 467                  * We just locked all the AG headers buffers; now take a look
 468                  * to see if there are any intents in progress.  If there are,
 469                  * drop the AG headers and wait for the intents to drain.
 470                  * Since we hold all the AG header locks for the duration of
 471                  * the scrub, this is the only time we have to sample the
 472                  * intents counter; any threads increasing it after this point
 473                  * can't possibly be in the middle of a chain of AG metadata
 474                  * updates.
 475                  *
 476                  * Obviously, this should be slanted against scrub and in favor
 477                  * of runtime threads.
 478                  */
 479                 if (!xfs_perag_intent_busy(sa->pag))
 480                         return 0;
 481
 482                 if (sa->agf_bp) {
 483                         xfs_trans_brelse(sc->tp, sa->agf_bp);
 484                         sa->agf_bp = NULL;
 485                 }
 486
 487                 if (sa->agi_bp) {
 488                         xfs_trans_brelse(sc->tp, sa->agi_bp);
 489                         sa->agi_bp = NULL;
 490                 }
 491
 492                 if (!(sc->flags & XCHK_FSGATES_DRAIN))
 493                         return -ECHRNG;
 494                 error = xfs_perag_intent_drain(sa->pag);
 495                 if (error == -ERESTARTSYS)
 496                         error = -EINTR;
 497         } while (!error);
 498
 499         return error;
 500 }
 501
 502 /*
 503  * Grab the per-AG structure, grab all AG header buffers, and wait until there
 504  * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
 505  * structure.
 506  */
 507 int
 508 xchk_ag_read_headers(
 509         struct xfs_scrub        *sc,
 510         xfs_agnumber_t          agno,
 511         struct xchk_ag          *sa)
 512 {
 513         struct xfs_mount        *mp = sc->mp;
 514
 515         ASSERT(!sa->pag);
 516         sa->pag = xfs_perag_get(mp, agno);
 517         if (!sa->pag)
 518                 return -ENOENT;
 519
 520         return xchk_perag_drain_and_lock(sc);
 521 }
 522
 523 /* Release all the AG btree cursors. */
 524 void
 525 xchk_ag_btcur_free(
 526         struct xchk_ag          *sa)
 527 {
 528         if (sa->refc_cur)
 529                 xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
 530         if (sa->rmap_cur)
 531                 xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
 532         if (sa->fino_cur)
 533                 xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
 534         if (sa->ino_cur)
 535                 xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
 536         if (sa->cnt_cur)
 537                 xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
 538         if (sa->bno_cur)
 539                 xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
 540
 541         sa->refc_cur = NULL;
 542         sa->rmap_cur = NULL;
 543         sa->fino_cur = NULL;
 544         sa->ino_cur = NULL;
 545         sa->bno_cur = NULL;
 546         sa->cnt_cur = NULL;
 547 }
 548
 549 /* Initialize all the btree cursors for an AG. */
 550 void
 551 xchk_ag_btcur_init(
 552         struct xfs_scrub        *sc,
 553         struct xchk_ag          *sa)
 554 {
 555         struct xfs_mount        *mp = sc->mp;
 556
 557         if (sa->agf_bp &&
 558             xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
 559                 /* Set up a bnobt cursor for cross-referencing. */
 560                 sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
 561                                 sa->pag, XFS_BTNUM_BNO);
 562         }
 563
 564         if (sa->agf_bp &&
 565             xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
 566                 /* Set up a cntbt cursor for cross-referencing. */
 567                 sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
 568                                 sa->pag, XFS_BTNUM_CNT);
 569         }
 570
 571         /* Set up a inobt cursor for cross-referencing. */
 572         if (sa->agi_bp &&
 573             xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
 574                 sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
 575                                 XFS_BTNUM_INO);
 576         }
 577
 578         /* Set up a finobt cursor for cross-referencing. */
 579         if (sa->agi_bp && xfs_has_finobt(mp) &&
 580             xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
 581                 sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
 582                                 XFS_BTNUM_FINO);
 583         }
 584
 585         /* Set up a rmapbt cursor for cross-referencing. */
 586         if (sa->agf_bp && xfs_has_rmapbt(mp) &&
 587             xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
 588                 sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
 589                                 sa->pag);
 590         }
 591
 592         /* Set up a refcountbt cursor for cross-referencing. */
 593         if (sa->agf_bp && xfs_has_reflink(mp) &&
 594             xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
 595                 sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
 596                                 sa->agf_bp, sa->pag);
 597         }
 598 }
 599
 600 /* Release the AG header context and btree cursors. */
 601 void
 602 xchk_ag_free(
 603         struct xfs_scrub        *sc,
 604         struct xchk_ag          *sa)
 605 {
 606         xchk_ag_btcur_free(sa);
 607         if (sa->agf_bp) {
 608                 xfs_trans_brelse(sc->tp, sa->agf_bp);
 609                 sa->agf_bp = NULL;
 610         }
 611         if (sa->agi_bp) {
 612                 xfs_trans_brelse(sc->tp, sa->agi_bp);
 613                 sa->agi_bp = NULL;
 614         }
 615         if (sa->pag) {
 616                 xfs_perag_put(sa->pag);
 617                 sa->pag = NULL;
 618         }
 619 }
 620
 621 /*
 622  * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
 623  * order.  Locking order requires us to get the AGI before the AGF.  We use the
 624  * transaction to avoid deadlocking on crosslinked metadata buffers; either the
 625  * caller passes one in (bmap scrub) or we have to create a transaction
 626  * ourselves.  Returns ENOENT if the perag struct cannot be grabbed.
 627  */
 628 int
 629 xchk_ag_init(
 630         struct xfs_scrub        *sc,
 631         xfs_agnumber_t          agno,
 632         struct xchk_ag          *sa)
 633 {
 634         int                     error;
 635
 636         error = xchk_ag_read_headers(sc, agno, sa);
 637         if (error)
 638                 return error;
 639
 640         xchk_ag_btcur_init(sc, sa);
 641         return 0;
 642 }
 643
 644 /* Per-scrubber setup functions */
 645
 646 void
 647 xchk_trans_cancel(
 648         struct xfs_scrub        *sc)
 649 {
 650         xfs_trans_cancel(sc->tp);
 651         sc->tp = NULL;
 652 }
 653
 654 /*
 655  * Grab an empty transaction so that we can re-grab locked buffers if
 656  * one of our btrees turns out to be cyclic.
 657  *
 658  * If we're going to repair something, we need to ask for the largest possible
 659  * log reservation so that we can handle the worst case scenario for metadata
 660  * updates while rebuilding a metadata item.  We also need to reserve as many
 661  * blocks in the head transaction as we think we're going to need to rebuild
 662  * the metadata object.
 663  */
 664 int
 665 xchk_trans_alloc(
 666         struct xfs_scrub        *sc,
 667         uint                    resblks)
 668 {
 669         if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
 670                 return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
 671                                 resblks, 0, 0, &sc->tp);
 672
 673         return xfs_trans_alloc_empty(sc->mp, &sc->tp);
 674 }
 675
 676 /* Set us up with a transaction and an empty context. */
 677 int
 678 xchk_setup_fs(
 679         struct xfs_scrub        *sc)
 680 {
 681         uint                    resblks;
 682
 683         resblks = xrep_calc_ag_resblks(sc);
 684         return xchk_trans_alloc(sc, resblks);
 685 }
 686
 687 /* Set us up with AG headers and btree cursors. */
 688 int
 689 xchk_setup_ag_btree(
 690         struct xfs_scrub        *sc,
 691         bool                    force_log)
 692 {
 693         struct xfs_mount        *mp = sc->mp;
 694         int                     error;
 695
 696         /*
 697          * If the caller asks us to checkpont the log, do so.  This
 698          * expensive operation should be performed infrequently and only
 699          * as a last resort.  Any caller that sets force_log should
 700          * document why they need to do so.
 701          */
 702         if (force_log) {
 703                 error = xchk_checkpoint_log(mp);
 704                 if (error)
 705                         return error;
 706         }
 707
 708         error = xchk_setup_fs(sc);
 709         if (error)
 710                 return error;
 711
 712         return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
 713 }
 714
 715 /* Push everything out of the log onto disk. */
 716 int
 717 xchk_checkpoint_log(
 718         struct xfs_mount        *mp)
 719 {
 720         int                     error;
 721
 722         error = xfs_log_force(mp, XFS_LOG_SYNC);
 723         if (error)
 724                 return error;
 725         xfs_ail_push_all_sync(mp->m_ail);
 726         return 0;
 727 }
 728
 729 /* Verify that an inode is allocated ondisk, then return its cached inode. */
 730 int
 731 xchk_iget(
 732         struct xfs_scrub        *sc,
 733         xfs_ino_t               inum,
 734         struct xfs_inode        **ipp)
 735 {
 736         return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
 737 }
 738
 739 /*
 740  * Try to grab an inode in a manner that avoids races with physical inode
 741  * allocation.  If we can't, return the locked AGI buffer so that the caller
 742  * can single-step the loading process to see where things went wrong.
 743  * Callers must have a valid scrub transaction.
 744  *
 745  * If the iget succeeds, return 0, a NULL AGI, and the inode.
 746  *
 747  * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
 748  * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
 749  * no longer allocated; or any other corruption or runtime error.
 750  *
 751  * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
 752  *
 753  * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
 754  */
 755 int
 756 xchk_iget_agi(
 757         struct xfs_scrub        *sc,
 758         xfs_ino_t               inum,
 759         struct xfs_buf          **agi_bpp,
 760         struct xfs_inode        **ipp)
 761 {
 762         struct xfs_mount        *mp = sc->mp;
 763         struct xfs_trans        *tp = sc->tp;
 764         struct xfs_perag        *pag;
 765         int                     error;
 766
 767         ASSERT(sc->tp != NULL);
 768
 769 again:
 770         *agi_bpp = NULL;
 771         *ipp = NULL;
 772         error = 0;
 773
 774         if (xchk_should_terminate(sc, &error))
 775                 return error;
 776
 777         /*
 778          * Attach the AGI buffer to the scrub transaction to avoid deadlocks
 779          * in the iget cache miss path.
 780          */
 781         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 782         error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
 783         xfs_perag_put(pag);
 784         if (error)
 785                 return error;
 786
 787         error = xfs_iget(mp, tp, inum,
 788                         XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
 789         if (error == -EAGAIN) {
 790                 /*
 791                  * The inode may be in core but temporarily unavailable and may
 792                  * require the AGI buffer before it can be returned.  Drop the
 793                  * AGI buffer and retry the lookup.
 794                  *
 795                  * Incore lookup will fail with EAGAIN on a cache hit if the
 796                  * inode is queued to the inactivation list.  The inactivation
 797                  * worker may remove the inode from the unlinked list and hence
 798                  * needs the AGI.
 799                  *
 800                  * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
 801                  * to allow inodegc to make progress and move the inode to
 802                  * IRECLAIMABLE state where xfs_iget will be able to return it
 803                  * again if it can lock the inode.
 804                  */
 805                 xfs_trans_brelse(tp, *agi_bpp);
 806                 delay(1);
 807                 goto again;
 808         }
 809         if (error)
 810                 return error;
 811
 812         /* We got the inode, so we can release the AGI. */
 813         ASSERT(*ipp != NULL);
 814         xfs_trans_brelse(tp, *agi_bpp);
 815         *agi_bpp = NULL;
 816         return 0;
 817 }
 818
 819 /* Install an inode that we opened by handle for scrubbing. */
 820 int
 821 xchk_install_handle_inode(
 822         struct xfs_scrub        *sc,
 823         struct xfs_inode        *ip)
 824 {
 825         if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
 826                 xchk_irele(sc, ip);
 827                 return -ENOENT;
 828         }
 829
 830         sc->ip = ip;
 831         return 0;
 832 }
 833
 834 /*
 835  * Install an already-referenced inode for scrubbing.  Get our own reference to
 836  * the inode to make disposal simpler.  The inode must not be in I_FREEING or
 837  * I_WILL_FREE state!
 838  */
 839 int
 840 xchk_install_live_inode(
 841         struct xfs_scrub        *sc,
 842         struct xfs_inode        *ip)
 843 {
 844         if (!igrab(VFS_I(ip))) {
 845                 xchk_ino_set_corrupt(sc, ip->i_ino);
 846                 return -EFSCORRUPTED;
 847         }
 848
 849         sc->ip = ip;
 850         return 0;
 851 }
 852
 853 /*
 854  * In preparation to scrub metadata structures that hang off of an inode,
 855  * grab either the inode referenced in the scrub control structure or the
 856  * inode passed in.  If the inumber does not reference an allocated inode
 857  * record, the function returns ENOENT to end the scrub early.  The inode
 858  * is not locked.
 859  */
 860 int
 861 xchk_iget_for_scrubbing(
 862         struct xfs_scrub        *sc)
 863 {
 864         struct xfs_imap         imap;
 865         struct xfs_mount        *mp = sc->mp;
 866         struct xfs_perag        *pag;
 867         struct xfs_buf          *agi_bp;
 868         struct xfs_inode        *ip_in = XFS_I(file_inode(sc->file));
 869         struct xfs_inode        *ip = NULL;
 870         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
 871         int                     error;
 872
 873         ASSERT(sc->tp == NULL);
 874
 875         /* We want to scan the inode we already had opened. */
 876         if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
 877                 return xchk_install_live_inode(sc, ip_in);
 878
 879         /* Reject internal metadata files and obviously bad inode numbers. */
 880         if (xfs_internal_inum(mp, sc->sm->sm_ino))
 881                 return -ENOENT;
 882         if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
 883                 return -ENOENT;
 884
 885         /* Try a regular untrusted iget. */
 886         error = xchk_iget(sc, sc->sm->sm_ino, &ip);
 887         if (!error)
 888                 return xchk_install_handle_inode(sc, ip);
 889         if (error == -ENOENT)
 890                 return error;
 891         if (error != -EINVAL)
 892                 goto out_error;
 893
 894         /*
 895          * EINVAL with IGET_UNTRUSTED probably means one of several things:
 896          * userspace gave us an inode number that doesn't correspond to fs
 897          * space; the inode btree lacks a record for this inode; or there is a
 898          * record, and it says this inode is free.
 899          *
 900          * We want to look up this inode in the inobt to distinguish two
 901          * scenarios: (1) the inobt says the inode is free, in which case
 902          * there's nothing to do; and (2) the inobt says the inode is
 903          * allocated, but loading it failed due to corruption.
 904          *
 905          * Allocate a transaction and grab the AGI to prevent inobt activity
 906          * in this AG.  Retry the iget in case someone allocated a new inode
 907          * after the first iget failed.
 908          */
 909         error = xchk_trans_alloc(sc, 0);
 910         if (error)
 911                 goto out_error;
 912
 913         error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
 914         if (error == 0) {
 915                 /* Actually got the inode, so install it. */
 916                 xchk_trans_cancel(sc);
 917                 return xchk_install_handle_inode(sc, ip);
 918         }
 919         if (error == -ENOENT)
 920                 goto out_gone;
 921         if (error != -EINVAL)
 922                 goto out_cancel;
 923
 924         /* Ensure that we have protected against inode allocation/freeing. */
 925         if (agi_bp == NULL) {
 926                 ASSERT(agi_bp != NULL);
 927                 error = -ECANCELED;
 928                 goto out_cancel;
 929         }
 930
 931         /*
 932          * Untrusted iget failed a second time.  Let's try an inobt lookup.
 933          * If the inobt thinks this the inode neither can exist inside the
 934          * filesystem nor is allocated, return ENOENT to signal that the check
 935          * can be skipped.
 936          *
 937          * If the lookup returns corruption, we'll mark this inode corrupt and
 938          * exit to userspace.  There's little chance of fixing anything until
 939          * the inobt is straightened out, but there's nothing we can do here.
 940          *
 941          * If the lookup encounters any other error, exit to userspace.
 942          *
 943          * If the lookup succeeds, something else must be very wrong in the fs
 944          * such that setting up the incore inode failed in some strange way.
 945          * Treat those as corruptions.
 946          */
 947         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
 948         if (!pag) {
 949                 error = -EFSCORRUPTED;
 950                 goto out_cancel;
 951         }
 952
 953         error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
 954                         XFS_IGET_UNTRUSTED);
 955         xfs_perag_put(pag);
 956         if (error == -EINVAL || error == -ENOENT)
 957                 goto out_gone;
 958         if (!error)
 959                 error = -EFSCORRUPTED;
 960
 961 out_cancel:
 962         xchk_trans_cancel(sc);
 963 out_error:
 964         trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
 965                         error, __return_address);
 966         return error;
 967 out_gone:
 968         /* The file is gone, so there's nothing to check. */
 969         xchk_trans_cancel(sc);
 970         return -ENOENT;
 971 }
 972
 973 /* Release an inode, possibly dropping it in the process. */
 974 void
 975 xchk_irele(
 976         struct xfs_scrub        *sc,
 977         struct xfs_inode        *ip)
 978 {
 979         if (current->journal_info != NULL) {
 980                 ASSERT(current->journal_info == sc->tp);
 981
 982                 /*
 983                  * If we are in a transaction, we /cannot/ drop the inode
 984                  * ourselves, because the VFS will trigger writeback, which
 985                  * can require a transaction.  Clear DONTCACHE to force the
 986                  * inode to the LRU, where someone else can take care of
 987                  * dropping it.
 988                  *
 989                  * Note that when we grabbed our reference to the inode, it
 990                  * could have had an active ref and DONTCACHE set if a sysadmin
 991                  * is trying to coerce a change in file access mode.  icache
 992                  * hits do not clear DONTCACHE, so we must do it here.
 993                  */
 994                 spin_lock(&VFS_I(ip)->i_lock);
 995                 VFS_I(ip)->i_state &= ~I_DONTCACHE;
 996                 spin_unlock(&VFS_I(ip)->i_lock);
 997         } else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
 998                 /*
 999                  * If this is the last reference to the inode and the caller
1000                  * permits it, set DONTCACHE to avoid thrashing.
1001                  */
1002                 d_mark_dontcache(VFS_I(ip));
1003         }
1004
1005         xfs_irele(ip);
1006 }
1007
1008 /*
1009  * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
1010  * this to operate on user-accessible regular file data because the MMAPLOCK is
1011  * not taken.
1012  */
1013 int
1014 xchk_setup_inode_contents(
1015         struct xfs_scrub        *sc,
1016         unsigned int            resblks)
1017 {
1018         int                     error;
1019
1020         error = xchk_iget_for_scrubbing(sc);
1021         if (error)
1022                 return error;
1023
1024         /* Lock the inode so the VFS cannot touch this file. */
1025         xchk_ilock(sc, XFS_IOLOCK_EXCL);
1026
1027         error = xchk_trans_alloc(sc, resblks);
1028         if (error)
1029                 goto out;
1030         xchk_ilock(sc, XFS_ILOCK_EXCL);
1031 out:
1032         /* scrub teardown will unlock and release the inode for us */
1033         return error;
1034 }
1035
1036 void
1037 xchk_ilock(
1038         struct xfs_scrub        *sc,
1039         unsigned int            ilock_flags)
1040 {
1041         xfs_ilock(sc->ip, ilock_flags);
1042         sc->ilock_flags |= ilock_flags;
1043 }
1044
1045 bool
1046 xchk_ilock_nowait(
1047         struct xfs_scrub        *sc,
1048         unsigned int            ilock_flags)
1049 {
1050         if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1051                 sc->ilock_flags |= ilock_flags;
1052                 return true;
1053         }
1054
1055         return false;
1056 }
1057
1058 void
1059 xchk_iunlock(
1060         struct xfs_scrub        *sc,
1061         unsigned int            ilock_flags)
1062 {
1063         sc->ilock_flags &= ~ilock_flags;
1064         xfs_iunlock(sc->ip, ilock_flags);
1065 }
1066
1067 /*
1068  * Predicate that decides if we need to evaluate the cross-reference check.
1069  * If there was an error accessing the cross-reference btree, just delete
1070  * the cursor and skip the check.
1071  */
1072 bool
1073 xchk_should_check_xref(
1074         struct xfs_scrub        *sc,
1075         int                     *error,
1076         struct xfs_btree_cur    **curpp)
1077 {
1078         /* No point in xref if we already know we're corrupt. */
1079         if (xchk_skip_xref(sc->sm))
1080                 return false;
1081
1082         if (*error == 0)
1083                 return true;
1084
1085         if (curpp) {
1086                 /* If we've already given up on xref, just bail out. */
1087                 if (!*curpp)
1088                         return false;
1089
1090                 /* xref error, delete cursor and bail out. */
1091                 xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1092                 *curpp = NULL;
1093         }
1094
1095         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1096         trace_xchk_xref_error(sc, *error, __return_address);
1097
1098         /*
1099          * Errors encountered during cross-referencing with another
1100          * data structure should not cause this scrubber to abort.
1101          */
1102         *error = 0;
1103         return false;
1104 }
1105
1106 /* Run the structure verifiers on in-memory buffers to detect bad memory. */
1107 void
1108 xchk_buffer_recheck(
1109         struct xfs_scrub        *sc,
1110         struct xfs_buf          *bp)
1111 {
1112         xfs_failaddr_t          fa;
1113
1114         if (bp->b_ops == NULL) {
1115                 xchk_block_set_corrupt(sc, bp);
1116                 return;
1117         }
1118         if (bp->b_ops->verify_struct == NULL) {
1119                 xchk_set_incomplete(sc);
1120                 return;
1121         }
1122         fa = bp->b_ops->verify_struct(bp);
1123         if (!fa)
1124                 return;
1125         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1126         trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1127 }
1128
1129 static inline int
1130 xchk_metadata_inode_subtype(
1131         struct xfs_scrub        *sc,
1132         unsigned int            scrub_type)
1133 {
1134         __u32                   smtype = sc->sm->sm_type;
1135         int                     error;
1136
1137         sc->sm->sm_type = scrub_type;
1138
1139         switch (scrub_type) {
1140         case XFS_SCRUB_TYPE_INODE:
1141                 error = xchk_inode(sc);
1142                 break;
1143         case XFS_SCRUB_TYPE_BMBTD:
1144                 error = xchk_bmap_data(sc);
1145                 break;
1146         default:
1147                 ASSERT(0);
1148                 error = -EFSCORRUPTED;
1149                 break;
1150         }
1151
1152         sc->sm->sm_type = smtype;
1153         return error;
1154 }
1155
1156 /*
1157  * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
1158  * pointed to by sc->ip and the ILOCK must be held.
1159  */
1160 int
1161 xchk_metadata_inode_forks(
1162         struct xfs_scrub        *sc)
1163 {
1164         bool                    shared;
1165         int                     error;
1166
1167         if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1168                 return 0;
1169
1170         /* Check the inode record. */
1171         error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1172         if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1173                 return error;
1174
1175         /* Metadata inodes don't live on the rt device. */
1176         if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1177                 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1178                 return 0;
1179         }
1180
1181         /* They should never participate in reflink. */
1182         if (xfs_is_reflink_inode(sc->ip)) {
1183                 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1184                 return 0;
1185         }
1186
1187         /* They also should never have extended attributes. */
1188         if (xfs_inode_hasattr(sc->ip)) {
1189                 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1190                 return 0;
1191         }
1192
1193         /* Invoke the data fork scrubber. */
1194         error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1195         if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1196                 return error;
1197
1198         /* Look for incorrect shared blocks. */
1199         if (xfs_has_reflink(sc->mp)) {
1200                 error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1201                                 &shared);
1202                 if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1203                                 &error))
1204                         return error;
1205                 if (shared)
1206                         xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1207         }
1208
1209         return 0;
1210 }
1211
1212 /*
1213  * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1214  * operation.  Callers must not hold any locks that intersect with the CPU
1215  * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1216  * to change kernel code.
1217  */
1218 void
1219 xchk_fsgates_enable(
1220         struct xfs_scrub        *sc,
1221         unsigned int            scrub_fsgates)
1222 {
1223         ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1224         ASSERT(!(sc->flags & scrub_fsgates));
1225
1226         trace_xchk_fsgates_enable(sc, scrub_fsgates);
1227
1228         if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1229                 xfs_drain_wait_enable();
1230
1231         sc->flags |= scrub_fsgates;
1232 }
1233
1234 /*
1235  * Decide if this is this a cached inode that's also allocated.  The caller
1236  * must hold a reference to an AG and the AGI buffer lock to prevent inodes
1237  * from being allocated or freed.
1238  *
1239  * Look up an inode by number in the given file system.  If the inode number
1240  * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
1241  * If the inode is being reclaimed, return -ENODATA because we know the inode
1242  * cache cannot be updating the ondisk metadata.
1243  *
1244  * Otherwise, the incore inode is the one we want, and it is either live,
1245  * somewhere in the inactivation machinery, or reclaimable.  The inode is
1246  * allocated if i_mode is nonzero.  In all three cases, the cached inode will
1247  * be more up to date than the ondisk inode buffer, so we must use the incore
1248  * i_mode.
1249  */
1250 int
1251 xchk_inode_is_allocated(
1252         struct xfs_scrub        *sc,
1253         xfs_agino_t             agino,
1254         bool                    *inuse)
1255 {
1256         struct xfs_mount        *mp = sc->mp;
1257         struct xfs_perag        *pag = sc->sa.pag;
1258         xfs_ino_t               ino;
1259         struct xfs_inode        *ip;
1260         int                     error;
1261
1262         /* caller must hold perag reference */
1263         if (pag == NULL) {
1264                 ASSERT(pag != NULL);
1265                 return -EINVAL;
1266         }
1267
1268         /* caller must have AGI buffer */
1269         if (sc->sa.agi_bp == NULL) {
1270                 ASSERT(sc->sa.agi_bp != NULL);
1271                 return -EINVAL;
1272         }
1273
1274         /* reject inode numbers outside existing AGs */
1275         ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1276         if (!xfs_verify_ino(mp, ino))
1277                 return -EINVAL;
1278
1279         error = -ENODATA;
1280         rcu_read_lock();
1281         ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1282         if (!ip) {
1283                 /* cache miss */
1284                 goto out_rcu;
1285         }
1286
1287         /*
1288          * If the inode number doesn't match, the incore inode got reused
1289          * during an RCU grace period and the radix tree hasn't been updated.
1290          * This isn't the inode we want.
1291          */
1292         spin_lock(&ip->i_flags_lock);
1293         if (ip->i_ino != ino)
1294                 goto out_skip;
1295
1296         trace_xchk_inode_is_allocated(ip);
1297
1298         /*
1299          * We have an incore inode that matches the inode we want, and the
1300          * caller holds the perag structure and the AGI buffer.  Let's check
1301          * our assumptions below:
1302          */
1303
1304 #ifdef DEBUG
1305         /*
1306          * (1) If the incore inode is live (i.e. referenced from the dcache),
1307          * it will not be INEW, nor will it be in the inactivation or reclaim
1308          * machinery.  The ondisk inode had better be allocated.  This is the
1309          * most trivial case.
1310          */
1311         if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
1312                              XFS_INACTIVATING))) {
1313                 /* live inode */
1314                 ASSERT(VFS_I(ip)->i_mode != 0);
1315         }
1316
1317         /*
1318          * If the incore inode is INEW, there are several possibilities:
1319          *
1320          * (2) For a file that is being created, note that we allocate the
1321          * ondisk inode before allocating, initializing, and adding the incore
1322          * inode to the radix tree.
1323          *
1324          * (3) If the incore inode is being recycled, the inode has to be
1325          * allocated because we don't allow freed inodes to be recycled.
1326          * Recycling doesn't touch i_mode.
1327          */
1328         if (ip->i_flags & XFS_INEW) {
1329                 /* created on disk already or recycling */
1330                 ASSERT(VFS_I(ip)->i_mode != 0);
1331         }
1332
1333         /*
1334          * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
1335          * inactivation has not started (!INACTIVATING), it is still allocated.
1336          */
1337         if ((ip->i_flags & XFS_NEED_INACTIVE) &&
1338             !(ip->i_flags & XFS_INACTIVATING)) {
1339                 /* definitely before difree */
1340                 ASSERT(VFS_I(ip)->i_mode != 0);
1341         }
1342 #endif
1343
1344         /*
1345          * If the incore inode is undergoing inactivation (INACTIVATING), there
1346          * are two possibilities:
1347          *
1348          * (5) It is before the point where it would get freed ondisk, in which
1349          * case i_mode is still nonzero.
1350          *
1351          * (6) It has already been freed, in which case i_mode is zero.
1352          *
1353          * We don't take the ILOCK here, but difree and dialloc update the AGI,
1354          * and we've taken the AGI buffer lock, which prevents that from
1355          * happening.
1356          */
1357
1358         /*
1359          * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
1360          * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
1361          * reflects the ondisk state.
1362          */
1363
1364         /*
1365          * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
1366          * the flush code uses i_mode to format the ondisk inode.
1367          */
1368
1369         /*
1370          * (9) If the inode is in IRECLAIM and was reachable via the radix
1371          * tree, it still has the same i_mode as it did before it entered
1372          * reclaim.  The inode object is still alive because we hold the RCU
1373          * read lock.
1374          */
1375
1376         *inuse = VFS_I(ip)->i_mode != 0;
1377         error = 0;
1378
1379 out_skip:
1380         spin_unlock(&ip->i_flags_lock);
1381 out_rcu:
1382         rcu_read_unlock();
1383         return error;
1384 }