fs/xfs/xfs_log_recover.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_format.h"
  21 #include "xfs_bit.h"
  22 #include "xfs_log.h"
  23 #include "xfs_inum.h"
  24 #include "xfs_trans.h"
  25 #include "xfs_sb.h"
  26 #include "xfs_ag.h"
  27 #include "xfs_mount.h"
  28 #include "xfs_error.h"
  29 #include "xfs_bmap_btree.h"
  30 #include "xfs_alloc_btree.h"
  31 #include "xfs_ialloc_btree.h"
  32 #include "xfs_btree.h"
  33 #include "xfs_dinode.h"
  34 #include "xfs_inode.h"
  35 #include "xfs_inode_item.h"
  36 #include "xfs_alloc.h"
  37 #include "xfs_ialloc.h"
  38 #include "xfs_log_priv.h"
  39 #include "xfs_buf_item.h"
  40 #include "xfs_log_recover.h"
  41 #include "xfs_extfree_item.h"
  42 #include "xfs_trans_priv.h"
  43 #include "xfs_quota.h"
  44 #include "xfs_cksum.h"
  45 #include "xfs_trace.h"
  46 #include "xfs_icache.h"
  47 #include "xfs_icreate_item.h"
  48
  49 /* Need all the magic numbers and buffer ops structures from these headers */
  50 #include "xfs_symlink.h"
  51 #include "xfs_da_btree.h"
  52 #include "xfs_dir2_format.h"
  53 #include "xfs_dir2.h"
  54 #include "xfs_attr_leaf.h"
  55 #include "xfs_attr_remote.h"
  56
  57 #define BLK_AVG(blk1, blk2)     ((blk1+blk2) >> 1)
  58
  59 STATIC int
  60 xlog_find_zeroed(
  61         struct xlog     *,
  62         xfs_daddr_t     *);
  63 STATIC int
  64 xlog_clear_stale_blocks(
  65         struct xlog     *,
  66         xfs_lsn_t);
  67 #if defined(DEBUG)
  68 STATIC void
  69 xlog_recover_check_summary(
  70         struct xlog *);
  71 #else
  72 #define xlog_recover_check_summary(log)
  73 #endif
  74
/*
 * This structure is used during recovery to record the buf log items which
 * have been canceled and should not be replayed.
 *
 * NOTE(review): the per-field notes below are inferred from the field names
 * and types only; the code that fills them in is not in this part of the
 * file -- confirm against the users of this structure.
 */
struct xfs_buf_cancel {
        xfs_daddr_t             bc_blkno;       /* presumably disk address of the canceled buffer */
        uint                    bc_len;         /* presumably length of the canceled buffer */
        int                     bc_refcount;    /* presumably count of cancellations seen */
        struct list_head        bc_list;        /* list linkage */
};
  85
  86 /*
  87  * Sector aligned buffer routines for buffer create/read/write/access
  88  */
  89
  90 /*
  91  * Verify the given count of basic blocks is valid number of blocks
  92  * to specify for an operation involving the given XFS log buffer.
  93  * Returns nonzero if the count is valid, 0 otherwise.
  94  */
  95
  96 static inline int
  97 xlog_buf_bbcount_valid(
  98         struct xlog     *log,
  99         int             bbcount)
 100 {
 101         return bbcount > 0 && bbcount <= log->l_logBBsize;
 102 }
 103
 104 /*
 105  * Allocate a buffer to hold log data.  The buffer needs to be able
 106  * to map to a range of nbblks basic blocks at any valid (basic
 107  * block) offset within the log.
 108  */
 109 STATIC xfs_buf_t *
 110 xlog_get_bp(
 111         struct xlog     *log,
 112         int             nbblks)
 113 {
 114         struct xfs_buf  *bp;
 115
 116         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 117                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 118                         nbblks);
 119                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 120                 return NULL;
 121         }
 122
 123         /*
 124          * We do log I/O in units of log sectors (a power-of-2
 125          * multiple of the basic block size), so we round up the
 126          * requested size to accommodate the basic blocks required
 127          * for complete log sectors.
 128          *
 129          * In addition, the buffer may be used for a non-sector-
 130          * aligned block offset, in which case an I/O of the
 131          * requested size could extend beyond the end of the
 132          * buffer.  If the requested size is only 1 basic block it
 133          * will never straddle a sector boundary, so this won't be
 134          * an issue.  Nor will this be a problem if the log I/O is
 135          * done in basic blocks (sector size 1).  But otherwise we
 136          * extend the buffer by one extra log sector to ensure
 137          * there's space to accommodate this possibility.
 138          */
 139         if (nbblks > 1 && log->l_sectBBsize > 1)
 140                 nbblks += log->l_sectBBsize;
 141         nbblks = round_up(nbblks, log->l_sectBBsize);
 142
 143         bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
 144         if (bp)
 145                 xfs_buf_unlock(bp);
 146         return bp;
 147 }
 148
 149 STATIC void
 150 xlog_put_bp(
 151         xfs_buf_t       *bp)
 152 {
 153         xfs_buf_free(bp);
 154 }
 155
 156 /*
 157  * Return the address of the start of the given block number's data
 158  * in a log buffer.  The buffer covers a log sector-aligned region.
 159  */
 160 STATIC xfs_caddr_t
 161 xlog_align(
 162         struct xlog     *log,
 163         xfs_daddr_t     blk_no,
 164         int             nbblks,
 165         struct xfs_buf  *bp)
 166 {
 167         xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 168
 169         ASSERT(offset + nbblks <= bp->b_length);
 170         return bp->b_addr + BBTOB(offset);
 171 }
 172
 173
 174 /*
 175  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 176  */
 177 STATIC int
 178 xlog_bread_noalign(
 179         struct xlog     *log,
 180         xfs_daddr_t     blk_no,
 181         int             nbblks,
 182         struct xfs_buf  *bp)
 183 {
 184         int             error;
 185
 186         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 187                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 188                         nbblks);
 189                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 190                 return EFSCORRUPTED;
 191         }
 192
 193         blk_no = round_down(blk_no, log->l_sectBBsize);
 194         nbblks = round_up(nbblks, log->l_sectBBsize);
 195
 196         ASSERT(nbblks > 0);
 197         ASSERT(nbblks <= bp->b_length);
 198
 199         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 200         XFS_BUF_READ(bp);
 201         bp->b_io_length = nbblks;
 202         bp->b_error = 0;
 203
 204         xfsbdstrat(log->l_mp, bp);
 205         error = xfs_buf_iowait(bp);
 206         if (error)
 207                 xfs_buf_ioerror_alert(bp, __func__);
 208         return error;
 209 }
 210
 211 STATIC int
 212 xlog_bread(
 213         struct xlog     *log,
 214         xfs_daddr_t     blk_no,
 215         int             nbblks,
 216         struct xfs_buf  *bp,
 217         xfs_caddr_t     *offset)
 218 {
 219         int             error;
 220
 221         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 222         if (error)
 223                 return error;
 224
 225         *offset = xlog_align(log, blk_no, nbblks, bp);
 226         return 0;
 227 }
 228
 229 /*
 230  * Read at an offset into the buffer. Returns with the buffer in it's original
 231  * state regardless of the result of the read.
 232  */
 233 STATIC int
 234 xlog_bread_offset(
 235         struct xlog     *log,
 236         xfs_daddr_t     blk_no,         /* block to read from */
 237         int             nbblks,         /* blocks to read */
 238         struct xfs_buf  *bp,
 239         xfs_caddr_t     offset)
 240 {
 241         xfs_caddr_t     orig_offset = bp->b_addr;
 242         int             orig_len = BBTOB(bp->b_length);
 243         int             error, error2;
 244
 245         error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
 246         if (error)
 247                 return error;
 248
 249         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 250
 251         /* must reset buffer pointer even on error */
 252         error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
 253         if (error)
 254                 return error;
 255         return error2;
 256 }
 257
 258 /*
 259  * Write out the buffer at the given block for the given number of blocks.
 260  * The buffer is kept locked across the write and is returned locked.
 261  * This can only be used for synchronous log writes.
 262  */
 263 STATIC int
 264 xlog_bwrite(
 265         struct xlog     *log,
 266         xfs_daddr_t     blk_no,
 267         int             nbblks,
 268         struct xfs_buf  *bp)
 269 {
 270         int             error;
 271
 272         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 273                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 274                         nbblks);
 275                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 276                 return EFSCORRUPTED;
 277         }
 278
 279         blk_no = round_down(blk_no, log->l_sectBBsize);
 280         nbblks = round_up(nbblks, log->l_sectBBsize);
 281
 282         ASSERT(nbblks > 0);
 283         ASSERT(nbblks <= bp->b_length);
 284
 285         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 286         XFS_BUF_ZEROFLAGS(bp);
 287         xfs_buf_hold(bp);
 288         xfs_buf_lock(bp);
 289         bp->b_io_length = nbblks;
 290         bp->b_error = 0;
 291
 292         error = xfs_bwrite(bp);
 293         if (error)
 294                 xfs_buf_ioerror_alert(bp, __func__);
 295         xfs_buf_relse(bp);
 296         return error;
 297 }
 298
 299 #ifdef DEBUG
 300 /*
 301  * dump debug superblock and log record information
 302  */
 303 STATIC void
 304 xlog_header_check_dump(
 305         xfs_mount_t             *mp,
 306         xlog_rec_header_t       *head)
 307 {
 308         xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d\n",
 309                 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
 310         xfs_debug(mp, "    log : uuid = %pU, fmt = %d\n",
 311                 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 312 }
 313 #else
 314 #define xlog_header_check_dump(mp, head)
 315 #endif
 316
 317 /*
 318  * check log record header for recovery
 319  */
 320 STATIC int
 321 xlog_header_check_recover(
 322         xfs_mount_t             *mp,
 323         xlog_rec_header_t       *head)
 324 {
 325         ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 326
 327         /*
 328          * IRIX doesn't write the h_fmt field and leaves it zeroed
 329          * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
 330          * a dirty log created in IRIX.
 331          */
 332         if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
 333                 xfs_warn(mp,
 334         "dirty log written in incompatible format - can't recover");
 335                 xlog_header_check_dump(mp, head);
 336                 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
 337                                  XFS_ERRLEVEL_HIGH, mp);
 338                 return XFS_ERROR(EFSCORRUPTED);
 339         } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 340                 xfs_warn(mp,
 341         "dirty log entry has mismatched uuid - can't recover");
 342                 xlog_header_check_dump(mp, head);
 343                 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
 344                                  XFS_ERRLEVEL_HIGH, mp);
 345                 return XFS_ERROR(EFSCORRUPTED);
 346         }
 347         return 0;
 348 }
 349
 350 /*
 351  * read the head block of the log and check the header
 352  */
 353 STATIC int
 354 xlog_header_check_mount(
 355         xfs_mount_t             *mp,
 356         xlog_rec_header_t       *head)
 357 {
 358         ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 359
 360         if (uuid_is_nil(&head->h_fs_uuid)) {
 361                 /*
 362                  * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
 363                  * h_fs_uuid is nil, we assume this log was last mounted
 364                  * by IRIX and continue.
 365                  */
 366                 xfs_warn(mp, "nil uuid in log - IRIX style log");
 367         } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 368                 xfs_warn(mp, "log has mismatched uuid - can't recover");
 369                 xlog_header_check_dump(mp, head);
 370                 XFS_ERROR_REPORT("xlog_header_check_mount",
 371                                  XFS_ERRLEVEL_HIGH, mp);
 372                 return XFS_ERROR(EFSCORRUPTED);
 373         }
 374         return 0;
 375 }
 376
/*
 * Buffer I/O completion handler for log recovery.
 * NOTE(review): presumably installed as bp->b_iodone by the recovery code
 * that submits buffers -- the installation site is not in this part of the
 * file, confirm.
 *
 * If the buffer carries an error, the error is reported and the filesystem
 * is force-shut-down with SHUTDOWN_META_IO_ERROR.  In all cases the
 * b_iodone callback is cleared and completion is passed on to
 * xfs_buf_ioend().
 */
STATIC void
xlog_recover_iodone(
        struct xfs_buf  *bp)
{
        if (bp->b_error) {
                /*
                 * We're not going to bother about retrying
                 * this during recovery. One strike!
                 */
                xfs_buf_ioerror_alert(bp, __func__);
                xfs_force_shutdown(bp->b_target->bt_mount,
                                        SHUTDOWN_META_IO_ERROR);
        }
        /* clear the callback before handing the buffer to xfs_buf_ioend() */
        bp->b_iodone = NULL;
        xfs_buf_ioend(bp, 0);
}
 393
 394 /*
 395  * This routine finds (to an approximation) the first block in the physical
 396  * log which contains the given cycle.  It uses a binary search algorithm.
 397  * Note that the algorithm can not be perfect because the disk will not
 398  * necessarily be perfect.
 399  */
 400 STATIC int
 401 xlog_find_cycle_start(
 402         struct xlog     *log,
 403         struct xfs_buf  *bp,
 404         xfs_daddr_t     first_blk,
 405         xfs_daddr_t     *last_blk,
 406         uint            cycle)
 407 {
 408         xfs_caddr_t     offset;
 409         xfs_daddr_t     mid_blk;
 410         xfs_daddr_t     end_blk;
 411         uint            mid_cycle;
 412         int             error;
 413
 414         end_blk = *last_blk;
 415         mid_blk = BLK_AVG(first_blk, end_blk);
 416         while (mid_blk != first_blk && mid_blk != end_blk) {
 417                 error = xlog_bread(log, mid_blk, 1, bp, &offset);
 418                 if (error)
 419                         return error;
 420                 mid_cycle = xlog_get_cycle(offset);
 421                 if (mid_cycle == cycle)
 422                         end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 423                 else
 424                         first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 425                 mid_blk = BLK_AVG(first_blk, end_blk);
 426         }
 427         ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 428                (mid_blk == end_blk && mid_blk-1 == first_blk));
 429
 430         *last_blk = end_blk;
 431
 432         return 0;
 433 }
 434
 435 /*
 436  * Check that a range of blocks does not contain stop_on_cycle_no.
 437  * Fill in *new_blk with the block offset where such a block is
 438  * found, or with -1 (an invalid block number) if there is no such
 439  * block in the range.  The scan needs to occur from front to back
 440  * and the pointer into the region must be updated since a later
 441  * routine will need to perform another test.
 442  */
 443 STATIC int
 444 xlog_find_verify_cycle(
 445         struct xlog     *log,
 446         xfs_daddr_t     start_blk,
 447         int             nbblks,
 448         uint            stop_on_cycle_no,
 449         xfs_daddr_t     *new_blk)
 450 {
 451         xfs_daddr_t     i, j;
 452         uint            cycle;
 453         xfs_buf_t       *bp;
 454         xfs_daddr_t     bufblks;
 455         xfs_caddr_t     buf = NULL;
 456         int             error = 0;
 457
 458         /*
 459          * Greedily allocate a buffer big enough to handle the full
 460          * range of basic blocks we'll be examining.  If that fails,
 461          * try a smaller size.  We need to be able to read at least
 462          * a log sector, or we're out of luck.
 463          */
 464         bufblks = 1 << ffs(nbblks);
 465         while (bufblks > log->l_logBBsize)
 466                 bufblks >>= 1;
 467         while (!(bp = xlog_get_bp(log, bufblks))) {
 468                 bufblks >>= 1;
 469                 if (bufblks < log->l_sectBBsize)
 470                         return ENOMEM;
 471         }
 472
 473         for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 474                 int     bcount;
 475
 476                 bcount = min(bufblks, (start_blk + nbblks - i));
 477
 478                 error = xlog_bread(log, i, bcount, bp, &buf);
 479                 if (error)
 480                         goto out;
 481
 482                 for (j = 0; j < bcount; j++) {
 483                         cycle = xlog_get_cycle(buf);
 484                         if (cycle == stop_on_cycle_no) {
 485                                 *new_blk = i+j;
 486                                 goto out;
 487                         }
 488
 489                         buf += BBSIZE;
 490                 }
 491         }
 492
 493         *new_blk = -1;
 494
 495 out:
 496         xlog_put_bp(bp);
 497         return error;
 498 }
 499
 500 /*
 501  * Potentially backup over partial log record write.
 502  *
 503  * In the typical case, last_blk is the number of the block directly after
 504  * a good log record.  Therefore, we subtract one to get the block number
 505  * of the last block in the given buffer.  extra_bblks contains the number
 506  * of blocks we would have read on a previous read.  This happens when the
 507  * last log record is split over the end of the physical log.
 508  *
 509  * extra_bblks is the number of blocks potentially verified on a previous
 510  * call to this routine.
 511  */
 512 STATIC int
 513 xlog_find_verify_log_record(
 514         struct xlog             *log,
 515         xfs_daddr_t             start_blk,
 516         xfs_daddr_t             *last_blk,
 517         int                     extra_bblks)
 518 {
 519         xfs_daddr_t             i;
 520         xfs_buf_t               *bp;
 521         xfs_caddr_t             offset = NULL;
 522         xlog_rec_header_t       *head = NULL;
 523         int                     error = 0;
 524         int                     smallmem = 0;
 525         int                     num_blks = *last_blk - start_blk;
 526         int                     xhdrs;
 527
 528         ASSERT(start_blk != 0 || *last_blk != start_blk);
 529
 530         if (!(bp = xlog_get_bp(log, num_blks))) {
 531                 if (!(bp = xlog_get_bp(log, 1)))
 532                         return ENOMEM;
 533                 smallmem = 1;
 534         } else {
 535                 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
 536                 if (error)
 537                         goto out;
 538                 offset += ((num_blks - 1) << BBSHIFT);
 539         }
 540
 541         for (i = (*last_blk) - 1; i >= 0; i--) {
 542                 if (i < start_blk) {
 543                         /* valid log record not found */
 544                         xfs_warn(log->l_mp,
 545                 "Log inconsistent (didn't find previous header)");
 546                         ASSERT(0);
 547                         error = XFS_ERROR(EIO);
 548                         goto out;
 549                 }
 550
 551                 if (smallmem) {
 552                         error = xlog_bread(log, i, 1, bp, &offset);
 553                         if (error)
 554                                 goto out;
 555                 }
 556
 557                 head = (xlog_rec_header_t *)offset;
 558
 559                 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 560                         break;
 561
 562                 if (!smallmem)
 563                         offset -= BBSIZE;
 564         }
 565
 566         /*
 567          * We hit the beginning of the physical log & still no header.  Return
 568          * to caller.  If caller can handle a return of -1, then this routine
 569          * will be called again for the end of the physical log.
 570          */
 571         if (i == -1) {
 572                 error = -1;
 573                 goto out;
 574         }
 575
 576         /*
 577          * We have the final block of the good log (the first block
 578          * of the log record _before_ the head. So we check the uuid.
 579          */
 580         if ((error = xlog_header_check_mount(log->l_mp, head)))
 581                 goto out;
 582
 583         /*
 584          * We may have found a log record header before we expected one.
 585          * last_blk will be the 1st block # with a given cycle #.  We may end
 586          * up reading an entire log record.  In this case, we don't want to
 587          * reset last_blk.  Only when last_blk points in the middle of a log
 588          * record do we update last_blk.
 589          */
 590         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 591                 uint    h_size = be32_to_cpu(head->h_size);
 592
 593                 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 594                 if (h_size % XLOG_HEADER_CYCLE_SIZE)
 595                         xhdrs++;
 596         } else {
 597                 xhdrs = 1;
 598         }
 599
 600         if (*last_blk - i + extra_bblks !=
 601             BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 602                 *last_blk = i;
 603
 604 out:
 605         xlog_put_bp(bp);
 606         return error;
 607 }
 608
 609 /*
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 612  * eliminated when calculating the head.  We aren't guaranteed that previous
 613  * LR have complete transactions.  We only know that a cycle number of
 614  * current cycle number -1 won't be present in the log if we start writing
 615  * from our current block number.
 616  *
 617  * last_blk contains the block number of the first block with a given
 618  * cycle number.
 619  *
 620  * Return: zero if normal, non-zero if error.
 621  */
 622 STATIC int
 623 xlog_find_head(
 624         struct xlog     *log,
 625         xfs_daddr_t     *return_head_blk)
 626 {
 627         xfs_buf_t       *bp;
 628         xfs_caddr_t     offset;
 629         xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
 630         int             num_scan_bblks;
 631         uint            first_half_cycle, last_half_cycle;
 632         uint            stop_on_cycle;
 633         int             error, log_bbnum = log->l_logBBsize;
 634
 635         /* Is the end of the log device zeroed? */
 636         if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
 637                 *return_head_blk = first_blk;
 638
 639                 /* Is the whole lot zeroed? */
 640                 if (!first_blk) {
 641                         /* Linux XFS shouldn't generate totally zeroed logs -
 642                          * mkfs etc write a dummy unmount record to a fresh
 643                          * log so we can store the uuid in there
 644                          */
 645                         xfs_warn(log->l_mp, "totally zeroed log");
 646                 }
 647
 648                 return 0;
 649         } else if (error) {
 650                 xfs_warn(log->l_mp, "empty log check failed");
 651                 return error;
 652         }
 653
 654         first_blk = 0;                  /* get cycle # of 1st block */
 655         bp = xlog_get_bp(log, 1);
 656         if (!bp)
 657                 return ENOMEM;
 658
 659         error = xlog_bread(log, 0, 1, bp, &offset);
 660         if (error)
 661                 goto bp_err;
 662
 663         first_half_cycle = xlog_get_cycle(offset);
 664
 665         last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
 666         error = xlog_bread(log, last_blk, 1, bp, &offset);
 667         if (error)
 668                 goto bp_err;
 669
 670         last_half_cycle = xlog_get_cycle(offset);
 671         ASSERT(last_half_cycle != 0);
 672
 673         /*
 674          * If the 1st half cycle number is equal to the last half cycle number,
 675          * then the entire log is stamped with the same cycle number.  In this
 676          * case, head_blk can't be set to zero (which makes sense).  The below
 677          * math doesn't work out properly with head_blk equal to zero.  Instead,
 678          * we set it to log_bbnum which is an invalid block number, but this
 679          * value makes the math correct.  If head_blk doesn't changed through
 680          * all the tests below, *head_blk is set to zero at the very end rather
 681          * than log_bbnum.  In a sense, log_bbnum and zero are the same block
 682          * in a circular file.
 683          */
 684         if (first_half_cycle == last_half_cycle) {
 685                 /*
 686                  * In this case we believe that the entire log should have
 687                  * cycle number last_half_cycle.  We need to scan backwards
 688                  * from the end verifying that there are no holes still
 689                  * containing last_half_cycle - 1.  If we find such a hole,
 690                  * then the start of that hole will be the new head.  The
 691                  * simple case looks like
 692                  *        x | x ... | x - 1 | x
 693                  * Another case that fits this picture would be
 694                  *        x | x + 1 | x ... | x
 695                  * In this case the head really is somewhere at the end of the
 696                  * log, as one of the latest writes at the beginning was
 697                  * incomplete.
 698                  * One more case is
 699                  *        x | x + 1 | x ... | x - 1 | x
 700                  * This is really the combination of the above two cases, and
 701                  * the head has to end up at the start of the x-1 hole at the
 702                  * end of the log.
 703                  *
 704                  * In the 256k log case, we will read from the beginning to the
 705                  * end of the log and search for cycle numbers equal to x-1.
 706                  * We don't worry about the x+1 blocks that we encounter,
 707                  * because we know that they cannot be the head since the log
 708                  * started with x.
 709                  */
 710                 head_blk = log_bbnum;
 711                 stop_on_cycle = last_half_cycle - 1;
 712         } else {
 713                 /*
 714                  * In this case we want to find the first block with cycle
 715                  * number matching last_half_cycle.  We expect the log to be
 716                  * some variation on
 717                  *        x + 1 ... | x ... | x
 718                  * The first block with cycle number x (last_half_cycle) will
 719                  * be where the new head belongs.  First we do a binary search
 720                  * for the first occurrence of last_half_cycle.  The binary
 721                  * search may not be totally accurate, so then we scan back
 722                  * from there looking for occurrences of last_half_cycle before
 723                  * us.  If that backwards scan wraps around the beginning of
 724                  * the log, then we look for occurrences of last_half_cycle - 1
 725                  * at the end of the log.  The cases we're looking for look
 726                  * like
 727                  *                               v binary search stopped here
 728                  *        x + 1 ... | x | x + 1 | x ... | x
 729                  *                   ^ but we want to locate this spot
 730                  * or
 731                  *        <---------> less than scan distance
 732                  *        x + 1 ... | x ... | x - 1 | x
 733                  *                           ^ we want to locate this spot
 734                  */
 735                 stop_on_cycle = last_half_cycle;
 736                 if ((error = xlog_find_cycle_start(log, bp, first_blk,
 737                                                 &head_blk, last_half_cycle)))
 738                         goto bp_err;
 739         }
 740
 741         /*
 742          * Now validate the answer.  Scan back some number of maximum possible
 743          * blocks and make sure each one has the expected cycle number.  The
 744          * maximum is determined by the total possible amount of buffering
 745          * in the in-core log.  The following number can be made tighter if
 746          * we actually look at the block size of the filesystem.
 747          */
 748         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 749         if (head_blk >= num_scan_bblks) {
 750                 /*
 751                  * We are guaranteed that the entire check can be performed
 752                  * in one buffer.
 753                  */
 754                 start_blk = head_blk - num_scan_bblks;
 755                 if ((error = xlog_find_verify_cycle(log,
 756                                                 start_blk, num_scan_bblks,
 757                                                 stop_on_cycle, &new_blk)))
 758                         goto bp_err;
 759                 if (new_blk != -1)
 760                         head_blk = new_blk;
 761         } else {                /* need to read 2 parts of log */
 762                 /*
 763                  * We are going to scan backwards in the log in two parts.
 764                  * First we scan the physical end of the log.  In this part
 765                  * of the log, we are looking for blocks with cycle number
 766                  * last_half_cycle - 1.
 767                  * If we find one, then we know that the log starts there, as
 768                  * we've found a hole that didn't get written in going around
 769                  * the end of the physical log.  The simple case for this is
 770                  *        x + 1 ... | x ... | x - 1 | x
 771                  *        <---------> less than scan distance
 772                  * If all of the blocks at the end of the log have cycle number
 773                  * last_half_cycle, then we check the blocks at the start of
 774                  * the log looking for occurrences of last_half_cycle.  If we
 775                  * find one, then our current estimate for the location of the
 776                  * first occurrence of last_half_cycle is wrong and we move
 777                  * back to the hole we've found.  This case looks like
 778                  *        x + 1 ... | x | x + 1 | x ...
 779                  *                               ^ binary search stopped here
 780                  * Another case we need to handle that only occurs in 256k
 781                  * logs is
 782                  *        x + 1 ... | x ... | x+1 | x ...
 783                  *                   ^ binary search stops here
 784                  * In a 256k log, the scan at the end of the log will see the
 785                  * x + 1 blocks.  We need to skip past those since that is
 786                  * certainly not the head of the log.  By searching for
 787                  * last_half_cycle-1 we accomplish that.
 788                  */
 789                 ASSERT(head_blk <= INT_MAX &&
 790                         (xfs_daddr_t) num_scan_bblks >= head_blk);
 791                 start_blk = log_bbnum - (num_scan_bblks - head_blk);
 792                 if ((error = xlog_find_verify_cycle(log, start_blk,
 793                                         num_scan_bblks - (int)head_blk,
 794                                         (stop_on_cycle - 1), &new_blk)))
 795                         goto bp_err;
 796                 if (new_blk != -1) {
 797                         head_blk = new_blk;
 798                         goto validate_head;
 799                 }
 800
 801                 /*
 802                  * Scan beginning of log now.  The last part of the physical
 803                  * log is good.  This scan needs to verify that it doesn't find
 804                  * the last_half_cycle.
 805                  */
 806                 start_blk = 0;
 807                 ASSERT(head_blk <= INT_MAX);
 808                 if ((error = xlog_find_verify_cycle(log,
 809                                         start_blk, (int)head_blk,
 810                                         stop_on_cycle, &new_blk)))
 811                         goto bp_err;
 812                 if (new_blk != -1)
 813                         head_blk = new_blk;
 814         }
 815
 816 validate_head:
 817         /*
 818          * Now we need to make sure head_blk is not pointing to a block in
 819          * the middle of a log record.
 820          */
 821         num_scan_bblks = XLOG_REC_SHIFT(log);
 822         if (head_blk >= num_scan_bblks) {
 823                 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 824
 825                 /* start ptr at last block ptr before head_blk */
 826                 if ((error = xlog_find_verify_log_record(log, start_blk,
 827                                                         &head_blk, 0)) == -1) {
 828                         error = XFS_ERROR(EIO);
 829                         goto bp_err;
 830                 } else if (error)
 831                         goto bp_err;
 832         } else {
 833                 start_blk = 0;
 834                 ASSERT(head_blk <= INT_MAX);
 835                 if ((error = xlog_find_verify_log_record(log, start_blk,
 836                                                         &head_blk, 0)) == -1) {
 837                         /* We hit the beginning of the log during our search */
 838                         start_blk = log_bbnum - (num_scan_bblks - head_blk);
 839                         new_blk = log_bbnum;
 840                         ASSERT(start_blk <= INT_MAX &&
 841                                 (xfs_daddr_t) log_bbnum-start_blk >= 0);
 842                         ASSERT(head_blk <= INT_MAX);
 843                         if ((error = xlog_find_verify_log_record(log,
 844                                                         start_blk, &new_blk,
 845                                                         (int)head_blk)) == -1) {
 846                                 error = XFS_ERROR(EIO);
 847                                 goto bp_err;
 848                         } else if (error)
 849                                 goto bp_err;
 850                         if (new_blk != log_bbnum)
 851                                 head_blk = new_blk;
 852                 } else if (error)
 853                         goto bp_err;
 854         }
 855
 856         xlog_put_bp(bp);
 857         if (head_blk == log_bbnum)
 858                 *return_head_blk = 0;
 859         else
 860                 *return_head_blk = head_blk;
 861         /*
 862          * When returning here, we have a good block number.  Bad block
 863          * means that during a previous crash, we didn't have a clean break
 864          * from cycle number N to cycle number N-1.  In this case, we need
 865          * to find the first block with cycle number N-1.
 866          */
 867         return 0;
 868
 869  bp_err:
 870         xlog_put_bp(bp);
 871
 872         if (error)
 873                 xfs_warn(log->l_mp, "failed to find log head");
 874         return error;
 875 }
 876
 877 /*
 878  * Find the sync block number or the tail of the log.
 879  *
 880  * This will be the block number of the last record to have its
 881  * associated buffers synced to disk.  Every log record header has
 882  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 883  * to get a sync block number.  The only concern is to figure out which
 884  * log record header to believe.
 885  *
 886  * The following algorithm uses the log record header with the largest
 887  * lsn.  The entire log record does not need to be valid.  We only care
 888  * that the header is valid.
 889  *
 890  * We could speed up search by using current head_blk buffer, but it is not
 891  * available.
 892  */
 893 STATIC int
 894 xlog_find_tail(
 895         struct xlog             *log,
 896         xfs_daddr_t             *head_blk,
 897         xfs_daddr_t             *tail_blk)
 898 {
 899         xlog_rec_header_t       *rhead;
 900         xlog_op_header_t        *op_head;
 901         xfs_caddr_t             offset = NULL;
 902         xfs_buf_t               *bp;
 903         int                     error, i, found;
 904         xfs_daddr_t             umount_data_blk;
 905         xfs_daddr_t             after_umount_blk;
 906         xfs_lsn_t               tail_lsn;
 907         int                     hblks;
 908
 909         found = 0;
 910
 911         /*
 912          * Find previous log record
 913          */
 914         if ((error = xlog_find_head(log, head_blk)))
 915                 return error;
 916
 917         bp = xlog_get_bp(log, 1);
 918         if (!bp)
 919                 return ENOMEM;
 920         if (*head_blk == 0) {                           /* special case */
 921                 error = xlog_bread(log, 0, 1, bp, &offset);
 922                 if (error)
 923                         goto done;
 924
 925                 if (xlog_get_cycle(offset) == 0) {
 926                         *tail_blk = 0;
 927                         /* leave all other log inited values alone */
 928                         goto done;
 929                 }
 930         }
 931
 932         /*
 933          * Search backwards looking for log record header block
 934          */
 935         ASSERT(*head_blk < INT_MAX);
 936         for (i = (int)(*head_blk) - 1; i >= 0; i--) {
 937                 error = xlog_bread(log, i, 1, bp, &offset);
 938                 if (error)
 939                         goto done;
 940
 941                 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 942                         found = 1;
 943                         break;
 944                 }
 945         }
 946         /*
 947          * If we haven't found the log record header block, start looking
 948          * again from the end of the physical log.  XXXmiken: There should be
 949          * a check here to make sure we didn't search more than N blocks in
 950          * the previous code.
 951          */
 952         if (!found) {
 953                 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
 954                         error = xlog_bread(log, i, 1, bp, &offset);
 955                         if (error)
 956                                 goto done;
 957
 958                         if (*(__be32 *)offset ==
 959                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 960                                 found = 2;
 961                                 break;
 962                         }
 963                 }
 964         }
 965         if (!found) {
 966                 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
 967                 ASSERT(0);
 968                 return XFS_ERROR(EIO);
 969         }
 970
 971         /* find blk_no of tail of log */
 972         rhead = (xlog_rec_header_t *)offset;
 973         *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 974
 975         /*
 976          * Reset log values according to the state of the log when we
 977          * crashed.  In the case where head_blk == 0, we bump curr_cycle
 978          * one because the next write starts a new cycle rather than
 979          * continuing the cycle of the last good log record.  At this
 980          * point we have guaranteed that all partial log records have been
 981          * accounted for.  Therefore, we know that the last good log record
 982          * written was complete and ended exactly on the end boundary
 983          * of the physical log.
 984          */
 985         log->l_prev_block = i;
 986         log->l_curr_block = (int)*head_blk;
 987         log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
 988         if (found == 2)
 989                 log->l_curr_cycle++;
 990         atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
 991         atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
 992         xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
 993                                         BBTOB(log->l_curr_block));
 994         xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
 995                                         BBTOB(log->l_curr_block));
 996
 997         /*
 998          * Look for unmount record.  If we find it, then we know there
 999          * was a clean unmount.  Since 'i' could be the last block in
1000          * the physical log, we convert to a log block before comparing
1001          * to the head_blk.
1002          *
1003          * Save the current tail lsn to use to pass to
1004          * xlog_clear_stale_blocks() below.  We won't want to clear the
1005          * unmount record if there is one, so we pass the lsn of the
1006          * unmount record rather than the block after it.
1007          */
1008         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1009                 int     h_size = be32_to_cpu(rhead->h_size);
1010                 int     h_version = be32_to_cpu(rhead->h_version);
1011
1012                 if ((h_version & XLOG_VERSION_2) &&
1013                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1014                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1015                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
1016                                 hblks++;
1017                 } else {
1018                         hblks = 1;
1019                 }
1020         } else {
1021                 hblks = 1;
1022         }
1023         after_umount_blk = (i + hblks + (int)
1024                 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
1025         tail_lsn = atomic64_read(&log->l_tail_lsn);
1026         if (*head_blk == after_umount_blk &&
1027             be32_to_cpu(rhead->h_num_logops) == 1) {
1028                 umount_data_blk = (i + hblks) % log->l_logBBsize;
1029                 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1030                 if (error)
1031                         goto done;
1032
1033                 op_head = (xlog_op_header_t *)offset;
1034                 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1035                         /*
1036                          * Set tail and last sync so that newly written
1037                          * log records will point recovery to after the
1038                          * current unmount record.
1039                          */
1040                         xlog_assign_atomic_lsn(&log->l_tail_lsn,
1041                                         log->l_curr_cycle, after_umount_blk);
1042                         xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1043                                         log->l_curr_cycle, after_umount_blk);
1044                         *tail_blk = after_umount_blk;
1045
1046                         /*
1047                          * Note that the unmount was clean. If the unmount
1048                          * was not clean, we need to know this to rebuild the
1049                          * superblock counters from the perag headers if we
1050                          * have a filesystem using non-persistent counters.
1051                          */
1052                         log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1053                 }
1054         }
1055
1056         /*
1057          * Make sure that there are no blocks in front of the head
1058          * with the same cycle number as the head.  This can happen
1059          * because we allow multiple outstanding log writes concurrently,
1060          * and the later writes might make it out before earlier ones.
1061          *
1062          * We use the lsn from before modifying it so that we'll never
1063          * overwrite the unmount record after a clean unmount.
1064          *
1065          * Do this only if we are going to recover the filesystem
1066          *
1067          * NOTE: This used to say "if (!readonly)"
1068          * However on Linux, we can & do recover a read-only filesystem.
1069          * We only skip recovery if NORECOVERY is specified on mount,
1070          * in which case we would not be here.
1071          *
1072          * But... if the -device- itself is readonly, just skip this.
1073          * We can't recover this device anyway, so it won't matter.
1074          */
1075         if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1076                 error = xlog_clear_stale_blocks(log, tail_lsn);
1077
1078 done:
1079         xlog_put_bp(bp);
1080
1081         if (error)
1082                 xfs_warn(log->l_mp, "failed to locate log tail");
1083         return error;
1084 }
1085
1086 /*
1087  * Is the log zeroed at all?
1088  *
1089  * The last binary search should be changed to perform an X block read
1090  * once X becomes small enough.  You can then search linearly through
1091  * the X blocks.  This will cut down on the number of reads we need to do.
1092  *
1093  * If the log is partially zeroed, this routine will pass back the blkno
1094  * of the first block with cycle number 0.  It won't have a complete LR
1095  * preceding it.
1096  *
1097  * Return:
1098  *      0  => the log is completely written to
1099  *      -1 => use *blk_no as the first block of the log
1100  *      >0 => error has occurred
1101  */
1102 STATIC int
1103 xlog_find_zeroed(
1104         struct xlog     *log,
1105         xfs_daddr_t     *blk_no)
1106 {
1107         xfs_buf_t       *bp;
1108         xfs_caddr_t     offset;
1109         uint            first_cycle, last_cycle;
1110         xfs_daddr_t     new_blk, last_blk, start_blk;
1111         xfs_daddr_t     num_scan_bblks;
1112         int             error, log_bbnum = log->l_logBBsize;
1113
1114         *blk_no = 0;
1115
1116         /* check totally zeroed log */
1117         bp = xlog_get_bp(log, 1);
1118         if (!bp)
1119                 return ENOMEM;
1120         error = xlog_bread(log, 0, 1, bp, &offset);
1121         if (error)
1122                 goto bp_err;
1123
1124         first_cycle = xlog_get_cycle(offset);
1125         if (first_cycle == 0) {         /* completely zeroed log */
1126                 *blk_no = 0;
1127                 xlog_put_bp(bp);
1128                 return -1;
1129         }
1130
1131         /* check partially zeroed log */
1132         error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1133         if (error)
1134                 goto bp_err;
1135
1136         last_cycle = xlog_get_cycle(offset);
1137         if (last_cycle != 0) {          /* log completely written to */
1138                 xlog_put_bp(bp);
1139                 return 0;
1140         } else if (first_cycle != 1) {
1141                 /*
1142                  * If the cycle of the last block is zero, the cycle of
1143                  * the first block must be 1. If it's not, maybe we're
1144                  * not looking at a log... Bail out.
1145                  */
1146                 xfs_warn(log->l_mp,
1147                         "Log inconsistent or not a log (last==0, first!=1)");
1148                 return XFS_ERROR(EINVAL);
1149         }
1150
1151         /* we have a partially zeroed log */
1152         last_blk = log_bbnum-1;
1153         if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1154                 goto bp_err;
1155
1156         /*
1157          * Validate the answer.  Because there is no way to guarantee that
1158          * the entire log is made up of log records which are the same size,
1159          * we scan over the defined maximum blocks.  At this point, the maximum
1160          * is not chosen to mean anything special.   XXXmiken
1161          */
1162         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1163         ASSERT(num_scan_bblks <= INT_MAX);
1164
1165         if (last_blk < num_scan_bblks)
1166                 num_scan_bblks = last_blk;
1167         start_blk = last_blk - num_scan_bblks;
1168
1169         /*
1170          * We search for any instances of cycle number 0 that occur before
1171          * our current estimate of the head.  What we're trying to detect is
1172          *        1 ... | 0 | 1 | 0...
1173          *                       ^ binary search ends here
1174          */
1175         if ((error = xlog_find_verify_cycle(log, start_blk,
1176                                          (int)num_scan_bblks, 0, &new_blk)))
1177                 goto bp_err;
1178         if (new_blk != -1)
1179                 last_blk = new_blk;
1180
1181         /*
1182          * Potentially backup over partial log record write.  We don't need
1183          * to search the end of the log because we know it is zero.
1184          */
1185         if ((error = xlog_find_verify_log_record(log, start_blk,
1186                                 &last_blk, 0)) == -1) {
1187             error = XFS_ERROR(EIO);
1188             goto bp_err;
1189         } else if (error)
1190             goto bp_err;
1191
1192         *blk_no = last_blk;
1193 bp_err:
1194         xlog_put_bp(bp);
1195         if (error)
1196                 return error;
1197         return -1;
1198 }
1199
1200 /*
1201  * These are simple subroutines used by xlog_clear_stale_blocks() below
1202  * to initialize a buffer full of empty log record headers and write
1203  * them into the log.
1204  */
1205 STATIC void
1206 xlog_add_record(
1207         struct xlog             *log,
1208         xfs_caddr_t             buf,
1209         int                     cycle,
1210         int                     block,
1211         int                     tail_cycle,
1212         int                     tail_block)
1213 {
1214         xlog_rec_header_t       *recp = (xlog_rec_header_t *)buf;
1215
1216         memset(buf, 0, BBSIZE);
1217         recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1218         recp->h_cycle = cpu_to_be32(cycle);
1219         recp->h_version = cpu_to_be32(
1220                         xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1221         recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1222         recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1223         recp->h_fmt = cpu_to_be32(XLOG_FMT);
1224         memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1225 }
1226
1227 STATIC int
1228 xlog_write_log_records(
1229         struct xlog     *log,
1230         int             cycle,
1231         int             start_block,
1232         int             blocks,
1233         int             tail_cycle,
1234         int             tail_block)
1235 {
1236         xfs_caddr_t     offset;
1237         xfs_buf_t       *bp;
1238         int             balign, ealign;
1239         int             sectbb = log->l_sectBBsize;
1240         int             end_block = start_block + blocks;
1241         int             bufblks;
1242         int             error = 0;
1243         int             i, j = 0;
1244
1245         /*
1246          * Greedily allocate a buffer big enough to handle the full
1247          * range of basic blocks to be written.  If that fails, try
1248          * a smaller size.  We need to be able to write at least a
1249          * log sector, or we're out of luck.
1250          */
1251         bufblks = 1 << ffs(blocks);
1252         while (bufblks > log->l_logBBsize)
1253                 bufblks >>= 1;
1254         while (!(bp = xlog_get_bp(log, bufblks))) {
1255                 bufblks >>= 1;
1256                 if (bufblks < sectbb)
1257                         return ENOMEM;
1258         }
1259
1260         /* We may need to do a read at the start to fill in part of
1261          * the buffer in the starting sector not covered by the first
1262          * write below.
1263          */
1264         balign = round_down(start_block, sectbb);
1265         if (balign != start_block) {
1266                 error = xlog_bread_noalign(log, start_block, 1, bp);
1267                 if (error)
1268                         goto out_put_bp;
1269
1270                 j = start_block - balign;
1271         }
1272
1273         for (i = start_block; i < end_block; i += bufblks) {
1274                 int             bcount, endcount;
1275
1276                 bcount = min(bufblks, end_block - start_block);
1277                 endcount = bcount - j;
1278
1279                 /* We may need to do a read at the end to fill in part of
1280                  * the buffer in the final sector not covered by the write.
1281                  * If this is the same sector as the above read, skip it.
1282                  */
1283                 ealign = round_down(end_block, sectbb);
1284                 if (j == 0 && (start_block + endcount > ealign)) {
1285                         offset = bp->b_addr + BBTOB(ealign - start_block);
1286                         error = xlog_bread_offset(log, ealign, sectbb,
1287                                                         bp, offset);
1288                         if (error)
1289                                 break;
1290
1291                 }
1292
1293                 offset = xlog_align(log, start_block, endcount, bp);
1294                 for (; j < endcount; j++) {
1295                         xlog_add_record(log, offset, cycle, i+j,
1296                                         tail_cycle, tail_block);
1297                         offset += BBSIZE;
1298                 }
1299                 error = xlog_bwrite(log, start_block, endcount, bp);
1300                 if (error)
1301                         break;
1302                 start_block += endcount;
1303                 j = 0;
1304         }
1305
1306  out_put_bp:
1307         xlog_put_bp(bp);
1308         return error;
1309 }
1310
1311 /*
1312  * This routine is called to blow away any incomplete log writes out
1313  * in front of the log head.  We do this so that we won't become confused
1314  * if we come up, write only a little bit more, and then crash again.
1315  * If we leave the partial log records out there, this situation could
1316  * cause us to think those partial writes are valid blocks since they
1317  * have the current cycle number.  We get rid of them by overwriting them
1318  * with empty log records with the old cycle number rather than the
1319  * current one.
1320  *
1321  * The tail lsn is passed in rather than taken from
1322  * the log so that we will not write over the unmount record after a
1323  * clean unmount in a 512 block log.  Doing so would leave the log without
1324  * any valid log records in it until a new one was written.  If we crashed
1325  * during that time we would not be able to recover.
1326  */
1327 STATIC int
1328 xlog_clear_stale_blocks(
1329         struct xlog     *log,
1330         xfs_lsn_t       tail_lsn)
1331 {
1332         int             tail_cycle, head_cycle;
1333         int             tail_block, head_block;
1334         int             tail_distance, max_distance;
1335         int             distance;
1336         int             error;
1337
1338         tail_cycle = CYCLE_LSN(tail_lsn);
1339         tail_block = BLOCK_LSN(tail_lsn);
1340         head_cycle = log->l_curr_cycle;
1341         head_block = log->l_curr_block;
1342
1343         /*
1344          * Figure out the distance between the new head of the log
1345          * and the tail.  We want to write over any blocks beyond the
1346          * head that we may have written just before the crash, but
1347          * we don't want to overwrite the tail of the log.
1348          */
1349         if (head_cycle == tail_cycle) {
1350                 /*
1351                  * The tail is behind the head in the physical log,
1352                  * so the distance from the head to the tail is the
1353                  * distance from the head to the end of the log plus
1354                  * the distance from the beginning of the log to the
1355                  * tail.
1356                  */
1357                 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1358                         XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1359                                          XFS_ERRLEVEL_LOW, log->l_mp);
1360                         return XFS_ERROR(EFSCORRUPTED);
1361                 }
1362                 tail_distance = tail_block + (log->l_logBBsize - head_block);
1363         } else {
1364                 /*
1365                  * The head is behind the tail in the physical log,
1366                  * so the distance from the head to the tail is just
1367                  * the tail block minus the head block.
1368                  */
1369                 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1370                         XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1371                                          XFS_ERRLEVEL_LOW, log->l_mp);
1372                         return XFS_ERROR(EFSCORRUPTED);
1373                 }
1374                 tail_distance = tail_block - head_block;
1375         }
1376
1377         /*
1378          * If the head is right up against the tail, we can't clear
1379          * anything.
1380          */
1381         if (tail_distance <= 0) {
1382                 ASSERT(tail_distance == 0);
1383                 return 0;
1384         }
1385
1386         max_distance = XLOG_TOTAL_REC_SHIFT(log);
1387         /*
1388          * Take the smaller of the maximum amount of outstanding I/O
1389          * we could have and the distance to the tail to clear out.
1390          * We take the smaller so that we don't overwrite the tail and
1391          * we don't waste all day writing from the head to the tail
1392          * for no reason.
1393          */
1394         max_distance = MIN(max_distance, tail_distance);
1395
1396         if ((head_block + max_distance) <= log->l_logBBsize) {
1397                 /*
1398                  * We can stomp all the blocks we need to without
1399                  * wrapping around the end of the log.  Just do it
1400                  * in a single write.  Use the cycle number of the
1401                  * current cycle minus one so that the log will look like:
1402                  *     n ... | n - 1 ...
1403                  */
1404                 error = xlog_write_log_records(log, (head_cycle - 1),
1405                                 head_block, max_distance, tail_cycle,
1406                                 tail_block);
1407                 if (error)
1408                         return error;
1409         } else {
1410                 /*
1411                  * We need to wrap around the end of the physical log in
1412                  * order to clear all the blocks.  Do it in two separate
1413                  * I/Os.  The first write should be from the head to the
1414                  * end of the physical log, and it should use the current
1415                  * cycle number minus one just like above.
1416                  */
1417                 distance = log->l_logBBsize - head_block;
1418                 error = xlog_write_log_records(log, (head_cycle - 1),
1419                                 head_block, distance, tail_cycle,
1420                                 tail_block);
1421
1422                 if (error)
1423                         return error;
1424
1425                 /*
1426                  * Now write the blocks at the start of the physical log.
1427                  * This writes the remainder of the blocks we want to clear.
1428                  * It uses the current cycle number since we're now on the
1429                  * same cycle as the head so that we get:
1430                  *    n ... n ... | n - 1 ...
1431                  *    ^^^^^ blocks we're writing
1432                  */
1433                 distance = max_distance - (log->l_logBBsize - head_block);
1434                 error = xlog_write_log_records(log, head_cycle, 0, distance,
1435                                 tail_cycle, tail_block);
1436                 if (error)
1437                         return error;
1438         }
1439
1440         return 0;
1441 }
1442
1443 /******************************************************************************
1444  *
1445  *              Log recover routines
1446  *
1447  ******************************************************************************
1448  */
1449
1450 STATIC xlog_recover_t *
1451 xlog_recover_find_tid(
1452         struct hlist_head       *head,
1453         xlog_tid_t              tid)
1454 {
1455         xlog_recover_t          *trans;
1456
1457         hlist_for_each_entry(trans, head, r_list) {
1458                 if (trans->r_log_tid == tid)
1459                         return trans;
1460         }
1461         return NULL;
1462 }
1463
1464 STATIC void
1465 xlog_recover_new_tid(
1466         struct hlist_head       *head,
1467         xlog_tid_t              tid,
1468         xfs_lsn_t               lsn)
1469 {
1470         xlog_recover_t          *trans;
1471
1472         trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1473         trans->r_log_tid   = tid;
1474         trans->r_lsn       = lsn;
1475         INIT_LIST_HEAD(&trans->r_itemq);
1476
1477         INIT_HLIST_NODE(&trans->r_list);
1478         hlist_add_head(&trans->r_list, head);
1479 }
1480
1481 STATIC void
1482 xlog_recover_add_item(
1483         struct list_head        *head)
1484 {
1485         xlog_recover_item_t     *item;
1486
1487         item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1488         INIT_LIST_HEAD(&item->ri_list);
1489         list_add_tail(&item->ri_list, head);
1490 }
1491
1492 STATIC int
1493 xlog_recover_add_to_cont_trans(
1494         struct xlog             *log,
1495         struct xlog_recover     *trans,
1496         xfs_caddr_t             dp,
1497         int                     len)
1498 {
1499         xlog_recover_item_t     *item;
1500         xfs_caddr_t             ptr, old_ptr;
1501         int                     old_len;
1502
1503         if (list_empty(&trans->r_itemq)) {
1504                 /* finish copying rest of trans header */
1505                 xlog_recover_add_item(&trans->r_itemq);
1506                 ptr = (xfs_caddr_t) &trans->r_theader +
1507                                 sizeof(xfs_trans_header_t) - len;
1508                 memcpy(ptr, dp, len); /* d, s, l */
1509                 return 0;
1510         }
1511         /* take the tail entry */
1512         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1513
1514         old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1515         old_len = item->ri_buf[item->ri_cnt-1].i_len;
1516
1517         ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1518         memcpy(&ptr[old_len], dp, len); /* d, s, l */
1519         item->ri_buf[item->ri_cnt-1].i_len += len;
1520         item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1521         trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1522         return 0;
1523 }
1524
1525 /*
1526  * The next region to add is the start of a new region.  It could be
1527  * a whole region or it could be the first part of a new region.  Because
1528  * of this, the assumption here is that the type and size fields of all
1529  * format structures fit into the first 32 bits of the structure.
1530  *
1531  * This works because all regions must be 32 bit aligned.  Therefore, we
1532  * either have both fields or we have neither field.  In the case we have
1533  * neither field, the data part of the region is zero length.  We only have
1534  * a log_op_header and can throw away the header since a new one will appear
1535  * later.  If we have at least 4 bytes, then we can determine how many regions
1536  * will appear in the current log item.
1537  */
1538 STATIC int
1539 xlog_recover_add_to_trans(
1540         struct xlog             *log,
1541         struct xlog_recover     *trans,
1542         xfs_caddr_t             dp,
1543         int                     len)
1544 {
1545         xfs_inode_log_format_t  *in_f;                  /* any will do */
1546         xlog_recover_item_t     *item;
1547         xfs_caddr_t             ptr;
1548
1549         if (!len)
1550                 return 0;
1551         if (list_empty(&trans->r_itemq)) {
1552                 /* we need to catch log corruptions here */
1553                 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1554                         xfs_warn(log->l_mp, "%s: bad header magic number",
1555                                 __func__);
1556                         ASSERT(0);
1557                         return XFS_ERROR(EIO);
1558                 }
1559                 if (len == sizeof(xfs_trans_header_t))
1560                         xlog_recover_add_item(&trans->r_itemq);
1561                 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1562                 return 0;
1563         }
1564
1565         ptr = kmem_alloc(len, KM_SLEEP);
1566         memcpy(ptr, dp, len);
1567         in_f = (xfs_inode_log_format_t *)ptr;
1568
1569         /* take the tail entry */
1570         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1571         if (item->ri_total != 0 &&
1572              item->ri_total == item->ri_cnt) {
1573                 /* tail item is in use, get a new one */
1574                 xlog_recover_add_item(&trans->r_itemq);
1575                 item = list_entry(trans->r_itemq.prev,
1576                                         xlog_recover_item_t, ri_list);
1577         }
1578
1579         if (item->ri_total == 0) {              /* first region to be added */
1580                 if (in_f->ilf_size == 0 ||
1581                     in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1582                         xfs_warn(log->l_mp,
1583                 "bad number of regions (%d) in inode log format",
1584                                   in_f->ilf_size);
1585                         ASSERT(0);
1586                         return XFS_ERROR(EIO);
1587                 }
1588
1589                 item->ri_total = in_f->ilf_size;
1590                 item->ri_buf =
1591                         kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1592                                     KM_SLEEP);
1593         }
1594         ASSERT(item->ri_total > item->ri_cnt);
1595         /* Description region is ri_buf[0] */
1596         item->ri_buf[item->ri_cnt].i_addr = ptr;
1597         item->ri_buf[item->ri_cnt].i_len  = len;
1598         item->ri_cnt++;
1599         trace_xfs_log_recover_item_add(log, trans, item, 0);
1600         return 0;
1601 }
1602
1603 /*
1604  * Sort the log items in the transaction.
1605  *
1606  * The ordering constraints are defined by the inode allocation and unlink
1607  * behaviour. The rules are:
1608  *
1609  *      1. Every item is only logged once in a given transaction. Hence it
1610  *         represents the last logged state of the item. Hence ordering is
1611  *         dependent on the order in which operations need to be performed so
1612  *         required initial conditions are always met.
1613  *
1614  *      2. Cancelled buffers are recorded in pass 1 in a separate table and
1615  *         there's nothing to replay from them so we can simply cull them
1616  *         from the transaction. However, we can't do that until after we've
1617  *         replayed all the other items because they may be dependent on the
1618  *         cancelled buffer and replaying the cancelled buffer can remove it
1619  *         form the cancelled buffer table. Hence they have tobe done last.
1620  *
1621  *      3. Inode allocation buffers must be replayed before inode items that
1622  *         read the buffer and replay changes into it. For filesystems using the
1623  *         ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1624  *         treated the same as inode allocation buffers as they create and
1625  *         initialise the buffers directly.
1626  *
1627  *      4. Inode unlink buffers must be replayed after inode items are replayed.
1628  *         This ensures that inodes are completely flushed to the inode buffer
1629  *         in a "free" state before we remove the unlinked inode list pointer.
1630  *
1631  * Hence the ordering needs to be inode allocation buffers first, inode items
1632  * second, inode unlink buffers third and cancelled buffers last.
1633  *
1634  * But there's a problem with that - we can't tell an inode allocation buffer
1635  * apart from a regular buffer, so we can't separate them. We can, however,
1636  * tell an inode unlink buffer from the others, and so we can separate them out
1637  * from all the other buffers and move them to last.
1638  *
1639  * Hence, 4 lists, in order from head to tail:
1640  *      - buffer_list for all buffers except cancelled/inode unlink buffers
1641  *      - item_list for all non-buffer items
1642  *      - inode_buffer_list for inode unlink buffers
1643  *      - cancel_list for the cancelled buffers
1644  *
1645  * Note that we add objects to the tail of the lists so that first-to-last
1646  * ordering is preserved within the lists. Adding objects to the head of the
1647  * list means when we traverse from the head we walk them in last-to-first
1648  * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1649  * but for all other items there may be specific ordering that we need to
1650  * preserve.
1651  */
1652 STATIC int
1653 xlog_recover_reorder_trans(
1654         struct xlog             *log,
1655         struct xlog_recover     *trans,
1656         int                     pass)
1657 {
1658         xlog_recover_item_t     *item, *n;
1659         LIST_HEAD(sort_list);
1660         LIST_HEAD(cancel_list);
1661         LIST_HEAD(buffer_list);
1662         LIST_HEAD(inode_buffer_list);
1663         LIST_HEAD(inode_list);
1664
1665         list_splice_init(&trans->r_itemq, &sort_list);
1666         list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1667                 xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1668
1669                 switch (ITEM_TYPE(item)) {
1670                 case XFS_LI_ICREATE:
1671                         list_move_tail(&item->ri_list, &buffer_list);
1672                         break;
1673                 case XFS_LI_BUF:
1674                         if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1675                                 trace_xfs_log_recover_item_reorder_head(log,
1676                                                         trans, item, pass);
1677                                 list_move(&item->ri_list, &cancel_list);
1678                                 break;
1679                         }
1680                         if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1681                                 list_move(&item->ri_list, &inode_buffer_list);
1682                                 break;
1683                         }
1684                         list_move_tail(&item->ri_list, &buffer_list);
1685                         break;
1686                 case XFS_LI_INODE:
1687                 case XFS_LI_DQUOT:
1688                 case XFS_LI_QUOTAOFF:
1689                 case XFS_LI_EFD:
1690                 case XFS_LI_EFI:
1691                         trace_xfs_log_recover_item_reorder_tail(log,
1692                                                         trans, item, pass);
1693                         list_move_tail(&item->ri_list, &inode_list);
1694                         break;
1695                 default:
1696                         xfs_warn(log->l_mp,
1697                                 "%s: unrecognized type of log operation",
1698                                 __func__);
1699                         ASSERT(0);
1700                         return XFS_ERROR(EIO);
1701                 }
1702         }
1703         ASSERT(list_empty(&sort_list));
1704         if (!list_empty(&buffer_list))
1705                 list_splice(&buffer_list, &trans->r_itemq);
1706         if (!list_empty(&inode_list))
1707                 list_splice_tail(&inode_list, &trans->r_itemq);
1708         if (!list_empty(&inode_buffer_list))
1709                 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1710         if (!list_empty(&cancel_list))
1711                 list_splice_tail(&cancel_list, &trans->r_itemq);
1712         return 0;
1713 }
1714
1715 /*
1716  * Build up the table of buf cancel records so that we don't replay
1717  * cancelled data in the second pass.  For buffer records that are
1718  * not cancel records, there is nothing to do here so we just return.
1719  *
1720  * If we get a cancel record which is already in the table, this indicates
1721  * that the buffer was cancelled multiple times.  In order to ensure
1722  * that during pass 2 we keep the record in the table until we reach its
1723  * last occurrence in the log, we keep a reference count in the cancel
1724  * record in the table to tell us how many times we expect to see this
1725  * record during the second pass.
1726  */
1727 STATIC int
1728 xlog_recover_buffer_pass1(
1729         struct xlog                     *log,
1730         struct xlog_recover_item        *item)
1731 {
1732         xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1733         struct list_head        *bucket;
1734         struct xfs_buf_cancel   *bcp;
1735
1736         /*
1737          * If this isn't a cancel buffer item, then just return.
1738          */
1739         if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1740                 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1741                 return 0;
1742         }
1743
1744         /*
1745          * Insert an xfs_buf_cancel record into the hash table of them.
1746          * If there is already an identical record, bump its reference count.
1747          */
1748         bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1749         list_for_each_entry(bcp, bucket, bc_list) {
1750                 if (bcp->bc_blkno == buf_f->blf_blkno &&
1751                     bcp->bc_len == buf_f->blf_len) {
1752                         bcp->bc_refcount++;
1753                         trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1754                         return 0;
1755                 }
1756         }
1757
1758         bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1759         bcp->bc_blkno = buf_f->blf_blkno;
1760         bcp->bc_len = buf_f->blf_len;
1761         bcp->bc_refcount = 1;
1762         list_add_tail(&bcp->bc_list, bucket);
1763
1764         trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1765         return 0;
1766 }
1767
1768 /*
1769  * Check to see whether the buffer being recovered has a corresponding
1770  * entry in the buffer cancel record table.  If it does then return 1
1771  * so that it will be cancelled, otherwise return 0.  If the buffer is
1772  * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1773  * the refcount on the entry in the table and remove it from the table
1774  * if this is the last reference.
1775  *
1776  * We remove the cancel record from the table when we encounter its
1777  * last occurrence in the log so that if the same buffer is re-used
1778  * again after its last cancellation we actually replay the changes
1779  * made at that point.
1780  */
1781 STATIC int
1782 xlog_check_buffer_cancelled(
1783         struct xlog             *log,
1784         xfs_daddr_t             blkno,
1785         uint                    len,
1786         ushort                  flags)
1787 {
1788         struct list_head        *bucket;
1789         struct xfs_buf_cancel   *bcp;
1790
1791         if (log->l_buf_cancel_table == NULL) {
1792                 /*
1793                  * There is nothing in the table built in pass one,
1794                  * so this buffer must not be cancelled.
1795                  */
1796                 ASSERT(!(flags & XFS_BLF_CANCEL));
1797                 return 0;
1798         }
1799
1800         /*
1801          * Search for an entry in the  cancel table that matches our buffer.
1802          */
1803         bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1804         list_for_each_entry(bcp, bucket, bc_list) {
1805                 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1806                         goto found;
1807         }
1808
1809         /*
1810          * We didn't find a corresponding entry in the table, so return 0 so
1811          * that the buffer is NOT cancelled.
1812          */
1813         ASSERT(!(flags & XFS_BLF_CANCEL));
1814         return 0;
1815
1816 found:
1817         /*
1818          * We've go a match, so return 1 so that the recovery of this buffer
1819          * is cancelled.  If this buffer is actually a buffer cancel log
1820          * item, then decrement the refcount on the one in the table and
1821          * remove it if this is the last reference.
1822          */
1823         if (flags & XFS_BLF_CANCEL) {
1824                 if (--bcp->bc_refcount == 0) {
1825                         list_del(&bcp->bc_list);
1826                         kmem_free(bcp);
1827                 }
1828         }
1829         return 1;
1830 }
1831
1832 /*
1833  * Perform recovery for a buffer full of inodes.  In these buffers, the only
1834  * data which should be recovered is that which corresponds to the
1835  * di_next_unlinked pointers in the on disk inode structures.  The rest of the
1836  * data for the inodes is always logged through the inodes themselves rather
1837  * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1838  *
1839  * The only time when buffers full of inodes are fully recovered is when the
1840  * buffer is full of newly allocated inodes.  In this case the buffer will
1841  * not be marked as an inode buffer and so will be sent to
1842  * xlog_recover_do_reg_buffer() below during recovery.
1843  */
1844 STATIC int
1845 xlog_recover_do_inode_buffer(
1846         struct xfs_mount        *mp,
1847         xlog_recover_item_t     *item,
1848         struct xfs_buf          *bp,
1849         xfs_buf_log_format_t    *buf_f)
1850 {
1851         int                     i;
1852         int                     item_index = 0;
1853         int                     bit = 0;
1854         int                     nbits = 0;
1855         int                     reg_buf_offset = 0;
1856         int                     reg_buf_bytes = 0;
1857         int                     next_unlinked_offset;
1858         int                     inodes_per_buf;
1859         xfs_agino_t             *logged_nextp;
1860         xfs_agino_t             *buffer_nextp;
1861
1862         trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1863
1864         /*
1865          * Post recovery validation only works properly on CRC enabled
1866          * filesystems.
1867          */
1868         if (xfs_sb_version_hascrc(&mp->m_sb))
1869                 bp->b_ops = &xfs_inode_buf_ops;
1870
1871         inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
1872         for (i = 0; i < inodes_per_buf; i++) {
1873                 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1874                         offsetof(xfs_dinode_t, di_next_unlinked);
1875
1876                 while (next_unlinked_offset >=
1877                        (reg_buf_offset + reg_buf_bytes)) {
1878                         /*
1879                          * The next di_next_unlinked field is beyond
1880                          * the current logged region.  Find the next
1881                          * logged region that contains or is beyond
1882                          * the current di_next_unlinked field.
1883                          */
1884                         bit += nbits;
1885                         bit = xfs_next_bit(buf_f->blf_data_map,
1886                                            buf_f->blf_map_size, bit);
1887
1888                         /*
1889                          * If there are no more logged regions in the
1890                          * buffer, then we're done.
1891                          */
1892                         if (bit == -1)
1893                                 return 0;
1894
1895                         nbits = xfs_contig_bits(buf_f->blf_data_map,
1896                                                 buf_f->blf_map_size, bit);
1897                         ASSERT(nbits > 0);
1898                         reg_buf_offset = bit << XFS_BLF_SHIFT;
1899                         reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1900                         item_index++;
1901                 }
1902
1903                 /*
1904                  * If the current logged region starts after the current
1905                  * di_next_unlinked field, then move on to the next
1906                  * di_next_unlinked field.
1907                  */
1908                 if (next_unlinked_offset < reg_buf_offset)
1909                         continue;
1910
1911                 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1912                 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1913                 ASSERT((reg_buf_offset + reg_buf_bytes) <=
1914                                                         BBTOB(bp->b_io_length));
1915
1916                 /*
1917                  * The current logged region contains a copy of the
1918                  * current di_next_unlinked field.  Extract its value
1919                  * and copy it to the buffer copy.
1920                  */
1921                 logged_nextp = item->ri_buf[item_index].i_addr +
1922                                 next_unlinked_offset - reg_buf_offset;
1923                 if (unlikely(*logged_nextp == 0)) {
1924                         xfs_alert(mp,
1925                 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1926                 "Trying to replay bad (0) inode di_next_unlinked field.",
1927                                 item, bp);
1928                         XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1929                                          XFS_ERRLEVEL_LOW, mp);
1930                         return XFS_ERROR(EFSCORRUPTED);
1931                 }
1932
1933                 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1934                                               next_unlinked_offset);
1935                 *buffer_nextp = *logged_nextp;
1936
1937                 /*
1938                  * If necessary, recalculate the CRC in the on-disk inode. We
1939                  * have to leave the inode in a consistent state for whoever
1940                  * reads it next....
1941                  */
1942                 xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
1943                                 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
1944
1945         }
1946
1947         return 0;
1948 }
1949
1950 /*
1951  * Validate the recovered buffer is of the correct type and attach the
1952  * appropriate buffer operations to them for writeback. Magic numbers are in a
1953  * few places:
1954  *      the first 16 bits of the buffer (inode buffer, dquot buffer),
1955  *      the first 32 bits of the buffer (most blocks),
1956  *      inside a struct xfs_da_blkinfo at the start of the buffer.
1957  */
1958 static void
1959 xlog_recovery_validate_buf_type(
1960         struct xfs_mount        *mp,
1961         struct xfs_buf          *bp,
1962         xfs_buf_log_format_t    *buf_f)
1963 {
1964         struct xfs_da_blkinfo   *info = bp->b_addr;
1965         __uint32_t              magic32;
1966         __uint16_t              magic16;
1967         __uint16_t              magicda;
1968
1969         magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
1970         magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
1971         magicda = be16_to_cpu(info->magic);
1972         switch (xfs_blft_from_flags(buf_f)) {
1973         case XFS_BLFT_BTREE_BUF:
1974                 switch (magic32) {
1975                 case XFS_ABTB_CRC_MAGIC:
1976                 case XFS_ABTC_CRC_MAGIC:
1977                 case XFS_ABTB_MAGIC:
1978                 case XFS_ABTC_MAGIC:
1979                         bp->b_ops = &xfs_allocbt_buf_ops;
1980                         break;
1981                 case XFS_IBT_CRC_MAGIC:
1982                 case XFS_IBT_MAGIC:
1983                         bp->b_ops = &xfs_inobt_buf_ops;
1984                         break;
1985                 case XFS_BMAP_CRC_MAGIC:
1986                 case XFS_BMAP_MAGIC:
1987                         bp->b_ops = &xfs_bmbt_buf_ops;
1988                         break;
1989                 default:
1990                         xfs_warn(mp, "Bad btree block magic!");
1991                         ASSERT(0);
1992                         break;
1993                 }
1994                 break;
1995         case XFS_BLFT_AGF_BUF:
1996                 if (magic32 != XFS_AGF_MAGIC) {
1997                         xfs_warn(mp, "Bad AGF block magic!");
1998                         ASSERT(0);
1999                         break;
2000                 }
2001                 bp->b_ops = &xfs_agf_buf_ops;
2002                 break;
2003         case XFS_BLFT_AGFL_BUF:
2004                 if (!xfs_sb_version_hascrc(&mp->m_sb))
2005                         break;
2006                 if (magic32 != XFS_AGFL_MAGIC) {
2007                         xfs_warn(mp, "Bad AGFL block magic!");
2008                         ASSERT(0);
2009                         break;
2010                 }
2011                 bp->b_ops = &xfs_agfl_buf_ops;
2012                 break;
2013         case XFS_BLFT_AGI_BUF:
2014                 if (magic32 != XFS_AGI_MAGIC) {
2015                         xfs_warn(mp, "Bad AGI block magic!");
2016                         ASSERT(0);
2017                         break;
2018                 }
2019                 bp->b_ops = &xfs_agi_buf_ops;
2020                 break;
2021         case XFS_BLFT_UDQUOT_BUF:
2022         case XFS_BLFT_PDQUOT_BUF:
2023         case XFS_BLFT_GDQUOT_BUF:
2024 #ifdef CONFIG_XFS_QUOTA
2025                 if (magic16 != XFS_DQUOT_MAGIC) {
2026                         xfs_warn(mp, "Bad DQUOT block magic!");
2027                         ASSERT(0);
2028                         break;
2029                 }
2030                 bp->b_ops = &xfs_dquot_buf_ops;
2031 #else
2032                 xfs_alert(mp,
2033         "Trying to recover dquots without QUOTA support built in!");
2034                 ASSERT(0);
2035 #endif
2036                 break;
2037         case XFS_BLFT_DINO_BUF:
2038                 /*
2039                  * we get here with inode allocation buffers, not buffers that
2040                  * track unlinked list changes.
2041                  */
2042                 if (magic16 != XFS_DINODE_MAGIC) {
2043                         xfs_warn(mp, "Bad INODE block magic!");
2044                         ASSERT(0);
2045                         break;
2046                 }
2047                 bp->b_ops = &xfs_inode_buf_ops;
2048                 break;
2049         case XFS_BLFT_SYMLINK_BUF:
2050                 if (magic32 != XFS_SYMLINK_MAGIC) {
2051                         xfs_warn(mp, "Bad symlink block magic!");
2052                         ASSERT(0);
2053                         break;
2054                 }
2055                 bp->b_ops = &xfs_symlink_buf_ops;
2056                 break;
2057         case XFS_BLFT_DIR_BLOCK_BUF:
2058                 if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2059                     magic32 != XFS_DIR3_BLOCK_MAGIC) {
2060                         xfs_warn(mp, "Bad dir block magic!");
2061                         ASSERT(0);
2062                         break;
2063                 }
2064                 bp->b_ops = &xfs_dir3_block_buf_ops;
2065                 break;
2066         case XFS_BLFT_DIR_DATA_BUF:
2067                 if (magic32 != XFS_DIR2_DATA_MAGIC &&
2068                     magic32 != XFS_DIR3_DATA_MAGIC) {
2069                         xfs_warn(mp, "Bad dir data magic!");
2070                         ASSERT(0);
2071                         break;
2072                 }
2073                 bp->b_ops = &xfs_dir3_data_buf_ops;
2074                 break;
2075         case XFS_BLFT_DIR_FREE_BUF:
2076                 if (magic32 != XFS_DIR2_FREE_MAGIC &&
2077                     magic32 != XFS_DIR3_FREE_MAGIC) {
2078                         xfs_warn(mp, "Bad dir3 free magic!");
2079                         ASSERT(0);
2080                         break;
2081                 }
2082                 bp->b_ops = &xfs_dir3_free_buf_ops;
2083                 break;
2084         case XFS_BLFT_DIR_LEAF1_BUF:
2085                 if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2086                     magicda != XFS_DIR3_LEAF1_MAGIC) {
2087                         xfs_warn(mp, "Bad dir leaf1 magic!");
2088                         ASSERT(0);
2089                         break;
2090                 }
2091                 bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2092                 break;
2093         case XFS_BLFT_DIR_LEAFN_BUF:
2094                 if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2095                     magicda != XFS_DIR3_LEAFN_MAGIC) {
2096                         xfs_warn(mp, "Bad dir leafn magic!");
2097                         ASSERT(0);
2098                         break;
2099                 }
2100                 bp->b_ops = &xfs_dir3_leafn_buf_ops;
2101                 break;
2102         case XFS_BLFT_DA_NODE_BUF:
2103                 if (magicda != XFS_DA_NODE_MAGIC &&
2104                     magicda != XFS_DA3_NODE_MAGIC) {
2105                         xfs_warn(mp, "Bad da node magic!");
2106                         ASSERT(0);
2107                         break;
2108                 }
2109                 bp->b_ops = &xfs_da3_node_buf_ops;
2110                 break;
2111         case XFS_BLFT_ATTR_LEAF_BUF:
2112                 if (magicda != XFS_ATTR_LEAF_MAGIC &&
2113                     magicda != XFS_ATTR3_LEAF_MAGIC) {
2114                         xfs_warn(mp, "Bad attr leaf magic!");
2115                         ASSERT(0);
2116                         break;
2117                 }
2118                 bp->b_ops = &xfs_attr3_leaf_buf_ops;
2119                 break;
2120         case XFS_BLFT_ATTR_RMT_BUF:
2121                 if (!xfs_sb_version_hascrc(&mp->m_sb))
2122                         break;
2123                 if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2124                         xfs_warn(mp, "Bad attr remote magic!");
2125                         ASSERT(0);
2126                         break;
2127                 }
2128                 bp->b_ops = &xfs_attr3_rmt_buf_ops;
2129                 break;
2130         case XFS_BLFT_SB_BUF:
2131                 if (magic32 != XFS_SB_MAGIC) {
2132                         xfs_warn(mp, "Bad SB block magic!");
2133                         ASSERT(0);
2134                         break;
2135                 }
2136                 bp->b_ops = &xfs_sb_buf_ops;
2137                 break;
2138         default:
2139                 xfs_warn(mp, "Unknown buffer type %d!",
2140                          xfs_blft_from_flags(buf_f));
2141                 break;
2142         }
2143 }
2144
2145 /*
2146  * Perform a 'normal' buffer recovery.  Each logged region of the
2147  * buffer should be copied over the corresponding region in the
2148  * given buffer.  The bitmap in the buf log format structure indicates
2149  * where to place the logged data.
2150  */
2151 STATIC void
2152 xlog_recover_do_reg_buffer(
2153         struct xfs_mount        *mp,
2154         xlog_recover_item_t     *item,
2155         struct xfs_buf          *bp,
2156         xfs_buf_log_format_t    *buf_f)
2157 {
2158         int                     i;
2159         int                     bit;
2160         int                     nbits;
2161         int                     error;
2162
2163         trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2164
2165         bit = 0;
2166         i = 1;  /* 0 is the buf format structure */
2167         while (1) {
2168                 bit = xfs_next_bit(buf_f->blf_data_map,
2169                                    buf_f->blf_map_size, bit);
2170                 if (bit == -1)
2171                         break;
2172                 nbits = xfs_contig_bits(buf_f->blf_data_map,
2173                                         buf_f->blf_map_size, bit);
2174                 ASSERT(nbits > 0);
2175                 ASSERT(item->ri_buf[i].i_addr != NULL);
2176                 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2177                 ASSERT(BBTOB(bp->b_io_length) >=
2178                        ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2179
2180                 /*
2181                  * The dirty regions logged in the buffer, even though
2182                  * contiguous, may span multiple chunks. This is because the
2183                  * dirty region may span a physical page boundary in a buffer
2184                  * and hence be split into two separate vectors for writing into
2185                  * the log. Hence we need to trim nbits back to the length of
2186                  * the current region being copied out of the log.
2187                  */
2188                 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2189                         nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2190
2191                 /*
2192                  * Do a sanity check if this is a dquot buffer. Just checking
2193                  * the first dquot in the buffer should do. XXXThis is
2194                  * probably a good thing to do for other buf types also.
2195                  */
2196                 error = 0;
2197                 if (buf_f->blf_flags &
2198                    (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2199                         if (item->ri_buf[i].i_addr == NULL) {
2200                                 xfs_alert(mp,
2201                                         "XFS: NULL dquot in %s.", __func__);
2202                                 goto next;
2203                         }
2204                         if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
2205                                 xfs_alert(mp,
2206                                         "XFS: dquot too small (%d) in %s.",
2207                                         item->ri_buf[i].i_len, __func__);
2208                                 goto next;
2209                         }
2210                         error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
2211                                                -1, 0, XFS_QMOPT_DOWARN,
2212                                                "dquot_buf_recover");
2213                         if (error)
2214                                 goto next;
2215                 }
2216
2217                 memcpy(xfs_buf_offset(bp,
2218                         (uint)bit << XFS_BLF_SHIFT),    /* dest */
2219                         item->ri_buf[i].i_addr,         /* source */
2220                         nbits<<XFS_BLF_SHIFT);          /* length */
2221  next:
2222                 i++;
2223                 bit += nbits;
2224         }
2225
2226         /* Shouldn't be any more regions */
2227         ASSERT(i == item->ri_total);
2228
2229         /*
2230          * We can only do post recovery validation on items on CRC enabled
2231          * filesystems as we need to know when the buffer was written to be able
2232          * to determine if we should have replayed the item. If we replay old
2233          * metadata over a newer buffer, then it will enter a temporarily
2234          * inconsistent state resulting in verification failures. Hence for now
2235          * just avoid the verification stage for non-crc filesystems
2236          */
2237         if (xfs_sb_version_hascrc(&mp->m_sb))
2238                 xlog_recovery_validate_buf_type(mp, bp, buf_f);
2239 }
2240
2241 /*
2242  * Do some primitive error checking on ondisk dquot data structures.
2243  */
2244 int
2245 xfs_qm_dqcheck(
2246         struct xfs_mount *mp,
2247         xfs_disk_dquot_t *ddq,
2248         xfs_dqid_t       id,
2249         uint             type,    /* used only when IO_dorepair is true */
2250         uint             flags,
2251         char             *str)
2252 {
2253         xfs_dqblk_t      *d = (xfs_dqblk_t *)ddq;
2254         int             errs = 0;
2255
2256         /*
2257          * We can encounter an uninitialized dquot buffer for 2 reasons:
2258          * 1. If we crash while deleting the quotainode(s), and those blks got
2259          *    used for user data. This is because we take the path of regular
2260          *    file deletion; however, the size field of quotainodes is never
2261          *    updated, so all the tricks that we play in itruncate_finish
2262          *    don't quite matter.
2263          *
2264          * 2. We don't play the quota buffers when there's a quotaoff logitem.
2265          *    But the allocation will be replayed so we'll end up with an
2266          *    uninitialized quota block.
2267          *
2268          * This is all fine; things are still consistent, and we haven't lost
2269          * any quota information. Just don't complain about bad dquot blks.
2270          */
2271         if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
2272                 if (flags & XFS_QMOPT_DOWARN)
2273                         xfs_alert(mp,
2274                         "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2275                         str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
2276                 errs++;
2277         }
2278         if (ddq->d_version != XFS_DQUOT_VERSION) {
2279                 if (flags & XFS_QMOPT_DOWARN)
2280                         xfs_alert(mp,
2281                         "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2282                         str, id, ddq->d_version, XFS_DQUOT_VERSION);
2283                 errs++;
2284         }
2285
2286         if (ddq->d_flags != XFS_DQ_USER &&
2287             ddq->d_flags != XFS_DQ_PROJ &&
2288             ddq->d_flags != XFS_DQ_GROUP) {
2289                 if (flags & XFS_QMOPT_DOWARN)
2290                         xfs_alert(mp,
2291                         "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2292                         str, id, ddq->d_flags);
2293                 errs++;
2294         }
2295
2296         if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
2297                 if (flags & XFS_QMOPT_DOWARN)
2298                         xfs_alert(mp,
2299                         "%s : ondisk-dquot 0x%p, ID mismatch: "
2300                         "0x%x expected, found id 0x%x",
2301                         str, ddq, id, be32_to_cpu(ddq->d_id));
2302                 errs++;
2303         }
2304
2305         if (!errs && ddq->d_id) {
2306                 if (ddq->d_blk_softlimit &&
2307                     be64_to_cpu(ddq->d_bcount) >
2308                                 be64_to_cpu(ddq->d_blk_softlimit)) {
2309                         if (!ddq->d_btimer) {
2310                                 if (flags & XFS_QMOPT_DOWARN)
2311                                         xfs_alert(mp,
2312                         "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
2313                                         str, (int)be32_to_cpu(ddq->d_id), ddq);
2314                                 errs++;
2315                         }
2316                 }
2317                 if (ddq->d_ino_softlimit &&
2318                     be64_to_cpu(ddq->d_icount) >
2319                                 be64_to_cpu(ddq->d_ino_softlimit)) {
2320                         if (!ddq->d_itimer) {
2321                                 if (flags & XFS_QMOPT_DOWARN)
2322                                         xfs_alert(mp,
2323                         "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
2324                                         str, (int)be32_to_cpu(ddq->d_id), ddq);
2325                                 errs++;
2326                         }
2327                 }
2328                 if (ddq->d_rtb_softlimit &&
2329                     be64_to_cpu(ddq->d_rtbcount) >
2330                                 be64_to_cpu(ddq->d_rtb_softlimit)) {
2331                         if (!ddq->d_rtbtimer) {
2332                                 if (flags & XFS_QMOPT_DOWARN)
2333                                         xfs_alert(mp,
2334                         "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
2335                                         str, (int)be32_to_cpu(ddq->d_id), ddq);
2336                                 errs++;
2337                         }
2338                 }
2339         }
2340
2341         if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2342                 return errs;
2343
2344         if (flags & XFS_QMOPT_DOWARN)
2345                 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2346
2347         /*
2348          * Typically, a repair is only requested by quotacheck.
2349          */
2350         ASSERT(id != -1);
2351         ASSERT(flags & XFS_QMOPT_DQREPAIR);
2352         memset(d, 0, sizeof(xfs_dqblk_t));
2353
2354         d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2355         d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2356         d->dd_diskdq.d_flags = type;
2357         d->dd_diskdq.d_id = cpu_to_be32(id);
2358
2359         if (xfs_sb_version_hascrc(&mp->m_sb)) {
2360                 uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
2361                 xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
2362                                  XFS_DQUOT_CRC_OFF);
2363         }
2364
2365         return errs;
2366 }
2367
2368 /*
2369  * Perform a dquot buffer recovery.
2370  * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2371  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2372  * Else, treat it as a regular buffer and do recovery.
2373  */
2374 STATIC void
2375 xlog_recover_do_dquot_buffer(
2376         struct xfs_mount                *mp,
2377         struct xlog                     *log,
2378         struct xlog_recover_item        *item,
2379         struct xfs_buf                  *bp,
2380         struct xfs_buf_log_format       *buf_f)
2381 {
2382         uint                    type;
2383
2384         trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2385
2386         /*
2387          * Filesystems are required to send in quota flags at mount time.
2388          */
2389         if (mp->m_qflags == 0) {
2390                 return;
2391         }
2392
2393         type = 0;
2394         if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2395                 type |= XFS_DQ_USER;
2396         if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2397                 type |= XFS_DQ_PROJ;
2398         if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2399                 type |= XFS_DQ_GROUP;
2400         /*
2401          * This type of quotas was turned off, so ignore this buffer
2402          */
2403         if (log->l_quotaoffs_flag & type)
2404                 return;
2405
2406         xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2407 }
2408
2409 /*
2410  * This routine replays a modification made to a buffer at runtime.
2411  * There are actually two types of buffer, regular and inode, which
2412  * are handled differently.  Inode buffers are handled differently
2413  * in that we only recover a specific set of data from them, namely
2414  * the inode di_next_unlinked fields.  This is because all other inode
2415  * data is actually logged via inode records and any data we replay
2416  * here which overlaps that may be stale.
2417  *
2418  * When meta-data buffers are freed at run time we log a buffer item
2419  * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2420  * of the buffer in the log should not be replayed at recovery time.
2421  * This is so that if the blocks covered by the buffer are reused for
2422  * file data before we crash we don't end up replaying old, freed
2423  * meta-data into a user's file.
2424  *
2425  * To handle the cancellation of buffer log items, we make two passes
2426  * over the log during recovery.  During the first we build a table of
2427  * those buffers which have been cancelled, and during the second we
2428  * only replay those buffers which do not have corresponding cancel
2429  * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
2430  * for more details on the implementation of the table of cancel records.
2431  */
2432 STATIC int
2433 xlog_recover_buffer_pass2(
2434         struct xlog                     *log,
2435         struct list_head                *buffer_list,
2436         struct xlog_recover_item        *item)
2437 {
2438         xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
2439         xfs_mount_t             *mp = log->l_mp;
2440         xfs_buf_t               *bp;
2441         int                     error;
2442         uint                    buf_flags;
2443
2444         /*
2445          * In this pass we only want to recover all the buffers which have
2446          * not been cancelled and are not cancellation buffers themselves.
2447          */
2448         if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2449                         buf_f->blf_len, buf_f->blf_flags)) {
2450                 trace_xfs_log_recover_buf_cancel(log, buf_f);
2451                 return 0;
2452         }
2453
2454         trace_xfs_log_recover_buf_recover(log, buf_f);
2455
2456         buf_flags = 0;
2457         if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2458                 buf_flags |= XBF_UNMAPPED;
2459
2460         bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2461                           buf_flags, NULL);
2462         if (!bp)
2463                 return XFS_ERROR(ENOMEM);
2464         error = bp->b_error;
2465         if (error) {
2466                 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2467                 xfs_buf_relse(bp);
2468                 return error;
2469         }
2470
2471         if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2472                 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2473         } else if (buf_f->blf_flags &
2474                   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2475                 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2476         } else {
2477                 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2478         }
2479         if (error)
2480                 return XFS_ERROR(error);
2481
2482         /*
2483          * Perform delayed write on the buffer.  Asynchronous writes will be
2484          * slower when taking into account all the buffers to be flushed.
2485          *
2486          * Also make sure that only inode buffers with good sizes stay in
2487          * the buffer cache.  The kernel moves inodes in buffers of 1 block
2488          * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
2489          * buffers in the log can be a different size if the log was generated
2490          * by an older kernel using unclustered inode buffers or a newer kernel
2491          * running with a different inode cluster size.  Regardless, if the
2492          * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2493          * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2494          * the buffer out of the buffer cache so that the buffer won't
2495          * overlap with future reads of those inodes.
2496          */
2497         if (XFS_DINODE_MAGIC ==
2498             be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2499             (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2500                         (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2501                 xfs_buf_stale(bp);
2502                 error = xfs_bwrite(bp);
2503         } else {
2504                 ASSERT(bp->b_target->bt_mount == mp);
2505                 bp->b_iodone = xlog_recover_iodone;
2506                 xfs_buf_delwri_queue(bp, buffer_list);
2507         }
2508
2509         xfs_buf_relse(bp);
2510         return error;
2511 }
2512
2513 STATIC int
2514 xlog_recover_inode_pass2(
2515         struct xlog                     *log,
2516         struct list_head                *buffer_list,
2517         struct xlog_recover_item        *item)
2518 {
2519         xfs_inode_log_format_t  *in_f;
2520         xfs_mount_t             *mp = log->l_mp;
2521         xfs_buf_t               *bp;
2522         xfs_dinode_t            *dip;
2523         int                     len;
2524         xfs_caddr_t             src;
2525         xfs_caddr_t             dest;
2526         int                     error;
2527         int                     attr_index;
2528         uint                    fields;
2529         xfs_icdinode_t          *dicp;
2530         uint                    isize;
2531         int                     need_free = 0;
2532
2533         if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2534                 in_f = item->ri_buf[0].i_addr;
2535         } else {
2536                 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2537                 need_free = 1;
2538                 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2539                 if (error)
2540                         goto error;
2541         }
2542
2543         /*
2544          * Inode buffers can be freed, look out for it,
2545          * and do not replay the inode.
2546          */
2547         if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2548                                         in_f->ilf_len, 0)) {
2549                 error = 0;
2550                 trace_xfs_log_recover_inode_cancel(log, in_f);
2551                 goto error;
2552         }
2553         trace_xfs_log_recover_inode_recover(log, in_f);
2554
2555         bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2556                           &xfs_inode_buf_ops);
2557         if (!bp) {
2558                 error = ENOMEM;
2559                 goto error;
2560         }
2561         error = bp->b_error;
2562         if (error) {
2563                 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2564                 xfs_buf_relse(bp);
2565                 goto error;
2566         }
2567         ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2568         dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2569
2570         /*
2571          * Make sure the place we're flushing out to really looks
2572          * like an inode!
2573          */
2574         if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2575                 xfs_buf_relse(bp);
2576                 xfs_alert(mp,
2577         "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2578                         __func__, dip, bp, in_f->ilf_ino);
2579                 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2580                                  XFS_ERRLEVEL_LOW, mp);
2581                 error = EFSCORRUPTED;
2582                 goto error;
2583         }
2584         dicp = item->ri_buf[1].i_addr;
2585         if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2586                 xfs_buf_relse(bp);
2587                 xfs_alert(mp,
2588                         "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2589                         __func__, item, in_f->ilf_ino);
2590                 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2591                                  XFS_ERRLEVEL_LOW, mp);
2592                 error = EFSCORRUPTED;
2593                 goto error;
2594         }
2595
2596         /*
2597          * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
2598          * are transactional and if ordering is necessary we can determine that
2599          * more accurately by the LSN field in the V3 inode core. Don't trust
2600          * the inode versions we might be changing them here - use the
2601          * superblock flag to determine whether we need to look at di_flushiter
2602          * to skip replay when the on disk inode is newer than the log one
2603          */
2604         if (!xfs_sb_version_hascrc(&mp->m_sb) &&
2605             dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2606                 /*
2607                  * Deal with the wrap case, DI_MAX_FLUSH is less
2608                  * than smaller numbers
2609                  */
2610                 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2611                     dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2612                         /* do nothing */
2613                 } else {
2614                         xfs_buf_relse(bp);
2615                         trace_xfs_log_recover_inode_skip(log, in_f);
2616                         error = 0;
2617                         goto error;
2618                 }
2619         }
2620
2621         /* Take the opportunity to reset the flush iteration count */
2622         dicp->di_flushiter = 0;
2623
2624         if (unlikely(S_ISREG(dicp->di_mode))) {
2625                 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2626                     (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2627                         XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2628                                          XFS_ERRLEVEL_LOW, mp, dicp);
2629                         xfs_buf_relse(bp);
2630                         xfs_alert(mp,
2631                 "%s: Bad regular inode log record, rec ptr 0x%p, "
2632                 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2633                                 __func__, item, dip, bp, in_f->ilf_ino);
2634                         error = EFSCORRUPTED;
2635                         goto error;
2636                 }
2637         } else if (unlikely(S_ISDIR(dicp->di_mode))) {
2638                 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2639                     (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2640                     (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2641                         XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2642                                              XFS_ERRLEVEL_LOW, mp, dicp);
2643                         xfs_buf_relse(bp);
2644                         xfs_alert(mp,
2645                 "%s: Bad dir inode log record, rec ptr 0x%p, "
2646                 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2647                                 __func__, item, dip, bp, in_f->ilf_ino);
2648                         error = EFSCORRUPTED;
2649                         goto error;
2650                 }
2651         }
2652         if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2653                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2654                                      XFS_ERRLEVEL_LOW, mp, dicp);
2655                 xfs_buf_relse(bp);
2656                 xfs_alert(mp,
2657         "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2658         "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2659                         __func__, item, dip, bp, in_f->ilf_ino,
2660                         dicp->di_nextents + dicp->di_anextents,
2661                         dicp->di_nblocks);
2662                 error = EFSCORRUPTED;
2663                 goto error;
2664         }
2665         if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2666                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2667                                      XFS_ERRLEVEL_LOW, mp, dicp);
2668                 xfs_buf_relse(bp);
2669                 xfs_alert(mp,
2670         "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2671         "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2672                         item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2673                 error = EFSCORRUPTED;
2674                 goto error;
2675         }
2676         isize = xfs_icdinode_size(dicp->di_version);
2677         if (unlikely(item->ri_buf[1].i_len > isize)) {
2678                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2679                                      XFS_ERRLEVEL_LOW, mp, dicp);
2680                 xfs_buf_relse(bp);
2681                 xfs_alert(mp,
2682                         "%s: Bad inode log record length %d, rec ptr 0x%p",
2683                         __func__, item->ri_buf[1].i_len, item);
2684                 error = EFSCORRUPTED;
2685                 goto error;
2686         }
2687
2688         /* The core is in in-core format */
2689         xfs_dinode_to_disk(dip, dicp);
2690
2691         /* the rest is in on-disk format */
2692         if (item->ri_buf[1].i_len > isize) {
2693                 memcpy((char *)dip + isize,
2694                         item->ri_buf[1].i_addr + isize,
2695                         item->ri_buf[1].i_len - isize);
2696         }
2697
2698         fields = in_f->ilf_fields;
2699         switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2700         case XFS_ILOG_DEV:
2701                 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2702                 break;
2703         case XFS_ILOG_UUID:
2704                 memcpy(XFS_DFORK_DPTR(dip),
2705                        &in_f->ilf_u.ilfu_uuid,
2706                        sizeof(uuid_t));
2707                 break;
2708         }
2709
2710         if (in_f->ilf_size == 2)
2711                 goto write_inode_buffer;
2712         len = item->ri_buf[2].i_len;
2713         src = item->ri_buf[2].i_addr;
2714         ASSERT(in_f->ilf_size <= 4);
2715         ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2716         ASSERT(!(fields & XFS_ILOG_DFORK) ||
2717                (len == in_f->ilf_dsize));
2718
2719         switch (fields & XFS_ILOG_DFORK) {
2720         case XFS_ILOG_DDATA:
2721         case XFS_ILOG_DEXT:
2722                 memcpy(XFS_DFORK_DPTR(dip), src, len);
2723                 break;
2724
2725         case XFS_ILOG_DBROOT:
2726                 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2727                                  (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2728                                  XFS_DFORK_DSIZE(dip, mp));
2729                 break;
2730
2731         default:
2732                 /*
2733                  * There are no data fork flags set.
2734                  */
2735                 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2736                 break;
2737         }
2738
2739         /*
2740          * If we logged any attribute data, recover it.  There may or
2741          * may not have been any other non-core data logged in this
2742          * transaction.
2743          */
2744         if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2745                 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2746                         attr_index = 3;
2747                 } else {
2748                         attr_index = 2;
2749                 }
2750                 len = item->ri_buf[attr_index].i_len;
2751                 src = item->ri_buf[attr_index].i_addr;
2752                 ASSERT(len == in_f->ilf_asize);
2753
2754                 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2755                 case XFS_ILOG_ADATA:
2756                 case XFS_ILOG_AEXT:
2757                         dest = XFS_DFORK_APTR(dip);
2758                         ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2759                         memcpy(dest, src, len);
2760                         break;
2761
2762                 case XFS_ILOG_ABROOT:
2763                         dest = XFS_DFORK_APTR(dip);
2764                         xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2765                                          len, (xfs_bmdr_block_t*)dest,
2766                                          XFS_DFORK_ASIZE(dip, mp));
2767                         break;
2768
2769                 default:
2770                         xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2771                         ASSERT(0);
2772                         xfs_buf_relse(bp);
2773                         error = EIO;
2774                         goto error;
2775                 }
2776         }
2777
2778 write_inode_buffer:
2779         /* re-generate the checksum. */
2780         xfs_dinode_calc_crc(log->l_mp, dip);
2781
2782         ASSERT(bp->b_target->bt_mount == mp);
2783         bp->b_iodone = xlog_recover_iodone;
2784         xfs_buf_delwri_queue(bp, buffer_list);
2785         xfs_buf_relse(bp);
2786 error:
2787         if (need_free)
2788                 kmem_free(in_f);
2789         return XFS_ERROR(error);
2790 }
2791
2792 /*
2793  * Recover QUOTAOFF records. We simply make a note of it in the xlog
2794  * structure, so that we know not to do any dquot item or dquot buffer recovery,
2795  * of that type.
2796  */
2797 STATIC int
2798 xlog_recover_quotaoff_pass1(
2799         struct xlog                     *log,
2800         struct xlog_recover_item        *item)
2801 {
2802         xfs_qoff_logformat_t    *qoff_f = item->ri_buf[0].i_addr;
2803         ASSERT(qoff_f);
2804
2805         /*
2806          * The logitem format's flag tells us if this was user quotaoff,
2807          * group/project quotaoff or both.
2808          */
2809         if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2810                 log->l_quotaoffs_flag |= XFS_DQ_USER;
2811         if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2812                 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2813         if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2814                 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2815
2816         return (0);
2817 }
2818
2819 /*
2820  * Recover a dquot record
2821  */
2822 STATIC int
2823 xlog_recover_dquot_pass2(
2824         struct xlog                     *log,
2825         struct list_head                *buffer_list,
2826         struct xlog_recover_item        *item)
2827 {
2828         xfs_mount_t             *mp = log->l_mp;
2829         xfs_buf_t               *bp;
2830         struct xfs_disk_dquot   *ddq, *recddq;
2831         int                     error;
2832         xfs_dq_logformat_t      *dq_f;
2833         uint                    type;
2834
2835
2836         /*
2837          * Filesystems are required to send in quota flags at mount time.
2838          */
2839         if (mp->m_qflags == 0)
2840                 return (0);
2841
2842         recddq = item->ri_buf[1].i_addr;
2843         if (recddq == NULL) {
2844                 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2845                 return XFS_ERROR(EIO);
2846         }
2847         if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2848                 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2849                         item->ri_buf[1].i_len, __func__);
2850                 return XFS_ERROR(EIO);
2851         }
2852
2853         /*
2854          * This type of quotas was turned off, so ignore this record.
2855          */
2856         type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2857         ASSERT(type);
2858         if (log->l_quotaoffs_flag & type)
2859                 return (0);
2860
2861         /*
2862          * At this point we know that quota was _not_ turned off.
2863          * Since the mount flags are not indicating to us otherwise, this
2864          * must mean that quota is on, and the dquot needs to be replayed.
2865          * Remember that we may not have fully recovered the superblock yet,
2866          * so we can't do the usual trick of looking at the SB quota bits.
2867          *
2868          * The other possibility, of course, is that the quota subsystem was
2869          * removed since the last mount - ENOSYS.
2870          */
2871         dq_f = item->ri_buf[0].i_addr;
2872         ASSERT(dq_f);
2873         error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2874                            "xlog_recover_dquot_pass2 (log copy)");
2875         if (error)
2876                 return XFS_ERROR(EIO);
2877         ASSERT(dq_f->qlf_len == 1);
2878
2879         error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2880                                    XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
2881                                    NULL);
2882         if (error)
2883                 return error;
2884
2885         ASSERT(bp);
2886         ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2887
2888         /*
2889          * At least the magic num portion should be on disk because this
2890          * was among a chunk of dquots created earlier, and we did some
2891          * minimal initialization then.
2892          */
2893         error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2894                            "xlog_recover_dquot_pass2");
2895         if (error) {
2896                 xfs_buf_relse(bp);
2897                 return XFS_ERROR(EIO);
2898         }
2899
2900         memcpy(ddq, recddq, item->ri_buf[1].i_len);
2901         if (xfs_sb_version_hascrc(&mp->m_sb)) {
2902                 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
2903                                  XFS_DQUOT_CRC_OFF);
2904         }
2905
2906         ASSERT(dq_f->qlf_size == 2);
2907         ASSERT(bp->b_target->bt_mount == mp);
2908         bp->b_iodone = xlog_recover_iodone;
2909         xfs_buf_delwri_queue(bp, buffer_list);
2910         xfs_buf_relse(bp);
2911
2912         return (0);
2913 }
2914
2915 /*
2916  * This routine is called to create an in-core extent free intent
2917  * item from the efi format structure which was logged on disk.
2918  * It allocates an in-core efi, copies the extents from the format
2919  * structure into it, and adds the efi to the AIL with the given
2920  * LSN.
2921  */
2922 STATIC int
2923 xlog_recover_efi_pass2(
2924         struct xlog                     *log,
2925         struct xlog_recover_item        *item,
2926         xfs_lsn_t                       lsn)
2927 {
2928         int                     error;
2929         xfs_mount_t             *mp = log->l_mp;
2930         xfs_efi_log_item_t      *efip;
2931         xfs_efi_log_format_t    *efi_formatp;
2932
2933         efi_formatp = item->ri_buf[0].i_addr;
2934
2935         efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2936         if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2937                                          &(efip->efi_format)))) {
2938                 xfs_efi_item_free(efip);
2939                 return error;
2940         }
2941         atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2942
2943         spin_lock(&log->l_ailp->xa_lock);
2944         /*
2945          * xfs_trans_ail_update() drops the AIL lock.
2946          */
2947         xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2948         return 0;
2949 }
2950
2951
2952 /*
2953  * This routine is called when an efd format structure is found in
2954  * a committed transaction in the log.  It's purpose is to cancel
2955  * the corresponding efi if it was still in the log.  To do this
2956  * it searches the AIL for the efi with an id equal to that in the
2957  * efd format structure.  If we find it, we remove the efi from the
2958  * AIL and free it.
2959  */
2960 STATIC int
2961 xlog_recover_efd_pass2(
2962         struct xlog                     *log,
2963         struct xlog_recover_item        *item)
2964 {
2965         xfs_efd_log_format_t    *efd_formatp;
2966         xfs_efi_log_item_t      *efip = NULL;
2967         xfs_log_item_t          *lip;
2968         __uint64_t              efi_id;
2969         struct xfs_ail_cursor   cur;
2970         struct xfs_ail          *ailp = log->l_ailp;
2971
2972         efd_formatp = item->ri_buf[0].i_addr;
2973         ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2974                 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2975                (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2976                 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2977         efi_id = efd_formatp->efd_efi_id;
2978
2979         /*
2980          * Search for the efi with the id in the efd format structure
2981          * in the AIL.
2982          */
2983         spin_lock(&ailp->xa_lock);
2984         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2985         while (lip != NULL) {
2986                 if (lip->li_type == XFS_LI_EFI) {
2987                         efip = (xfs_efi_log_item_t *)lip;
2988                         if (efip->efi_format.efi_id == efi_id) {
2989                                 /*
2990                                  * xfs_trans_ail_delete() drops the
2991                                  * AIL lock.
2992                                  */
2993                                 xfs_trans_ail_delete(ailp, lip,
2994                                                      SHUTDOWN_CORRUPT_INCORE);
2995                                 xfs_efi_item_free(efip);
2996                                 spin_lock(&ailp->xa_lock);
2997                                 break;
2998                         }
2999                 }
3000                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3001         }
3002         xfs_trans_ail_cursor_done(ailp, &cur);
3003         spin_unlock(&ailp->xa_lock);
3004
3005         return 0;
3006 }
3007
3008 /*
3009  * This routine is called when an inode create format structure is found in a
3010  * committed transaction in the log.  It's purpose is to initialise the inodes
3011  * being allocated on disk. This requires us to get inode cluster buffers that
3012  * match the range to be intialised, stamped with inode templates and written
3013  * by delayed write so that subsequent modifications will hit the cached buffer
3014  * and only need writing out at the end of recovery.
3015  */
3016 STATIC int
3017 xlog_recover_do_icreate_pass2(
3018         struct xlog             *log,
3019         struct list_head        *buffer_list,
3020         xlog_recover_item_t     *item)
3021 {
3022         struct xfs_mount        *mp = log->l_mp;
3023         struct xfs_icreate_log  *icl;
3024         xfs_agnumber_t          agno;
3025         xfs_agblock_t           agbno;
3026         unsigned int            count;
3027         unsigned int            isize;
3028         xfs_agblock_t           length;
3029
3030         icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3031         if (icl->icl_type != XFS_LI_ICREATE) {
3032                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3033                 return EINVAL;
3034         }
3035
3036         if (icl->icl_size != 1) {
3037                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3038                 return EINVAL;
3039         }
3040
3041         agno = be32_to_cpu(icl->icl_ag);
3042         if (agno >= mp->m_sb.sb_agcount) {
3043                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3044                 return EINVAL;
3045         }
3046         agbno = be32_to_cpu(icl->icl_agbno);
3047         if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3048                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3049                 return EINVAL;
3050         }
3051         isize = be32_to_cpu(icl->icl_isize);
3052         if (isize != mp->m_sb.sb_inodesize) {
3053                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3054                 return EINVAL;
3055         }
3056         count = be32_to_cpu(icl->icl_count);
3057         if (!count) {
3058                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3059                 return EINVAL;
3060         }
3061         length = be32_to_cpu(icl->icl_length);
3062         if (!length || length >= mp->m_sb.sb_agblocks) {
3063                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3064                 return EINVAL;
3065         }
3066
3067         /* existing allocation is fixed value */
3068         ASSERT(count == XFS_IALLOC_INODES(mp));
3069         ASSERT(length == XFS_IALLOC_BLOCKS(mp));
3070         if (count != XFS_IALLOC_INODES(mp) ||
3071              length != XFS_IALLOC_BLOCKS(mp)) {
3072                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
3073                 return EINVAL;
3074         }
3075
3076         /*
3077          * Inode buffers can be freed. Do not replay the inode initialisation as
3078          * we could be overwriting something written after this inode buffer was
3079          * cancelled.
3080          *
3081          * XXX: we need to iterate all buffers and only init those that are not
3082          * cancelled. I think that a more fine grained factoring of
3083          * xfs_ialloc_inode_init may be appropriate here to enable this to be
3084          * done easily.
3085          */
3086         if (xlog_check_buffer_cancelled(log,
3087                         XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
3088                 return 0;
3089
3090         xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
3091                                         be32_to_cpu(icl->icl_gen));
3092         return 0;
3093 }
3094
3095 /*
3096  * Free up any resources allocated by the transaction
3097  *
3098  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
3099  */
3100 STATIC void
3101 xlog_recover_free_trans(
3102         struct xlog_recover     *trans)
3103 {
3104         xlog_recover_item_t     *item, *n;
3105         int                     i;
3106
3107         list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
3108                 /* Free the regions in the item. */
3109                 list_del(&item->ri_list);
3110                 for (i = 0; i < item->ri_cnt; i++)
3111                         kmem_free(item->ri_buf[i].i_addr);
3112                 /* Free the item itself */
3113                 kmem_free(item->ri_buf);
3114                 kmem_free(item);
3115         }
3116         /* Free the transaction recover structure */
3117         kmem_free(trans);
3118 }
3119
3120 STATIC int
3121 xlog_recover_commit_pass1(
3122         struct xlog                     *log,
3123         struct xlog_recover             *trans,
3124         struct xlog_recover_item        *item)
3125 {
3126         trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
3127
3128         switch (ITEM_TYPE(item)) {
3129         case XFS_LI_BUF:
3130                 return xlog_recover_buffer_pass1(log, item);
3131         case XFS_LI_QUOTAOFF:
3132                 return xlog_recover_quotaoff_pass1(log, item);
3133         case XFS_LI_INODE:
3134         case XFS_LI_EFI:
3135         case XFS_LI_EFD:
3136         case XFS_LI_DQUOT:
3137         case XFS_LI_ICREATE:
3138                 /* nothing to do in pass 1 */
3139                 return 0;
3140         default:
3141                 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3142                         __func__, ITEM_TYPE(item));
3143                 ASSERT(0);
3144                 return XFS_ERROR(EIO);
3145         }
3146 }
3147
3148 STATIC int
3149 xlog_recover_commit_pass2(
3150         struct xlog                     *log,
3151         struct xlog_recover             *trans,
3152         struct list_head                *buffer_list,
3153         struct xlog_recover_item        *item)
3154 {
3155         trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
3156
3157         switch (ITEM_TYPE(item)) {
3158         case XFS_LI_BUF:
3159                 return xlog_recover_buffer_pass2(log, buffer_list, item);
3160         case XFS_LI_INODE:
3161                 return xlog_recover_inode_pass2(log, buffer_list, item);
3162         case XFS_LI_EFI:
3163                 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
3164         case XFS_LI_EFD:
3165                 return xlog_recover_efd_pass2(log, item);
3166         case XFS_LI_DQUOT:
3167                 return xlog_recover_dquot_pass2(log, buffer_list, item);
3168         case XFS_LI_ICREATE:
3169                 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3170         case XFS_LI_QUOTAOFF:
3171                 /* nothing to do in pass2 */
3172                 return 0;
3173         default:
3174                 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3175                         __func__, ITEM_TYPE(item));
3176                 ASSERT(0);
3177                 return XFS_ERROR(EIO);
3178         }
3179 }
3180
3181 /*
3182  * Perform the transaction.
3183  *
3184  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
3185  * EFIs and EFDs get queued up by adding entries into the AIL for them.
3186  */
3187 STATIC int
3188 xlog_recover_commit_trans(
3189         struct xlog             *log,
3190         struct xlog_recover     *trans,
3191         int                     pass)
3192 {
3193         int                     error = 0, error2;
3194         xlog_recover_item_t     *item;
3195         LIST_HEAD               (buffer_list);
3196
3197         hlist_del(&trans->r_list);
3198
3199         error = xlog_recover_reorder_trans(log, trans, pass);
3200         if (error)
3201                 return error;
3202
3203         list_for_each_entry(item, &trans->r_itemq, ri_list) {
3204                 switch (pass) {
3205                 case XLOG_RECOVER_PASS1:
3206                         error = xlog_recover_commit_pass1(log, trans, item);
3207                         break;
3208                 case XLOG_RECOVER_PASS2:
3209                         error = xlog_recover_commit_pass2(log, trans,
3210                                                           &buffer_list, item);
3211                         break;
3212                 default:
3213                         ASSERT(0);
3214                 }
3215
3216                 if (error)
3217                         goto out;
3218         }
3219
3220         xlog_recover_free_trans(trans);
3221
3222 out:
3223         error2 = xfs_buf_delwri_submit(&buffer_list);
3224         return error ? error : error2;
3225 }
3226
3227 STATIC int
3228 xlog_recover_unmount_trans(
3229         struct xlog             *log,
3230         struct xlog_recover     *trans)
3231 {
3232         /* Do nothing now */
3233         xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
3234         return 0;
3235 }
3236
3237 /*
3238  * There are two valid states of the r_state field.  0 indicates that the
3239  * transaction structure is in a normal state.  We have either seen the
3240  * start of the transaction or the last operation we added was not a partial
3241  * operation.  If the last operation we added to the transaction was a
3242  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
3243  *
3244  * NOTE: skip LRs with 0 data length.
3245  */
3246 STATIC int
3247 xlog_recover_process_data(
3248         struct xlog             *log,
3249         struct hlist_head       rhash[],
3250         struct xlog_rec_header  *rhead,
3251         xfs_caddr_t             dp,
3252         int                     pass)
3253 {
3254         xfs_caddr_t             lp;
3255         int                     num_logops;
3256         xlog_op_header_t        *ohead;
3257         xlog_recover_t          *trans;
3258         xlog_tid_t              tid;
3259         int                     error;
3260         unsigned long           hash;
3261         uint                    flags;
3262
3263         lp = dp + be32_to_cpu(rhead->h_len);
3264         num_logops = be32_to_cpu(rhead->h_num_logops);
3265
3266         /* check the log format matches our own - else we can't recover */
3267         if (xlog_header_check_recover(log->l_mp, rhead))
3268                 return (XFS_ERROR(EIO));
3269
3270         while ((dp < lp) && num_logops) {
3271                 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
3272                 ohead = (xlog_op_header_t *)dp;
3273                 dp += sizeof(xlog_op_header_t);
3274                 if (ohead->oh_clientid != XFS_TRANSACTION &&
3275                     ohead->oh_clientid != XFS_LOG) {
3276                         xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
3277                                         __func__, ohead->oh_clientid);
3278                         ASSERT(0);
3279                         return (XFS_ERROR(EIO));
3280                 }
3281                 tid = be32_to_cpu(ohead->oh_tid);
3282                 hash = XLOG_RHASH(tid);
3283                 trans = xlog_recover_find_tid(&rhash[hash], tid);
3284                 if (trans == NULL) {               /* not found; add new tid */
3285                         if (ohead->oh_flags & XLOG_START_TRANS)
3286                                 xlog_recover_new_tid(&rhash[hash], tid,
3287                                         be64_to_cpu(rhead->h_lsn));
3288                 } else {
3289                         if (dp + be32_to_cpu(ohead->oh_len) > lp) {
3290                                 xfs_warn(log->l_mp, "%s: bad length 0x%x",
3291                                         __func__, be32_to_cpu(ohead->oh_len));
3292                                 WARN_ON(1);
3293                                 return (XFS_ERROR(EIO));
3294                         }
3295                         flags = ohead->oh_flags & ~XLOG_END_TRANS;
3296                         if (flags & XLOG_WAS_CONT_TRANS)
3297                                 flags &= ~XLOG_CONTINUE_TRANS;
3298                         switch (flags) {
3299                         case XLOG_COMMIT_TRANS:
3300                                 error = xlog_recover_commit_trans(log,
3301                                                                 trans, pass);
3302                                 break;
3303                         case XLOG_UNMOUNT_TRANS:
3304                                 error = xlog_recover_unmount_trans(log, trans);
3305                                 break;
3306                         case XLOG_WAS_CONT_TRANS:
3307                                 error = xlog_recover_add_to_cont_trans(log,
3308                                                 trans, dp,
3309                                                 be32_to_cpu(ohead->oh_len));
3310                                 break;
3311                         case XLOG_START_TRANS:
3312                                 xfs_warn(log->l_mp, "%s: bad transaction",
3313                                         __func__);
3314                                 ASSERT(0);
3315                                 error = XFS_ERROR(EIO);
3316                                 break;
3317                         case 0:
3318                         case XLOG_CONTINUE_TRANS:
3319                                 error = xlog_recover_add_to_trans(log, trans,
3320                                                 dp, be32_to_cpu(ohead->oh_len));
3321                                 break;
3322                         default:
3323                                 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
3324                                         __func__, flags);
3325                                 ASSERT(0);
3326                                 error = XFS_ERROR(EIO);
3327                                 break;
3328                         }
3329                         if (error)
3330                                 return error;
3331                 }
3332                 dp += be32_to_cpu(ohead->oh_len);
3333                 num_logops--;
3334         }
3335         return 0;
3336 }
3337
3338 /*
3339  * Process an extent free intent item that was recovered from
3340  * the log.  We need to free the extents that it describes.
3341  */
3342 STATIC int
3343 xlog_recover_process_efi(
3344         xfs_mount_t             *mp,
3345         xfs_efi_log_item_t      *efip)
3346 {
3347         xfs_efd_log_item_t      *efdp;
3348         xfs_trans_t             *tp;
3349         int                     i;
3350         int                     error = 0;
3351         xfs_extent_t            *extp;
3352         xfs_fsblock_t           startblock_fsb;
3353
3354         ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
3355
3356         /*
3357          * First check the validity of the extents described by the
3358          * EFI.  If any are bad, then assume that all are bad and
3359          * just toss the EFI.
3360          */
3361         for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3362                 extp = &(efip->efi_format.efi_extents[i]);
3363                 startblock_fsb = XFS_BB_TO_FSB(mp,
3364                                    XFS_FSB_TO_DADDR(mp, extp->ext_start));
3365                 if ((startblock_fsb == 0) ||
3366                     (extp->ext_len == 0) ||
3367                     (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3368                     (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3369                         /*
3370                          * This will pull the EFI from the AIL and
3371                          * free the memory associated with it.
3372                          */
3373                         set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3374                         xfs_efi_release(efip, efip->efi_format.efi_nextents);
3375                         return XFS_ERROR(EIO);
3376                 }
3377         }
3378
3379         tp = xfs_trans_alloc(mp, 0);
3380         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
3381         if (error)
3382                 goto abort_error;
3383         efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3384
3385         for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3386                 extp = &(efip->efi_format.efi_extents[i]);
3387                 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3388                 if (error)
3389                         goto abort_error;
3390                 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3391                                          extp->ext_len);
3392         }
3393
3394         set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3395         error = xfs_trans_commit(tp, 0);
3396         return error;
3397
3398 abort_error:
3399         xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3400         return error;
3401 }
3402
3403 /*
3404  * When this is called, all of the EFIs which did not have
3405  * corresponding EFDs should be in the AIL.  What we do now
3406  * is free the extents associated with each one.
3407  *
3408  * Since we process the EFIs in normal transactions, they
3409  * will be removed at some point after the commit.  This prevents
3410  * us from just walking down the list processing each one.
3411  * We'll use a flag in the EFI to skip those that we've already
3412  * processed and use the AIL iteration mechanism's generation
3413  * count to try to speed this up at least a bit.
3414  *
3415  * When we start, we know that the EFIs are the only things in
3416  * the AIL.  As we process them, however, other items are added
3417  * to the AIL.  Since everything added to the AIL must come after
3418  * everything already in the AIL, we stop processing as soon as
3419  * we see something other than an EFI in the AIL.
3420  */
3421 STATIC int
3422 xlog_recover_process_efis(
3423         struct xlog     *log)
3424 {
3425         xfs_log_item_t          *lip;
3426         xfs_efi_log_item_t      *efip;
3427         int                     error = 0;
3428         struct xfs_ail_cursor   cur;
3429         struct xfs_ail          *ailp;
3430
3431         ailp = log->l_ailp;
3432         spin_lock(&ailp->xa_lock);
3433         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3434         while (lip != NULL) {
3435                 /*
3436                  * We're done when we see something other than an EFI.
3437                  * There should be no EFIs left in the AIL now.
3438                  */
3439                 if (lip->li_type != XFS_LI_EFI) {
3440 #ifdef DEBUG
3441                         for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3442                                 ASSERT(lip->li_type != XFS_LI_EFI);
3443 #endif
3444                         break;
3445                 }
3446
3447                 /*
3448                  * Skip EFIs that we've already processed.
3449                  */
3450                 efip = (xfs_efi_log_item_t *)lip;
3451                 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
3452                         lip = xfs_trans_ail_cursor_next(ailp, &cur);
3453                         continue;
3454                 }
3455
3456                 spin_unlock(&ailp->xa_lock);
3457                 error = xlog_recover_process_efi(log->l_mp, efip);
3458                 spin_lock(&ailp->xa_lock);
3459                 if (error)
3460                         goto out;
3461                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3462         }
3463 out:
3464         xfs_trans_ail_cursor_done(ailp, &cur);
3465         spin_unlock(&ailp->xa_lock);
3466         return error;
3467 }
3468
3469 /*
3470  * This routine performs a transaction to null out a bad inode pointer
3471  * in an agi unlinked inode hash bucket.
3472  */
3473 STATIC void
3474 xlog_recover_clear_agi_bucket(
3475         xfs_mount_t     *mp,
3476         xfs_agnumber_t  agno,
3477         int             bucket)
3478 {
3479         xfs_trans_t     *tp;
3480         xfs_agi_t       *agi;
3481         xfs_buf_t       *agibp;
3482         int             offset;
3483         int             error;
3484
3485         tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3486         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
3487         if (error)
3488                 goto out_abort;
3489
3490         error = xfs_read_agi(mp, tp, agno, &agibp);
3491         if (error)
3492                 goto out_abort;
3493
3494         agi = XFS_BUF_TO_AGI(agibp);
3495         agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3496         offset = offsetof(xfs_agi_t, agi_unlinked) +
3497                  (sizeof(xfs_agino_t) * bucket);
3498         xfs_trans_log_buf(tp, agibp, offset,
3499                           (offset + sizeof(xfs_agino_t) - 1));
3500
3501         error = xfs_trans_commit(tp, 0);
3502         if (error)
3503                 goto out_error;
3504         return;
3505
3506 out_abort:
3507         xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3508 out_error:
3509         xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3510         return;
3511 }
3512
3513 STATIC xfs_agino_t
3514 xlog_recover_process_one_iunlink(
3515         struct xfs_mount                *mp,
3516         xfs_agnumber_t                  agno,
3517         xfs_agino_t                     agino,
3518         int                             bucket)
3519 {
3520         struct xfs_buf                  *ibp;
3521         struct xfs_dinode               *dip;
3522         struct xfs_inode                *ip;
3523         xfs_ino_t                       ino;
3524         int                             error;
3525
3526         ino = XFS_AGINO_TO_INO(mp, agno, agino);
3527         error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
3528         if (error)
3529                 goto fail;
3530
3531         /*
3532          * Get the on disk inode to find the next inode in the bucket.
3533          */
3534         error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
3535         if (error)
3536                 goto fail_iput;
3537
3538         ASSERT(ip->i_d.di_nlink == 0);
3539         ASSERT(ip->i_d.di_mode != 0);
3540
3541         /* setup for the next pass */
3542         agino = be32_to_cpu(dip->di_next_unlinked);
3543         xfs_buf_relse(ibp);
3544
3545         /*
3546          * Prevent any DMAPI event from being sent when the reference on
3547          * the inode is dropped.
3548          */
3549         ip->i_d.di_dmevmask = 0;
3550
3551         IRELE(ip);
3552         return agino;
3553
3554  fail_iput:
3555         IRELE(ip);
3556  fail:
3557         /*
3558          * We can't read in the inode this bucket points to, or this inode
3559          * is messed up.  Just ditch this bucket of inodes.  We will lose
3560          * some inodes and space, but at least we won't hang.
3561          *
3562          * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3563          * clear the inode pointer in the bucket.
3564          */
3565         xlog_recover_clear_agi_bucket(mp, agno, bucket);
3566         return NULLAGINO;
3567 }
3568
3569 /*
3570  * xlog_iunlink_recover
3571  *
3572  * This is called during recovery to process any inodes which
3573  * we unlinked but not freed when the system crashed.  These
3574  * inodes will be on the lists in the AGI blocks.  What we do
3575  * here is scan all the AGIs and fully truncate and free any
3576  * inodes found on the lists.  Each inode is removed from the
3577  * lists when it has been fully truncated and is freed.  The
3578  * freeing of the inode and its removal from the list must be
3579  * atomic.
3580  */
3581 STATIC void
3582 xlog_recover_process_iunlinks(
3583         struct xlog     *log)
3584 {
3585         xfs_mount_t     *mp;
3586         xfs_agnumber_t  agno;
3587         xfs_agi_t       *agi;
3588         xfs_buf_t       *agibp;
3589         xfs_agino_t     agino;
3590         int             bucket;
3591         int             error;
3592         uint            mp_dmevmask;
3593
3594         mp = log->l_mp;
3595
3596         /*
3597          * Prevent any DMAPI event from being sent while in this function.
3598          */
3599         mp_dmevmask = mp->m_dmevmask;
3600         mp->m_dmevmask = 0;
3601
3602         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3603                 /*
3604                  * Find the agi for this ag.
3605                  */
3606                 error = xfs_read_agi(mp, NULL, agno, &agibp);
3607                 if (error) {
3608                         /*
3609                          * AGI is b0rked. Don't process it.
3610                          *
3611                          * We should probably mark the filesystem as corrupt
3612                          * after we've recovered all the ag's we can....
3613                          */
3614                         continue;
3615                 }
3616                 /*
3617                  * Unlock the buffer so that it can be acquired in the normal
3618                  * course of the transaction to truncate and free each inode.
3619                  * Because we are not racing with anyone else here for the AGI
3620                  * buffer, we don't even need to hold it locked to read the
3621                  * initial unlinked bucket entries out of the buffer. We keep
3622                  * buffer reference though, so that it stays pinned in memory
3623                  * while we need the buffer.
3624                  */
3625                 agi = XFS_BUF_TO_AGI(agibp);
3626                 xfs_buf_unlock(agibp);
3627
3628                 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3629                         agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3630                         while (agino != NULLAGINO) {
3631                                 agino = xlog_recover_process_one_iunlink(mp,
3632                                                         agno, agino, bucket);
3633                         }
3634                 }
3635                 xfs_buf_rele(agibp);
3636         }
3637
3638         mp->m_dmevmask = mp_dmevmask;
3639 }
3640
3641 /*
3642  * Upack the log buffer data and crc check it. If the check fails, issue a
3643  * warning if and only if the CRC in the header is non-zero. This makes the
3644  * check an advisory warning, and the zero CRC check will prevent failure
3645  * warnings from being emitted when upgrading the kernel from one that does not
3646  * add CRCs by default.
3647  *
3648  * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
3649  * corruption failure
3650  */
3651 STATIC int
3652 xlog_unpack_data_crc(
3653         struct xlog_rec_header  *rhead,
3654         xfs_caddr_t             dp,
3655         struct xlog             *log)
3656 {
3657         __le32                  crc;
3658
3659         crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
3660         if (crc != rhead->h_crc) {
3661                 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
3662                         xfs_alert(log->l_mp,
3663                 "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
3664                                         le32_to_cpu(rhead->h_crc),
3665                                         le32_to_cpu(crc));
3666                         xfs_hex_dump(dp, 32);
3667                 }
3668
3669                 /*
3670                  * If we've detected a log record corruption, then we can't
3671                  * recover past this point. Abort recovery if we are enforcing
3672                  * CRC protection by punting an error back up the stack.
3673                  */
3674                 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3675                         return EFSCORRUPTED;
3676         }
3677
3678         return 0;
3679 }
3680
3681 STATIC int
3682 xlog_unpack_data(
3683         struct xlog_rec_header  *rhead,
3684         xfs_caddr_t             dp,
3685         struct xlog             *log)
3686 {
3687         int                     i, j, k;
3688         int                     error;
3689
3690         error = xlog_unpack_data_crc(rhead, dp, log);
3691         if (error)
3692                 return error;
3693
3694         for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3695                   i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3696                 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3697                 dp += BBSIZE;
3698         }
3699
3700         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3701                 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3702                 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3703                         j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3704                         k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3705                         *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3706                         dp += BBSIZE;
3707                 }
3708         }
3709
3710         return 0;
3711 }
3712
3713 STATIC int
3714 xlog_valid_rec_header(
3715         struct xlog             *log,
3716         struct xlog_rec_header  *rhead,
3717         xfs_daddr_t             blkno)
3718 {
3719         int                     hlen;
3720
3721         if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
3722                 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3723                                 XFS_ERRLEVEL_LOW, log->l_mp);
3724                 return XFS_ERROR(EFSCORRUPTED);
3725         }
3726         if (unlikely(
3727             (!rhead->h_version ||
3728             (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3729                 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3730                         __func__, be32_to_cpu(rhead->h_version));
3731                 return XFS_ERROR(EIO);
3732         }
3733
3734         /* LR body must have data or it wouldn't have been written */
3735         hlen = be32_to_cpu(rhead->h_len);
3736         if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3737                 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3738                                 XFS_ERRLEVEL_LOW, log->l_mp);
3739                 return XFS_ERROR(EFSCORRUPTED);
3740         }
3741         if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3742                 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3743                                 XFS_ERRLEVEL_LOW, log->l_mp);
3744                 return XFS_ERROR(EFSCORRUPTED);
3745         }
3746         return 0;
3747 }
3748
3749 /*
3750  * Read the log from tail to head and process the log records found.
3751  * Handle the two cases where the tail and head are in the same cycle
3752  * and where the active portion of the log wraps around the end of
3753  * the physical log separately.  The pass parameter is passed through
3754  * to the routines called to process the data and is not looked at
3755  * here.
3756  */
3757 STATIC int
3758 xlog_do_recovery_pass(
3759         struct xlog             *log,
3760         xfs_daddr_t             head_blk,
3761         xfs_daddr_t             tail_blk,
3762         int                     pass)
3763 {
3764         xlog_rec_header_t       *rhead;
3765         xfs_daddr_t             blk_no;
3766         xfs_caddr_t             offset;
3767         xfs_buf_t               *hbp, *dbp;
3768         int                     error = 0, h_size;
3769         int                     bblks, split_bblks;
3770         int                     hblks, split_hblks, wrapped_hblks;
3771         struct hlist_head       rhash[XLOG_RHASH_SIZE];
3772
3773         ASSERT(head_blk != tail_blk);
3774
3775         /*
3776          * Read the header of the tail block and get the iclog buffer size from
3777          * h_size.  Use this to tell how many sectors make up the log header.
3778          */
3779         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3780                 /*
3781                  * When using variable length iclogs, read first sector of
3782                  * iclog header and extract the header size from it.  Get a
3783                  * new hbp that is the correct size.
3784                  */
3785                 hbp = xlog_get_bp(log, 1);
3786                 if (!hbp)
3787                         return ENOMEM;
3788
3789                 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3790                 if (error)
3791                         goto bread_err1;
3792
3793                 rhead = (xlog_rec_header_t *)offset;
3794                 error = xlog_valid_rec_header(log, rhead, tail_blk);
3795                 if (error)
3796                         goto bread_err1;
3797                 h_size = be32_to_cpu(rhead->h_size);
3798                 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3799                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3800                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3801                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
3802                                 hblks++;
3803                         xlog_put_bp(hbp);
3804                         hbp = xlog_get_bp(log, hblks);
3805                 } else {
3806                         hblks = 1;
3807                 }
3808         } else {
3809                 ASSERT(log->l_sectBBsize == 1);
3810                 hblks = 1;
3811                 hbp = xlog_get_bp(log, 1);
3812                 h_size = XLOG_BIG_RECORD_BSIZE;
3813         }
3814
3815         if (!hbp)
3816                 return ENOMEM;
3817         dbp = xlog_get_bp(log, BTOBB(h_size));
3818         if (!dbp) {
3819                 xlog_put_bp(hbp);
3820                 return ENOMEM;
3821         }
3822
3823         memset(rhash, 0, sizeof(rhash));
3824         if (tail_blk <= head_blk) {
3825                 for (blk_no = tail_blk; blk_no < head_blk; ) {
3826                         error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3827                         if (error)
3828                                 goto bread_err2;
3829
3830                         rhead = (xlog_rec_header_t *)offset;
3831                         error = xlog_valid_rec_header(log, rhead, blk_no);
3832                         if (error)
3833                                 goto bread_err2;
3834
3835                         /* blocks in data section */
3836                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3837                         error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3838                                            &offset);
3839                         if (error)
3840                                 goto bread_err2;
3841
3842                         error = xlog_unpack_data(rhead, offset, log);
3843                         if (error)
3844                                 goto bread_err2;
3845
3846                         error = xlog_recover_process_data(log,
3847                                                 rhash, rhead, offset, pass);
3848                         if (error)
3849                                 goto bread_err2;
3850                         blk_no += bblks + hblks;
3851                 }
3852         } else {
3853                 /*
3854                  * Perform recovery around the end of the physical log.
3855                  * When the head is not on the same cycle number as the tail,
3856                  * we can't do a sequential recovery as above.
3857                  */
3858                 blk_no = tail_blk;
3859                 while (blk_no < log->l_logBBsize) {
3860                         /*
3861                          * Check for header wrapping around physical end-of-log
3862                          */
3863                         offset = hbp->b_addr;
3864                         split_hblks = 0;
3865                         wrapped_hblks = 0;
3866                         if (blk_no + hblks <= log->l_logBBsize) {
3867                                 /* Read header in one read */
3868                                 error = xlog_bread(log, blk_no, hblks, hbp,
3869                                                    &offset);
3870                                 if (error)
3871                                         goto bread_err2;
3872                         } else {
3873                                 /* This LR is split across physical log end */
3874                                 if (blk_no != log->l_logBBsize) {
3875                                         /* some data before physical log end */
3876                                         ASSERT(blk_no <= INT_MAX);
3877                                         split_hblks = log->l_logBBsize - (int)blk_no;
3878                                         ASSERT(split_hblks > 0);
3879                                         error = xlog_bread(log, blk_no,
3880                                                            split_hblks, hbp,
3881                                                            &offset);
3882                                         if (error)
3883                                                 goto bread_err2;
3884                                 }
3885
3886                                 /*
3887                                  * Note: this black magic still works with
3888                                  * large sector sizes (non-512) only because:
3889                                  * - we increased the buffer size originally
3890                                  *   by 1 sector giving us enough extra space
3891                                  *   for the second read;
3892                                  * - the log start is guaranteed to be sector
3893                                  *   aligned;
3894                                  * - we read the log end (LR header start)
3895                                  *   _first_, then the log start (LR header end)
3896                                  *   - order is important.
3897                                  */
3898                                 wrapped_hblks = hblks - split_hblks;
3899                                 error = xlog_bread_offset(log, 0,
3900                                                 wrapped_hblks, hbp,
3901                                                 offset + BBTOB(split_hblks));
3902                                 if (error)
3903                                         goto bread_err2;
3904                         }
3905                         rhead = (xlog_rec_header_t *)offset;
3906                         error = xlog_valid_rec_header(log, rhead,
3907                                                 split_hblks ? blk_no : 0);
3908                         if (error)
3909                                 goto bread_err2;
3910
3911                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3912                         blk_no += hblks;
3913
3914                         /* Read in data for log record */
3915                         if (blk_no + bblks <= log->l_logBBsize) {
3916                                 error = xlog_bread(log, blk_no, bblks, dbp,
3917                                                    &offset);
3918                                 if (error)
3919                                         goto bread_err2;
3920                         } else {
3921                                 /* This log record is split across the
3922                                  * physical end of log */
3923                                 offset = dbp->b_addr;
3924                                 split_bblks = 0;
3925                                 if (blk_no != log->l_logBBsize) {
3926                                         /* some data is before the physical
3927                                          * end of log */
3928                                         ASSERT(!wrapped_hblks);
3929                                         ASSERT(blk_no <= INT_MAX);
3930                                         split_bblks =
3931                                                 log->l_logBBsize - (int)blk_no;
3932                                         ASSERT(split_bblks > 0);
3933                                         error = xlog_bread(log, blk_no,
3934                                                         split_bblks, dbp,
3935                                                         &offset);
3936                                         if (error)
3937                                                 goto bread_err2;
3938                                 }
3939
3940                                 /*
3941                                  * Note: this black magic still works with
3942                                  * large sector sizes (non-512) only because:
3943                                  * - we increased the buffer size originally
3944                                  *   by 1 sector giving us enough extra space
3945                                  *   for the second read;
3946                                  * - the log start is guaranteed to be sector
3947                                  *   aligned;
3948                                  * - we read the log end (LR header start)
3949                                  *   _first_, then the log start (LR header end)
3950                                  *   - order is important.
3951                                  */
3952                                 error = xlog_bread_offset(log, 0,
3953                                                 bblks - split_bblks, dbp,
3954                                                 offset + BBTOB(split_bblks));
3955                                 if (error)
3956                                         goto bread_err2;
3957                         }
3958
3959                         error = xlog_unpack_data(rhead, offset, log);
3960                         if (error)
3961                                 goto bread_err2;
3962
3963                         error = xlog_recover_process_data(log, rhash,
3964                                                         rhead, offset, pass);
3965                         if (error)
3966                                 goto bread_err2;
3967                         blk_no += bblks;
3968                 }
3969
3970                 ASSERT(blk_no >= log->l_logBBsize);
3971                 blk_no -= log->l_logBBsize;
3972
3973                 /* read first part of physical log */
3974                 while (blk_no < head_blk) {
3975                         error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3976                         if (error)
3977                                 goto bread_err2;
3978
3979                         rhead = (xlog_rec_header_t *)offset;
3980                         error = xlog_valid_rec_header(log, rhead, blk_no);
3981                         if (error)
3982                                 goto bread_err2;
3983
3984                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3985                         error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3986                                            &offset);
3987                         if (error)
3988                                 goto bread_err2;
3989
3990                         error = xlog_unpack_data(rhead, offset, log);
3991                         if (error)
3992                                 goto bread_err2;
3993
3994                         error = xlog_recover_process_data(log, rhash,
3995                                                         rhead, offset, pass);
3996                         if (error)
3997                                 goto bread_err2;
3998                         blk_no += bblks + hblks;
3999                 }
4000         }
4001
4002  bread_err2:
4003         xlog_put_bp(dbp);
4004  bread_err1:
4005         xlog_put_bp(hbp);
4006         return error;
4007 }
4008
4009 /*
4010  * Do the recovery of the log.  We actually do this in two phases.
4011  * The two passes are necessary in order to implement the function
4012  * of cancelling a record written into the log.  The first pass
4013  * determines those things which have been cancelled, and the
4014  * second pass replays log items normally except for those which
4015  * have been cancelled.  The handling of the replay and cancellations
4016  * takes place in the log item type specific routines.
4017  *
4018  * The table of items which have cancel records in the log is allocated
4019  * and freed at this level, since only here do we know when all of
4020  * the log recovery has been completed.
4021  */
4022 STATIC int
4023 xlog_do_log_recovery(
4024         struct xlog     *log,
4025         xfs_daddr_t     head_blk,
4026         xfs_daddr_t     tail_blk)
4027 {
4028         int             error, i;
4029
4030         ASSERT(head_blk != tail_blk);
4031
4032         /*
4033          * First do a pass to find all of the cancelled buf log items.
4034          * Store them in the buf_cancel_table for use in the second pass.
4035          */
4036         log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
4037                                                  sizeof(struct list_head),
4038                                                  KM_SLEEP);
4039         for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
4040                 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
4041
4042         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
4043                                       XLOG_RECOVER_PASS1);
4044         if (error != 0) {
4045                 kmem_free(log->l_buf_cancel_table);
4046                 log->l_buf_cancel_table = NULL;
4047                 return error;
4048         }
4049         /*
4050          * Then do a second pass to actually recover the items in the log.
4051          * When it is complete free the table of buf cancel items.
4052          */
4053         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
4054                                       XLOG_RECOVER_PASS2);
4055 #ifdef DEBUG
4056         if (!error) {
4057                 int     i;
4058
4059                 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
4060                         ASSERT(list_empty(&log->l_buf_cancel_table[i]));
4061         }
4062 #endif  /* DEBUG */
4063
4064         kmem_free(log->l_buf_cancel_table);
4065         log->l_buf_cancel_table = NULL;
4066
4067         return error;
4068 }
4069
4070 /*
4071  * Do the actual recovery
4072  */
4073 STATIC int
4074 xlog_do_recover(
4075         struct xlog     *log,
4076         xfs_daddr_t     head_blk,
4077         xfs_daddr_t     tail_blk)
4078 {
4079         int             error;
4080         xfs_buf_t       *bp;
4081         xfs_sb_t        *sbp;
4082
4083         /*
4084          * First replay the images in the log.
4085          */
4086         error = xlog_do_log_recovery(log, head_blk, tail_blk);
4087         if (error)
4088                 return error;
4089
4090         /*
4091          * If IO errors happened during recovery, bail out.
4092          */
4093         if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
4094                 return (EIO);
4095         }
4096
4097         /*
4098          * We now update the tail_lsn since much of the recovery has completed
4099          * and there may be space available to use.  If there were no extent
4100          * or iunlinks, we can free up the entire log and set the tail_lsn to
4101          * be the last_sync_lsn.  This was set in xlog_find_tail to be the
4102          * lsn of the last known good LR on disk.  If there are extent frees
4103          * or iunlinks they will have some entries in the AIL; so we look at
4104          * the AIL to determine how to set the tail_lsn.
4105          */
4106         xlog_assign_tail_lsn(log->l_mp);
4107
4108         /*
4109          * Now that we've finished replaying all buffer and inode
4110          * updates, re-read in the superblock and reverify it.
4111          */
4112         bp = xfs_getsb(log->l_mp, 0);
4113         XFS_BUF_UNDONE(bp);
4114         ASSERT(!(XFS_BUF_ISWRITE(bp)));
4115         XFS_BUF_READ(bp);
4116         XFS_BUF_UNASYNC(bp);
4117         bp->b_ops = &xfs_sb_buf_ops;
4118         xfsbdstrat(log->l_mp, bp);
4119         error = xfs_buf_iowait(bp);
4120         if (error) {
4121                 xfs_buf_ioerror_alert(bp, __func__);
4122                 ASSERT(0);
4123                 xfs_buf_relse(bp);
4124                 return error;
4125         }
4126
4127         /* Convert superblock from on-disk format */
4128         sbp = &log->l_mp->m_sb;
4129         xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
4130         ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
4131         ASSERT(xfs_sb_good_version(sbp));
4132         xfs_buf_relse(bp);
4133
4134         /* We've re-read the superblock so re-initialize per-cpu counters */
4135         xfs_icsb_reinit_counters(log->l_mp);
4136
4137         xlog_recover_check_summary(log);
4138
4139         /* Normal transactions can now occur */
4140         log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
4141         return 0;
4142 }
4143
4144 /*
4145  * Perform recovery and re-initialize some log variables in xlog_find_tail.
4146  *
4147  * Return error or zero.
4148  */
4149 int
4150 xlog_recover(
4151         struct xlog     *log)
4152 {
4153         xfs_daddr_t     head_blk, tail_blk;
4154         int             error;
4155
4156         /* find the tail of the log */
4157         if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
4158                 return error;
4159
4160         if (tail_blk != head_blk) {
4161                 /* There used to be a comment here:
4162                  *
4163                  * disallow recovery on read-only mounts.  note -- mount
4164                  * checks for ENOSPC and turns it into an intelligent
4165                  * error message.
4166                  * ...but this is no longer true.  Now, unless you specify
4167                  * NORECOVERY (in which case this function would never be
4168                  * called), we just go ahead and recover.  We do this all
4169                  * under the vfs layer, so we can get away with it unless
4170                  * the device itself is read-only, in which case we fail.
4171                  */
4172                 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
4173                         return error;
4174                 }
4175
4176                 /*
4177                  * Version 5 superblock log feature mask validation. We know the
4178                  * log is dirty so check if there are any unknown log features
4179                  * in what we need to recover. If there are unknown features
4180                  * (e.g. unsupported transactions, then simply reject the
4181                  * attempt at recovery before touching anything.
4182                  */
4183                 if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
4184                     xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
4185                                         XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
4186                         xfs_warn(log->l_mp,
4187 "Superblock has unknown incompatible log features (0x%x) enabled.\n"
4188 "The log can not be fully and/or safely recovered by this kernel.\n"
4189 "Please recover the log on a kernel that supports the unknown features.",
4190                                 (log->l_mp->m_sb.sb_features_log_incompat &
4191                                         XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
4192                         return EINVAL;
4193                 }
4194
4195                 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
4196                                 log->l_mp->m_logname ? log->l_mp->m_logname
4197                                                      : "internal");
4198
4199                 error = xlog_do_recover(log, head_blk, tail_blk);
4200                 log->l_flags |= XLOG_RECOVERY_NEEDED;
4201         }
4202         return error;
4203 }
4204
4205 /*
4206  * In the first part of recovery we replay inodes and buffers and build
4207  * up the list of extent free items which need to be processed.  Here
4208  * we process the extent free items and clean up the on disk unlinked
4209  * inode lists.  This is separated from the first part of recovery so
4210  * that the root and real-time bitmap inodes can be read in from disk in
4211  * between the two stages.  This is necessary so that we can free space
4212  * in the real-time portion of the file system.
4213  */
4214 int
4215 xlog_recover_finish(
4216         struct xlog     *log)
4217 {
4218         /*
4219          * Now we're ready to do the transactions needed for the
4220          * rest of recovery.  Start with completing all the extent
4221          * free intent records and then process the unlinked inode
4222          * lists.  At this point, we essentially run in normal mode
4223          * except that we're still performing recovery actions
4224          * rather than accepting new requests.
4225          */
4226         if (log->l_flags & XLOG_RECOVERY_NEEDED) {
4227                 int     error;
4228                 error = xlog_recover_process_efis(log);
4229                 if (error) {
4230                         xfs_alert(log->l_mp, "Failed to recover EFIs");
4231                         return error;
4232                 }
4233                 /*
4234                  * Sync the log to get all the EFIs out of the AIL.
4235                  * This isn't absolutely necessary, but it helps in
4236                  * case the unlink transactions would have problems
4237                  * pushing the EFIs out of the way.
4238                  */
4239                 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
4240
4241                 xlog_recover_process_iunlinks(log);
4242
4243                 xlog_recover_check_summary(log);
4244
4245                 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
4246                                 log->l_mp->m_logname ? log->l_mp->m_logname
4247                                                      : "internal");
4248                 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
4249         } else {
4250                 xfs_info(log->l_mp, "Ending clean mount");
4251         }
4252         return 0;
4253 }
4254
4255
4256 #if defined(DEBUG)
4257 /*
4258  * Read all of the agf and agi counters and check that they
4259  * are consistent with the superblock counters.
4260  */
4261 void
4262 xlog_recover_check_summary(
4263         struct xlog     *log)
4264 {
4265         xfs_mount_t     *mp;
4266         xfs_agf_t       *agfp;
4267         xfs_buf_t       *agfbp;
4268         xfs_buf_t       *agibp;
4269         xfs_agnumber_t  agno;
4270         __uint64_t      freeblks;
4271         __uint64_t      itotal;
4272         __uint64_t      ifree;
4273         int             error;
4274
4275         mp = log->l_mp;
4276
4277         freeblks = 0LL;
4278         itotal = 0LL;
4279         ifree = 0LL;
4280         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4281                 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
4282                 if (error) {
4283                         xfs_alert(mp, "%s agf read failed agno %d error %d",
4284                                                 __func__, agno, error);
4285                 } else {
4286                         agfp = XFS_BUF_TO_AGF(agfbp);
4287                         freeblks += be32_to_cpu(agfp->agf_freeblks) +
4288                                     be32_to_cpu(agfp->agf_flcount);
4289                         xfs_buf_relse(agfbp);
4290                 }
4291
4292                 error = xfs_read_agi(mp, NULL, agno, &agibp);
4293                 if (error) {
4294                         xfs_alert(mp, "%s agi read failed agno %d error %d",
4295                                                 __func__, agno, error);
4296                 } else {
4297                         struct xfs_agi  *agi = XFS_BUF_TO_AGI(agibp);
4298
4299                         itotal += be32_to_cpu(agi->agi_count);
4300                         ifree += be32_to_cpu(agi->agi_freecount);
4301                         xfs_buf_relse(agibp);
4302                 }
4303         }
4304 }
4305 #endif /* DEBUG */