1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "scrub/scrub.h"
15 #include "scrub/common.h"
16 #include "scrub/btree.h"
17 #include "scrub/trace.h"
22 * Check for btree operation errors. See the section about handling
23 * operational errors in common.c.
/*
 * Shared worker for xchk_btree_process_error() and
 * xchk_btree_xref_process_error(); each wrapper supplies its own output
 * flag plus __return_address.
 * NOTE(review): the return type, most of the parameter list and the
 * branching between the cases below are not shown here -- TODO confirm.
 */
26 __xchk_btree_process_error(
28 struct xfs_btree_cur *cur,
40 /* Used to restart an op with deadlock avoidance. */
41 trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
45 /* Note the badness but don't abort. */
46 sc->sm->sm_flags |= errflag;
/*
 * Trace the failure: cursors flagged XFS_BTREE_ROOT_IN_INODE use the ifork
 * tracepoint, the remaining call is the generic btree tracepoint.
 */
50 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
51 trace_xchk_ifork_btree_op_error(sc, cur, level,
54 trace_xchk_btree_op_error(sc, cur, level,
/*
 * Handle a btree operation error by forwarding to
 * __xchk_btree_process_error() with XFS_SCRUB_OFLAG_CORRUPT as the flag to
 * record, plus __return_address.  Returns whatever the worker returns.
 */
62 xchk_btree_process_error(
64 struct xfs_btree_cur *cur,
68 return __xchk_btree_process_error(sc, cur, level, error,
69 XFS_SCRUB_OFLAG_CORRUPT, __return_address);
/*
 * Cross-reference flavor of xchk_btree_process_error(): identical
 * forwarding to __xchk_btree_process_error(), but the flag passed is
 * XFS_SCRUB_OFLAG_XFAIL instead of XFS_SCRUB_OFLAG_CORRUPT.
 */
73 xchk_btree_xref_process_error(
75 struct xfs_btree_cur *cur,
79 return __xchk_btree_process_error(sc, cur, level, error,
80 XFS_SCRUB_OFLAG_XFAIL, __return_address);
83 /* Record btree block corruption: OR errflag into sc->sm->sm_flags and trace it. */
85 __xchk_btree_set_corrupt(
87 struct xfs_btree_cur *cur,
92 sc->sm->sm_flags |= errflag;
/*
 * Cursors flagged XFS_BTREE_ROOT_IN_INODE are reported through the ifork
 * tracepoint; the remaining call is the generic btree tracepoint.
 */
94 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
95 trace_xchk_ifork_btree_error(sc, cur, level,
98 trace_xchk_btree_error(sc, cur, level,
/*
 * Mark the btree at the given level as corrupt: forwards to
 * __xchk_btree_set_corrupt() with XFS_SCRUB_OFLAG_CORRUPT.
 */
103 xchk_btree_set_corrupt(
104 struct xfs_scrub *sc,
105 struct xfs_btree_cur *cur,
108 __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
/*
 * Cross-reference flavor of xchk_btree_set_corrupt(): forwards to
 * __xchk_btree_set_corrupt() with XFS_SCRUB_OFLAG_XCORRUPT.
 */
113 xchk_btree_xref_set_corrupt(
114 struct xfs_scrub *sc,
115 struct xfs_btree_cur *cur,
118 __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
/*
 * Same reporting path as xchk_btree_set_corrupt(), but the flag handed to
 * __xchk_btree_set_corrupt() is XFS_SCRUB_OFLAG_PREEN.
 */
123 xchk_btree_set_preen(
124 struct xfs_scrub *sc,
125 struct xfs_btree_cur *cur,
128 __xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
133 * Make sure this record is in order and doesn't stray outside of the parent
/*
 * Operates on the record at the cursor position for level 0.  The record is
 * compared with the previously seen record (bs->lastrec) and with the keys
 * the level-1 block holds at its current cursor position.
 */
138 struct xchk_btree *bs)
140 struct xfs_btree_cur *cur = bs->cur;
141 union xfs_btree_rec *rec;
142 union xfs_btree_key key;
143 union xfs_btree_key hkey;
144 union xfs_btree_key *keyp;
145 struct xfs_btree_block *block;
146 struct xfs_btree_block *keyblock;
/* Fetch the level-0 block and the record the cursor currently points at. */
149 block = xfs_btree_get_block(cur, 0, &bp);
150 rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
152 trace_xchk_btree_rec(bs->sc, cur, 0);
154 /* Are all records across all record blocks in order? */
155 if (bs->lastrec_valid &&
156 !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
157 xchk_btree_set_corrupt(bs->sc, cur, 0);
/* Keep a copy of this record as the baseline for the next ordering check. */
158 memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
159 bs->lastrec_valid = true;
/* Everything below reads level 1, which a single-level btree does not have. */
161 if (cur->bc_nlevels == 1)
164 /* Is low_key(rec) at least as large as the parent low key? */
165 cur->bc_ops->init_key_from_rec(&key, rec);
166 keyblock = xfs_btree_get_block(cur, 1, &bp);
167 keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
168 if (xfs_btree_keycmp_lt(cur, &key, keyp))
169 xchk_btree_set_corrupt(bs->sc, cur, 1);
/*
 * NOTE(review): the statement guarded by this test is not shown; presumably
 * btrees without XFS_BTREE_OVERLAPPING skip the high key check -- confirm.
 */
171 if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
174 /* Is high_key(rec) no larger than the parent high key? */
175 cur->bc_ops->init_high_key_from_rec(&hkey, rec);
176 keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
177 if (xfs_btree_keycmp_lt(cur, keyp, &hkey))
178 xchk_btree_set_corrupt(bs->sc, cur, 1);
182 * Make sure this key is in order and doesn't stray outside of the parent
/*
 * Operates on the key at the cursor position for the given level.  The
 * per-level history lives in bs->lastkey[level - 1], so the indexing
 * assumes level >= 1.
 */
187 struct xchk_btree *bs,
190 struct xfs_btree_cur *cur = bs->cur;
191 union xfs_btree_key *key;
192 union xfs_btree_key *keyp;
193 struct xfs_btree_block *block;
194 struct xfs_btree_block *keyblock;
/* Fetch the block at this level and the key the cursor points at. */
197 block = xfs_btree_get_block(cur, level, &bp);
198 key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
200 trace_xchk_btree_key(bs->sc, cur, level);
202 /* Are all low keys across all node blocks in order? */
203 if (bs->lastkey[level - 1].valid &&
204 !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key))
205 xchk_btree_set_corrupt(bs->sc, cur, level);
/* Keep a copy of this key as the baseline for the next ordering check. */
206 memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len);
207 bs->lastkey[level - 1].valid = true;
/* The parent comparisons below need a level above this one. */
209 if (level + 1 >= cur->bc_nlevels)
212 /* Is this block's low key at least as large as the parent low key? */
213 keyblock = xfs_btree_get_block(cur, level + 1, &bp);
214 keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
215 if (xfs_btree_keycmp_lt(cur, key, keyp))
216 xchk_btree_set_corrupt(bs->sc, cur, level);
/*
 * NOTE(review): the statement guarded by this test is not shown; presumably
 * btrees without XFS_BTREE_OVERLAPPING skip the high key check -- confirm.
 */
218 if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
221 /* Is this block's high key no larger than the parent high key? */
222 key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
223 keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
225 if (xfs_btree_keycmp_lt(cur, keyp, key))
226 xchk_btree_set_corrupt(bs->sc, cur, level);
230 * Check a btree pointer. Returns true if it's ok to use this pointer.
231 * Callers do not need to set the corrupt flag.
235 struct xchk_btree *bs,
237 union xfs_btree_ptr *ptr)
241 /* A btree rooted in an inode has no block pointer to the root. */
242 if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
243 level == bs->cur->bc_nlevels)
246 /* Otherwise, check the pointer: 64-bit long form or 32-bit short form, converted from big-endian. */
247 if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
248 res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
250 res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
/*
 * The corruption is flagged here, which is why callers do not have to.
 * NOTE(review): the condition guarding this call is not shown; presumably
 * it tests res -- confirm.
 */
252 xchk_btree_set_corrupt(bs->sc, bs->cur, level);
257 /* Check that a btree block's sibling pointer matches the adjacent pointer in the parent level. */
259 xchk_btree_block_check_sibling(
260 struct xchk_btree *bs,
263 union xfs_btree_ptr *sibling)
265 struct xfs_btree_cur *cur = bs->cur;
266 struct xfs_btree_block *pblock;
268 struct xfs_btree_cur *ncur = NULL;
269 union xfs_btree_ptr *pp;
/*
 * Duplicate the cursor; all stepping below is applied to the duplicate
 * (ncur), while bs->cur is only used for reporting.
 */
273 error = xfs_btree_dup_cursor(cur, &ncur);
274 if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
279 * If the pointer is null, we shouldn't be able to move the upper
280 * level pointer anywhere.
282 if (xfs_btree_ptr_is_null(cur, sibling)) {
284 error = xfs_btree_increment(ncur, level + 1, &success);
286 error = xfs_btree_decrement(ncur, level + 1, &success);
/* A successful move means the parent has a neighbor this block denies. */
287 if (error == 0 && success)
288 xchk_btree_set_corrupt(bs->sc, cur, level);
293 /* Move the upper level pointer by one (increment or decrement). */
295 error = xfs_btree_increment(ncur, level + 1, &success);
297 error = xfs_btree_decrement(ncur, level + 1, &success);
298 if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
301 xchk_btree_set_corrupt(bs->sc, cur, level + 1);
305 /* Compare upper level pointer to sibling pointer. */
306 pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
307 pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
308 if (!xchk_btree_ptr_ok(bs, level + 1, pp))
311 xchk_buffer_recheck(bs->sc, pbp);
313 if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
314 xchk_btree_set_corrupt(bs->sc, cur, level);
/* Release the duplicate cursor. */
316 xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
320 /* Check the left and right sibling pointers of a btree block. */
322 xchk_btree_block_check_siblings(
323 struct xchk_btree *bs,
324 struct xfs_btree_block *block)
326 struct xfs_btree_cur *cur = bs->cur;
327 union xfs_btree_ptr leftsib;
328 union xfs_btree_ptr rightsib;
/* Read both sibling pointers and the level out of the block itself. */
332 xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
333 xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
334 level = xfs_btree_get_level(block);
336 /* Root block should never have siblings. */
337 if (level == cur->bc_nlevels - 1) {
338 if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
339 !xfs_btree_ptr_is_null(cur, &rightsib))
340 xchk_btree_set_corrupt(bs->sc, cur, level);
345 * Does the left & right sibling pointers match the adjacent
346 * parent level pointers?
347 * (These function absorbs error codes for us.)
/* The third argument is -1 for the left sibling and 1 for the right one. */
349 error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
352 error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
/*
 * List linkage: xchk_btree_check_owner() initializes it and queues the
 * entry on bs->to_check.  Presumably a member of struct check_owner, whose
 * other members are not shown -- TODO confirm.
 */
360 struct list_head list;
366 * Make sure this btree block isn't in the free list and that there's
367 * an rmap record for it.
370 xchk_btree_check_block_owner(
371 struct xchk_btree *bs,
/*
 * Sample btnum before any cross-referencing, because bs->cur may be
 * nullified further down, and split daddr into AG number and AG block.
 */
384 btnum = bs->cur->bc_btnum;
385 agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
386 agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
/*
 * init_sa is nonzero for long-pointer btrees.
 * NOTE(review): presumably it gates the xchk_ag_init_existing() and
 * xchk_ag_free() calls below; the guards are not shown -- confirm.
 */
388 init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
390 error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
391 if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
/* The block must be in use, i.e. not free space. */
396 xchk_xref_is_used_space(bs->sc, agbno, 1);
398 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
399 * have to nullify it (to shut down further block owner checks) if
400 * self-xref encounters problems.
402 if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
/* The block must be owned solely by bs->oinfo. */
405 xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
/* Same aliasing concern as above, this time for the rmap cursor. */
406 if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
411 xchk_ag_free(bs->sc, &bs->sc->sa);
416 /* Check the owner of a btree block, either immediately or by queueing a deferred check. */
418 xchk_btree_check_owner(
419 struct xchk_btree *bs,
423 struct xfs_btree_cur *cur = bs->cur;
426 * In theory, xfs_btree_get_block should only give us a null buffer
427 * pointer for the root of a root-in-inode btree type, but we need
428 * to check defensively here in case the cursor state is also screwed
432 if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
433 xchk_btree_set_corrupt(bs->sc, bs->cur, level);
438 * We want to cross-reference each btree block with the bnobt
439 * and the rmapbt. We cannot cross-reference the bnobt or
440 * rmapbt while scanning the bnobt or rmapbt, respectively,
441 * because we cannot alter the cursor and we'd prefer not to
442 * duplicate cursors. Therefore, save the buffer daddr for
445 if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
446 struct check_owner *co;
/* Allocate a deferred-check entry for this block. */
448 co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
/* Record the buffer daddr and append the entry to bs->to_check. */
452 INIT_LIST_HEAD(&co->list);
454 co->daddr = xfs_buf_daddr(bp);
455 list_add_tail(&co->list, &bs->to_check);
/* Any other btree type is cross-referenced right away. */
459 return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
462 /* Decide if we want to check minrecs of a btree block in the inode root. */
/*
 * xchk_btree_check_minrecs() only flags corruption for the direct child of
 * an inode root when this helper returns true.
 */
464 xchk_btree_check_iroot_minrecs(
465 struct xchk_btree *bs)
468 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
469 * would miscalculate the space required for the data fork bmbt root
470 * when adding an attr fork, and promote the iroot contents to an
471 * external block unnecessarily. This went unnoticed for many years
472 * until scrub found filesystems in this state. Inode rooted btrees are
473 * not supposed to have immediate child blocks that are small enough
474 * that the contents could fit in the inode root, but we can't fail
475 * existing filesystems, so instead we disable the check for data fork
476 * bmap btrees when there's an attr fork.
/* The special case: a data fork bmap btree on an inode that has an attr fork. */
478 if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
479 bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
480 xfs_inode_has_attr_fork(bs->sc->ip))
487 * Check that this btree block has at least minrecs records or is one of the
488 * special blocks that don't require that.
491 xchk_btree_check_minrecs(
492 struct xchk_btree *bs,
494 struct xfs_btree_block *block)
496 struct xfs_btree_cur *cur = bs->cur;
497 unsigned int root_level = cur->bc_nlevels - 1;
498 unsigned int numrecs = be16_to_cpu(block->bb_numrecs);
500 /* At least minrecs records means the block is ok. */
501 if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
505 * For btrees rooted in the inode, it's possible that the root block
506 * contents spilled into a regular ondisk block because there wasn't
507 * enough space in the inode root. The number of records in that
508 * child block might be less than the standard minrecs, but that's ok
509 * provided that there's only one direct child of the root.
/* level == bc_nlevels - 2 is the level directly below the root. */
511 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
512 level == cur->bc_nlevels - 2) {
513 struct xfs_btree_block *root_block;
514 struct xfs_buf *root_bp;
517 root_block = xfs_btree_get_block(cur, root_level, &root_bp);
518 root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
/*
 * When the iroot check is enabled, flag the block if the root does not
 * hold exactly one entry or if this block has no more records than the
 * root could hold.
 */
519 if (xchk_btree_check_iroot_minrecs(bs) &&
520 (be16_to_cpu(root_block->bb_numrecs) != 1 ||
521 numrecs <= root_maxrecs))
522 xchk_btree_set_corrupt(bs->sc, cur, level);
527 * Otherwise, only the root level is allowed to have fewer than minrecs
528 * records or keyptrs.
530 if (level < root_level)
531 xchk_btree_set_corrupt(bs->sc, cur, level);
535 * If this btree block has a parent, make sure that the parent's keys capture
536 * the keyspace contained in this block.
539 xchk_btree_block_check_keys(
540 struct xchk_btree *bs,
542 struct xfs_btree_block *block)
544 union xfs_btree_key block_key;
545 union xfs_btree_key *block_high_key;
546 union xfs_btree_key *parent_low_key, *parent_high_key;
547 struct xfs_btree_cur *cur = bs->cur;
548 struct xfs_btree_block *parent_block;
/* bc_nlevels - 1 is the root level; the root has no parent keys. */
551 if (level == cur->bc_nlevels - 1)
/* Compute the keys of this block into block_key. */
554 xfs_btree_get_keys(cur, block, &block_key);
556 /* Make sure the low key of this block matches the parent. */
557 parent_block = xfs_btree_get_block(cur, level + 1, &bp);
558 parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
560 if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) {
561 xchk_btree_set_corrupt(bs->sc, bs->cur, level);
/*
 * NOTE(review): the statement guarded by this test is not shown; presumably
 * btrees without XFS_BTREE_OVERLAPPING skip the high key check -- confirm.
 */
565 if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
568 /* Make sure the high key of this block matches the parent. */
569 parent_high_key = xfs_btree_high_key_addr(cur,
570 cur->bc_levels[level + 1].ptr, parent_block);
571 block_high_key = xfs_btree_high_key_from_key(cur, &block_key);
572 if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key))
573 xchk_btree_set_corrupt(bs->sc, bs->cur, level);
577 * Grab and scrub a btree block given a btree pointer. Returns block
578 * and buffer pointers (if applicable) if they're ok to use.
581 xchk_btree_get_block(
582 struct xchk_btree *bs,
584 union xfs_btree_ptr *pp,
585 struct xfs_btree_block **pblock,
586 struct xfs_buf **pbp)
588 xfs_failaddr_t failed_at;
/* Look up the block that pp refers to at this level. */
594 error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
595 if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
/* Called only to fill in *pbp; the returned block pointer is not used. */
599 xfs_btree_get_block(bs->cur, level, pbp);
/* Run the long- or short-pointer block check to match the cursor type. */
600 if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
601 failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
604 failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
607 xchk_btree_set_corrupt(bs->sc, bs->cur, level);
611 xchk_buffer_recheck(bs->sc, *pbp);
/* Per-block checks, in order: minrecs, owner, siblings, parent keys. */
613 xchk_btree_check_minrecs(bs, level, *pblock);
616 * Check the block's owner; this function absorbs error codes
619 error = xchk_btree_check_owner(bs, level, *pbp);
624 * Check the block's siblings; this function absorbs error codes
627 error = xchk_btree_block_check_siblings(bs, *pblock);
631 xchk_btree_block_check_keys(bs, level, *pblock);
636 * Check that the low and high keys of this block match the keys stored
637 * in the parent block.
640 xchk_btree_block_keys(
641 struct xchk_btree *bs,
643 struct xfs_btree_block *block)
645 union xfs_btree_key block_keys;
646 struct xfs_btree_cur *cur = bs->cur;
647 union xfs_btree_key *high_bk;
648 union xfs_btree_key *parent_keys;
649 union xfs_btree_key *high_pk;
650 struct xfs_btree_block *parent_block;
/* The root level (bc_nlevels - 1) and anything above it has no parent. */
653 if (level >= cur->bc_nlevels - 1)
656 /* Calculate the keys for this block. */
657 xfs_btree_get_keys(cur, block, &block_keys);
659 /* Obtain the parent's copy of the keys for this block. */
660 parent_block = xfs_btree_get_block(cur, level + 1, &bp);
661 parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
/*
 * NOTE(review): both corruption reports in this function pass a literal 1
 * as the level, whereas xchk_btree_block_check_keys() passes level for the
 * same kind of mismatch -- confirm whether 1 is intended.
 */
664 if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
665 xchk_btree_set_corrupt(bs->sc, cur, 1);
/*
 * NOTE(review): the statement guarded by this test is not shown; presumably
 * btrees without XFS_BTREE_OVERLAPPING skip the high key check -- confirm.
 */
667 if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
/* Compare the high half of the computed keys with the parent high key. */
671 high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
672 high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
675 if (xfs_btree_keycmp_ne(cur, high_bk, high_pk))
676 xchk_btree_set_corrupt(bs->sc, cur, 1);
680 * Visit all nodes and leaves of a btree. Check that all pointers and
681 * records are in order, that the keys reflect the records, and use a callback
682 * so that the caller can verify individual records.
686 struct xfs_scrub *sc,
687 struct xfs_btree_cur *cur,
688 xchk_btree_rec_fn scrub_fn,
689 const struct xfs_owner_info *oinfo,
692 union xfs_btree_ptr ptr;
693 struct xchk_btree *bs;
694 union xfs_btree_ptr *pp;
695 union xfs_btree_rec *recp;
696 struct xfs_btree_block *block;
698 struct check_owner *co;
699 struct check_owner *n;
705 * Allocate the btree scrub context from the heap, because this
706 * structure can get rather large. Don't let a caller feed us a
707 * totally absurd size.
709 cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
710 if (cur_sz > PAGE_SIZE) {
711 xchk_btree_set_corrupt(sc, cur, 0);
714 bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
718 bs->scrub_rec = scrub_fn;
720 bs->private = private;
723 /* Initialize scrub state */
724 INIT_LIST_HEAD(&bs->to_check);
727 * Load the root of the btree. The helper function absorbs
728 * error codes for us.
730 level = cur->bc_nlevels - 1;
731 cur->bc_ops->init_ptr_from_cur(cur, &ptr);
732 if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
734 error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
738 cur->bc_levels[level].ptr = 1;
740 while (level < cur->bc_nlevels) {
741 block = xfs_btree_get_block(cur, level, &bp);
744 /* End of leaf, pop back towards the root. */
745 if (cur->bc_levels[level].ptr >
746 be16_to_cpu(block->bb_numrecs)) {
747 xchk_btree_block_keys(bs, level, block);
748 if (level < cur->bc_nlevels - 1)
749 cur->bc_levels[level + 1].ptr++;
754 /* Records in order for scrub? */
757 /* Call out to the record checker. */
758 recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
760 error = bs->scrub_rec(bs, recp);
763 if (xchk_should_terminate(sc, &error) ||
764 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
767 cur->bc_levels[level].ptr++;
771 /* End of node, pop back towards the root. */
772 if (cur->bc_levels[level].ptr >
773 be16_to_cpu(block->bb_numrecs)) {
774 xchk_btree_block_keys(bs, level, block);
775 if (level < cur->bc_nlevels - 1)
776 cur->bc_levels[level + 1].ptr++;
781 /* Keys in order for scrub? */
782 xchk_btree_key(bs, level);
784 /* Drill another level deeper. */
785 pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
786 if (!xchk_btree_ptr_ok(bs, level, pp)) {
787 cur->bc_levels[level].ptr++;
791 error = xchk_btree_get_block(bs, level, pp, &block, &bp);
795 cur->bc_levels[level].ptr = 1;
799 /* Process deferred owner checks on btree blocks. */
800 list_for_each_entry_safe(co, n, &bs->to_check, list) {
801 if (!error && bs->cur)
802 error = xchk_btree_check_block_owner(bs, co->level,