2 * Copyright (C) 2017 Oracle. All Rights Reserved.
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it would be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
22 #include "xfs_shared.h"
23 #include "xfs_format.h"
24 #include "xfs_trans_resv.h"
25 #include "xfs_mount.h"
26 #include "xfs_defer.h"
27 #include "xfs_btree.h"
29 #include "xfs_log_format.h"
30 #include "xfs_trans.h"
32 #include "xfs_inode.h"
33 #include "xfs_icache.h"
34 #include "xfs_itable.h"
35 #include "xfs_alloc.h"
36 #include "xfs_alloc_btree.h"
38 #include "xfs_bmap_btree.h"
39 #include "xfs_ialloc.h"
40 #include "xfs_ialloc_btree.h"
41 #include "xfs_refcount.h"
42 #include "xfs_refcount_btree.h"
44 #include "xfs_rmap_btree.h"
45 #include "xfs_quota.h"
47 #include "xfs_errortag.h"
48 #include "xfs_error.h"
50 #include "xfs_trans_priv.h"
51 #include "scrub/xfs_scrub.h"
52 #include "scrub/scrub.h"
53 #include "scrub/common.h"
54 #include "scrub/trace.h"
55 #include "scrub/btree.h"
56 #include "scrub/repair.h"
59 * Online Scrub and Repair
61 * Traditionally, XFS (the kernel driver) did not know how to check or
62 * repair on-disk data structures. That task was left to the xfs_check
63 * and xfs_repair tools, both of which require taking the filesystem
64 * offline for a thorough but time consuming examination. Online
65 * scrub & repair, on the other hand, enables us to check the metadata
66 * for obvious errors while carefully stepping around the filesystem's
67 * ongoing operations, locking rules, etc.
69 * Given that most XFS metadata consist of records stored in a btree,
70 * most of the checking functions iterate the btree blocks themselves
71 * looking for irregularities. When a record block is encountered, each
72 * record can be checked for obviously bad values. Record values can
73 * also be cross-referenced against other btrees to look for potential
74 * misunderstandings between pieces of metadata.
76 * It is expected that the checkers responsible for per-AG metadata
77 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
78 * metadata structure, and perform any relevant cross-referencing before
79 * unlocking the AG and returning the results to userspace. These
80 * scrubbers must not keep an AG locked for too long to avoid tying up
81 * the block and inode allocators.
83 * Block maps and b-trees rooted in an inode present a special challenge
84 * because they can involve extents from any AG. The general scrubber
85 * structure of lock -> check -> xref -> unlock still holds, but AG
86 * locking order rules /must/ be obeyed to avoid deadlocks. The
87 * ordering rule, of course, is that we must lock in increasing AG
88 * order. Helper functions are provided to track which AG headers we've
89 * already locked. If we detect an imminent locking order violation, we
90 * can signal a potential deadlock, in which case the scrubber can jump
91 * out to the top level, lock all the AGs in order, and retry the scrub.
93 * For file data (directories, extended attributes, symlinks) scrub, we
94 * can simply lock the inode and walk the data. For btree data
95 * (directories and attributes) we follow the same btree-scrubbing
96 * strategy outlined previously to check the records.
98 * We use a bit of trickery with transactions to avoid buffer deadlocks
99 * if there is a cycle in the metadata. The basic problem is that
100 * travelling down a btree involves locking the current buffer at each
101 * tree level. If a pointer should somehow point back to a buffer that
102 * we've already examined, we will deadlock due to the second buffer
103 * locking attempt. Note however that grabbing a buffer in transaction
104 * context links the locked buffer to the transaction. If we try to
105 * re-grab the buffer in the context of the same transaction, we avoid
106 * the second lock attempt and continue. Between the verifier and the
107 * scrubber, something will notice that something is amiss and report
108 * the corruption. Therefore, each scrubber will allocate an empty
109 * transaction, attach buffers to it, and cancel the transaction at the
110 * end of the scrub run. Cancelling a non-dirty transaction simply
111 * unlocks the buffers.
113 * There are four pieces of data that scrub can communicate to
114 * userspace. The first is the error code (errno), which can be used to
115 * communicate operational errors in performing the scrub. There are
116 * also three flags that can be set in the scrub context. If the data
117 * structure itself is corrupt, the CORRUPT flag will be set. If
118 * the metadata is correct but otherwise suboptimal, the PREEN flag
119 * will be set.
121 * We perform secondary validation of filesystem metadata by
122 * cross-referencing every record with all other available metadata.
123 * For example, for block mapping extents, we verify that there are no
124 * records in the free space and inode btrees corresponding to that
125 * space extent and that there is a corresponding entry in the reverse
126 * mapping btree. Inconsistent metadata is noted by setting the
127 * XCORRUPT flag; btree query function errors are noted by setting the
128 * XFAIL flag and deleting the cursor to prevent further attempts to
129 * cross-reference with a defective btree.
131 * If a piece of metadata proves corrupt or suboptimal, the userspace
132 * program can ask the kernel to apply some tender loving care (TLC) to
133 * the metadata object by setting the REPAIR flag and re-calling the
134 * scrub ioctl. "Corruption" is defined by metadata violating the
135 * on-disk specification; operations cannot continue if the violation is
136 * left untreated. It is possible for XFS to continue if an object is
137 * "suboptimal", however performance may be degraded. Repairs are
138 * usually performed by rebuilding the metadata entirely out of
139 * redundant metadata. Optimizing, on the other hand, can sometimes be
140 * done without rebuilding entire structures.
142 * Generally speaking, the repair code has the following code structure:
143 * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
144 * The first check helps us figure out if we need to rebuild or simply
145 * optimize the structure so that the rebuild knows what to do. The
146 * second check evaluates the completeness of the repair; that is what
147 * is reported to userspace.
151 * Scrub probe -- userspace uses this to probe if we're willing to scrub
152 * or repair a given mountpoint. This will be used by xfs_scrub to
153 * probe the kernel's abilities to scrub (and repair) the metadata. We
154 * do this by validating the ioctl inputs from userspace, preparing the
155 * filesystem for a scrub (or a repair) operation, and immediately
156 * returning to userspace. Userspace can use the returned errno and
157 * structure state to decide (in broad terms) if scrub/repair are
158 * supported by the running kernel.
162 struct xfs_scrub_context *sc)
/*
 * The only check visible in this listing: ask the common helper whether the
 * scrub should stop.  The helper is handed the address of error, so it can
 * presumably report the reason through it.
 * NOTE(review): the declaration of error and both return paths are on lines
 * missing from this listing -- confirm against the full file.
 */
166 if (xfs_scrub_should_terminate(sc, &error))
172 /* Scrub setup and teardown */
174 /* Free all the resources and finish the transactions.
 *
 * sc is the scrub context being torn down and ip_in is the inode the caller
 * originally handed in.  Callers pass a third argument, error, which is read
 * and assigned in the body; its declaration is on a line missing from this
 * listing.
 *
 * Order of operations visible here: the AG state in sc->sa, then the
 * transaction sc->tp, then the inode lock, then the quotaoff mutex.
 */
177 struct xfs_scrub_context *sc,
178 struct xfs_inode *ip_in,
/* Hand sc->sa to xfs_scrub_ag_free() before touching the transaction. */
181 xfs_scrub_ag_free(sc, &sc->sa);
/*
 * Commit only when nothing has failed so far and userspace set the REPAIR
 * input flag; the commit status then replaces error.
 */
183 if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
184 error = xfs_trans_commit(sc->tp);
/*
 * NOTE(review): presumably the else branch of the test above -- the else
 * line itself is missing from this listing.
 */
186 xfs_trans_cancel(sc->tp);
/* Release exactly the lock modes recorded in sc->ilock_flags. */
191 xfs_iunlock(sc->ip, sc->ilock_flags);
/*
 * Special-case an inode that is not the one the caller passed in and for
 * which xfs_internal_inum() is false.
 * NOTE(review): the action taken for such an inode is on a line missing from
 * this listing.
 */
192 if (sc->ip != ip_in &&
193 !xfs_internal_inum(sc->mp, sc->ip->i_ino))
/* Drop the quotaoff mutex only when sc->has_quotaofflock says it is held. */
197 if (sc->has_quotaofflock)
198 mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
206 /* Scrubbing dispatch. */
/*
 * Table of per-type callbacks, indexed by the XFS_SCRUB_TYPE_* value that
 * userspace supplies in sm_type.
 *
 *   .setup   prepares the scrub context; invoked as setup(&sc, ip).
 *   .scrub   performs the check; invoked as scrub(&sc).
 *   .repair  repair callback; only the probe and superblock entries point
 *            at something other than xfs_repair_notsupported.
 *   .has     optional predicate called with the superblock; when present and
 *            false, the request is rejected during input validation.
 *
 * An entry whose .setup or .scrub is NULL is treated by input validation as
 * an unknown type.
 * NOTE(review): other members of each entry and the closing braces are on
 * lines missing from this listing.
 */
208 static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
209 [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */
211 .setup = xfs_scrub_setup_fs,
212 .scrub = xfs_scrub_probe,
213 .repair = xfs_repair_probe,
215 [XFS_SCRUB_TYPE_SB] = { /* superblock */
217 .setup = xfs_scrub_setup_fs,
218 .scrub = xfs_scrub_superblock,
219 .repair = xfs_repair_superblock,
/* The three AG header types below share xfs_scrub_setup_fs with the probe and superblock entries. */
221 [XFS_SCRUB_TYPE_AGF] = { /* agf */
223 .setup = xfs_scrub_setup_fs,
224 .scrub = xfs_scrub_agf,
225 .repair = xfs_repair_notsupported,
227 [XFS_SCRUB_TYPE_AGFL]= { /* agfl */
229 .setup = xfs_scrub_setup_fs,
230 .scrub = xfs_scrub_agfl,
231 .repair = xfs_repair_notsupported,
233 [XFS_SCRUB_TYPE_AGI] = { /* agi */
235 .setup = xfs_scrub_setup_fs,
236 .scrub = xfs_scrub_agi,
237 .repair = xfs_repair_notsupported,
/* bnobt and cntbt share one setup routine and differ only in .scrub. */
239 [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
241 .setup = xfs_scrub_setup_ag_allocbt,
242 .scrub = xfs_scrub_bnobt,
243 .repair = xfs_repair_notsupported,
245 [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
247 .setup = xfs_scrub_setup_ag_allocbt,
248 .scrub = xfs_scrub_cntbt,
249 .repair = xfs_repair_notsupported,
/* inobt and finobt share a setup routine; only finobt is gated by a .has predicate. */
251 [XFS_SCRUB_TYPE_INOBT] = { /* inobt */
253 .setup = xfs_scrub_setup_ag_iallocbt,
254 .scrub = xfs_scrub_inobt,
255 .repair = xfs_repair_notsupported,
257 [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
259 .setup = xfs_scrub_setup_ag_iallocbt,
260 .scrub = xfs_scrub_finobt,
261 .has = xfs_sb_version_hasfinobt,
262 .repair = xfs_repair_notsupported,
264 [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
266 .setup = xfs_scrub_setup_ag_rmapbt,
267 .scrub = xfs_scrub_rmapbt,
268 .has = xfs_sb_version_hasrmapbt,
269 .repair = xfs_repair_notsupported,
271 [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
273 .setup = xfs_scrub_setup_ag_refcountbt,
274 .scrub = xfs_scrub_refcountbt,
275 .has = xfs_sb_version_hasreflink,
276 .repair = xfs_repair_notsupported,
278 [XFS_SCRUB_TYPE_INODE] = { /* inode record */
280 .setup = xfs_scrub_setup_inode,
281 .scrub = xfs_scrub_inode,
282 .repair = xfs_repair_notsupported,
/* The three fork types share xfs_scrub_setup_inode_bmap and differ only in .scrub. */
284 [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
286 .setup = xfs_scrub_setup_inode_bmap,
287 .scrub = xfs_scrub_bmap_data,
288 .repair = xfs_repair_notsupported,
290 [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
292 .setup = xfs_scrub_setup_inode_bmap,
293 .scrub = xfs_scrub_bmap_attr,
294 .repair = xfs_repair_notsupported,
296 [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
298 .setup = xfs_scrub_setup_inode_bmap,
299 .scrub = xfs_scrub_bmap_cow,
300 .repair = xfs_repair_notsupported,
302 [XFS_SCRUB_TYPE_DIR] = { /* directory */
304 .setup = xfs_scrub_setup_directory,
305 .scrub = xfs_scrub_directory,
306 .repair = xfs_repair_notsupported,
308 [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
310 .setup = xfs_scrub_setup_xattr,
311 .scrub = xfs_scrub_xattr,
312 .repair = xfs_repair_notsupported,
314 [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
316 .setup = xfs_scrub_setup_symlink,
317 .scrub = xfs_scrub_symlink,
318 .repair = xfs_repair_notsupported,
320 [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
322 .setup = xfs_scrub_setup_parent,
323 .scrub = xfs_scrub_parent,
324 .repair = xfs_repair_notsupported,
/* Both realtime types share a setup routine and the same .has predicate. */
326 [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
328 .setup = xfs_scrub_setup_rt,
329 .scrub = xfs_scrub_rtbitmap,
330 .has = xfs_sb_version_hasrealtime,
331 .repair = xfs_repair_notsupported,
333 [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
335 .setup = xfs_scrub_setup_rt,
336 .scrub = xfs_scrub_rtsummary,
337 .has = xfs_sb_version_hasrealtime,
338 .repair = xfs_repair_notsupported,
/* All three quota types use the same setup and scrub pair. */
340 [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
342 .setup = xfs_scrub_setup_quota,
343 .scrub = xfs_scrub_quota,
344 .repair = xfs_repair_notsupported,
346 [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
348 .setup = xfs_scrub_setup_quota,
349 .scrub = xfs_scrub_quota,
350 .repair = xfs_repair_notsupported,
352 [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
354 .setup = xfs_scrub_setup_quota,
355 .scrub = xfs_scrub_quota,
356 .repair = xfs_repair_notsupported,
360 /* This isn't a stable feature, warn once per day.
 *
 * The rate limit state is a static local, so a single instance is shared by
 * every caller.  It is initialised with an interval of 86400 * HZ and a
 * burst of 1, which is what yields one message per day.
 */
362 xfs_scrub_experimental_warning(
363 struct xfs_mount *mp)
365 static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
366 "xfs_scrub_warning", 86400 * HZ, 1);
/*
 * NOTE(review): the flag is re-applied on every call rather than once at
 * initialisation; presumably harmless -- confirm.
 */
367 ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
/*
 * A nonzero return from __ratelimit() means this call may emit the message.
 * NOTE(review): the call that consumes the string below is on a line missing
 * from this listing.
 */
369 if (__ratelimit(&scrub_warning))
371 "EXPERIMENTAL online scrub feature in use. Use at your own risk!");
/*
 * Vet the scrub request in sm against what this kernel and the mounted
 * filesystem mp can handle.  The dispatcher calls this before it runs the
 * per-type setup callback.
 *
 * Checks, in the order they appear:
 *   1. output flag bits are cleared from sm->sm_flags (sm is modified in
 *      place), then any bit outside XFS_SCRUB_FLAGS_IN is rejected;
 *   2. sm_reserved must be all zero;
 *   3. sm_type must be below XFS_SCRUB_TYPE_NR and its table entry must have
 *      both a setup and a scrub callback;
 *   4. when the entry has a .has predicate, it must accept this superblock;
 *   5. sm_ino, sm_gen and sm_agno must suit the kind of metadata requested;
 *   6. the filesystem must pass xfs_sb_version_hasextflgbit();
 *   7. a repair request additionally needs CRC support and a mount that is
 *      not read-only.
 *
 * NOTE(review): the error codes assigned for each failure, the labels that
 * select among the three field checks in step 5, and every branch body are
 * on lines missing from this listing -- confirm against the full file.
 */
375 xfs_scrub_validate_inputs(
376 struct xfs_mount *mp,
377 struct xfs_scrub_metadata *sm)
380 const struct xfs_scrub_meta_ops *ops;
383 /* Check our inputs. */
/* Strip output-only bits first so stale values from userspace are not rejected as unknown input flags. */
384 sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
385 if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
387 /* sm_reserved[] must be zero */
/* memchr_inv() returns non-NULL when any byte differs from 0. */
388 if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
392 /* Do we know about this type of metadata? */
/* This bound check is what makes the table index on the next line safe. */
393 if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
395 ops = &meta_scrub_ops[sm->sm_type];
396 if (ops->setup == NULL || ops->scrub == NULL)
398 /* Does this fs even support this type of metadata? */
/* Entries without a .has predicate are accepted on every filesystem. */
399 if (ops->has && !ops->has(&mp->m_sb))
403 /* restricting fields must be appropriate for type */
/* First variant: no inode number, generation or AG number may be given. */
407 if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
/* Second variant: no inode fields, and the AG number must be below sb_agcount. */
411 if (sm->sm_ino || sm->sm_gen ||
412 sm->sm_agno >= mp->m_sb.sb_agcount)
/* Third variant: no AG number, and a generation is only valid together with an inode number. */
416 if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
/*
 * NOTE(review): the last sentence of the paragraph below is cut off in this
 * listing; its closing words are missing.
 */
425 * We won't scrub any filesystem that doesn't have the ability
426 * to record unwritten extents. The option was made default in
427 * 2003, removed from mkfs in 2007, and cannot be disabled in
428 * v5, so if we find a filesystem without this flag it's either
429 * really old or totally unsupported. Avoid it either way.
430 * We also don't support v1-v3 filesystems, which aren't
433 if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
437 * We only want to repair read-write v5+ filesystems. Defer the check
438 * for ops->repair until after our scrub confirms that we need to
439 * perform repairs so that we avoid failing due to not supporting
440 * repairing an object that doesn't need repairs.
/* The two tests inside this branch apply to repair requests only. */
442 if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
/* The v5 half of the requirement stated above. */
444 if (!xfs_sb_version_hascrc(&mp->m_sb))
/* The read-write half of the requirement stated above. */
448 if (mp->m_flags & XFS_MOUNT_RDONLY)
/*
 * Two build-time variants of xfs_scrub_postmortem(), selected by
 * CONFIG_XFS_ONLINE_REPAIR.  Both inspect the CORRUPT and XCORRUPT output
 * flags in sc->sm->sm_flags; neither consults the PREEN flag.
 *
 *   - First variant: calls xfs_repair_failure() when either flag is set on a
 *     request that also carried the REPAIR input flag.
 *   - Second variant: emits a rate-limited alert whenever either flag is set.
 *
 * NOTE(review): the directive separating the two variants and the function
 * braces are on lines missing from this listing, and the comment inside the
 * first variant ends mid-sentence here.
 */
457 #ifdef CONFIG_XFS_ONLINE_REPAIR
458 static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc)
461 * Userspace asked us to repair something, we repaired it, rescanned
462 * it, and the rescan says it's still broken. Scream about this in
465 if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
466 (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
467 XFS_SCRUB_OFLAG_XCORRUPT)))
468 xfs_repair_failure(sc->mp);
471 static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc)
474 * Userspace asked us to scrub something, it's broken, and we have no
475 * way of fixing it. Scream in the logs.
/* Either corruption flag alone is enough to trigger the alert. */
477 if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
478 XFS_SCRUB_OFLAG_XCORRUPT))
479 xfs_alert_ratelimited(sc->mp,
480 "Corruption detected during scrub.");
482 #endif /* CONFIG_XFS_ONLINE_REPAIR */
484 /* Dispatch metadata scrubbing. */
487 struct xfs_inode *ip,
488 struct xfs_scrub_metadata *sm)
490 struct xfs_scrub_context sc;
491 struct xfs_mount *mp = ip->i_mount;
492 bool try_harder = false;
493 bool already_fixed = false;
496 BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
497 (sizeof(struct xfs_scrub_meta_ops) * XFS_SCRUB_TYPE_NR));
499 trace_xfs_scrub_start(ip, sm, error);
501 /* Forbidden if we are shut down or mounted norecovery. */
503 if (XFS_FORCED_SHUTDOWN(mp))
505 error = -ENOTRECOVERABLE;
506 if (mp->m_flags & XFS_MOUNT_NORECOVERY)
509 error = xfs_scrub_validate_inputs(mp, sm);
513 xfs_scrub_experimental_warning(mp);
516 /* Set up for the operation. */
517 memset(&sc, 0, sizeof(sc));
520 sc.ops = &meta_scrub_ops[sm->sm_type];
521 sc.try_harder = try_harder;
522 sc.sa.agno = NULLAGNUMBER;
523 error = sc.ops->setup(&sc, ip);
527 /* Scrub for errors. */
528 error = sc.ops->scrub(&sc);
529 if (!try_harder && error == -EDEADLOCK) {
531 * Scrubbers return -EDEADLOCK to mean 'try harder'.
532 * Tear down everything we hold, then set up again with
533 * preparation for worst-case scenarios.
535 error = xfs_scrub_teardown(&sc, ip, 0);
543 if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) {
546 /* Let debug users force us into the repair routines. */
547 if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
548 sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
550 needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
551 XFS_SCRUB_OFLAG_XCORRUPT |
552 XFS_SCRUB_OFLAG_PREEN));
554 * If userspace asked for a repair but it wasn't necessary,
555 * report that back to userspace.
558 sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
563 * If it's broken, userspace wants us to fix it, and we haven't
564 * already tried to fix it, then attempt a repair.
566 error = xfs_repair_attempt(ip, &sc, &already_fixed);
567 if (error == -EAGAIN) {
570 error = xfs_scrub_teardown(&sc, ip, 0);
572 xfs_repair_failure(mp);
580 xfs_scrub_postmortem(&sc);
582 error = xfs_scrub_teardown(&sc, ip, error);
584 trace_xfs_scrub_done(ip, sm, error);
585 if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
586 sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;