1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
14 #include "ordered-data.h"
15 #include "transaction.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
22 #include "block-group.h"
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
31 * Future enhancements:
32 * - In case an unrepairable extent is encountered, track which files are
33 * affected and report them
34 * - track and record media errors, throw out bad devices
35 * - add a mode to also read unallocated space
42 * The following two values only influence the performance.
44 * The second one configures the number of parallel and outstanding I/O
45 * operations. The first one configures an upper limit for the number
46 * of (dynamically allocated) pages that are added to a bio.
48 #define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
49 #define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
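/*
 * As a rough illustration of the sizing implied by the two defines above
 * (assuming 4KiB pages, as the per-define comments do):
 *
 *   32 sectors/bio * 4KiB   = 128KiB of data per read bio
 *   64 bios/sctx   * 128KiB = 8MiB of read IO in flight per scrubbed device
 */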
52 * The following value times the smallest sectorsize (4KiB) needs to be large
53 * enough to match the largest node/leaf/sector size that shall be supported.
55 #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
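/*
 * For reference: with BTRFS_MAX_METADATA_BLOCKSIZE being 64KiB, this
 * evaluates to 64KiB / 4KiB = 16 sectors per scrub_block.
 */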
57 struct scrub_recover {
59 struct btrfs_io_context *bioc;
64 struct scrub_block *sblock;
66 struct btrfs_device *dev;
67 struct list_head list;
68 u64 flags; /* extent flags */
72 u64 physical_for_dev_replace;
75 unsigned int have_csum:1;
76 unsigned int io_error:1;
77 u8 csum[BTRFS_CSUM_SIZE];
79 struct scrub_recover *recover;
84 struct scrub_ctx *sctx;
85 struct btrfs_device *dev;
90 struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
93 struct work_struct work;
97 struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
99 atomic_t outstanding_sectors;
100 refcount_t refs; /* free mem on transition to zero */
101 struct scrub_ctx *sctx;
102 struct scrub_parity *sparity;
104 unsigned int header_error:1;
105 unsigned int checksum_error:1;
106 unsigned int no_io_error_seen:1;
107 unsigned int generation_error:1; /* also sets header_error */
109 /* The following is for the data used to check parity */
110 /* It is for the data with checksum */
111 unsigned int data_corrected:1;
113 struct work_struct work;
116 /* Used for the chunks with parity stripe such as RAID5/6 */
117 struct scrub_parity {
118 struct scrub_ctx *sctx;
120 struct btrfs_device *scrub_dev;
132 struct list_head sectors_list;
134 /* Work of parity check and repair */
135 struct work_struct work;
137 /* Mark the parity blocks which have data */
138 unsigned long dbitmap;
141 * Mark the parity blocks which have data, but errors happened when
142 * reading or checking that data
144 unsigned long ebitmap;
148 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
149 struct btrfs_fs_info *fs_info;
152 atomic_t bios_in_flight;
153 atomic_t workers_pending;
154 spinlock_t list_lock;
155 wait_queue_head_t list_wait;
156 struct list_head csum_list;
161 /* State of IO submission throttling affecting the associated device */
162 ktime_t throttle_deadline;
168 struct scrub_bio *wr_curr_bio;
169 struct mutex wr_lock;
170 struct btrfs_device *wr_tgtdev;
171 bool flush_all_writes;
176 struct btrfs_scrub_progress stat;
177 spinlock_t stat_lock;
180 * Use a ref counter to avoid use-after-free issues. Scrub workers
181 * decrement bios_in_flight and workers_pending and then do a wakeup
182 * on the list_wait wait queue. We must ensure the main scrub task
183 * doesn't free the scrub context before or while the workers are
184 * doing the wakeup() call.
189 struct scrub_warning {
190 struct btrfs_path *path;
191 u64 extent_item_size;
195 struct btrfs_device *dev;
198 struct full_stripe_lock {
205 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
206 struct scrub_block *sblocks_for_recheck);
207 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
208 struct scrub_block *sblock,
209 int retry_failed_mirror);
210 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
211 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212 struct scrub_block *sblock_good);
213 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
214 struct scrub_block *sblock_good,
215 int sector_num, int force_write);
216 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
217 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
219 static int scrub_checksum_data(struct scrub_block *sblock);
220 static int scrub_checksum_tree_block(struct scrub_block *sblock);
221 static int scrub_checksum_super(struct scrub_block *sblock);
222 static void scrub_block_put(struct scrub_block *sblock);
223 static void scrub_sector_get(struct scrub_sector *sector);
224 static void scrub_sector_put(struct scrub_sector *sector);
225 static void scrub_parity_get(struct scrub_parity *sparity);
226 static void scrub_parity_put(struct scrub_parity *sparity);
227 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
228 u64 physical, struct btrfs_device *dev, u64 flags,
229 u64 gen, int mirror_num, u8 *csum,
230 u64 physical_for_dev_replace);
231 static void scrub_bio_end_io(struct bio *bio);
232 static void scrub_bio_end_io_worker(struct work_struct *work);
233 static void scrub_block_complete(struct scrub_block *sblock);
234 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
235 u64 extent_logical, u32 extent_len,
236 u64 *extent_physical,
237 struct btrfs_device **extent_dev,
238 int *extent_mirror_num);
239 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_sector *sector);
241 static void scrub_wr_submit(struct scrub_ctx *sctx);
242 static void scrub_wr_bio_end_io(struct bio *bio);
243 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
244 static void scrub_put_ctx(struct scrub_ctx *sctx);
246 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
248 return sector->recover &&
249 (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
252 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254 refcount_inc(&sctx->refs);
255 atomic_inc(&sctx->bios_in_flight);
258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
265 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
267 while (atomic_read(&fs_info->scrub_pause_req)) {
268 mutex_unlock(&fs_info->scrub_lock);
269 wait_event(fs_info->scrub_pause_wait,
270 atomic_read(&fs_info->scrub_pause_req) == 0);
271 mutex_lock(&fs_info->scrub_lock);
275 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
277 atomic_inc(&fs_info->scrubs_paused);
278 wake_up(&fs_info->scrub_pause_wait);
281 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
283 mutex_lock(&fs_info->scrub_lock);
284 __scrub_blocked_if_needed(fs_info);
285 atomic_dec(&fs_info->scrubs_paused);
286 mutex_unlock(&fs_info->scrub_lock);
288 wake_up(&fs_info->scrub_pause_wait);
291 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
293 scrub_pause_on(fs_info);
294 scrub_pause_off(fs_info);
298 * Insert new full stripe lock into full stripe locks tree
300 * Return pointer to existing or newly inserted full_stripe_lock structure if
301 * everything works well.
302 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
304 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
307 static struct full_stripe_lock *insert_full_stripe_lock(
308 struct btrfs_full_stripe_locks_tree *locks_root,
312 struct rb_node *parent = NULL;
313 struct full_stripe_lock *entry;
314 struct full_stripe_lock *ret;
316 lockdep_assert_held(&locks_root->lock);
318 p = &locks_root->root.rb_node;
321 entry = rb_entry(parent, struct full_stripe_lock, node);
322 if (fstripe_logical < entry->logical) {
324 } else if (fstripe_logical > entry->logical) {
335 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
337 return ERR_PTR(-ENOMEM);
338 ret->logical = fstripe_logical;
340 mutex_init(&ret->mutex);
342 rb_link_node(&ret->node, parent, p);
343 rb_insert_color(&ret->node, &locks_root->root);
348 * Search for a full stripe lock of a block group
350 * Return pointer to existing full stripe lock if found
351 * Return NULL if not found
353 static struct full_stripe_lock *search_full_stripe_lock(
354 struct btrfs_full_stripe_locks_tree *locks_root,
357 struct rb_node *node;
358 struct full_stripe_lock *entry;
360 lockdep_assert_held(&locks_root->lock);
362 node = locks_root->root.rb_node;
364 entry = rb_entry(node, struct full_stripe_lock, node);
365 if (fstripe_logical < entry->logical)
366 node = node->rb_left;
367 else if (fstripe_logical > entry->logical)
368 node = node->rb_right;
376 * Helper to get full stripe logical from a normal bytenr.
378 * Caller must ensure @cache is a RAID56 block group.
380 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
385 * Due to chunk item size limit, full stripe length should not be
386 * larger than U32_MAX. Just a sanity check here.
388 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
391 * round_down() can only handle a power of 2, while a RAID56 full
392 * stripe length can be 64KiB * n, so we need to manually round down.
394 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
395 cache->full_stripe_len + cache->start;
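/*
 * A worked example with made-up numbers: for a RAID5 chunk with two data
 * stripes, full_stripe_len = 2 * 64KiB = 128KiB.  A bytenr that is 300KiB
 * past cache->start then belongs to the full stripe starting at
 * cache->start + 256KiB (300 / 128 = 2, 2 * 128KiB = 256KiB).
 */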
400 * Lock a full stripe to avoid concurrent recovery and read
402 * It's only used for profiles with parity (RAID5/6), for other profiles it
405 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
406 * So the caller must call unlock_full_stripe() in the same context.
408 * Return <0 if an error is encountered.
410 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
413 struct btrfs_block_group *bg_cache;
414 struct btrfs_full_stripe_locks_tree *locks_root;
415 struct full_stripe_lock *existing;
420 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
426 /* Profiles not based on parity don't need full stripe lock */
427 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
429 locks_root = &bg_cache->full_stripe_locks_root;
431 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
433 /* Now insert the full stripe lock */
434 mutex_lock(&locks_root->lock);
435 existing = insert_full_stripe_lock(locks_root, fstripe_start);
436 mutex_unlock(&locks_root->lock);
437 if (IS_ERR(existing)) {
438 ret = PTR_ERR(existing);
441 mutex_lock(&existing->mutex);
444 btrfs_put_block_group(bg_cache);
449 * Unlock a full stripe.
451 * NOTE: Caller must ensure it's the same context calling corresponding
452 * lock_full_stripe().
454 * Return 0 if we unlocked the full stripe without problems.
455 * Return <0 on error
457 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
460 struct btrfs_block_group *bg_cache;
461 struct btrfs_full_stripe_locks_tree *locks_root;
462 struct full_stripe_lock *fstripe_lock;
467 /* If we didn't acquire full stripe lock, no need to continue */
471 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
476 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
479 locks_root = &bg_cache->full_stripe_locks_root;
480 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
482 mutex_lock(&locks_root->lock);
483 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
484 /* Unpaired unlock_full_stripe() detected */
488 mutex_unlock(&locks_root->lock);
492 if (fstripe_lock->refs == 0) {
494 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
495 fstripe_lock->logical);
497 fstripe_lock->refs--;
500 if (fstripe_lock->refs == 0) {
501 rb_erase(&fstripe_lock->node, &locks_root->root);
504 mutex_unlock(&locks_root->lock);
506 mutex_unlock(&fstripe_lock->mutex);
510 btrfs_put_block_group(bg_cache);
514 static void scrub_free_csums(struct scrub_ctx *sctx)
516 while (!list_empty(&sctx->csum_list)) {
517 struct btrfs_ordered_sum *sum;
518 sum = list_first_entry(&sctx->csum_list,
519 struct btrfs_ordered_sum, list);
520 list_del(&sum->list);
525 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
532 /* this can happen when scrub is cancelled */
533 if (sctx->curr != -1) {
534 struct scrub_bio *sbio = sctx->bios[sctx->curr];
536 for (i = 0; i < sbio->sector_count; i++) {
537 WARN_ON(!sbio->sectors[i]->page);
538 scrub_block_put(sbio->sectors[i]->sblock);
543 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
544 struct scrub_bio *sbio = sctx->bios[i];
551 kfree(sctx->wr_curr_bio);
552 scrub_free_csums(sctx);
556 static void scrub_put_ctx(struct scrub_ctx *sctx)
558 if (refcount_dec_and_test(&sctx->refs))
559 scrub_free_ctx(sctx);
562 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
563 struct btrfs_fs_info *fs_info, int is_dev_replace)
565 struct scrub_ctx *sctx;
568 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
571 refcount_set(&sctx->refs, 1);
572 sctx->is_dev_replace = is_dev_replace;
573 sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
575 sctx->fs_info = fs_info;
576 INIT_LIST_HEAD(&sctx->csum_list);
577 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
578 struct scrub_bio *sbio;
580 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
583 sctx->bios[i] = sbio;
587 sbio->sector_count = 0;
588 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
590 if (i != SCRUB_BIOS_PER_SCTX - 1)
591 sctx->bios[i]->next_free = i + 1;
593 sctx->bios[i]->next_free = -1;
595 sctx->first_free = 0;
596 atomic_set(&sctx->bios_in_flight, 0);
597 atomic_set(&sctx->workers_pending, 0);
598 atomic_set(&sctx->cancel_req, 0);
600 spin_lock_init(&sctx->list_lock);
601 spin_lock_init(&sctx->stat_lock);
602 init_waitqueue_head(&sctx->list_wait);
603 sctx->throttle_deadline = 0;
605 WARN_ON(sctx->wr_curr_bio != NULL);
606 mutex_init(&sctx->wr_lock);
607 sctx->wr_curr_bio = NULL;
608 if (is_dev_replace) {
609 WARN_ON(!fs_info->dev_replace.tgtdev);
610 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
611 sctx->flush_all_writes = false;
617 scrub_free_ctx(sctx);
618 return ERR_PTR(-ENOMEM);
621 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
628 struct extent_buffer *eb;
629 struct btrfs_inode_item *inode_item;
630 struct scrub_warning *swarn = warn_ctx;
631 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
632 struct inode_fs_paths *ipath = NULL;
633 struct btrfs_root *local_root;
634 struct btrfs_key key;
636 local_root = btrfs_get_fs_root(fs_info, root, true);
637 if (IS_ERR(local_root)) {
638 ret = PTR_ERR(local_root);
643 * this makes the path point to (inum INODE_ITEM ioff)
646 key.type = BTRFS_INODE_ITEM_KEY;
649 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
651 btrfs_put_root(local_root);
652 btrfs_release_path(swarn->path);
656 eb = swarn->path->nodes[0];
657 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
658 struct btrfs_inode_item);
659 nlink = btrfs_inode_nlink(eb, inode_item);
660 btrfs_release_path(swarn->path);
663 * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
664 * uses GFP_NOFS in this context, so we keep it consistent but it does
665 * not seem to be strictly necessary.
667 nofs_flag = memalloc_nofs_save();
668 ipath = init_ipath(4096, local_root, swarn->path);
669 memalloc_nofs_restore(nofs_flag);
671 btrfs_put_root(local_root);
672 ret = PTR_ERR(ipath);
676 ret = paths_from_inode(inum, ipath);
682 * We deliberately ignore the fact that ipath might have been too small to
683 * hold all of the paths here
685 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
686 btrfs_warn_in_rcu(fs_info,
687 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
688 swarn->errstr, swarn->logical,
689 rcu_str_deref(swarn->dev->name),
692 fs_info->sectorsize, nlink,
693 (char *)(unsigned long)ipath->fspath->val[i]);
695 btrfs_put_root(local_root);
700 btrfs_warn_in_rcu(fs_info,
701 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
702 swarn->errstr, swarn->logical,
703 rcu_str_deref(swarn->dev->name),
705 root, inum, offset, ret);
711 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
713 struct btrfs_device *dev;
714 struct btrfs_fs_info *fs_info;
715 struct btrfs_path *path;
716 struct btrfs_key found_key;
717 struct extent_buffer *eb;
718 struct btrfs_extent_item *ei;
719 struct scrub_warning swarn;
720 unsigned long ptr = 0;
728 WARN_ON(sblock->sector_count < 1);
729 dev = sblock->sectors[0]->dev;
730 fs_info = sblock->sctx->fs_info;
732 path = btrfs_alloc_path();
736 swarn.physical = sblock->sectors[0]->physical;
737 swarn.logical = sblock->sectors[0]->logical;
738 swarn.errstr = errstr;
741 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
746 extent_item_pos = swarn.logical - found_key.objectid;
747 swarn.extent_item_size = found_key.offset;
750 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
751 item_size = btrfs_item_size(eb, path->slots[0]);
753 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
755 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
756 item_size, &ref_root,
758 btrfs_warn_in_rcu(fs_info,
759 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
760 errstr, swarn.logical,
761 rcu_str_deref(dev->name),
763 ref_level ? "node" : "leaf",
764 ret < 0 ? -1 : ref_level,
765 ret < 0 ? -1 : ref_root);
767 btrfs_release_path(path);
769 btrfs_release_path(path);
772 iterate_extent_inodes(fs_info, found_key.objectid,
774 scrub_print_warning_inode, &swarn, false);
778 btrfs_free_path(path);
781 static inline void scrub_get_recover(struct scrub_recover *recover)
783 refcount_inc(&recover->refs);
786 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
787 struct scrub_recover *recover)
789 if (refcount_dec_and_test(&recover->refs)) {
790 btrfs_bio_counter_dec(fs_info);
791 btrfs_put_bioc(recover->bioc);
797 * scrub_handle_errored_block gets called when either verification of the
798 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
799 * case, this function handles all sectors in the bio, even though only one
801 * The goal of this function is to repair the errored block by using the
802 * contents of one of the mirrors.
804 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
806 struct scrub_ctx *sctx = sblock_to_check->sctx;
807 struct btrfs_device *dev;
808 struct btrfs_fs_info *fs_info;
810 unsigned int failed_mirror_index;
811 unsigned int is_metadata;
812 unsigned int have_csum;
813 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
814 struct scrub_block *sblock_bad;
819 bool full_stripe_locked;
820 unsigned int nofs_flag;
821 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
822 DEFAULT_RATELIMIT_BURST);
824 BUG_ON(sblock_to_check->sector_count < 1);
825 fs_info = sctx->fs_info;
826 if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
828 * If we find an error in a super block, we just report it.
829 * Super blocks will get written with the next transaction commit
832 spin_lock(&sctx->stat_lock);
833 ++sctx->stat.super_errors;
834 spin_unlock(&sctx->stat_lock);
837 logical = sblock_to_check->sectors[0]->logical;
838 BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
839 failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
840 is_metadata = !(sblock_to_check->sectors[0]->flags &
841 BTRFS_EXTENT_FLAG_DATA);
842 have_csum = sblock_to_check->sectors[0]->have_csum;
843 dev = sblock_to_check->sectors[0]->dev;
845 if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
849 * We must use GFP_NOFS because the scrub task might be waiting for a
850 * worker task executing this function and in turn a transaction commit
851 * might be waiting for the scrub task to pause (which needs to wait for all
852 * the worker tasks to complete before pausing).
853 * We do allocations in the workers through insert_full_stripe_lock()
854 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
857 nofs_flag = memalloc_nofs_save();
859 * For RAID5/6, a race can happen between scrub threads of different devices.
860 * For data corruption, the Parity and Data threads will both try
861 * to recover the data.
862 * The race can lead to doubly counted csum errors, or even an unrecoverable
865 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
867 memalloc_nofs_restore(nofs_flag);
868 spin_lock(&sctx->stat_lock);
870 sctx->stat.malloc_errors++;
871 sctx->stat.read_errors++;
872 sctx->stat.uncorrectable_errors++;
873 spin_unlock(&sctx->stat_lock);
878 * read all mirrors one after the other. This includes
879 * re-reading the extent or metadata block that failed (which was
880 * the reason this fixup code was called) another time,
881 * sector by sector this time in order to know which sectors
882 * caused I/O errors and which ones are good (for all mirrors).
883 * It is the goal to handle the situation when more than one
884 * mirror contains I/O errors, but the errors do not
885 * overlap, i.e. the data can be repaired by selecting the
886 * sectors from those mirrors without I/O error on the
887 * particular sectors. One example (with blocks >= 2 * sectorsize)
888 * would be that mirror #1 has an I/O error on the first sector,
889 * the second sector is good, and mirror #2 has an I/O error on
890 * the second sector, but the first sector is good.
891 * Then the first sector of the first mirror can be repaired by
892 * taking the first sector of the second mirror, and the
893 * second sector of the second mirror can be repaired by
894 * copying the contents of the 2nd sector of the 1st mirror.
895 * One more note: if the sectors of one mirror contain I/O
896 * errors, the checksum cannot be verified. In order to get
897 * the best data for repairing, the first attempt is to find
898 * a mirror without I/O errors and with a validated checksum.
899 * Only if this is not possible, the sectors are picked from
900 * mirrors with I/O errors without considering the checksum.
901 * If the latter is the case, at the end, the checksum of the
902 * repaired area is verified in order to correctly maintain
906 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
907 sizeof(*sblocks_for_recheck), GFP_KERNEL);
908 if (!sblocks_for_recheck) {
909 spin_lock(&sctx->stat_lock);
910 sctx->stat.malloc_errors++;
911 sctx->stat.read_errors++;
912 sctx->stat.uncorrectable_errors++;
913 spin_unlock(&sctx->stat_lock);
914 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
918 /* Setup the context, map the logical blocks and alloc the sectors */
919 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
921 spin_lock(&sctx->stat_lock);
922 sctx->stat.read_errors++;
923 sctx->stat.uncorrectable_errors++;
924 spin_unlock(&sctx->stat_lock);
925 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
928 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
929 sblock_bad = sblocks_for_recheck + failed_mirror_index;
931 /* build and submit the bios for the failed mirror, check checksums */
932 scrub_recheck_block(fs_info, sblock_bad, 1);
934 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935 sblock_bad->no_io_error_seen) {
937 * The error disappeared after reading sector by sector, or
938 * the area was part of a huge bio and other parts of the
939 * bio caused I/O errors, or the block layer merged several
940 * read requests into one and the error was caused by a
941 * different bio (usually one of the two latter cases is
944 spin_lock(&sctx->stat_lock);
945 sctx->stat.unverified_errors++;
946 sblock_to_check->data_corrected = 1;
947 spin_unlock(&sctx->stat_lock);
949 if (sctx->is_dev_replace)
950 scrub_write_block_to_dev_replace(sblock_bad);
954 if (!sblock_bad->no_io_error_seen) {
955 spin_lock(&sctx->stat_lock);
956 sctx->stat.read_errors++;
957 spin_unlock(&sctx->stat_lock);
958 if (__ratelimit(&rs))
959 scrub_print_warning("i/o error", sblock_to_check);
960 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
961 } else if (sblock_bad->checksum_error) {
962 spin_lock(&sctx->stat_lock);
963 sctx->stat.csum_errors++;
964 spin_unlock(&sctx->stat_lock);
965 if (__ratelimit(&rs))
966 scrub_print_warning("checksum error", sblock_to_check);
967 btrfs_dev_stat_inc_and_print(dev,
968 BTRFS_DEV_STAT_CORRUPTION_ERRS);
969 } else if (sblock_bad->header_error) {
970 spin_lock(&sctx->stat_lock);
971 sctx->stat.verify_errors++;
972 spin_unlock(&sctx->stat_lock);
973 if (__ratelimit(&rs))
974 scrub_print_warning("checksum/header error",
976 if (sblock_bad->generation_error)
977 btrfs_dev_stat_inc_and_print(dev,
978 BTRFS_DEV_STAT_GENERATION_ERRS);
980 btrfs_dev_stat_inc_and_print(dev,
981 BTRFS_DEV_STAT_CORRUPTION_ERRS);
984 if (sctx->readonly) {
985 ASSERT(!sctx->is_dev_replace);
990 * now build and submit the bios for the other mirrors, check
992 * First try to pick the mirror which is completely without I/O
993 * errors and also does not have a checksum error.
994 * If one is found, and if a checksum is present, the full block
995 * that is known to contain an error is rewritten. Afterwards
996 * the block is known to be corrected.
997 * If a mirror is found which is completely correct, and no
998 * checksum is present, only those sectors are rewritten that had
999 * an I/O error in the block to be repaired, since it cannot be
1000 * determined which copy of the other sectors is better (and it
1001 * could happen otherwise that a correct sector would be
1002 * overwritten by a bad one).
1004 for (mirror_index = 0; ;mirror_index++) {
1005 struct scrub_block *sblock_other;
1007 if (mirror_index == failed_mirror_index)
1010 /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1011 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012 if (mirror_index >= BTRFS_MAX_MIRRORS)
1014 if (!sblocks_for_recheck[mirror_index].sector_count)
1017 sblock_other = sblocks_for_recheck + mirror_index;
1019 struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1022 if (mirror_index >= max_allowed)
1024 if (!sblocks_for_recheck[1].sector_count)
1027 ASSERT(failed_mirror_index == 0);
1028 sblock_other = sblocks_for_recheck + 1;
1029 sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1032 /* build and submit the bios, check checksums */
1033 scrub_recheck_block(fs_info, sblock_other, 0);
1035 if (!sblock_other->header_error &&
1036 !sblock_other->checksum_error &&
1037 sblock_other->no_io_error_seen) {
1038 if (sctx->is_dev_replace) {
1039 scrub_write_block_to_dev_replace(sblock_other);
1040 goto corrected_error;
1042 ret = scrub_repair_block_from_good_copy(
1043 sblock_bad, sblock_other);
1045 goto corrected_error;
1050 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051 goto did_not_correct_error;
1054 * In case of I/O errors in the area that is supposed to be
1055 * repaired, continue by picking good copies of those sectors.
1056 * Select the good sectors from mirrors to rewrite bad sectors from
1057 * the area to fix. Afterwards verify the checksum of the block
1058 * that is supposed to be repaired. This verification step is
1059 * only done for the purpose of statistics counting and for the
1060 * final scrub report on whether errors remain.
1061 * A perfect algorithm could make use of the checksum and try
1062 * all possible combinations of sectors from the different mirrors
1063 * until the checksum verification succeeds. For example, when
1064 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065 * of mirror #2 is readable but the final checksum test fails,
1066 * then the 2nd sector of mirror #3 could be tried, to see whether
1067 * the final checksum then succeeds. But this would be a rare
1068 * exception and is therefore not implemented. At least it is
1069 * avoided that the good copy is overwritten.
1070 * A more useful improvement would be to pick the sectors
1071 * without I/O error based on sector sizes (512 bytes on legacy
1072 * disks) instead of on sectorsize. Then maybe 512 bytes of one
1073 * mirror could be repaired by taking 512 bytes of a different
1074 * mirror, even if other 512 byte sectors in the same sectorsize
1075 * area are unreadable.
1078 for (sector_num = 0; sector_num < sblock_bad->sector_count;
1080 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081 struct scrub_block *sblock_other = NULL;
1083 /* Skip no-io-error sectors in scrub */
1084 if (!sector_bad->io_error && !sctx->is_dev_replace)
1087 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1089 * In case of dev replace, if the raid56 rebuild process
1090 * didn't produce correct data, then copy the content
1091 * of sblock_bad to make sure the target device is identical
1092 * to the source device, instead of writing the garbage data in
1093 * the sblock_for_recheck array to the target device.
1095 sblock_other = NULL;
1096 } else if (sector_bad->io_error) {
1097 /* Try to find no-io-error sector in mirrors */
1098 for (mirror_index = 0;
1099 mirror_index < BTRFS_MAX_MIRRORS &&
1100 sblocks_for_recheck[mirror_index].sector_count > 0;
1102 if (!sblocks_for_recheck[mirror_index].
1103 sectors[sector_num]->io_error) {
1104 sblock_other = sblocks_for_recheck +
1113 if (sctx->is_dev_replace) {
1115 * Did not find a mirror to fetch the sector from.
1116 * scrub_write_sector_to_dev_replace() handles this
1117 * case (sector->io_error), by filling the block with
1118 * zeros before submitting the write request
1121 sblock_other = sblock_bad;
1123 if (scrub_write_sector_to_dev_replace(sblock_other,
1126 &fs_info->dev_replace.num_write_errors);
1129 } else if (sblock_other) {
1130 ret = scrub_repair_sector_from_good_copy(sblock_bad,
1134 sector_bad->io_error = 0;
1140 if (success && !sctx->is_dev_replace) {
1141 if (is_metadata || have_csum) {
1143 * need to verify the checksum now that all
1144 * sectors on disk are repaired (the write
1145 * request for data to be repaired is on its way).
1146 * Just be lazy and use scrub_recheck_block()
1147 * which re-reads the data before the checksum
1148 * is verified, but most likely the data comes out
1149 * of the page cache.
1151 scrub_recheck_block(fs_info, sblock_bad, 1);
1152 if (!sblock_bad->header_error &&
1153 !sblock_bad->checksum_error &&
1154 sblock_bad->no_io_error_seen)
1155 goto corrected_error;
1157 goto did_not_correct_error;
1160 spin_lock(&sctx->stat_lock);
1161 sctx->stat.corrected_errors++;
1162 sblock_to_check->data_corrected = 1;
1163 spin_unlock(&sctx->stat_lock);
1164 btrfs_err_rl_in_rcu(fs_info,
1165 "fixed up error at logical %llu on dev %s",
1166 logical, rcu_str_deref(dev->name));
1169 did_not_correct_error:
1170 spin_lock(&sctx->stat_lock);
1171 sctx->stat.uncorrectable_errors++;
1172 spin_unlock(&sctx->stat_lock);
1173 btrfs_err_rl_in_rcu(fs_info,
1174 "unable to fixup (regular) error at logical %llu on dev %s",
1175 logical, rcu_str_deref(dev->name));
1179 if (sblocks_for_recheck) {
1180 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1182 struct scrub_block *sblock = sblocks_for_recheck +
1184 struct scrub_recover *recover;
1187 for (i = 0; i < sblock->sector_count; i++) {
1188 sblock->sectors[i]->sblock = NULL;
1189 recover = sblock->sectors[i]->recover;
1191 scrub_put_recover(fs_info, recover);
1192 sblock->sectors[i]->recover = NULL;
1194 scrub_sector_put(sblock->sectors[i]);
1197 kfree(sblocks_for_recheck);
1200 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201 memalloc_nofs_restore(nofs_flag);
1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1209 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1211 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1214 return (int)bioc->num_stripes;
1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1220 int nstripes, int mirror,
1226 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1228 for (i = 0; i < nstripes; i++) {
1229 if (raid_map[i] == RAID6_Q_STRIPE ||
1230 raid_map[i] == RAID5_P_STRIPE)
1233 if (logical >= raid_map[i] &&
1234 logical < raid_map[i] + mapped_length)
1239 *stripe_offset = logical - raid_map[i];
1241 /* The other RAID type */
1242 *stripe_index = mirror;
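/*
 * A worked example for scrub_stripe_index_and_offset() above, with made-up
 * numbers: with raid_map = { 0, 64KiB, RAID5_P_STRIPE } and
 * mapped_length = 64KiB, a logical of 80KiB falls into the second data
 * stripe, so *stripe_index = 1 and *stripe_offset = 80KiB - 64KiB = 16KiB.
 */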
1247 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1248 struct scrub_block *sblocks_for_recheck)
1250 struct scrub_ctx *sctx = original_sblock->sctx;
1251 struct btrfs_fs_info *fs_info = sctx->fs_info;
1252 u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1253 u64 logical = original_sblock->sectors[0]->logical;
1254 u64 generation = original_sblock->sectors[0]->generation;
1255 u64 flags = original_sblock->sectors[0]->flags;
1256 u64 have_csum = original_sblock->sectors[0]->have_csum;
1257 struct scrub_recover *recover;
1258 struct btrfs_io_context *bioc;
1263 int sector_index = 0;
1269 * Note: the two members refs and outstanding_sectors are not used (and
1270 * not set) in the blocks that are used for the recheck procedure.
1273 while (length > 0) {
1274 sublen = min_t(u64, length, fs_info->sectorsize);
1275 mapped_length = sublen;
1279 * With a length of sectorsize, each returned stripe represents
1282 btrfs_bio_counter_inc_blocked(fs_info);
1283 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1284 logical, &mapped_length, &bioc);
1285 if (ret || !bioc || mapped_length < sublen) {
1286 btrfs_put_bioc(bioc);
1287 btrfs_bio_counter_dec(fs_info);
1291 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1293 btrfs_put_bioc(bioc);
1294 btrfs_bio_counter_dec(fs_info);
1298 refcount_set(&recover->refs, 1);
1299 recover->bioc = bioc;
1300 recover->map_length = mapped_length;
1302 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1304 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1306 for (mirror_index = 0; mirror_index < nmirrors;
1308 struct scrub_block *sblock;
1309 struct scrub_sector *sector;
1311 sblock = sblocks_for_recheck + mirror_index;
1312 sblock->sctx = sctx;
1314 sector = kzalloc(sizeof(*sector), GFP_NOFS);
1317 spin_lock(&sctx->stat_lock);
1318 sctx->stat.malloc_errors++;
1319 spin_unlock(&sctx->stat_lock);
1320 scrub_put_recover(fs_info, recover);
1323 scrub_sector_get(sector);
1324 sblock->sectors[sector_index] = sector;
1325 sector->sblock = sblock;
1326 sector->flags = flags;
1327 sector->generation = generation;
1328 sector->logical = logical;
1329 sector->have_csum = have_csum;
1331 memcpy(sector->csum,
1332 original_sblock->sectors[0]->csum,
1333 sctx->fs_info->csum_size);
1335 scrub_stripe_index_and_offset(logical,
1344 sector->physical = bioc->stripes[stripe_index].physical +
1346 sector->dev = bioc->stripes[stripe_index].dev;
1348 BUG_ON(sector_index >= original_sblock->sector_count);
1349 sector->physical_for_dev_replace =
1350 original_sblock->sectors[sector_index]->
1351 physical_for_dev_replace;
1352 /* For missing devices, dev->bdev is NULL */
1353 sector->mirror_num = mirror_index + 1;
1354 sblock->sector_count++;
1355 sector->page = alloc_page(GFP_NOFS);
1359 scrub_get_recover(recover);
1360 sector->recover = recover;
1362 scrub_put_recover(fs_info, recover);
1371 static void scrub_bio_wait_endio(struct bio *bio)
1373 complete(bio->bi_private);
1376 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1378 struct scrub_sector *sector)
1380 DECLARE_COMPLETION_ONSTACK(done);
1384 bio->bi_iter.bi_sector = sector->logical >> 9;
1385 bio->bi_private = &done;
1386 bio->bi_end_io = scrub_bio_wait_endio;
1388 mirror_num = sector->sblock->sectors[0]->mirror_num;
1389 ret = raid56_parity_recover(bio, sector->recover->bioc,
1390 sector->recover->map_length,
1395 wait_for_completion_io(&done);
1396 return blk_status_to_errno(bio->bi_status);
1399 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1400 struct scrub_block *sblock)
1402 struct scrub_sector *first_sector = sblock->sectors[0];
1406 /* All sectors in sblock belong to the same stripe on the same device. */
1407 ASSERT(first_sector->dev);
1408 if (!first_sector->dev->bdev)
1411 bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1413 for (i = 0; i < sblock->sector_count; i++) {
1414 struct scrub_sector *sector = sblock->sectors[i];
1416 WARN_ON(!sector->page);
1417 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1420 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1427 scrub_recheck_block_checksum(sblock);
1431 for (i = 0; i < sblock->sector_count; i++)
1432 sblock->sectors[i]->io_error = 1;
1434 sblock->no_io_error_seen = 0;
1438 * This function will check the on disk data for checksum errors, header errors
1439 * and read I/O errors. If any I/O errors happen, the exact sectors which are
1440 * errored are marked as being bad. The goal is to enable scrub to take those
1441 * sectors that are not errored from all the mirrors so that the sectors that
1442 * are errored in the just handled mirror can be repaired.
1444 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1445 struct scrub_block *sblock,
1446 int retry_failed_mirror)
1450 sblock->no_io_error_seen = 1;
1452 /* Shortcut for raid56 */
1453 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1454 return scrub_recheck_block_on_raid56(fs_info, sblock);
1456 for (i = 0; i < sblock->sector_count; i++) {
1457 struct scrub_sector *sector = sblock->sectors[i];
1459 struct bio_vec bvec;
1461 if (sector->dev->bdev == NULL) {
1462 sector->io_error = 1;
1463 sblock->no_io_error_seen = 0;
1467 WARN_ON(!sector->page);
1468 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1469 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1470 bio.bi_iter.bi_sector = sector->physical >> 9;
1472 btrfsic_check_bio(&bio);
1473 if (submit_bio_wait(&bio)) {
1474 sector->io_error = 1;
1475 sblock->no_io_error_seen = 0;
1481 if (sblock->no_io_error_seen)
1482 scrub_recheck_block_checksum(sblock);
1485 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1487 struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1490 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1494 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1496 sblock->header_error = 0;
1497 sblock->checksum_error = 0;
1498 sblock->generation_error = 0;
1500 if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1501 scrub_checksum_data(sblock);
1503 scrub_checksum_tree_block(sblock);
1506 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1507 struct scrub_block *sblock_good)
1512 for (i = 0; i < sblock_bad->sector_count; i++) {
1515 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1524 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1525 struct scrub_block *sblock_good,
1526 int sector_num, int force_write)
1528 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1529 struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1530 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1531 const u32 sectorsize = fs_info->sectorsize;
1533 BUG_ON(sector_bad->page == NULL);
1534 BUG_ON(sector_good->page == NULL);
1535 if (force_write || sblock_bad->header_error ||
1536 sblock_bad->checksum_error || sector_bad->io_error) {
1538 struct bio_vec bvec;
1541 if (!sector_bad->dev->bdev) {
1542 btrfs_warn_rl(fs_info,
1543 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1547 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1548 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1549 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1551 btrfsic_check_bio(&bio);
1552 ret = submit_bio_wait(&bio);
1556 btrfs_dev_stat_inc_and_print(sector_bad->dev,
1557 BTRFS_DEV_STAT_WRITE_ERRS);
1558 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1566 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1568 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1572 * This block is used for the check of the parity on the source device,
1573 * so the data needn't be written into the destination device.
1575 if (sblock->sparity)
1578 for (i = 0; i < sblock->sector_count; i++) {
1581 ret = scrub_write_sector_to_dev_replace(sblock, i);
1583 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1587 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1589 struct scrub_sector *sector = sblock->sectors[sector_num];
1591 BUG_ON(sector->page == NULL);
1592 if (sector->io_error)
1593 clear_page(page_address(sector->page));
1595 return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
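/*
 * A worked illustration for fill_writer_pointer_gap() below, with made-up
 * numbers: on a zoned target, if the previous write ended at
 * write_pointer = 1MiB and the next sector must land at
 * physical = 1MiB + 64KiB, the 64KiB gap is filled with zeroes first so the
 * zone's write pointer catches up to @physical before the real write.
 */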
1598 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1603 if (!btrfs_is_zoned(sctx->fs_info))
1606 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1609 if (sctx->write_pointer < physical) {
1610 length = physical - sctx->write_pointer;
1612 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1613 sctx->write_pointer, length);
1615 sctx->write_pointer = physical;
1620 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1621 struct scrub_sector *sector)
1623 struct scrub_bio *sbio;
1625 const u32 sectorsize = sctx->fs_info->sectorsize;
1627 mutex_lock(&sctx->wr_lock);
1629 if (!sctx->wr_curr_bio) {
1630 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1632 if (!sctx->wr_curr_bio) {
1633 mutex_unlock(&sctx->wr_lock);
1636 sctx->wr_curr_bio->sctx = sctx;
1637 sctx->wr_curr_bio->sector_count = 0;
1639 sbio = sctx->wr_curr_bio;
1640 if (sbio->sector_count == 0) {
1641 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1643 mutex_unlock(&sctx->wr_lock);
1647 sbio->physical = sector->physical_for_dev_replace;
1648 sbio->logical = sector->logical;
1649 sbio->dev = sctx->wr_tgtdev;
1651 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1652 REQ_OP_WRITE, GFP_NOFS);
1654 sbio->bio->bi_private = sbio;
1655 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1656 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1658 } else if (sbio->physical + sbio->sector_count * sectorsize !=
1659 sector->physical_for_dev_replace ||
1660 sbio->logical + sbio->sector_count * sectorsize !=
1662 scrub_wr_submit(sctx);
1666 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1667 if (ret != sectorsize) {
1668 if (sbio->sector_count < 1) {
1671 mutex_unlock(&sctx->wr_lock);
1674 scrub_wr_submit(sctx);
1678 sbio->sectors[sbio->sector_count] = sector;
1679 scrub_sector_get(sector);
1680 sbio->sector_count++;
1681 if (sbio->sector_count == sctx->sectors_per_bio)
1682 scrub_wr_submit(sctx);
1683 mutex_unlock(&sctx->wr_lock);
1688 static void scrub_wr_submit(struct scrub_ctx *sctx)
1690 struct scrub_bio *sbio;
1692 if (!sctx->wr_curr_bio)
1695 sbio = sctx->wr_curr_bio;
1696 sctx->wr_curr_bio = NULL;
1697 scrub_pending_bio_inc(sctx);
1698 /* process all writes in a single worker thread. Then the block layer
1699 * orders the requests before sending them to the driver which
1700 * doubled the write performance on spinning disks when measured
1702 btrfsic_check_bio(sbio->bio);
1703 submit_bio(sbio->bio);
1705 if (btrfs_is_zoned(sctx->fs_info))
1706 sctx->write_pointer = sbio->physical + sbio->sector_count *
1707 sctx->fs_info->sectorsize;
1710 static void scrub_wr_bio_end_io(struct bio *bio)
1712 struct scrub_bio *sbio = bio->bi_private;
1713 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1715 sbio->status = bio->bi_status;
1718 INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1719 queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1722 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1724 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1725 struct scrub_ctx *sctx = sbio->sctx;
1728 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1730 struct btrfs_dev_replace *dev_replace =
1731 &sbio->sctx->fs_info->dev_replace;
1733 for (i = 0; i < sbio->sector_count; i++) {
1734 struct scrub_sector *sector = sbio->sectors[i];
1736 sector->io_error = 1;
1737 atomic64_inc(&dev_replace->num_write_errors);
1741 for (i = 0; i < sbio->sector_count; i++)
1742 scrub_sector_put(sbio->sectors[i]);
1746 scrub_pending_bio_dec(sctx);
1749 static int scrub_checksum(struct scrub_block *sblock)
1755 * No need to initialize these stats currently,
1756 * because this function only uses the return value
1757 * instead of these stat values.
1762 sblock->header_error = 0;
1763 sblock->generation_error = 0;
1764 sblock->checksum_error = 0;
1766 WARN_ON(sblock->sector_count < 1);
1767 flags = sblock->sectors[0]->flags;
1769 if (flags & BTRFS_EXTENT_FLAG_DATA)
1770 ret = scrub_checksum_data(sblock);
1771 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1772 ret = scrub_checksum_tree_block(sblock);
1773 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1774 (void)scrub_checksum_super(sblock);
1778 scrub_handle_errored_block(sblock);
1783 static int scrub_checksum_data(struct scrub_block *sblock)
1785 struct scrub_ctx *sctx = sblock->sctx;
1786 struct btrfs_fs_info *fs_info = sctx->fs_info;
1787 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1788 u8 csum[BTRFS_CSUM_SIZE];
1789 struct scrub_sector *sector;
1792 BUG_ON(sblock->sector_count < 1);
1793 sector = sblock->sectors[0];
1794 if (!sector->have_csum)
1797 kaddr = page_address(sector->page);
1799 shash->tfm = fs_info->csum_shash;
1800 crypto_shash_init(shash);
1803 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector's
1804 * page only contains one sector of data.
1806 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1808 if (memcmp(csum, sector->csum, fs_info->csum_size))
1809 sblock->checksum_error = 1;
1810 return sblock->checksum_error;
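/*
 * A worked example for the tree block checksum below, with made-up numbers:
 * with a 16KiB nodesize and a 4KiB sectorsize, num_sectors = 4.  The hash
 * covers the first sector from offset BTRFS_CSUM_SIZE up to 4KiB plus the
 * three remaining sectors in full, i.e. nodesize - BTRFS_CSUM_SIZE bytes.
 */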
1813 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1815 struct scrub_ctx *sctx = sblock->sctx;
1816 struct btrfs_header *h;
1817 struct btrfs_fs_info *fs_info = sctx->fs_info;
1818 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1819 u8 calculated_csum[BTRFS_CSUM_SIZE];
1820 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1822 * This is done in sectorsize steps even for metadata as there's a
1823 * constraint for nodesize to be aligned to sectorsize. This will need
1824 * to change so we don't misuse data and metadata units like that.
1826 const u32 sectorsize = sctx->fs_info->sectorsize;
1827 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1829 struct scrub_sector *sector;
1832 BUG_ON(sblock->sector_count < 1);
1834 /* Each member in sectors is just one sector */
1835 ASSERT(sblock->sector_count == num_sectors);
1837 sector = sblock->sectors[0];
1838 kaddr = page_address(sector->page);
1839 h = (struct btrfs_header *)kaddr;
1840 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1843 * we don't use the getter functions here, as we
1844 * a) don't have an extent buffer and
1845 * b) the page is already kmapped
1847 if (sector->logical != btrfs_stack_header_bytenr(h))
1848 sblock->header_error = 1;
1850 if (sector->generation != btrfs_stack_header_generation(h)) {
1851 sblock->header_error = 1;
1852 sblock->generation_error = 1;
1855 if (!scrub_check_fsid(h->fsid, sector))
1856 sblock->header_error = 1;
1858 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1860 sblock->header_error = 1;
1862 shash->tfm = fs_info->csum_shash;
1863 crypto_shash_init(shash);
1864 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1865 sectorsize - BTRFS_CSUM_SIZE);
1867 for (i = 1; i < num_sectors; i++) {
1868 kaddr = page_address(sblock->sectors[i]->page);
1869 crypto_shash_update(shash, kaddr, sectorsize);
1872 crypto_shash_final(shash, calculated_csum);
1873 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1874 sblock->checksum_error = 1;
1876 return sblock->header_error || sblock->checksum_error;
1879 static int scrub_checksum_super(struct scrub_block *sblock)
1881 struct btrfs_super_block *s;
1882 struct scrub_ctx *sctx = sblock->sctx;
1883 struct btrfs_fs_info *fs_info = sctx->fs_info;
1884 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1885 u8 calculated_csum[BTRFS_CSUM_SIZE];
1886 struct scrub_sector *sector;
1891 BUG_ON(sblock->sector_count < 1);
1892 sector = sblock->sectors[0];
1893 kaddr = page_address(sector->page);
1894 s = (struct btrfs_super_block *)kaddr;
1896 if (sector->logical != btrfs_super_bytenr(s))
1899 if (sector->generation != btrfs_super_generation(s))
1902 if (!scrub_check_fsid(s->fsid, sector))
1905 shash->tfm = fs_info->csum_shash;
1906 crypto_shash_init(shash);
1907 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1908 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1910 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1913 if (fail_cor + fail_gen) {
1915 * If we find an error in a super block, we just report it.
1916 * Super blocks will get written with the next transaction commit
1919 spin_lock(&sctx->stat_lock);
1920 ++sctx->stat.super_errors;
1921 spin_unlock(&sctx->stat_lock);
1923 btrfs_dev_stat_inc_and_print(sector->dev,
1924 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1926 btrfs_dev_stat_inc_and_print(sector->dev,
1927 BTRFS_DEV_STAT_GENERATION_ERRS);
1930 return fail_cor + fail_gen;
1933 static void scrub_block_get(struct scrub_block *sblock)
1935 refcount_inc(&sblock->refs);
1938 static void scrub_block_put(struct scrub_block *sblock)
1940 if (refcount_dec_and_test(&sblock->refs)) {
1943 if (sblock->sparity)
1944 scrub_parity_put(sblock->sparity);
1946 for (i = 0; i < sblock->sector_count; i++)
1947 scrub_sector_put(sblock->sectors[i]);
1952 static void scrub_sector_get(struct scrub_sector *sector)
1954 atomic_inc(&sector->refs);
1957 static void scrub_sector_put(struct scrub_sector *sector)
1959 if (atomic_dec_and_test(&sector->refs)) {
1961 __free_page(sector->page);
1967 * Throttling of IO submission, bandwidth-limit based; the timeslice is 1
1968 * second. The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1970 static void scrub_throttle(struct scrub_ctx *sctx)
1972 const int time_slice = 1000;
1973 struct scrub_bio *sbio;
1974 struct btrfs_device *device;
1980 sbio = sctx->bios[sctx->curr];
1982 bwlimit = READ_ONCE(device->scrub_speed_max);
1987 * The slice is divided into intervals when the IO is submitted, adjusted by
1988 * bwlimit, with a maximum of 64 intervals.
1990 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1991 div = min_t(u32, 64, div);
1993 /* Start new epoch, set deadline */
1995 if (sctx->throttle_deadline == 0) {
1996 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1997 sctx->throttle_sent = 0;
2000 /* Still in the time to send? */
2001 if (ktime_before(now, sctx->throttle_deadline)) {
2002 /* If current bio is within the limit, send it */
2003 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2004 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2007 /* We're over the limit, sleep until the rest of the slice */
2008 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2010 /* New request after deadline, start new epoch */
2017 timeout = div_u64(delta * HZ, 1000);
2018 schedule_timeout_interruptible(timeout);
2021 /* Next call will start the deadline period */
2022 sctx->throttle_deadline = 0;
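/*
 * A worked example for the throttling above, with made-up numbers: with
 * scrub_speed_max set to 96MiB/s, div = min(64, 96MiB / 16MiB) = 6, so the
 * one second slice is split into ~166ms intervals and up to
 * bwlimit / div = 16MiB may be submitted per interval before sleeping.
 */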
2025 static void scrub_submit(struct scrub_ctx *sctx)
2027 struct scrub_bio *sbio;
2029 if (sctx->curr == -1)
2032 scrub_throttle(sctx);
2034 sbio = sctx->bios[sctx->curr];
2036 scrub_pending_bio_inc(sctx);
2037 btrfsic_check_bio(sbio->bio);
2038 submit_bio(sbio->bio);
2041 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2042 struct scrub_sector *sector)
2044 struct scrub_block *sblock = sector->sblock;
2045 struct scrub_bio *sbio;
2046 const u32 sectorsize = sctx->fs_info->sectorsize;
2051 * grab a fresh bio or wait for one to become available
2053 while (sctx->curr == -1) {
2054 spin_lock(&sctx->list_lock);
2055 sctx->curr = sctx->first_free;
2056 if (sctx->curr != -1) {
2057 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2058 sctx->bios[sctx->curr]->next_free = -1;
2059 sctx->bios[sctx->curr]->sector_count = 0;
2060 spin_unlock(&sctx->list_lock);
2062 spin_unlock(&sctx->list_lock);
2063 wait_event(sctx->list_wait, sctx->first_free != -1);
2066 sbio = sctx->bios[sctx->curr];
2067 if (sbio->sector_count == 0) {
2068 sbio->physical = sector->physical;
2069 sbio->logical = sector->logical;
2070 sbio->dev = sector->dev;
2072 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2073 REQ_OP_READ, GFP_NOFS);
2075 sbio->bio->bi_private = sbio;
2076 sbio->bio->bi_end_io = scrub_bio_end_io;
2077 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2079 } else if (sbio->physical + sbio->sector_count * sectorsize !=
2081 sbio->logical + sbio->sector_count * sectorsize !=
2083 sbio->dev != sector->dev) {
2088 sbio->sectors[sbio->sector_count] = sector;
2089 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2090 if (ret != sectorsize) {
2091 if (sbio->sector_count < 1) {
2100 scrub_block_get(sblock); /* one for the page added to the bio */
2101 atomic_inc(&sblock->outstanding_sectors);
2102 sbio->sector_count++;
2103 if (sbio->sector_count == sctx->sectors_per_bio)
2109 static void scrub_missing_raid56_end_io(struct bio *bio)
2111 struct scrub_block *sblock = bio->bi_private;
2112 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2115 sblock->no_io_error_seen = 0;
2119 queue_work(fs_info->scrub_workers, &sblock->work);
2122 static void scrub_missing_raid56_worker(struct work_struct *work)
2124 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2125 struct scrub_ctx *sctx = sblock->sctx;
2126 struct btrfs_fs_info *fs_info = sctx->fs_info;
2128 struct btrfs_device *dev;
2130 logical = sblock->sectors[0]->logical;
2131 dev = sblock->sectors[0]->dev;
2133 if (sblock->no_io_error_seen)
2134 scrub_recheck_block_checksum(sblock);
2136 if (!sblock->no_io_error_seen) {
2137 spin_lock(&sctx->stat_lock);
2138 sctx->stat.read_errors++;
2139 spin_unlock(&sctx->stat_lock);
2140 btrfs_err_rl_in_rcu(fs_info,
2141 "IO error rebuilding logical %llu for dev %s",
2142 logical, rcu_str_deref(dev->name));
2143 } else if (sblock->header_error || sblock->checksum_error) {
2144 spin_lock(&sctx->stat_lock);
2145 sctx->stat.uncorrectable_errors++;
2146 spin_unlock(&sctx->stat_lock);
2147 btrfs_err_rl_in_rcu(fs_info,
2148 "failed to rebuild valid logical %llu for dev %s",
2149 logical, rcu_str_deref(dev->name));
2151 scrub_write_block_to_dev_replace(sblock);
2154 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2155 mutex_lock(&sctx->wr_lock);
2156 scrub_wr_submit(sctx);
2157 mutex_unlock(&sctx->wr_lock);
2160 scrub_block_put(sblock);
2161 scrub_pending_bio_dec(sctx);
2164 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2166 struct scrub_ctx *sctx = sblock->sctx;
2167 struct btrfs_fs_info *fs_info = sctx->fs_info;
2168 u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2169 u64 logical = sblock->sectors[0]->logical;
2170 struct btrfs_io_context *bioc = NULL;
2172 struct btrfs_raid_bio *rbio;
2176 btrfs_bio_counter_inc_blocked(fs_info);
2177 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2179 if (ret || !bioc || !bioc->raid_map)
2182 if (WARN_ON(!sctx->is_dev_replace ||
2183 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2185 * We shouldn't be scrubbing a missing device. Even for dev
2186 * replace, we should only get here for RAID 5/6. We either
2187 * managed to mount something with no mirrors remaining or
2188 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2193 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2194 bio->bi_iter.bi_sector = logical >> 9;
2195 bio->bi_private = sblock;
2196 bio->bi_end_io = scrub_missing_raid56_end_io;
2198 rbio = raid56_alloc_missing_rbio(bio, bioc, length);
2202 for (i = 0; i < sblock->sector_count; i++) {
2203 struct scrub_sector *sector = sblock->sectors[i];
2206 * For now, our scrub is still one page per sector, so pgoff
2209 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2212 INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2213 scrub_block_get(sblock);
2214 scrub_pending_bio_inc(sctx);
2215 raid56_submit_missing_rbio(rbio);
2221 btrfs_bio_counter_dec(fs_info);
2222 btrfs_put_bioc(bioc);
2223 spin_lock(&sctx->stat_lock);
2224 sctx->stat.malloc_errors++;
2225 spin_unlock(&sctx->stat_lock);
2228 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2229 u64 physical, struct btrfs_device *dev, u64 flags,
2230 u64 gen, int mirror_num, u8 *csum,
2231 u64 physical_for_dev_replace)
2233 struct scrub_block *sblock;
2234 const u32 sectorsize = sctx->fs_info->sectorsize;
2237 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2239 spin_lock(&sctx->stat_lock);
2240 sctx->stat.malloc_errors++;
2241 spin_unlock(&sctx->stat_lock);
2245 /* one ref inside this function, plus one for each page added to
2247 refcount_set(&sblock->refs, 1);
2248 sblock->sctx = sctx;
2249 sblock->no_io_error_seen = 1;
2251 for (index = 0; len > 0; index++) {
2252 struct scrub_sector *sector;
2254 * Here we will allocate one page for one sector to scrub.
2255 * This is fine if PAGE_SIZE == sectorsize, but will cost
2256 * more memory for PAGE_SIZE > sectorsize case.
2258 u32 l = min(sectorsize, len);
2260 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2263 spin_lock(&sctx->stat_lock);
2264 sctx->stat.malloc_errors++;
2265 spin_unlock(&sctx->stat_lock);
2266 scrub_block_put(sblock);
2269 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2270 scrub_sector_get(sector);
2271 sblock->sectors[index] = sector;
2272 sector->sblock = sblock;
2274 sector->flags = flags;
2275 sector->generation = gen;
2276 sector->logical = logical;
2277 sector->physical = physical;
2278 sector->physical_for_dev_replace = physical_for_dev_replace;
2279 sector->mirror_num = mirror_num;
2281 sector->have_csum = 1;
2282 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2284 sector->have_csum = 0;
2286 sblock->sector_count++;
2287 sector->page = alloc_page(GFP_KERNEL);
2293 physical_for_dev_replace += l;
2296 WARN_ON(sblock->sector_count == 0);
2297 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2299 * This case should only be hit for RAID 5/6 device replace. See
2300 * the comment in scrub_missing_raid56_pages() for details.
2302 scrub_missing_raid56_pages(sblock);
2304 for (index = 0; index < sblock->sector_count; index++) {
2305 struct scrub_sector *sector = sblock->sectors[index];
2308 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2310 scrub_block_put(sblock);
2315 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2319 /* last one frees, either here or in bio completion for last page */
2320 scrub_block_put(sblock);
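/*
 * A small worked example of what the above builds, assuming a 4KiB
 * sectorsize and a 16KiB metadata block: one scrub_block is allocated
 * with four scrub_sectors, each backed by its own page from
 * alloc_page(), and each sector is queued into the shared read bio via
 * scrub_add_sector_to_rd_bio().  The scrub_block_put() above drops this
 * function's reference, so whoever drops the last reference (here or in
 * bio completion) frees the block.
 */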
2324 static void scrub_bio_end_io(struct bio *bio)
2326 struct scrub_bio *sbio = bio->bi_private;
2327 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2329 sbio->status = bio->bi_status;
2332 queue_work(fs_info->scrub_workers, &sbio->work);
2335 static void scrub_bio_end_io_worker(struct work_struct *work)
2337 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2338 struct scrub_ctx *sctx = sbio->sctx;
2341 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2343 for (i = 0; i < sbio->sector_count; i++) {
2344 struct scrub_sector *sector = sbio->sectors[i];
2346 sector->io_error = 1;
2347 sector->sblock->no_io_error_seen = 0;
2351 /* Now complete the scrub_block items that have all pages completed */
2352 for (i = 0; i < sbio->sector_count; i++) {
2353 struct scrub_sector *sector = sbio->sectors[i];
2354 struct scrub_block *sblock = sector->sblock;
2356 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2357 scrub_block_complete(sblock);
2358 scrub_block_put(sblock);
2363 spin_lock(&sctx->list_lock);
2364 sbio->next_free = sctx->first_free;
2365 sctx->first_free = sbio->index;
2366 spin_unlock(&sctx->list_lock);
2368 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2369 mutex_lock(&sctx->wr_lock);
2370 scrub_wr_submit(sctx);
2371 mutex_unlock(&sctx->wr_lock);
2374 scrub_pending_bio_dec(sctx);
2377 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2378 unsigned long *bitmap,
2383 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2385 if (len >= sparity->stripe_len) {
2386 bitmap_set(bitmap, 0, sparity->nsectors);
2390 start -= sparity->logic_start;
2391 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2392 offset = offset >> sectorsize_bits;
2393 nsectors = len >> sectorsize_bits;
2395 if (offset + nsectors <= sparity->nsectors) {
2396 bitmap_set(bitmap, offset, nsectors);
2400 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2401 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
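/*
 * Worked example of the wrap-around above, assuming a 64KiB stripe_len
 * and 4KiB sectorsize (sparity->nsectors == 16): marking a 32KiB range
 * that starts 48KiB into the stripe gives offset == 12 and a length of
 * 8 sectors, so 12 + 8 > 16 and the two bitmap_set() calls mark sectors
 * 12..15 and then wrap around to sectors 0..3.
 */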
2404 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2407 __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2410 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2413 __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2416 static void scrub_block_complete(struct scrub_block *sblock)
2420 if (!sblock->no_io_error_seen) {
2422 scrub_handle_errored_block(sblock);
2425 * If the block has a checksum error, write it via the repair mechanism in
2426 * the dev replace case, otherwise write it here in the dev replace
2429 corrupted = scrub_checksum(sblock);
2430 if (!corrupted && sblock->sctx->is_dev_replace)
2431 scrub_write_block_to_dev_replace(sblock);
2434 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2435 u64 start = sblock->sectors[0]->logical;
2436 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2437 sblock->sctx->fs_info->sectorsize;
2439 ASSERT(end - start <= U32_MAX);
2440 scrub_parity_mark_sectors_error(sblock->sparity,
2441 start, end - start);
2445 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2447 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2448 list_del(&sum->list);
2453 * Find the desired csum for range [logical, logical + sectorsize), and store
2454 * the csum into @csum.
2456 * The search source is sctx->csum_list, which is a pre-populated list
2457 * storing bytenr ordered csum ranges. We're responsible for cleaning up any range
2458 * that is before @logical.
2460 * Return 0 if there is no csum for the range.
2461 * Return 1 if there is a csum for the range, which is copied to @csum.
2463 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2467 while (!list_empty(&sctx->csum_list)) {
2468 struct btrfs_ordered_sum *sum = NULL;
2469 unsigned long index;
2470 unsigned long num_sectors;
2472 sum = list_first_entry(&sctx->csum_list,
2473 struct btrfs_ordered_sum, list);
2474 /* The current csum range is beyond our range, no csum found */
2475 if (sum->bytenr > logical)
2479 * The current sum is before our bytenr, since scrub is always
2480 * done in bytenr order, the csum will never be used anymore,
2481 * clean it up so that later calls won't bother with the range,
2482 * and continue searching the next range.
2484 if (sum->bytenr + sum->len <= logical) {
2485 drop_csum_range(sctx, sum);
2489 /* Now the csum range covers our bytenr, copy the csum */
2491 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2492 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2494 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2495 sctx->fs_info->csum_size);
2497 /* Cleanup the range if we're at the end of the csum range */
2498 if (index == num_sectors - 1)
2499 drop_csum_range(sctx, sum);
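/*
 * Example of how the pre-populated list is consumed, assuming a 4KiB
 * sectorsize and a single btrfs_ordered_sum covering [X, X + 128K):
 * lookups for the first 31 sectors each copy one csum (index 0..30) and
 * keep the entry on csum_list; the lookup for the last sector
 * (index == num_sectors - 1, i.e. 31) copies its csum and then drops
 * the whole range via drop_csum_range().  A lookup past the end of a
 * range drops the stale entry and keeps searching, while a lookup
 * before the first range finds no csum at all.
 */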
2507 /* scrub_extent() tries to collect up to 64 KiB for each bio */
2508 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2509 u64 logical, u32 len,
2510 u64 physical, struct btrfs_device *dev, u64 flags,
2511 u64 gen, int mirror_num)
2513 struct btrfs_device *src_dev = dev;
2514 u64 src_physical = physical;
2515 int src_mirror = mirror_num;
2517 u8 csum[BTRFS_CSUM_SIZE];
2520 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2521 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522 blocksize = map->stripe_len;
2524 blocksize = sctx->fs_info->sectorsize;
2525 spin_lock(&sctx->stat_lock);
2526 sctx->stat.data_extents_scrubbed++;
2527 sctx->stat.data_bytes_scrubbed += len;
2528 spin_unlock(&sctx->stat_lock);
2529 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2530 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2531 blocksize = map->stripe_len;
2533 blocksize = sctx->fs_info->nodesize;
2534 spin_lock(&sctx->stat_lock);
2535 sctx->stat.tree_extents_scrubbed++;
2536 sctx->stat.tree_bytes_scrubbed += len;
2537 spin_unlock(&sctx->stat_lock);
2539 blocksize = sctx->fs_info->sectorsize;
2544 * For the dev-replace case, @dev can be a missing device.
2545 * Regular scrub avoids running on a missing device at all,
2546 * as that would trigger tons of read errors.
2548 * Reading from a missing device would cause the read error counts to
2549 * increase unnecessarily.
2550 * So here we change the read source to a good mirror.
2552 if (sctx->is_dev_replace && !dev->bdev)
2553 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2554 &src_dev, &src_mirror);
2556 u32 l = min(len, blocksize);
2559 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2560 /* push csums to sbio */
2561 have_csum = scrub_find_csum(sctx, logical, csum);
2563 ++sctx->stat.no_csum;
2565 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2566 flags, gen, src_mirror,
2567 have_csum ? csum : NULL, physical);
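/*
 * Example of the splitting above, assuming a 4KiB sectorsize and a
 * non-RAID56 profile: a 16KiB data extent is cut into four
 * blocksize (== sectorsize) pieces, each with its own
 * scrub_find_csum() lookup and scrub_sectors() call; the resulting
 * sectors are then batched into the shared read bio (up to
 * SCRUB_SECTORS_PER_BIO of them) before that bio is submitted.
 */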
2578 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2579 u64 logical, u32 len,
2580 u64 physical, struct btrfs_device *dev,
2581 u64 flags, u64 gen, int mirror_num, u8 *csum)
2583 struct scrub_ctx *sctx = sparity->sctx;
2584 struct scrub_block *sblock;
2585 const u32 sectorsize = sctx->fs_info->sectorsize;
2588 ASSERT(IS_ALIGNED(len, sectorsize));
2590 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2592 spin_lock(&sctx->stat_lock);
2593 sctx->stat.malloc_errors++;
2594 spin_unlock(&sctx->stat_lock);
2598 /* one ref inside this function, plus one for each page added to
2600 refcount_set(&sblock->refs, 1);
2601 sblock->sctx = sctx;
2602 sblock->no_io_error_seen = 1;
2603 sblock->sparity = sparity;
2604 scrub_parity_get(sparity);
2606 for (index = 0; len > 0; index++) {
2607 struct scrub_sector *sector;
2609 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2612 spin_lock(&sctx->stat_lock);
2613 sctx->stat.malloc_errors++;
2614 spin_unlock(&sctx->stat_lock);
2615 scrub_block_put(sblock);
2618 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2619 /* For scrub block */
2620 scrub_sector_get(sector);
2621 sblock->sectors[index] = sector;
2622 /* For scrub parity */
2623 scrub_sector_get(sector);
2624 list_add_tail(&sector->list, &sparity->sectors_list);
2625 sector->sblock = sblock;
2627 sector->flags = flags;
2628 sector->generation = gen;
2629 sector->logical = logical;
2630 sector->physical = physical;
2631 sector->mirror_num = mirror_num;
2633 sector->have_csum = 1;
2634 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2636 sector->have_csum = 0;
2638 sblock->sector_count++;
2639 sector->page = alloc_page(GFP_KERNEL);
2644 /* Iterate over the stripe range in sectorsize steps */
2646 logical += sectorsize;
2647 physical += sectorsize;
2650 WARN_ON(sblock->sector_count == 0);
2651 for (index = 0; index < sblock->sector_count; index++) {
2652 struct scrub_sector *sector = sblock->sectors[index];
2655 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2657 scrub_block_put(sblock);
2662 /* Last one frees, either here or in bio completion for last sector */
2663 scrub_block_put(sblock);
2667 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2668 u64 logical, u32 len,
2669 u64 physical, struct btrfs_device *dev,
2670 u64 flags, u64 gen, int mirror_num)
2672 struct scrub_ctx *sctx = sparity->sctx;
2674 u8 csum[BTRFS_CSUM_SIZE];
2677 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2678 scrub_parity_mark_sectors_error(sparity, logical, len);
2682 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2683 blocksize = sparity->stripe_len;
2684 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2685 blocksize = sparity->stripe_len;
2687 blocksize = sctx->fs_info->sectorsize;
2692 u32 l = min(len, blocksize);
2695 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2696 /* push csums to sbio */
2697 have_csum = scrub_find_csum(sctx, logical, csum);
2701 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2702 flags, gen, mirror_num,
2703 have_csum ? csum : NULL);
2715 * Given a physical address, this will calculate its
2716 * logical offset. If this is a parity stripe, it will return
2717 * the left-most data stripe's logical offset.
2719 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2721 static int get_raid56_logic_offset(u64 physical, int num,
2722 struct map_lookup *map, u64 *offset,
2731 const int data_stripes = nr_data_stripes(map);
2733 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2735 *stripe_start = last_offset;
2737 *offset = last_offset;
2738 for (i = 0; i < data_stripes; i++) {
2739 *offset = last_offset + i * map->stripe_len;
2741 stripe_nr = div64_u64(*offset, map->stripe_len);
2742 stripe_nr = div_u64(stripe_nr, data_stripes);
2744 /* Work out the disk rotation on this stripe-set */
2745 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2746 /* Calculate which stripe this data is located on */
2748 stripe_index = rot % map->num_stripes;
2749 if (stripe_index == num)
2751 if (stripe_index < num)
2754 *offset = last_offset + j * map->stripe_len;
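/*
 * Worked example, assuming RAID5 over 3 devices with a 64KiB stripe_len
 * (2 data stripes plus one rotating parity stripe), and assuming the
 * usual btrfs RAID56 rotation where data stripe i of full stripe N sits
 * on device (N + i) % num_stripes:
 *
 * - physical 64KiB into device 1's extent (num == 1): last_offset is
 *   64K * 2 == 128K, iteration i == 0 hits stripe_index == num, so
 *   *offset == 128K and the function returns 0 (data stripe).
 *
 * - the same 64KiB offset on device 0 (num == 0): neither i == 0 nor
 *   i == 1 maps back to device 0, so the function returns 1, i.e. this
 *   position holds parity for the full stripe starting at logical
 *   offset 128K within the chunk.
 */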
2758 static void scrub_free_parity(struct scrub_parity *sparity)
2760 struct scrub_ctx *sctx = sparity->sctx;
2761 struct scrub_sector *curr, *next;
2764 nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2766 spin_lock(&sctx->stat_lock);
2767 sctx->stat.read_errors += nbits;
2768 sctx->stat.uncorrectable_errors += nbits;
2769 spin_unlock(&sctx->stat_lock);
2772 list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2773 list_del_init(&curr->list);
2774 scrub_sector_put(curr);
2780 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2782 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2784 struct scrub_ctx *sctx = sparity->sctx;
2786 scrub_free_parity(sparity);
2787 scrub_pending_bio_dec(sctx);
2790 static void scrub_parity_bio_endio(struct bio *bio)
2792 struct scrub_parity *sparity = bio->bi_private;
2793 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2796 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2797 &sparity->dbitmap, sparity->nsectors);
2801 INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2802 queue_work(fs_info->scrub_parity_workers, &sparity->work);
2805 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2807 struct scrub_ctx *sctx = sparity->sctx;
2808 struct btrfs_fs_info *fs_info = sctx->fs_info;
2810 struct btrfs_raid_bio *rbio;
2811 struct btrfs_io_context *bioc = NULL;
2815 if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2816 &sparity->ebitmap, sparity->nsectors))
2819 length = sparity->logic_end - sparity->logic_start;
2821 btrfs_bio_counter_inc_blocked(fs_info);
2822 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2824 if (ret || !bioc || !bioc->raid_map)
2827 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2828 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2829 bio->bi_private = sparity;
2830 bio->bi_end_io = scrub_parity_bio_endio;
2832 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
2839 scrub_pending_bio_inc(sctx);
2840 raid56_parity_submit_scrub_rbio(rbio);
2846 btrfs_bio_counter_dec(fs_info);
2847 btrfs_put_bioc(bioc);
2848 bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2850 spin_lock(&sctx->stat_lock);
2851 sctx->stat.malloc_errors++;
2852 spin_unlock(&sctx->stat_lock);
2854 scrub_free_parity(sparity);
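/*
 * Example of the dbitmap/ebitmap interplay above, assuming 4 sectors
 * per stripe: with dbitmap == 0b1111 (all four sectors carry data) and
 * ebitmap == 0b0010 (sector 1 could not be read or repaired), the
 * bitmap_andnot() leaves dbitmap == 0b1101, so parity is only checked
 * and rewritten against sectors known to be good; if nothing is left
 * after the subtraction, the check is skipped entirely.
 */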
2857 static void scrub_parity_get(struct scrub_parity *sparity)
2859 refcount_inc(&sparity->refs);
2862 static void scrub_parity_put(struct scrub_parity *sparity)
2864 if (!refcount_dec_and_test(&sparity->refs))
2867 scrub_parity_check_and_repair(sparity);
2871 * Return 0 if the extent item range covers any byte of the range.
2872 * Return <0 if the extent item is before @search_start.
2873 * Return >0 if the extent item is after @search_start + @search_len.
2875 static int compare_extent_item_range(struct btrfs_path *path,
2876 u64 search_start, u64 search_len)
2878 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2880 struct btrfs_key key;
2882 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2883 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2884 key.type == BTRFS_METADATA_ITEM_KEY);
2885 if (key.type == BTRFS_METADATA_ITEM_KEY)
2886 len = fs_info->nodesize;
2890 if (key.objectid + len <= search_start)
2892 if (key.objectid >= search_start + search_len)
2898 * Locate one extent item which covers any byte in range
2899 * [@search_start, @search_start + @search_length)
2901 * If the path is not initialized, we will initialize the search by doing
2902 * a btrfs_search_slot().
2903 * If the path is already initialized, we will use the path as the initial
2904 * slot, to avoid duplicated btrfs_search_slot() calls.
2906 * NOTE: If an extent item starts before @search_start, we will still
2907 * return the extent item. This is for data extents that cross stripe boundaries.
2909 * Return 0 if we found such extent item, and @path will point to the extent item.
2910 * Return >0 if no such extent item can be found, and @path will be released.
2911 * Return <0 if hit fatal error, and @path will be released.
2913 static int find_first_extent_item(struct btrfs_root *extent_root,
2914 struct btrfs_path *path,
2915 u64 search_start, u64 search_len)
2917 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2918 struct btrfs_key key;
2921 /* Continue using the existing path */
2923 goto search_forward;
2925 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2926 key.type = BTRFS_METADATA_ITEM_KEY;
2928 key.type = BTRFS_EXTENT_ITEM_KEY;
2929 key.objectid = search_start;
2930 key.offset = (u64)-1;
2932 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2938 * Here we intentionally pass 0 as @min_objectid, as there could be
2939 * an extent item starting before @search_start.
2941 ret = btrfs_previous_extent_item(extent_root, path, 0);
2945 * No matter whether we have found an extent item, the next loop will
2946 * properly do every check on the key.
2950 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2951 if (key.objectid >= search_start + search_len)
2953 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2954 key.type != BTRFS_EXTENT_ITEM_KEY)
2957 ret = compare_extent_item_range(path, search_start, search_len);
2964 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2965 ret = btrfs_next_leaf(extent_root, path);
2967 /* Either no more item or fatal error */
2968 btrfs_release_path(path);
2973 btrfs_release_path(path);
2977 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2978 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2980 struct btrfs_key key;
2981 struct btrfs_extent_item *ei;
2983 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2984 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2985 key.type == BTRFS_EXTENT_ITEM_KEY);
2986 *extent_start_ret = key.objectid;
2987 if (key.type == BTRFS_METADATA_ITEM_KEY)
2988 *size_ret = path->nodes[0]->fs_info->nodesize;
2990 *size_ret = key.offset;
2991 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2992 *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2993 *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
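/*
 * A minimal sketch of the lookup pattern the two helpers above serve
 * (the scrub loops below follow the same shape); @cur, @range_start,
 * @range_len and @scrubbed_len are illustrative names and error
 * handling is omitted:
 *
 *	while (cur < range_start + range_len) {
 *		ret = find_first_extent_item(extent_root, path, cur,
 *					     range_start + range_len - cur);
 *		if (ret > 0)
 *			break;	(no more extent items in the range)
 *		get_extent_info(path, &extent_start, &extent_size,
 *				&extent_flags, &extent_gen);
 *		(scrub [max(extent_start, cur), ...))
 *		cur = max(extent_start, cur) + scrubbed_len;
 *	}
 *	btrfs_release_path(path);
 */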
2996 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2997 u64 boundary_start, u64 boundary_len)
2999 return (extent_start < boundary_start &&
3000 extent_start + extent_len > boundary_start) ||
3001 (extent_start < boundary_start + boundary_len &&
3002 extent_start + extent_len > boundary_start + boundary_len);
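/*
 * Example, for a boundary of [64K, 128K): an extent [96K, 112K) is
 * fully inside and does not cross; [120K, 136K) crosses the right edge
 * (120K < 128K && 136K > 128K); [48K, 80K) crosses the left edge
 * (48K < 64K && 80K > 64K).
 */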
3005 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
3006 struct scrub_parity *sparity,
3007 struct map_lookup *map,
3008 struct btrfs_device *sdev,
3009 struct btrfs_path *path,
3012 struct btrfs_fs_info *fs_info = sctx->fs_info;
3013 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3014 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3015 u64 cur_logical = logical;
3018 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3020 /* Path must not be populated */
3021 ASSERT(!path->nodes[0]);
3023 while (cur_logical < logical + map->stripe_len) {
3024 struct btrfs_io_context *bioc = NULL;
3025 struct btrfs_device *extent_dev;
3031 u64 extent_physical;
3032 u64 extent_mirror_num;
3034 ret = find_first_extent_item(extent_root, path, cur_logical,
3035 logical + map->stripe_len - cur_logical);
3036 /* No more extent item in this data stripe */
3043 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3046 /* Metadata should not cross stripe boundaries */
3047 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3048 does_range_cross_boundary(extent_start, extent_size,
3049 logical, map->stripe_len)) {
3051 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3052 extent_start, logical);
3053 spin_lock(&sctx->stat_lock);
3054 sctx->stat.uncorrectable_errors++;
3055 spin_unlock(&sctx->stat_lock);
3056 cur_logical += extent_size;
3060 /* Skip hole range which doesn't have any extent */
3061 cur_logical = max(extent_start, cur_logical);
3063 /* Truncate the range inside this data stripe */
3064 extent_size = min(extent_start + extent_size,
3065 logical + map->stripe_len) - cur_logical;
3066 extent_start = cur_logical;
3067 ASSERT(extent_size <= U32_MAX);
3069 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3071 mapped_length = extent_size;
3072 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3073 &mapped_length, &bioc, 0);
3074 if (!ret && (!bioc || mapped_length < extent_size))
3077 btrfs_put_bioc(bioc);
3078 scrub_parity_mark_sectors_error(sparity, extent_start,
3082 extent_physical = bioc->stripes[0].physical;
3083 extent_mirror_num = bioc->mirror_num;
3084 extent_dev = bioc->stripes[0].dev;
3085 btrfs_put_bioc(bioc);
3087 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3088 extent_start + extent_size - 1,
3089 &sctx->csum_list, 1);
3091 scrub_parity_mark_sectors_error(sparity, extent_start,
3096 ret = scrub_extent_for_parity(sparity, extent_start,
3097 extent_size, extent_physical,
3098 extent_dev, extent_flags,
3099 extent_gen, extent_mirror_num);
3100 scrub_free_csums(sctx);
3103 scrub_parity_mark_sectors_error(sparity, extent_start,
3109 cur_logical += extent_size;
3111 btrfs_release_path(path);
3115 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3116 struct map_lookup *map,
3117 struct btrfs_device *sdev,
3121 struct btrfs_fs_info *fs_info = sctx->fs_info;
3122 struct btrfs_path *path;
3125 struct scrub_parity *sparity;
3128 path = btrfs_alloc_path();
3130 spin_lock(&sctx->stat_lock);
3131 sctx->stat.malloc_errors++;
3132 spin_unlock(&sctx->stat_lock);
3135 path->search_commit_root = 1;
3136 path->skip_locking = 1;
3138 ASSERT(map->stripe_len <= U32_MAX);
3139 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3140 ASSERT(nsectors <= BITS_PER_LONG);
3141 sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3143 spin_lock(&sctx->stat_lock);
3144 sctx->stat.malloc_errors++;
3145 spin_unlock(&sctx->stat_lock);
3146 btrfs_free_path(path);
3150 ASSERT(map->stripe_len <= U32_MAX);
3151 sparity->stripe_len = map->stripe_len;
3152 sparity->nsectors = nsectors;
3153 sparity->sctx = sctx;
3154 sparity->scrub_dev = sdev;
3155 sparity->logic_start = logic_start;
3156 sparity->logic_end = logic_end;
3157 refcount_set(&sparity->refs, 1);
3158 INIT_LIST_HEAD(&sparity->sectors_list);
3161 for (cur_logical = logic_start; cur_logical < logic_end;
3162 cur_logical += map->stripe_len) {
3163 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3164 sdev, path, cur_logical);
3169 scrub_parity_put(sparity);
3171 mutex_lock(&sctx->wr_lock);
3172 scrub_wr_submit(sctx);
3173 mutex_unlock(&sctx->wr_lock);
3175 btrfs_free_path(path);
3176 return ret < 0 ? ret : 0;
3179 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3181 if (!btrfs_is_zoned(sctx->fs_info))
3184 sctx->flush_all_writes = true;
3186 mutex_lock(&sctx->wr_lock);
3187 scrub_wr_submit(sctx);
3188 mutex_unlock(&sctx->wr_lock);
3190 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3193 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3194 u64 physical, u64 physical_end)
3196 struct btrfs_fs_info *fs_info = sctx->fs_info;
3199 if (!btrfs_is_zoned(fs_info))
3202 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3204 mutex_lock(&sctx->wr_lock);
3205 if (sctx->write_pointer < physical_end) {
3206 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3208 sctx->write_pointer);
3211 "zoned: failed to recover write pointer");
3213 mutex_unlock(&sctx->wr_lock);
3214 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3220 * Scrub one range which can only have a simple mirror based profile.
3221 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3224 * Since we may need to handle a subset of a block group, we need the @logical_start
3225 * and @logical_length parameters.
3227 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3228 struct btrfs_root *extent_root,
3229 struct btrfs_root *csum_root,
3230 struct btrfs_block_group *bg,
3231 struct map_lookup *map,
3232 u64 logical_start, u64 logical_length,
3233 struct btrfs_device *device,
3234 u64 physical, int mirror_num)
3236 struct btrfs_fs_info *fs_info = sctx->fs_info;
3237 const u64 logical_end = logical_start + logical_length;
3238 /* An artificial limit, inherited from the old scrub behavior */
3239 const u32 max_length = SZ_64K;
3240 struct btrfs_path path = { 0 };
3241 u64 cur_logical = logical_start;
3244 /* The range must be inside the bg */
3245 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3247 path.search_commit_root = 1;
3248 path.skip_locking = 1;
3249 /* Go through each extent item inside the logical range */
3250 while (cur_logical < logical_end) {
3258 if (atomic_read(&fs_info->scrub_cancel_req) ||
3259 atomic_read(&sctx->cancel_req)) {
3264 if (atomic_read(&fs_info->scrub_pause_req)) {
3265 /* Push queued extents */
3266 sctx->flush_all_writes = true;
3268 mutex_lock(&sctx->wr_lock);
3269 scrub_wr_submit(sctx);
3270 mutex_unlock(&sctx->wr_lock);
3271 wait_event(sctx->list_wait,
3272 atomic_read(&sctx->bios_in_flight) == 0);
3273 sctx->flush_all_writes = false;
3274 scrub_blocked_if_needed(fs_info);
3276 /* Block group removed? */
3277 spin_lock(&bg->lock);
3279 spin_unlock(&bg->lock);
3283 spin_unlock(&bg->lock);
3285 ret = find_first_extent_item(extent_root, &path, cur_logical,
3286 logical_end - cur_logical);
3288 /* No more extent, just update the accounting */
3289 sctx->stat.last_physical = physical + logical_length;
3295 get_extent_info(&path, &extent_start, &extent_len,
3296 &extent_flags, &extent_gen);
3297 /* Skip hole range which doesn't have any extent */
3298 cur_logical = max(extent_start, cur_logical);
3301 * Scrub len has three limits:
3302 * - Extent size limit
3303 * - Scrub range limit
3304 * This is especially important for RAID0/RAID10 to reuse
3306 * - Max scrub size limit
3308 scrub_len = min(min(extent_start + extent_len,
3309 logical_end), cur_logical + max_length) -
3312 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3313 ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3314 cur_logical + scrub_len - 1,
3315 &sctx->csum_list, 1);
3319 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3320 does_range_cross_boundary(extent_start, extent_len,
3321 logical_start, logical_length)) {
3323 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3324 extent_start, logical_start, logical_end);
3325 spin_lock(&sctx->stat_lock);
3326 sctx->stat.uncorrectable_errors++;
3327 spin_unlock(&sctx->stat_lock);
3328 cur_logical += scrub_len;
3331 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3332 cur_logical - logical_start + physical,
3333 device, extent_flags, extent_gen,
3335 scrub_free_csums(sctx);
3338 if (sctx->is_dev_replace)
3339 sync_replace_for_zoned(sctx);
3340 cur_logical += scrub_len;
3341 /* Don't hold the CPU for too long */
3344 btrfs_release_path(&path);
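/*
 * Example pass of the loop above, assuming a 256KiB data extent at the
 * start of the range and a far-away @logical_end: each iteration is
 * capped by max_length to 64KiB, so the extent is scrubbed as four
 * 64KiB pieces, each with its own csum lookup and scrub_extent() call,
 * before the loop moves on to the next extent item.
 */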
3348 /* Calculate the full stripe length for simple stripe based profiles */
3349 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3351 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3352 BTRFS_BLOCK_GROUP_RAID10));
3354 return map->num_stripes / map->sub_stripes * map->stripe_len;
3357 /* Get the logical bytenr for the stripe */
3358 static u64 simple_stripe_get_logical(struct map_lookup *map,
3359 struct btrfs_block_group *bg,
3362 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3363 BTRFS_BLOCK_GROUP_RAID10));
3364 ASSERT(stripe_index < map->num_stripes);
3367 * (stripe_index / sub_stripes) gives how many data stripes we need to
3370 return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3373 /* Get the mirror number for the stripe */
3374 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3376 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3377 BTRFS_BLOCK_GROUP_RAID10));
3378 ASSERT(stripe_index < map->num_stripes);
3380 /* For RAID0 it's always 1, for RAID10 it alternates 1,2,1,2,... */
3381 return stripe_index % map->sub_stripes + 1;
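/*
 * Worked example for the three helpers above, assuming RAID10 over four
 * devices with sub_stripes == 2 and a 64KiB stripe_len:
 * - full stripe length = 4 / 2 * 64K = 128K
 * - stripe_index == 3  => logical = (3 / 2) * 64K + bg->start
 *                                 = bg->start + 64K
 * - mirror_num         = 3 % 2 + 1 = 2
 * For RAID0 sub_stripes is 1, so mirror_num is always 1 and the logical
 * offset simply advances one stripe per device.
 */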
3384 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3385 struct btrfs_root *extent_root,
3386 struct btrfs_root *csum_root,
3387 struct btrfs_block_group *bg,
3388 struct map_lookup *map,
3389 struct btrfs_device *device,
3392 const u64 logical_increment = simple_stripe_full_stripe_len(map);
3393 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3394 const u64 orig_physical = map->stripes[stripe_index].physical;
3395 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3396 u64 cur_logical = orig_logical;
3397 u64 cur_physical = orig_physical;
3400 while (cur_logical < bg->start + bg->length) {
3402 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3403 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3406 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3407 cur_logical, map->stripe_len, device,
3408 cur_physical, mirror_num);
3411 /* Skip to next stripe which belongs to the target device */
3412 cur_logical += logical_increment;
3413 /* For physical offset, we just go to next stripe */
3414 cur_physical += map->stripe_len;
3419 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3420 struct btrfs_block_group *bg,
3421 struct map_lookup *map,
3422 struct btrfs_device *scrub_dev,
3423 int stripe_index, u64 dev_extent_len)
3425 struct btrfs_path *path;
3426 struct btrfs_fs_info *fs_info = sctx->fs_info;
3427 struct btrfs_root *root;
3428 struct btrfs_root *csum_root;
3429 struct blk_plug plug;
3430 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3431 const u64 chunk_logical = bg->start;
3433 u64 physical = map->stripes[stripe_index].physical;
3434 const u64 physical_end = physical + dev_extent_len;
3437 /* The logical increment after finishing one stripe */
3439 /* Offset inside the chunk */
3445 path = btrfs_alloc_path();
3450 * work on commit root. The related disk blocks are static as
3451 * long as COW is applied. This means it is safe to rewrite
3452 * them to repair disk errors without any race conditions
3454 path->search_commit_root = 1;
3455 path->skip_locking = 1;
3456 path->reada = READA_FORWARD;
3458 wait_event(sctx->list_wait,
3459 atomic_read(&sctx->bios_in_flight) == 0);
3460 scrub_blocked_if_needed(fs_info);
3462 root = btrfs_extent_root(fs_info, bg->start);
3463 csum_root = btrfs_csum_root(fs_info, bg->start);
3466 * collect all data csums for the stripe to avoid seeking during
3467 * the scrub. This might currently (crc32) end up being about 1MiB
3469 blk_start_plug(&plug);
3471 if (sctx->is_dev_replace &&
3472 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3473 mutex_lock(&sctx->wr_lock);
3474 sctx->write_pointer = physical;
3475 mutex_unlock(&sctx->wr_lock);
3476 sctx->flush_all_writes = true;
3480 * There used to be a big double loop to handle all profiles using the
3481 * same routine, which grew larger and more gross over time.
3483 * So here we handle each profile differently, so that simpler profiles
3484 * have a simpler scrubbing function.
3486 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3487 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3489 * The above check rules out all complex profiles, the remaining
3490 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
3491 * mirrored duplication without striping.
3493 * Only @physical and @mirror_num need to be calculated using
3496 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3497 bg->start, bg->length, scrub_dev,
3498 map->stripes[stripe_index].physical,
3503 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3504 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3505 scrub_dev, stripe_index);
3506 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3510 /* Only RAID56 goes through the old code */
3511 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3514 /* Calculate the logical end of the stripe */
3515 get_raid56_logic_offset(physical_end, stripe_index,
3516 map, &logic_end, NULL);
3517 logic_end += chunk_logical;
3519 /* Initialize @offset in case we need to go to out: label */
3520 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3521 increment = map->stripe_len * nr_data_stripes(map);
3524 * Due to the rotation, for RAID56 it's better to iterate each stripe
3525 * using its physical offset.
3527 while (physical < physical_end) {
3528 ret = get_raid56_logic_offset(physical, stripe_index, map,
3529 &logical, &stripe_logical);
3530 logical += chunk_logical;
3532 /* It is a parity stripe */
3533 stripe_logical += chunk_logical;
3534 stripe_end = stripe_logical + increment;
3535 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3544 * Now we're at a data stripe, scrub each extent in the range.
3546 * At this stage, if we ignore the repair part, inside each data
3547 * stripe it is no different than SINGLE profile.
3548 * We can reuse scrub_simple_mirror() here, as the repair part
3549 * is still based on @mirror_num.
3551 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3552 logical, map->stripe_len,
3553 scrub_dev, physical, 1);
3557 logical += increment;
3558 physical += map->stripe_len;
3559 spin_lock(&sctx->stat_lock);
3561 sctx->stat.last_physical = map->stripes[stripe_index].physical +
3564 sctx->stat.last_physical = physical;
3565 spin_unlock(&sctx->stat_lock);
3570 /* push queued extents */
3572 mutex_lock(&sctx->wr_lock);
3573 scrub_wr_submit(sctx);
3574 mutex_unlock(&sctx->wr_lock);
3576 blk_finish_plug(&plug);
3577 btrfs_free_path(path);
3579 if (sctx->is_dev_replace && ret >= 0) {
3582 ret2 = sync_write_pointer_for_zoned(sctx,
3583 chunk_logical + offset,
3584 map->stripes[stripe_index].physical,
3590 return ret < 0 ? ret : 0;
3593 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3594 struct btrfs_block_group *bg,
3595 struct btrfs_device *scrub_dev,
3599 struct btrfs_fs_info *fs_info = sctx->fs_info;
3600 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3601 struct map_lookup *map;
3602 struct extent_map *em;
3606 read_lock(&map_tree->lock);
3607 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3608 read_unlock(&map_tree->lock);
3612 * Might have been an unused block group deleted by the cleaner
3613 * kthread or relocation.
3615 spin_lock(&bg->lock);
3618 spin_unlock(&bg->lock);
3622 if (em->start != bg->start)
3624 if (em->len < dev_extent_len)
3627 map = em->map_lookup;
3628 for (i = 0; i < map->num_stripes; ++i) {
3629 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3630 map->stripes[i].physical == dev_offset) {
3631 ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
3638 free_extent_map(em);
3643 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3644 struct btrfs_block_group *cache)
3646 struct btrfs_fs_info *fs_info = cache->fs_info;
3647 struct btrfs_trans_handle *trans;
3649 if (!btrfs_is_zoned(fs_info))
3652 btrfs_wait_block_group_reservations(cache);
3653 btrfs_wait_nocow_writers(cache);
3654 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3656 trans = btrfs_join_transaction(root);
3658 return PTR_ERR(trans);
3659 return btrfs_commit_transaction(trans);
3662 static noinline_for_stack
3663 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3664 struct btrfs_device *scrub_dev, u64 start, u64 end)
3666 struct btrfs_dev_extent *dev_extent = NULL;
3667 struct btrfs_path *path;
3668 struct btrfs_fs_info *fs_info = sctx->fs_info;
3669 struct btrfs_root *root = fs_info->dev_root;
3674 struct extent_buffer *l;
3675 struct btrfs_key key;
3676 struct btrfs_key found_key;
3677 struct btrfs_block_group *cache;
3678 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3680 path = btrfs_alloc_path();
3684 path->reada = READA_FORWARD;
3685 path->search_commit_root = 1;
3686 path->skip_locking = 1;
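/*
 * The search below walks the BTRFS_DEV_EXTENT_KEY items of the scrubbed
 * device in the device tree: the key is (devid, DEV_EXTENT, physical
 * offset on the device) and the item records the owning chunk's logical
 * start (chunk_offset) and the extent length, which is what the loop
 * below feeds into scrub_chunk().
 */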
3688 key.objectid = scrub_dev->devid;
3690 key.type = BTRFS_DEV_EXTENT_KEY;
3695 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3699 if (path->slots[0] >=
3700 btrfs_header_nritems(path->nodes[0])) {
3701 ret = btrfs_next_leaf(root, path);
3714 slot = path->slots[0];
3716 btrfs_item_key_to_cpu(l, &found_key, slot);
3718 if (found_key.objectid != scrub_dev->devid)
3721 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3724 if (found_key.offset >= end)
3727 if (found_key.offset < key.offset)
3730 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3731 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3733 if (found_key.offset + dev_extent_len <= start)
3736 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3739 * get a reference on the corresponding block group to prevent
3740 * the chunk from going away while we scrub it
3742 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3744 /* some chunks are removed but not committed to disk yet,
3745 * continue scrubbing */
3749 ASSERT(cache->start <= chunk_offset);
3751 * We are using the commit root to search for device extents, so
3752 * that means we could have found a device extent item from a
3753 * block group that was deleted in the current transaction. The
3754 * logical start offset of the deleted block group, stored at
3755 * @chunk_offset, might be part of the logical address range of
3756 * a new block group (which uses different physical extents).
3757 * In this case btrfs_lookup_block_group() has returned the new
3758 * block group, and its start address is less than @chunk_offset.
3760 * We skip such new block groups, because it's pointless to
3761 * process them, as we won't find their extents because we search
3762 * for them using the commit root of the extent tree. For a device
3763 * replace it's also fine to skip it, we won't miss copying them
3764 * to the target device because we have the write duplication
3765 * setup through the regular write path (by btrfs_map_block()),
3766 * and we have committed a transaction when we started the device
3767 * replace, right after setting up the device replace state.
3769 if (cache->start < chunk_offset) {
3770 btrfs_put_block_group(cache);
3774 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3775 spin_lock(&cache->lock);
3776 if (!cache->to_copy) {
3777 spin_unlock(&cache->lock);
3778 btrfs_put_block_group(cache);
3781 spin_unlock(&cache->lock);
3785 * Make sure that while we are scrubbing the corresponding block
3786 * group doesn't get its logical address and its device extents
3787 * reused for another block group, which can possibly be of a
3788 * different type and different profile. We do this to prevent
3789 * false error detections and crashes due to bogus attempts to
3792 spin_lock(&cache->lock);
3793 if (cache->removed) {
3794 spin_unlock(&cache->lock);
3795 btrfs_put_block_group(cache);
3798 btrfs_freeze_block_group(cache);
3799 spin_unlock(&cache->lock);
3802 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3803 * to avoid deadlock caused by:
3804 * btrfs_inc_block_group_ro()
3805 * -> btrfs_wait_for_commit()
3806 * -> btrfs_commit_transaction()
3807 * -> btrfs_scrub_pause()
3809 scrub_pause_on(fs_info);
3812 * Don't do chunk preallocation for scrub.
3814 * This is especially important for SYSTEM bgs, or we can hit
3815 * -EFBIG from btrfs_finish_chunk_alloc() like:
3816 * 1. The only SYSTEM bg is marked RO.
3817 * Since SYSTEM bg is small, that's pretty common.
3818 * 2. New SYSTEM bg will be allocated
3819 * Because the regular version will allocate a new chunk.
3820 * 3. New SYSTEM bg is empty and will get cleaned up
3821 * Before cleanup really happens, it's marked RO again.
3822 * 4. Empty SYSTEM bg get scrubbed
3825 * This can easily boost the amount of SYSTEM chunks if the cleaner
3826 * thread can't be triggered fast enough, and use up all the space
3827 * of btrfs_super_block::sys_chunk_array
3829 * While for dev replace, we need to try our best to mark block
3830 * group RO, to prevent race between:
3831 * - Write duplication
3832 * Contains latest data
3834 * Contains data from commit tree
3836 * If target block group is not marked RO, nocow writes can
3837 * be overwritten by scrub copy, causing data corruption.
3838 * So for dev-replace, it's not allowed to continue if a block
3841 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3842 if (!ret && sctx->is_dev_replace) {
3843 ret = finish_extent_writes_for_zoned(root, cache);
3845 btrfs_dec_block_group_ro(cache);
3846 scrub_pause_off(fs_info);
3847 btrfs_put_block_group(cache);
3854 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3856 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3857 * fails to create a new chunk for metadata.
3858 * This is not a problem for scrub, because
3859 * metadata is always COWed, and our scrub has paused
3860 * transaction commits.
3863 } else if (ret == -ETXTBSY) {
3865 "skipping scrub of block group %llu due to active swapfile",
3867 scrub_pause_off(fs_info);
3872 "failed setting block group ro: %d", ret);
3873 btrfs_unfreeze_block_group(cache);
3874 btrfs_put_block_group(cache);
3875 scrub_pause_off(fs_info);
3880 * Now the target block group is marked RO, wait for nocow writes to
3881 * finish before dev-replace.
3882 * COW is fine, as COW never overwrites extents in commit tree.
3884 if (sctx->is_dev_replace) {
3885 btrfs_wait_nocow_writers(cache);
3886 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3890 scrub_pause_off(fs_info);
3891 down_write(&dev_replace->rwsem);
3892 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3893 dev_replace->cursor_left = found_key.offset;
3894 dev_replace->item_needs_writeback = 1;
3895 up_write(&dev_replace->rwsem);
3897 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3901 * flush, submit all pending read and write bios, afterwards
3903 * Note that in the dev replace case, a read request causes
3904 * write requests that are submitted in the read completion
3905 * worker. Therefore in the current situation, it is required
3906 * that all write requests are flushed, so that all read and
3907 * write requests are really completed when bios_in_flight
3910 sctx->flush_all_writes = true;
3912 mutex_lock(&sctx->wr_lock);
3913 scrub_wr_submit(sctx);
3914 mutex_unlock(&sctx->wr_lock);
3916 wait_event(sctx->list_wait,
3917 atomic_read(&sctx->bios_in_flight) == 0);
3919 scrub_pause_on(fs_info);
3922 * must be called before we decrease @scrub_paused.
3923 * Make sure we don't block transaction commit while
3924 * we are waiting for pending workers to finish.
3926 wait_event(sctx->list_wait,
3927 atomic_read(&sctx->workers_pending) == 0);
3928 sctx->flush_all_writes = false;
3930 scrub_pause_off(fs_info);
3932 if (sctx->is_dev_replace &&
3933 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3934 cache, found_key.offset))
3937 down_write(&dev_replace->rwsem);
3938 dev_replace->cursor_left = dev_replace->cursor_right;
3939 dev_replace->item_needs_writeback = 1;
3940 up_write(&dev_replace->rwsem);
3943 btrfs_dec_block_group_ro(cache);
3946 * We might have prevented the cleaner kthread from deleting
3947 * this block group if it was already unused because we raced
3948 * and set it to RO mode first. So add it back to the unused
3949 * list, otherwise it might not ever be deleted unless a manual
3950 * balance is triggered or it becomes used and unused again.
3952 spin_lock(&cache->lock);
3953 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3955 spin_unlock(&cache->lock);
3956 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3957 btrfs_discard_queue_work(&fs_info->discard_ctl,
3960 btrfs_mark_bg_unused(cache);
3962 spin_unlock(&cache->lock);
3965 btrfs_unfreeze_block_group(cache);
3966 btrfs_put_block_group(cache);
3969 if (sctx->is_dev_replace &&
3970 atomic64_read(&dev_replace->num_write_errors) > 0) {
3974 if (sctx->stat.malloc_errors > 0) {
3979 key.offset = found_key.offset + dev_extent_len;
3980 btrfs_release_path(path);
3983 btrfs_free_path(path);
3988 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3989 struct btrfs_device *scrub_dev)
3995 struct btrfs_fs_info *fs_info = sctx->fs_info;
3997 if (BTRFS_FS_ERROR(fs_info))
4000 /* Seed devices of a new filesystem have their own generation. */
4001 if (scrub_dev->fs_devices != fs_info->fs_devices)
4002 gen = scrub_dev->generation;
4004 gen = fs_info->last_trans_committed;
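/*
 * btrfs keeps up to BTRFS_SUPER_MIRROR_MAX super block copies per
 * device (the primary at 64KiB, mirrors at 64MiB and 256GiB);
 * btrfs_sb_offset() returns those offsets, and copies that would fall
 * beyond the device or are rejected by btrfs_check_super_location()
 * are skipped below.
 */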
4006 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4007 bytenr = btrfs_sb_offset(i);
4008 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4009 scrub_dev->commit_total_bytes)
4011 if (!btrfs_check_super_location(scrub_dev, bytenr))
4014 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4015 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4020 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4025 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4027 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4028 &fs_info->scrub_lock)) {
4029 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4030 struct workqueue_struct *scrub_wr_comp =
4031 fs_info->scrub_wr_completion_workers;
4032 struct workqueue_struct *scrub_parity =
4033 fs_info->scrub_parity_workers;
4035 fs_info->scrub_workers = NULL;
4036 fs_info->scrub_wr_completion_workers = NULL;
4037 fs_info->scrub_parity_workers = NULL;
4038 mutex_unlock(&fs_info->scrub_lock);
4041 destroy_workqueue(scrub_workers);
4043 destroy_workqueue(scrub_wr_comp);
4045 destroy_workqueue(scrub_parity);
4050 * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4052 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4055 struct workqueue_struct *scrub_workers = NULL;
4056 struct workqueue_struct *scrub_wr_comp = NULL;
4057 struct workqueue_struct *scrub_parity = NULL;
4058 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4059 int max_active = fs_info->thread_pool_size;
4062 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4065 scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4066 is_dev_replace ? 1 : max_active);
4068 goto fail_scrub_workers;
4070 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4072 goto fail_scrub_wr_completion_workers;
4074 scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4076 goto fail_scrub_parity_workers;
4078 mutex_lock(&fs_info->scrub_lock);
4079 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4080 ASSERT(fs_info->scrub_workers == NULL &&
4081 fs_info->scrub_wr_completion_workers == NULL &&
4082 fs_info->scrub_parity_workers == NULL);
4083 fs_info->scrub_workers = scrub_workers;
4084 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4085 fs_info->scrub_parity_workers = scrub_parity;
4086 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4087 mutex_unlock(&fs_info->scrub_lock);
4090 /* Other thread raced in and created the workers for us */
4091 refcount_inc(&fs_info->scrub_workers_refcnt);
4092 mutex_unlock(&fs_info->scrub_lock);
4095 destroy_workqueue(scrub_parity);
4096 fail_scrub_parity_workers:
4097 destroy_workqueue(scrub_wr_comp);
4098 fail_scrub_wr_completion_workers:
4099 destroy_workqueue(scrub_workers);
4104 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4105 u64 end, struct btrfs_scrub_progress *progress,
4106 int readonly, int is_dev_replace)
4108 struct btrfs_dev_lookup_args args = { .devid = devid };
4109 struct scrub_ctx *sctx;
4111 struct btrfs_device *dev;
4112 unsigned int nofs_flag;
4114 if (btrfs_fs_closing(fs_info))
4117 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4119 * In this case scrub is unable to calculate the checksum
4120 * the way it is implemented. Do not handle this
4121 * situation at all because it should never happen.
4124 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4130 if (fs_info->nodesize >
4131 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4132 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4134 * Would exhaust the array bounds of the sectors member in
4135 * struct scrub_block
4138 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4139 fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4140 fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
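/*
 * For the common configuration this limit is generous: with 4KiB
 * sectors and SCRUB_MAX_SECTORS_PER_BLOCK == 16 (assuming the usual
 * 64KiB BTRFS_MAX_METADATA_BLOCKSIZE), a scrub_block can describe up to
 * 64KiB, which covers every supported nodesize.
 */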
4144 /* Allocate outside of device_list_mutex */
4145 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4147 return PTR_ERR(sctx);
4149 ret = scrub_workers_get(fs_info, is_dev_replace);
4153 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4154 dev = btrfs_find_device(fs_info->fs_devices, &args);
4155 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4157 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4162 if (!is_dev_replace && !readonly &&
4163 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4164 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4165 btrfs_err_in_rcu(fs_info,
4166 "scrub on devid %llu: filesystem on %s is not writable",
4167 devid, rcu_str_deref(dev->name));
4172 mutex_lock(&fs_info->scrub_lock);
4173 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4174 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4175 mutex_unlock(&fs_info->scrub_lock);
4176 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4181 down_read(&fs_info->dev_replace.rwsem);
4182 if (dev->scrub_ctx ||
4184 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4185 up_read(&fs_info->dev_replace.rwsem);
4186 mutex_unlock(&fs_info->scrub_lock);
4187 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4191 up_read(&fs_info->dev_replace.rwsem);
4193 sctx->readonly = readonly;
4194 dev->scrub_ctx = sctx;
4195 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4198 * By checking @scrub_pause_req here, we can avoid a
4199 * race between committing a transaction and scrubbing.
4201 __scrub_blocked_if_needed(fs_info);
4202 atomic_inc(&fs_info->scrubs_running);
4203 mutex_unlock(&fs_info->scrub_lock);
4206 * In order to avoid deadlock with reclaim when there is a transaction
4207 * trying to pause scrub, make sure we use GFP_NOFS for all the
4208 * allocations done at scrub_sectors() and scrub_sectors_for_parity()
4209 * invoked by our callees. The pausing request is done when the
4210 * transaction commit starts, and it blocks the transaction until scrub
4211 * is paused (done at specific points at scrub_stripe() or right above
4212 * before incrementing fs_info->scrubs_running).
4214 nofs_flag = memalloc_nofs_save();
4215 if (!is_dev_replace) {
4216 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4218 * by holding device list mutex, we can
4219 * kick off writing super in log tree sync.
4221 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4222 ret = scrub_supers(sctx, dev);
4223 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4227 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4228 memalloc_nofs_restore(nofs_flag);
4230 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4231 atomic_dec(&fs_info->scrubs_running);
4232 wake_up(&fs_info->scrub_pause_wait);
4234 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4237 memcpy(progress, &sctx->stat, sizeof(*progress));
4239 if (!is_dev_replace)
4240 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4241 ret ? "not finished" : "finished", devid, ret);
4243 mutex_lock(&fs_info->scrub_lock);
4244 dev->scrub_ctx = NULL;
4245 mutex_unlock(&fs_info->scrub_lock);
4247 scrub_workers_put(fs_info);
4248 scrub_put_ctx(sctx);
4252 scrub_workers_put(fs_info);
4254 scrub_free_ctx(sctx);
4259 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4261 mutex_lock(&fs_info->scrub_lock);
4262 atomic_inc(&fs_info->scrub_pause_req);
4263 while (atomic_read(&fs_info->scrubs_paused) !=
4264 atomic_read(&fs_info->scrubs_running)) {
4265 mutex_unlock(&fs_info->scrub_lock);
4266 wait_event(fs_info->scrub_pause_wait,
4267 atomic_read(&fs_info->scrubs_paused) ==
4268 atomic_read(&fs_info->scrubs_running));
4269 mutex_lock(&fs_info->scrub_lock);
4271 mutex_unlock(&fs_info->scrub_lock);
4274 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4276 atomic_dec(&fs_info->scrub_pause_req);
4277 wake_up(&fs_info->scrub_pause_wait);
4280 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4282 mutex_lock(&fs_info->scrub_lock);
4283 if (!atomic_read(&fs_info->scrubs_running)) {
4284 mutex_unlock(&fs_info->scrub_lock);
4288 atomic_inc(&fs_info->scrub_cancel_req);
4289 while (atomic_read(&fs_info->scrubs_running)) {
4290 mutex_unlock(&fs_info->scrub_lock);
4291 wait_event(fs_info->scrub_pause_wait,
4292 atomic_read(&fs_info->scrubs_running) == 0);
4293 mutex_lock(&fs_info->scrub_lock);
4295 atomic_dec(&fs_info->scrub_cancel_req);
4296 mutex_unlock(&fs_info->scrub_lock);
4301 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4303 struct btrfs_fs_info *fs_info = dev->fs_info;
4304 struct scrub_ctx *sctx;
4306 mutex_lock(&fs_info->scrub_lock);
4307 sctx = dev->scrub_ctx;
4309 mutex_unlock(&fs_info->scrub_lock);
4312 atomic_inc(&sctx->cancel_req);
4313 while (dev->scrub_ctx) {
4314 mutex_unlock(&fs_info->scrub_lock);
4315 wait_event(fs_info->scrub_pause_wait,
4316 dev->scrub_ctx == NULL);
4317 mutex_lock(&fs_info->scrub_lock);
4319 mutex_unlock(&fs_info->scrub_lock);
4324 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4325 struct btrfs_scrub_progress *progress)
4327 struct btrfs_dev_lookup_args args = { .devid = devid };
4328 struct btrfs_device *dev;
4329 struct scrub_ctx *sctx = NULL;
4331 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4332 dev = btrfs_find_device(fs_info->fs_devices, &args);
4334 sctx = dev->scrub_ctx;
4336 memcpy(progress, &sctx->stat, sizeof(*progress));
4337 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4339 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4342 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4343 u64 extent_logical, u32 extent_len,
4344 u64 *extent_physical,
4345 struct btrfs_device **extent_dev,
4346 int *extent_mirror_num)
4349 struct btrfs_io_context *bioc = NULL;
4352 mapped_length = extent_len;
4353 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4354 &mapped_length, &bioc, 0);
4355 if (ret || !bioc || mapped_length < extent_len ||
4356 !bioc->stripes[0].dev->bdev) {
4357 btrfs_put_bioc(bioc);
4361 *extent_physical = bioc->stripes[0].physical;
4362 *extent_mirror_num = bioc->mirror_num;
4363 *extent_dev = bioc->stripes[0].dev;
4364 btrfs_put_bioc(bioc);