1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26  * This is only the first step towards a full-featured scrub. It reads all
27  * extents and super blocks and verifies the checksums. In case a bad checksum
28  * is found or the extent cannot be read, good data will be written back if
29  * any can be found.
30  *
31  * Future enhancements:
32  *  - In case an unrepairable extent is encountered, track which files are
33  *    affected and report them
34  *  - track and record media errors, throw out bad devices
35  *  - add a mode to also read unallocated space
36  */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42  * The following two values only influence the performance.
43  *
44  * The first one configures an upper limit for the number of (dynamically
45  * allocated) pages that are added to a bio. The second one configures the
46  * number of parallel and outstanding I/O operations.
47  */
48 #define SCRUB_SECTORS_PER_BIO   32      /* 128KiB per bio for 4KiB pages */
49 #define SCRUB_BIOS_PER_SCTX     64      /* 8MiB per device in flight for 4KiB pages */
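/*
 * Worked example with 4KiB pages/sectors (illustrative):
 *   SCRUB_SECTORS_PER_BIO = 32  ->  32 * 4KiB   = 128KiB per bio
 *   SCRUB_BIOS_PER_SCTX   = 64  ->  64 * 128KiB = 8MiB in flight per device
 */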
50
51 /*
52  * The following value times PAGE_SIZE needs to be large enough to match the
53  * largest node/leaf/sector size that shall be supported.
54  */
55 #define SCRUB_MAX_SECTORS_PER_BLOCK     (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
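/*
 * With the current BTRFS_MAX_METADATA_BLOCKSIZE of 64KiB this evaluates to
 * 16 sectors per block (64KiB / 4KiB).
 */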
56
57 struct scrub_recover {
58         refcount_t              refs;
59         struct btrfs_io_context *bioc;
60         u64                     map_length;
61 };
62
63 struct scrub_sector {
64         struct scrub_block      *sblock;
65         struct page             *page;
66         struct btrfs_device     *dev;
67         struct list_head        list;
68         u64                     flags;  /* extent flags */
69         u64                     generation;
70         u64                     logical;
71         u64                     physical;
72         u64                     physical_for_dev_replace;
73         atomic_t                refs;
74         u8                      mirror_num;
75         unsigned int            have_csum:1;
76         unsigned int            io_error:1;
77         u8                      csum[BTRFS_CSUM_SIZE];
78
79         struct scrub_recover    *recover;
80 };
81
82 struct scrub_bio {
83         int                     index;
84         struct scrub_ctx        *sctx;
85         struct btrfs_device     *dev;
86         struct bio              *bio;
87         blk_status_t            status;
88         u64                     logical;
89         u64                     physical;
90         struct scrub_sector     *sectors[SCRUB_SECTORS_PER_BIO];
91         int                     sector_count;
92         int                     next_free;
93         struct work_struct      work;
94 };
95
96 struct scrub_block {
97         struct scrub_sector     *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
98         int                     sector_count;
99         atomic_t                outstanding_sectors;
100         refcount_t              refs; /* free mem on transition to zero */
101         struct scrub_ctx        *sctx;
102         struct scrub_parity     *sparity;
103         struct {
104                 unsigned int    header_error:1;
105                 unsigned int    checksum_error:1;
106                 unsigned int    no_io_error_seen:1;
107                 unsigned int    generation_error:1; /* also sets header_error */
108
109                 /* The following is for the data used to check parity */
110                 /* (i.e. for data that is covered by a checksum) */
111                 unsigned int    data_corrected:1;
112         };
113         struct work_struct      work;
114 };
115
116 /* Used for the chunks with parity stripes, such as RAID5/6 */
117 struct scrub_parity {
118         struct scrub_ctx        *sctx;
119
120         struct btrfs_device     *scrub_dev;
121
122         u64                     logic_start;
123
124         u64                     logic_end;
125
126         int                     nsectors;
127
128         u32                     stripe_len;
129
130         refcount_t              refs;
131
132         struct list_head        sectors_list;
133
134         /* Work of parity check and repair */
135         struct work_struct      work;
136
137         /* Mark the parity blocks which have data */
138         unsigned long           dbitmap;
139
140         /*
141          * Mark the parity blocks which have data, but for which errors
142          * happened when reading or checking that data
143          */
144         unsigned long           ebitmap;
145 };
146
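/*
 * Main context of one scrub run on one device: the pool of preallocated
 * scrub_bios, pause/cancel state, throttling state, progress statistics and,
 * for device replace, the write target and current write bio.
 */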
147 struct scrub_ctx {
148         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
149         struct btrfs_fs_info    *fs_info;
150         int                     first_free;
151         int                     curr;
152         atomic_t                bios_in_flight;
153         atomic_t                workers_pending;
154         spinlock_t              list_lock;
155         wait_queue_head_t       list_wait;
156         struct list_head        csum_list;
157         atomic_t                cancel_req;
158         int                     readonly;
159         int                     sectors_per_bio;
160
161         /* State of IO submission throttling affecting the associated device */
162         ktime_t                 throttle_deadline;
163         u64                     throttle_sent;
164
165         int                     is_dev_replace;
166         u64                     write_pointer;
167
168         struct scrub_bio        *wr_curr_bio;
169         struct mutex            wr_lock;
170         struct btrfs_device     *wr_tgtdev;
171         bool                    flush_all_writes;
172
173         /*
174          * statistics
175          */
176         struct btrfs_scrub_progress stat;
177         spinlock_t              stat_lock;
178
179         /*
180          * Use a ref counter to avoid use-after-free issues. Scrub workers
181          * decrement bios_in_flight and workers_pending and then do a wakeup
182          * on the list_wait wait queue. We must ensure the main scrub task
183          * doesn't free the scrub context before or while the workers are
184          * doing the wakeup() call.
185          */
186         refcount_t              refs;
187 };
188
189 struct scrub_warning {
190         struct btrfs_path       *path;
191         u64                     extent_item_size;
192         const char              *errstr;
193         u64                     physical;
194         u64                     logical;
195         struct btrfs_device     *dev;
196 };
197
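/*
 * One entry in a block group's full_stripe_locks_root rb-tree, keyed by the
 * logical start of a RAID5/6 full stripe. @refs counts the users of the
 * entry, @mutex provides the actual mutual exclusion between them.
 */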
198 struct full_stripe_lock {
199         struct rb_node node;
200         u64 logical;
201         u64 refs;
202         struct mutex mutex;
203 };
204
205 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
206                                      struct scrub_block *sblocks_for_recheck);
207 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
208                                 struct scrub_block *sblock,
209                                 int retry_failed_mirror);
210 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
211 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212                                              struct scrub_block *sblock_good);
213 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
214                                             struct scrub_block *sblock_good,
215                                             int sector_num, int force_write);
216 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
217 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
218                                              int sector_num);
219 static int scrub_checksum_data(struct scrub_block *sblock);
220 static int scrub_checksum_tree_block(struct scrub_block *sblock);
221 static int scrub_checksum_super(struct scrub_block *sblock);
222 static void scrub_block_put(struct scrub_block *sblock);
223 static void scrub_sector_get(struct scrub_sector *sector);
224 static void scrub_sector_put(struct scrub_sector *sector);
225 static void scrub_parity_get(struct scrub_parity *sparity);
226 static void scrub_parity_put(struct scrub_parity *sparity);
227 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
228                          u64 physical, struct btrfs_device *dev, u64 flags,
229                          u64 gen, int mirror_num, u8 *csum,
230                          u64 physical_for_dev_replace);
231 static void scrub_bio_end_io(struct bio *bio);
232 static void scrub_bio_end_io_worker(struct work_struct *work);
233 static void scrub_block_complete(struct scrub_block *sblock);
234 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
235                                  u64 extent_logical, u32 extent_len,
236                                  u64 *extent_physical,
237                                  struct btrfs_device **extent_dev,
238                                  int *extent_mirror_num);
239 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
240                                       struct scrub_sector *sector);
241 static void scrub_wr_submit(struct scrub_ctx *sctx);
242 static void scrub_wr_bio_end_io(struct bio *bio);
243 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
244 static void scrub_put_ctx(struct scrub_ctx *sctx);
245
246 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
247 {
248         return sector->recover &&
249                (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
250 }
251
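/*
 * Accounting of bios in flight: each submitted bio takes a reference on the
 * scrub context so it cannot be freed from under the completion path, and
 * the counter lets the main scrub task wait on list_wait until all
 * outstanding bios have finished.
 */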
252 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
253 {
254         refcount_inc(&sctx->refs);
255         atomic_inc(&sctx->bios_in_flight);
256 }
257
258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259 {
260         atomic_dec(&sctx->bios_in_flight);
261         wake_up(&sctx->list_wait);
262         scrub_put_ctx(sctx);
263 }
264
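/*
 * Pause handling: scrub_pause_on() announces that this scrub is paused and
 * wakes up anybody waiting for that; scrub_pause_off() waits (dropping
 * scrub_lock while sleeping) until scrub_pause_req goes back to zero, then
 * clears the paused state and wakes waiters again. scrub_blocked_if_needed()
 * combines the two to pause at a safe point whenever somebody requested it.
 */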
265 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
266 {
267         while (atomic_read(&fs_info->scrub_pause_req)) {
268                 mutex_unlock(&fs_info->scrub_lock);
269                 wait_event(fs_info->scrub_pause_wait,
270                    atomic_read(&fs_info->scrub_pause_req) == 0);
271                 mutex_lock(&fs_info->scrub_lock);
272         }
273 }
274
275 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
276 {
277         atomic_inc(&fs_info->scrubs_paused);
278         wake_up(&fs_info->scrub_pause_wait);
279 }
280
281 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
282 {
283         mutex_lock(&fs_info->scrub_lock);
284         __scrub_blocked_if_needed(fs_info);
285         atomic_dec(&fs_info->scrubs_paused);
286         mutex_unlock(&fs_info->scrub_lock);
287
288         wake_up(&fs_info->scrub_pause_wait);
289 }
290
291 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
292 {
293         scrub_pause_on(fs_info);
294         scrub_pause_off(fs_info);
295 }
296
297 /*
298  * Insert new full stripe lock into full stripe locks tree
299  *
300  * Return pointer to existing or newly inserted full_stripe_lock structure if
301  * everything works well.
302  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
303  *
304  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
305  * function
306  */
307 static struct full_stripe_lock *insert_full_stripe_lock(
308                 struct btrfs_full_stripe_locks_tree *locks_root,
309                 u64 fstripe_logical)
310 {
311         struct rb_node **p;
312         struct rb_node *parent = NULL;
313         struct full_stripe_lock *entry;
314         struct full_stripe_lock *ret;
315
316         lockdep_assert_held(&locks_root->lock);
317
318         p = &locks_root->root.rb_node;
319         while (*p) {
320                 parent = *p;
321                 entry = rb_entry(parent, struct full_stripe_lock, node);
322                 if (fstripe_logical < entry->logical) {
323                         p = &(*p)->rb_left;
324                 } else if (fstripe_logical > entry->logical) {
325                         p = &(*p)->rb_right;
326                 } else {
327                         entry->refs++;
328                         return entry;
329                 }
330         }
331
332         /*
333          * Insert new lock.
334          */
335         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
336         if (!ret)
337                 return ERR_PTR(-ENOMEM);
338         ret->logical = fstripe_logical;
339         ret->refs = 1;
340         mutex_init(&ret->mutex);
341
342         rb_link_node(&ret->node, parent, p);
343         rb_insert_color(&ret->node, &locks_root->root);
344         return ret;
345 }
346
347 /*
348  * Search for a full stripe lock of a block group
349  *
350  * Return pointer to existing full stripe lock if found
351  * Return NULL if not found
352  */
353 static struct full_stripe_lock *search_full_stripe_lock(
354                 struct btrfs_full_stripe_locks_tree *locks_root,
355                 u64 fstripe_logical)
356 {
357         struct rb_node *node;
358         struct full_stripe_lock *entry;
359
360         lockdep_assert_held(&locks_root->lock);
361
362         node = locks_root->root.rb_node;
363         while (node) {
364                 entry = rb_entry(node, struct full_stripe_lock, node);
365                 if (fstripe_logical < entry->logical)
366                         node = node->rb_left;
367                 else if (fstripe_logical > entry->logical)
368                         node = node->rb_right;
369                 else
370                         return entry;
371         }
372         return NULL;
373 }
374
375 /*
376  * Helper to get full stripe logical from a normal bytenr.
377  *
378  * Caller must ensure @cache is a RAID56 block group.
379  */
380 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
381 {
382         u64 ret;
383
384         /*
385          * Due to chunk item size limit, full stripe length should not be
386          * larger than U32_MAX. Just a sanity check here.
387          */
388         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
389
390         /*
391          * round_down() can only handle power of 2, while RAID56 full
392          * stripe length can be 64KiB * n, so we need to manually round down.
393          */
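	/*
	 * Illustrative example (made-up numbers): with cache->start == 1GiB
	 * and full_stripe_len == 192KiB (64KiB * 3), a bytenr at
	 * cache->start + 300KiB maps to cache->start + 192KiB, since
	 * 300KiB / 192KiB truncates to 1.
	 */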
394         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
395                         cache->full_stripe_len + cache->start;
396         return ret;
397 }
398
399 /*
400  * Lock a full stripe to avoid concurrent recovery and read
401  *
402  * It's only used for profiles with parity (RAID5/6); for other profiles it
403  * does nothing.
404  *
405  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
406  * The caller must then call unlock_full_stripe() in the same context.
407  *
408  * Return <0 if an error is encountered.
409  */
410 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
411                             bool *locked_ret)
412 {
413         struct btrfs_block_group *bg_cache;
414         struct btrfs_full_stripe_locks_tree *locks_root;
415         struct full_stripe_lock *existing;
416         u64 fstripe_start;
417         int ret = 0;
418
419         *locked_ret = false;
420         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
421         if (!bg_cache) {
422                 ASSERT(0);
423                 return -ENOENT;
424         }
425
426         /* Profiles not based on parity don't need full stripe lock */
427         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
428                 goto out;
429         locks_root = &bg_cache->full_stripe_locks_root;
430
431         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
432
433         /* Now insert the full stripe lock */
434         mutex_lock(&locks_root->lock);
435         existing = insert_full_stripe_lock(locks_root, fstripe_start);
436         mutex_unlock(&locks_root->lock);
437         if (IS_ERR(existing)) {
438                 ret = PTR_ERR(existing);
439                 goto out;
440         }
441         mutex_lock(&existing->mutex);
442         *locked_ret = true;
443 out:
444         btrfs_put_block_group(bg_cache);
445         return ret;
446 }
447
448 /*
449  * Unlock a full stripe.
450  *
451  * NOTE: The caller must be in the same context as the corresponding
452  * lock_full_stripe() call.
453  *
454  * Return 0 if we unlocked the full stripe without problems.
455  * Return <0 for error
456  */
457 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
458                               bool locked)
459 {
460         struct btrfs_block_group *bg_cache;
461         struct btrfs_full_stripe_locks_tree *locks_root;
462         struct full_stripe_lock *fstripe_lock;
463         u64 fstripe_start;
464         bool freeit = false;
465         int ret = 0;
466
467         /* If we didn't acquire full stripe lock, no need to continue */
468         if (!locked)
469                 return 0;
470
471         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
472         if (!bg_cache) {
473                 ASSERT(0);
474                 return -ENOENT;
475         }
476         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
477                 goto out;
478
479         locks_root = &bg_cache->full_stripe_locks_root;
480         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
481
482         mutex_lock(&locks_root->lock);
483         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
484         /* Unpaired unlock_full_stripe() detected */
485         if (!fstripe_lock) {
486                 WARN_ON(1);
487                 ret = -ENOENT;
488                 mutex_unlock(&locks_root->lock);
489                 goto out;
490         }
491
492         if (fstripe_lock->refs == 0) {
493                 WARN_ON(1);
494                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
495                         fstripe_lock->logical);
496         } else {
497                 fstripe_lock->refs--;
498         }
499
500         if (fstripe_lock->refs == 0) {
501                 rb_erase(&fstripe_lock->node, &locks_root->root);
502                 freeit = true;
503         }
504         mutex_unlock(&locks_root->lock);
505
506         mutex_unlock(&fstripe_lock->mutex);
507         if (freeit)
508                 kfree(fstripe_lock);
509 out:
510         btrfs_put_block_group(bg_cache);
511         return ret;
512 }
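/*
 * Typical usage of the pair above, as done in scrub_handle_errored_block():
 *
 *	bool locked;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	... recheck/repair the block ...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 */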
513
514 static void scrub_free_csums(struct scrub_ctx *sctx)
515 {
516         while (!list_empty(&sctx->csum_list)) {
517                 struct btrfs_ordered_sum *sum;
518                 sum = list_first_entry(&sctx->csum_list,
519                                        struct btrfs_ordered_sum, list);
520                 list_del(&sum->list);
521                 kfree(sum);
522         }
523 }
524
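/*
 * Free a scrub context and everything attached to it: a possibly still
 * half-filled bio in sctx->curr, the preallocated scrub_bios, the queued
 * checksums and finally the context itself.
 */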
525 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
526 {
527         int i;
528
529         if (!sctx)
530                 return;
531
532         /* this can happen when scrub is cancelled */
533         if (sctx->curr != -1) {
534                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
535
536                 for (i = 0; i < sbio->sector_count; i++) {
537                         WARN_ON(!sbio->sectors[i]->page);
538                         scrub_block_put(sbio->sectors[i]->sblock);
539                 }
540                 bio_put(sbio->bio);
541         }
542
543         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
544                 struct scrub_bio *sbio = sctx->bios[i];
545
546                 if (!sbio)
547                         break;
548                 kfree(sbio);
549         }
550
551         kfree(sctx->wr_curr_bio);
552         scrub_free_csums(sctx);
553         kfree(sctx);
554 }
555
556 static void scrub_put_ctx(struct scrub_ctx *sctx)
557 {
558         if (refcount_dec_and_test(&sctx->refs))
559                 scrub_free_ctx(sctx);
560 }
561
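/*
 * Allocate and initialize a scrub context: preallocate SCRUB_BIOS_PER_SCTX
 * scrub_bios and chain them through ->next_free/->first_free as a free list,
 * and set up the write target when running as part of a device replace.
 */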
562 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
563                 struct btrfs_fs_info *fs_info, int is_dev_replace)
564 {
565         struct scrub_ctx *sctx;
566         int             i;
567
568         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
569         if (!sctx)
570                 goto nomem;
571         refcount_set(&sctx->refs, 1);
572         sctx->is_dev_replace = is_dev_replace;
573         sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
574         sctx->curr = -1;
575         sctx->fs_info = fs_info;
576         INIT_LIST_HEAD(&sctx->csum_list);
577         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
578                 struct scrub_bio *sbio;
579
580                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
581                 if (!sbio)
582                         goto nomem;
583                 sctx->bios[i] = sbio;
584
585                 sbio->index = i;
586                 sbio->sctx = sctx;
587                 sbio->sector_count = 0;
588                 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
589
590                 if (i != SCRUB_BIOS_PER_SCTX - 1)
591                         sctx->bios[i]->next_free = i + 1;
592                 else
593                         sctx->bios[i]->next_free = -1;
594         }
595         sctx->first_free = 0;
596         atomic_set(&sctx->bios_in_flight, 0);
597         atomic_set(&sctx->workers_pending, 0);
598         atomic_set(&sctx->cancel_req, 0);
599
600         spin_lock_init(&sctx->list_lock);
601         spin_lock_init(&sctx->stat_lock);
602         init_waitqueue_head(&sctx->list_wait);
603         sctx->throttle_deadline = 0;
604
605         WARN_ON(sctx->wr_curr_bio != NULL);
606         mutex_init(&sctx->wr_lock);
607         sctx->wr_curr_bio = NULL;
608         if (is_dev_replace) {
609                 WARN_ON(!fs_info->dev_replace.tgtdev);
610                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
611                 sctx->flush_all_writes = false;
612         }
613
614         return sctx;
615
616 nomem:
617         scrub_free_ctx(sctx);
618         return ERR_PTR(-ENOMEM);
619 }
620
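/*
 * Backref walking callback: called for each (root, inode, offset) that
 * references the corrupted extent. It resolves the inode to file paths via
 * an ipath and prints one warning per path (or a single error line if the
 * path resolution fails).
 */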
621 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
622                                      void *warn_ctx)
623 {
624         u32 nlink;
625         int ret;
626         int i;
627         unsigned nofs_flag;
628         struct extent_buffer *eb;
629         struct btrfs_inode_item *inode_item;
630         struct scrub_warning *swarn = warn_ctx;
631         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
632         struct inode_fs_paths *ipath = NULL;
633         struct btrfs_root *local_root;
634         struct btrfs_key key;
635
636         local_root = btrfs_get_fs_root(fs_info, root, true);
637         if (IS_ERR(local_root)) {
638                 ret = PTR_ERR(local_root);
639                 goto err;
640         }
641
642         /*
643          * this makes the path point to (inum INODE_ITEM ioff)
644          */
645         key.objectid = inum;
646         key.type = BTRFS_INODE_ITEM_KEY;
647         key.offset = 0;
648
649         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
650         if (ret) {
651                 btrfs_put_root(local_root);
652                 btrfs_release_path(swarn->path);
653                 goto err;
654         }
655
656         eb = swarn->path->nodes[0];
657         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
658                                         struct btrfs_inode_item);
659         nlink = btrfs_inode_nlink(eb, inode_item);
660         btrfs_release_path(swarn->path);
661
662         /*
663          * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
664          * uses GFP_NOFS in this context, so we keep it consistent but it does
665          * not seem to be strictly necessary.
666          */
667         nofs_flag = memalloc_nofs_save();
668         ipath = init_ipath(4096, local_root, swarn->path);
669         memalloc_nofs_restore(nofs_flag);
670         if (IS_ERR(ipath)) {
671                 btrfs_put_root(local_root);
672                 ret = PTR_ERR(ipath);
673                 ipath = NULL;
674                 goto err;
675         }
676         ret = paths_from_inode(inum, ipath);
677
678         if (ret < 0)
679                 goto err;
680
681         /*
682          * we deliberately ignore the fact that ipath might have been too
683          * small to hold all of the paths here
684          */
685         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
686                 btrfs_warn_in_rcu(fs_info,
687 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
688                                   swarn->errstr, swarn->logical,
689                                   rcu_str_deref(swarn->dev->name),
690                                   swarn->physical,
691                                   root, inum, offset,
692                                   fs_info->sectorsize, nlink,
693                                   (char *)(unsigned long)ipath->fspath->val[i]);
694
695         btrfs_put_root(local_root);
696         free_ipath(ipath);
697         return 0;
698
699 err:
700         btrfs_warn_in_rcu(fs_info,
701                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
702                           swarn->errstr, swarn->logical,
703                           rcu_str_deref(swarn->dev->name),
704                           swarn->physical,
705                           root, inum, offset, ret);
706
707         free_ipath(ipath);
708         return 0;
709 }
710
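/*
 * Report a scrub error in a user friendly way: look up the extent item that
 * covers the bad logical address, then either walk the tree backrefs (for
 * metadata) or iterate over the referencing inodes (for data, via
 * scrub_print_warning_inode()) to print where the damage sits.
 */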
711 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
712 {
713         struct btrfs_device *dev;
714         struct btrfs_fs_info *fs_info;
715         struct btrfs_path *path;
716         struct btrfs_key found_key;
717         struct extent_buffer *eb;
718         struct btrfs_extent_item *ei;
719         struct scrub_warning swarn;
720         unsigned long ptr = 0;
721         u64 extent_item_pos;
722         u64 flags = 0;
723         u64 ref_root;
724         u32 item_size;
725         u8 ref_level = 0;
726         int ret;
727
728         WARN_ON(sblock->sector_count < 1);
729         dev = sblock->sectors[0]->dev;
730         fs_info = sblock->sctx->fs_info;
731
732         path = btrfs_alloc_path();
733         if (!path)
734                 return;
735
736         swarn.physical = sblock->sectors[0]->physical;
737         swarn.logical = sblock->sectors[0]->logical;
738         swarn.errstr = errstr;
739         swarn.dev = NULL;
740
741         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
742                                   &flags);
743         if (ret < 0)
744                 goto out;
745
746         extent_item_pos = swarn.logical - found_key.objectid;
747         swarn.extent_item_size = found_key.offset;
748
749         eb = path->nodes[0];
750         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
751         item_size = btrfs_item_size(eb, path->slots[0]);
752
753         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
754                 do {
755                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
756                                                       item_size, &ref_root,
757                                                       &ref_level);
758                         btrfs_warn_in_rcu(fs_info,
759 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
760                                 errstr, swarn.logical,
761                                 rcu_str_deref(dev->name),
762                                 swarn.physical,
763                                 ref_level ? "node" : "leaf",
764                                 ret < 0 ? -1 : ref_level,
765                                 ret < 0 ? -1 : ref_root);
766                 } while (ret != 1);
767                 btrfs_release_path(path);
768         } else {
769                 btrfs_release_path(path);
770                 swarn.path = path;
771                 swarn.dev = dev;
772                 iterate_extent_inodes(fs_info, found_key.objectid,
773                                         extent_item_pos, 1,
774                                         scrub_print_warning_inode, &swarn, false);
775         }
776
777 out:
778         btrfs_free_path(path);
779 }
780
781 static inline void scrub_get_recover(struct scrub_recover *recover)
782 {
783         refcount_inc(&recover->refs);
784 }
785
786 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
787                                      struct scrub_recover *recover)
788 {
789         if (refcount_dec_and_test(&recover->refs)) {
790                 btrfs_bio_counter_dec(fs_info);
791                 btrfs_put_bioc(recover->bioc);
792                 kfree(recover);
793         }
794 }
795
796 /*
797  * scrub_handle_errored_block gets called when either verification of the
798  * sectors failed or the bio failed to read, e.g. with EIO. In the latter
799  * case, this function handles all sectors in the bio, even though only one
800  * may be bad.
801  * The goal of this function is to repair the errored block by using the
802  * contents of one of the mirrors.
803  */
804 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
805 {
806         struct scrub_ctx *sctx = sblock_to_check->sctx;
807         struct btrfs_device *dev;
808         struct btrfs_fs_info *fs_info;
809         u64 logical;
810         unsigned int failed_mirror_index;
811         unsigned int is_metadata;
812         unsigned int have_csum;
813         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
814         struct scrub_block *sblock_bad;
815         int ret;
816         int mirror_index;
817         int sector_num;
818         int success;
819         bool full_stripe_locked;
820         unsigned int nofs_flag;
821         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
822                                       DEFAULT_RATELIMIT_BURST);
823
824         BUG_ON(sblock_to_check->sector_count < 1);
825         fs_info = sctx->fs_info;
826         if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
827                 /*
828                  * If we find an error in a super block, we just report it;
829                  * it will get rewritten with the next transaction commit
830                  * anyway.
831                  */
832                 spin_lock(&sctx->stat_lock);
833                 ++sctx->stat.super_errors;
834                 spin_unlock(&sctx->stat_lock);
835                 return 0;
836         }
837         logical = sblock_to_check->sectors[0]->logical;
838         BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
839         failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
840         is_metadata = !(sblock_to_check->sectors[0]->flags &
841                         BTRFS_EXTENT_FLAG_DATA);
842         have_csum = sblock_to_check->sectors[0]->have_csum;
843         dev = sblock_to_check->sectors[0]->dev;
844
845         if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
846                 return 0;
847
848         /*
849          * We must use GFP_NOFS because the scrub task might be waiting for a
850          * worker task executing this function and in turn a transaction commit
851          * might be waiting for the scrub task to pause (which needs to wait for all
852          * the worker tasks to complete before pausing).
853          * We do allocations in the workers through insert_full_stripe_lock()
854          * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
855          * this function.
856          */
857         nofs_flag = memalloc_nofs_save();
858         /*
859          * For RAID5/6, a race can happen with the scrub thread of a different
860          * device. On data corruption, the parity and data threads will both
861          * try to recover the data.
862          * The race can lead to a doubly counted csum error, or even an
863          * unrecoverable error.
864          */
865         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
866         if (ret < 0) {
867                 memalloc_nofs_restore(nofs_flag);
868                 spin_lock(&sctx->stat_lock);
869                 if (ret == -ENOMEM)
870                         sctx->stat.malloc_errors++;
871                 sctx->stat.read_errors++;
872                 sctx->stat.uncorrectable_errors++;
873                 spin_unlock(&sctx->stat_lock);
874                 return ret;
875         }
876
877         /*
878          * Read all mirrors one after the other. This includes re-reading
879          * the extent or metadata block that failed (which is the reason
880          * this fixup code is called), this time sector by sector, in order
881          * to know which sectors caused I/O errors and which ones are good
882          * (for all mirrors).
883          * The goal is to handle the situation when more than one
884          * mirror contains I/O errors, but the errors do not
885          * overlap, i.e. the data can be repaired by selecting the
886          * sectors from those mirrors without I/O error on the
887          * particular sectors. One example (with blocks >= 2 * sectorsize)
888          * would be that mirror #1 has an I/O error on the first sector,
889          * the second sector is good, and mirror #2 has an I/O error on
890          * the second sector, but the first sector is good.
891          * Then the first sector of the first mirror can be repaired by
892          * taking the first sector of the second mirror, and the
893          * second sector of the second mirror can be repaired by
894          * copying the contents of the 2nd sector of the 1st mirror.
895          * One more note: if the sectors of one mirror contain I/O
896          * errors, the checksum cannot be verified. In order to get
897          * the best data for repairing, the first attempt is to find
898          * a mirror without I/O errors and with a validated checksum.
899          * Only if this is not possible, the sectors are picked from
900          * mirrors with I/O errors without considering the checksum.
901          * If the latter is the case, at the end, the checksum of the
902          * repaired area is verified in order to correctly maintain
903          * the statistics.
904          */
905
906         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
907                                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
908         if (!sblocks_for_recheck) {
909                 spin_lock(&sctx->stat_lock);
910                 sctx->stat.malloc_errors++;
911                 sctx->stat.read_errors++;
912                 sctx->stat.uncorrectable_errors++;
913                 spin_unlock(&sctx->stat_lock);
914                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
915                 goto out;
916         }
917
918         /* Setup the context, map the logical blocks and alloc the sectors */
919         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
920         if (ret) {
921                 spin_lock(&sctx->stat_lock);
922                 sctx->stat.read_errors++;
923                 sctx->stat.uncorrectable_errors++;
924                 spin_unlock(&sctx->stat_lock);
925                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
926                 goto out;
927         }
928         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
929         sblock_bad = sblocks_for_recheck + failed_mirror_index;
930
931         /* build and submit the bios for the failed mirror, check checksums */
932         scrub_recheck_block(fs_info, sblock_bad, 1);
933
934         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935             sblock_bad->no_io_error_seen) {
936                 /*
937                  * The error disappeared after reading sector by sector, or
938                  * the area was part of a huge bio and other parts of the
939                  * bio caused I/O errors, or the block layer merged several
940                  * read requests into one and the error was caused by a
941                  * different bio (usually one of the latter two cases is
942                  * the cause)
943                  */
944                 spin_lock(&sctx->stat_lock);
945                 sctx->stat.unverified_errors++;
946                 sblock_to_check->data_corrected = 1;
947                 spin_unlock(&sctx->stat_lock);
948
949                 if (sctx->is_dev_replace)
950                         scrub_write_block_to_dev_replace(sblock_bad);
951                 goto out;
952         }
953
954         if (!sblock_bad->no_io_error_seen) {
955                 spin_lock(&sctx->stat_lock);
956                 sctx->stat.read_errors++;
957                 spin_unlock(&sctx->stat_lock);
958                 if (__ratelimit(&rs))
959                         scrub_print_warning("i/o error", sblock_to_check);
960                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
961         } else if (sblock_bad->checksum_error) {
962                 spin_lock(&sctx->stat_lock);
963                 sctx->stat.csum_errors++;
964                 spin_unlock(&sctx->stat_lock);
965                 if (__ratelimit(&rs))
966                         scrub_print_warning("checksum error", sblock_to_check);
967                 btrfs_dev_stat_inc_and_print(dev,
968                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
969         } else if (sblock_bad->header_error) {
970                 spin_lock(&sctx->stat_lock);
971                 sctx->stat.verify_errors++;
972                 spin_unlock(&sctx->stat_lock);
973                 if (__ratelimit(&rs))
974                         scrub_print_warning("checksum/header error",
975                                             sblock_to_check);
976                 if (sblock_bad->generation_error)
977                         btrfs_dev_stat_inc_and_print(dev,
978                                 BTRFS_DEV_STAT_GENERATION_ERRS);
979                 else
980                         btrfs_dev_stat_inc_and_print(dev,
981                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
982         }
983
984         if (sctx->readonly) {
985                 ASSERT(!sctx->is_dev_replace);
986                 goto out;
987         }
988
989         /*
990          * now build and submit the bios for the other mirrors, check
991          * checksums.
992          * First try to pick the mirror which is completely without I/O
993          * errors and also does not have a checksum error.
994          * If one is found, and if a checksum is present, the full block
995          * that is known to contain an error is rewritten. Afterwards
996          * the block is known to be corrected.
997          * If a mirror is found which is completely correct, and no
998          * checksum is present, only those sectors are rewritten that had
999          * an I/O error in the block to be repaired, since it cannot be
1000          * determined, which copy of the other sectors is better (and it
1001          * could happen otherwise that a correct sector would be
1002          * overwritten by a bad one).
1003          */
1004         for (mirror_index = 0; ;mirror_index++) {
1005                 struct scrub_block *sblock_other;
1006
1007                 if (mirror_index == failed_mirror_index)
1008                         continue;
1009
1010                 /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1011                 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1013                                 break;
1014                         if (!sblocks_for_recheck[mirror_index].sector_count)
1015                                 break;
1016
1017                         sblock_other = sblocks_for_recheck + mirror_index;
1018                 } else {
1019                         struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020                         int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1021
1022                         if (mirror_index >= max_allowed)
1023                                 break;
1024                         if (!sblocks_for_recheck[1].sector_count)
1025                                 break;
1026
1027                         ASSERT(failed_mirror_index == 0);
1028                         sblock_other = sblocks_for_recheck + 1;
1029                         sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1030                 }
1031
1032                 /* build and submit the bios, check checksums */
1033                 scrub_recheck_block(fs_info, sblock_other, 0);
1034
1035                 if (!sblock_other->header_error &&
1036                     !sblock_other->checksum_error &&
1037                     sblock_other->no_io_error_seen) {
1038                         if (sctx->is_dev_replace) {
1039                                 scrub_write_block_to_dev_replace(sblock_other);
1040                                 goto corrected_error;
1041                         } else {
1042                                 ret = scrub_repair_block_from_good_copy(
1043                                                 sblock_bad, sblock_other);
1044                                 if (!ret)
1045                                         goto corrected_error;
1046                         }
1047                 }
1048         }
1049
1050         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051                 goto did_not_correct_error;
1052
1053         /*
1054          * In case of I/O errors in the area that is supposed to be
1055          * repaired, continue by picking good copies of those sectors.
1056          * Select the good sectors from mirrors to rewrite bad sectors from
1057          * the area to fix. Afterwards verify the checksum of the block
1058          * that is supposed to be repaired. This verification step is
1059          * only done for the purpose of statistics counting and for the
1060          * final scrub report on whether errors remain.
1061          * A perfect algorithm could make use of the checksum and try
1062          * all possible combinations of sectors from the different mirrors
1063          * until the checksum verification succeeds. For example, when
1064          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065          * of mirror #2 is readable but the final checksum test fails,
1066          * then the 2nd sector of mirror #3 could be tried, to see whether
1067          * the final checksum now succeeds. But this would be a rare
1068          * exception and is therefore not implemented. At least this way
1069          * the good copy is never overwritten.
1070          * A more useful improvement would be to pick the sectors
1071          * without I/O error based on sector sizes (512 bytes on legacy
1072          * disks) instead of on sectorsize. Then maybe 512 byte of one
1073          * mirror could be repaired by taking 512 byte of a different
1074          * mirror, even if other 512 byte sectors in the same sectorsize
1075          * area are unreadable.
1076          */
1077         success = 1;
1078         for (sector_num = 0; sector_num < sblock_bad->sector_count;
1079              sector_num++) {
1080                 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081                 struct scrub_block *sblock_other = NULL;
1082
1083                 /* Skip no-io-error sectors in scrub */
1084                 if (!sector_bad->io_error && !sctx->is_dev_replace)
1085                         continue;
1086
1087                 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1088                         /*
1089                          * In case of dev replace, if the raid56 rebuild process
1090                          * didn't produce correct data, then copy the content
1091                          * of sblock_bad to make sure the target device is identical
1092                          * to the source device, instead of writing garbage data
1093                          * from the sblock_for_recheck array to the target device.
1094                          */
1095                         sblock_other = NULL;
1096                 } else if (sector_bad->io_error) {
1097                         /* Try to find no-io-error sector in mirrors */
1098                         for (mirror_index = 0;
1099                              mirror_index < BTRFS_MAX_MIRRORS &&
1100                              sblocks_for_recheck[mirror_index].sector_count > 0;
1101                              mirror_index++) {
1102                                 if (!sblocks_for_recheck[mirror_index].
1103                                     sectors[sector_num]->io_error) {
1104                                         sblock_other = sblocks_for_recheck +
1105                                                        mirror_index;
1106                                         break;
1107                                 }
1108                         }
1109                         if (!sblock_other)
1110                                 success = 0;
1111                 }
1112
1113                 if (sctx->is_dev_replace) {
1114                         /*
1115                          * Did not find a mirror to fetch the sector from.
1116                          * scrub_write_sector_to_dev_replace() handles this
1117                          * case (sector->io_error), by filling the block with
1118                          * zeros before submitting the write request
1119                          */
1120                         if (!sblock_other)
1121                                 sblock_other = sblock_bad;
1122
1123                         if (scrub_write_sector_to_dev_replace(sblock_other,
1124                                                               sector_num) != 0) {
1125                                 atomic64_inc(
1126                                         &fs_info->dev_replace.num_write_errors);
1127                                 success = 0;
1128                         }
1129                 } else if (sblock_other) {
1130                         ret = scrub_repair_sector_from_good_copy(sblock_bad,
1131                                                                  sblock_other,
1132                                                                  sector_num, 0);
1133                         if (ret == 0)
1134                                 sector_bad->io_error = 0;
1135                         else
1136                                 success = 0;
1137                 }
1138         }
1139
1140         if (success && !sctx->is_dev_replace) {
1141                 if (is_metadata || have_csum) {
1142                         /*
1143                          * need to verify the checksum now that all
1144                          * sectors on disk are repaired (the write
1145                          * request for data to be repaired is on its way).
1146                          * Just be lazy and use scrub_recheck_block()
1147                          * which re-reads the data before the checksum
1148                          * is verified, but most likely the data comes out
1149                          * of the page cache.
1150                          */
1151                         scrub_recheck_block(fs_info, sblock_bad, 1);
1152                         if (!sblock_bad->header_error &&
1153                             !sblock_bad->checksum_error &&
1154                             sblock_bad->no_io_error_seen)
1155                                 goto corrected_error;
1156                         else
1157                                 goto did_not_correct_error;
1158                 } else {
1159 corrected_error:
1160                         spin_lock(&sctx->stat_lock);
1161                         sctx->stat.corrected_errors++;
1162                         sblock_to_check->data_corrected = 1;
1163                         spin_unlock(&sctx->stat_lock);
1164                         btrfs_err_rl_in_rcu(fs_info,
1165                                 "fixed up error at logical %llu on dev %s",
1166                                 logical, rcu_str_deref(dev->name));
1167                 }
1168         } else {
1169 did_not_correct_error:
1170                 spin_lock(&sctx->stat_lock);
1171                 sctx->stat.uncorrectable_errors++;
1172                 spin_unlock(&sctx->stat_lock);
1173                 btrfs_err_rl_in_rcu(fs_info,
1174                         "unable to fixup (regular) error at logical %llu on dev %s",
1175                         logical, rcu_str_deref(dev->name));
1176         }
1177
1178 out:
1179         if (sblocks_for_recheck) {
1180                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1181                      mirror_index++) {
1182                         struct scrub_block *sblock = sblocks_for_recheck +
1183                                                      mirror_index;
1184                         struct scrub_recover *recover;
1185                         int i;
1186
1187                         for (i = 0; i < sblock->sector_count; i++) {
1188                                 sblock->sectors[i]->sblock = NULL;
1189                                 recover = sblock->sectors[i]->recover;
1190                                 if (recover) {
1191                                         scrub_put_recover(fs_info, recover);
1192                                         sblock->sectors[i]->recover = NULL;
1193                                 }
1194                                 scrub_sector_put(sblock->sectors[i]);
1195                         }
1196                 }
1197                 kfree(sblocks_for_recheck);
1198         }
1199
1200         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201         memalloc_nofs_restore(nofs_flag);
1202         if (ret < 0)
1203                 return ret;
1204         return 0;
1205 }
1206
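/*
 * Number of "mirrors" to try during a recheck: roughly, the direct read plus
 * one (RAID5) or two (RAID6) parity based reconstruction alternatives; for
 * all other profiles each stripe of the io context is a separate copy.
 */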
1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1208 {
1209         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1210                 return 2;
1211         else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1212                 return 3;
1213         else
1214                 return (int)bioc->num_stripes;
1215 }
1216
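/*
 * Translate a logical address plus mirror number into the stripe index and
 * the offset inside that stripe. For RAID5/6 the data stripe containing
 * @logical is looked up in @raid_map (parity stripes are skipped); for all
 * other profiles the mirror number directly selects the stripe.
 */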
1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1218                                                  u64 *raid_map,
1219                                                  u64 mapped_length,
1220                                                  int nstripes, int mirror,
1221                                                  int *stripe_index,
1222                                                  u64 *stripe_offset)
1223 {
1224         int i;
1225
1226         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1227                 /* RAID5/6 */
1228                 for (i = 0; i < nstripes; i++) {
1229                         if (raid_map[i] == RAID6_Q_STRIPE ||
1230                             raid_map[i] == RAID5_P_STRIPE)
1231                                 continue;
1232
1233                         if (logical >= raid_map[i] &&
1234                             logical < raid_map[i] + mapped_length)
1235                                 break;
1236                 }
1237
1238                 *stripe_index = i;
1239                 *stripe_offset = logical - raid_map[i];
1240         } else {
1241                 /* The other RAID type */
1242                 *stripe_index = mirror;
1243                 *stripe_offset = 0;
1244         }
1245 }
1246
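/*
 * Build the per-mirror view of a block for the recheck: for each sectorsize
 * chunk of the original block, map the logical address with
 * BTRFS_MAP_GET_READ_MIRRORS and allocate one scrub_sector per mirror, so
 * that sblocks_for_recheck[mirror] ends up describing the whole block as it
 * would be read from that mirror.
 */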
1247 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1248                                      struct scrub_block *sblocks_for_recheck)
1249 {
1250         struct scrub_ctx *sctx = original_sblock->sctx;
1251         struct btrfs_fs_info *fs_info = sctx->fs_info;
1252         u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1253         u64 logical = original_sblock->sectors[0]->logical;
1254         u64 generation = original_sblock->sectors[0]->generation;
1255         u64 flags = original_sblock->sectors[0]->flags;
1256         u64 have_csum = original_sblock->sectors[0]->have_csum;
1257         struct scrub_recover *recover;
1258         struct btrfs_io_context *bioc;
1259         u64 sublen;
1260         u64 mapped_length;
1261         u64 stripe_offset;
1262         int stripe_index;
1263         int sector_index = 0;
1264         int mirror_index;
1265         int nmirrors;
1266         int ret;
1267
1268         /*
1269          * Note: the two members refs and outstanding_sectors are not used (and
1270          * not set) in the blocks that are used for the recheck procedure.
1271          */
1272
1273         while (length > 0) {
1274                 sublen = min_t(u64, length, fs_info->sectorsize);
1275                 mapped_length = sublen;
1276                 bioc = NULL;
1277
1278                 /*
1279                  * With a length of sectorsize, each returned stripe represents
1280                  * one mirror
1281                  */
1282                 btrfs_bio_counter_inc_blocked(fs_info);
1283                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1284                                        logical, &mapped_length, &bioc);
1285                 if (ret || !bioc || mapped_length < sublen) {
1286                         btrfs_put_bioc(bioc);
1287                         btrfs_bio_counter_dec(fs_info);
1288                         return -EIO;
1289                 }
1290
1291                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1292                 if (!recover) {
1293                         btrfs_put_bioc(bioc);
1294                         btrfs_bio_counter_dec(fs_info);
1295                         return -ENOMEM;
1296                 }
1297
1298                 refcount_set(&recover->refs, 1);
1299                 recover->bioc = bioc;
1300                 recover->map_length = mapped_length;
1301
1302                 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1303
1304                 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1305
1306                 for (mirror_index = 0; mirror_index < nmirrors;
1307                      mirror_index++) {
1308                         struct scrub_block *sblock;
1309                         struct scrub_sector *sector;
1310
1311                         sblock = sblocks_for_recheck + mirror_index;
1312                         sblock->sctx = sctx;
1313
1314                         sector = kzalloc(sizeof(*sector), GFP_NOFS);
1315                         if (!sector) {
1316 leave_nomem:
1317                                 spin_lock(&sctx->stat_lock);
1318                                 sctx->stat.malloc_errors++;
1319                                 spin_unlock(&sctx->stat_lock);
1320                                 scrub_put_recover(fs_info, recover);
1321                                 return -ENOMEM;
1322                         }
1323                         scrub_sector_get(sector);
1324                         sblock->sectors[sector_index] = sector;
1325                         sector->sblock = sblock;
1326                         sector->flags = flags;
1327                         sector->generation = generation;
1328                         sector->logical = logical;
1329                         sector->have_csum = have_csum;
1330                         if (have_csum)
1331                                 memcpy(sector->csum,
1332                                        original_sblock->sectors[0]->csum,
1333                                        sctx->fs_info->csum_size);
1334
1335                         scrub_stripe_index_and_offset(logical,
1336                                                       bioc->map_type,
1337                                                       bioc->raid_map,
1338                                                       mapped_length,
1339                                                       bioc->num_stripes -
1340                                                       bioc->num_tgtdevs,
1341                                                       mirror_index,
1342                                                       &stripe_index,
1343                                                       &stripe_offset);
1344                         sector->physical = bioc->stripes[stripe_index].physical +
1345                                          stripe_offset;
1346                         sector->dev = bioc->stripes[stripe_index].dev;
1347
1348                         BUG_ON(sector_index >= original_sblock->sector_count);
1349                         sector->physical_for_dev_replace =
1350                                 original_sblock->sectors[sector_index]->
1351                                 physical_for_dev_replace;
1352                         /* For missing devices, dev->bdev is NULL */
1353                         sector->mirror_num = mirror_index + 1;
1354                         sblock->sector_count++;
1355                         sector->page = alloc_page(GFP_NOFS);
1356                         if (!sector->page)
1357                                 goto leave_nomem;
1358
1359                         scrub_get_recover(recover);
1360                         sector->recover = recover;
1361                 }
1362                 scrub_put_recover(fs_info, recover);
1363                 length -= sublen;
1364                 logical += sublen;
1365                 sector_index++;
1366         }
1367
1368         return 0;
1369 }
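/*
 * Editor's note: illustrative example, not part of the original source,
 * assuming a 16 KiB metadata block on a two-copy RAID1 profile with a
 * 4 KiB sectorsize. The loop above runs four times (once per sector),
 * maps each logical address to its two mirrors, and fills
 * sblocks_for_recheck[0] and sblocks_for_recheck[1] with four sectors
 * each, pointing at the corresponding mirror's physical location.
 */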
1370
1371 static void scrub_bio_wait_endio(struct bio *bio)
1372 {
1373         complete(bio->bi_private);
1374 }
1375
1376 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1377                                         struct bio *bio,
1378                                         struct scrub_sector *sector)
1379 {
1380         DECLARE_COMPLETION_ONSTACK(done);
1381         int ret;
1382         int mirror_num;
1383
1384         bio->bi_iter.bi_sector = sector->logical >> 9;
1385         bio->bi_private = &done;
1386         bio->bi_end_io = scrub_bio_wait_endio;
1387
1388         mirror_num = sector->sblock->sectors[0]->mirror_num;
1389         ret = raid56_parity_recover(bio, sector->recover->bioc,
1390                                     sector->recover->map_length,
1391                                     mirror_num, 0);
1392         if (ret)
1393                 return ret;
1394
1395         wait_for_completion_io(&done);
1396         return blk_status_to_errno(bio->bi_status);
1397 }
1398
1399 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1400                                           struct scrub_block *sblock)
1401 {
1402         struct scrub_sector *first_sector = sblock->sectors[0];
1403         struct bio *bio;
1404         int i;
1405
1406         /* All sectors in sblock belong to the same stripe on the same device. */
1407         ASSERT(first_sector->dev);
1408         if (!first_sector->dev->bdev)
1409                 goto out;
1410
1411         bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1412
1413         for (i = 0; i < sblock->sector_count; i++) {
1414                 struct scrub_sector *sector = sblock->sectors[i];
1415
1416                 WARN_ON(!sector->page);
1417                 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1418         }
1419
1420         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1421                 bio_put(bio);
1422                 goto out;
1423         }
1424
1425         bio_put(bio);
1426
1427         scrub_recheck_block_checksum(sblock);
1428
1429         return;
1430 out:
1431         for (i = 0; i < sblock->sector_count; i++)
1432                 sblock->sectors[i]->io_error = 1;
1433
1434         sblock->no_io_error_seen = 0;
1435 }
1436
1437 /*
1438  * This function will check the on disk data for checksum errors, header errors
1439  * and read I/O errors. If any I/O errors happen, the exact sectors which are
1440  * errored are marked as being bad. The goal is to enable scrub to take those
1441  * sectors that are not errored from all the mirrors so that the sectors that
1442  * are errored in the just handled mirror can be repaired.
1443  */
1444 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1445                                 struct scrub_block *sblock,
1446                                 int retry_failed_mirror)
1447 {
1448         int i;
1449
1450         sblock->no_io_error_seen = 1;
1451
1452         /* Shortcut for raid56 */
1453         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1454                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1455
1456         for (i = 0; i < sblock->sector_count; i++) {
1457                 struct scrub_sector *sector = sblock->sectors[i];
1458                 struct bio bio;
1459                 struct bio_vec bvec;
1460
1461                 if (sector->dev->bdev == NULL) {
1462                         sector->io_error = 1;
1463                         sblock->no_io_error_seen = 0;
1464                         continue;
1465                 }
1466
1467                 WARN_ON(!sector->page);
1468                 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1469                 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1470                 bio.bi_iter.bi_sector = sector->physical >> 9;
1471
1472                 btrfsic_check_bio(&bio);
1473                 if (submit_bio_wait(&bio)) {
1474                         sector->io_error = 1;
1475                         sblock->no_io_error_seen = 0;
1476                 }
1477
1478                 bio_uninit(&bio);
1479         }
1480
1481         if (sblock->no_io_error_seen)
1482                 scrub_recheck_block_checksum(sblock);
1483 }
1484
1485 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1486 {
1487         struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1488         int ret;
1489
1490         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1491         return !ret;
1492 }
1493
1494 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1495 {
1496         sblock->header_error = 0;
1497         sblock->checksum_error = 0;
1498         sblock->generation_error = 0;
1499
1500         if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1501                 scrub_checksum_data(sblock);
1502         else
1503                 scrub_checksum_tree_block(sblock);
1504 }
1505
1506 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1507                                              struct scrub_block *sblock_good)
1508 {
1509         int i;
1510         int ret = 0;
1511
1512         for (i = 0; i < sblock_bad->sector_count; i++) {
1513                 int ret_sub;
1514
1515                 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1516                                                              sblock_good, i, 1);
1517                 if (ret_sub)
1518                         ret = ret_sub;
1519         }
1520
1521         return ret;
1522 }
1523
1524 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1525                                               struct scrub_block *sblock_good,
1526                                               int sector_num, int force_write)
1527 {
1528         struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1529         struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1530         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1531         const u32 sectorsize = fs_info->sectorsize;
1532
1533         BUG_ON(sector_bad->page == NULL);
1534         BUG_ON(sector_good->page == NULL);
1535         if (force_write || sblock_bad->header_error ||
1536             sblock_bad->checksum_error || sector_bad->io_error) {
1537                 struct bio bio;
1538                 struct bio_vec bvec;
1539                 int ret;
1540
1541                 if (!sector_bad->dev->bdev) {
1542                         btrfs_warn_rl(fs_info,
1543                                 "scrub_repair_sector_from_good_copy(bdev == NULL) is unexpected");
1544                         return -EIO;
1545                 }
1546
1547                 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1548                 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1549                 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1550
1551                 btrfsic_check_bio(&bio);
1552                 ret = submit_bio_wait(&bio);
1553                 bio_uninit(&bio);
1554
1555                 if (ret) {
1556                         btrfs_dev_stat_inc_and_print(sector_bad->dev,
1557                                 BTRFS_DEV_STAT_WRITE_ERRS);
1558                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1559                         return -EIO;
1560                 }
1561         }
1562
1563         return 0;
1564 }
1565
1566 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1567 {
1568         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1569         int i;
1570
1571         /*
1572          * This block is used for the check of the parity on the source device,
1573          * so the data needn't be written into the destination device.
1574          */
1575         if (sblock->sparity)
1576                 return;
1577
1578         for (i = 0; i < sblock->sector_count; i++) {
1579                 int ret;
1580
1581                 ret = scrub_write_sector_to_dev_replace(sblock, i);
1582                 if (ret)
1583                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1584         }
1585 }
1586
1587 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1588 {
1589         struct scrub_sector *sector = sblock->sectors[sector_num];
1590
1591         BUG_ON(sector->page == NULL);
1592         if (sector->io_error)
1593                 clear_page(page_address(sector->page));
1594
1595         return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1596 }
1597
1598 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1599 {
1600         int ret = 0;
1601         u64 length;
1602
1603         if (!btrfs_is_zoned(sctx->fs_info))
1604                 return 0;
1605
1606         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1607                 return 0;
1608
1609         if (sctx->write_pointer < physical) {
1610                 length = physical - sctx->write_pointer;
1611
1612                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1613                                                 sctx->write_pointer, length);
1614                 if (!ret)
1615                         sctx->write_pointer = physical;
1616         }
1617         return ret;
1618 }
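/*
 * Editor's note: illustrative example, not part of the original source.
 * On a zoned target, writes must land exactly at the zone's write
 * pointer. If sctx->write_pointer is at 1 MiB inside a sequential zone
 * and the sector to copy belongs at physical 1 MiB + 64 KiB, the
 * function above zeroes out the 64 KiB gap first, so that the
 * subsequent write starts right at the advanced write pointer.
 */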
1619
1620 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1621                                       struct scrub_sector *sector)
1622 {
1623         struct scrub_bio *sbio;
1624         int ret;
1625         const u32 sectorsize = sctx->fs_info->sectorsize;
1626
1627         mutex_lock(&sctx->wr_lock);
1628 again:
1629         if (!sctx->wr_curr_bio) {
1630                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1631                                               GFP_KERNEL);
1632                 if (!sctx->wr_curr_bio) {
1633                         mutex_unlock(&sctx->wr_lock);
1634                         return -ENOMEM;
1635                 }
1636                 sctx->wr_curr_bio->sctx = sctx;
1637                 sctx->wr_curr_bio->sector_count = 0;
1638         }
1639         sbio = sctx->wr_curr_bio;
1640         if (sbio->sector_count == 0) {
1641                 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1642                 if (ret) {
1643                         mutex_unlock(&sctx->wr_lock);
1644                         return ret;
1645                 }
1646
1647                 sbio->physical = sector->physical_for_dev_replace;
1648                 sbio->logical = sector->logical;
1649                 sbio->dev = sctx->wr_tgtdev;
1650                 if (!sbio->bio) {
1651                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1652                                               REQ_OP_WRITE, GFP_NOFS);
1653                 }
1654                 sbio->bio->bi_private = sbio;
1655                 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1656                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1657                 sbio->status = 0;
1658         } else if (sbio->physical + sbio->sector_count * sectorsize !=
1659                    sector->physical_for_dev_replace ||
1660                    sbio->logical + sbio->sector_count * sectorsize !=
1661                    sector->logical) {
1662                 scrub_wr_submit(sctx);
1663                 goto again;
1664         }
1665
1666         ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1667         if (ret != sectorsize) {
1668                 if (sbio->sector_count < 1) {
1669                         bio_put(sbio->bio);
1670                         sbio->bio = NULL;
1671                         mutex_unlock(&sctx->wr_lock);
1672                         return -EIO;
1673                 }
1674                 scrub_wr_submit(sctx);
1675                 goto again;
1676         }
1677
1678         sbio->sectors[sbio->sector_count] = sector;
1679         scrub_sector_get(sector);
1680         sbio->sector_count++;
1681         if (sbio->sector_count == sctx->sectors_per_bio)
1682                 scrub_wr_submit(sctx);
1683         mutex_unlock(&sctx->wr_lock);
1684
1685         return 0;
1686 }
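/*
 * Editor's note: illustrative example, not part of the original source.
 * The contiguity check above only appends to the current write bio when
 * both addresses advance by exactly sector_count * sectorsize. With a
 * 4 KiB sectorsize and three sectors already queued starting at
 * physical_for_dev_replace 1 MiB, the next sector must sit at
 * 1 MiB + 12 KiB (and its logical address must advance the same way);
 * anything else forces scrub_wr_submit() and a fresh bio.
 */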
1687
1688 static void scrub_wr_submit(struct scrub_ctx *sctx)
1689 {
1690         struct scrub_bio *sbio;
1691
1692         if (!sctx->wr_curr_bio)
1693                 return;
1694
1695         sbio = sctx->wr_curr_bio;
1696         sctx->wr_curr_bio = NULL;
1697         scrub_pending_bio_inc(sctx);
1698         /* Process all writes in a single worker thread. Then the block layer
1699          * orders the requests before sending them to the driver, which
1700          * doubled the write performance on spinning disks when measured
1701          * with Linux 3.5. */
1702         btrfsic_check_bio(sbio->bio);
1703         submit_bio(sbio->bio);
1704
1705         if (btrfs_is_zoned(sctx->fs_info))
1706                 sctx->write_pointer = sbio->physical + sbio->sector_count *
1707                         sctx->fs_info->sectorsize;
1708 }
1709
1710 static void scrub_wr_bio_end_io(struct bio *bio)
1711 {
1712         struct scrub_bio *sbio = bio->bi_private;
1713         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1714
1715         sbio->status = bio->bi_status;
1716         sbio->bio = bio;
1717
1718         INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1719         queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1720 }
1721
1722 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1723 {
1724         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1725         struct scrub_ctx *sctx = sbio->sctx;
1726         int i;
1727
1728         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1729         if (sbio->status) {
1730                 struct btrfs_dev_replace *dev_replace =
1731                         &sbio->sctx->fs_info->dev_replace;
1732
1733                 for (i = 0; i < sbio->sector_count; i++) {
1734                         struct scrub_sector *sector = sbio->sectors[i];
1735
1736                         sector->io_error = 1;
1737                         atomic64_inc(&dev_replace->num_write_errors);
1738                 }
1739         }
1740
1741         for (i = 0; i < sbio->sector_count; i++)
1742                 scrub_sector_put(sbio->sectors[i]);
1743
1744         bio_put(sbio->bio);
1745         kfree(sbio);
1746         scrub_pending_bio_dec(sctx);
1747 }
1748
1749 static int scrub_checksum(struct scrub_block *sblock)
1750 {
1751         u64 flags;
1752         int ret;
1753
1754         /*
1755          * No need to initialize these stats currently,
1756          * because this function only uses the return value
1757          * instead of these stats values.
1758          *
1759          * Todo:
1760          * always use stats
1761          */
1762         sblock->header_error = 0;
1763         sblock->generation_error = 0;
1764         sblock->checksum_error = 0;
1765
1766         WARN_ON(sblock->sector_count < 1);
1767         flags = sblock->sectors[0]->flags;
1768         ret = 0;
1769         if (flags & BTRFS_EXTENT_FLAG_DATA)
1770                 ret = scrub_checksum_data(sblock);
1771         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1772                 ret = scrub_checksum_tree_block(sblock);
1773         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1774                 (void)scrub_checksum_super(sblock);
1775         else
1776                 WARN_ON(1);
1777         if (ret)
1778                 scrub_handle_errored_block(sblock);
1779
1780         return ret;
1781 }
1782
1783 static int scrub_checksum_data(struct scrub_block *sblock)
1784 {
1785         struct scrub_ctx *sctx = sblock->sctx;
1786         struct btrfs_fs_info *fs_info = sctx->fs_info;
1787         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1788         u8 csum[BTRFS_CSUM_SIZE];
1789         struct scrub_sector *sector;
1790         char *kaddr;
1791
1792         BUG_ON(sblock->sector_count < 1);
1793         sector = sblock->sectors[0];
1794         if (!sector->have_csum)
1795                 return 0;
1796
1797         kaddr = page_address(sector->page);
1798
1799         shash->tfm = fs_info->csum_shash;
1800         crypto_shash_init(shash);
1801
1802         /*
1803          * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
1804          * only contains one sector of data.
1805          */
1806         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1807
1808         if (memcmp(csum, sector->csum, fs_info->csum_size))
1809                 sblock->checksum_error = 1;
1810         return sblock->checksum_error;
1811 }
1812
1813 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1814 {
1815         struct scrub_ctx *sctx = sblock->sctx;
1816         struct btrfs_header *h;
1817         struct btrfs_fs_info *fs_info = sctx->fs_info;
1818         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1819         u8 calculated_csum[BTRFS_CSUM_SIZE];
1820         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1821         /*
1822          * This is done in sectorsize steps even for metadata as there's a
1823          * constraint for nodesize to be aligned to sectorsize. This will need
1824          * to change so we don't misuse data and metadata units like that.
1825          */
1826         const u32 sectorsize = sctx->fs_info->sectorsize;
1827         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1828         int i;
1829         struct scrub_sector *sector;
1830         char *kaddr;
1831
1832         BUG_ON(sblock->sector_count < 1);
1833
1834         /* Each member in sectors is just one sector */
1835         ASSERT(sblock->sector_count == num_sectors);
1836
1837         sector = sblock->sectors[0];
1838         kaddr = page_address(sector->page);
1839         h = (struct btrfs_header *)kaddr;
1840         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1841
1842         /*
1843          * we don't use the getter functions here, as we
1844          * a) don't have an extent buffer and
1845          * b) the page is already kmapped
1846          */
1847         if (sector->logical != btrfs_stack_header_bytenr(h))
1848                 sblock->header_error = 1;
1849
1850         if (sector->generation != btrfs_stack_header_generation(h)) {
1851                 sblock->header_error = 1;
1852                 sblock->generation_error = 1;
1853         }
1854
1855         if (!scrub_check_fsid(h->fsid, sector))
1856                 sblock->header_error = 1;
1857
1858         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1859                    BTRFS_UUID_SIZE))
1860                 sblock->header_error = 1;
1861
1862         shash->tfm = fs_info->csum_shash;
1863         crypto_shash_init(shash);
1864         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1865                             sectorsize - BTRFS_CSUM_SIZE);
1866
1867         for (i = 1; i < num_sectors; i++) {
1868                 kaddr = page_address(sblock->sectors[i]->page);
1869                 crypto_shash_update(shash, kaddr, sectorsize);
1870         }
1871
1872         crypto_shash_final(shash, calculated_csum);
1873         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1874                 sblock->checksum_error = 1;
1875
1876         return sblock->header_error || sblock->checksum_error;
1877 }
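/*
 * Editor's note: worked example, not part of the original source,
 * assuming nodesize = 16 KiB and sectorsize = 4 KiB. num_sectors is 4;
 * the hash covers (4096 - BTRFS_CSUM_SIZE) bytes of the first sector
 * plus 3 * 4096 bytes of the remaining sectors, i.e. the whole node
 * except the on-disk csum field itself, matching how the metadata
 * checksum is computed at write time.
 */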
1878
1879 static int scrub_checksum_super(struct scrub_block *sblock)
1880 {
1881         struct btrfs_super_block *s;
1882         struct scrub_ctx *sctx = sblock->sctx;
1883         struct btrfs_fs_info *fs_info = sctx->fs_info;
1884         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1885         u8 calculated_csum[BTRFS_CSUM_SIZE];
1886         struct scrub_sector *sector;
1887         char *kaddr;
1888         int fail_gen = 0;
1889         int fail_cor = 0;
1890
1891         BUG_ON(sblock->sector_count < 1);
1892         sector = sblock->sectors[0];
1893         kaddr = page_address(sector->page);
1894         s = (struct btrfs_super_block *)kaddr;
1895
1896         if (sector->logical != btrfs_super_bytenr(s))
1897                 ++fail_cor;
1898
1899         if (sector->generation != btrfs_super_generation(s))
1900                 ++fail_gen;
1901
1902         if (!scrub_check_fsid(s->fsid, sector))
1903                 ++fail_cor;
1904
1905         shash->tfm = fs_info->csum_shash;
1906         crypto_shash_init(shash);
1907         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1908                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1909
1910         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1911                 ++fail_cor;
1912
1913         if (fail_cor + fail_gen) {
1914                 /*
1915                  * If we find an error in a super block, we just report it.
1916                  * Super blocks get rewritten with the next transaction
1917                  * commit anyway.
1918                  */
1919                 spin_lock(&sctx->stat_lock);
1920                 ++sctx->stat.super_errors;
1921                 spin_unlock(&sctx->stat_lock);
1922                 if (fail_cor)
1923                         btrfs_dev_stat_inc_and_print(sector->dev,
1924                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1925                 else
1926                         btrfs_dev_stat_inc_and_print(sector->dev,
1927                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1928         }
1929
1930         return fail_cor + fail_gen;
1931 }
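/*
 * Editor's note: illustrative numbers, not part of the original source.
 * The super block checksum covers BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE
 * bytes (4096 - 32), starting right after the csum field. Unlike data
 * and metadata errors, super block errors are only counted here; the
 * blocks themselves are rewritten by the next transaction commit.
 */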
1932
1933 static void scrub_block_get(struct scrub_block *sblock)
1934 {
1935         refcount_inc(&sblock->refs);
1936 }
1937
1938 static void scrub_block_put(struct scrub_block *sblock)
1939 {
1940         if (refcount_dec_and_test(&sblock->refs)) {
1941                 int i;
1942
1943                 if (sblock->sparity)
1944                         scrub_parity_put(sblock->sparity);
1945
1946                 for (i = 0; i < sblock->sector_count; i++)
1947                         scrub_sector_put(sblock->sectors[i]);
1948                 kfree(sblock);
1949         }
1950 }
1951
1952 static void scrub_sector_get(struct scrub_sector *sector)
1953 {
1954         atomic_inc(&sector->refs);
1955 }
1956
1957 static void scrub_sector_put(struct scrub_sector *sector)
1958 {
1959         if (atomic_dec_and_test(&sector->refs)) {
1960                 if (sector->page)
1961                         __free_page(sector->page);
1962                 kfree(sector);
1963         }
1964 }
1965
1966 /*
1967  * Throttling of IO submission, bandwidth-limit based; the timeslice is 1 second.
1968  * The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1969  */
1970 static void scrub_throttle(struct scrub_ctx *sctx)
1971 {
1972         const int time_slice = 1000;
1973         struct scrub_bio *sbio;
1974         struct btrfs_device *device;
1975         s64 delta;
1976         ktime_t now;
1977         u32 div;
1978         u64 bwlimit;
1979
1980         sbio = sctx->bios[sctx->curr];
1981         device = sbio->dev;
1982         bwlimit = READ_ONCE(device->scrub_speed_max);
1983         if (bwlimit == 0)
1984                 return;
1985
1986         /*
1987          * The slice is divided into intervals as the IO is submitted; the number
1988          * of intervals is derived from bwlimit and capped at 64.
1989          */
1990         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1991         div = min_t(u32, 64, div);
1992
1993         /* Start new epoch, set deadline */
1994         now = ktime_get();
1995         if (sctx->throttle_deadline == 0) {
1996                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1997                 sctx->throttle_sent = 0;
1998         }
1999
2000         /* Still within the time slice? */
2001         if (ktime_before(now, sctx->throttle_deadline)) {
2002                 /* If current bio is within the limit, send it */
2003                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2004                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2005                         return;
2006
2007                 /* We're over the limit, sleep until the end of the slice */
2008                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2009         } else {
2010                 /* New request after deadline, start new epoch */
2011                 delta = 0;
2012         }
2013
2014         if (delta) {
2015                 long timeout;
2016
2017                 timeout = div_u64(delta * HZ, 1000);
2018                 schedule_timeout_interruptible(timeout);
2019         }
2020
2021         /* Next call will start the deadline period */
2022         sctx->throttle_deadline = 0;
2023 }
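/*
 * Editor's note: worked example, not part of the original source.
 * With scrub_speed_max = 100 MiB/s: div = min(64, 100 MiB / 16 MiB) = 6,
 * so each epoch deadline is 1000 / 6 = 166 ms away and up to
 * bwlimit / div (roughly 17 MiB) of bio payload may be sent before the
 * thread sleeps until the deadline, giving ~100 MiB/s on average.
 */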
2024
2025 static void scrub_submit(struct scrub_ctx *sctx)
2026 {
2027         struct scrub_bio *sbio;
2028
2029         if (sctx->curr == -1)
2030                 return;
2031
2032         scrub_throttle(sctx);
2033
2034         sbio = sctx->bios[sctx->curr];
2035         sctx->curr = -1;
2036         scrub_pending_bio_inc(sctx);
2037         btrfsic_check_bio(sbio->bio);
2038         submit_bio(sbio->bio);
2039 }
2040
2041 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2042                                       struct scrub_sector *sector)
2043 {
2044         struct scrub_block *sblock = sector->sblock;
2045         struct scrub_bio *sbio;
2046         const u32 sectorsize = sctx->fs_info->sectorsize;
2047         int ret;
2048
2049 again:
2050         /*
2051          * grab a fresh bio or wait for one to become available
2052          */
2053         while (sctx->curr == -1) {
2054                 spin_lock(&sctx->list_lock);
2055                 sctx->curr = sctx->first_free;
2056                 if (sctx->curr != -1) {
2057                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2058                         sctx->bios[sctx->curr]->next_free = -1;
2059                         sctx->bios[sctx->curr]->sector_count = 0;
2060                         spin_unlock(&sctx->list_lock);
2061                 } else {
2062                         spin_unlock(&sctx->list_lock);
2063                         wait_event(sctx->list_wait, sctx->first_free != -1);
2064                 }
2065         }
2066         sbio = sctx->bios[sctx->curr];
2067         if (sbio->sector_count == 0) {
2068                 sbio->physical = sector->physical;
2069                 sbio->logical = sector->logical;
2070                 sbio->dev = sector->dev;
2071                 if (!sbio->bio) {
2072                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2073                                               REQ_OP_READ, GFP_NOFS);
2074                 }
2075                 sbio->bio->bi_private = sbio;
2076                 sbio->bio->bi_end_io = scrub_bio_end_io;
2077                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2078                 sbio->status = 0;
2079         } else if (sbio->physical + sbio->sector_count * sectorsize !=
2080                    sector->physical ||
2081                    sbio->logical + sbio->sector_count * sectorsize !=
2082                    sector->logical ||
2083                    sbio->dev != sector->dev) {
2084                 scrub_submit(sctx);
2085                 goto again;
2086         }
2087
2088         sbio->sectors[sbio->sector_count] = sector;
2089         ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2090         if (ret != sectorsize) {
2091                 if (sbio->sector_count < 1) {
2092                         bio_put(sbio->bio);
2093                         sbio->bio = NULL;
2094                         return -EIO;
2095                 }
2096                 scrub_submit(sctx);
2097                 goto again;
2098         }
2099
2100         scrub_block_get(sblock); /* one for the page added to the bio */
2101         atomic_inc(&sblock->outstanding_sectors);
2102         sbio->sector_count++;
2103         if (sbio->sector_count == sctx->sectors_per_bio)
2104                 scrub_submit(sctx);
2105
2106         return 0;
2107 }
2108
2109 static void scrub_missing_raid56_end_io(struct bio *bio)
2110 {
2111         struct scrub_block *sblock = bio->bi_private;
2112         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2113
2114         if (bio->bi_status)
2115                 sblock->no_io_error_seen = 0;
2116
2117         bio_put(bio);
2118
2119         queue_work(fs_info->scrub_workers, &sblock->work);
2120 }
2121
2122 static void scrub_missing_raid56_worker(struct work_struct *work)
2123 {
2124         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2125         struct scrub_ctx *sctx = sblock->sctx;
2126         struct btrfs_fs_info *fs_info = sctx->fs_info;
2127         u64 logical;
2128         struct btrfs_device *dev;
2129
2130         logical = sblock->sectors[0]->logical;
2131         dev = sblock->sectors[0]->dev;
2132
2133         if (sblock->no_io_error_seen)
2134                 scrub_recheck_block_checksum(sblock);
2135
2136         if (!sblock->no_io_error_seen) {
2137                 spin_lock(&sctx->stat_lock);
2138                 sctx->stat.read_errors++;
2139                 spin_unlock(&sctx->stat_lock);
2140                 btrfs_err_rl_in_rcu(fs_info,
2141                         "IO error rebuilding logical %llu for dev %s",
2142                         logical, rcu_str_deref(dev->name));
2143         } else if (sblock->header_error || sblock->checksum_error) {
2144                 spin_lock(&sctx->stat_lock);
2145                 sctx->stat.uncorrectable_errors++;
2146                 spin_unlock(&sctx->stat_lock);
2147                 btrfs_err_rl_in_rcu(fs_info,
2148                         "failed to rebuild valid logical %llu for dev %s",
2149                         logical, rcu_str_deref(dev->name));
2150         } else {
2151                 scrub_write_block_to_dev_replace(sblock);
2152         }
2153
2154         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2155                 mutex_lock(&sctx->wr_lock);
2156                 scrub_wr_submit(sctx);
2157                 mutex_unlock(&sctx->wr_lock);
2158         }
2159
2160         scrub_block_put(sblock);
2161         scrub_pending_bio_dec(sctx);
2162 }
2163
2164 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2165 {
2166         struct scrub_ctx *sctx = sblock->sctx;
2167         struct btrfs_fs_info *fs_info = sctx->fs_info;
2168         u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2169         u64 logical = sblock->sectors[0]->logical;
2170         struct btrfs_io_context *bioc = NULL;
2171         struct bio *bio;
2172         struct btrfs_raid_bio *rbio;
2173         int ret;
2174         int i;
2175
2176         btrfs_bio_counter_inc_blocked(fs_info);
2177         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2178                                &length, &bioc);
2179         if (ret || !bioc || !bioc->raid_map)
2180                 goto bioc_out;
2181
2182         if (WARN_ON(!sctx->is_dev_replace ||
2183                     !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2184                 /*
2185                  * We shouldn't be scrubbing a missing device. Even for dev
2186                  * replace, we should only get here for RAID 5/6. We either
2187                  * managed to mount something with no mirrors remaining or
2188                  * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2189                  */
2190                 goto bioc_out;
2191         }
2192
2193         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2194         bio->bi_iter.bi_sector = logical >> 9;
2195         bio->bi_private = sblock;
2196         bio->bi_end_io = scrub_missing_raid56_end_io;
2197
2198         rbio = raid56_alloc_missing_rbio(bio, bioc, length);
2199         if (!rbio)
2200                 goto rbio_out;
2201
2202         for (i = 0; i < sblock->sector_count; i++) {
2203                 struct scrub_sector *sector = sblock->sectors[i];
2204
2205                 /*
2206                  * For now, our scrub is still one page per sector, so pgoff
2207                  * is always 0.
2208                  */
2209                 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2210         }
2211
2212         INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2213         scrub_block_get(sblock);
2214         scrub_pending_bio_inc(sctx);
2215         raid56_submit_missing_rbio(rbio);
2216         return;
2217
2218 rbio_out:
2219         bio_put(bio);
2220 bioc_out:
2221         btrfs_bio_counter_dec(fs_info);
2222         btrfs_put_bioc(bioc);
2223         spin_lock(&sctx->stat_lock);
2224         sctx->stat.malloc_errors++;
2225         spin_unlock(&sctx->stat_lock);
2226 }
2227
2228 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2229                        u64 physical, struct btrfs_device *dev, u64 flags,
2230                        u64 gen, int mirror_num, u8 *csum,
2231                        u64 physical_for_dev_replace)
2232 {
2233         struct scrub_block *sblock;
2234         const u32 sectorsize = sctx->fs_info->sectorsize;
2235         int index;
2236
2237         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2238         if (!sblock) {
2239                 spin_lock(&sctx->stat_lock);
2240                 sctx->stat.malloc_errors++;
2241                 spin_unlock(&sctx->stat_lock);
2242                 return -ENOMEM;
2243         }
2244
2245         /* one ref inside this function, plus one for each page added to
2246          * a bio later on */
2247         refcount_set(&sblock->refs, 1);
2248         sblock->sctx = sctx;
2249         sblock->no_io_error_seen = 1;
2250
2251         for (index = 0; len > 0; index++) {
2252                 struct scrub_sector *sector;
2253                 /*
2254                  * Here we will allocate one page for one sector to scrub.
2255                  * This is fine if PAGE_SIZE == sectorsize, but will cost
2256                  * more memory in the PAGE_SIZE > sectorsize case.
2257                  */
2258                 u32 l = min(sectorsize, len);
2259
2260                 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2261                 if (!sector) {
2262 leave_nomem:
2263                         spin_lock(&sctx->stat_lock);
2264                         sctx->stat.malloc_errors++;
2265                         spin_unlock(&sctx->stat_lock);
2266                         scrub_block_put(sblock);
2267                         return -ENOMEM;
2268                 }
2269                 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2270                 scrub_sector_get(sector);
2271                 sblock->sectors[index] = sector;
2272                 sector->sblock = sblock;
2273                 sector->dev = dev;
2274                 sector->flags = flags;
2275                 sector->generation = gen;
2276                 sector->logical = logical;
2277                 sector->physical = physical;
2278                 sector->physical_for_dev_replace = physical_for_dev_replace;
2279                 sector->mirror_num = mirror_num;
2280                 if (csum) {
2281                         sector->have_csum = 1;
2282                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2283                 } else {
2284                         sector->have_csum = 0;
2285                 }
2286                 sblock->sector_count++;
2287                 sector->page = alloc_page(GFP_KERNEL);
2288                 if (!sector->page)
2289                         goto leave_nomem;
2290                 len -= l;
2291                 logical += l;
2292                 physical += l;
2293                 physical_for_dev_replace += l;
2294         }
2295
2296         WARN_ON(sblock->sector_count == 0);
2297         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2298                 /*
2299                  * This case should only be hit for RAID 5/6 device replace. See
2300                  * the comment in scrub_missing_raid56_pages() for details.
2301                  */
2302                 scrub_missing_raid56_pages(sblock);
2303         } else {
2304                 for (index = 0; index < sblock->sector_count; index++) {
2305                         struct scrub_sector *sector = sblock->sectors[index];
2306                         int ret;
2307
2308                         ret = scrub_add_sector_to_rd_bio(sctx, sector);
2309                         if (ret) {
2310                                 scrub_block_put(sblock);
2311                                 return ret;
2312                         }
2313                 }
2314
2315                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2316                         scrub_submit(sctx);
2317         }
2318
2319         /* Last one frees, either here or in bio completion for the last sector */
2320         scrub_block_put(sblock);
2321         return 0;
2322 }
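/*
 * Editor's note: illustrative example, not part of the original source.
 * With a 4 KiB sectorsize, a 64 KiB chunk passed in here (e.g. a
 * metadata block with a 64 KiB nodesize, or one RAID5/6 stripe_len of
 * data) becomes a single scrub_block with 16 scrub_sectors, each backed
 * by its own page; every sector is queued via
 * scrub_add_sector_to_rd_bio() and the block is checksummed or repaired
 * once all of its reads complete.
 */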
2323
2324 static void scrub_bio_end_io(struct bio *bio)
2325 {
2326         struct scrub_bio *sbio = bio->bi_private;
2327         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2328
2329         sbio->status = bio->bi_status;
2330         sbio->bio = bio;
2331
2332         queue_work(fs_info->scrub_workers, &sbio->work);
2333 }
2334
2335 static void scrub_bio_end_io_worker(struct work_struct *work)
2336 {
2337         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2338         struct scrub_ctx *sctx = sbio->sctx;
2339         int i;
2340
2341         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2342         if (sbio->status) {
2343                 for (i = 0; i < sbio->sector_count; i++) {
2344                         struct scrub_sector *sector = sbio->sectors[i];
2345
2346                         sector->io_error = 1;
2347                         sector->sblock->no_io_error_seen = 0;
2348                 }
2349         }
2350
2351         /* Now complete the scrub_block items that have all pages completed */
2352         for (i = 0; i < sbio->sector_count; i++) {
2353                 struct scrub_sector *sector = sbio->sectors[i];
2354                 struct scrub_block *sblock = sector->sblock;
2355
2356                 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2357                         scrub_block_complete(sblock);
2358                 scrub_block_put(sblock);
2359         }
2360
2361         bio_put(sbio->bio);
2362         sbio->bio = NULL;
2363         spin_lock(&sctx->list_lock);
2364         sbio->next_free = sctx->first_free;
2365         sctx->first_free = sbio->index;
2366         spin_unlock(&sctx->list_lock);
2367
2368         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2369                 mutex_lock(&sctx->wr_lock);
2370                 scrub_wr_submit(sctx);
2371                 mutex_unlock(&sctx->wr_lock);
2372         }
2373
2374         scrub_pending_bio_dec(sctx);
2375 }
2376
2377 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2378                                        unsigned long *bitmap,
2379                                        u64 start, u32 len)
2380 {
2381         u64 offset;
2382         u32 nsectors;
2383         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2384
2385         if (len >= sparity->stripe_len) {
2386                 bitmap_set(bitmap, 0, sparity->nsectors);
2387                 return;
2388         }
2389
2390         start -= sparity->logic_start;
2391         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2392         offset = offset >> sectorsize_bits;
2393         nsectors = len >> sectorsize_bits;
2394
2395         if (offset + nsectors <= sparity->nsectors) {
2396                 bitmap_set(bitmap, offset, nsectors);
2397                 return;
2398         }
2399
2400         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2401         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2402 }
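/*
 * Editor's note: worked example, not part of the original source,
 * assuming stripe_len = 64 KiB and sectorsize = 4 KiB (nsectors = 16).
 * A range starting at the stripe's sector 14 with len = 4 sectors wraps
 * around: bitmap_set(bitmap, 14, 2) marks sectors 14-15 and
 * bitmap_set(bitmap, 0, 2) marks sectors 0-1 of the next rotation.
 */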
2403
2404 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2405                                                    u64 start, u32 len)
2406 {
2407         __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2408 }
2409
2410 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2411                                                   u64 start, u32 len)
2412 {
2413         __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2414 }
2415
2416 static void scrub_block_complete(struct scrub_block *sblock)
2417 {
2418         int corrupted = 0;
2419
2420         if (!sblock->no_io_error_seen) {
2421                 corrupted = 1;
2422                 scrub_handle_errored_block(sblock);
2423         } else {
2424                 /*
2425                  * If the block has a checksum error, it is written out via the
2426                  * repair mechanism in the dev-replace case; otherwise it is
2427                  * written here (also only for dev-replace).
2428                  */
2429                 corrupted = scrub_checksum(sblock);
2430                 if (!corrupted && sblock->sctx->is_dev_replace)
2431                         scrub_write_block_to_dev_replace(sblock);
2432         }
2433
2434         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2435                 u64 start = sblock->sectors[0]->logical;
2436                 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2437                           sblock->sctx->fs_info->sectorsize;
2438
2439                 ASSERT(end - start <= U32_MAX);
2440                 scrub_parity_mark_sectors_error(sblock->sparity,
2441                                                 start, end - start);
2442         }
2443 }
2444
2445 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2446 {
2447         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2448         list_del(&sum->list);
2449         kfree(sum);
2450 }
2451
2452 /*
2453  * Find the desired csum for range [logical, logical + sectorsize), and store
2454  * the csum into @csum.
2455  *
2456  * The search source is sctx->csum_list, which is a pre-populated list
2457  * storing bytenr-ordered csum ranges.  We're responsible for cleaning up any
2458  * range that is before @logical.
2459  *
2460  * Return 0 if there is no csum for the range.
2461  * Return 1 if there is csum for the range and copied to @csum.
2462  */
2463 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2464 {
2465         bool found = false;
2466
2467         while (!list_empty(&sctx->csum_list)) {
2468                 struct btrfs_ordered_sum *sum = NULL;
2469                 unsigned long index;
2470                 unsigned long num_sectors;
2471
2472                 sum = list_first_entry(&sctx->csum_list,
2473                                        struct btrfs_ordered_sum, list);
2474                 /* The current csum range is beyond our range, no csum found */
2475                 if (sum->bytenr > logical)
2476                         break;
2477
2478                 /*
2479                  * The current sum is before our bytenr. Since scrub is always
2480                  * done in bytenr order, this csum will never be used again;
2481                  * clean it up so that later calls won't bother with the range,
2482                  * and continue searching the next range.
2483                  */
2484                 if (sum->bytenr + sum->len <= logical) {
2485                         drop_csum_range(sctx, sum);
2486                         continue;
2487                 }
2488
2489                 /* Now the csum range covers our bytenr, copy the csum */
2490                 found = true;
2491                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2492                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2493
2494                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2495                        sctx->fs_info->csum_size);
2496
2497                 /* Cleanup the range if we're at the end of the csum range */
2498                 if (index == num_sectors - 1)
2499                         drop_csum_range(sctx, sum);
2500                 break;
2501         }
2502         if (!found)
2503                 return 0;
2504         return 1;
2505 }
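/*
 * Editor's note: worked example, not part of the original source,
 * assuming sectorsize = 4 KiB and crc32c (csum_size = 4). For an
 * ordered sum with bytenr = 1 MiB and len = 64 KiB, a lookup at
 * logical = 1 MiB + 16 KiB yields index = 4 and num_sectors = 16, so
 * 4 bytes are copied from sum->sums + 16; the entry is only dropped
 * once index reaches 15, i.e. the last sector of the range.
 */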
2506
2507 /* scrub extent tries to collect up to 64 kB for each bio */
2508 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2509                         u64 logical, u32 len,
2510                         u64 physical, struct btrfs_device *dev, u64 flags,
2511                         u64 gen, int mirror_num)
2512 {
2513         struct btrfs_device *src_dev = dev;
2514         u64 src_physical = physical;
2515         int src_mirror = mirror_num;
2516         int ret;
2517         u8 csum[BTRFS_CSUM_SIZE];
2518         u32 blocksize;
2519
2520         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2521                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522                         blocksize = map->stripe_len;
2523                 else
2524                         blocksize = sctx->fs_info->sectorsize;
2525                 spin_lock(&sctx->stat_lock);
2526                 sctx->stat.data_extents_scrubbed++;
2527                 sctx->stat.data_bytes_scrubbed += len;
2528                 spin_unlock(&sctx->stat_lock);
2529         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2530                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2531                         blocksize = map->stripe_len;
2532                 else
2533                         blocksize = sctx->fs_info->nodesize;
2534                 spin_lock(&sctx->stat_lock);
2535                 sctx->stat.tree_extents_scrubbed++;
2536                 sctx->stat.tree_bytes_scrubbed += len;
2537                 spin_unlock(&sctx->stat_lock);
2538         } else {
2539                 blocksize = sctx->fs_info->sectorsize;
2540                 WARN_ON(1);
2541         }
2542
2543         /*
2544          * In the dev-replace case, @dev can be a missing device.
2545          * Regular scrub avoids running on a missing device at all,
2546          * as that would trigger tons of read errors.
2547          *
2548          * Reading from a missing device would only increase the read
2549          * error counts unnecessarily.
2550          * So here we change the read source to a good mirror.
2551          */
2552         if (sctx->is_dev_replace && !dev->bdev)
2553                 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2554                                      &src_dev, &src_mirror);
2555         while (len) {
2556                 u32 l = min(len, blocksize);
2557                 int have_csum = 0;
2558
2559                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2560                         /* push csums to sbio */
2561                         have_csum = scrub_find_csum(sctx, logical, csum);
2562                         if (have_csum == 0)
2563                                 ++sctx->stat.no_csum;
2564                 }
2565                 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2566                                     flags, gen, src_mirror,
2567                                     have_csum ? csum : NULL, physical);
2568                 if (ret)
2569                         return ret;
2570                 len -= l;
2571                 logical += l;
2572                 physical += l;
2573                 src_physical += l;
2574         }
2575         return 0;
2576 }
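/*
 * Editor's note: illustrative example, not part of the original source.
 * On RAID5/6 the split granularity (blocksize) is map->stripe_len, so a
 * 128 KiB data extent with a 64 KiB stripe_len is handed to
 * scrub_sectors() in two 64 KiB pieces, each preceded by its own
 * scrub_find_csum() lookup at the piece's starting logical address; on
 * other profiles data is split per sectorsize and metadata per nodesize.
 */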
2577
2578 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2579                                   u64 logical, u32 len,
2580                                   u64 physical, struct btrfs_device *dev,
2581                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2582 {
2583         struct scrub_ctx *sctx = sparity->sctx;
2584         struct scrub_block *sblock;
2585         const u32 sectorsize = sctx->fs_info->sectorsize;
2586         int index;
2587
2588         ASSERT(IS_ALIGNED(len, sectorsize));
2589
2590         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2591         if (!sblock) {
2592                 spin_lock(&sctx->stat_lock);
2593                 sctx->stat.malloc_errors++;
2594                 spin_unlock(&sctx->stat_lock);
2595                 return -ENOMEM;
2596         }
2597
2598         /* one ref inside this function, plus one for each page added to
2599          * a bio later on */
2600         refcount_set(&sblock->refs, 1);
2601         sblock->sctx = sctx;
2602         sblock->no_io_error_seen = 1;
2603         sblock->sparity = sparity;
2604         scrub_parity_get(sparity);
2605
2606         for (index = 0; len > 0; index++) {
2607                 struct scrub_sector *sector;
2608
2609                 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2610                 if (!sector) {
2611 leave_nomem:
2612                         spin_lock(&sctx->stat_lock);
2613                         sctx->stat.malloc_errors++;
2614                         spin_unlock(&sctx->stat_lock);
2615                         scrub_block_put(sblock);
2616                         return -ENOMEM;
2617                 }
2618                 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2619                 /* For scrub block */
2620                 scrub_sector_get(sector);
2621                 sblock->sectors[index] = sector;
2622                 /* For scrub parity */
2623                 scrub_sector_get(sector);
2624                 list_add_tail(&sector->list, &sparity->sectors_list);
2625                 sector->sblock = sblock;
2626                 sector->dev = dev;
2627                 sector->flags = flags;
2628                 sector->generation = gen;
2629                 sector->logical = logical;
2630                 sector->physical = physical;
2631                 sector->mirror_num = mirror_num;
2632                 if (csum) {
2633                         sector->have_csum = 1;
2634                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2635                 } else {
2636                         sector->have_csum = 0;
2637                 }
2638                 sblock->sector_count++;
2639                 sector->page = alloc_page(GFP_KERNEL);
2640                 if (!sector->page)
2641                         goto leave_nomem;
2642
2643
2644                 /* Iterate over the stripe range in sectorsize steps */
2645                 len -= sectorsize;
2646                 logical += sectorsize;
2647                 physical += sectorsize;
2648         }
2649
2650         WARN_ON(sblock->sector_count == 0);
2651         for (index = 0; index < sblock->sector_count; index++) {
2652                 struct scrub_sector *sector = sblock->sectors[index];
2653                 int ret;
2654
2655                 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2656                 if (ret) {
2657                         scrub_block_put(sblock);
2658                         return ret;
2659                 }
2660         }
2661
2662         /* Last one frees, either here or in bio completion for last sector */
2663         scrub_block_put(sblock);
2664         return 0;
2665 }
2666
2667 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2668                                    u64 logical, u32 len,
2669                                    u64 physical, struct btrfs_device *dev,
2670                                    u64 flags, u64 gen, int mirror_num)
2671 {
2672         struct scrub_ctx *sctx = sparity->sctx;
2673         int ret;
2674         u8 csum[BTRFS_CSUM_SIZE];
2675         u32 blocksize;
2676
2677         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2678                 scrub_parity_mark_sectors_error(sparity, logical, len);
2679                 return 0;
2680         }
2681
2682         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2683                 blocksize = sparity->stripe_len;
2684         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2685                 blocksize = sparity->stripe_len;
2686         } else {
2687                 blocksize = sctx->fs_info->sectorsize;
2688                 WARN_ON(1);
2689         }
2690
2691         while (len) {
2692                 u32 l = min(len, blocksize);
2693                 int have_csum = 0;
2694
2695                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2696                         /* push csums to sbio */
2697                         have_csum = scrub_find_csum(sctx, logical, csum);
2698                         if (have_csum == 0)
2699                                 goto skip;
2700                 }
2701                 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2702                                              flags, gen, mirror_num,
2703                                              have_csum ? csum : NULL);
2704                 if (ret)
2705                         return ret;
2706 skip:
2707                 len -= l;
2708                 logical += l;
2709                 physical += l;
2710         }
2711         return 0;
2712 }
2713
2714 /*
2715  * Given a physical address, this will calculate its
2716  * logical offset. If this is a parity stripe, it will return
2717  * the leftmost data stripe's logical offset.
2718  *
2719  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2720  */
2721 static int get_raid56_logic_offset(u64 physical, int num,
2722                                    struct map_lookup *map, u64 *offset,
2723                                    u64 *stripe_start)
2724 {
2725         int i;
2726         int j = 0;
2727         u64 stripe_nr;
2728         u64 last_offset;
2729         u32 stripe_index;
2730         u32 rot;
2731         const int data_stripes = nr_data_stripes(map);
2732
2733         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2734         if (stripe_start)
2735                 *stripe_start = last_offset;
2736
2737         *offset = last_offset;
2738         for (i = 0; i < data_stripes; i++) {
2739                 *offset = last_offset + i * map->stripe_len;
2740
2741                 stripe_nr = div64_u64(*offset, map->stripe_len);
2742                 stripe_nr = div_u64(stripe_nr, data_stripes);
2743
2744                 /* Work out the disk rotation on this stripe-set */
2745                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2746                 /* Calculate which stripe this data is located on */
2747                 rot += i;
2748                 stripe_index = rot % map->num_stripes;
2749                 if (stripe_index == num)
2750                         return 0;
2751                 if (stripe_index < num)
2752                         j++;
2753         }
2754         *offset = last_offset + j * map->stripe_len;
2755         return 1;
2756 }
2757
2758 static void scrub_free_parity(struct scrub_parity *sparity)
2759 {
2760         struct scrub_ctx *sctx = sparity->sctx;
2761         struct scrub_sector *curr, *next;
2762         int nbits;
2763
2764         nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2765         if (nbits) {
2766                 spin_lock(&sctx->stat_lock);
2767                 sctx->stat.read_errors += nbits;
2768                 sctx->stat.uncorrectable_errors += nbits;
2769                 spin_unlock(&sctx->stat_lock);
2770         }
2771
2772         list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2773                 list_del_init(&curr->list);
2774                 scrub_sector_put(curr);
2775         }
2776
2777         kfree(sparity);
2778 }
2779
2780 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2781 {
2782         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2783                                                     work);
2784         struct scrub_ctx *sctx = sparity->sctx;
2785
2786         scrub_free_parity(sparity);
2787         scrub_pending_bio_dec(sctx);
2788 }
2789
2790 static void scrub_parity_bio_endio(struct bio *bio)
2791 {
2792         struct scrub_parity *sparity = bio->bi_private;
2793         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2794
2795         if (bio->bi_status)
2796                 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2797                           &sparity->dbitmap, sparity->nsectors);
2798
2799         bio_put(bio);
2800
2801         INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2802         queue_work(fs_info->scrub_parity_workers, &sparity->work);
2803 }
2804
2805 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2806 {
2807         struct scrub_ctx *sctx = sparity->sctx;
2808         struct btrfs_fs_info *fs_info = sctx->fs_info;
2809         struct bio *bio;
2810         struct btrfs_raid_bio *rbio;
2811         struct btrfs_io_context *bioc = NULL;
2812         u64 length;
2813         int ret;
2814
2815         if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2816                            &sparity->ebitmap, sparity->nsectors))
2817                 goto out;
2818
2819         length = sparity->logic_end - sparity->logic_start;
2820
2821         btrfs_bio_counter_inc_blocked(fs_info);
2822         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2823                                &length, &bioc);
2824         if (ret || !bioc || !bioc->raid_map)
2825                 goto bioc_out;
2826
2827         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2828         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2829         bio->bi_private = sparity;
2830         bio->bi_end_io = scrub_parity_bio_endio;
2831
2832         rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
2833                                               sparity->scrub_dev,
2834                                               &sparity->dbitmap,
2835                                               sparity->nsectors);
2836         if (!rbio)
2837                 goto rbio_out;
2838
2839         scrub_pending_bio_inc(sctx);
2840         raid56_parity_submit_scrub_rbio(rbio);
2841         return;
2842
2843 rbio_out:
2844         bio_put(bio);
2845 bioc_out:
2846         btrfs_bio_counter_dec(fs_info);
2847         btrfs_put_bioc(bioc);
2848         bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2849                   sparity->nsectors);
2850         spin_lock(&sctx->stat_lock);
2851         sctx->stat.malloc_errors++;
2852         spin_unlock(&sctx->stat_lock);
2853 out:
2854         scrub_free_parity(sparity);
2855 }
2856
2857 static void scrub_parity_get(struct scrub_parity *sparity)
2858 {
2859         refcount_inc(&sparity->refs);
2860 }
2861
2862 static void scrub_parity_put(struct scrub_parity *sparity)
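/*
 * Dropping the last reference kicks off the actual parity check: sectors
 * marked in @dbitmap (sectors that have data, set via
 * scrub_parity_mark_sectors_data()) but not in @ebitmap (sectors where read
 * errors were hit) are handed to the RAID56 layer by
 * scrub_parity_check_and_repair() so the parity covering them can be
 * verified and, if needed, rewritten.
 */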
2863 {
2864         if (!refcount_dec_and_test(&sparity->refs))
2865                 return;
2866
2867         scrub_parity_check_and_repair(sparity);
2868 }
2869
2870 /*
2871  * Return 0 if the extent item range covers any byte of the range.
2872  * Return <0 if the extent item is before @search_start.
2873  * Return >0 if the extent item is after @search_start + @search_len.
2874  */
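/*
 * For example (illustrative values): with an EXTENT_ITEM keyed
 * (objectid = 16M, offset = 128K), i.e. the extent [16M, 16M + 128K):
 * a search range of [16M + 64K, 16M + 128K) overlaps it and yields 0,
 * a search range starting at 16M + 128K or later yields -1, and a search
 * range ending at or before 16M yields 1.
 */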
2875 static int compare_extent_item_range(struct btrfs_path *path,
2876                                      u64 search_start, u64 search_len)
2877 {
2878         struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2879         u64 len;
2880         struct btrfs_key key;
2881
2882         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2883         ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2884                key.type == BTRFS_METADATA_ITEM_KEY);
2885         if (key.type == BTRFS_METADATA_ITEM_KEY)
2886                 len = fs_info->nodesize;
2887         else
2888                 len = key.offset;
2889
2890         if (key.objectid + len <= search_start)
2891                 return -1;
2892         if (key.objectid >= search_start + search_len)
2893                 return 1;
2894         return 0;
2895 }
2896
2897 /*
2898  * Locate one extent item which covers any byte in range
2899  * [@search_start, @search_start + @search_len)
2900  *
2901  * If the path is not initialized, we will initialize the search by doing
2902  * a btrfs_search_slot().
2903  * If the path is already initialized, we will use the path as the initial
2904  * slot, to avoid duplicated btrfs_search_slot() calls.
2905  *
2906  * NOTE: If an extent item starts before @search_start, we will still
2907  * return the extent item. This is for data extent crossing stripe boundary.
2908  *
2909  * Return 0 if we found such extent item, and @path will point to the extent item.
2910  * Return >0 if no such extent item can be found, and @path will be released.
2911  * Return <0 if we hit a fatal error, and @path will be released.
2912  */
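/*
 * A minimal usage sketch (variable names are illustrative, modelled on
 * scrub_simple_mirror() further below); remember that a return value > 0
 * means no more extent items and < 0 means a fatal error:
 *
 *        struct btrfs_path path = { 0 };
 *        u64 cur = range_start;
 *
 *        path.search_commit_root = 1;
 *        path.skip_locking = 1;
 *        while (cur < range_start + range_len) {
 *                ret = find_first_extent_item(extent_root, &path, cur,
 *                                             range_start + range_len - cur);
 *                if (ret)
 *                        break;
 *                get_extent_info(&path, &extent_start, &extent_size,
 *                                &extent_flags, &extent_gen);
 *                cur = max(extent_start, cur);
 *                ... scrub up to min(extent_start + extent_size, range end) ...
 *                cur += scrubbed_len;
 *        }
 *        btrfs_release_path(&path);
 */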
2913 static int find_first_extent_item(struct btrfs_root *extent_root,
2914                                   struct btrfs_path *path,
2915                                   u64 search_start, u64 search_len)
2916 {
2917         struct btrfs_fs_info *fs_info = extent_root->fs_info;
2918         struct btrfs_key key;
2919         int ret;
2920
2921         /* Continue using the existing path */
2922         if (path->nodes[0])
2923                 goto search_forward;
2924
2925         if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2926                 key.type = BTRFS_METADATA_ITEM_KEY;
2927         else
2928                 key.type = BTRFS_EXTENT_ITEM_KEY;
2929         key.objectid = search_start;
2930         key.offset = (u64)-1;
2931
2932         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2933         if (ret < 0)
2934                 return ret;
2935
2936         ASSERT(ret > 0);
2937         /*
2938          * Here we intentionally pass 0 as @min_objectid, as there could be
2939          * an extent item starting before @search_start.
2940          */
2941         ret = btrfs_previous_extent_item(extent_root, path, 0);
2942         if (ret < 0)
2943                 return ret;
2944         /*
2945          * No matter whether we have found an extent item, the next loop will
2946          * properly do every check on the key.
2947          */
2948 search_forward:
2949         while (true) {
2950                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2951                 if (key.objectid >= search_start + search_len)
2952                         break;
2953                 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2954                     key.type != BTRFS_EXTENT_ITEM_KEY)
2955                         goto next;
2956
2957                 ret = compare_extent_item_range(path, search_start, search_len);
2958                 if (ret == 0)
2959                         return ret;
2960                 if (ret > 0)
2961                         break;
2962 next:
2963                 path->slots[0]++;
2964                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2965                         ret = btrfs_next_leaf(extent_root, path);
2966                         if (ret) {
2967                                 /* Either no more items or a fatal error */
2968                                 btrfs_release_path(path);
2969                                 return ret;
2970                         }
2971                 }
2972         }
2973         btrfs_release_path(path);
2974         return 1;
2975 }
2976
2977 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2978                             u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2979 {
2980         struct btrfs_key key;
2981         struct btrfs_extent_item *ei;
2982
2983         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2984         ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2985                key.type == BTRFS_EXTENT_ITEM_KEY);
2986         *extent_start_ret = key.objectid;
2987         if (key.type == BTRFS_METADATA_ITEM_KEY)
2988                 *size_ret = path->nodes[0]->fs_info->nodesize;
2989         else
2990                 *size_ret = key.offset;
2991         ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2992         *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2993         *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
2994 }
2995
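/*
 * Return true if [@extent_start, @extent_start + @extent_len) crosses either
 * end of the boundary [@boundary_start, @boundary_start + @boundary_len).
 *
 * E.g. for a boundary of [64K, 128K): an extent [60K, 72K) crosses the
 * start, an extent [120K, 136K) crosses the end, while [64K, 128K) itself
 * crosses neither.
 */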
2996 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2997                                       u64 boundary_start, u64 boundary_len)
2998 {
2999         return (extent_start < boundary_start &&
3000                 extent_start + extent_len > boundary_start) ||
3001                (extent_start < boundary_start + boundary_len &&
3002                 extent_start + extent_len > boundary_start + boundary_len);
3003 }
3004
3005 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
3006                                                struct scrub_parity *sparity,
3007                                                struct map_lookup *map,
3008                                                struct btrfs_device *sdev,
3009                                                struct btrfs_path *path,
3010                                                u64 logical)
3011 {
3012         struct btrfs_fs_info *fs_info = sctx->fs_info;
3013         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3014         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3015         u64 cur_logical = logical;
3016         int ret;
3017
3018         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3019
3020         /* Path must not be populated */
3021         ASSERT(!path->nodes[0]);
3022
3023         while (cur_logical < logical + map->stripe_len) {
3024                 struct btrfs_io_context *bioc = NULL;
3025                 struct btrfs_device *extent_dev;
3026                 u64 extent_start;
3027                 u64 extent_size;
3028                 u64 mapped_length;
3029                 u64 extent_flags;
3030                 u64 extent_gen;
3031                 u64 extent_physical;
3032                 u64 extent_mirror_num;
3033
3034                 ret = find_first_extent_item(extent_root, path, cur_logical,
3035                                              logical + map->stripe_len - cur_logical);
3036                 /* No more extent item in this data stripe */
3037                 if (ret > 0) {
3038                         ret = 0;
3039                         break;
3040                 }
3041                 if (ret < 0)
3042                         break;
3043                 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3044                                 &extent_gen);
3045
3046                 /* Metadata should not cross stripe boundaries */
3047                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3048                     does_range_cross_boundary(extent_start, extent_size,
3049                                               logical, map->stripe_len)) {
3050                         btrfs_err(fs_info,
3051         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3052                                   extent_start, logical);
3053                         spin_lock(&sctx->stat_lock);
3054                         sctx->stat.uncorrectable_errors++;
3055                         spin_unlock(&sctx->stat_lock);
3056                         cur_logical += extent_size;
3057                         continue;
3058                 }
3059
3060                 /* Skip the hole range which doesn't have any extent */
3061                 cur_logical = max(extent_start, cur_logical);
3062
3063                 /* Truncate the range inside this data stripe */
3064                 extent_size = min(extent_start + extent_size,
3065                                   logical + map->stripe_len) - cur_logical;
3066                 extent_start = cur_logical;
3067                 ASSERT(extent_size <= U32_MAX);
3068
3069                 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3070
3071                 mapped_length = extent_size;
3072                 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3073                                       &mapped_length, &bioc, 0);
3074                 if (!ret && (!bioc || mapped_length < extent_size))
3075                         ret = -EIO;
3076                 if (ret) {
3077                         btrfs_put_bioc(bioc);
3078                         scrub_parity_mark_sectors_error(sparity, extent_start,
3079                                                         extent_size);
3080                         break;
3081                 }
3082                 extent_physical = bioc->stripes[0].physical;
3083                 extent_mirror_num = bioc->mirror_num;
3084                 extent_dev = bioc->stripes[0].dev;
3085                 btrfs_put_bioc(bioc);
3086
3087                 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3088                                                extent_start + extent_size - 1,
3089                                                &sctx->csum_list, 1);
3090                 if (ret) {
3091                         scrub_parity_mark_sectors_error(sparity, extent_start,
3092                                                         extent_size);
3093                         break;
3094                 }
3095
3096                 ret = scrub_extent_for_parity(sparity, extent_start,
3097                                               extent_size, extent_physical,
3098                                               extent_dev, extent_flags,
3099                                               extent_gen, extent_mirror_num);
3100                 scrub_free_csums(sctx);
3101
3102                 if (ret) {
3103                         scrub_parity_mark_sectors_error(sparity, extent_start,
3104                                                         extent_size);
3105                         break;
3106                 }
3107
3108                 cond_resched();
3109                 cur_logical += extent_size;
3110         }
3111         btrfs_release_path(path);
3112         return ret;
3113 }
3114
3115 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3116                                                   struct map_lookup *map,
3117                                                   struct btrfs_device *sdev,
3118                                                   u64 logic_start,
3119                                                   u64 logic_end)
3120 {
3121         struct btrfs_fs_info *fs_info = sctx->fs_info;
3122         struct btrfs_path *path;
3123         u64 cur_logical;
3124         int ret;
3125         struct scrub_parity *sparity;
3126         int nsectors;
3127
3128         path = btrfs_alloc_path();
3129         if (!path) {
3130                 spin_lock(&sctx->stat_lock);
3131                 sctx->stat.malloc_errors++;
3132                 spin_unlock(&sctx->stat_lock);
3133                 return -ENOMEM;
3134         }
3135         path->search_commit_root = 1;
3136         path->skip_locking = 1;
3137
3138         ASSERT(map->stripe_len <= U32_MAX);
3139         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3140         ASSERT(nsectors <= BITS_PER_LONG);
3141         sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3142         if (!sparity) {
3143                 spin_lock(&sctx->stat_lock);
3144                 sctx->stat.malloc_errors++;
3145                 spin_unlock(&sctx->stat_lock);
3146                 btrfs_free_path(path);
3147                 return -ENOMEM;
3148         }
3149
3150         ASSERT(map->stripe_len <= U32_MAX);
3151         sparity->stripe_len = map->stripe_len;
3152         sparity->nsectors = nsectors;
3153         sparity->sctx = sctx;
3154         sparity->scrub_dev = sdev;
3155         sparity->logic_start = logic_start;
3156         sparity->logic_end = logic_end;
3157         refcount_set(&sparity->refs, 1);
3158         INIT_LIST_HEAD(&sparity->sectors_list);
3159
3160         ret = 0;
3161         for (cur_logical = logic_start; cur_logical < logic_end;
3162              cur_logical += map->stripe_len) {
3163                 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3164                                                           sdev, path, cur_logical);
3165                 if (ret < 0)
3166                         break;
3167         }
3168
3169         scrub_parity_put(sparity);
3170         scrub_submit(sctx);
3171         mutex_lock(&sctx->wr_lock);
3172         scrub_wr_submit(sctx);
3173         mutex_unlock(&sctx->wr_lock);
3174
3175         btrfs_free_path(path);
3176         return ret < 0 ? ret : 0;
3177 }
3178
3179 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3180 {
3181         if (!btrfs_is_zoned(sctx->fs_info))
3182                 return;
3183
3184         sctx->flush_all_writes = true;
3185         scrub_submit(sctx);
3186         mutex_lock(&sctx->wr_lock);
3187         scrub_wr_submit(sctx);
3188         mutex_unlock(&sctx->wr_lock);
3189
3190         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3191 }
3192
3193 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3194                                         u64 physical, u64 physical_end)
3195 {
3196         struct btrfs_fs_info *fs_info = sctx->fs_info;
3197         int ret = 0;
3198
3199         if (!btrfs_is_zoned(fs_info))
3200                 return 0;
3201
3202         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3203
3204         mutex_lock(&sctx->wr_lock);
3205         if (sctx->write_pointer < physical_end) {
3206                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3207                                                     physical,
3208                                                     sctx->write_pointer);
3209                 if (ret)
3210                         btrfs_err(fs_info,
3211                                   "zoned: failed to recover write pointer");
3212         }
3213         mutex_unlock(&sctx->wr_lock);
3214         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3215
3216         return ret;
3217 }
3218
3219 /*
3220  * Scrub one range which can only have a simple mirror based profile.
3221  * (This includes all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3222  *  RAID0/RAID10).
3223  *
3224  * Since we may need to handle a subset of a block group, we need the
3225  * @logical_start and @logical_length parameters.
3226  */
3227 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3228                                struct btrfs_root *extent_root,
3229                                struct btrfs_root *csum_root,
3230                                struct btrfs_block_group *bg,
3231                                struct map_lookup *map,
3232                                u64 logical_start, u64 logical_length,
3233                                struct btrfs_device *device,
3234                                u64 physical, int mirror_num)
3235 {
3236         struct btrfs_fs_info *fs_info = sctx->fs_info;
3237         const u64 logical_end = logical_start + logical_length;
3238         /* An artificial limit, inherited from the old scrub behavior */
3239         const u32 max_length = SZ_64K;
3240         struct btrfs_path path = { 0 };
3241         u64 cur_logical = logical_start;
3242         int ret;
3243
3244         /* The range must be inside the bg */
3245         ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3246
3247         path.search_commit_root = 1;
3248         path.skip_locking = 1;
3249         /* Go through each extent item inside the logical range */
3250         while (cur_logical < logical_end) {
3251                 u64 extent_start;
3252                 u64 extent_len;
3253                 u64 extent_flags;
3254                 u64 extent_gen;
3255                 u64 scrub_len;
3256
3257                 /* Canceled? */
3258                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3259                     atomic_read(&sctx->cancel_req)) {
3260                         ret = -ECANCELED;
3261                         break;
3262                 }
3263                 /* Paused? */
3264                 if (atomic_read(&fs_info->scrub_pause_req)) {
3265                         /* Push queued extents */
3266                         sctx->flush_all_writes = true;
3267                         scrub_submit(sctx);
3268                         mutex_lock(&sctx->wr_lock);
3269                         scrub_wr_submit(sctx);
3270                         mutex_unlock(&sctx->wr_lock);
3271                         wait_event(sctx->list_wait,
3272                                    atomic_read(&sctx->bios_in_flight) == 0);
3273                         sctx->flush_all_writes = false;
3274                         scrub_blocked_if_needed(fs_info);
3275                 }
3276                 /* Block group removed? */
3277                 spin_lock(&bg->lock);
3278                 if (bg->removed) {
3279                         spin_unlock(&bg->lock);
3280                         ret = 0;
3281                         break;
3282                 }
3283                 spin_unlock(&bg->lock);
3284
3285                 ret = find_first_extent_item(extent_root, &path, cur_logical,
3286                                              logical_end - cur_logical);
3287                 if (ret > 0) {
3288                         /* No more extent, just update the accounting */
3289                         sctx->stat.last_physical = physical + logical_length;
3290                         ret = 0;
3291                         break;
3292                 }
3293                 if (ret < 0)
3294                         break;
3295                 get_extent_info(&path, &extent_start, &extent_len,
3296                                 &extent_flags, &extent_gen);
3297                 /* Skip the hole range which doesn't have any extent */
3298                 cur_logical = max(extent_start, cur_logical);
3299
3300                 /*
3301                  * Scrub len has three limits:
3302                  * - Extent size limit
3303                  * - Scrub range limit
3304                  *   This is especially important for RAID0/RAID10 to reuse
3305                  *   this function
3306                  * - Max scrub size limit
3307                  */
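                /*
                 * E.g. (illustrative numbers): with cur_logical = 1M, an
                 * extent covering [1M, 1M + 256K), logical_end = 2M and
                 * max_length = 64K, the three limits are 1M + 256K, 2M and
                 * 1M + 64K respectively, so scrub_len ends up as 64K.
                 */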
3308                 scrub_len = min(min(extent_start + extent_len,
3309                                     logical_end), cur_logical + max_length) -
3310                             cur_logical;
3311
3312                 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3313                         ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3314                                         cur_logical + scrub_len - 1,
3315                                         &sctx->csum_list, 1);
3316                         if (ret)
3317                                 break;
3318                 }
3319                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3320                     does_range_cross_boundary(extent_start, extent_len,
3321                                               logical_start, logical_length)) {
3322                         btrfs_err(fs_info,
3323 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3324                                   extent_start, logical_start, logical_end);
3325                         spin_lock(&sctx->stat_lock);
3326                         sctx->stat.uncorrectable_errors++;
3327                         spin_unlock(&sctx->stat_lock);
3328                         cur_logical += scrub_len;
3329                         continue;
3330                 }
3331                 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3332                                    cur_logical - logical_start + physical,
3333                                    device, extent_flags, extent_gen,
3334                                    mirror_num);
3335                 scrub_free_csums(sctx);
3336                 if (ret)
3337                         break;
3338                 if (sctx->is_dev_replace)
3339                         sync_replace_for_zoned(sctx);
3340                 cur_logical += scrub_len;
3341                 /* Don't hold the CPU for too long */
3342                 cond_resched();
3343         }
3344         btrfs_release_path(&path);
3345         return ret;
3346 }
3347
3348 /* Calculate the full stripe length for simple stripe based profiles */
3349 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3350 {
3351         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3352                             BTRFS_BLOCK_GROUP_RAID10));
3353
3354         return map->num_stripes / map->sub_stripes * map->stripe_len;
3355 }
3356
3357 /* Get the logical bytenr for the stripe */
3358 static u64 simple_stripe_get_logical(struct map_lookup *map,
3359                                      struct btrfs_block_group *bg,
3360                                      int stripe_index)
3361 {
3362         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3363                             BTRFS_BLOCK_GROUP_RAID10));
3364         ASSERT(stripe_index < map->num_stripes);
3365
3366         /*
3367          * (stripe_index / sub_stripes) gives how many data stripes we need to
3368          * skip.
3369          */
3370         return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3371 }
3372
3373 /* Get the mirror number for the stripe */
3374 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3375 {
3376         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3377                             BTRFS_BLOCK_GROUP_RAID10));
3378         ASSERT(stripe_index < map->num_stripes);
3379
3380         /* For RAID0 it's fixed to 1, for RAID10 it's 1,2,1,2,... */
3381         return stripe_index % map->sub_stripes + 1;
3382 }
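/*
 * Worked example for the three helpers above (illustrative numbers): for
 * RAID10 with num_stripes = 4, sub_stripes = 2 and a 64KiB stripe_len, the
 * full stripe length is 4 / 2 * 64K = 128K.  For stripe_index = 3 the
 * helpers give logical = bg->start + (3 / 2) * 64K = bg->start + 64K and
 * mirror_num = 3 % 2 + 1 = 2.
 */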
3383
3384 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3385                                struct btrfs_root *extent_root,
3386                                struct btrfs_root *csum_root,
3387                                struct btrfs_block_group *bg,
3388                                struct map_lookup *map,
3389                                struct btrfs_device *device,
3390                                int stripe_index)
3391 {
3392         const u64 logical_increment = simple_stripe_full_stripe_len(map);
3393         const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3394         const u64 orig_physical = map->stripes[stripe_index].physical;
3395         const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3396         u64 cur_logical = orig_logical;
3397         u64 cur_physical = orig_physical;
3398         int ret = 0;
3399
3400         while (cur_logical < bg->start + bg->length) {
3401                 /*
3402                  * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3403                  * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3404                  * this stripe.
3405                  */
3406                 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3407                                           cur_logical, map->stripe_len, device,
3408                                           cur_physical, mirror_num);
3409                 if (ret)
3410                         return ret;
3411                 /* Skip to next stripe which belongs to the target device */
3412                 cur_logical += logical_increment;
3413                 /* For physical offset, we just go to next stripe */
3414                 cur_physical += map->stripe_len;
3415         }
3416         return ret;
3417 }
3418
3419 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3420                                            struct btrfs_block_group *bg,
3421                                            struct map_lookup *map,
3422                                            struct btrfs_device *scrub_dev,
3423                                            int stripe_index, u64 dev_extent_len)
3424 {
3425         struct btrfs_path *path;
3426         struct btrfs_fs_info *fs_info = sctx->fs_info;
3427         struct btrfs_root *root;
3428         struct btrfs_root *csum_root;
3429         struct blk_plug plug;
3430         const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3431         const u64 chunk_logical = bg->start;
3432         int ret;
3433         u64 physical = map->stripes[stripe_index].physical;
3434         const u64 physical_end = physical + dev_extent_len;
3435         u64 logical;
3436         u64 logic_end;
3437         /* The logical increment after finishing one stripe */
3438         u64 increment;
3439         /* Offset inside the chunk */
3440         u64 offset;
3441         u64 stripe_logical;
3442         u64 stripe_end;
3443         int stop_loop = 0;
3444
3445         path = btrfs_alloc_path();
3446         if (!path)
3447                 return -ENOMEM;
3448
3449         /*
3450          * Work on the commit root. The related disk blocks are static as
3451          * long as COW is applied. This means it is safe to rewrite
3452          * them to repair disk errors without any race conditions.
3453          */
3454         path->search_commit_root = 1;
3455         path->skip_locking = 1;
3456         path->reada = READA_FORWARD;
3457
3458         wait_event(sctx->list_wait,
3459                    atomic_read(&sctx->bios_in_flight) == 0);
3460         scrub_blocked_if_needed(fs_info);
3461
3462         root = btrfs_extent_root(fs_info, bg->start);
3463         csum_root = btrfs_csum_root(fs_info, bg->start);
3464
3465         /*
3466          * Collect all data csums for the stripe to avoid seeking during
3467          * the scrub. This might currently (crc32) end up being about 1MiB.
3468          */
3469         blk_start_plug(&plug);
3470
3471         if (sctx->is_dev_replace &&
3472             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3473                 mutex_lock(&sctx->wr_lock);
3474                 sctx->write_pointer = physical;
3475                 mutex_unlock(&sctx->wr_lock);
3476                 sctx->flush_all_writes = true;
3477         }
3478
3479         /*
3480          * There used to be a big double loop to handle all profiles using the
3481          * same routine, which grew larger and more unwieldy over time.
3482          *
3483          * So here we handle each profile differently, so that simpler profiles
3484          * have simpler scrubbing functions.
3485          */
3486         if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3487                          BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3488                 /*
3489                  * The above check rules out all complex profiles; the remaining
3490                  * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
3491                  * mirrored duplication without striping.
3492                  *
3493                  * Only @physical and @mirror_num need to be calculated using
3494                  * @stripe_index.
3495                  */
3496                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3497                                 bg->start, bg->length, scrub_dev,
3498                                 map->stripes[stripe_index].physical,
3499                                 stripe_index + 1);
3500                 offset = 0;
3501                 goto out;
3502         }
3503         if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3504                 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3505                                           scrub_dev, stripe_index);
3506                 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3507                 goto out;
3508         }
3509
3510         /* Only RAID56 goes through the old code */
3511         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3512         ret = 0;
3513
3514         /* Calculate the logical end of the stripe */
3515         get_raid56_logic_offset(physical_end, stripe_index,
3516                                 map, &logic_end, NULL);
3517         logic_end += chunk_logical;
3518
3519         /* Initialize @offset in case we need to go to out: label */
3520         get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3521         increment = map->stripe_len * nr_data_stripes(map);
3522
3523         /*
3524          * Due to the rotation, for RAID56 it's better to iterate over the
3525          * device stripes using their physical offsets.
3526          */
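        /*
         * E.g. (a sketch assuming RAID5 over 3 devices with 64KiB stripes):
         * increment = 64K * 2 = 128K of logical space per device stripe.
         * Each device stripe either holds a data stripe, which is scrubbed
         * through scrub_simple_mirror() below, or the parity of a full
         * stripe, which is handed to scrub_raid56_parity() for the whole
         * [stripe_logical, stripe_logical + increment) range.
         */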
3527         while (physical < physical_end) {
3528                 ret = get_raid56_logic_offset(physical, stripe_index, map,
3529                                               &logical, &stripe_logical);
3530                 logical += chunk_logical;
3531                 if (ret) {
3532                         /* It is a parity stripe */
3533                         stripe_logical += chunk_logical;
3534                         stripe_end = stripe_logical + increment;
3535                         ret = scrub_raid56_parity(sctx, map, scrub_dev,
3536                                                   stripe_logical,
3537                                                   stripe_end);
3538                         if (ret)
3539                                 goto out;
3540                         goto next;
3541                 }
3542
3543                 /*
3544                  * Now we're at a data stripe, scrub each extent in the range.
3545                  *
3546                  * At this stage, if we ignore the repair part, inside each data
3547                  * stripe it is no different from the SINGLE profile.
3548                  * We can reuse scrub_simple_mirror() here, as the repair part
3549                  * is still based on @mirror_num.
3550                  */
3551                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3552                                           logical, map->stripe_len,
3553                                           scrub_dev, physical, 1);
3554                 if (ret < 0)
3555                         goto out;
3556 next:
3557                 logical += increment;
3558                 physical += map->stripe_len;
3559                 spin_lock(&sctx->stat_lock);
3560                 if (stop_loop)
3561                         sctx->stat.last_physical = map->stripes[stripe_index].physical +
3562                                                    dev_extent_len;
3563                 else
3564                         sctx->stat.last_physical = physical;
3565                 spin_unlock(&sctx->stat_lock);
3566                 if (stop_loop)
3567                         break;
3568         }
3569 out:
3570         /* push queued extents */
3571         scrub_submit(sctx);
3572         mutex_lock(&sctx->wr_lock);
3573         scrub_wr_submit(sctx);
3574         mutex_unlock(&sctx->wr_lock);
3575
3576         blk_finish_plug(&plug);
3577         btrfs_free_path(path);
3578
3579         if (sctx->is_dev_replace && ret >= 0) {
3580                 int ret2;
3581
3582                 ret2 = sync_write_pointer_for_zoned(sctx,
3583                                 chunk_logical + offset,
3584                                 map->stripes[stripe_index].physical,
3585                                 physical_end);
3586                 if (ret2)
3587                         ret = ret2;
3588         }
3589
3590         return ret < 0 ? ret : 0;
3591 }
3592
3593 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3594                                           struct btrfs_block_group *bg,
3595                                           struct btrfs_device *scrub_dev,
3596                                           u64 dev_offset,
3597                                           u64 dev_extent_len)
3598 {
3599         struct btrfs_fs_info *fs_info = sctx->fs_info;
3600         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3601         struct map_lookup *map;
3602         struct extent_map *em;
3603         int i;
3604         int ret = 0;
3605
3606         read_lock(&map_tree->lock);
3607         em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3608         read_unlock(&map_tree->lock);
3609
3610         if (!em) {
3611                 /*
3612                  * Might have been an unused block group deleted by the cleaner
3613                  * kthread or relocation.
3614                  */
3615                 spin_lock(&bg->lock);
3616                 if (!bg->removed)
3617                         ret = -EINVAL;
3618                 spin_unlock(&bg->lock);
3619
3620                 return ret;
3621         }
3622         if (em->start != bg->start)
3623                 goto out;
3624         if (em->len < dev_extent_len)
3625                 goto out;
3626
3627         map = em->map_lookup;
3628         for (i = 0; i < map->num_stripes; ++i) {
3629                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3630                     map->stripes[i].physical == dev_offset) {
3631                         ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
3632                                            dev_extent_len);
3633                         if (ret)
3634                                 goto out;
3635                 }
3636         }
3637 out:
3638         free_extent_map(em);
3639
3640         return ret;
3641 }
3642
3643 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3644                                           struct btrfs_block_group *cache)
3645 {
3646         struct btrfs_fs_info *fs_info = cache->fs_info;
3647         struct btrfs_trans_handle *trans;
3648
3649         if (!btrfs_is_zoned(fs_info))
3650                 return 0;
3651
3652         btrfs_wait_block_group_reservations(cache);
3653         btrfs_wait_nocow_writers(cache);
3654         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3655
3656         trans = btrfs_join_transaction(root);
3657         if (IS_ERR(trans))
3658                 return PTR_ERR(trans);
3659         return btrfs_commit_transaction(trans);
3660 }
3661
3662 static noinline_for_stack
3663 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3664                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3665 {
3666         struct btrfs_dev_extent *dev_extent = NULL;
3667         struct btrfs_path *path;
3668         struct btrfs_fs_info *fs_info = sctx->fs_info;
3669         struct btrfs_root *root = fs_info->dev_root;
3670         u64 chunk_offset;
3671         int ret = 0;
3672         int ro_set;
3673         int slot;
3674         struct extent_buffer *l;
3675         struct btrfs_key key;
3676         struct btrfs_key found_key;
3677         struct btrfs_block_group *cache;
3678         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3679
3680         path = btrfs_alloc_path();
3681         if (!path)
3682                 return -ENOMEM;
3683
3684         path->reada = READA_FORWARD;
3685         path->search_commit_root = 1;
3686         path->skip_locking = 1;
3687
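        /*
         * Dev extent items are keyed as (devid, BTRFS_DEV_EXTENT_KEY,
         * physical offset), so the search below walks all dev extents of
         * @scrub_dev in physical order, restarting from just past the last
         * processed extent on each loop iteration.
         */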
3688         key.objectid = scrub_dev->devid;
3689         key.offset = 0ull;
3690         key.type = BTRFS_DEV_EXTENT_KEY;
3691
3692         while (1) {
3693                 u64 dev_extent_len;
3694
3695                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3696                 if (ret < 0)
3697                         break;
3698                 if (ret > 0) {
3699                         if (path->slots[0] >=
3700                             btrfs_header_nritems(path->nodes[0])) {
3701                                 ret = btrfs_next_leaf(root, path);
3702                                 if (ret < 0)
3703                                         break;
3704                                 if (ret > 0) {
3705                                         ret = 0;
3706                                         break;
3707                                 }
3708                         } else {
3709                                 ret = 0;
3710                         }
3711                 }
3712
3713                 l = path->nodes[0];
3714                 slot = path->slots[0];
3715
3716                 btrfs_item_key_to_cpu(l, &found_key, slot);
3717
3718                 if (found_key.objectid != scrub_dev->devid)
3719                         break;
3720
3721                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3722                         break;
3723
3724                 if (found_key.offset >= end)
3725                         break;
3726
3727                 if (found_key.offset < key.offset)
3728                         break;
3729
3730                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3731                 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3732
3733                 if (found_key.offset + dev_extent_len <= start)
3734                         goto skip;
3735
3736                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3737
3738                 /*
3739                  * get a reference on the corresponding block group to prevent
3740                  * the chunk from going away while we scrub it
3741                  */
3742                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3743
3744                 /* Some chunks are removed but not committed to disk yet,
3745                  * continue scrubbing. */
3746                 if (!cache)
3747                         goto skip;
3748
3749                 ASSERT(cache->start <= chunk_offset);
3750                 /*
3751                  * We are using the commit root to search for device extents, so
3752                  * that means we could have found a device extent item from a
3753                  * block group that was deleted in the current transaction. The
3754                  * logical start offset of the deleted block group, stored at
3755                  * @chunk_offset, might be part of the logical address range of
3756                  * a new block group (which uses different physical extents).
3757                  * In this case btrfs_lookup_block_group() has returned the new
3758                  * block group, and its start address is less than @chunk_offset.
3759                  *
3760                  * We skip such new block groups, because it's pointless to
3761                  * process them, as we won't find their extents because we search
3762                  * for them using the commit root of the extent tree. For a device
3763                  * replace it's also fine to skip it, we won't miss copying them
3764                  * to the target device because we have the write duplication
3765                  * setup through the regular write path (by btrfs_map_block()),
3766                  * and we have committed a transaction when we started the device
3767                  * replace, right after setting up the device replace state.
3768                  */
3769                 if (cache->start < chunk_offset) {
3770                         btrfs_put_block_group(cache);
3771                         goto skip;
3772                 }
3773
3774                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3775                         spin_lock(&cache->lock);
3776                         if (!cache->to_copy) {
3777                                 spin_unlock(&cache->lock);
3778                                 btrfs_put_block_group(cache);
3779                                 goto skip;
3780                         }
3781                         spin_unlock(&cache->lock);
3782                 }
3783
3784                 /*
3785                  * Make sure that while we are scrubbing the corresponding block
3786                  * group doesn't get its logical address and its device extents
3787                  * reused for another block group, which can possibly be of a
3788                  * different type and different profile. We do this to prevent
3789                  * false error detections and crashes due to bogus attempts to
3790                  * repair extents.
3791                  */
3792                 spin_lock(&cache->lock);
3793                 if (cache->removed) {
3794                         spin_unlock(&cache->lock);
3795                         btrfs_put_block_group(cache);
3796                         goto skip;
3797                 }
3798                 btrfs_freeze_block_group(cache);
3799                 spin_unlock(&cache->lock);
3800
3801                 /*
3802                  * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
3803                  * to avoid a deadlock caused by:
3804                  * btrfs_inc_block_group_ro()
3805                  * -> btrfs_wait_for_commit()
3806                  * -> btrfs_commit_transaction()
3807                  * -> btrfs_scrub_pause()
3808                  */
3809                 scrub_pause_on(fs_info);
3810
3811                 /*
3812                  * Don't do chunk preallocation for scrub.
3813                  *
3814                  * This is especially important for SYSTEM bgs, or we can hit
3815                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3816                  * 1. The only SYSTEM bg is marked RO.
3817                  *    Since SYSTEM bg is small, that's pretty common.
3818                  * 2. New SYSTEM bg will be allocated
3819                  *    Because the regular code path will allocate a new chunk.
3820                  * 3. New SYSTEM bg is empty and will get cleaned up
3821                  *    Before cleanup really happens, it's marked RO again.
3822                  * 4. Empty SYSTEM bg gets scrubbed
3823                  *    We go back to 2.
3824                  *
3825                  * This can easily boost the number of SYSTEM chunks if the cleaner
3826                  * thread can't be triggered fast enough, and use up all the space
3827                  * of btrfs_super_block::sys_chunk_array
3828                  *
3829                  * While for dev replace, we need to try our best to mark block
3830                  * group RO, to prevent race between:
3831                  * - Write duplication
3832                  *   Contains latest data
3833                  * - Scrub copy
3834                  *   Contains data from commit tree
3835                  *
3836                  * If target block group is not marked RO, nocow writes can
3837                  * be overwritten by scrub copy, causing data corruption.
3838                  * So for dev-replace, it's not allowed to continue if a block
3839                  * group is not RO.
3840                  */
3841                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3842                 if (!ret && sctx->is_dev_replace) {
3843                         ret = finish_extent_writes_for_zoned(root, cache);
3844                         if (ret) {
3845                                 btrfs_dec_block_group_ro(cache);
3846                                 scrub_pause_off(fs_info);
3847                                 btrfs_put_block_group(cache);
3848                                 break;
3849                         }
3850                 }
3851
3852                 if (ret == 0) {
3853                         ro_set = 1;
3854                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3855                         /*
3856                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3857                          * fails to create a new chunk for metadata.
3858                          * It is not a problem for scrub, because
3859                          * metadata is always COWed, and our scrub pauses
3860                          * transaction commits.
3861                          */
3862                         ro_set = 0;
3863                 } else if (ret == -ETXTBSY) {
3864                         btrfs_warn(fs_info,
3865                    "skipping scrub of block group %llu due to active swapfile",
3866                                    cache->start);
3867                         scrub_pause_off(fs_info);
3868                         ret = 0;
3869                         goto skip_unfreeze;
3870                 } else {
3871                         btrfs_warn(fs_info,
3872                                    "failed setting block group ro: %d", ret);
3873                         btrfs_unfreeze_block_group(cache);
3874                         btrfs_put_block_group(cache);
3875                         scrub_pause_off(fs_info);
3876                         break;
3877                 }
3878
3879                 /*
3880                  * Now the target block group is marked RO, wait for nocow writes
3881                  * to finish before dev-replace.
3882                  * COW is fine, as COW never overwrites extents in the commit tree.
3883                  */
3884                 if (sctx->is_dev_replace) {
3885                         btrfs_wait_nocow_writers(cache);
3886                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3887                                         cache->length);
3888                 }
3889
3890                 scrub_pause_off(fs_info);
3891                 down_write(&dev_replace->rwsem);
3892                 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3893                 dev_replace->cursor_left = found_key.offset;
3894                 dev_replace->item_needs_writeback = 1;
3895                 up_write(&dev_replace->rwsem);
3896
3897                 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3898                                   dev_extent_len);
3899
3900                 /*
3901                  * Flush and submit all pending read and write bios, then
3902                  * wait for them.
3903                  * Note that in the dev replace case, a read request causes
3904                  * write requests that are submitted in the read completion
3905                  * worker. Therefore in the current situation, it is required
3906                  * that all write requests are flushed, so that all read and
3907                  * write requests are really completed when bios_in_flight
3908                  * changes to 0.
3909                  */
3910                 sctx->flush_all_writes = true;
3911                 scrub_submit(sctx);
3912                 mutex_lock(&sctx->wr_lock);
3913                 scrub_wr_submit(sctx);
3914                 mutex_unlock(&sctx->wr_lock);
3915
3916                 wait_event(sctx->list_wait,
3917                            atomic_read(&sctx->bios_in_flight) == 0);
3918
3919                 scrub_pause_on(fs_info);
3920
3921                 /*
3922                  * This must be called before we decrease @scrub_paused.
3923                  * Make sure we don't block transaction commit while
3924                  * we are waiting for pending workers to finish.
3925                  */
3926                 wait_event(sctx->list_wait,
3927                            atomic_read(&sctx->workers_pending) == 0);
3928                 sctx->flush_all_writes = false;
3929
3930                 scrub_pause_off(fs_info);
3931
3932                 if (sctx->is_dev_replace &&
3933                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3934                                                       cache, found_key.offset))
3935                         ro_set = 0;
3936
3937                 down_write(&dev_replace->rwsem);
3938                 dev_replace->cursor_left = dev_replace->cursor_right;
3939                 dev_replace->item_needs_writeback = 1;
3940                 up_write(&dev_replace->rwsem);
3941
3942                 if (ro_set)
3943                         btrfs_dec_block_group_ro(cache);
3944
3945                 /*
3946                  * We might have prevented the cleaner kthread from deleting
3947                  * this block group if it was already unused because we raced
3948                  * and set it to RO mode first. So add it back to the unused
3949                  * list, otherwise it might not ever be deleted unless a manual
3950                  * balance is triggered or it becomes used and unused again.
3951                  */
3952                 spin_lock(&cache->lock);
3953                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3954                     cache->used == 0) {
3955                         spin_unlock(&cache->lock);
3956                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3957                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3958                                                          cache);
3959                         else
3960                                 btrfs_mark_bg_unused(cache);
3961                 } else {
3962                         spin_unlock(&cache->lock);
3963                 }
3964 skip_unfreeze:
3965                 btrfs_unfreeze_block_group(cache);
3966                 btrfs_put_block_group(cache);
3967                 if (ret)
3968                         break;
3969                 if (sctx->is_dev_replace &&
3970                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3971                         ret = -EIO;
3972                         break;
3973                 }
3974                 if (sctx->stat.malloc_errors > 0) {
3975                         ret = -ENOMEM;
3976                         break;
3977                 }
3978 skip:
3979                 key.offset = found_key.offset + dev_extent_len;
3980                 btrfs_release_path(path);
3981         }
3982
3983         btrfs_free_path(path);
3984
3985         return ret;
3986 }
3987
3988 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3989                                            struct btrfs_device *scrub_dev)
3990 {
3991         int     i;
3992         u64     bytenr;
3993         u64     gen;
3994         int     ret;
3995         struct btrfs_fs_info *fs_info = sctx->fs_info;
3996
3997         if (BTRFS_FS_ERROR(fs_info))
3998                 return -EROFS;
3999
4000         /* Seed devices of a new filesystem have their own generation. */
4001         if (scrub_dev->fs_devices != fs_info->fs_devices)
4002                 gen = scrub_dev->generation;
4003         else
4004                 gen = fs_info->last_trans_committed;
4005
4006         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4007                 bytenr = btrfs_sb_offset(i);
4008                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4009                     scrub_dev->commit_total_bytes)
4010                         break;
4011                 if (!btrfs_check_super_location(scrub_dev, bytenr))
4012                         continue;
4013
4014                 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4015                                     scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4016                                     NULL, bytenr);
4017                 if (ret)
4018                         return ret;
4019         }
4020         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4021
4022         return 0;
4023 }
4024
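     /*
      * Drop a reference on the scrub workqueues. The last reference clears
      * the workqueue pointers under fs_info->scrub_lock and then destroys
      * the workqueues.
      */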
4025 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4026 {
4027         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4028                                         &fs_info->scrub_lock)) {
4029                 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4030                 struct workqueue_struct *scrub_wr_comp =
4031                                                 fs_info->scrub_wr_completion_workers;
4032                 struct workqueue_struct *scrub_parity =
4033                                                 fs_info->scrub_parity_workers;
4034
4035                 fs_info->scrub_workers = NULL;
4036                 fs_info->scrub_wr_completion_workers = NULL;
4037                 fs_info->scrub_parity_workers = NULL;
4038                 mutex_unlock(&fs_info->scrub_lock);
4039
4040                 if (scrub_workers)
4041                         destroy_workqueue(scrub_workers);
4042                 if (scrub_wr_comp)
4043                         destroy_workqueue(scrub_wr_comp);
4044                 if (scrub_parity)
4045                         destroy_workqueue(scrub_parity);
4046         }
4047 }
4048
4049 /*
4050  * Get a reference on fs_info->scrub_workers and start the workers if necessary.
4051  */
4052 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4053                                                 int is_dev_replace)
4054 {
4055         struct workqueue_struct *scrub_workers = NULL;
4056         struct workqueue_struct *scrub_wr_comp = NULL;
4057         struct workqueue_struct *scrub_parity = NULL;
4058         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4059         int max_active = fs_info->thread_pool_size;
4060         int ret = -ENOMEM;
4061
4062         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4063                 return 0;
4064
4065         scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4066                                         is_dev_replace ? 1 : max_active);
4067         if (!scrub_workers)
4068                 goto fail_scrub_workers;
4069
4070         scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4071         if (!scrub_wr_comp)
4072                 goto fail_scrub_wr_completion_workers;
4073
4074         scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4075         if (!scrub_parity)
4076                 goto fail_scrub_parity_workers;
4077
4078         mutex_lock(&fs_info->scrub_lock);
4079         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4080                 ASSERT(fs_info->scrub_workers == NULL &&
4081                        fs_info->scrub_wr_completion_workers == NULL &&
4082                        fs_info->scrub_parity_workers == NULL);
4083                 fs_info->scrub_workers = scrub_workers;
4084                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4085                 fs_info->scrub_parity_workers = scrub_parity;
4086                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4087                 mutex_unlock(&fs_info->scrub_lock);
4088                 return 0;
4089         }
4090         /* Another thread raced in and created the workers for us. */
4091         refcount_inc(&fs_info->scrub_workers_refcnt);
4092         mutex_unlock(&fs_info->scrub_lock);
4093
4094         ret = 0;
4095         destroy_workqueue(scrub_parity);
4096 fail_scrub_parity_workers:
4097         destroy_workqueue(scrub_wr_comp);
4098 fail_scrub_wr_completion_workers:
4099         destroy_workqueue(scrub_workers);
4100 fail_scrub_workers:
4101         return ret;
4102 }
4103
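     /*
      * Scrub (or, for device replace, copy) the device extents of @devid that
      * fall within [start, end]. Progress is copied to @progress and no
      * repairs are written when @readonly is set.
      */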
4104 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4105                     u64 end, struct btrfs_scrub_progress *progress,
4106                     int readonly, int is_dev_replace)
4107 {
4108         struct btrfs_dev_lookup_args args = { .devid = devid };
4109         struct scrub_ctx *sctx;
4110         int ret;
4111         struct btrfs_device *dev;
4112         unsigned int nofs_flag;
4113
4114         if (btrfs_fs_closing(fs_info))
4115                 return -EAGAIN;
4116
4117         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4118                 /*
4119                  * In this case scrub is unable to calculate the checksum
4120                  * the way it is currently implemented. Do not handle this
4121                  * situation at all because it won't ever happen.
4122                  */
4123                 btrfs_err(fs_info,
4124                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4125                        fs_info->nodesize,
4126                        BTRFS_STRIPE_LEN);
4127                 return -EINVAL;
4128         }
4129
4130         if (fs_info->nodesize >
4131             SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4132             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4133                 /*
4134                  * Would exhaust the array bounds of the sectors member
4135                  * in struct scrub_block.
4136                  */
4137                 btrfs_err(fs_info,
4138 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4139                        fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4140                        fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4141                 return -EINVAL;
4142         }
4143
4144         /* Allocate outside of device_list_mutex */
4145         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4146         if (IS_ERR(sctx))
4147                 return PTR_ERR(sctx);
4148
4149         ret = scrub_workers_get(fs_info, is_dev_replace);
4150         if (ret)
4151                 goto out_free_ctx;
4152
4153         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4154         dev = btrfs_find_device(fs_info->fs_devices, &args);
4155         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4156                      !is_dev_replace)) {
4157                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4158                 ret = -ENODEV;
4159                 goto out;
4160         }
4161
4162         if (!is_dev_replace && !readonly &&
4163             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4164                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4165                 btrfs_err_in_rcu(fs_info,
4166                         "scrub on devid %llu: filesystem on %s is not writable",
4167                                  devid, rcu_str_deref(dev->name));
4168                 ret = -EROFS;
4169                 goto out;
4170         }
4171
4172         mutex_lock(&fs_info->scrub_lock);
4173         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4174             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4175                 mutex_unlock(&fs_info->scrub_lock);
4176                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4177                 ret = -EIO;
4178                 goto out;
4179         }
4180
4181         down_read(&fs_info->dev_replace.rwsem);
4182         if (dev->scrub_ctx ||
4183             (!is_dev_replace &&
4184              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4185                 up_read(&fs_info->dev_replace.rwsem);
4186                 mutex_unlock(&fs_info->scrub_lock);
4187                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188                 ret = -EINPROGRESS;
4189                 goto out;
4190         }
4191         up_read(&fs_info->dev_replace.rwsem);
4192
4193         sctx->readonly = readonly;
4194         dev->scrub_ctx = sctx;
4195         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4196
4197         /*
4198          * By checking @scrub_pause_req here, we can avoid the
4199          * race between transaction commit and scrubbing.
4200          */
4201         __scrub_blocked_if_needed(fs_info);
4202         atomic_inc(&fs_info->scrubs_running);
4203         mutex_unlock(&fs_info->scrub_lock);
4204
4205         /*
4206          * In order to avoid deadlock with reclaim when there is a transaction
4207          * trying to pause scrub, make sure we use GFP_NOFS for all the
4208          * allocations done by scrub_sectors() and scrub_sectors_for_parity()
4209          * invoked by our callees. The pausing request is done when the
4210          * transaction commit starts, and it blocks the transaction until scrub
4211          * is paused (done at specific points in scrub_stripe() or right above,
4212          * before incrementing fs_info->scrubs_running).
4213          */
4214         nofs_flag = memalloc_nofs_save();
4215         if (!is_dev_replace) {
4216                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4217                 /*
4218                  * By holding the device list mutex, we avoid racing with
4219                  * super block writes kicked off by log tree sync.
4220                  */
4221                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4222                 ret = scrub_supers(sctx, dev);
4223                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4224         }
4225
4226         if (!ret)
4227                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4228         memalloc_nofs_restore(nofs_flag);
4229
4230         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4231         atomic_dec(&fs_info->scrubs_running);
4232         wake_up(&fs_info->scrub_pause_wait);
4233
4234         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4235
4236         if (progress)
4237                 memcpy(progress, &sctx->stat, sizeof(*progress));
4238
4239         if (!is_dev_replace)
4240                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4241                         ret ? "not finished" : "finished", devid, ret);
4242
4243         mutex_lock(&fs_info->scrub_lock);
4244         dev->scrub_ctx = NULL;
4245         mutex_unlock(&fs_info->scrub_lock);
4246
4247         scrub_workers_put(fs_info);
4248         scrub_put_ctx(sctx);
4249
4250         return ret;
4251 out:
4252         scrub_workers_put(fs_info);
4253 out_free_ctx:
4254         scrub_free_ctx(sctx);
4255
4256         return ret;
4257 }
4258
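     /*
      * Ask all running scrubs to pause and wait until every one of them has
      * reached a pause point.
      */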
4259 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4260 {
4261         mutex_lock(&fs_info->scrub_lock);
4262         atomic_inc(&fs_info->scrub_pause_req);
4263         while (atomic_read(&fs_info->scrubs_paused) !=
4264                atomic_read(&fs_info->scrubs_running)) {
4265                 mutex_unlock(&fs_info->scrub_lock);
4266                 wait_event(fs_info->scrub_pause_wait,
4267                            atomic_read(&fs_info->scrubs_paused) ==
4268                            atomic_read(&fs_info->scrubs_running));
4269                 mutex_lock(&fs_info->scrub_lock);
4270         }
4271         mutex_unlock(&fs_info->scrub_lock);
4272 }
4273
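     /* Let previously paused scrubs continue. */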
4274 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4275 {
4276         atomic_dec(&fs_info->scrub_pause_req);
4277         wake_up(&fs_info->scrub_pause_wait);
4278 }
4279
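     /*
      * Cancel all running scrubs and wait until they have stopped. Returns
      * -ENOTCONN if no scrub was running.
      */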
4280 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4281 {
4282         mutex_lock(&fs_info->scrub_lock);
4283         if (!atomic_read(&fs_info->scrubs_running)) {
4284                 mutex_unlock(&fs_info->scrub_lock);
4285                 return -ENOTCONN;
4286         }
4287
4288         atomic_inc(&fs_info->scrub_cancel_req);
4289         while (atomic_read(&fs_info->scrubs_running)) {
4290                 mutex_unlock(&fs_info->scrub_lock);
4291                 wait_event(fs_info->scrub_pause_wait,
4292                            atomic_read(&fs_info->scrubs_running) == 0);
4293                 mutex_lock(&fs_info->scrub_lock);
4294         }
4295         atomic_dec(&fs_info->scrub_cancel_req);
4296         mutex_unlock(&fs_info->scrub_lock);
4297
4298         return 0;
4299 }
4300
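     /*
      * Cancel the scrub running on @dev, if any, and wait until it has
      * stopped. Returns -ENOTCONN if the device was not being scrubbed.
      */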
4301 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4302 {
4303         struct btrfs_fs_info *fs_info = dev->fs_info;
4304         struct scrub_ctx *sctx;
4305
4306         mutex_lock(&fs_info->scrub_lock);
4307         sctx = dev->scrub_ctx;
4308         if (!sctx) {
4309                 mutex_unlock(&fs_info->scrub_lock);
4310                 return -ENOTCONN;
4311         }
4312         atomic_inc(&sctx->cancel_req);
4313         while (dev->scrub_ctx) {
4314                 mutex_unlock(&fs_info->scrub_lock);
4315                 wait_event(fs_info->scrub_pause_wait,
4316                            dev->scrub_ctx == NULL);
4317                 mutex_lock(&fs_info->scrub_lock);
4318         }
4319         mutex_unlock(&fs_info->scrub_lock);
4320
4321         return 0;
4322 }
4323
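     /*
      * Copy the progress of the scrub running on @devid into @progress.
      * Returns -ENODEV if the device is unknown and -ENOTCONN if it is not
      * being scrubbed.
      */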
4324 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4325                          struct btrfs_scrub_progress *progress)
4326 {
4327         struct btrfs_dev_lookup_args args = { .devid = devid };
4328         struct btrfs_device *dev;
4329         struct scrub_ctx *sctx = NULL;
4330
4331         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4332         dev = btrfs_find_device(fs_info->fs_devices, &args);
4333         if (dev)
4334                 sctx = dev->scrub_ctx;
4335         if (sctx)
4336                 memcpy(progress, &sctx->stat, sizeof(*progress));
4337         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4338
4339         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4340 }
4341
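     /*
      * Map @extent_logical to its first stripe and return the physical
      * offset, device and mirror number through the out parameters. The
      * outputs are left untouched if the mapping fails.
      */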
4342 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4343                                  u64 extent_logical, u32 extent_len,
4344                                  u64 *extent_physical,
4345                                  struct btrfs_device **extent_dev,
4346                                  int *extent_mirror_num)
4347 {
4348         u64 mapped_length;
4349         struct btrfs_io_context *bioc = NULL;
4350         int ret;
4351
4352         mapped_length = extent_len;
4353         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4354                               &mapped_length, &bioc, 0);
4355         if (ret || !bioc || mapped_length < extent_len ||
4356             !bioc->stripes[0].dev->bdev) {
4357                 btrfs_put_bioc(bioc);
4358                 return;
4359         }
4360
4361         *extent_physical = bioc->stripes[0].physical;
4362         *extent_mirror_num = bioc->mirror_num;
4363         *extent_dev = bioc->stripes[0].dev;
4364         btrfs_put_bioc(bioc);
4365 }