1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
14 #include "ordered-data.h"
15 #include "transaction.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
22 #include "block-group.h"
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
31 * Future enhancements:
32 * - In case an unrepairable extent is encountered, track which files are
33 * affected and report them
34 * - track and record media errors, throw out bad devices
35 * - add a mode to also read unallocated space
42 * The following two values only influence the performance.
44 * The second one configures the number of parallel and outstanding I/O
45 * operations. The first one configures an upper limit for the number
46 * of (dynamically allocated) pages that are added to a bio.
48 #define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
49 #define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
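/*
 * As a rough illustration of the sizing implied by the two defines above
 * (assuming 4KiB pages, as the per-define comments do):
 *
 *   32 sectors/bio * 4KiB   = 128KiB of data per read bio
 *   64 bios/sctx   * 128KiB = 8MiB of read IO in flight per scrubbed device
 */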
52 * The following value times the smallest sectorsize (4KiB) needs to be large
53 * enough to match the largest node/leaf/sector size that shall be supported.
55 #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
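/*
 * For reference: with BTRFS_MAX_METADATA_BLOCKSIZE being 64KiB, this
 * evaluates to 64KiB / 4KiB = 16 sectors per scrub_block.
 */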
57 struct scrub_recover {
59 struct btrfs_io_context *bioc;
64 struct scrub_block *sblock;
66 struct btrfs_device *dev;
67 struct list_head list;
68 u64 flags; /* extent flags */
72 u64 physical_for_dev_replace;
75 unsigned int have_csum:1;
76 unsigned int io_error:1;
77 u8 csum[BTRFS_CSUM_SIZE];
79 struct scrub_recover *recover;
84 struct scrub_ctx *sctx;
85 struct btrfs_device *dev;
90 struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
93 struct work_struct work;
97 struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
99 atomic_t outstanding_sectors;
100 refcount_t refs; /* free mem on transition to zero */
101 struct scrub_ctx *sctx;
102 struct scrub_parity *sparity;
104 unsigned int header_error:1;
105 unsigned int checksum_error:1;
106 unsigned int no_io_error_seen:1;
107 unsigned int generation_error:1; /* also sets header_error */
109 /* The following is for the data used to check parity */
110 /* It is for the data with checksum */
111 unsigned int data_corrected:1;
113 struct work_struct work;
116 /* Used for the chunks with parity stripe such as RAID5/6 */
117 struct scrub_parity {
118 struct scrub_ctx *sctx;
120 struct btrfs_device *scrub_dev;
132 struct list_head sectors_list;
134 /* Work of parity check and repair */
135 struct work_struct work;
137 /* Mark the parity blocks which have data */
138 unsigned long dbitmap;
141 * Mark the parity blocks which have data, but errors happened when
142 * reading or checking that data
144 unsigned long ebitmap;
148 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
149 struct btrfs_fs_info *fs_info;
152 atomic_t bios_in_flight;
153 atomic_t workers_pending;
154 spinlock_t list_lock;
155 wait_queue_head_t list_wait;
156 struct list_head csum_list;
161 /* State of IO submission throttling affecting the associated device */
162 ktime_t throttle_deadline;
168 struct scrub_bio *wr_curr_bio;
169 struct mutex wr_lock;
170 struct btrfs_device *wr_tgtdev;
171 bool flush_all_writes;
176 struct btrfs_scrub_progress stat;
177 spinlock_t stat_lock;
180 * Use a ref counter to avoid use-after-free issues. Scrub workers
181 * decrement bios_in_flight and workers_pending and then do a wakeup
182 * on the list_wait wait queue. We must ensure the main scrub task
183 * doesn't free the scrub context before or while the workers are
184 * doing the wakeup() call.
189 struct scrub_warning {
190 struct btrfs_path *path;
191 u64 extent_item_size;
195 struct btrfs_device *dev;
198 struct full_stripe_lock {
205 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
206 struct scrub_block *sblocks_for_recheck);
207 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
208 struct scrub_block *sblock,
209 int retry_failed_mirror);
210 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
211 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212 struct scrub_block *sblock_good);
213 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
214 struct scrub_block *sblock_good,
215 int sector_num, int force_write);
216 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
217 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
219 static int scrub_checksum_data(struct scrub_block *sblock);
220 static int scrub_checksum_tree_block(struct scrub_block *sblock);
221 static int scrub_checksum_super(struct scrub_block *sblock);
222 static void scrub_block_put(struct scrub_block *sblock);
223 static void scrub_sector_get(struct scrub_sector *sector);
224 static void scrub_sector_put(struct scrub_sector *sector);
225 static void scrub_parity_get(struct scrub_parity *sparity);
226 static void scrub_parity_put(struct scrub_parity *sparity);
227 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
228 u64 physical, struct btrfs_device *dev, u64 flags,
229 u64 gen, int mirror_num, u8 *csum,
230 u64 physical_for_dev_replace);
231 static void scrub_bio_end_io(struct bio *bio);
232 static void scrub_bio_end_io_worker(struct work_struct *work);
233 static void scrub_block_complete(struct scrub_block *sblock);
234 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
235 u64 extent_logical, u32 extent_len,
236 u64 *extent_physical,
237 struct btrfs_device **extent_dev,
238 int *extent_mirror_num);
239 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_sector *sector);
241 static void scrub_wr_submit(struct scrub_ctx *sctx);
242 static void scrub_wr_bio_end_io(struct bio *bio);
243 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
244 static void scrub_put_ctx(struct scrub_ctx *sctx);
246 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
248 return sector->recover &&
249 (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
252 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254 refcount_inc(&sctx->refs);
255 atomic_inc(&sctx->bios_in_flight);
258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
265 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
267 while (atomic_read(&fs_info->scrub_pause_req)) {
268 mutex_unlock(&fs_info->scrub_lock);
269 wait_event(fs_info->scrub_pause_wait,
270 atomic_read(&fs_info->scrub_pause_req) == 0);
271 mutex_lock(&fs_info->scrub_lock);
275 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
277 atomic_inc(&fs_info->scrubs_paused);
278 wake_up(&fs_info->scrub_pause_wait);
281 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
283 mutex_lock(&fs_info->scrub_lock);
284 __scrub_blocked_if_needed(fs_info);
285 atomic_dec(&fs_info->scrubs_paused);
286 mutex_unlock(&fs_info->scrub_lock);
288 wake_up(&fs_info->scrub_pause_wait);
291 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
293 scrub_pause_on(fs_info);
294 scrub_pause_off(fs_info);
298 * Insert new full stripe lock into full stripe locks tree
300 * Return pointer to existing or newly inserted full_stripe_lock structure if
301 * everything works well.
302 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
304 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
307 static struct full_stripe_lock *insert_full_stripe_lock(
308 struct btrfs_full_stripe_locks_tree *locks_root,
312 struct rb_node *parent = NULL;
313 struct full_stripe_lock *entry;
314 struct full_stripe_lock *ret;
316 lockdep_assert_held(&locks_root->lock);
318 p = &locks_root->root.rb_node;
321 entry = rb_entry(parent, struct full_stripe_lock, node);
322 if (fstripe_logical < entry->logical) {
324 } else if (fstripe_logical > entry->logical) {
335 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
337 return ERR_PTR(-ENOMEM);
338 ret->logical = fstripe_logical;
340 mutex_init(&ret->mutex);
342 rb_link_node(&ret->node, parent, p);
343 rb_insert_color(&ret->node, &locks_root->root);
348 * Search for a full stripe lock of a block group
350 * Return pointer to existing full stripe lock if found
351 * Return NULL if not found
353 static struct full_stripe_lock *search_full_stripe_lock(
354 struct btrfs_full_stripe_locks_tree *locks_root,
357 struct rb_node *node;
358 struct full_stripe_lock *entry;
360 lockdep_assert_held(&locks_root->lock);
362 node = locks_root->root.rb_node;
364 entry = rb_entry(node, struct full_stripe_lock, node);
365 if (fstripe_logical < entry->logical)
366 node = node->rb_left;
367 else if (fstripe_logical > entry->logical)
368 node = node->rb_right;
376 * Helper to get full stripe logical from a normal bytenr.
378 * Caller must ensure @cache is a RAID56 block group.
380 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
385 * Due to chunk item size limit, full stripe length should not be
386 * larger than U32_MAX. Just a sanity check here.
388 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
391 * round_down() can only handle a power of 2, while a RAID56 full
392 * stripe length can be 64KiB * n, so we need to manually round down.
394 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
395 cache->full_stripe_len + cache->start;
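/*
 * A worked example with made-up numbers: for a RAID5 chunk with two data
 * stripes, full_stripe_len = 2 * 64KiB = 128KiB.  A bytenr that is 300KiB
 * past cache->start then belongs to the full stripe starting at
 * cache->start + 256KiB (300 / 128 = 2, 2 * 128KiB = 256KiB).
 */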
400 * Lock a full stripe to avoid concurrent recovery and read
402 * It's only used for profiles with parity (RAID5/6), for other profiles it
405 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
406 * So the caller must call unlock_full_stripe() in the same context.
408 * Return <0 if an error is encountered.
410 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
413 struct btrfs_block_group *bg_cache;
414 struct btrfs_full_stripe_locks_tree *locks_root;
415 struct full_stripe_lock *existing;
420 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
426 /* Profiles not based on parity don't need full stripe lock */
427 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
429 locks_root = &bg_cache->full_stripe_locks_root;
431 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
433 /* Now insert the full stripe lock */
434 mutex_lock(&locks_root->lock);
435 existing = insert_full_stripe_lock(locks_root, fstripe_start);
436 mutex_unlock(&locks_root->lock);
437 if (IS_ERR(existing)) {
438 ret = PTR_ERR(existing);
441 mutex_lock(&existing->mutex);
444 btrfs_put_block_group(bg_cache);
449 * Unlock a full stripe.
451 * NOTE: Caller must ensure it's the same context calling corresponding
452 * lock_full_stripe().
454 * Return 0 if we unlocked the full stripe without problems.
455 * Return <0 on error
457 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
460 struct btrfs_block_group *bg_cache;
461 struct btrfs_full_stripe_locks_tree *locks_root;
462 struct full_stripe_lock *fstripe_lock;
467 /* If we didn't acquire full stripe lock, no need to continue */
471 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
476 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
479 locks_root = &bg_cache->full_stripe_locks_root;
480 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
482 mutex_lock(&locks_root->lock);
483 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
484 /* Unpaired unlock_full_stripe() detected */
488 mutex_unlock(&locks_root->lock);
492 if (fstripe_lock->refs == 0) {
494 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
495 fstripe_lock->logical);
497 fstripe_lock->refs--;
500 if (fstripe_lock->refs == 0) {
501 rb_erase(&fstripe_lock->node, &locks_root->root);
504 mutex_unlock(&locks_root->lock);
506 mutex_unlock(&fstripe_lock->mutex);
510 btrfs_put_block_group(bg_cache);
514 static void scrub_free_csums(struct scrub_ctx *sctx)
516 while (!list_empty(&sctx->csum_list)) {
517 struct btrfs_ordered_sum *sum;
518 sum = list_first_entry(&sctx->csum_list,
519 struct btrfs_ordered_sum, list);
520 list_del(&sum->list);
525 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
532 /* this can happen when scrub is cancelled */
533 if (sctx->curr != -1) {
534 struct scrub_bio *sbio = sctx->bios[sctx->curr];
536 for (i = 0; i < sbio->sector_count; i++) {
537 WARN_ON(!sbio->sectors[i]->page);
538 scrub_block_put(sbio->sectors[i]->sblock);
543 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
544 struct scrub_bio *sbio = sctx->bios[i];
551 kfree(sctx->wr_curr_bio);
552 scrub_free_csums(sctx);
556 static void scrub_put_ctx(struct scrub_ctx *sctx)
558 if (refcount_dec_and_test(&sctx->refs))
559 scrub_free_ctx(sctx);
562 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
563 struct btrfs_fs_info *fs_info, int is_dev_replace)
565 struct scrub_ctx *sctx;
568 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
571 refcount_set(&sctx->refs, 1);
572 sctx->is_dev_replace = is_dev_replace;
573 sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
575 sctx->fs_info = fs_info;
576 INIT_LIST_HEAD(&sctx->csum_list);
577 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
578 struct scrub_bio *sbio;
580 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
583 sctx->bios[i] = sbio;
587 sbio->sector_count = 0;
588 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
590 if (i != SCRUB_BIOS_PER_SCTX - 1)
591 sctx->bios[i]->next_free = i + 1;
593 sctx->bios[i]->next_free = -1;
595 sctx->first_free = 0;
596 atomic_set(&sctx->bios_in_flight, 0);
597 atomic_set(&sctx->workers_pending, 0);
598 atomic_set(&sctx->cancel_req, 0);
600 spin_lock_init(&sctx->list_lock);
601 spin_lock_init(&sctx->stat_lock);
602 init_waitqueue_head(&sctx->list_wait);
603 sctx->throttle_deadline = 0;
605 WARN_ON(sctx->wr_curr_bio != NULL);
606 mutex_init(&sctx->wr_lock);
607 sctx->wr_curr_bio = NULL;
608 if (is_dev_replace) {
609 WARN_ON(!fs_info->dev_replace.tgtdev);
610 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
611 sctx->flush_all_writes = false;
617 scrub_free_ctx(sctx);
618 return ERR_PTR(-ENOMEM);
621 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
628 struct extent_buffer *eb;
629 struct btrfs_inode_item *inode_item;
630 struct scrub_warning *swarn = warn_ctx;
631 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
632 struct inode_fs_paths *ipath = NULL;
633 struct btrfs_root *local_root;
634 struct btrfs_key key;
636 local_root = btrfs_get_fs_root(fs_info, root, true);
637 if (IS_ERR(local_root)) {
638 ret = PTR_ERR(local_root);
643 * this makes the path point to (inum INODE_ITEM ioff)
646 key.type = BTRFS_INODE_ITEM_KEY;
649 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
651 btrfs_put_root(local_root);
652 btrfs_release_path(swarn->path);
656 eb = swarn->path->nodes[0];
657 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
658 struct btrfs_inode_item);
659 nlink = btrfs_inode_nlink(eb, inode_item);
660 btrfs_release_path(swarn->path);
663 * init_ipath might indirectly call vmalloc, or use GFP_KERNEL. Scrub
664 * uses GFP_NOFS in this context, so we keep it consistent but it does
665 * not seem to be strictly necessary.
667 nofs_flag = memalloc_nofs_save();
668 ipath = init_ipath(4096, local_root, swarn->path);
669 memalloc_nofs_restore(nofs_flag);
671 btrfs_put_root(local_root);
672 ret = PTR_ERR(ipath);
676 ret = paths_from_inode(inum, ipath);
682 * We deliberately ignore the fact that ipath might have been too small to
683 * hold all of the paths here
685 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
686 btrfs_warn_in_rcu(fs_info,
687 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
688 swarn->errstr, swarn->logical,
689 rcu_str_deref(swarn->dev->name),
692 fs_info->sectorsize, nlink,
693 (char *)(unsigned long)ipath->fspath->val[i]);
695 btrfs_put_root(local_root);
700 btrfs_warn_in_rcu(fs_info,
701 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
702 swarn->errstr, swarn->logical,
703 rcu_str_deref(swarn->dev->name),
705 root, inum, offset, ret);
711 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
713 struct btrfs_device *dev;
714 struct btrfs_fs_info *fs_info;
715 struct btrfs_path *path;
716 struct btrfs_key found_key;
717 struct extent_buffer *eb;
718 struct btrfs_extent_item *ei;
719 struct scrub_warning swarn;
720 unsigned long ptr = 0;
728 WARN_ON(sblock->sector_count < 1);
729 dev = sblock->sectors[0]->dev;
730 fs_info = sblock->sctx->fs_info;
732 path = btrfs_alloc_path();
736 swarn.physical = sblock->sectors[0]->physical;
737 swarn.logical = sblock->sectors[0]->logical;
738 swarn.errstr = errstr;
741 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
746 extent_item_pos = swarn.logical - found_key.objectid;
747 swarn.extent_item_size = found_key.offset;
750 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
751 item_size = btrfs_item_size(eb, path->slots[0]);
753 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
755 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
756 item_size, &ref_root,
758 btrfs_warn_in_rcu(fs_info,
759 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
760 errstr, swarn.logical,
761 rcu_str_deref(dev->name),
763 ref_level ? "node" : "leaf",
764 ret < 0 ? -1 : ref_level,
765 ret < 0 ? -1 : ref_root);
767 btrfs_release_path(path);
769 btrfs_release_path(path);
772 iterate_extent_inodes(fs_info, found_key.objectid,
774 scrub_print_warning_inode, &swarn, false);
778 btrfs_free_path(path);
781 static inline void scrub_get_recover(struct scrub_recover *recover)
783 refcount_inc(&recover->refs);
786 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
787 struct scrub_recover *recover)
789 if (refcount_dec_and_test(&recover->refs)) {
790 btrfs_bio_counter_dec(fs_info);
791 btrfs_put_bioc(recover->bioc);
797 * scrub_handle_errored_block gets called when either verification of the
798 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
799 * case, this function handles all sectors in the bio, even though only one
801 * The goal of this function is to repair the errored block by using the
802 * contents of one of the mirrors.
804 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
806 struct scrub_ctx *sctx = sblock_to_check->sctx;
807 struct btrfs_device *dev;
808 struct btrfs_fs_info *fs_info;
810 unsigned int failed_mirror_index;
811 unsigned int is_metadata;
812 unsigned int have_csum;
813 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
814 struct scrub_block *sblock_bad;
819 bool full_stripe_locked;
820 unsigned int nofs_flag;
821 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
822 DEFAULT_RATELIMIT_BURST);
824 BUG_ON(sblock_to_check->sector_count < 1);
825 fs_info = sctx->fs_info;
826 if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
828 * If we find an error in a super block, we just report it.
829 * Super blocks will get written with the next transaction commit
832 spin_lock(&sctx->stat_lock);
833 ++sctx->stat.super_errors;
834 spin_unlock(&sctx->stat_lock);
837 logical = sblock_to_check->sectors[0]->logical;
838 BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
839 failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
840 is_metadata = !(sblock_to_check->sectors[0]->flags &
841 BTRFS_EXTENT_FLAG_DATA);
842 have_csum = sblock_to_check->sectors[0]->have_csum;
843 dev = sblock_to_check->sectors[0]->dev;
845 if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
849 * We must use GFP_NOFS because the scrub task might be waiting for a
850 * worker task executing this function and in turn a transaction commit
851 * might be waiting for the scrub task to pause (which needs to wait for all
852 * the worker tasks to complete before pausing).
853 * We do allocations in the workers through insert_full_stripe_lock()
854 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
857 nofs_flag = memalloc_nofs_save();
859 * For RAID5/6, a race can happen between scrub threads of different devices.
860 * For data corruption, the Parity and Data threads will both try
861 * to recover the data.
862 * The race can lead to doubly counted csum errors, or even an unrecoverable
865 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
867 memalloc_nofs_restore(nofs_flag);
868 spin_lock(&sctx->stat_lock);
870 sctx->stat.malloc_errors++;
871 sctx->stat.read_errors++;
872 sctx->stat.uncorrectable_errors++;
873 spin_unlock(&sctx->stat_lock);
878 * read all mirrors one after the other. This includes
879 * re-reading the extent or metadata block that failed (which was
880 * the reason this fixup code was called) another time,
881 * sector by sector this time in order to know which sectors
882 * caused I/O errors and which ones are good (for all mirrors).
883 * It is the goal to handle the situation when more than one
884 * mirror contains I/O errors, but the errors do not
885 * overlap, i.e. the data can be repaired by selecting the
886 * sectors from those mirrors without I/O error on the
887 * particular sectors. One example (with blocks >= 2 * sectorsize)
888 * would be that mirror #1 has an I/O error on the first sector,
889 * the second sector is good, and mirror #2 has an I/O error on
890 * the second sector, but the first sector is good.
891 * Then the first sector of the first mirror can be repaired by
892 * taking the first sector of the second mirror, and the
893 * second sector of the second mirror can be repaired by
894 * copying the contents of the 2nd sector of the 1st mirror.
895 * One more note: if the sectors of one mirror contain I/O
896 * errors, the checksum cannot be verified. In order to get
897 * the best data for repairing, the first attempt is to find
898 * a mirror without I/O errors and with a validated checksum.
899 * Only if this is not possible, the sectors are picked from
900 * mirrors with I/O errors without considering the checksum.
901 * If the latter is the case, at the end, the checksum of the
902 * repaired area is verified in order to correctly maintain
906 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
907 sizeof(*sblocks_for_recheck), GFP_KERNEL);
908 if (!sblocks_for_recheck) {
909 spin_lock(&sctx->stat_lock);
910 sctx->stat.malloc_errors++;
911 sctx->stat.read_errors++;
912 sctx->stat.uncorrectable_errors++;
913 spin_unlock(&sctx->stat_lock);
914 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
918 /* Setup the context, map the logical blocks and alloc the sectors */
919 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
921 spin_lock(&sctx->stat_lock);
922 sctx->stat.read_errors++;
923 sctx->stat.uncorrectable_errors++;
924 spin_unlock(&sctx->stat_lock);
925 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
928 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
929 sblock_bad = sblocks_for_recheck + failed_mirror_index;
931 /* build and submit the bios for the failed mirror, check checksums */
932 scrub_recheck_block(fs_info, sblock_bad, 1);
934 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935 sblock_bad->no_io_error_seen) {
937 * The error disappeared after reading sector by sector, or
938 * the area was part of a huge bio and other parts of the
939 * bio caused I/O errors, or the block layer merged several
940 * read requests into one and the error was caused by a
941 * different bio (usually one of the two latter cases is
944 spin_lock(&sctx->stat_lock);
945 sctx->stat.unverified_errors++;
946 sblock_to_check->data_corrected = 1;
947 spin_unlock(&sctx->stat_lock);
949 if (sctx->is_dev_replace)
950 scrub_write_block_to_dev_replace(sblock_bad);
954 if (!sblock_bad->no_io_error_seen) {
955 spin_lock(&sctx->stat_lock);
956 sctx->stat.read_errors++;
957 spin_unlock(&sctx->stat_lock);
958 if (__ratelimit(&rs))
959 scrub_print_warning("i/o error", sblock_to_check);
960 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
961 } else if (sblock_bad->checksum_error) {
962 spin_lock(&sctx->stat_lock);
963 sctx->stat.csum_errors++;
964 spin_unlock(&sctx->stat_lock);
965 if (__ratelimit(&rs))
966 scrub_print_warning("checksum error", sblock_to_check);
967 btrfs_dev_stat_inc_and_print(dev,
968 BTRFS_DEV_STAT_CORRUPTION_ERRS);
969 } else if (sblock_bad->header_error) {
970 spin_lock(&sctx->stat_lock);
971 sctx->stat.verify_errors++;
972 spin_unlock(&sctx->stat_lock);
973 if (__ratelimit(&rs))
974 scrub_print_warning("checksum/header error",
976 if (sblock_bad->generation_error)
977 btrfs_dev_stat_inc_and_print(dev,
978 BTRFS_DEV_STAT_GENERATION_ERRS);
980 btrfs_dev_stat_inc_and_print(dev,
981 BTRFS_DEV_STAT_CORRUPTION_ERRS);
984 if (sctx->readonly) {
985 ASSERT(!sctx->is_dev_replace);
990 * now build and submit the bios for the other mirrors, check
992 * First try to pick the mirror which is completely without I/O
993 * errors and also does not have a checksum error.
994 * If one is found, and if a checksum is present, the full block
995 * that is known to contain an error is rewritten. Afterwards
996 * the block is known to be corrected.
997 * If a mirror is found which is completely correct, and no
998 * checksum is present, only those sectors are rewritten that had
999 * an I/O error in the block to be repaired, since it cannot be
1000 * determined which copy of the other sectors is better (and it
1001 * could happen otherwise that a correct sector would be
1002 * overwritten by a bad one).
1004 for (mirror_index = 0; ;mirror_index++) {
1005 struct scrub_block *sblock_other;
1007 if (mirror_index == failed_mirror_index)
1010 /* raid56's mirror count can be more than BTRFS_MAX_MIRRORS */
1011 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012 if (mirror_index >= BTRFS_MAX_MIRRORS)
1014 if (!sblocks_for_recheck[mirror_index].sector_count)
1017 sblock_other = sblocks_for_recheck + mirror_index;
1019 struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1022 if (mirror_index >= max_allowed)
1024 if (!sblocks_for_recheck[1].sector_count)
1027 ASSERT(failed_mirror_index == 0);
1028 sblock_other = sblocks_for_recheck + 1;
1029 sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1032 /* build and submit the bios, check checksums */
1033 scrub_recheck_block(fs_info, sblock_other, 0);
1035 if (!sblock_other->header_error &&
1036 !sblock_other->checksum_error &&
1037 sblock_other->no_io_error_seen) {
1038 if (sctx->is_dev_replace) {
1039 scrub_write_block_to_dev_replace(sblock_other);
1040 goto corrected_error;
1042 ret = scrub_repair_block_from_good_copy(
1043 sblock_bad, sblock_other);
1045 goto corrected_error;
1050 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051 goto did_not_correct_error;
1054 * In case of I/O errors in the area that is supposed to be
1055 * repaired, continue by picking good copies of those sectors.
1056 * Select the good sectors from mirrors to rewrite bad sectors from
1057 * the area to fix. Afterwards verify the checksum of the block
1058 * that is supposed to be repaired. This verification step is
1059 * only done for the purpose of statistics counting and for the
1060 * final scrub report on whether errors remain.
1061 * A perfect algorithm could make use of the checksum and try
1062 * all possible combinations of sectors from the different mirrors
1063 * until the checksum verification succeeds. For example, when
1064 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065 * of mirror #2 is readable but the final checksum test fails,
1066 * then the 2nd sector of mirror #3 could be tried, to see whether
1067 * the final checksum then succeeds. But this would be a rare
1068 * exception and is therefore not implemented. At least it is
1069 * avoided that the good copy is overwritten.
1070 * A more useful improvement would be to pick the sectors
1071 * without I/O error based on sector sizes (512 bytes on legacy
1072 * disks) instead of on sectorsize. Then maybe 512 bytes of one
1073 * mirror could be repaired by taking 512 bytes of a different
1074 * mirror, even if other 512 byte sectors in the same sectorsize
1075 * area are unreadable.
1078 for (sector_num = 0; sector_num < sblock_bad->sector_count;
1080 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081 struct scrub_block *sblock_other = NULL;
1083 /* Skip no-io-error sectors in scrub */
1084 if (!sector_bad->io_error && !sctx->is_dev_replace)
1087 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1089 * In case of dev replace, if the raid56 rebuild process
1090 * didn't produce correct data, then copy the content
1091 * of sblock_bad to make sure the target device is identical
1092 * to the source device, instead of writing the garbage data in
1093 * the sblock_for_recheck array to the target device.
1095 sblock_other = NULL;
1096 } else if (sector_bad->io_error) {
1097 /* Try to find no-io-error sector in mirrors */
1098 for (mirror_index = 0;
1099 mirror_index < BTRFS_MAX_MIRRORS &&
1100 sblocks_for_recheck[mirror_index].sector_count > 0;
1102 if (!sblocks_for_recheck[mirror_index].
1103 sectors[sector_num]->io_error) {
1104 sblock_other = sblocks_for_recheck +
1113 if (sctx->is_dev_replace) {
1115 * Did not find a mirror to fetch the sector from.
1116 * scrub_write_sector_to_dev_replace() handles this
1117 * case (sector->io_error), by filling the block with
1118 * zeros before submitting the write request
1121 sblock_other = sblock_bad;
1123 if (scrub_write_sector_to_dev_replace(sblock_other,
1126 &fs_info->dev_replace.num_write_errors);
1129 } else if (sblock_other) {
1130 ret = scrub_repair_sector_from_good_copy(sblock_bad,
1134 sector_bad->io_error = 0;
1140 if (success && !sctx->is_dev_replace) {
1141 if (is_metadata || have_csum) {
1143 * need to verify the checksum now that all
1144 * sectors on disk are repaired (the write
1145 * request for data to be repaired is on its way).
1146 * Just be lazy and use scrub_recheck_block()
1147 * which re-reads the data before the checksum
1148 * is verified, but most likely the data comes out
1149 * of the page cache.
1151 scrub_recheck_block(fs_info, sblock_bad, 1);
1152 if (!sblock_bad->header_error &&
1153 !sblock_bad->checksum_error &&
1154 sblock_bad->no_io_error_seen)
1155 goto corrected_error;
1157 goto did_not_correct_error;
1160 spin_lock(&sctx->stat_lock);
1161 sctx->stat.corrected_errors++;
1162 sblock_to_check->data_corrected = 1;
1163 spin_unlock(&sctx->stat_lock);
1164 btrfs_err_rl_in_rcu(fs_info,
1165 "fixed up error at logical %llu on dev %s",
1166 logical, rcu_str_deref(dev->name));
1169 did_not_correct_error:
1170 spin_lock(&sctx->stat_lock);
1171 sctx->stat.uncorrectable_errors++;
1172 spin_unlock(&sctx->stat_lock);
1173 btrfs_err_rl_in_rcu(fs_info,
1174 "unable to fixup (regular) error at logical %llu on dev %s",
1175 logical, rcu_str_deref(dev->name));
1179 if (sblocks_for_recheck) {
1180 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1182 struct scrub_block *sblock = sblocks_for_recheck +
1184 struct scrub_recover *recover;
1187 for (i = 0; i < sblock->sector_count; i++) {
1188 sblock->sectors[i]->sblock = NULL;
1189 recover = sblock->sectors[i]->recover;
1191 scrub_put_recover(fs_info, recover);
1192 sblock->sectors[i]->recover = NULL;
1194 scrub_sector_put(sblock->sectors[i]);
1197 kfree(sblocks_for_recheck);
1200 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201 memalloc_nofs_restore(nofs_flag);
1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1209 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1211 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1214 return (int)bioc->num_stripes;
1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1220 int nstripes, int mirror,
1226 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1228 for (i = 0; i < nstripes; i++) {
1229 if (raid_map[i] == RAID6_Q_STRIPE ||
1230 raid_map[i] == RAID5_P_STRIPE)
1233 if (logical >= raid_map[i] &&
1234 logical < raid_map[i] + mapped_length)
1239 *stripe_offset = logical - raid_map[i];
1241 /* The other RAID type */
1242 *stripe_index = mirror;
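/*
 * A worked example for scrub_stripe_index_and_offset() above, with made-up
 * numbers: with raid_map = { 0, 64KiB, RAID5_P_STRIPE } and
 * mapped_length = 64KiB, a logical of 80KiB falls into the second data
 * stripe, so *stripe_index = 1 and *stripe_offset = 80KiB - 64KiB = 16KiB.
 */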
1247 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1248 struct scrub_block *sblocks_for_recheck)
1250 struct scrub_ctx *sctx = original_sblock->sctx;
1251 struct btrfs_fs_info *fs_info = sctx->fs_info;
1252 u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1253 u64 logical = original_sblock->sectors[0]->logical;
1254 u64 generation = original_sblock->sectors[0]->generation;
1255 u64 flags = original_sblock->sectors[0]->flags;
1256 u64 have_csum = original_sblock->sectors[0]->have_csum;
1257 struct scrub_recover *recover;
1258 struct btrfs_io_context *bioc;
1263 int sector_index = 0;
1269 * Note: the two members refs and outstanding_sectors are not used (and
1270 * not set) in the blocks that are used for the recheck procedure.
1273 while (length > 0) {
1274 sublen = min_t(u64, length, fs_info->sectorsize);
1275 mapped_length = sublen;
1279 * With a length of sectorsize, each returned stripe represents
1282 btrfs_bio_counter_inc_blocked(fs_info);
1283 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1284 logical, &mapped_length, &bioc);
1285 if (ret || !bioc || mapped_length < sublen) {
1286 btrfs_put_bioc(bioc);
1287 btrfs_bio_counter_dec(fs_info);
1291 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1293 btrfs_put_bioc(bioc);
1294 btrfs_bio_counter_dec(fs_info);
1298 refcount_set(&recover->refs, 1);
1299 recover->bioc = bioc;
1300 recover->map_length = mapped_length;
1302 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1304 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1306 for (mirror_index = 0; mirror_index < nmirrors;
1308 struct scrub_block *sblock;
1309 struct scrub_sector *sector;
1311 sblock = sblocks_for_recheck + mirror_index;
1312 sblock->sctx = sctx;
1314 sector = kzalloc(sizeof(*sector), GFP_NOFS);
1317 spin_lock(&sctx->stat_lock);
1318 sctx->stat.malloc_errors++;
1319 spin_unlock(&sctx->stat_lock);
1320 scrub_put_recover(fs_info, recover);
1323 scrub_sector_get(sector);
1324 sblock->sectors[sector_index] = sector;
1325 sector->sblock = sblock;
1326 sector->flags = flags;
1327 sector->generation = generation;
1328 sector->logical = logical;
1329 sector->have_csum = have_csum;
1331 memcpy(sector->csum,
1332 original_sblock->sectors[0]->csum,
1333 sctx->fs_info->csum_size);
1335 scrub_stripe_index_and_offset(logical,
1344 sector->physical = bioc->stripes[stripe_index].physical +
1346 sector->dev = bioc->stripes[stripe_index].dev;
1348 BUG_ON(sector_index >= original_sblock->sector_count);
1349 sector->physical_for_dev_replace =
1350 original_sblock->sectors[sector_index]->
1351 physical_for_dev_replace;
1352 /* For missing devices, dev->bdev is NULL */
1353 sector->mirror_num = mirror_index + 1;
1354 sblock->sector_count++;
1355 sector->page = alloc_page(GFP_NOFS);
1359 scrub_get_recover(recover);
1360 sector->recover = recover;
1362 scrub_put_recover(fs_info, recover);
1371 static void scrub_bio_wait_endio(struct bio *bio)
1373 complete(bio->bi_private);
1376 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1378 struct scrub_sector *sector)
1380 DECLARE_COMPLETION_ONSTACK(done);
1384 bio->bi_iter.bi_sector = sector->logical >> 9;
1385 bio->bi_private = &done;
1386 bio->bi_end_io = scrub_bio_wait_endio;
1388 mirror_num = sector->sblock->sectors[0]->mirror_num;
1389 ret = raid56_parity_recover(bio, sector->recover->bioc,
1390 sector->recover->map_length,
1395 wait_for_completion_io(&done);
1396 return blk_status_to_errno(bio->bi_status);
1399 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1400 struct scrub_block *sblock)
1402 struct scrub_sector *first_sector = sblock->sectors[0];
1406 /* All sectors in sblock belong to the same stripe on the same device. */
1407 ASSERT(first_sector->dev);
1408 if (!first_sector->dev->bdev)
1411 bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1413 for (i = 0; i < sblock->sector_count; i++) {
1414 struct scrub_sector *sector = sblock->sectors[i];
1416 WARN_ON(!sector->page);
1417 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1420 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1427 scrub_recheck_block_checksum(sblock);
1431 for (i = 0; i < sblock->sector_count; i++)
1432 sblock->sectors[i]->io_error = 1;
1434 sblock->no_io_error_seen = 0;
1438 * This function will check the on disk data for checksum errors, header errors
1439 * and read I/O errors. If any I/O errors happen, the exact sectors which are
1440 * errored are marked as being bad. The goal is to enable scrub to take those
1441 * sectors that are not errored from all the mirrors so that the sectors that
1442 * are errored in the just handled mirror can be repaired.
1444 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1445 struct scrub_block *sblock,
1446 int retry_failed_mirror)
1450 sblock->no_io_error_seen = 1;
1452 /* Shortcut for raid56 */
1453 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1454 return scrub_recheck_block_on_raid56(fs_info, sblock);
1456 for (i = 0; i < sblock->sector_count; i++) {
1457 struct scrub_sector *sector = sblock->sectors[i];
1459 struct bio_vec bvec;
1461 if (sector->dev->bdev == NULL) {
1462 sector->io_error = 1;
1463 sblock->no_io_error_seen = 0;
1467 WARN_ON(!sector->page);
1468 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1469 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1470 bio.bi_iter.bi_sector = sector->physical >> 9;
1472 btrfsic_check_bio(&bio);
1473 if (submit_bio_wait(&bio)) {
1474 sector->io_error = 1;
1475 sblock->no_io_error_seen = 0;
1481 if (sblock->no_io_error_seen)
1482 scrub_recheck_block_checksum(sblock);
1485 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1487 struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1490 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1494 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1496 sblock->header_error = 0;
1497 sblock->checksum_error = 0;
1498 sblock->generation_error = 0;
1500 if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1501 scrub_checksum_data(sblock);
1503 scrub_checksum_tree_block(sblock);
1506 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1507 struct scrub_block *sblock_good)
1512 for (i = 0; i < sblock_bad->sector_count; i++) {
1515 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1524 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1525 struct scrub_block *sblock_good,
1526 int sector_num, int force_write)
1528 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1529 struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1530 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1531 const u32 sectorsize = fs_info->sectorsize;
1533 BUG_ON(sector_bad->page == NULL);
1534 BUG_ON(sector_good->page == NULL);
1535 if (force_write || sblock_bad->header_error ||
1536 sblock_bad->checksum_error || sector_bad->io_error) {
1538 struct bio_vec bvec;
1541 if (!sector_bad->dev->bdev) {
1542 btrfs_warn_rl(fs_info,
1543 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1547 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1548 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1549 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1551 btrfsic_check_bio(&bio);
1552 ret = submit_bio_wait(&bio);
1556 btrfs_dev_stat_inc_and_print(sector_bad->dev,
1557 BTRFS_DEV_STAT_WRITE_ERRS);
1558 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1566 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1568 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1572 * This block is used for the check of the parity on the source device,
1573 * so the data needn't be written into the destination device.
1575 if (sblock->sparity)
1578 for (i = 0; i < sblock->sector_count; i++) {
1581 ret = scrub_write_sector_to_dev_replace(sblock, i);
1583 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1587 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1589 struct scrub_sector *sector = sblock->sectors[sector_num];
1591 BUG_ON(sector->page == NULL);
1592 if (sector->io_error)
1593 clear_page(page_address(sector->page));
1595 return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
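/*
 * A worked illustration for fill_writer_pointer_gap() below, with made-up
 * numbers: on a zoned target, if the previous write ended at
 * write_pointer = 1MiB and the next sector must land at
 * physical = 1MiB + 64KiB, the 64KiB gap is filled with zeroes first so the
 * zone's write pointer catches up to @physical before the real write.
 */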
1598 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1603 if (!btrfs_is_zoned(sctx->fs_info))
1606 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1609 if (sctx->write_pointer < physical) {
1610 length = physical - sctx->write_pointer;
1612 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1613 sctx->write_pointer, length);
1615 sctx->write_pointer = physical;
1620 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1621 struct scrub_sector *sector)
1623 struct scrub_bio *sbio;
1625 const u32 sectorsize = sctx->fs_info->sectorsize;
1627 mutex_lock(&sctx->wr_lock);
1629 if (!sctx->wr_curr_bio) {
1630 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1632 if (!sctx->wr_curr_bio) {
1633 mutex_unlock(&sctx->wr_lock);
1636 sctx->wr_curr_bio->sctx = sctx;
1637 sctx->wr_curr_bio->sector_count = 0;
1639 sbio = sctx->wr_curr_bio;
1640 if (sbio->sector_count == 0) {
1641 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1643 mutex_unlock(&sctx->wr_lock);
1647 sbio->physical = sector->physical_for_dev_replace;
1648 sbio->logical = sector->logical;
1649 sbio->dev = sctx->wr_tgtdev;
1651 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1652 REQ_OP_WRITE, GFP_NOFS);
1654 sbio->bio->bi_private = sbio;
1655 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1656 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1658 } else if (sbio->physical + sbio->sector_count * sectorsize !=
1659 sector->physical_for_dev_replace ||
1660 sbio->logical + sbio->sector_count * sectorsize !=
1662 scrub_wr_submit(sctx);
1666 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1667 if (ret != sectorsize) {
1668 if (sbio->sector_count < 1) {
1671 mutex_unlock(&sctx->wr_lock);
1674 scrub_wr_submit(sctx);
1678 sbio->sectors[sbio->sector_count] = sector;
1679 scrub_sector_get(sector);
1680 sbio->sector_count++;
1681 if (sbio->sector_count == sctx->sectors_per_bio)
1682 scrub_wr_submit(sctx);
1683 mutex_unlock(&sctx->wr_lock);
1688 static void scrub_wr_submit(struct scrub_ctx *sctx)
1690 struct scrub_bio *sbio;
1692 if (!sctx->wr_curr_bio)
1695 sbio = sctx->wr_curr_bio;
1696 sctx->wr_curr_bio = NULL;
1697 scrub_pending_bio_inc(sctx);
1698 /* process all writes in a single worker thread. Then the block layer
1699 * orders the requests before sending them to the driver which
1700 * doubled the write performance on spinning disks when measured
1702 btrfsic_check_bio(sbio->bio);
1703 submit_bio(sbio->bio);
1705 if (btrfs_is_zoned(sctx->fs_info))
1706 sctx->write_pointer = sbio->physical + sbio->sector_count *
1707 sctx->fs_info->sectorsize;
1710 static void scrub_wr_bio_end_io(struct bio *bio)
1712 struct scrub_bio *sbio = bio->bi_private;
1713 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1715 sbio->status = bio->bi_status;
1718 INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1719 queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1722 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1724 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1725 struct scrub_ctx *sctx = sbio->sctx;
1728 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1730 struct btrfs_dev_replace *dev_replace =
1731 &sbio->sctx->fs_info->dev_replace;
1733 for (i = 0; i < sbio->sector_count; i++) {
1734 struct scrub_sector *sector = sbio->sectors[i];
1736 sector->io_error = 1;
1737 atomic64_inc(&dev_replace->num_write_errors);
1741 for (i = 0; i < sbio->sector_count; i++)
1742 scrub_sector_put(sbio->sectors[i]);
1746 scrub_pending_bio_dec(sctx);
1749 static int scrub_checksum(struct scrub_block *sblock)
1755 * No need to initialize these stats currently,
1756 * because this function only uses the return value
1757 * instead of these stat values.
1762 sblock->header_error = 0;
1763 sblock->generation_error = 0;
1764 sblock->checksum_error = 0;
1766 WARN_ON(sblock->sector_count < 1);
1767 flags = sblock->sectors[0]->flags;
1769 if (flags & BTRFS_EXTENT_FLAG_DATA)
1770 ret = scrub_checksum_data(sblock);
1771 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1772 ret = scrub_checksum_tree_block(sblock);
1773 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1774 (void)scrub_checksum_super(sblock);
1778 scrub_handle_errored_block(sblock);
1783 static int scrub_checksum_data(struct scrub_block *sblock)
1785 struct scrub_ctx *sctx = sblock->sctx;
1786 struct btrfs_fs_info *fs_info = sctx->fs_info;
1787 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1788 u8 csum[BTRFS_CSUM_SIZE];
1789 struct scrub_sector *sector;
1792 BUG_ON(sblock->sector_count < 1);
1793 sector = sblock->sectors[0];
1794 if (!sector->have_csum)
1797 kaddr = page_address(sector->page);
1799 shash->tfm = fs_info->csum_shash;
1800 crypto_shash_init(shash);
1803 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector's
1804 * page only contains one sector of data.
1806 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1808 if (memcmp(csum, sector->csum, fs_info->csum_size))
1809 sblock->checksum_error = 1;
1810 return sblock->checksum_error;
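/*
 * A worked example for the tree block checksum below, with made-up numbers:
 * with a 16KiB nodesize and a 4KiB sectorsize, num_sectors = 4.  The hash
 * covers the first sector from offset BTRFS_CSUM_SIZE up to 4KiB plus the
 * three remaining sectors in full, i.e. nodesize - BTRFS_CSUM_SIZE bytes.
 */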
1813 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1815 struct scrub_ctx *sctx = sblock->sctx;
1816 struct btrfs_header *h;
1817 struct btrfs_fs_info *fs_info = sctx->fs_info;
1818 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1819 u8 calculated_csum[BTRFS_CSUM_SIZE];
1820 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1822 * This is done in sectorsize steps even for metadata as there's a
1823 * constraint for nodesize to be aligned to sectorsize. This will need
1824 * to change so we don't misuse data and metadata units like that.
1826 const u32 sectorsize = sctx->fs_info->sectorsize;
1827 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1829 struct scrub_sector *sector;
1832 BUG_ON(sblock->sector_count < 1);
1834 /* Each member in sectors is just one sector */
1835 ASSERT(sblock->sector_count == num_sectors);
1837 sector = sblock->sectors[0];
1838 kaddr = page_address(sector->page);
1839 h = (struct btrfs_header *)kaddr;
1840 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1843 * we don't use the getter functions here, as we
1844 * a) don't have an extent buffer and
1845 * b) the page is already kmapped
1847 if (sector->logical != btrfs_stack_header_bytenr(h))
1848 sblock->header_error = 1;
1850 if (sector->generation != btrfs_stack_header_generation(h)) {
1851 sblock->header_error = 1;
1852 sblock->generation_error = 1;
1855 if (!scrub_check_fsid(h->fsid, sector))
1856 sblock->header_error = 1;
1858 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1860 sblock->header_error = 1;
1862 shash->tfm = fs_info->csum_shash;
1863 crypto_shash_init(shash);
1864 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1865 sectorsize - BTRFS_CSUM_SIZE);
1867 for (i = 1; i < num_sectors; i++) {
1868 kaddr = page_address(sblock->sectors[i]->page);
1869 crypto_shash_update(shash, kaddr, sectorsize);
1872 crypto_shash_final(shash, calculated_csum);
1873 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1874 sblock->checksum_error = 1;
1876 return sblock->header_error || sblock->checksum_error;
1879 static int scrub_checksum_super(struct scrub_block *sblock)
1881 struct btrfs_super_block *s;
1882 struct scrub_ctx *sctx = sblock->sctx;
1883 struct btrfs_fs_info *fs_info = sctx->fs_info;
1884 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1885 u8 calculated_csum[BTRFS_CSUM_SIZE];
1886 struct scrub_sector *sector;
1891 BUG_ON(sblock->sector_count < 1);
1892 sector = sblock->sectors[0];
1893 kaddr = page_address(sector->page);
1894 s = (struct btrfs_super_block *)kaddr;
1896 if (sector->logical != btrfs_super_bytenr(s))
1899 if (sector->generation != btrfs_super_generation(s))
1902 if (!scrub_check_fsid(s->fsid, sector))
1905 shash->tfm = fs_info->csum_shash;
1906 crypto_shash_init(shash);
1907 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1908 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1910 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1913 if (fail_cor + fail_gen) {
1915 * If we find an error in a super block, we just report it.
1916 * Super blocks will get written with the next transaction commit
1919 spin_lock(&sctx->stat_lock);
1920 ++sctx->stat.super_errors;
1921 spin_unlock(&sctx->stat_lock);
1923 btrfs_dev_stat_inc_and_print(sector->dev,
1924 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1926 btrfs_dev_stat_inc_and_print(sector->dev,
1927 BTRFS_DEV_STAT_GENERATION_ERRS);
1930 return fail_cor + fail_gen;
1933 static void scrub_block_get(struct scrub_block *sblock)
1935 refcount_inc(&sblock->refs);
1938 static void scrub_block_put(struct scrub_block *sblock)
1940 if (refcount_dec_and_test(&sblock->refs)) {
1943 if (sblock->sparity)
1944 scrub_parity_put(sblock->sparity);
1946 for (i = 0; i < sblock->sector_count; i++)
1947 scrub_sector_put(sblock->sectors[i]);
1952 static void scrub_sector_get(struct scrub_sector *sector)
1954 atomic_inc(&sector->refs);
1957 static void scrub_sector_put(struct scrub_sector *sector)
1959 if (atomic_dec_and_test(&sector->refs)) {
1961 __free_page(sector->page);
1967 * Throttling of IO submission, bandwidth-limit based; the timeslice is 1
1968 * second. The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1970 static void scrub_throttle(struct scrub_ctx *sctx)
1972 const int time_slice = 1000;
1973 struct scrub_bio *sbio;
1974 struct btrfs_device *device;
1980 sbio = sctx->bios[sctx->curr];
1982 bwlimit = READ_ONCE(device->scrub_speed_max);
1987 * The slice is divided into intervals when the IO is submitted, adjusted by
1988 * bwlimit, with a maximum of 64 intervals.
1990 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1991 div = min_t(u32, 64, div);
1993 /* Start new epoch, set deadline */
1995 if (sctx->throttle_deadline == 0) {
1996 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1997 sctx->throttle_sent = 0;
2000 /* Still in the time to send? */
2001 if (ktime_before(now, sctx->throttle_deadline)) {
2002 /* If current bio is within the limit, send it */
2003 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2004 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2007 /* We're over the limit, sleep until the rest of the slice */
2008 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2010 /* New request after deadline, start new epoch */
2017 timeout = div_u64(delta * HZ, 1000);
2018 schedule_timeout_interruptible(timeout);
2021 /* Next call will start the deadline period */
2022 sctx->throttle_deadline = 0;
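/*
 * A worked example for the throttling above, with made-up numbers: with
 * scrub_speed_max set to 96MiB/s, div = min(64, 96MiB / 16MiB) = 6, so the
 * one second slice is split into ~166ms intervals and up to
 * bwlimit / div = 16MiB may be submitted per interval before sleeping.
 */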
2025 static void scrub_submit(struct scrub_ctx *sctx)
2027 struct scrub_bio *sbio;
2029 if (sctx->curr == -1)
2032 scrub_throttle(sctx);
2034 sbio = sctx->bios[sctx->curr];
2036 scrub_pending_bio_inc(sctx);
2037 btrfsic_check_bio(sbio->bio);
2038 submit_bio(sbio->bio);
2041 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2042 struct scrub_sector *sector)
2044 struct scrub_block *sblock = sector->sblock;
2045 struct scrub_bio *sbio;
2046 const u32 sectorsize = sctx->fs_info->sectorsize;
2051 * grab a fresh bio or wait for one to become available
2053 while (sctx->curr == -1) {
2054 spin_lock(&sctx->list_lock);
2055 sctx->curr = sctx->first_free;
2056 if (sctx->curr != -1) {
2057 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2058 sctx->bios[sctx->curr]->next_free = -1;
2059 sctx->bios[sctx->curr]->sector_count = 0;
2060 spin_unlock(&sctx->list_lock);
2062 spin_unlock(&sctx->list_lock);
2063 wait_event(sctx->list_wait, sctx->first_free != -1);
2066 sbio = sctx->bios[sctx->curr];
2067 if (sbio->sector_count == 0) {
2068 sbio->physical = sector->physical;
2069 sbio->logical = sector->logical;
2070 sbio->dev = sector->dev;
2072 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2073 REQ_OP_READ, GFP_NOFS);
2075 sbio->bio->bi_private = sbio;
2076 sbio->bio->bi_end_io = scrub_bio_end_io;
2077 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2079 } else if (sbio->physical + sbio->sector_count * sectorsize !=
2081 sbio->logical + sbio->sector_count * sectorsize !=
2083 sbio->dev != sector->dev) {
2088 sbio->sectors[sbio->sector_count] = sector;
2089 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2090 if (ret != sectorsize) {
2091 if (sbio->sector_count < 1) {
2100 scrub_block_get(sblock); /* one for the page added to the bio */
2101 atomic_inc(&sblock->outstanding_sectors);
2102 sbio->sector_count++;
2103 if (sbio->sector_count == sctx->sectors_per_bio)
2109 static void scrub_missing_raid56_end_io(struct bio *bio)
2111 struct scrub_block *sblock = bio->bi_private;
2112 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2115 sblock->no_io_error_seen = 0;
2119 queue_work(fs_info->scrub_workers, &sblock->work);
2122 static void scrub_missing_raid56_worker(struct work_struct *work)
2124 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2125 struct scrub_ctx *sctx = sblock->sctx;
2126 struct btrfs_fs_info *fs_info = sctx->fs_info;
2128 struct btrfs_device *dev;
2130 logical = sblock->sectors[0]->logical;
2131 dev = sblock->sectors[0]->dev;
2133 if (sblock->no_io_error_seen)
2134 scrub_recheck_block_checksum(sblock);
2136 if (!sblock->no_io_error_seen) {
2137 spin_lock(&sctx->stat_lock);
2138 sctx->stat.read_errors++;
2139 spin_unlock(&sctx->stat_lock);
2140 btrfs_err_rl_in_rcu(fs_info,
2141 "IO error rebuilding logical %llu for dev %s",
2142 logical, rcu_str_deref(dev->name));
2143 } else if (sblock->header_error || sblock->checksum_error) {
2144 spin_lock(&sctx->stat_lock);
2145 sctx->stat.uncorrectable_errors++;
2146 spin_unlock(&sctx->stat_lock);
2147 btrfs_err_rl_in_rcu(fs_info,
2148 "failed to rebuild valid logical %llu for dev %s",
2149 logical, rcu_str_deref(dev->name));
2151 scrub_write_block_to_dev_replace(sblock);
2154 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2155 mutex_lock(&sctx->wr_lock);
2156 scrub_wr_submit(sctx);
2157 mutex_unlock(&sctx->wr_lock);
2160 scrub_block_put(sblock);
2161 scrub_pending_bio_dec(sctx);
2164 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2166 struct scrub_ctx *sctx = sblock->sctx;
2167 struct btrfs_fs_info *fs_info = sctx->fs_info;
2168 u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2169 u64 logical = sblock->sectors[0]->logical;
2170 struct btrfs_io_context *bioc = NULL;
2172 struct btrfs_raid_bio *rbio;
2176 btrfs_bio_counter_inc_blocked(fs_info);
2177 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2179 if (ret || !bioc || !bioc->raid_map)
2182 if (WARN_ON(!sctx->is_dev_replace ||
2183 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2185 * We shouldn't be scrubbing a missing device. Even for dev
2186 * replace, we should only get here for RAID 5/6. We either
2187 * managed to mount something with no mirrors remaining or
2188 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2193 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2194 bio->bi_iter.bi_sector = logical >> 9;
2195 bio->bi_private = sblock;
2196 bio->bi_end_io = scrub_missing_raid56_end_io;
2198 rbio = raid56_alloc_missing_rbio(bio, bioc, length);
2202 for (i = 0; i < sblock->sector_count; i++) {
2203 struct scrub_sector *sector = sblock->sectors[i];
2206 * For now, our scrub is still one page per sector, so pgoff
2209 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2212 INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2213 scrub_block_get(sblock);
2214 scrub_pending_bio_inc(sctx);
2215 raid56_submit_missing_rbio(rbio);
2221 btrfs_bio_counter_dec(fs_info);
2222 btrfs_put_bioc(bioc);
2223 spin_lock(&sctx->stat_lock);
2224 sctx->stat.malloc_errors++;
2225 spin_unlock(&sctx->stat_lock);
2228 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2229 u64 physical, struct btrfs_device *dev, u64 flags,
2230 u64 gen, int mirror_num, u8 *csum,
2231 u64 physical_for_dev_replace)
2233 struct scrub_block *sblock;
2234 const u32 sectorsize = sctx->fs_info->sectorsize;
2237 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2239 spin_lock(&sctx->stat_lock);
2240 sctx->stat.malloc_errors++;
2241 spin_unlock(&sctx->stat_lock);
2245 /* one ref inside this function, plus one for each page added to
2247 refcount_set(&sblock->refs, 1);
2248 sblock->sctx = sctx;
2249 sblock->no_io_error_seen = 1;
2251 for (index = 0; len > 0; index++) {
2252 struct scrub_sector *sector;
2254 * Here we will allocate one page for one sector to scrub.
2255 * This is fine if PAGE_SIZE == sectorsize, but will cost
2256 * more memory for PAGE_SIZE > sectorsize case.
2258 u32 l = min(sectorsize, len);
2260 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2263 spin_lock(&sctx->stat_lock);
2264 sctx->stat.malloc_errors++;
2265 spin_unlock(&sctx->stat_lock);
2266 scrub_block_put(sblock);
2269 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2270 scrub_sector_get(sector);
2271 sblock->sectors[index] = sector;
2272 sector->sblock = sblock;
2274 sector->flags = flags;
2275 sector->generation = gen;
2276 sector->logical = logical;
2277 sector->physical = physical;
2278 sector->physical_for_dev_replace = physical_for_dev_replace;
2279 sector->mirror_num = mirror_num;
2281 sector->have_csum = 1;
2282 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2284 sector->have_csum = 0;
2286 sblock->sector_count++;
2287 sector->page = alloc_page(GFP_KERNEL);
2293 physical_for_dev_replace += l;
2296 WARN_ON(sblock->sector_count == 0);
2297 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2299 * This case should only be hit for RAID 5/6 device replace. See
2300 * the comment in scrub_missing_raid56_pages() for details.
2302 scrub_missing_raid56_pages(sblock);
2304 for (index = 0; index < sblock->sector_count; index++) {
2305 struct scrub_sector *sector = sblock->sectors[index];
2308 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2310 scrub_block_put(sblock);
2315 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2319 /* last one frees, either here or in bio completion for last page */
2320 scrub_block_put(sblock);
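/*
 * A small worked example of what the above builds, assuming a 4KiB
 * sectorsize and a 16KiB metadata block: one scrub_block is allocated
 * with four scrub_sectors, each backed by its own page from
 * alloc_page(), and each sector is queued into the shared read bio via
 * scrub_add_sector_to_rd_bio().  The scrub_block_put() above drops this
 * function's reference, so whoever drops the last reference (here or in
 * bio completion) frees the block.
 */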
2324 static void scrub_bio_end_io(struct bio *bio)
2326 struct scrub_bio *sbio = bio->bi_private;
2327 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2329 sbio->status = bio->bi_status;
2332 queue_work(fs_info->scrub_workers, &sbio->work);
2335 static void scrub_bio_end_io_worker(struct work_struct *work)
2337 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2338 struct scrub_ctx *sctx = sbio->sctx;
2341 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2343 for (i = 0; i < sbio->sector_count; i++) {
2344 struct scrub_sector *sector = sbio->sectors[i];
2346 sector->io_error = 1;
2347 sector->sblock->no_io_error_seen = 0;
2351 /* Now complete the scrub_block items that have all pages completed */
2352 for (i = 0; i < sbio->sector_count; i++) {
2353 struct scrub_sector *sector = sbio->sectors[i];
2354 struct scrub_block *sblock = sector->sblock;
2356 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2357 scrub_block_complete(sblock);
2358 scrub_block_put(sblock);
2363 spin_lock(&sctx->list_lock);
2364 sbio->next_free = sctx->first_free;
2365 sctx->first_free = sbio->index;
2366 spin_unlock(&sctx->list_lock);
2368 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2369 mutex_lock(&sctx->wr_lock);
2370 scrub_wr_submit(sctx);
2371 mutex_unlock(&sctx->wr_lock);
2374 scrub_pending_bio_dec(sctx);
2377 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2378 unsigned long *bitmap,
2383 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2385 if (len >= sparity->stripe_len) {
2386 bitmap_set(bitmap, 0, sparity->nsectors);
2390 start -= sparity->logic_start;
2391 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2392 offset = offset >> sectorsize_bits;
2393 nsectors = len >> sectorsize_bits;
2395 if (offset + nsectors <= sparity->nsectors) {
2396 bitmap_set(bitmap, offset, nsectors);
2400 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2401 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
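/*
 * Worked example of the wrap-around above, assuming a 64KiB stripe_len
 * and 4KiB sectorsize (sparity->nsectors == 16): marking a 32KiB range
 * that starts 48KiB into the stripe gives offset == 12 and a length of
 * 8 sectors, so 12 + 8 > 16 and the two bitmap_set() calls mark sectors
 * 12..15 and then wrap around to sectors 0..3.
 */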
2404 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2407 __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2410 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2413 __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2416 static void scrub_block_complete(struct scrub_block *sblock)
2420 if (!sblock->no_io_error_seen) {
2422 scrub_handle_errored_block(sblock);
2425 * If the block has a checksum error, write it via the repair mechanism in
2426 * the dev replace case, otherwise write it here in the dev replace
2429 corrupted = scrub_checksum(sblock);
2430 if (!corrupted && sblock->sctx->is_dev_replace)
2431 scrub_write_block_to_dev_replace(sblock);
2434 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2435 u64 start = sblock->sectors[0]->logical;
2436 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2437 sblock->sctx->fs_info->sectorsize;
2439 ASSERT(end - start <= U32_MAX);
2440 scrub_parity_mark_sectors_error(sblock->sparity,
2441 start, end - start);
2445 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2447 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2448 list_del(&sum->list);
2453 * Find the desired csum for range [logical, logical + sectorsize), and store
2454 * the csum into @csum.
2456 * The search source is sctx->csum_list, which is a pre-populated list
2457 * storing bytenr ordered csum ranges. We're responsible for cleaning up any range
2458 * that is before @logical.
2460 * Return 0 if there is no csum for the range.
2461 * Return 1 if there is a csum for the range, which is copied to @csum.
2463 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2467 while (!list_empty(&sctx->csum_list)) {
2468 struct btrfs_ordered_sum *sum = NULL;
2469 unsigned long index;
2470 unsigned long num_sectors;
2472 sum = list_first_entry(&sctx->csum_list,
2473 struct btrfs_ordered_sum, list);
2474 /* The current csum range is beyond our range, no csum found */
2475 if (sum->bytenr > logical)
2479 * The current sum is before our bytenr, since scrub is always
2480 * done in bytenr order, the csum will never be used anymore,
2481 * clean it up so that later calls won't bother with the range,
2482 * and continue searching the next range.
2484 if (sum->bytenr + sum->len <= logical) {
2485 drop_csum_range(sctx, sum);
2489 /* Now the csum range covers our bytenr, copy the csum */
2491 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2492 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2494 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2495 sctx->fs_info->csum_size);
2497 /* Cleanup the range if we're at the end of the csum range */
2498 if (index == num_sectors - 1)
2499 drop_csum_range(sctx, sum);
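/*
 * Example of how the pre-populated list is consumed, assuming a 4KiB
 * sectorsize and a single btrfs_ordered_sum covering [X, X + 128K):
 * lookups for the first 31 sectors each copy one csum (index 0..30) and
 * keep the entry on csum_list; the lookup for the last sector
 * (index == num_sectors - 1, i.e. 31) copies its csum and then drops
 * the whole range via drop_csum_range().  A lookup past the end of a
 * range drops the stale entry and keeps searching, while a lookup
 * before the first range finds no csum at all.
 */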
2507 /* scrub_extent() tries to collect up to 64 KiB for each bio */
2508 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2509 u64 logical, u32 len,
2510 u64 physical, struct btrfs_device *dev, u64 flags,
2511 u64 gen, int mirror_num)
2513 struct btrfs_device *src_dev = dev;
2514 u64 src_physical = physical;
2515 int src_mirror = mirror_num;
2517 u8 csum[BTRFS_CSUM_SIZE];
2520 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2521 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522 blocksize = map->stripe_len;
2524 blocksize = sctx->fs_info->sectorsize;
2525 spin_lock(&sctx->stat_lock);
2526 sctx->stat.data_extents_scrubbed++;
2527 sctx->stat.data_bytes_scrubbed += len;
2528 spin_unlock(&sctx->stat_lock);
2529 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2530 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2531 blocksize = map->stripe_len;
2533 blocksize = sctx->fs_info->nodesize;
2534 spin_lock(&sctx->stat_lock);
2535 sctx->stat.tree_extents_scrubbed++;
2536 sctx->stat.tree_bytes_scrubbed += len;
2537 spin_unlock(&sctx->stat_lock);
2539 blocksize = sctx->fs_info->sectorsize;
2544 * For the dev-replace case, @dev can be a missing device.
2545 * Regular scrub avoids running on a missing device at all,
2546 * as that would trigger tons of read errors.
2548 * Reading from a missing device would cause the read error counts to
2549 * increase unnecessarily.
2550 * So here we change the read source to a good mirror.
2552 if (sctx->is_dev_replace && !dev->bdev)
2553 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2554 &src_dev, &src_mirror);
2556 u32 l = min(len, blocksize);
2559 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2560 /* push csums to sbio */
2561 have_csum = scrub_find_csum(sctx, logical, csum);
2563 ++sctx->stat.no_csum;
2565 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2566 flags, gen, src_mirror,
2567 have_csum ? csum : NULL, physical);
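/*
 * Example of the splitting above, assuming a 4KiB sectorsize and a
 * non-RAID56 profile: a 16KiB data extent is cut into four
 * blocksize (== sectorsize) pieces, each with its own
 * scrub_find_csum() lookup and scrub_sectors() call; the resulting
 * sectors are then batched into the shared read bio (up to
 * SCRUB_SECTORS_PER_BIO of them) before that bio is submitted.
 */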
2578 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2579 u64 logical, u32 len,
2580 u64 physical, struct btrfs_device *dev,
2581 u64 flags, u64 gen, int mirror_num, u8 *csum)
2583 struct scrub_ctx *sctx = sparity->sctx;
2584 struct scrub_block *sblock;
2585 const u32 sectorsize = sctx->fs_info->sectorsize;
2588 ASSERT(IS_ALIGNED(len, sectorsize));
2590 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2592 spin_lock(&sctx->stat_lock);
2593 sctx->stat.malloc_errors++;
2594 spin_unlock(&sctx->stat_lock);
2598 /* one ref inside this function, plus one for each page added to
2600 refcount_set(&sblock->refs, 1);
2601 sblock->sctx = sctx;
2602 sblock->no_io_error_seen = 1;
2603 sblock->sparity = sparity;
2604 scrub_parity_get(sparity);
2606 for (index = 0; len > 0; index++) {
2607 struct scrub_sector *sector;
2609 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2612 spin_lock(&sctx->stat_lock);
2613 sctx->stat.malloc_errors++;
2614 spin_unlock(&sctx->stat_lock);
2615 scrub_block_put(sblock);
2618 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2619 /* For scrub block */
2620 scrub_sector_get(sector);
2621 sblock->sectors[index] = sector;
2622 /* For scrub parity */
2623 scrub_sector_get(sector);
2624 list_add_tail(&sector->list, &sparity->sectors_list);
2625 sector->sblock = sblock;
2627 sector->flags = flags;
2628 sector->generation = gen;
2629 sector->logical = logical;
2630 sector->physical = physical;
2631 sector->mirror_num = mirror_num;
2633 sector->have_csum = 1;
2634 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2636 sector->have_csum = 0;
2638 sblock->sector_count++;
2639 sector->page = alloc_page(GFP_KERNEL);
2644 /* Iterate over the stripe range in sectorsize steps */
2646 logical += sectorsize;
2647 physical += sectorsize;
2650 WARN_ON(sblock->sector_count == 0);
2651 for (index = 0; index < sblock->sector_count; index++) {
2652 struct scrub_sector *sector = sblock->sectors[index];
2655 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2657 scrub_block_put(sblock);
2662 /* Last one frees, either here or in bio completion for last sector */
2663 scrub_block_put(sblock);
2667 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2668 u64 logical, u32 len,
2669 u64 physical, struct btrfs_device *dev,
2670 u64 flags, u64 gen, int mirror_num)
2672 struct scrub_ctx *sctx = sparity->sctx;
2674 u8 csum[BTRFS_CSUM_SIZE];
2677 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2678 scrub_parity_mark_sectors_error(sparity, logical, len);
2682 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2683 blocksize = sparity->stripe_len;
2684 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2685 blocksize = sparity->stripe_len;
2687 blocksize = sctx->fs_info->sectorsize;
2692 u32 l = min(len, blocksize);
2695 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2696 /* push csums to sbio */
2697 have_csum = scrub_find_csum(sctx, logical, csum);
2701 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2702 flags, gen, mirror_num,
2703 have_csum ? csum : NULL);
2715 * Given a physical address, this will calculate its
2716 * logical offset. If this is a parity stripe, it will return
2717 * the left-most data stripe's logical offset.
2719 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2721 static int get_raid56_logic_offset(u64 physical, int num,
2722 struct map_lookup *map, u64 *offset,
2731 const int data_stripes = nr_data_stripes(map);
2733 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2735 *stripe_start = last_offset;
2737 *offset = last_offset;
2738 for (i = 0; i < data_stripes; i++) {
2739 *offset = last_offset + i * map->stripe_len;
2741 stripe_nr = div64_u64(*offset, map->stripe_len);
2742 stripe_nr = div_u64(stripe_nr, data_stripes);
2744 /* Work out the disk rotation on this stripe-set */
2745 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2746 /* Calculate which stripe this data is located on */
2748 stripe_index = rot % map->num_stripes;
2749 if (stripe_index == num)
2751 if (stripe_index < num)
2754 *offset = last_offset + j * map->stripe_len;
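/*
 * Worked example, assuming RAID5 over 3 devices with a 64KiB stripe_len
 * (2 data stripes plus one rotating parity stripe), and assuming the
 * usual btrfs RAID56 rotation where data stripe i of full stripe N sits
 * on device (N + i) % num_stripes:
 *
 * - physical 64KiB into device 1's extent (num == 1): last_offset is
 *   64K * 2 == 128K, iteration i == 0 hits stripe_index == num, so
 *   *offset == 128K and the function returns 0 (data stripe).
 *
 * - the same 64KiB offset on device 0 (num == 0): neither i == 0 nor
 *   i == 1 maps back to device 0, so the function returns 1, i.e. this
 *   position holds parity for the full stripe starting at logical
 *   offset 128K within the chunk.
 */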
2758 static void scrub_free_parity(struct scrub_parity *sparity)
2760 struct scrub_ctx *sctx = sparity->sctx;
2761 struct scrub_sector *curr, *next;
2764 nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2766 spin_lock(&sctx->stat_lock);
2767 sctx->stat.read_errors += nbits;
2768 sctx->stat.uncorrectable_errors += nbits;
2769 spin_unlock(&sctx->stat_lock);
2772 list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2773 list_del_init(&curr->list);
2774 scrub_sector_put(curr);
2780 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2782 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2784 struct scrub_ctx *sctx = sparity->sctx;
2786 scrub_free_parity(sparity);
2787 scrub_pending_bio_dec(sctx);
2790 static void scrub_parity_bio_endio(struct bio *bio)
2792 struct scrub_parity *sparity = bio->bi_private;
2793 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2796 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2797 &sparity->dbitmap, sparity->nsectors);
2801 INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2802 queue_work(fs_info->scrub_parity_workers, &sparity->work);
2805 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2807 struct scrub_ctx *sctx = sparity->sctx;
2808 struct btrfs_fs_info *fs_info = sctx->fs_info;
2810 struct btrfs_raid_bio *rbio;
2811 struct btrfs_io_context *bioc = NULL;
2815 if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2816 &sparity->ebitmap, sparity->nsectors))
2819 length = sparity->logic_end - sparity->logic_start;
2821 btrfs_bio_counter_inc_blocked(fs_info);
2822 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2824 if (ret || !bioc || !bioc->raid_map)
2827 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2828 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2829 bio->bi_private = sparity;
2830 bio->bi_end_io = scrub_parity_bio_endio;
2832 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
2839 scrub_pending_bio_inc(sctx);
2840 raid56_parity_submit_scrub_rbio(rbio);
2846 btrfs_bio_counter_dec(fs_info);
2847 btrfs_put_bioc(bioc);
2848 bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2850 spin_lock(&sctx->stat_lock);
2851 sctx->stat.malloc_errors++;
2852 spin_unlock(&sctx->stat_lock);
2854 scrub_free_parity(sparity);
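/*
 * Example of the dbitmap/ebitmap interplay above, assuming 4 sectors
 * per stripe: with dbitmap == 0b1111 (all four sectors carry data) and
 * ebitmap == 0b0010 (sector 1 could not be read or repaired), the
 * bitmap_andnot() leaves dbitmap == 0b1101, so parity is only checked
 * and rewritten against sectors known to be good; if nothing is left
 * after the subtraction, the check is skipped entirely.
 */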
2857 static void scrub_parity_get(struct scrub_parity *sparity)
2859 refcount_inc(&sparity->refs);
2862 static void scrub_parity_put(struct scrub_parity *sparity)
2864 if (!refcount_dec_and_test(&sparity->refs))
2867 scrub_parity_check_and_repair(sparity);
2871 * Return 0 if the extent item range covers any byte of the range.
2872 * Return <0 if the extent item is before @search_start.
2873 * Return >0 if the extent item is after @search_start + @search_len.
2875 static int compare_extent_item_range(struct btrfs_path *path,
2876 u64 search_start, u64 search_len)
2878 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2880 struct btrfs_key key;
2882 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2883 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2884 key.type == BTRFS_METADATA_ITEM_KEY);
2885 if (key.type == BTRFS_METADATA_ITEM_KEY)
2886 len = fs_info->nodesize;
2890 if (key.objectid + len <= search_start)
2892 if (key.objectid >= search_start + search_len)
2898 * Locate one extent item which covers any byte in range
2899 * [@search_start, @search_start + @search_length)
2901 * If the path is not initialized, we will initialize the search by doing
2902 * a btrfs_search_slot().
2903 * If the path is already initialized, we will use the path as the initial
2904 * slot, to avoid duplicated btrfs_search_slot() calls.
2906 * NOTE: If an extent item starts before @search_start, we will still
2907 * return the extent item. This is for data extents that cross stripe boundaries.
2909 * Return 0 if we found such extent item, and @path will point to the extent item.
2910 * Return >0 if no such extent item can be found, and @path will be released.
2911 * Return <0 if hit fatal error, and @path will be released.
2913 static int find_first_extent_item(struct btrfs_root *extent_root,
2914 struct btrfs_path *path,
2915 u64 search_start, u64 search_len)
2917 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2918 struct btrfs_key key;
2921 /* Continue using the existing path */
2923 goto search_forward;
2925 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2926 key.type = BTRFS_METADATA_ITEM_KEY;
2928 key.type = BTRFS_EXTENT_ITEM_KEY;
2929 key.objectid = search_start;
2930 key.offset = (u64)-1;
2932 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2938 * Here we intentionally pass 0 as @min_objectid, as there could be
2939 * an extent item starting before @search_start.
2941 ret = btrfs_previous_extent_item(extent_root, path, 0);
2945 * No matter whether we have found an extent item, the next loop will
2946 * properly do every check on the key.
2950 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2951 if (key.objectid >= search_start + search_len)
2953 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2954 key.type != BTRFS_EXTENT_ITEM_KEY)
2957 ret = compare_extent_item_range(path, search_start, search_len);
2964 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2965 ret = btrfs_next_leaf(extent_root, path);
2967 /* Either no more item or fatal error */
2968 btrfs_release_path(path);
2973 btrfs_release_path(path);
2977 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2978 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2980 struct btrfs_key key;
2981 struct btrfs_extent_item *ei;
2983 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2984 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2985 key.type == BTRFS_EXTENT_ITEM_KEY);
2986 *extent_start_ret = key.objectid;
2987 if (key.type == BTRFS_METADATA_ITEM_KEY)
2988 *size_ret = path->nodes[0]->fs_info->nodesize;
2990 *size_ret = key.offset;
2991 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2992 *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2993 *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
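/*
 * A minimal sketch of the lookup pattern the two helpers above serve
 * (the scrub loops below follow the same shape); @cur, @range_start,
 * @range_len and @scrubbed_len are illustrative names and error
 * handling is omitted:
 *
 *	while (cur < range_start + range_len) {
 *		ret = find_first_extent_item(extent_root, path, cur,
 *					     range_start + range_len - cur);
 *		if (ret > 0)
 *			break;	(no more extent items in the range)
 *		get_extent_info(path, &extent_start, &extent_size,
 *				&extent_flags, &extent_gen);
 *		(scrub [max(extent_start, cur), ...))
 *		cur = max(extent_start, cur) + scrubbed_len;
 *	}
 *	btrfs_release_path(path);
 */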
2996 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2997 u64 boundary_start, u64 boundary_len)
2999 return (extent_start < boundary_start &&
3000 extent_start + extent_len > boundary_start) ||
3001 (extent_start < boundary_start + boundary_len &&
3002 extent_start + extent_len > boundary_start + boundary_len);
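/*
 * Example, for a boundary of [64K, 128K): an extent [96K, 112K) is
 * fully inside and does not cross; [120K, 136K) crosses the right edge
 * (120K < 128K && 136K > 128K); [48K, 80K) crosses the left edge
 * (48K < 64K && 80K > 64K).
 */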
3005 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
3006 struct scrub_parity *sparity,
3007 struct map_lookup *map,
3008 struct btrfs_device *sdev,
3009 struct btrfs_path *path,
3012 struct btrfs_fs_info *fs_info = sctx->fs_info;
3013 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3014 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3015 u64 cur_logical = logical;
3018 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3020 /* Path must not be populated */
3021 ASSERT(!path->nodes[0]);
3023 while (cur_logical < logical + map->stripe_len) {
3024 struct btrfs_io_context *bioc = NULL;
3025 struct btrfs_device *extent_dev;
3031 u64 extent_physical;
3032 u64 extent_mirror_num;
3034 ret = find_first_extent_item(extent_root, path, cur_logical,
3035 logical + map->stripe_len - cur_logical);
3036 /* No more extent item in this data stripe */
3043 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3046 /* Metadata should not cross stripe boundaries */
3047 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3048 does_range_cross_boundary(extent_start, extent_size,
3049 logical, map->stripe_len)) {
3051 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3052 extent_start, logical);
3053 spin_lock(&sctx->stat_lock);
3054 sctx->stat.uncorrectable_errors++;
3055 spin_unlock(&sctx->stat_lock);
3056 cur_logical += extent_size;
3060 /* Skip hole range which doesn't have any extent */
3061 cur_logical = max(extent_start, cur_logical);
3063 /* Truncate the range inside this data stripe */
3064 extent_size = min(extent_start + extent_size,
3065 logical + map->stripe_len) - cur_logical;
3066 extent_start = cur_logical;
3067 ASSERT(extent_size <= U32_MAX);
3069 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3071 mapped_length = extent_size;
3072 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3073 &mapped_length, &bioc, 0);
3074 if (!ret && (!bioc || mapped_length < extent_size))
3077 btrfs_put_bioc(bioc);
3078 scrub_parity_mark_sectors_error(sparity, extent_start,
3082 extent_physical = bioc->stripes[0].physical;
3083 extent_mirror_num = bioc->mirror_num;
3084 extent_dev = bioc->stripes[0].dev;
3085 btrfs_put_bioc(bioc);
3087 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3088 extent_start + extent_size - 1,
3089 &sctx->csum_list, 1);
3091 scrub_parity_mark_sectors_error(sparity, extent_start,
3096 ret = scrub_extent_for_parity(sparity, extent_start,
3097 extent_size, extent_physical,
3098 extent_dev, extent_flags,
3099 extent_gen, extent_mirror_num);
3100 scrub_free_csums(sctx);
3103 scrub_parity_mark_sectors_error(sparity, extent_start,
3109 cur_logical += extent_size;
3111 btrfs_release_path(path);
3115 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3116 struct map_lookup *map,
3117 struct btrfs_device *sdev,
3121 struct btrfs_fs_info *fs_info = sctx->fs_info;
3122 struct btrfs_path *path;
3125 struct scrub_parity *sparity;
3128 path = btrfs_alloc_path();
3130 spin_lock(&sctx->stat_lock);
3131 sctx->stat.malloc_errors++;
3132 spin_unlock(&sctx->stat_lock);
3135 path->search_commit_root = 1;
3136 path->skip_locking = 1;
3138 ASSERT(map->stripe_len <= U32_MAX);
3139 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3140 ASSERT(nsectors <= BITS_PER_LONG);
3141 sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3143 spin_lock(&sctx->stat_lock);
3144 sctx->stat.malloc_errors++;
3145 spin_unlock(&sctx->stat_lock);
3146 btrfs_free_path(path);
3150 ASSERT(map->stripe_len <= U32_MAX);
3151 sparity->stripe_len = map->stripe_len;
3152 sparity->nsectors = nsectors;
3153 sparity->sctx = sctx;
3154 sparity->scrub_dev = sdev;
3155 sparity->logic_start = logic_start;
3156 sparity->logic_end = logic_end;
3157 refcount_set(&sparity->refs, 1);
3158 INIT_LIST_HEAD(&sparity->sectors_list);
3161 for (cur_logical = logic_start; cur_logical < logic_end;
3162 cur_logical += map->stripe_len) {
3163 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3164 sdev, path, cur_logical);
3169 scrub_parity_put(sparity);
3171 mutex_lock(&sctx->wr_lock);
3172 scrub_wr_submit(sctx);
3173 mutex_unlock(&sctx->wr_lock);
3175 btrfs_free_path(path);
3176 return ret < 0 ? ret : 0;
3179 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3181 if (!btrfs_is_zoned(sctx->fs_info))
3184 sctx->flush_all_writes = true;
3186 mutex_lock(&sctx->wr_lock);
3187 scrub_wr_submit(sctx);
3188 mutex_unlock(&sctx->wr_lock);
3190 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3193 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3194 u64 physical, u64 physical_end)
3196 struct btrfs_fs_info *fs_info = sctx->fs_info;
3199 if (!btrfs_is_zoned(fs_info))
3202 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3204 mutex_lock(&sctx->wr_lock);
3205 if (sctx->write_pointer < physical_end) {
3206 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3208 sctx->write_pointer);
3211 "zoned: failed to recover write pointer");
3213 mutex_unlock(&sctx->wr_lock);
3214 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3220 * Scrub one range which can only have a simple mirror based profile.
3221 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3224 * Since we may need to handle a subset of a block group, we need the @logical_start
3225 * and @logical_length parameters.
3227 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3228 struct btrfs_root *extent_root,
3229 struct btrfs_root *csum_root,
3230 struct btrfs_block_group *bg,
3231 struct map_lookup *map,
3232 u64 logical_start, u64 logical_length,
3233 struct btrfs_device *device,
3234 u64 physical, int mirror_num)
3236 struct btrfs_fs_info *fs_info = sctx->fs_info;
3237 const u64 logical_end = logical_start + logical_length;
3238 /* An artificial limit, inherited from the old scrub behavior */
3239 const u32 max_length = SZ_64K;
3240 struct btrfs_path path = { 0 };
3241 u64 cur_logical = logical_start;
3244 /* The range must be inside the bg */
3245 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3247 path.search_commit_root = 1;
3248 path.skip_locking = 1;
3249 /* Go through each extent item inside the logical range */
3250 while (cur_logical < logical_end) {
3258 if (atomic_read(&fs_info->scrub_cancel_req) ||
3259 atomic_read(&sctx->cancel_req)) {
3264 if (atomic_read(&fs_info->scrub_pause_req)) {
3265 /* Push queued extents */
3266 sctx->flush_all_writes = true;
3268 mutex_lock(&sctx->wr_lock);
3269 scrub_wr_submit(sctx);
3270 mutex_unlock(&sctx->wr_lock);
3271 wait_event(sctx->list_wait,
3272 atomic_read(&sctx->bios_in_flight) == 0);
3273 sctx->flush_all_writes = false;
3274 scrub_blocked_if_needed(fs_info);
3276 /* Block group removed? */
3277 spin_lock(&bg->lock);
3279 spin_unlock(&bg->lock);
3283 spin_unlock(&bg->lock);
3285 ret = find_first_extent_item(extent_root, &path, cur_logical,
3286 logical_end - cur_logical);
3288 /* No more extent, just update the accounting */
3289 sctx->stat.last_physical = physical + logical_length;
3295 get_extent_info(&path, &extent_start, &extent_len,
3296 &extent_flags, &extent_gen);
3297 /* Skip hole range which doesn't have any extent */
3298 cur_logical = max(extent_start, cur_logical);
3301 * Scrub len has three limits:
3302 * - Extent size limit
3303 * - Scrub range limit
3304 * This is especially important for RAID0/RAID10 to reuse
3306 * - Max scrub size limit
3308 scrub_len = min(min(extent_start + extent_len,
3309 logical_end), cur_logical + max_length) -
3312 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3313 ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3314 cur_logical + scrub_len - 1,
3315 &sctx->csum_list, 1);
3319 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3320 does_range_cross_boundary(extent_start, extent_len,
3321 logical_start, logical_length)) {
3323 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3324 extent_start, logical_start, logical_end);
3325 spin_lock(&sctx->stat_lock);
3326 sctx->stat.uncorrectable_errors++;
3327 spin_unlock(&sctx->stat_lock);
3328 cur_logical += scrub_len;
3331 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3332 cur_logical - logical_start + physical,
3333 device, extent_flags, extent_gen,
3335 scrub_free_csums(sctx);
3338 if (sctx->is_dev_replace)
3339 sync_replace_for_zoned(sctx);
3340 cur_logical += scrub_len;
3341 /* Don't hold the CPU for too long */
3344 btrfs_release_path(&path);
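/*
 * Example pass of the loop above, assuming a 256KiB data extent at the
 * start of the range and a far-away @logical_end: each iteration is
 * capped by max_length to 64KiB, so the extent is scrubbed as four
 * 64KiB pieces, each with its own csum lookup and scrub_extent() call,
 * before the loop moves on to the next extent item.
 */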
3348 /* Calculate the full stripe length for simple stripe based profiles */
3349 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3351 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3352 BTRFS_BLOCK_GROUP_RAID10));
3354 return map->num_stripes / map->sub_stripes * map->stripe_len;
3357 /* Get the logical bytenr for the stripe */
3358 static u64 simple_stripe_get_logical(struct map_lookup *map,
3359 struct btrfs_block_group *bg,
3362 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3363 BTRFS_BLOCK_GROUP_RAID10));
3364 ASSERT(stripe_index < map->num_stripes);
3367 * (stripe_index / sub_stripes) gives how many data stripes we need to
3370 return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3373 /* Get the mirror number for the stripe */
3374 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3376 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3377 BTRFS_BLOCK_GROUP_RAID10));
3378 ASSERT(stripe_index < map->num_stripes);
3380 /* For RAID0 it's always 1, for RAID10 it alternates 1,2,1,2,... */
3381 return stripe_index % map->sub_stripes + 1;
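/*
 * Worked example for the three helpers above, assuming RAID10 over four
 * devices with sub_stripes == 2 and a 64KiB stripe_len:
 * - full stripe length = 4 / 2 * 64K = 128K
 * - stripe_index == 3  => logical = (3 / 2) * 64K + bg->start
 *                                 = bg->start + 64K
 * - mirror_num         = 3 % 2 + 1 = 2
 * For RAID0 sub_stripes is 1, so mirror_num is always 1 and the logical
 * offset simply advances one stripe per device.
 */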
3384 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3385 struct btrfs_root *extent_root,
3386 struct btrfs_root *csum_root,
3387 struct btrfs_block_group *bg,
3388 struct map_lookup *map,
3389 struct btrfs_device *device,
3392 const u64 logical_increment = simple_stripe_full_stripe_len(map);
3393 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3394 const u64 orig_physical = map->stripes[stripe_index].physical;
3395 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3396 u64 cur_logical = orig_logical;
3397 u64 cur_physical = orig_physical;
3400 while (cur_logical < bg->start + bg->length) {
3402 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3403 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3406 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3407 cur_logical, map->stripe_len, device,
3408 cur_physical, mirror_num);
3411 /* Skip to next stripe which belongs to the target device */
3412 cur_logical += logical_increment;
3413 /* For physical offset, we just go to next stripe */
3414 cur_physical += map->stripe_len;
3419 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3420 struct btrfs_block_group *bg,
3421 struct map_lookup *map,
3422 struct btrfs_device *scrub_dev,
3423 int stripe_index, u64 dev_extent_len)
3425 struct btrfs_path *path;
3426 struct btrfs_fs_info *fs_info = sctx->fs_info;
3427 struct btrfs_root *root;
3428 struct btrfs_root *csum_root;
3429 struct blk_plug plug;
3430 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3431 const u64 chunk_logical = bg->start;
3433 u64 physical = map->stripes[stripe_index].physical;
3434 const u64 physical_end = physical + dev_extent_len;
3437 /* The logical increment after finishing one stripe */
3439 /* Offset inside the chunk */
3445 path = btrfs_alloc_path();
3450 * work on commit root. The related disk blocks are static as
3451 * long as COW is applied. This means it is safe to rewrite
3452 * them to repair disk errors without any race conditions
3454 path->search_commit_root = 1;
3455 path->skip_locking = 1;
3456 path->reada = READA_FORWARD;
3458 wait_event(sctx->list_wait,
3459 atomic_read(&sctx->bios_in_flight) == 0);
3460 scrub_blocked_if_needed(fs_info);
3462 root = btrfs_extent_root(fs_info, bg->start);
3463 csum_root = btrfs_csum_root(fs_info, bg->start);
3466 * collect all data csums for the stripe to avoid seeking during
3467 * the scrub. This might currently (crc32) end up being about 1MiB
3469 blk_start_plug(&plug);
3471 if (sctx->is_dev_replace &&
3472 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3473 mutex_lock(&sctx->wr_lock);
3474 sctx->write_pointer = physical;
3475 mutex_unlock(&sctx->wr_lock);
3476 sctx->flush_all_writes = true;
3480 * There used to be a big double loop to handle all profiles using the
3481 * same routine, which grew larger and more gross over time.
3483 * So here we handle each profile differently, so that simpler profiles
3484 * have a simpler scrubbing function.
3486 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3487 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3489 * The above check rules out all complex profiles, the remaining
3490 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
3491 * mirrored duplication without striping.
3493 * Only @physical and @mirror_num need to be calculated using
3496 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3497 bg->start, bg->length, scrub_dev,
3498 map->stripes[stripe_index].physical,
3503 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3504 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3505 scrub_dev, stripe_index);
3506 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3510 /* Only RAID56 goes through the old code */
3511 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3514 /* Calculate the logical end of the stripe */
3515 get_raid56_logic_offset(physical_end, stripe_index,
3516 map, &logic_end, NULL);
3517 logic_end += chunk_logical;
3519 /* Initialize @offset in case we need to go to out: label */
3520 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3521 increment = map->stripe_len * nr_data_stripes(map);
3524 * Due to the rotation, for RAID56 it's better to iterate each stripe
3525 * using its physical offset.
3527 while (physical < physical_end) {
3528 ret = get_raid56_logic_offset(physical, stripe_index, map,
3529 &logical, &stripe_logical);
3530 logical += chunk_logical;
3532 /* It is a parity stripe */
3533 stripe_logical += chunk_logical;
3534 stripe_end = stripe_logical + increment;
3535 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3544 * Now we're at a data stripe, scrub each extent in the range.
3546 * At this stage, if we ignore the repair part, inside each data
3547 * stripe it is no different than SINGLE profile.
3548 * We can reuse scrub_simple_mirror() here, as the repair part
3549 * is still based on @mirror_num.
3551 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3552 logical, map->stripe_len,
3553 scrub_dev, physical, 1);
3557 logical += increment;
3558 physical += map->stripe_len;
3559 spin_lock(&sctx->stat_lock);
3561 sctx->stat.last_physical = map->stripes[stripe_index].physical +
3564 sctx->stat.last_physical = physical;
3565 spin_unlock(&sctx->stat_lock);
3570 /* push queued extents */
3572 mutex_lock(&sctx->wr_lock);
3573 scrub_wr_submit(sctx);
3574 mutex_unlock(&sctx->wr_lock);
3576 blk_finish_plug(&plug);
3577 btrfs_free_path(path);
3579 if (sctx->is_dev_replace && ret >= 0) {
3582 ret2 = sync_write_pointer_for_zoned(sctx,
3583 chunk_logical + offset,
3584 map->stripes[stripe_index].physical,
3590 return ret < 0 ? ret : 0;
3593 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3594 struct btrfs_block_group *bg,
3595 struct btrfs_device *scrub_dev,
3599 struct btrfs_fs_info *fs_info = sctx->fs_info;
3600 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3601 struct map_lookup *map;
3602 struct extent_map *em;
3606 read_lock(&map_tree->lock);
3607 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3608 read_unlock(&map_tree->lock);
3612 * Might have been an unused block group deleted by the cleaner
3613 * kthread or relocation.
3615 spin_lock(&bg->lock);
3618 spin_unlock(&bg->lock);
3622 if (em->start != bg->start)
3624 if (em->len < dev_extent_len)
3627 map = em->map_lookup;
3628 for (i = 0; i < map->num_stripes; ++i) {
3629 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3630 map->stripes[i].physical == dev_offset) {
3631 ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
3638 free_extent_map(em);
3643 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3644 struct btrfs_block_group *cache)
3646 struct btrfs_fs_info *fs_info = cache->fs_info;
3647 struct btrfs_trans_handle *trans;
3649 if (!btrfs_is_zoned(fs_info))
3652 btrfs_wait_block_group_reservations(cache);
3653 btrfs_wait_nocow_writers(cache);
3654 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3656 trans = btrfs_join_transaction(root);
3658 return PTR_ERR(trans);
3659 return btrfs_commit_transaction(trans);
3662 static noinline_for_stack
3663 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3664 struct btrfs_device *scrub_dev, u64 start, u64 end)
3666 struct btrfs_dev_extent *dev_extent = NULL;
3667 struct btrfs_path *path;
3668 struct btrfs_fs_info *fs_info = sctx->fs_info;
3669 struct btrfs_root *root = fs_info->dev_root;
3674 struct extent_buffer *l;
3675 struct btrfs_key key;
3676 struct btrfs_key found_key;
3677 struct btrfs_block_group *cache;
3678 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3680 path = btrfs_alloc_path();
3684 path->reada = READA_FORWARD;
3685 path->search_commit_root = 1;
3686 path->skip_locking = 1;
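/*
 * The search below walks the BTRFS_DEV_EXTENT_KEY items of the scrubbed
 * device in the device tree: the key is (devid, DEV_EXTENT, physical
 * offset on the device) and the item records the owning chunk's logical
 * start (chunk_offset) and the extent length, which is what the loop
 * below feeds into scrub_chunk().
 */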
3688 key.objectid = scrub_dev->devid;
3690 key.type = BTRFS_DEV_EXTENT_KEY;
3695 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3699 if (path->slots[0] >=
3700 btrfs_header_nritems(path->nodes[0])) {
3701 ret = btrfs_next_leaf(root, path);
3714 slot = path->slots[0];
3716 btrfs_item_key_to_cpu(l, &found_key, slot);
3718 if (found_key.objectid != scrub_dev->devid)
3721 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3724 if (found_key.offset >= end)
3727 if (found_key.offset < key.offset)
3730 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3731 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3733 if (found_key.offset + dev_extent_len <= start)
3736 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3739 * get a reference on the corresponding block group to prevent
3740 * the chunk from going away while we scrub it
3742 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3744 /* some chunks are removed but not committed to disk yet,
3745 * continue scrubbing */
3749 ASSERT(cache->start <= chunk_offset);
3751 * We are using the commit root to search for device extents, so
3752 * that means we could have found a device extent item from a
3753 * block group that was deleted in the current transaction. The
3754 * logical start offset of the deleted block group, stored at
3755 * @chunk_offset, might be part of the logical address range of
3756 * a new block group (which uses different physical extents).
3757 * In this case btrfs_lookup_block_group() has returned the new
3758 * block group, and its start address is less than @chunk_offset.
3760 * We skip such new block groups, because it's pointless to
3761 * process them, as we won't find their extents because we search
3762 * for them using the commit root of the extent tree. For a device
3763 * replace it's also fine to skip it, we won't miss copying them
3764 * to the target device because we have the write duplication
3765 * setup through the regular write path (by btrfs_map_block()),
3766 * and we have committed a transaction when we started the device
3767 * replace, right after setting up the device replace state.
3769 if (cache->start < chunk_offset) {
3770 btrfs_put_block_group(cache);
3774 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3775 spin_lock(&cache->lock);
3776 if (!cache->to_copy) {
3777 spin_unlock(&cache->lock);
3778 btrfs_put_block_group(cache);
3781 spin_unlock(&cache->lock);
3785 * Make sure that while we are scrubbing the corresponding block
3786 * group doesn't get its logical address and its device extents
3787 * reused for another block group, which can possibly be of a
3788 * different type and different profile. We do this to prevent
3789 * false error detections and crashes due to bogus attempts to
3792 spin_lock(&cache->lock);
3793 if (cache->removed) {
3794 spin_unlock(&cache->lock);
3795 btrfs_put_block_group(cache);
3798 btrfs_freeze_block_group(cache);
3799 spin_unlock(&cache->lock);
3802 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3803 * to avoid deadlock caused by:
3804 * btrfs_inc_block_group_ro()
3805 * -> btrfs_wait_for_commit()
3806 * -> btrfs_commit_transaction()
3807 * -> btrfs_scrub_pause()
3809 scrub_pause_on(fs_info);
3812 * Don't do chunk preallocation for scrub.
3814 * This is especially important for SYSTEM bgs, or we can hit
3815 * -EFBIG from btrfs_finish_chunk_alloc() like:
3816 * 1. The only SYSTEM bg is marked RO.
3817 * Since SYSTEM bg is small, that's pretty common.
3818 * 2. New SYSTEM bg will be allocated
3819 * Because the regular version will allocate a new chunk.
3820 * 3. New SYSTEM bg is empty and will get cleaned up
3821 * Before cleanup really happens, it's marked RO again.
3822 * 4. Empty SYSTEM bg get scrubbed
3825 * This can easily boost the amount of SYSTEM chunks if the cleaner
3826 * thread can't be triggered fast enough, and use up all the space
3827 * of btrfs_super_block::sys_chunk_array
3829 * While for dev replace, we need to try our best to mark block
3830 * group RO, to prevent race between:
3831 * - Write duplication
3832 * Contains latest data
3834 * Contains data from commit tree
3836 * If target block group is not marked RO, nocow writes can
3837 * be overwritten by scrub copy, causing data corruption.
3838 * So for dev-replace, it's not allowed to continue if a block
3841 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3842 if (!ret && sctx->is_dev_replace) {
3843 ret = finish_extent_writes_for_zoned(root, cache);
3845 btrfs_dec_block_group_ro(cache);
3846 scrub_pause_off(fs_info);
3847 btrfs_put_block_group(cache);
3854 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3856 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3857 * fails to create a new chunk for metadata.
3858 * This is not a problem for scrub, because
3859 * metadata is always COWed, and our scrub has paused
3860 * transaction commits.
3863 } else if (ret == -ETXTBSY) {
3865 "skipping scrub of block group %llu due to active swapfile",
3867 scrub_pause_off(fs_info);
3872 "failed setting block group ro: %d", ret);
3873 btrfs_unfreeze_block_group(cache);
3874 btrfs_put_block_group(cache);
3875 scrub_pause_off(fs_info);
3880 * Now the target block group is marked RO, wait for nocow writes to
3881 * finish before dev-replace.
3882 * COW is fine, as COW never overwrites extents in commit tree.
3884 if (sctx->is_dev_replace) {
3885 btrfs_wait_nocow_writers(cache);
3886 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3890 scrub_pause_off(fs_info);
3891 down_write(&dev_replace->rwsem);
3892 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3893 dev_replace->cursor_left = found_key.offset;
3894 dev_replace->item_needs_writeback = 1;
3895 up_write(&dev_replace->rwsem);
3897 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3901 * flush, submit all pending read and write bios, afterwards
3903 * Note that in the dev replace case, a read request causes
3904 * write requests that are submitted in the read completion
3905 * worker. Therefore in the current situation, it is required
3906 * that all write requests are flushed, so that all read and
3907 * write requests are really completed when bios_in_flight
3910 sctx->flush_all_writes = true;
3912 mutex_lock(&sctx->wr_lock);
3913 scrub_wr_submit(sctx);
3914 mutex_unlock(&sctx->wr_lock);
3916 wait_event(sctx->list_wait,
3917 atomic_read(&sctx->bios_in_flight) == 0);
3919 scrub_pause_on(fs_info);
3922 * must be called before we decrease @scrub_paused.
3923 * Make sure we don't block transaction commit while
3924 * we are waiting for pending workers to finish.
3926 wait_event(sctx->list_wait,
3927 atomic_read(&sctx->workers_pending) == 0);
3928 sctx->flush_all_writes = false;
3930 scrub_pause_off(fs_info);
3932 if (sctx->is_dev_replace &&
3933 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3934 cache, found_key.offset))
3937 down_write(&dev_replace->rwsem);
3938 dev_replace->cursor_left = dev_replace->cursor_right;
3939 dev_replace->item_needs_writeback = 1;
3940 up_write(&dev_replace->rwsem);
3943 btrfs_dec_block_group_ro(cache);
3946 * We might have prevented the cleaner kthread from deleting
3947 * this block group if it was already unused because we raced
3948 * and set it to RO mode first. So add it back to the unused
3949 * list, otherwise it might not ever be deleted unless a manual
3950 * balance is triggered or it becomes used and unused again.
3952 spin_lock(&cache->lock);
3953 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3955 spin_unlock(&cache->lock);
3956 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3957 btrfs_discard_queue_work(&fs_info->discard_ctl,
3960 btrfs_mark_bg_unused(cache);
3962 spin_unlock(&cache->lock);
3965 btrfs_unfreeze_block_group(cache);
3966 btrfs_put_block_group(cache);
3969 if (sctx->is_dev_replace &&
3970 atomic64_read(&dev_replace->num_write_errors) > 0) {
3974 if (sctx->stat.malloc_errors > 0) {
3979 key.offset = found_key.offset + dev_extent_len;
3980 btrfs_release_path(path);
3983 btrfs_free_path(path);
3988 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3989 struct btrfs_device *scrub_dev)
3995 struct btrfs_fs_info *fs_info = sctx->fs_info;
3997 if (BTRFS_FS_ERROR(fs_info))
4000 /* Seed devices of a new filesystem have their own generation. */
4001 if (scrub_dev->fs_devices != fs_info->fs_devices)
4002 gen = scrub_dev->generation;
4004 gen = fs_info->last_trans_committed;
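/*
 * btrfs keeps up to BTRFS_SUPER_MIRROR_MAX super block copies per
 * device (the primary at 64KiB, mirrors at 64MiB and 256GiB);
 * btrfs_sb_offset() returns those offsets, and copies that would fall
 * beyond the device or are rejected by btrfs_check_super_location()
 * are skipped below.
 */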
4006 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4007 bytenr = btrfs_sb_offset(i);
4008 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4009 scrub_dev->commit_total_bytes)
4011 if (!btrfs_check_super_location(scrub_dev, bytenr))
4014 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4015 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4020 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4025 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4027 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4028 &fs_info->scrub_lock)) {
4029 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4030 struct workqueue_struct *scrub_wr_comp =
4031 fs_info->scrub_wr_completion_workers;
4032 struct workqueue_struct *scrub_parity =
4033 fs_info->scrub_parity_workers;
4035 fs_info->scrub_workers = NULL;
4036 fs_info->scrub_wr_completion_workers = NULL;
4037 fs_info->scrub_parity_workers = NULL;
4038 mutex_unlock(&fs_info->scrub_lock);
4041 destroy_workqueue(scrub_workers);
4043 destroy_workqueue(scrub_wr_comp);
4045 destroy_workqueue(scrub_parity);
4050 * Get a reference count on fs_info->scrub_workers. Start the workers if necessary.
4052 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4055 struct workqueue_struct *scrub_workers = NULL;
4056 struct workqueue_struct *scrub_wr_comp = NULL;
4057 struct workqueue_struct *scrub_parity = NULL;
4058 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4059 int max_active = fs_info->thread_pool_size;
4062 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4065 scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4066 is_dev_replace ? 1 : max_active);
4068 goto fail_scrub_workers;
4070 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4072 goto fail_scrub_wr_completion_workers;
4074 scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4076 goto fail_scrub_parity_workers;
4078 mutex_lock(&fs_info->scrub_lock);
4079 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4080 ASSERT(fs_info->scrub_workers == NULL &&
4081 fs_info->scrub_wr_completion_workers == NULL &&
4082 fs_info->scrub_parity_workers == NULL);
4083 fs_info->scrub_workers = scrub_workers;
4084 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4085 fs_info->scrub_parity_workers = scrub_parity;
4086 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4087 mutex_unlock(&fs_info->scrub_lock);
4090 /* Other thread raced in and created the workers for us */
4091 refcount_inc(&fs_info->scrub_workers_refcnt);
4092 mutex_unlock(&fs_info->scrub_lock);
4095 destroy_workqueue(scrub_parity);
4096 fail_scrub_parity_workers:
4097 destroy_workqueue(scrub_wr_comp);
4098 fail_scrub_wr_completion_workers:
4099 destroy_workqueue(scrub_workers);
4104 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4105 u64 end, struct btrfs_scrub_progress *progress,
4106 int readonly, int is_dev_replace)
4108 struct btrfs_dev_lookup_args args = { .devid = devid };
4109 struct scrub_ctx *sctx;
4111 struct btrfs_device *dev;
4112 unsigned int nofs_flag;
4114 if (btrfs_fs_closing(fs_info))
4117 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4119 * In this case scrub is unable to calculate the checksum
4120 * the way it is implemented. Do not handle this
4121 * situation at all because it should never happen.
4124 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4130 if (fs_info->nodesize >
4131 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4132 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4134 * Would exhaust the array bounds of the sectors member in
4135 * struct scrub_block
4138 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4139 fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4140 fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
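/*
 * For the common configuration this limit is generous: with 4KiB
 * sectors and SCRUB_MAX_SECTORS_PER_BLOCK == 16 (assuming the usual
 * 64KiB BTRFS_MAX_METADATA_BLOCKSIZE), a scrub_block can describe up to
 * 64KiB, which covers every supported nodesize.
 */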
4144 /* Allocate outside of device_list_mutex */
4145 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4147 return PTR_ERR(sctx);
4149 ret = scrub_workers_get(fs_info, is_dev_replace);
4153 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4154 dev = btrfs_find_device(fs_info->fs_devices, &args);
4155 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4157 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4162 if (!is_dev_replace && !readonly &&
4163 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4164 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4165 btrfs_err_in_rcu(fs_info,
4166 "scrub on devid %llu: filesystem on %s is not writable",
4167 devid, rcu_str_deref(dev->name));
4172 mutex_lock(&fs_info->scrub_lock);
4173 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4174 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4175 mutex_unlock(&fs_info->scrub_lock);
4176 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4181 down_read(&fs_info->dev_replace.rwsem);
4182 if (dev->scrub_ctx ||
4184 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4185 up_read(&fs_info->dev_replace.rwsem);
4186 mutex_unlock(&fs_info->scrub_lock);
4187 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4191 up_read(&fs_info->dev_replace.rwsem);
4193 sctx->readonly = readonly;
4194 dev->scrub_ctx = sctx;
4195 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4198 * By checking @scrub_pause_req here, we can avoid a
4199 * race between committing a transaction and scrubbing.
4201 __scrub_blocked_if_needed(fs_info);
4202 atomic_inc(&fs_info->scrubs_running);
4203 mutex_unlock(&fs_info->scrub_lock);
4206 * In order to avoid deadlock with reclaim when there is a transaction
4207 * trying to pause scrub, make sure we use GFP_NOFS for all the
4208 * allocations done at scrub_sectors() and scrub_sectors_for_parity()
4209 * invoked by our callees. The pausing request is done when the
4210 * transaction commit starts, and it blocks the transaction until scrub
4211 * is paused (done at specific points at scrub_stripe() or right above
4212 * before incrementing fs_info->scrubs_running).
4214 nofs_flag = memalloc_nofs_save();
4215 if (!is_dev_replace) {
4216 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4218 * by holding device list mutex, we can
4219 * kick off writing super in log tree sync.
4221 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4222 ret = scrub_supers(sctx, dev);
4223 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4227 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4228 memalloc_nofs_restore(nofs_flag);
4230 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4231 atomic_dec(&fs_info->scrubs_running);
4232 wake_up(&fs_info->scrub_pause_wait);
4234 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4237 memcpy(progress, &sctx->stat, sizeof(*progress));
4239 if (!is_dev_replace)
4240 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4241 ret ? "not finished" : "finished", devid, ret);
4243 mutex_lock(&fs_info->scrub_lock);
4244 dev->scrub_ctx = NULL;
4245 mutex_unlock(&fs_info->scrub_lock);
4247 scrub_workers_put(fs_info);
4248 scrub_put_ctx(sctx);
4252 scrub_workers_put(fs_info);
4254 scrub_free_ctx(sctx);
4259 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4261 mutex_lock(&fs_info->scrub_lock);
4262 atomic_inc(&fs_info->scrub_pause_req);
4263 while (atomic_read(&fs_info->scrubs_paused) !=
4264 atomic_read(&fs_info->scrubs_running)) {
4265 mutex_unlock(&fs_info->scrub_lock);
4266 wait_event(fs_info->scrub_pause_wait,
4267 atomic_read(&fs_info->scrubs_paused) ==
4268 atomic_read(&fs_info->scrubs_running));
4269 mutex_lock(&fs_info->scrub_lock);
4271 mutex_unlock(&fs_info->scrub_lock);
4274 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4276 atomic_dec(&fs_info->scrub_pause_req);
4277 wake_up(&fs_info->scrub_pause_wait);
4280 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4282 mutex_lock(&fs_info->scrub_lock);
4283 if (!atomic_read(&fs_info->scrubs_running)) {
4284 mutex_unlock(&fs_info->scrub_lock);
4288 atomic_inc(&fs_info->scrub_cancel_req);
4289 while (atomic_read(&fs_info->scrubs_running)) {
4290 mutex_unlock(&fs_info->scrub_lock);
4291 wait_event(fs_info->scrub_pause_wait,
4292 atomic_read(&fs_info->scrubs_running) == 0);
4293 mutex_lock(&fs_info->scrub_lock);
4295 atomic_dec(&fs_info->scrub_cancel_req);
4296 mutex_unlock(&fs_info->scrub_lock);
4301 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4303 struct btrfs_fs_info *fs_info = dev->fs_info;
4304 struct scrub_ctx *sctx;
4306 mutex_lock(&fs_info->scrub_lock);
4307 sctx = dev->scrub_ctx;
4309 mutex_unlock(&fs_info->scrub_lock);
4312 atomic_inc(&sctx->cancel_req);
4313 while (dev->scrub_ctx) {
4314 mutex_unlock(&fs_info->scrub_lock);
4315 wait_event(fs_info->scrub_pause_wait,
4316 dev->scrub_ctx == NULL);
4317 mutex_lock(&fs_info->scrub_lock);
4319 mutex_unlock(&fs_info->scrub_lock);
4324 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4325 struct btrfs_scrub_progress *progress)
4327 struct btrfs_dev_lookup_args args = { .devid = devid };
4328 struct btrfs_device *dev;
4329 struct scrub_ctx *sctx = NULL;
4331 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4332 dev = btrfs_find_device(fs_info->fs_devices, &args);
4334 sctx = dev->scrub_ctx;
4336 memcpy(progress, &sctx->stat, sizeof(*progress));
4337 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4339 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4342 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4343 u64 extent_logical, u32 extent_len,
4344 u64 *extent_physical,
4345 struct btrfs_device **extent_dev,
4346 int *extent_mirror_num)
4349 struct btrfs_io_context *bioc = NULL;
4352 mapped_length = extent_len;
4353 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4354 &mapped_length, &bioc, 0);
4355 if (ret || !bioc || mapped_length < extent_len ||
4356 !bioc->stripes[0].dev->bdev) {
4357 btrfs_put_bioc(bioc);
4361 *extent_physical = bioc->stripes[0].physical;
4362 *extent_mirror_num = bioc->mirror_num;
4363 *extent_dev = bioc->stripes[0].dev;
4364 btrfs_put_bioc(bioc);