1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4  */
5
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
10 #include "ctree.h"
11 #include "discard.h"
12 #include "volumes.h"
13 #include "disk-io.h"
14 #include "ordered-data.h"
15 #include "transaction.h"
16 #include "backref.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
21 #include "raid56.h"
22 #include "block-group.h"
23 #include "zoned.h"
24
25 /*
26  * This is only the first step towards a full-featured scrub. It reads all
27  * extents and super blocks and verifies the checksums. In case a bad checksum
28  * is found or the extent cannot be read, good data will be written back if
29  * any can be found.
30  *
31  * Future enhancements:
32  *  - In case an unrepairable extent is encountered, track which files are
33  *    affected and report them
34  *  - track and record media errors, throw out bad devices
35  *  - add a mode to also read unallocated space
36  */
37
38 struct scrub_block;
39 struct scrub_ctx;
40
41 /*
42  * The following two values only influence the performance.
43  *
44  * The first one configures an upper limit for the number of (dynamically
45  * allocated) pages that are added to a bio. The second one configures the
46  * number of parallel and outstanding I/O operations.
47  */
48 #define SCRUB_PAGES_PER_BIO     32      /* 128KiB per bio for x86 */
49 #define SCRUB_BIOS_PER_SCTX     64      /* 8MiB per device in flight for x86 */
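/*
 * A quick sanity check of the figures quoted above, assuming 4KiB pages:
 *
 *	SCRUB_PAGES_PER_BIO:	32 pages * 4KiB   = 128KiB per bio
 *	SCRUB_BIOS_PER_SCTX:	64 bios  * 128KiB = 8MiB in flight per device
 */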
50
51 /*
52  * The following value times 4KiB needs to be large enough to match the
53  * largest node/leaf/sector size that shall be supported.
54  */
55 #define SCRUB_MAX_PAGES_PER_BLOCK       (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
56
57 struct scrub_recover {
58         refcount_t              refs;
59         struct btrfs_io_context *bioc;
60         u64                     map_length;
61 };
62
63 struct scrub_page {
64         struct scrub_block      *sblock;
65         struct page             *page;
66         struct btrfs_device     *dev;
67         struct list_head        list;
68         u64                     flags;  /* extent flags */
69         u64                     generation;
70         u64                     logical;
71         u64                     physical;
72         u64                     physical_for_dev_replace;
73         atomic_t                refs;
74         u8                      mirror_num;
75         unsigned int            have_csum:1;
76         unsigned int            io_error:1;
77         u8                      csum[BTRFS_CSUM_SIZE];
78
79         struct scrub_recover    *recover;
80 };
81
82 struct scrub_bio {
83         int                     index;
84         struct scrub_ctx        *sctx;
85         struct btrfs_device     *dev;
86         struct bio              *bio;
87         blk_status_t            status;
88         u64                     logical;
89         u64                     physical;
90         struct scrub_page       *pagev[SCRUB_PAGES_PER_BIO];
91         int                     page_count;
92         int                     next_free;
93         struct btrfs_work       work;
94 };
95
96 struct scrub_block {
97         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
98         int                     page_count;
99         atomic_t                outstanding_pages;
100         refcount_t              refs; /* free mem on transition to zero */
101         struct scrub_ctx        *sctx;
102         struct scrub_parity     *sparity;
103         struct {
104                 unsigned int    header_error:1;
105                 unsigned int    checksum_error:1;
106                 unsigned int    no_io_error_seen:1;
107                 unsigned int    generation_error:1; /* also sets header_error */
108
109                 /* The following is for the data used to check parity, */
110                 /* i.e. the data that has a checksum. */
111                 unsigned int    data_corrected:1;
112         };
113         struct btrfs_work       work;
114 };
115
116 /* Used for the chunks with parity stripes, such as RAID5/6 */
117 struct scrub_parity {
118         struct scrub_ctx        *sctx;
119
120         struct btrfs_device     *scrub_dev;
121
122         u64                     logic_start;
123
124         u64                     logic_end;
125
126         int                     nsectors;
127
128         u32                     stripe_len;
129
130         refcount_t              refs;
131
132         struct list_head        spages;
133
134         /* Work of parity check and repair */
135         struct btrfs_work       work;
136
137         /* Mark the parity blocks which have data */
138         unsigned long           *dbitmap;
139
140         /*
141          * Mark the parity blocks which have data, but where errors happened
142          * when reading or checking that data
143          */
144         unsigned long           *ebitmap;
145
146         unsigned long           bitmap[];
147 };
148
149 struct scrub_ctx {
150         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
151         struct btrfs_fs_info    *fs_info;
152         int                     first_free;
153         int                     curr;
154         atomic_t                bios_in_flight;
155         atomic_t                workers_pending;
156         spinlock_t              list_lock;
157         wait_queue_head_t       list_wait;
158         struct list_head        csum_list;
159         atomic_t                cancel_req;
160         int                     readonly;
161         int                     pages_per_bio;
162
163         /* State of IO submission throttling affecting the associated device */
164         ktime_t                 throttle_deadline;
165         u64                     throttle_sent;
166
167         int                     is_dev_replace;
168         u64                     write_pointer;
169
170         struct scrub_bio        *wr_curr_bio;
171         struct mutex            wr_lock;
172         struct btrfs_device     *wr_tgtdev;
173         bool                    flush_all_writes;
174
175         /*
176          * statistics
177          */
178         struct btrfs_scrub_progress stat;
179         spinlock_t              stat_lock;
180
181         /*
182          * Use a ref counter to avoid use-after-free issues. Scrub workers
183          * decrement bios_in_flight and workers_pending and then do a wakeup
184          * on the list_wait wait queue. We must ensure the main scrub task
185          * doesn't free the scrub context before or while the workers are
186          * doing the wakeup() call.
187          */
188         refcount_t              refs;
189 };
190
191 struct scrub_warning {
192         struct btrfs_path       *path;
193         u64                     extent_item_size;
194         const char              *errstr;
195         u64                     physical;
196         u64                     logical;
197         struct btrfs_device     *dev;
198 };
199
200 struct full_stripe_lock {
201         struct rb_node node;
202         u64 logical;
203         u64 refs;
204         struct mutex mutex;
205 };
206
207 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
208                                      struct scrub_block *sblocks_for_recheck);
209 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
210                                 struct scrub_block *sblock,
211                                 int retry_failed_mirror);
212 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
213 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
214                                              struct scrub_block *sblock_good);
215 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
216                                             struct scrub_block *sblock_good,
217                                             int page_num, int force_write);
218 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
219 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
220                                            int page_num);
221 static int scrub_checksum_data(struct scrub_block *sblock);
222 static int scrub_checksum_tree_block(struct scrub_block *sblock);
223 static int scrub_checksum_super(struct scrub_block *sblock);
224 static void scrub_block_put(struct scrub_block *sblock);
225 static void scrub_page_get(struct scrub_page *spage);
226 static void scrub_page_put(struct scrub_page *spage);
227 static void scrub_parity_get(struct scrub_parity *sparity);
228 static void scrub_parity_put(struct scrub_parity *sparity);
229 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
230                        u64 physical, struct btrfs_device *dev, u64 flags,
231                        u64 gen, int mirror_num, u8 *csum,
232                        u64 physical_for_dev_replace);
233 static void scrub_bio_end_io(struct bio *bio);
234 static void scrub_bio_end_io_worker(struct btrfs_work *work);
235 static void scrub_block_complete(struct scrub_block *sblock);
236 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
237                                u64 extent_logical, u32 extent_len,
238                                u64 *extent_physical,
239                                struct btrfs_device **extent_dev,
240                                int *extent_mirror_num);
241 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
242                                     struct scrub_page *spage);
243 static void scrub_wr_submit(struct scrub_ctx *sctx);
244 static void scrub_wr_bio_end_io(struct bio *bio);
245 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
246 static void scrub_put_ctx(struct scrub_ctx *sctx);
247
248 static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
249 {
250         return spage->recover &&
251                (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
252 }
253
254 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
255 {
256         refcount_inc(&sctx->refs);
257         atomic_inc(&sctx->bios_in_flight);
258 }
259
260 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
261 {
262         atomic_dec(&sctx->bios_in_flight);
263         wake_up(&sctx->list_wait);
264         scrub_put_ctx(sctx);
265 }
266
267 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
268 {
269         while (atomic_read(&fs_info->scrub_pause_req)) {
270                 mutex_unlock(&fs_info->scrub_lock);
271                 wait_event(fs_info->scrub_pause_wait,
272                    atomic_read(&fs_info->scrub_pause_req) == 0);
273                 mutex_lock(&fs_info->scrub_lock);
274         }
275 }
276
277 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
278 {
279         atomic_inc(&fs_info->scrubs_paused);
280         wake_up(&fs_info->scrub_pause_wait);
281 }
282
283 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
284 {
285         mutex_lock(&fs_info->scrub_lock);
286         __scrub_blocked_if_needed(fs_info);
287         atomic_dec(&fs_info->scrubs_paused);
288         mutex_unlock(&fs_info->scrub_lock);
289
290         wake_up(&fs_info->scrub_pause_wait);
291 }
292
293 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
294 {
295         scrub_pause_on(fs_info);
296         scrub_pause_off(fs_info);
297 }
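/*
 * Rough sketch of this side of the pause handshake (the counterpart is
 * btrfs_scrub_pause()/btrfs_scrub_continue()):
 *
 *	scrub_pause_on(fs_info);	// announce "paused" and wake the waiter
 *	scrub_pause_off(fs_info);	// block until scrub_pause_req drops back
 *					// to zero, then resume scrubbing
 *
 * Callers sprinkle scrub_blocked_if_needed() between chunks of work so that
 * a transaction commit never has to wait long for the scrub workers.
 */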
298
299 /*
300  * Insert new full stripe lock into full stripe locks tree
301  *
302  * Return pointer to existing or newly inserted full_stripe_lock structure if
303  * everything works well.
304  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
305  *
306  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
307  * function
308  */
309 static struct full_stripe_lock *insert_full_stripe_lock(
310                 struct btrfs_full_stripe_locks_tree *locks_root,
311                 u64 fstripe_logical)
312 {
313         struct rb_node **p;
314         struct rb_node *parent = NULL;
315         struct full_stripe_lock *entry;
316         struct full_stripe_lock *ret;
317
318         lockdep_assert_held(&locks_root->lock);
319
320         p = &locks_root->root.rb_node;
321         while (*p) {
322                 parent = *p;
323                 entry = rb_entry(parent, struct full_stripe_lock, node);
324                 if (fstripe_logical < entry->logical) {
325                         p = &(*p)->rb_left;
326                 } else if (fstripe_logical > entry->logical) {
327                         p = &(*p)->rb_right;
328                 } else {
329                         entry->refs++;
330                         return entry;
331                 }
332         }
333
334         /*
335          * Insert new lock.
336          */
337         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
338         if (!ret)
339                 return ERR_PTR(-ENOMEM);
340         ret->logical = fstripe_logical;
341         ret->refs = 1;
342         mutex_init(&ret->mutex);
343
344         rb_link_node(&ret->node, parent, p);
345         rb_insert_color(&ret->node, &locks_root->root);
346         return ret;
347 }
348
349 /*
350  * Search for a full stripe lock of a block group
351  *
352  * Return pointer to existing full stripe lock if found
353  * Return NULL if not found
354  */
355 static struct full_stripe_lock *search_full_stripe_lock(
356                 struct btrfs_full_stripe_locks_tree *locks_root,
357                 u64 fstripe_logical)
358 {
359         struct rb_node *node;
360         struct full_stripe_lock *entry;
361
362         lockdep_assert_held(&locks_root->lock);
363
364         node = locks_root->root.rb_node;
365         while (node) {
366                 entry = rb_entry(node, struct full_stripe_lock, node);
367                 if (fstripe_logical < entry->logical)
368                         node = node->rb_left;
369                 else if (fstripe_logical > entry->logical)
370                         node = node->rb_right;
371                 else
372                         return entry;
373         }
374         return NULL;
375 }
376
377 /*
378  * Helper to get full stripe logical from a normal bytenr.
379  *
380  * Caller must ensure @cache is a RAID56 block group.
381  */
382 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
383 {
384         u64 ret;
385
386         /*
387          * Due to chunk item size limit, full stripe length should not be
388          * larger than U32_MAX. Just a sanity check here.
389          */
390         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
391
392         /*
393          * round_down() can only handle powers of 2, while a RAID56 full
394          * stripe length can be 64KiB * n, so we need to round down manually.
395          */
396         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
397                         cache->full_stripe_len + cache->start;
398         return ret;
399 }
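/*
 * A small worked example of the manual round down above, with hypothetical
 * numbers (a RAID5 chunk with a 128KiB full stripe, i.e. two 64KiB data
 * stripes, whose block group starts at 1MiB):
 *
 *	bytenr = 1MiB + 300KiB
 *	nr     = div64_u64(300KiB, 128KiB)  = 2
 *	ret    = 2 * 128KiB + 1MiB          = 1MiB + 256KiB
 *
 * i.e. the logical start of the full stripe that contains @bytenr.
 */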
400
401 /*
402  * Lock a full stripe to avoid concurrency between recovery and read
403  *
404  * It's only used for profiles with parity (RAID5/6); for other profiles it
405  * does nothing.
406  *
407  * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
408  * The caller must then call unlock_full_stripe() from the same context.
409  *
410  * Return <0 if an error is encountered.
411  */
412 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
413                             bool *locked_ret)
414 {
415         struct btrfs_block_group *bg_cache;
416         struct btrfs_full_stripe_locks_tree *locks_root;
417         struct full_stripe_lock *existing;
418         u64 fstripe_start;
419         int ret = 0;
420
421         *locked_ret = false;
422         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
423         if (!bg_cache) {
424                 ASSERT(0);
425                 return -ENOENT;
426         }
427
428         /* Profiles not based on parity don't need full stripe lock */
429         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
430                 goto out;
431         locks_root = &bg_cache->full_stripe_locks_root;
432
433         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
434
435         /* Now insert the full stripe lock */
436         mutex_lock(&locks_root->lock);
437         existing = insert_full_stripe_lock(locks_root, fstripe_start);
438         mutex_unlock(&locks_root->lock);
439         if (IS_ERR(existing)) {
440                 ret = PTR_ERR(existing);
441                 goto out;
442         }
443         mutex_lock(&existing->mutex);
444         *locked_ret = true;
445 out:
446         btrfs_put_block_group(bg_cache);
447         return ret;
448 }
449
450 /*
451  * Unlock a full stripe.
452  *
453  * NOTE: Caller must ensure this is called from the same context as the
454  * corresponding lock_full_stripe().
455  *
456  * Return 0 if we unlock the full stripe without problem.
457  * Return <0 for error.
458  */
459 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
460                               bool locked)
461 {
462         struct btrfs_block_group *bg_cache;
463         struct btrfs_full_stripe_locks_tree *locks_root;
464         struct full_stripe_lock *fstripe_lock;
465         u64 fstripe_start;
466         bool freeit = false;
467         int ret = 0;
468
469         /* If we didn't acquire full stripe lock, no need to continue */
470         if (!locked)
471                 return 0;
472
473         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
474         if (!bg_cache) {
475                 ASSERT(0);
476                 return -ENOENT;
477         }
478         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
479                 goto out;
480
481         locks_root = &bg_cache->full_stripe_locks_root;
482         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
483
484         mutex_lock(&locks_root->lock);
485         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
486         /* Unpaired unlock_full_stripe() detected */
487         if (!fstripe_lock) {
488                 WARN_ON(1);
489                 ret = -ENOENT;
490                 mutex_unlock(&locks_root->lock);
491                 goto out;
492         }
493
494         if (fstripe_lock->refs == 0) {
495                 WARN_ON(1);
496                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
497                         fstripe_lock->logical);
498         } else {
499                 fstripe_lock->refs--;
500         }
501
502         if (fstripe_lock->refs == 0) {
503                 rb_erase(&fstripe_lock->node, &locks_root->root);
504                 freeit = true;
505         }
506         mutex_unlock(&locks_root->lock);
507
508         mutex_unlock(&fstripe_lock->mutex);
509         if (freeit)
510                 kfree(fstripe_lock);
511 out:
512         btrfs_put_block_group(bg_cache);
513         return ret;
514 }
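/*
 * Sketch of the intended pairing, roughly how scrub_handle_errored_block()
 * below uses these two helpers (not a drop-in snippet):
 *
 *	bool locked = false;
 *	int ret;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	... recheck/repair the sectors covered by this full stripe ...
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 *
 * The mutex taken in lock_full_stripe() stays held until the matching
 * unlock_full_stripe(), which is why both calls must come from the same
 * context.
 */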
515
516 static void scrub_free_csums(struct scrub_ctx *sctx)
517 {
518         while (!list_empty(&sctx->csum_list)) {
519                 struct btrfs_ordered_sum *sum;
520                 sum = list_first_entry(&sctx->csum_list,
521                                        struct btrfs_ordered_sum, list);
522                 list_del(&sum->list);
523                 kfree(sum);
524         }
525 }
526
527 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
528 {
529         int i;
530
531         if (!sctx)
532                 return;
533
534         /* this can happen when scrub is cancelled */
535         if (sctx->curr != -1) {
536                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
537
538                 for (i = 0; i < sbio->page_count; i++) {
539                         WARN_ON(!sbio->pagev[i]->page);
540                         scrub_block_put(sbio->pagev[i]->sblock);
541                 }
542                 bio_put(sbio->bio);
543         }
544
545         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
546                 struct scrub_bio *sbio = sctx->bios[i];
547
548                 if (!sbio)
549                         break;
550                 kfree(sbio);
551         }
552
553         kfree(sctx->wr_curr_bio);
554         scrub_free_csums(sctx);
555         kfree(sctx);
556 }
557
558 static void scrub_put_ctx(struct scrub_ctx *sctx)
559 {
560         if (refcount_dec_and_test(&sctx->refs))
561                 scrub_free_ctx(sctx);
562 }
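/*
 * A minimal sketch of how the context refcount is meant to be used together
 * with scrub_pending_bio_inc()/scrub_pending_bio_dec() above:
 *
 *	scrub_pending_bio_inc(sctx);	// before submitting a bio
 *	...				// bio completes in the end_io worker
 *	scrub_pending_bio_dec(sctx);	// wakes list_wait, drops the reference
 *
 * The main scrub task keeps its own reference from scrub_setup_ctx() and
 * drops it with scrub_put_ctx() when it is done, so the context cannot be
 * freed while bios are still in flight.
 */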
563
564 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
565                 struct btrfs_fs_info *fs_info, int is_dev_replace)
566 {
567         struct scrub_ctx *sctx;
568         int             i;
569
570         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
571         if (!sctx)
572                 goto nomem;
573         refcount_set(&sctx->refs, 1);
574         sctx->is_dev_replace = is_dev_replace;
575         sctx->pages_per_bio = SCRUB_PAGES_PER_BIO;
576         sctx->curr = -1;
577         sctx->fs_info = fs_info;
578         INIT_LIST_HEAD(&sctx->csum_list);
579         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
580                 struct scrub_bio *sbio;
581
582                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
583                 if (!sbio)
584                         goto nomem;
585                 sctx->bios[i] = sbio;
586
587                 sbio->index = i;
588                 sbio->sctx = sctx;
589                 sbio->page_count = 0;
590                 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
591                                 NULL);
592
593                 if (i != SCRUB_BIOS_PER_SCTX - 1)
594                         sctx->bios[i]->next_free = i + 1;
595                 else
596                         sctx->bios[i]->next_free = -1;
597         }
598         sctx->first_free = 0;
599         atomic_set(&sctx->bios_in_flight, 0);
600         atomic_set(&sctx->workers_pending, 0);
601         atomic_set(&sctx->cancel_req, 0);
602
603         spin_lock_init(&sctx->list_lock);
604         spin_lock_init(&sctx->stat_lock);
605         init_waitqueue_head(&sctx->list_wait);
606         sctx->throttle_deadline = 0;
607
608         WARN_ON(sctx->wr_curr_bio != NULL);
609         mutex_init(&sctx->wr_lock);
610         sctx->wr_curr_bio = NULL;
611         if (is_dev_replace) {
612                 WARN_ON(!fs_info->dev_replace.tgtdev);
613                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
614                 sctx->flush_all_writes = false;
615         }
616
617         return sctx;
618
619 nomem:
620         scrub_free_ctx(sctx);
621         return ERR_PTR(-ENOMEM);
622 }
623
624 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
625                                      void *warn_ctx)
626 {
627         u32 nlink;
628         int ret;
629         int i;
630         unsigned nofs_flag;
631         struct extent_buffer *eb;
632         struct btrfs_inode_item *inode_item;
633         struct scrub_warning *swarn = warn_ctx;
634         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
635         struct inode_fs_paths *ipath = NULL;
636         struct btrfs_root *local_root;
637         struct btrfs_key key;
638
639         local_root = btrfs_get_fs_root(fs_info, root, true);
640         if (IS_ERR(local_root)) {
641                 ret = PTR_ERR(local_root);
642                 goto err;
643         }
644
645         /*
646          * this makes the path point to (inum INODE_ITEM ioff)
647          */
648         key.objectid = inum;
649         key.type = BTRFS_INODE_ITEM_KEY;
650         key.offset = 0;
651
652         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
653         if (ret) {
654                 btrfs_put_root(local_root);
655                 btrfs_release_path(swarn->path);
656                 goto err;
657         }
658
659         eb = swarn->path->nodes[0];
660         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
661                                         struct btrfs_inode_item);
662         nlink = btrfs_inode_nlink(eb, inode_item);
663         btrfs_release_path(swarn->path);
664
665         /*
666          * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
667          * uses GFP_NOFS in this context, so we keep it consistent but it does
668          * not seem to be strictly necessary.
669          */
670         nofs_flag = memalloc_nofs_save();
671         ipath = init_ipath(4096, local_root, swarn->path);
672         memalloc_nofs_restore(nofs_flag);
673         if (IS_ERR(ipath)) {
674                 btrfs_put_root(local_root);
675                 ret = PTR_ERR(ipath);
676                 ipath = NULL;
677                 goto err;
678         }
679         ret = paths_from_inode(inum, ipath);
680
681         if (ret < 0)
682                 goto err;
683
684         /*
685          * We deliberately ignore the indication that ipath might have been
686          * too small to hold all of the paths here.
687          */
688         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
689                 btrfs_warn_in_rcu(fs_info,
690 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
691                                   swarn->errstr, swarn->logical,
692                                   rcu_str_deref(swarn->dev->name),
693                                   swarn->physical,
694                                   root, inum, offset,
695                                   fs_info->sectorsize, nlink,
696                                   (char *)(unsigned long)ipath->fspath->val[i]);
697
698         btrfs_put_root(local_root);
699         free_ipath(ipath);
700         return 0;
701
702 err:
703         btrfs_warn_in_rcu(fs_info,
704                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
705                           swarn->errstr, swarn->logical,
706                           rcu_str_deref(swarn->dev->name),
707                           swarn->physical,
708                           root, inum, offset, ret);
709
710         free_ipath(ipath);
711         return 0;
712 }
713
714 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
715 {
716         struct btrfs_device *dev;
717         struct btrfs_fs_info *fs_info;
718         struct btrfs_path *path;
719         struct btrfs_key found_key;
720         struct extent_buffer *eb;
721         struct btrfs_extent_item *ei;
722         struct scrub_warning swarn;
723         unsigned long ptr = 0;
724         u64 extent_item_pos;
725         u64 flags = 0;
726         u64 ref_root;
727         u32 item_size;
728         u8 ref_level = 0;
729         int ret;
730
731         WARN_ON(sblock->page_count < 1);
732         dev = sblock->pagev[0]->dev;
733         fs_info = sblock->sctx->fs_info;
734
735         path = btrfs_alloc_path();
736         if (!path)
737                 return;
738
739         swarn.physical = sblock->pagev[0]->physical;
740         swarn.logical = sblock->pagev[0]->logical;
741         swarn.errstr = errstr;
742         swarn.dev = NULL;
743
744         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
745                                   &flags);
746         if (ret < 0)
747                 goto out;
748
749         extent_item_pos = swarn.logical - found_key.objectid;
750         swarn.extent_item_size = found_key.offset;
751
752         eb = path->nodes[0];
753         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
754         item_size = btrfs_item_size(eb, path->slots[0]);
755
756         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
757                 do {
758                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
759                                                       item_size, &ref_root,
760                                                       &ref_level);
761                         btrfs_warn_in_rcu(fs_info,
762 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
763                                 errstr, swarn.logical,
764                                 rcu_str_deref(dev->name),
765                                 swarn.physical,
766                                 ref_level ? "node" : "leaf",
767                                 ret < 0 ? -1 : ref_level,
768                                 ret < 0 ? -1 : ref_root);
769                 } while (ret != 1);
770                 btrfs_release_path(path);
771         } else {
772                 btrfs_release_path(path);
773                 swarn.path = path;
774                 swarn.dev = dev;
775                 iterate_extent_inodes(fs_info, found_key.objectid,
776                                         extent_item_pos, 1,
777                                         scrub_print_warning_inode, &swarn, false);
778         }
779
780 out:
781         btrfs_free_path(path);
782 }
783
784 static inline void scrub_get_recover(struct scrub_recover *recover)
785 {
786         refcount_inc(&recover->refs);
787 }
788
789 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
790                                      struct scrub_recover *recover)
791 {
792         if (refcount_dec_and_test(&recover->refs)) {
793                 btrfs_bio_counter_dec(fs_info);
794                 btrfs_put_bioc(recover->bioc);
795                 kfree(recover);
796         }
797 }
798
799 /*
800  * scrub_handle_errored_block gets called when either verification of the
801  * pages failed or the bio failed to read, e.g. with EIO. In the latter
802  * case, this function handles all pages in the bio, even though only one
803  * may be bad.
804  * The goal of this function is to repair the errored block by using the
805  * contents of one of the mirrors.
806  */
807 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
808 {
809         struct scrub_ctx *sctx = sblock_to_check->sctx;
810         struct btrfs_device *dev;
811         struct btrfs_fs_info *fs_info;
812         u64 logical;
813         unsigned int failed_mirror_index;
814         unsigned int is_metadata;
815         unsigned int have_csum;
816         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
817         struct scrub_block *sblock_bad;
818         int ret;
819         int mirror_index;
820         int page_num;
821         int success;
822         bool full_stripe_locked;
823         unsigned int nofs_flag;
824         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
825                                       DEFAULT_RATELIMIT_BURST);
826
827         BUG_ON(sblock_to_check->page_count < 1);
828         fs_info = sctx->fs_info;
829         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
830                 /*
831                  * If we find an error in a super block, we just report it.
832                  * Super blocks get rewritten with the next transaction
833                  * commit anyway.
834                  */
835                 spin_lock(&sctx->stat_lock);
836                 ++sctx->stat.super_errors;
837                 spin_unlock(&sctx->stat_lock);
838                 return 0;
839         }
840         logical = sblock_to_check->pagev[0]->logical;
841         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
842         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
843         is_metadata = !(sblock_to_check->pagev[0]->flags &
844                         BTRFS_EXTENT_FLAG_DATA);
845         have_csum = sblock_to_check->pagev[0]->have_csum;
846         dev = sblock_to_check->pagev[0]->dev;
847
848         if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
849                 return 0;
850
851         /*
852          * We must use GFP_NOFS because the scrub task might be waiting for a
853          * worker task executing this function and in turn a transaction commit
854          * might be waiting for the scrub task to pause (which needs to wait for all
855          * the worker tasks to complete before pausing).
856          * We do allocations in the workers through insert_full_stripe_lock()
857          * and scrub_add_page_to_wr_bio(), which happens down the call chain of
858          * this function.
859          */
860         nofs_flag = memalloc_nofs_save();
861         /*
862          * For RAID5/6, a race can happen between scrub threads of different
863          * devices. On data corruption, the parity and data threads will both
864          * try to recover the data.
865          * The race can lead to a doubly counted csum error, or even an
866          * unrecoverable error.
867          */
868         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
869         if (ret < 0) {
870                 memalloc_nofs_restore(nofs_flag);
871                 spin_lock(&sctx->stat_lock);
872                 if (ret == -ENOMEM)
873                         sctx->stat.malloc_errors++;
874                 sctx->stat.read_errors++;
875                 sctx->stat.uncorrectable_errors++;
876                 spin_unlock(&sctx->stat_lock);
877                 return ret;
878         }
879
880         /*
881          * Read all mirrors one after the other. This includes re-reading
882          * the extent or metadata block that failed (that was the reason
883          * this fixup code was called) another time,
884          * sector by sector this time in order to know which sectors
885          * caused I/O errors and which ones are good (for all mirrors).
886          * It is the goal to handle the situation when more than one
887          * mirror contains I/O errors, but the errors do not
888          * overlap, i.e. the data can be repaired by selecting the
889          * sectors from those mirrors without I/O error on the
890          * particular sectors. One example (with blocks >= 2 * sectorsize)
891          * would be that mirror #1 has an I/O error on the first sector,
892          * the second sector is good, and mirror #2 has an I/O error on
893          * the second sector, but the first sector is good.
894          * Then the first sector of the first mirror can be repaired by
895          * taking the first sector of the second mirror, and the
896          * second sector of the second mirror can be repaired by
897          * copying the contents of the 2nd sector of the 1st mirror.
898          * One more note: if the sectors of one mirror contain I/O
899          * errors, the checksum cannot be verified. In order to get
900          * the best data for repairing, the first attempt is to find
901          * a mirror without I/O errors and with a validated checksum.
902          * Only if this is not possible, the sectors are picked from
903          * mirrors with I/O errors without considering the checksum.
904          * If the latter is the case, at the end, the checksum of the
905          * repaired area is verified in order to correctly maintain
906          * the statistics.
907          */
908
909         sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
910                                       sizeof(*sblocks_for_recheck), GFP_KERNEL);
911         if (!sblocks_for_recheck) {
912                 spin_lock(&sctx->stat_lock);
913                 sctx->stat.malloc_errors++;
914                 sctx->stat.read_errors++;
915                 sctx->stat.uncorrectable_errors++;
916                 spin_unlock(&sctx->stat_lock);
917                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
918                 goto out;
919         }
920
921         /* setup the context, map the logical blocks and alloc the pages */
922         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
923         if (ret) {
924                 spin_lock(&sctx->stat_lock);
925                 sctx->stat.read_errors++;
926                 sctx->stat.uncorrectable_errors++;
927                 spin_unlock(&sctx->stat_lock);
928                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
929                 goto out;
930         }
931         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
932         sblock_bad = sblocks_for_recheck + failed_mirror_index;
933
934         /* build and submit the bios for the failed mirror, check checksums */
935         scrub_recheck_block(fs_info, sblock_bad, 1);
936
937         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
938             sblock_bad->no_io_error_seen) {
939                 /*
940                  * the error disappeared after reading page by page, or
941                  * the area was part of a huge bio and other parts of the
942                  * bio caused I/O errors, or the block layer merged several
943                  * read requests into one and the error is caused by a
944                  * different bio (usually one of the two latter cases is
945                  * the cause)
946                  */
947                 spin_lock(&sctx->stat_lock);
948                 sctx->stat.unverified_errors++;
949                 sblock_to_check->data_corrected = 1;
950                 spin_unlock(&sctx->stat_lock);
951
952                 if (sctx->is_dev_replace)
953                         scrub_write_block_to_dev_replace(sblock_bad);
954                 goto out;
955         }
956
957         if (!sblock_bad->no_io_error_seen) {
958                 spin_lock(&sctx->stat_lock);
959                 sctx->stat.read_errors++;
960                 spin_unlock(&sctx->stat_lock);
961                 if (__ratelimit(&rs))
962                         scrub_print_warning("i/o error", sblock_to_check);
963                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
964         } else if (sblock_bad->checksum_error) {
965                 spin_lock(&sctx->stat_lock);
966                 sctx->stat.csum_errors++;
967                 spin_unlock(&sctx->stat_lock);
968                 if (__ratelimit(&rs))
969                         scrub_print_warning("checksum error", sblock_to_check);
970                 btrfs_dev_stat_inc_and_print(dev,
971                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
972         } else if (sblock_bad->header_error) {
973                 spin_lock(&sctx->stat_lock);
974                 sctx->stat.verify_errors++;
975                 spin_unlock(&sctx->stat_lock);
976                 if (__ratelimit(&rs))
977                         scrub_print_warning("checksum/header error",
978                                             sblock_to_check);
979                 if (sblock_bad->generation_error)
980                         btrfs_dev_stat_inc_and_print(dev,
981                                 BTRFS_DEV_STAT_GENERATION_ERRS);
982                 else
983                         btrfs_dev_stat_inc_and_print(dev,
984                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
985         }
986
987         if (sctx->readonly) {
988                 ASSERT(!sctx->is_dev_replace);
989                 goto out;
990         }
991
992         /*
993          * now build and submit the bios for the other mirrors, check
994          * checksums.
995          * First try to pick the mirror which is completely without I/O
996          * errors and also does not have a checksum error.
997          * If one is found, and if a checksum is present, the full block
998          * that is known to contain an error is rewritten. Afterwards
999          * the block is known to be corrected.
1000          * If a mirror is found which is completely correct, and no
1001          * checksum is present, only those pages are rewritten that had
1002          * an I/O error in the block to be repaired, since it cannot be
1003          * determined, which copy of the other pages is better (and it
1004          * could happen otherwise that a correct page would be
1005          * overwritten by a bad one).
1006          */
1007         for (mirror_index = 0; ;mirror_index++) {
1008                 struct scrub_block *sblock_other;
1009
1010                 if (mirror_index == failed_mirror_index)
1011                         continue;
1012
1013                 /* RAID56 can have more mirrors than BTRFS_MAX_MIRRORS */
1014                 if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1015                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1016                                 break;
1017                         if (!sblocks_for_recheck[mirror_index].page_count)
1018                                 break;
1019
1020                         sblock_other = sblocks_for_recheck + mirror_index;
1021                 } else {
1022                         struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1023                         int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1024
1025                         if (mirror_index >= max_allowed)
1026                                 break;
1027                         if (!sblocks_for_recheck[1].page_count)
1028                                 break;
1029
1030                         ASSERT(failed_mirror_index == 0);
1031                         sblock_other = sblocks_for_recheck + 1;
1032                         sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1033                 }
1034
1035                 /* build and submit the bios, check checksums */
1036                 scrub_recheck_block(fs_info, sblock_other, 0);
1037
1038                 if (!sblock_other->header_error &&
1039                     !sblock_other->checksum_error &&
1040                     sblock_other->no_io_error_seen) {
1041                         if (sctx->is_dev_replace) {
1042                                 scrub_write_block_to_dev_replace(sblock_other);
1043                                 goto corrected_error;
1044                         } else {
1045                                 ret = scrub_repair_block_from_good_copy(
1046                                                 sblock_bad, sblock_other);
1047                                 if (!ret)
1048                                         goto corrected_error;
1049                         }
1050                 }
1051         }
1052
1053         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1054                 goto did_not_correct_error;
1055
1056         /*
1057          * In case of I/O errors in the area that is supposed to be
1058          * repaired, continue by picking good copies of those sectors.
1059          * Select the good sectors from mirrors to rewrite bad sectors from
1060          * the area to fix. Afterwards verify the checksum of the block
1061          * that is supposed to be repaired. This verification step is
1062          * only done for the purpose of statistic counting and for the
1063          * final scrub report on whether errors remain.
1064          * A perfect algorithm could make use of the checksum and try
1065          * all possible combinations of sectors from the different mirrors
1066          * until the checksum verification succeeds. For example, when
1067          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1068          * of mirror #2 is readable but the final checksum test fails,
1069          * then the 2nd sector of mirror #3 could be tried, to see whether
1070          * the final checksum then succeeds. But this would be a rare
1071          * exception and is therefore not implemented. At least it is
1072          * avoided that the good copy is overwritten.
1073          * A more useful improvement would be to pick the sectors
1074          * without I/O error based on sector sizes (512 bytes on legacy
1075          * disks) instead of on sectorsize. Then maybe 512 byte of one
1076          * mirror could be repaired by taking 512 byte of a different
1077          * mirror, even if other 512 byte sectors in the same sectorsize
1078          * area are unreadable.
1079          */
1080         success = 1;
1081         for (page_num = 0; page_num < sblock_bad->page_count;
1082              page_num++) {
1083                 struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1084                 struct scrub_block *sblock_other = NULL;
1085
1086                 /* skip no-io-error page in scrub */
1087                 if (!spage_bad->io_error && !sctx->is_dev_replace)
1088                         continue;
1089
1090                 if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1091                         /*
1092                          * In case of dev replace, if the raid56 rebuild process
1093                          * didn't produce correct data, copy the content of
1094                          * sblock_bad to make sure the target device is identical
1095                          * to the source device, instead of writing garbage from
1096                          * the sblocks_for_recheck array to the target device.
1097                          */
1098                         sblock_other = NULL;
1099                 } else if (spage_bad->io_error) {
1100                         /* try to find no-io-error page in mirrors */
1101                         for (mirror_index = 0;
1102                              mirror_index < BTRFS_MAX_MIRRORS &&
1103                              sblocks_for_recheck[mirror_index].page_count > 0;
1104                              mirror_index++) {
1105                                 if (!sblocks_for_recheck[mirror_index].
1106                                     pagev[page_num]->io_error) {
1107                                         sblock_other = sblocks_for_recheck +
1108                                                        mirror_index;
1109                                         break;
1110                                 }
1111                         }
1112                         if (!sblock_other)
1113                                 success = 0;
1114                 }
1115
1116                 if (sctx->is_dev_replace) {
1117                         /*
1118                          * did not find a mirror to fetch the page
1119                          * from. scrub_write_page_to_dev_replace()
1120                          * handles this case (page->io_error), by
1121                          * filling the block with zeros before
1122                          * submitting the write request
1123                          */
1124                         if (!sblock_other)
1125                                 sblock_other = sblock_bad;
1126
1127                         if (scrub_write_page_to_dev_replace(sblock_other,
1128                                                             page_num) != 0) {
1129                                 atomic64_inc(
1130                                         &fs_info->dev_replace.num_write_errors);
1131                                 success = 0;
1132                         }
1133                 } else if (sblock_other) {
1134                         ret = scrub_repair_page_from_good_copy(sblock_bad,
1135                                                                sblock_other,
1136                                                                page_num, 0);
1137                         if (ret == 0)
1138                                 spage_bad->io_error = 0;
1139                         else
1140                                 success = 0;
1141                 }
1142         }
1143
1144         if (success && !sctx->is_dev_replace) {
1145                 if (is_metadata || have_csum) {
1146                         /*
1147                          * need to verify the checksum now that all
1148                          * sectors on disk are repaired (the write
1149                          * request for data to be repaired is on its way).
1150                          * Just be lazy and use scrub_recheck_block()
1151                          * which re-reads the data before the checksum
1152                          * is verified, but most likely the data comes out
1153                          * of the page cache.
1154                          */
1155                         scrub_recheck_block(fs_info, sblock_bad, 1);
1156                         if (!sblock_bad->header_error &&
1157                             !sblock_bad->checksum_error &&
1158                             sblock_bad->no_io_error_seen)
1159                                 goto corrected_error;
1160                         else
1161                                 goto did_not_correct_error;
1162                 } else {
1163 corrected_error:
1164                         spin_lock(&sctx->stat_lock);
1165                         sctx->stat.corrected_errors++;
1166                         sblock_to_check->data_corrected = 1;
1167                         spin_unlock(&sctx->stat_lock);
1168                         btrfs_err_rl_in_rcu(fs_info,
1169                                 "fixed up error at logical %llu on dev %s",
1170                                 logical, rcu_str_deref(dev->name));
1171                 }
1172         } else {
1173 did_not_correct_error:
1174                 spin_lock(&sctx->stat_lock);
1175                 sctx->stat.uncorrectable_errors++;
1176                 spin_unlock(&sctx->stat_lock);
1177                 btrfs_err_rl_in_rcu(fs_info,
1178                         "unable to fixup (regular) error at logical %llu on dev %s",
1179                         logical, rcu_str_deref(dev->name));
1180         }
1181
1182 out:
1183         if (sblocks_for_recheck) {
1184                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1185                      mirror_index++) {
1186                         struct scrub_block *sblock = sblocks_for_recheck +
1187                                                      mirror_index;
1188                         struct scrub_recover *recover;
1189                         int page_index;
1190
1191                         for (page_index = 0; page_index < sblock->page_count;
1192                              page_index++) {
1193                                 sblock->pagev[page_index]->sblock = NULL;
1194                                 recover = sblock->pagev[page_index]->recover;
1195                                 if (recover) {
1196                                         scrub_put_recover(fs_info, recover);
1197                                         sblock->pagev[page_index]->recover =
1198                                                                         NULL;
1199                                 }
1200                                 scrub_page_put(sblock->pagev[page_index]);
1201                         }
1202                 }
1203                 kfree(sblocks_for_recheck);
1204         }
1205
1206         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1207         memalloc_nofs_restore(nofs_flag);
1208         if (ret < 0)
1209                 return ret;
1210         return 0;
1211 }
1212
1213 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1214 {
1215         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1216                 return 2;
1217         else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1218                 return 3;
1219         else
1220                 return (int)bioc->num_stripes;
1221 }
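/*
 * Illustration of the counts above: with RAID5 the data can either be read
 * directly or rebuilt from the remaining data plus parity, so there are two
 * "mirrors"; RAID6 adds a second parity stripe and therefore a third way to
 * reconstruct the data. For the other profiles every stripe returned for the
 * mapping is a real copy.
 */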
1222
1223 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1224                                                  u64 *raid_map,
1225                                                  u64 mapped_length,
1226                                                  int nstripes, int mirror,
1227                                                  int *stripe_index,
1228                                                  u64 *stripe_offset)
1229 {
1230         int i;
1231
1232         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1233                 /* RAID5/6 */
1234                 for (i = 0; i < nstripes; i++) {
1235                         if (raid_map[i] == RAID6_Q_STRIPE ||
1236                             raid_map[i] == RAID5_P_STRIPE)
1237                                 continue;
1238
1239                         if (logical >= raid_map[i] &&
1240                             logical < raid_map[i] + mapped_length)
1241                                 break;
1242                 }
1243
1244                 *stripe_index = i;
1245                 *stripe_offset = logical - raid_map[i];
1246         } else {
1247                 /* The other RAID type */
1248                 *stripe_index = mirror;
1249                 *stripe_offset = 0;
1250         }
1251 }
1252
1253 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1254                                      struct scrub_block *sblocks_for_recheck)
1255 {
1256         struct scrub_ctx *sctx = original_sblock->sctx;
1257         struct btrfs_fs_info *fs_info = sctx->fs_info;
1258         u64 length = original_sblock->page_count * fs_info->sectorsize;
1259         u64 logical = original_sblock->pagev[0]->logical;
1260         u64 generation = original_sblock->pagev[0]->generation;
1261         u64 flags = original_sblock->pagev[0]->flags;
1262         u64 have_csum = original_sblock->pagev[0]->have_csum;
1263         struct scrub_recover *recover;
1264         struct btrfs_io_context *bioc;
1265         u64 sublen;
1266         u64 mapped_length;
1267         u64 stripe_offset;
1268         int stripe_index;
1269         int page_index = 0;
1270         int mirror_index;
1271         int nmirrors;
1272         int ret;
1273
1274         /*
1275          * note: the two members refs and outstanding_pages
1276          * are not used (and not set) in the blocks that are used for
1277          * the recheck procedure
1278          */
1279
1280         while (length > 0) {
1281                 sublen = min_t(u64, length, fs_info->sectorsize);
1282                 mapped_length = sublen;
1283                 bioc = NULL;
1284
1285                 /*
1286                  * With a length of sectorsize, each returned stripe represents
1287                  * one mirror
1288                  */
1289                 btrfs_bio_counter_inc_blocked(fs_info);
1290                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1291                                        logical, &mapped_length, &bioc);
1292                 if (ret || !bioc || mapped_length < sublen) {
1293                         btrfs_put_bioc(bioc);
1294                         btrfs_bio_counter_dec(fs_info);
1295                         return -EIO;
1296                 }
1297
1298                 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1299                 if (!recover) {
1300                         btrfs_put_bioc(bioc);
1301                         btrfs_bio_counter_dec(fs_info);
1302                         return -ENOMEM;
1303                 }
1304
1305                 refcount_set(&recover->refs, 1);
1306                 recover->bioc = bioc;
1307                 recover->map_length = mapped_length;
1308
1309                 ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK);
1310
1311                 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1312
1313                 for (mirror_index = 0; mirror_index < nmirrors;
1314                      mirror_index++) {
1315                         struct scrub_block *sblock;
1316                         struct scrub_page *spage;
1317
1318                         sblock = sblocks_for_recheck + mirror_index;
1319                         sblock->sctx = sctx;
1320
1321                         spage = kzalloc(sizeof(*spage), GFP_NOFS);
1322                         if (!spage) {
1323 leave_nomem:
1324                                 spin_lock(&sctx->stat_lock);
1325                                 sctx->stat.malloc_errors++;
1326                                 spin_unlock(&sctx->stat_lock);
1327                                 scrub_put_recover(fs_info, recover);
1328                                 return -ENOMEM;
1329                         }
1330                         scrub_page_get(spage);
1331                         sblock->pagev[page_index] = spage;
1332                         spage->sblock = sblock;
1333                         spage->flags = flags;
1334                         spage->generation = generation;
1335                         spage->logical = logical;
1336                         spage->have_csum = have_csum;
1337                         if (have_csum)
1338                                 memcpy(spage->csum,
1339                                        original_sblock->pagev[0]->csum,
1340                                        sctx->fs_info->csum_size);
1341
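                             /*
                              * Translate (logical, mirror_index) into a stripe
                              * index and offset within the bioc, so the physical
                              * address of this mirror's copy can be derived below.
                              */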
1342                         scrub_stripe_index_and_offset(logical,
1343                                                       bioc->map_type,
1344                                                       bioc->raid_map,
1345                                                       mapped_length,
1346                                                       bioc->num_stripes -
1347                                                       bioc->num_tgtdevs,
1348                                                       mirror_index,
1349                                                       &stripe_index,
1350                                                       &stripe_offset);
1351                         spage->physical = bioc->stripes[stripe_index].physical +
1352                                          stripe_offset;
1353                         spage->dev = bioc->stripes[stripe_index].dev;
1354
1355                         BUG_ON(page_index >= original_sblock->page_count);
1356                         spage->physical_for_dev_replace =
1357                                 original_sblock->pagev[page_index]->
1358                                 physical_for_dev_replace;
1359                         /* for missing devices, dev->bdev is NULL */
1360                         spage->mirror_num = mirror_index + 1;
1361                         sblock->page_count++;
1362                         spage->page = alloc_page(GFP_NOFS);
1363                         if (!spage->page)
1364                                 goto leave_nomem;
1365
1366                         scrub_get_recover(recover);
1367                         spage->recover = recover;
1368                 }
1369                 scrub_put_recover(fs_info, recover);
1370                 length -= sublen;
1371                 logical += sublen;
1372                 page_index++;
1373         }
1374
1375         return 0;
1376 }
1377
1378 static void scrub_bio_wait_endio(struct bio *bio)
1379 {
1380         complete(bio->bi_private);
1381 }
1382
1383 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1384                                         struct bio *bio,
1385                                         struct scrub_page *spage)
1386 {
1387         DECLARE_COMPLETION_ONSTACK(done);
1388         int ret;
1389         int mirror_num;
1390
1391         bio->bi_iter.bi_sector = spage->logical >> 9;
1392         bio->bi_private = &done;
1393         bio->bi_end_io = scrub_bio_wait_endio;
1394
1395         mirror_num = spage->sblock->pagev[0]->mirror_num;
1396         ret = raid56_parity_recover(bio, spage->recover->bioc,
1397                                     spage->recover->map_length,
1398                                     mirror_num, 0);
1399         if (ret)
1400                 return ret;
1401
1402         wait_for_completion_io(&done);
1403         return blk_status_to_errno(bio->bi_status);
1404 }
1405
1406 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1407                                           struct scrub_block *sblock)
1408 {
1409         struct scrub_page *first_page = sblock->pagev[0];
1410         struct bio *bio;
1411         int page_num;
1412
1413         /* All pages in sblock belong to the same stripe on the same device. */
1414         ASSERT(first_page->dev);
1415         if (!first_page->dev->bdev)
1416                 goto out;
1417
1418         bio = btrfs_bio_alloc(BIO_MAX_VECS);
1419         bio_set_dev(bio, first_page->dev->bdev);
1420
1421         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1422                 struct scrub_page *spage = sblock->pagev[page_num];
1423
1424                 WARN_ON(!spage->page);
1425                 bio_add_page(bio, spage->page, PAGE_SIZE, 0);
1426         }
1427
1428         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1429                 bio_put(bio);
1430                 goto out;
1431         }
1432
1433         bio_put(bio);
1434
1435         scrub_recheck_block_checksum(sblock);
1436
1437         return;
1438 out:
1439         for (page_num = 0; page_num < sblock->page_count; page_num++)
1440                 sblock->pagev[page_num]->io_error = 1;
1441
1442         sblock->no_io_error_seen = 0;
1443 }
1444
1445 /*
1446  * This function checks the on-disk data for checksum errors, header
1447  * errors and read I/O errors. If any I/O error happens, the exact pages
1448  * that failed are marked as bad. The goal is to enable scrub to take
1449  * the non-failed pages from all the mirrors so that the failed pages
1450  * in the just-handled mirror can be repaired.
1451  */
1452 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1453                                 struct scrub_block *sblock,
1454                                 int retry_failed_mirror)
1455 {
1456         int page_num;
1457
1458         sblock->no_io_error_seen = 1;
1459
1460         /* Shortcut for RAID56 */
1461         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1462                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1463
1464         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1465                 struct bio *bio;
1466                 struct scrub_page *spage = sblock->pagev[page_num];
1467
1468                 if (spage->dev->bdev == NULL) {
1469                         spage->io_error = 1;
1470                         sblock->no_io_error_seen = 0;
1471                         continue;
1472                 }
1473
1474                 WARN_ON(!spage->page);
1475                 bio = btrfs_bio_alloc(1);
1476                 bio_set_dev(bio, spage->dev->bdev);
1477
1478                 bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
1479                 bio->bi_iter.bi_sector = spage->physical >> 9;
1480                 bio->bi_opf = REQ_OP_READ;
1481
1482                 if (btrfsic_submit_bio_wait(bio)) {
1483                         spage->io_error = 1;
1484                         sblock->no_io_error_seen = 0;
1485                 }
1486
1487                 bio_put(bio);
1488         }
1489
1490         if (sblock->no_io_error_seen)
1491                 scrub_recheck_block_checksum(sblock);
1492 }
1493
1494 static inline int scrub_check_fsid(u8 fsid[],
1495                                    struct scrub_page *spage)
1496 {
1497         struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1498         int ret;
1499
1500         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1501         return !ret;
1502 }
1503
1504 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1505 {
1506         sblock->header_error = 0;
1507         sblock->checksum_error = 0;
1508         sblock->generation_error = 0;
1509
1510         if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1511                 scrub_checksum_data(sblock);
1512         else
1513                 scrub_checksum_tree_block(sblock);
1514 }
1515
1516 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1517                                              struct scrub_block *sblock_good)
1518 {
1519         int page_num;
1520         int ret = 0;
1521
1522         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1523                 int ret_sub;
1524
1525                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1526                                                            sblock_good,
1527                                                            page_num, 1);
1528                 if (ret_sub)
1529                         ret = ret_sub;
1530         }
1531
1532         return ret;
1533 }
1534
1535 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1536                                             struct scrub_block *sblock_good,
1537                                             int page_num, int force_write)
1538 {
1539         struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
1540         struct scrub_page *spage_good = sblock_good->pagev[page_num];
1541         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1542         const u32 sectorsize = fs_info->sectorsize;
1543
1544         BUG_ON(spage_bad->page == NULL);
1545         BUG_ON(spage_good->page == NULL);
1546         if (force_write || sblock_bad->header_error ||
1547             sblock_bad->checksum_error || spage_bad->io_error) {
1548                 struct bio *bio;
1549                 int ret;
1550
1551                 if (!spage_bad->dev->bdev) {
1552                         btrfs_warn_rl(fs_info,
1553                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1554                         return -EIO;
1555                 }
1556
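                     /*
                      * Rewrite the bad sector in place on its original device,
                      * using the page content of the good mirror.
                      */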
1557                 bio = btrfs_bio_alloc(1);
1558                 bio_set_dev(bio, spage_bad->dev->bdev);
1559                 bio->bi_iter.bi_sector = spage_bad->physical >> 9;
1560                 bio->bi_opf = REQ_OP_WRITE;
1561
1562                 ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
1563                 if (ret != sectorsize) {
1564                         bio_put(bio);
1565                         return -EIO;
1566                 }
1567
1568                 if (btrfsic_submit_bio_wait(bio)) {
1569                         btrfs_dev_stat_inc_and_print(spage_bad->dev,
1570                                 BTRFS_DEV_STAT_WRITE_ERRS);
1571                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1572                         bio_put(bio);
1573                         return -EIO;
1574                 }
1575                 bio_put(bio);
1576         }
1577
1578         return 0;
1579 }
1580
1581 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1582 {
1583         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1584         int page_num;
1585
1586         /*
1587          * This block is used for the parity check on the source device,
1588          * so the data need not be written to the destination device.
1589          */
1590         if (sblock->sparity)
1591                 return;
1592
1593         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1594                 int ret;
1595
1596                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1597                 if (ret)
1598                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1599         }
1600 }
1601
1602 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1603                                            int page_num)
1604 {
1605         struct scrub_page *spage = sblock->pagev[page_num];
1606
1607         BUG_ON(spage->page == NULL);
1608         if (spage->io_error)
1609                 clear_page(page_address(spage->page));
1610
1611         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1612 }
1613
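     /*
      * On zoned devices writes must be sequential within a zone. If the write
      * pointer of the dev-replace target lags behind the physical position we
      * are about to write to, zero out the gap first so that the zone's write
      * pointer catches up to @physical.
      */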
1614 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1615 {
1616         int ret = 0;
1617         u64 length;
1618
1619         if (!btrfs_is_zoned(sctx->fs_info))
1620                 return 0;
1621
1622         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1623                 return 0;
1624
1625         if (sctx->write_pointer < physical) {
1626                 length = physical - sctx->write_pointer;
1627
1628                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1629                                                 sctx->write_pointer, length);
1630                 if (!ret)
1631                         sctx->write_pointer = physical;
1632         }
1633         return ret;
1634 }
1635
1636 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1637                                     struct scrub_page *spage)
1638 {
1639         struct scrub_bio *sbio;
1640         int ret;
1641         const u32 sectorsize = sctx->fs_info->sectorsize;
1642
1643         mutex_lock(&sctx->wr_lock);
1644 again:
1645         if (!sctx->wr_curr_bio) {
1646                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1647                                               GFP_KERNEL);
1648                 if (!sctx->wr_curr_bio) {
1649                         mutex_unlock(&sctx->wr_lock);
1650                         return -ENOMEM;
1651                 }
1652                 sctx->wr_curr_bio->sctx = sctx;
1653                 sctx->wr_curr_bio->page_count = 0;
1654         }
1655         sbio = sctx->wr_curr_bio;
1656         if (sbio->page_count == 0) {
1657                 struct bio *bio;
1658
1659                 ret = fill_writer_pointer_gap(sctx,
1660                                               spage->physical_for_dev_replace);
1661                 if (ret) {
1662                         mutex_unlock(&sctx->wr_lock);
1663                         return ret;
1664                 }
1665
1666                 sbio->physical = spage->physical_for_dev_replace;
1667                 sbio->logical = spage->logical;
1668                 sbio->dev = sctx->wr_tgtdev;
1669                 bio = sbio->bio;
1670                 if (!bio) {
1671                         bio = btrfs_bio_alloc(sctx->pages_per_bio);
1672                         sbio->bio = bio;
1673                 }
1674
1675                 bio->bi_private = sbio;
1676                 bio->bi_end_io = scrub_wr_bio_end_io;
1677                 bio_set_dev(bio, sbio->dev->bdev);
1678                 bio->bi_iter.bi_sector = sbio->physical >> 9;
1679                 bio->bi_opf = REQ_OP_WRITE;
1680                 sbio->status = 0;
1681         } else if (sbio->physical + sbio->page_count * sectorsize !=
1682                    spage->physical_for_dev_replace ||
1683                    sbio->logical + sbio->page_count * sectorsize !=
1684                    spage->logical) {
1685                 scrub_wr_submit(sctx);
1686                 goto again;
1687         }
1688
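             /*
              * Try to append the sector to the current write bio. If it does
              * not fit, submit the bio and retry with a fresh one; failing on
              * an empty bio is a hard error.
              */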
1689         ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
1690         if (ret != sectorsize) {
1691                 if (sbio->page_count < 1) {
1692                         bio_put(sbio->bio);
1693                         sbio->bio = NULL;
1694                         mutex_unlock(&sctx->wr_lock);
1695                         return -EIO;
1696                 }
1697                 scrub_wr_submit(sctx);
1698                 goto again;
1699         }
1700
1701         sbio->pagev[sbio->page_count] = spage;
1702         scrub_page_get(spage);
1703         sbio->page_count++;
1704         if (sbio->page_count == sctx->pages_per_bio)
1705                 scrub_wr_submit(sctx);
1706         mutex_unlock(&sctx->wr_lock);
1707
1708         return 0;
1709 }
1710
1711 static void scrub_wr_submit(struct scrub_ctx *sctx)
1712 {
1713         struct scrub_bio *sbio;
1714
1715         if (!sctx->wr_curr_bio)
1716                 return;
1717
1718         sbio = sctx->wr_curr_bio;
1719         sctx->wr_curr_bio = NULL;
1720         WARN_ON(!sbio->bio->bi_bdev);
1721         scrub_pending_bio_inc(sctx);
1722         /* Process all writes in a single worker thread, so that the block
1723          * layer can order the requests before sending them to the driver;
1724          * this doubled the write performance on spinning disks when measured
1725          * with Linux 3.5. */
1726         btrfsic_submit_bio(sbio->bio);
1727
1728         if (btrfs_is_zoned(sctx->fs_info))
1729                 sctx->write_pointer = sbio->physical + sbio->page_count *
1730                         sctx->fs_info->sectorsize;
1731 }
1732
1733 static void scrub_wr_bio_end_io(struct bio *bio)
1734 {
1735         struct scrub_bio *sbio = bio->bi_private;
1736         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1737
1738         sbio->status = bio->bi_status;
1739         sbio->bio = bio;
1740
1741         btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1742         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1743 }
1744
1745 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1746 {
1747         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1748         struct scrub_ctx *sctx = sbio->sctx;
1749         int i;
1750
1751         ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
1752         if (sbio->status) {
1753                 struct btrfs_dev_replace *dev_replace =
1754                         &sbio->sctx->fs_info->dev_replace;
1755
1756                 for (i = 0; i < sbio->page_count; i++) {
1757                         struct scrub_page *spage = sbio->pagev[i];
1758
1759                         spage->io_error = 1;
1760                         atomic64_inc(&dev_replace->num_write_errors);
1761                 }
1762         }
1763
1764         for (i = 0; i < sbio->page_count; i++)
1765                 scrub_page_put(sbio->pagev[i]);
1766
1767         bio_put(sbio->bio);
1768         kfree(sbio);
1769         scrub_pending_bio_dec(sctx);
1770 }
1771
1772 static int scrub_checksum(struct scrub_block *sblock)
1773 {
1774         u64 flags;
1775         int ret;
1776
1777         /*
1778          * No need to initialize these stats currently,
1779          * because this function only uses the return value
1780          * instead of these stat values.
1781          *
1782          * Todo:
1783          * always use the stats
1784          */
1785         sblock->header_error = 0;
1786         sblock->generation_error = 0;
1787         sblock->checksum_error = 0;
1788
1789         WARN_ON(sblock->page_count < 1);
1790         flags = sblock->pagev[0]->flags;
1791         ret = 0;
1792         if (flags & BTRFS_EXTENT_FLAG_DATA)
1793                 ret = scrub_checksum_data(sblock);
1794         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1795                 ret = scrub_checksum_tree_block(sblock);
1796         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1797                 (void)scrub_checksum_super(sblock);
1798         else
1799                 WARN_ON(1);
1800         if (ret)
1801                 scrub_handle_errored_block(sblock);
1802
1803         return ret;
1804 }
1805
1806 static int scrub_checksum_data(struct scrub_block *sblock)
1807 {
1808         struct scrub_ctx *sctx = sblock->sctx;
1809         struct btrfs_fs_info *fs_info = sctx->fs_info;
1810         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1811         u8 csum[BTRFS_CSUM_SIZE];
1812         struct scrub_page *spage;
1813         char *kaddr;
1814
1815         BUG_ON(sblock->page_count < 1);
1816         spage = sblock->pagev[0];
1817         if (!spage->have_csum)
1818                 return 0;
1819
1820         kaddr = page_address(spage->page);
1821
1822         shash->tfm = fs_info->csum_shash;
1823         crypto_shash_init(shash);
1824
1825         /*
1826          * In scrub_pages() and scrub_pages_for_parity() we ensure each spage
1827          * only contains one sector of data.
1828          */
1829         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1830
1831         if (memcmp(csum, spage->csum, fs_info->csum_size))
1832                 sblock->checksum_error = 1;
1833         return sblock->checksum_error;
1834 }
1835
1836 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1837 {
1838         struct scrub_ctx *sctx = sblock->sctx;
1839         struct btrfs_header *h;
1840         struct btrfs_fs_info *fs_info = sctx->fs_info;
1841         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1842         u8 calculated_csum[BTRFS_CSUM_SIZE];
1843         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1844         /*
1845          * This is done in sectorsize steps even for metadata as there's a
1846          * constraint for nodesize to be aligned to sectorsize. This will need
1847          * to change so we don't misuse data and metadata units like that.
1848          */
1849         const u32 sectorsize = sctx->fs_info->sectorsize;
1850         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1851         int i;
1852         struct scrub_page *spage;
1853         char *kaddr;
1854
1855         BUG_ON(sblock->page_count < 1);
1856
1857         /* Each member in pagev is just one block, not a full page */
1858         ASSERT(sblock->page_count == num_sectors);
1859
1860         spage = sblock->pagev[0];
1861         kaddr = page_address(spage->page);
1862         h = (struct btrfs_header *)kaddr;
1863         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1864
1865         /*
1866          * we don't use the getter functions here, as we
1867          * a) don't have an extent buffer and
1868          * b) the page is already kmapped
1869          */
1870         if (spage->logical != btrfs_stack_header_bytenr(h))
1871                 sblock->header_error = 1;
1872
1873         if (spage->generation != btrfs_stack_header_generation(h)) {
1874                 sblock->header_error = 1;
1875                 sblock->generation_error = 1;
1876         }
1877
1878         if (!scrub_check_fsid(h->fsid, spage))
1879                 sblock->header_error = 1;
1880
1881         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1882                    BTRFS_UUID_SIZE))
1883                 sblock->header_error = 1;
1884
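             /*
              * The metadata checksum covers everything after the csum field:
              * the rest of the first sector plus all remaining sectors of the
              * node, hashed in sectorsize steps.
              */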
1885         shash->tfm = fs_info->csum_shash;
1886         crypto_shash_init(shash);
1887         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1888                             sectorsize - BTRFS_CSUM_SIZE);
1889
1890         for (i = 1; i < num_sectors; i++) {
1891                 kaddr = page_address(sblock->pagev[i]->page);
1892                 crypto_shash_update(shash, kaddr, sectorsize);
1893         }
1894
1895         crypto_shash_final(shash, calculated_csum);
1896         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1897                 sblock->checksum_error = 1;
1898
1899         return sblock->header_error || sblock->checksum_error;
1900 }
1901
1902 static int scrub_checksum_super(struct scrub_block *sblock)
1903 {
1904         struct btrfs_super_block *s;
1905         struct scrub_ctx *sctx = sblock->sctx;
1906         struct btrfs_fs_info *fs_info = sctx->fs_info;
1907         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1908         u8 calculated_csum[BTRFS_CSUM_SIZE];
1909         struct scrub_page *spage;
1910         char *kaddr;
1911         int fail_gen = 0;
1912         int fail_cor = 0;
1913
1914         BUG_ON(sblock->page_count < 1);
1915         spage = sblock->pagev[0];
1916         kaddr = page_address(spage->page);
1917         s = (struct btrfs_super_block *)kaddr;
1918
1919         if (spage->logical != btrfs_super_bytenr(s))
1920                 ++fail_cor;
1921
1922         if (spage->generation != btrfs_super_generation(s))
1923                 ++fail_gen;
1924
1925         if (!scrub_check_fsid(s->fsid, spage))
1926                 ++fail_cor;
1927
1928         shash->tfm = fs_info->csum_shash;
1929         crypto_shash_init(shash);
1930         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1931                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1932
1933         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1934                 ++fail_cor;
1935
1936         if (fail_cor + fail_gen) {
1937                 /*
1938                  * If we find an error in a super block, we just report it;
1939                  * the super blocks get rewritten with the next transaction
1940                  * commit anyway.
1941                  */
1942                 spin_lock(&sctx->stat_lock);
1943                 ++sctx->stat.super_errors;
1944                 spin_unlock(&sctx->stat_lock);
1945                 if (fail_cor)
1946                         btrfs_dev_stat_inc_and_print(spage->dev,
1947                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1948                 else
1949                         btrfs_dev_stat_inc_and_print(spage->dev,
1950                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1951         }
1952
1953         return fail_cor + fail_gen;
1954 }
1955
1956 static void scrub_block_get(struct scrub_block *sblock)
1957 {
1958         refcount_inc(&sblock->refs);
1959 }
1960
1961 static void scrub_block_put(struct scrub_block *sblock)
1962 {
1963         if (refcount_dec_and_test(&sblock->refs)) {
1964                 int i;
1965
1966                 if (sblock->sparity)
1967                         scrub_parity_put(sblock->sparity);
1968
1969                 for (i = 0; i < sblock->page_count; i++)
1970                         scrub_page_put(sblock->pagev[i]);
1971                 kfree(sblock);
1972         }
1973 }
1974
1975 static void scrub_page_get(struct scrub_page *spage)
1976 {
1977         atomic_inc(&spage->refs);
1978 }
1979
1980 static void scrub_page_put(struct scrub_page *spage)
1981 {
1982         if (atomic_dec_and_test(&spage->refs)) {
1983                 if (spage->page)
1984                         __free_page(spage->page);
1985                 kfree(spage);
1986         }
1987 }
1988
1989 /*
1990  * Throttling of IO submission, bandwidth-limit based; the timeslice is 1
1991  * second.  The limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
1992  */
1993 static void scrub_throttle(struct scrub_ctx *sctx)
1994 {
1995         const int time_slice = 1000;
1996         struct scrub_bio *sbio;
1997         struct btrfs_device *device;
1998         s64 delta;
1999         ktime_t now;
2000         u32 div;
2001         u64 bwlimit;
2002
2003         sbio = sctx->bios[sctx->curr];
2004         device = sbio->dev;
2005         bwlimit = READ_ONCE(device->scrub_speed_max);
2006         if (bwlimit == 0)
2007                 return;
2008
2009         /*
2010          * The slice is divided into intervals when the IO is submitted; the
2011          * interval count is derived from bwlimit, with a maximum of 64 intervals.
2012          */
2013         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
2014         div = min_t(u32, 64, div);
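             /*
              * For example, scrub_speed_max = 128 MiB/s gives
              * div = min(64, 128/16) = 8, so each 1000/8 = 125 ms interval may
              * submit up to 128/8 = 16 MiB before the submitter sleeps for the
              * rest of the interval.
              */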
2015
2016         /* Start new epoch, set deadline */
2017         now = ktime_get();
2018         if (sctx->throttle_deadline == 0) {
2019                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
2020                 sctx->throttle_sent = 0;
2021         }
2022
2023         /* Still within the time slice? */
2024         if (ktime_before(now, sctx->throttle_deadline)) {
2025                 /* If current bio is within the limit, send it */
2026                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2027                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2028                         return;
2029
2030                 /* We're over the limit, sleep for the rest of the slice */
2031                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2032         } else {
2033                 /* New request after deadline, start new epoch */
2034                 delta = 0;
2035         }
2036
2037         if (delta) {
2038                 long timeout;
2039
2040                 timeout = div_u64(delta * HZ, 1000);
2041                 schedule_timeout_interruptible(timeout);
2042         }
2043
2044         /* Next call will start the deadline period */
2045         sctx->throttle_deadline = 0;
2046 }
2047
2048 static void scrub_submit(struct scrub_ctx *sctx)
2049 {
2050         struct scrub_bio *sbio;
2051
2052         if (sctx->curr == -1)
2053                 return;
2054
2055         scrub_throttle(sctx);
2056
2057         sbio = sctx->bios[sctx->curr];
2058         sctx->curr = -1;
2059         scrub_pending_bio_inc(sctx);
2060         btrfsic_submit_bio(sbio->bio);
2061 }
2062
2063 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2064                                     struct scrub_page *spage)
2065 {
2066         struct scrub_block *sblock = spage->sblock;
2067         struct scrub_bio *sbio;
2068         const u32 sectorsize = sctx->fs_info->sectorsize;
2069         int ret;
2070
2071 again:
2072         /*
2073          * grab a fresh bio or wait for one to become available
2074          */
2075         while (sctx->curr == -1) {
2076                 spin_lock(&sctx->list_lock);
2077                 sctx->curr = sctx->first_free;
2078                 if (sctx->curr != -1) {
2079                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2080                         sctx->bios[sctx->curr]->next_free = -1;
2081                         sctx->bios[sctx->curr]->page_count = 0;
2082                         spin_unlock(&sctx->list_lock);
2083                 } else {
2084                         spin_unlock(&sctx->list_lock);
2085                         wait_event(sctx->list_wait, sctx->first_free != -1);
2086                 }
2087         }
2088         sbio = sctx->bios[sctx->curr];
2089         if (sbio->page_count == 0) {
2090                 struct bio *bio;
2091
2092                 sbio->physical = spage->physical;
2093                 sbio->logical = spage->logical;
2094                 sbio->dev = spage->dev;
2095                 bio = sbio->bio;
2096                 if (!bio) {
2097                         bio = btrfs_bio_alloc(sctx->pages_per_bio);
2098                         sbio->bio = bio;
2099                 }
2100
2101                 bio->bi_private = sbio;
2102                 bio->bi_end_io = scrub_bio_end_io;
2103                 bio_set_dev(bio, sbio->dev->bdev);
2104                 bio->bi_iter.bi_sector = sbio->physical >> 9;
2105                 bio->bi_opf = REQ_OP_READ;
2106                 sbio->status = 0;
2107         } else if (sbio->physical + sbio->page_count * sectorsize !=
2108                    spage->physical ||
2109                    sbio->logical + sbio->page_count * sectorsize !=
2110                    spage->logical ||
2111                    sbio->dev != spage->dev) {
2112                 scrub_submit(sctx);
2113                 goto again;
2114         }
2115
2116         sbio->pagev[sbio->page_count] = spage;
2117         ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
2118         if (ret != sectorsize) {
2119                 if (sbio->page_count < 1) {
2120                         bio_put(sbio->bio);
2121                         sbio->bio = NULL;
2122                         return -EIO;
2123                 }
2124                 scrub_submit(sctx);
2125                 goto again;
2126         }
2127
2128         scrub_block_get(sblock); /* one for the page added to the bio */
2129         atomic_inc(&sblock->outstanding_pages);
2130         sbio->page_count++;
2131         if (sbio->page_count == sctx->pages_per_bio)
2132                 scrub_submit(sctx);
2133
2134         return 0;
2135 }
2136
2137 static void scrub_missing_raid56_end_io(struct bio *bio)
2138 {
2139         struct scrub_block *sblock = bio->bi_private;
2140         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2141
2142         if (bio->bi_status)
2143                 sblock->no_io_error_seen = 0;
2144
2145         bio_put(bio);
2146
2147         btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2148 }
2149
2150 static void scrub_missing_raid56_worker(struct btrfs_work *work)
2151 {
2152         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2153         struct scrub_ctx *sctx = sblock->sctx;
2154         struct btrfs_fs_info *fs_info = sctx->fs_info;
2155         u64 logical;
2156         struct btrfs_device *dev;
2157
2158         logical = sblock->pagev[0]->logical;
2159         dev = sblock->pagev[0]->dev;
2160
2161         if (sblock->no_io_error_seen)
2162                 scrub_recheck_block_checksum(sblock);
2163
2164         if (!sblock->no_io_error_seen) {
2165                 spin_lock(&sctx->stat_lock);
2166                 sctx->stat.read_errors++;
2167                 spin_unlock(&sctx->stat_lock);
2168                 btrfs_err_rl_in_rcu(fs_info,
2169                         "IO error rebuilding logical %llu for dev %s",
2170                         logical, rcu_str_deref(dev->name));
2171         } else if (sblock->header_error || sblock->checksum_error) {
2172                 spin_lock(&sctx->stat_lock);
2173                 sctx->stat.uncorrectable_errors++;
2174                 spin_unlock(&sctx->stat_lock);
2175                 btrfs_err_rl_in_rcu(fs_info,
2176                         "failed to rebuild valid logical %llu for dev %s",
2177                         logical, rcu_str_deref(dev->name));
2178         } else {
2179                 scrub_write_block_to_dev_replace(sblock);
2180         }
2181
2182         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2183                 mutex_lock(&sctx->wr_lock);
2184                 scrub_wr_submit(sctx);
2185                 mutex_unlock(&sctx->wr_lock);
2186         }
2187
2188         scrub_block_put(sblock);
2189         scrub_pending_bio_dec(sctx);
2190 }
2191
2192 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2193 {
2194         struct scrub_ctx *sctx = sblock->sctx;
2195         struct btrfs_fs_info *fs_info = sctx->fs_info;
2196         u64 length = sblock->page_count * PAGE_SIZE;
2197         u64 logical = sblock->pagev[0]->logical;
2198         struct btrfs_io_context *bioc = NULL;
2199         struct bio *bio;
2200         struct btrfs_raid_bio *rbio;
2201         int ret;
2202         int i;
2203
2204         btrfs_bio_counter_inc_blocked(fs_info);
2205         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2206                                &length, &bioc);
2207         if (ret || !bioc || !bioc->raid_map)
2208                 goto bioc_out;
2209
2210         if (WARN_ON(!sctx->is_dev_replace ||
2211                     !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2212                 /*
2213                  * We shouldn't be scrubbing a missing device. Even for dev
2214                  * replace, we should only get here for RAID 5/6. We either
2215                  * managed to mount something with no mirrors remaining or
2216                  * there's a bug in scrub_remap_extent()/btrfs_map_block().
2217                  */
2218                 goto bioc_out;
2219         }
2220
2221         bio = btrfs_bio_alloc(BIO_MAX_VECS);
2222         bio->bi_iter.bi_sector = logical >> 9;
2223         bio->bi_private = sblock;
2224         bio->bi_end_io = scrub_missing_raid56_end_io;
2225
2226         rbio = raid56_alloc_missing_rbio(bio, bioc, length);
2227         if (!rbio)
2228                 goto rbio_out;
2229
2230         for (i = 0; i < sblock->page_count; i++) {
2231                 struct scrub_page *spage = sblock->pagev[i];
2232
2233                 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2234         }
2235
2236         btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2237         scrub_block_get(sblock);
2238         scrub_pending_bio_inc(sctx);
2239         raid56_submit_missing_rbio(rbio);
2240         return;
2241
2242 rbio_out:
2243         bio_put(bio);
2244 bioc_out:
2245         btrfs_bio_counter_dec(fs_info);
2246         btrfs_put_bioc(bioc);
2247         spin_lock(&sctx->stat_lock);
2248         sctx->stat.malloc_errors++;
2249         spin_unlock(&sctx->stat_lock);
2250 }
2251
2252 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
2253                        u64 physical, struct btrfs_device *dev, u64 flags,
2254                        u64 gen, int mirror_num, u8 *csum,
2255                        u64 physical_for_dev_replace)
2256 {
2257         struct scrub_block *sblock;
2258         const u32 sectorsize = sctx->fs_info->sectorsize;
2259         int index;
2260
2261         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2262         if (!sblock) {
2263                 spin_lock(&sctx->stat_lock);
2264                 sctx->stat.malloc_errors++;
2265                 spin_unlock(&sctx->stat_lock);
2266                 return -ENOMEM;
2267         }
2268
2269         /* one ref inside this function, plus one for each page added to
2270          * a bio later on */
2271         refcount_set(&sblock->refs, 1);
2272         sblock->sctx = sctx;
2273         sblock->no_io_error_seen = 1;
2274
2275         for (index = 0; len > 0; index++) {
2276                 struct scrub_page *spage;
2277                 /*
2278                  * Here we will allocate one page for one sector to scrub.
2279                  * This is fine if PAGE_SIZE == sectorsize, but costs
2280                  * more memory for the PAGE_SIZE > sectorsize case.
2281                  */
2282                 u32 l = min(sectorsize, len);
2283
2284                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2285                 if (!spage) {
2286 leave_nomem:
2287                         spin_lock(&sctx->stat_lock);
2288                         sctx->stat.malloc_errors++;
2289                         spin_unlock(&sctx->stat_lock);
2290                         scrub_block_put(sblock);
2291                         return -ENOMEM;
2292                 }
2293                 ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
2294                 scrub_page_get(spage);
2295                 sblock->pagev[index] = spage;
2296                 spage->sblock = sblock;
2297                 spage->dev = dev;
2298                 spage->flags = flags;
2299                 spage->generation = gen;
2300                 spage->logical = logical;
2301                 spage->physical = physical;
2302                 spage->physical_for_dev_replace = physical_for_dev_replace;
2303                 spage->mirror_num = mirror_num;
2304                 if (csum) {
2305                         spage->have_csum = 1;
2306                         memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2307                 } else {
2308                         spage->have_csum = 0;
2309                 }
2310                 sblock->page_count++;
2311                 spage->page = alloc_page(GFP_KERNEL);
2312                 if (!spage->page)
2313                         goto leave_nomem;
2314                 len -= l;
2315                 logical += l;
2316                 physical += l;
2317                 physical_for_dev_replace += l;
2318         }
2319
2320         WARN_ON(sblock->page_count == 0);
2321         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2322                 /*
2323                  * This case should only be hit for RAID 5/6 device replace. See
2324                  * the comment in scrub_missing_raid56_pages() for details.
2325                  */
2326                 scrub_missing_raid56_pages(sblock);
2327         } else {
2328                 for (index = 0; index < sblock->page_count; index++) {
2329                         struct scrub_page *spage = sblock->pagev[index];
2330                         int ret;
2331
2332                         ret = scrub_add_page_to_rd_bio(sctx, spage);
2333                         if (ret) {
2334                                 scrub_block_put(sblock);
2335                                 return ret;
2336                         }
2337                 }
2338
2339                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2340                         scrub_submit(sctx);
2341         }
2342
2343         /* The last ref frees, either here or in the bio completion for the last page */
2344         scrub_block_put(sblock);
2345         return 0;
2346 }
2347
2348 static void scrub_bio_end_io(struct bio *bio)
2349 {
2350         struct scrub_bio *sbio = bio->bi_private;
2351         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2352
2353         sbio->status = bio->bi_status;
2354         sbio->bio = bio;
2355
2356         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2357 }
2358
2359 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2360 {
2361         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2362         struct scrub_ctx *sctx = sbio->sctx;
2363         int i;
2364
2365         ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO);
2366         if (sbio->status) {
2367                 for (i = 0; i < sbio->page_count; i++) {
2368                         struct scrub_page *spage = sbio->pagev[i];
2369
2370                         spage->io_error = 1;
2371                         spage->sblock->no_io_error_seen = 0;
2372                 }
2373         }
2374
2375         /* now complete the scrub_block items that have all pages completed */
2376         for (i = 0; i < sbio->page_count; i++) {
2377                 struct scrub_page *spage = sbio->pagev[i];
2378                 struct scrub_block *sblock = spage->sblock;
2379
2380                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2381                         scrub_block_complete(sblock);
2382                 scrub_block_put(sblock);
2383         }
2384
2385         bio_put(sbio->bio);
2386         sbio->bio = NULL;
2387         spin_lock(&sctx->list_lock);
2388         sbio->next_free = sctx->first_free;
2389         sctx->first_free = sbio->index;
2390         spin_unlock(&sctx->list_lock);
2391
2392         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2393                 mutex_lock(&sctx->wr_lock);
2394                 scrub_wr_submit(sctx);
2395                 mutex_unlock(&sctx->wr_lock);
2396         }
2397
2398         scrub_pending_bio_dec(sctx);
2399 }
2400
2401 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2402                                        unsigned long *bitmap,
2403                                        u64 start, u32 len)
2404 {
2405         u64 offset;
2406         u32 nsectors;
2407         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2408
2409         if (len >= sparity->stripe_len) {
2410                 bitmap_set(bitmap, 0, sparity->nsectors);
2411                 return;
2412         }
2413
2414         start -= sparity->logic_start;
2415         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2416         offset = offset >> sectorsize_bits;
2417         nsectors = len >> sectorsize_bits;
2418
2419         if (offset + nsectors <= sparity->nsectors) {
2420                 bitmap_set(bitmap, offset, nsectors);
2421                 return;
2422         }
2423
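             /*
              * The range wraps past the end of the stripe: set the bits up to
              * the stripe end, then wrap around and set the remainder from the
              * beginning.
              */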
2424         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2425         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2426 }
2427
2428 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2429                                                    u64 start, u32 len)
2430 {
2431         __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2432 }
2433
2434 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2435                                                   u64 start, u32 len)
2436 {
2437         __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2438 }
2439
2440 static void scrub_block_complete(struct scrub_block *sblock)
2441 {
2442         int corrupted = 0;
2443
2444         if (!sblock->no_io_error_seen) {
2445                 corrupted = 1;
2446                 scrub_handle_errored_block(sblock);
2447         } else {
2448                 /*
2449                  * In the dev-replace case: if the block has a checksum
2450                  * error, it is written to the target via the repair
2451                  * mechanism, otherwise it is written here.
2452                  */
2453                 corrupted = scrub_checksum(sblock);
2454                 if (!corrupted && sblock->sctx->is_dev_replace)
2455                         scrub_write_block_to_dev_replace(sblock);
2456         }
2457
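             /*
              * For a parity scrub, remember which sectors are still bad so
              * that the later RAID56 parity check treats them as errors.
              */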
2458         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2459                 u64 start = sblock->pagev[0]->logical;
2460                 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2461                           sblock->sctx->fs_info->sectorsize;
2462
2463                 ASSERT(end - start <= U32_MAX);
2464                 scrub_parity_mark_sectors_error(sblock->sparity,
2465                                                 start, end - start);
2466         }
2467 }
2468
2469 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2470 {
2471         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2472         list_del(&sum->list);
2473         kfree(sum);
2474 }
2475
2476 /*
2477  * Find the desired csum for range [logical, logical + sectorsize), and store
2478  * the csum into @csum.
2479  *
2480  * The search source is sctx->csum_list, which is a pre-populated list
2481  * storing bytenr-ordered csum ranges.  We're responsible for cleaning up
2482  * any range that is before @logical.
2483  *
2484  * Return 0 if there is no csum for the range.
2485  * Return 1 if there is a csum for the range and it has been copied to @csum.
2486  */
2487 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2488 {
2489         bool found = false;
2490
2491         while (!list_empty(&sctx->csum_list)) {
2492                 struct btrfs_ordered_sum *sum = NULL;
2493                 unsigned long index;
2494                 unsigned long num_sectors;
2495
2496                 sum = list_first_entry(&sctx->csum_list,
2497                                        struct btrfs_ordered_sum, list);
2498                 /* The current csum range is beyond our range, no csum found */
2499                 if (sum->bytenr > logical)
2500                         break;
2501
2502                 /*
2503                  * The current sum is before our bytenr. Since scrub is always
2504                  * done in bytenr order, the csum will never be used again;
2505                  * clean it up so that later calls won't bother with the range,
2506                  * and continue searching the next range.
2507                  */
2508                 if (sum->bytenr + sum->len <= logical) {
2509                         drop_csum_range(sctx, sum);
2510                         continue;
2511                 }
2512
2513                 /* Now the csum range covers our bytenr, copy the csum */
2514                 found = true;
2515                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2516                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
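                     /*
                      * For example, with sum->bytenr = 1 MiB, sum->len = 64 KiB
                      * and a 4 KiB sectorsize, a lookup at logical = 1 MiB + 8 KiB
                      * copies the csum at index 2 of the 16 stored checksums.
                      */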
2517
2518                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2519                        sctx->fs_info->csum_size);
2520
2521                 /* Cleanup the range if we're at the end of the csum range */
2522                 if (index == num_sectors - 1)
2523                         drop_csum_range(sctx, sum);
2524                 break;
2525         }
2526         if (!found)
2527                 return 0;
2528         return 1;
2529 }
2530
2531 /* scrub extent tries to collect up to 64 kB for each bio */
2532 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2533                         u64 logical, u32 len,
2534                         u64 physical, struct btrfs_device *dev, u64 flags,
2535                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2536 {
2537         int ret;
2538         u8 csum[BTRFS_CSUM_SIZE];
2539         u32 blocksize;
2540
2541         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2542                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2543                         blocksize = map->stripe_len;
2544                 else
2545                         blocksize = sctx->fs_info->sectorsize;
2546                 spin_lock(&sctx->stat_lock);
2547                 sctx->stat.data_extents_scrubbed++;
2548                 sctx->stat.data_bytes_scrubbed += len;
2549                 spin_unlock(&sctx->stat_lock);
2550         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2551                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2552                         blocksize = map->stripe_len;
2553                 else
2554                         blocksize = sctx->fs_info->nodesize;
2555                 spin_lock(&sctx->stat_lock);
2556                 sctx->stat.tree_extents_scrubbed++;
2557                 sctx->stat.tree_bytes_scrubbed += len;
2558                 spin_unlock(&sctx->stat_lock);
2559         } else {
2560                 blocksize = sctx->fs_info->sectorsize;
2561                 WARN_ON(1);
2562         }
2563
2564         while (len) {
2565                 u32 l = min(len, blocksize);
2566                 int have_csum = 0;
2567
2568                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2569                         /* push csums to sbio */
2570                         have_csum = scrub_find_csum(sctx, logical, csum);
2571                         if (have_csum == 0)
2572                                 ++sctx->stat.no_csum;
2573                 }
2574                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2575                                   mirror_num, have_csum ? csum : NULL,
2576                                   physical_for_dev_replace);
2577                 if (ret)
2578                         return ret;
2579                 len -= l;
2580                 logical += l;
2581                 physical += l;
2582                 physical_for_dev_replace += l;
2583         }
2584         return 0;
2585 }
2586
2587 static int scrub_pages_for_parity(struct scrub_parity *sparity,
2588                                   u64 logical, u32 len,
2589                                   u64 physical, struct btrfs_device *dev,
2590                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2591 {
2592         struct scrub_ctx *sctx = sparity->sctx;
2593         struct scrub_block *sblock;
2594         const u32 sectorsize = sctx->fs_info->sectorsize;
2595         int index;
2596
2597         ASSERT(IS_ALIGNED(len, sectorsize));
2598
2599         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2600         if (!sblock) {
2601                 spin_lock(&sctx->stat_lock);
2602                 sctx->stat.malloc_errors++;
2603                 spin_unlock(&sctx->stat_lock);
2604                 return -ENOMEM;
2605         }
2606
2607         /* one ref inside this function, plus one for each page added to
2608          * a bio later on */
2609         refcount_set(&sblock->refs, 1);
2610         sblock->sctx = sctx;
2611         sblock->no_io_error_seen = 1;
2612         sblock->sparity = sparity;
2613         scrub_parity_get(sparity);
2614
2615         for (index = 0; len > 0; index++) {
2616                 struct scrub_page *spage;
2617
2618                 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2619                 if (!spage) {
2620 leave_nomem:
2621                         spin_lock(&sctx->stat_lock);
2622                         sctx->stat.malloc_errors++;
2623                         spin_unlock(&sctx->stat_lock);
2624                         scrub_block_put(sblock);
2625                         return -ENOMEM;
2626                 }
2627                 ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK);
2628                 /* For scrub block */
2629                 scrub_page_get(spage);
2630                 sblock->pagev[index] = spage;
2631                 /* For scrub parity */
2632                 scrub_page_get(spage);
2633                 list_add_tail(&spage->list, &sparity->spages);
2634                 spage->sblock = sblock;
2635                 spage->dev = dev;
2636                 spage->flags = flags;
2637                 spage->generation = gen;
2638                 spage->logical = logical;
2639                 spage->physical = physical;
2640                 spage->mirror_num = mirror_num;
2641                 if (csum) {
2642                         spage->have_csum = 1;
2643                         memcpy(spage->csum, csum, sctx->fs_info->csum_size);
2644                 } else {
2645                         spage->have_csum = 0;
2646                 }
2647                 sblock->page_count++;
2648                 spage->page = alloc_page(GFP_KERNEL);
2649                 if (!spage->page)
2650                         goto leave_nomem;
2651
2652
2653                 /* Iterate over the stripe range in sectorsize steps */
2654                 len -= sectorsize;
2655                 logical += sectorsize;
2656                 physical += sectorsize;
2657         }
2658
2659         WARN_ON(sblock->page_count == 0);
2660         for (index = 0; index < sblock->page_count; index++) {
2661                 struct scrub_page *spage = sblock->pagev[index];
2662                 int ret;
2663
2664                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2665                 if (ret) {
2666                         scrub_block_put(sblock);
2667                         return ret;
2668                 }
2669         }
2670
2671         /* The last ref frees, either here or in the bio completion for the last page */
2672         scrub_block_put(sblock);
2673         return 0;
2674 }
2675
2676 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2677                                    u64 logical, u32 len,
2678                                    u64 physical, struct btrfs_device *dev,
2679                                    u64 flags, u64 gen, int mirror_num)
2680 {
2681         struct scrub_ctx *sctx = sparity->sctx;
2682         int ret;
2683         u8 csum[BTRFS_CSUM_SIZE];
2684         u32 blocksize;
2685
2686         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2687                 scrub_parity_mark_sectors_error(sparity, logical, len);
2688                 return 0;
2689         }
2690
2691         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2692                 blocksize = sparity->stripe_len;
2693         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2694                 blocksize = sparity->stripe_len;
2695         } else {
2696                 blocksize = sctx->fs_info->sectorsize;
2697                 WARN_ON(1);
2698         }
2699
2700         while (len) {
2701                 u32 l = min(len, blocksize);
2702                 int have_csum = 0;
2703
2704                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2705                         /* push csums to sbio */
2706                         have_csum = scrub_find_csum(sctx, logical, csum);
2707                         if (have_csum == 0)
2708                                 goto skip;
2709                 }
2710                 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2711                                              flags, gen, mirror_num,
2712                                              have_csum ? csum : NULL);
2713                 if (ret)
2714                         return ret;
2715 skip:
2716                 len -= l;
2717                 logical += l;
2718                 physical += l;
2719         }
2720         return 0;
2721 }
2722
2723 /*
2724  * Given a physical address, this will calculate its
2725  * logical offset. If this is a parity stripe, it will return
2726  * the leftmost data stripe's logical offset.
2727  *
2728  * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2729  */
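     /*
      * Illustrative example: RAID5 over 3 devices (2 data stripes), 64 KiB
      * stripe_len, num = 1 and physical equal to map->stripes[1].physical.
      * Then last_offset = 0 and the i = 1 candidate (logical 64 KiB) rotates
      * onto device 1, so the function returns 0 with *offset = 64 KiB.
      */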
2730 static int get_raid56_logic_offset(u64 physical, int num,
2731                                    struct map_lookup *map, u64 *offset,
2732                                    u64 *stripe_start)
2733 {
2734         int i;
2735         int j = 0;
2736         u64 stripe_nr;
2737         u64 last_offset;
2738         u32 stripe_index;
2739         u32 rot;
2740         const int data_stripes = nr_data_stripes(map);
2741
2742         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2743         if (stripe_start)
2744                 *stripe_start = last_offset;
2745
2746         *offset = last_offset;
2747         for (i = 0; i < data_stripes; i++) {
2748                 *offset = last_offset + i * map->stripe_len;
2749
2750                 stripe_nr = div64_u64(*offset, map->stripe_len);
2751                 stripe_nr = div_u64(stripe_nr, data_stripes);
2752
2753                 /* Work out the disk rotation on this stripe-set */
2754                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2755                 /* Calculate which device stripe this data is located on */
2756                 rot += i;
2757                 stripe_index = rot % map->num_stripes;
2758                 if (stripe_index == num)
2759                         return 0;
2760                 if (stripe_index < num)
2761                         j++;
2762         }
2763         *offset = last_offset + j * map->stripe_len;
2764         return 1;
2765 }
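/*
 * A worked example of the mapping above, with values chosen purely for
 * illustration (RAID5, map->num_stripes = 3, i.e. two data stripes plus
 * parity, and map->stripe_len = 64KiB):
 *
 * - num = 0, physical at the start of its device extent: last_offset = 0,
 *   and for i = 0 the candidate offset 0 lies in full stripe 0 with rot = 0,
 *   so stripe_index == num and we return 0 with *offset = 0 (a data stripe).
 *
 * - num = 2, physical 64KiB into its device extent: last_offset = 128KiB,
 *   i = 0 maps to device 1, i = 1 maps to device 2 == num, so we return 0
 *   with *offset = 192KiB (data stripe 1 of full stripe 1).
 *
 * - num = 2, physical at the start of its device extent: last_offset = 0,
 *   i = 0 maps to device 0 and i = 1 maps to device 1, neither matches, so
 *   this position holds the parity of full stripe 0 and we return 1.
 */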
2766
2767 static void scrub_free_parity(struct scrub_parity *sparity)
2768 {
2769         struct scrub_ctx *sctx = sparity->sctx;
2770         struct scrub_page *curr, *next;
2771         int nbits;
2772
2773         nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2774         if (nbits) {
2775                 spin_lock(&sctx->stat_lock);
2776                 sctx->stat.read_errors += nbits;
2777                 sctx->stat.uncorrectable_errors += nbits;
2778                 spin_unlock(&sctx->stat_lock);
2779         }
2780
2781         list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2782                 list_del_init(&curr->list);
2783                 scrub_page_put(curr);
2784         }
2785
2786         kfree(sparity);
2787 }
2788
2789 static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2790 {
2791         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2792                                                     work);
2793         struct scrub_ctx *sctx = sparity->sctx;
2794
2795         scrub_free_parity(sparity);
2796         scrub_pending_bio_dec(sctx);
2797 }
2798
2799 static void scrub_parity_bio_endio(struct bio *bio)
2800 {
2801         struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2802         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2803
2804         if (bio->bi_status)
2805                 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2806                           sparity->nsectors);
2807
2808         bio_put(bio);
2809
2810         btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2811                         NULL);
2812         btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2813 }
2814
2815 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2816 {
2817         struct scrub_ctx *sctx = sparity->sctx;
2818         struct btrfs_fs_info *fs_info = sctx->fs_info;
2819         struct bio *bio;
2820         struct btrfs_raid_bio *rbio;
2821         struct btrfs_io_context *bioc = NULL;
2822         u64 length;
2823         int ret;
2824
2825         if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2826                            sparity->nsectors))
2827                 goto out;
2828
2829         length = sparity->logic_end - sparity->logic_start;
2830
2831         btrfs_bio_counter_inc_blocked(fs_info);
2832         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2833                                &length, &bioc);
2834         if (ret || !bioc || !bioc->raid_map)
2835                 goto bioc_out;
2836
2837         bio = btrfs_bio_alloc(BIO_MAX_VECS);
2838         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2839         bio->bi_private = sparity;
2840         bio->bi_end_io = scrub_parity_bio_endio;
2841
2842         rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
2843                                               sparity->scrub_dev,
2844                                               sparity->dbitmap,
2845                                               sparity->nsectors);
2846         if (!rbio)
2847                 goto rbio_out;
2848
2849         scrub_pending_bio_inc(sctx);
2850         raid56_parity_submit_scrub_rbio(rbio);
2851         return;
2852
2853 rbio_out:
2854         bio_put(bio);
2855 bioc_out:
2856         btrfs_bio_counter_dec(fs_info);
2857         btrfs_put_bioc(bioc);
2858         bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2859                   sparity->nsectors);
2860         spin_lock(&sctx->stat_lock);
2861         sctx->stat.malloc_errors++;
2862         spin_unlock(&sctx->stat_lock);
2863 out:
2864         scrub_free_parity(sparity);
2865 }
2866
2867 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2868 {
2869         return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2870 }
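/*
 * For illustration only, assuming a 64-bit host (BITS_PER_LONG == 64,
 * sizeof(long) == 8), a 64KiB stripe_len and 4KiB sectors: nsectors is 16,
 * DIV_ROUND_UP(16, 64) is 1, so each bitmap needs 8 bytes and
 * scrub_raid56_parity() allocates 2 * 8 extra bytes after struct
 * scrub_parity to hold dbitmap and ebitmap.
 */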
2871
2872 static void scrub_parity_get(struct scrub_parity *sparity)
2873 {
2874         refcount_inc(&sparity->refs);
2875 }
2876
2877 static void scrub_parity_put(struct scrub_parity *sparity)
2878 {
2879         if (!refcount_dec_and_test(&sparity->refs))
2880                 return;
2881
2882         scrub_parity_check_and_repair(sparity);
2883 }
2884
2885 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2886                                                   struct map_lookup *map,
2887                                                   struct btrfs_device *sdev,
2888                                                   u64 logic_start,
2889                                                   u64 logic_end)
2890 {
2891         struct btrfs_fs_info *fs_info = sctx->fs_info;
2892         struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start);
2893         struct btrfs_root *csum_root;
2894         struct btrfs_extent_item *extent;
2895         struct btrfs_io_context *bioc = NULL;
2896         struct btrfs_path *path;
2897         u64 flags;
2898         int ret;
2899         int slot;
2900         struct extent_buffer *l;
2901         struct btrfs_key key;
2902         u64 generation;
2903         u64 extent_logical;
2904         u64 extent_physical;
2905         /* Check the comment in scrub_stripe() for why u32 is enough here */
2906         u32 extent_len;
2907         u64 mapped_length;
2908         struct btrfs_device *extent_dev;
2909         struct scrub_parity *sparity;
2910         int nsectors;
2911         int bitmap_len;
2912         int extent_mirror_num;
2913         int stop_loop = 0;
2914
2915         path = btrfs_alloc_path();
2916         if (!path) {
2917                 spin_lock(&sctx->stat_lock);
2918                 sctx->stat.malloc_errors++;
2919                 spin_unlock(&sctx->stat_lock);
2920                 return -ENOMEM;
2921         }
2922         path->search_commit_root = 1;
2923         path->skip_locking = 1;
2924
2925         ASSERT(map->stripe_len <= U32_MAX);
2926         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
2927         bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2928         sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2929                           GFP_NOFS);
2930         if (!sparity) {
2931                 spin_lock(&sctx->stat_lock);
2932                 sctx->stat.malloc_errors++;
2933                 spin_unlock(&sctx->stat_lock);
2934                 btrfs_free_path(path);
2935                 return -ENOMEM;
2936         }
2937
2938         ASSERT(map->stripe_len <= U32_MAX);
2939         sparity->stripe_len = map->stripe_len;
2940         sparity->nsectors = nsectors;
2941         sparity->sctx = sctx;
2942         sparity->scrub_dev = sdev;
2943         sparity->logic_start = logic_start;
2944         sparity->logic_end = logic_end;
2945         refcount_set(&sparity->refs, 1);
2946         INIT_LIST_HEAD(&sparity->spages);
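        /*
         * dbitmap and ebitmap share the single bitmap[] allocation made
         * above: dbitmap marks the sectors that carry data to be scrubbed,
         * ebitmap marks the sectors for which reading or checking the data
         * failed.
         */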
2947         sparity->dbitmap = sparity->bitmap;
2948         sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2949
2950         ret = 0;
2951         while (logic_start < logic_end) {
2952                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2953                         key.type = BTRFS_METADATA_ITEM_KEY;
2954                 else
2955                         key.type = BTRFS_EXTENT_ITEM_KEY;
2956                 key.objectid = logic_start;
2957                 key.offset = (u64)-1;
2958
2959                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2960                 if (ret < 0)
2961                         goto out;
2962
2963                 if (ret > 0) {
2964                         ret = btrfs_previous_extent_item(root, path, 0);
2965                         if (ret < 0)
2966                                 goto out;
2967                         if (ret > 0) {
2968                                 btrfs_release_path(path);
2969                                 ret = btrfs_search_slot(NULL, root, &key,
2970                                                         path, 0, 0);
2971                                 if (ret < 0)
2972                                         goto out;
2973                         }
2974                 }
2975
2976                 stop_loop = 0;
2977                 while (1) {
2978                         u64 bytes;
2979
2980                         l = path->nodes[0];
2981                         slot = path->slots[0];
2982                         if (slot >= btrfs_header_nritems(l)) {
2983                                 ret = btrfs_next_leaf(root, path);
2984                                 if (ret == 0)
2985                                         continue;
2986                                 if (ret < 0)
2987                                         goto out;
2988
2989                                 stop_loop = 1;
2990                                 break;
2991                         }
2992                         btrfs_item_key_to_cpu(l, &key, slot);
2993
2994                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2995                             key.type != BTRFS_METADATA_ITEM_KEY)
2996                                 goto next;
2997
2998                         if (key.type == BTRFS_METADATA_ITEM_KEY)
2999                                 bytes = fs_info->nodesize;
3000                         else
3001                                 bytes = key.offset;
3002
3003                         if (key.objectid + bytes <= logic_start)
3004                                 goto next;
3005
3006                         if (key.objectid >= logic_end) {
3007                                 stop_loop = 1;
3008                                 break;
3009                         }
3010
3011                         while (key.objectid >= logic_start + map->stripe_len)
3012                                 logic_start += map->stripe_len;
3013
3014                         extent = btrfs_item_ptr(l, slot,
3015                                                 struct btrfs_extent_item);
3016                         flags = btrfs_extent_flags(l, extent);
3017                         generation = btrfs_extent_generation(l, extent);
3018
3019                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3020                             (key.objectid < logic_start ||
3021                              key.objectid + bytes >
3022                              logic_start + map->stripe_len)) {
3023                                 btrfs_err(fs_info,
3024                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3025                                           key.objectid, logic_start);
3026                                 spin_lock(&sctx->stat_lock);
3027                                 sctx->stat.uncorrectable_errors++;
3028                                 spin_unlock(&sctx->stat_lock);
3029                                 goto next;
3030                         }
3031 again:
3032                         extent_logical = key.objectid;
3033                         ASSERT(bytes <= U32_MAX);
3034                         extent_len = bytes;
3035
3036                         if (extent_logical < logic_start) {
3037                                 extent_len -= logic_start - extent_logical;
3038                                 extent_logical = logic_start;
3039                         }
3040
3041                         if (extent_logical + extent_len >
3042                             logic_start + map->stripe_len)
3043                                 extent_len = logic_start + map->stripe_len -
3044                                              extent_logical;
3045
3046                         scrub_parity_mark_sectors_data(sparity, extent_logical,
3047                                                        extent_len);
3048
3049                         mapped_length = extent_len;
3050                         bioc = NULL;
3051                         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3052                                         extent_logical, &mapped_length, &bioc,
3053                                         0);
3054                         if (!ret) {
3055                                 if (!bioc || mapped_length < extent_len)
3056                                         ret = -EIO;
3057                         }
3058                         if (ret) {
3059                                 btrfs_put_bioc(bioc);
3060                                 goto out;
3061                         }
3062                         extent_physical = bioc->stripes[0].physical;
3063                         extent_mirror_num = bioc->mirror_num;
3064                         extent_dev = bioc->stripes[0].dev;
3065                         btrfs_put_bioc(bioc);
3066
3067                         csum_root = btrfs_csum_root(fs_info, extent_logical);
3068                         ret = btrfs_lookup_csums_range(csum_root,
3069                                                 extent_logical,
3070                                                 extent_logical + extent_len - 1,
3071                                                 &sctx->csum_list, 1);
3072                         if (ret)
3073                                 goto out;
3074
3075                         ret = scrub_extent_for_parity(sparity, extent_logical,
3076                                                       extent_len,
3077                                                       extent_physical,
3078                                                       extent_dev, flags,
3079                                                       generation,
3080                                                       extent_mirror_num);
3081
3082                         scrub_free_csums(sctx);
3083
3084                         if (ret)
3085                                 goto out;
3086
3087                         if (extent_logical + extent_len <
3088                             key.objectid + bytes) {
3089                                 logic_start += map->stripe_len;
3090
3091                                 if (logic_start >= logic_end) {
3092                                         stop_loop = 1;
3093                                         break;
3094                                 }
3095
3096                                 if (logic_start < key.objectid + bytes) {
3097                                         cond_resched();
3098                                         goto again;
3099                                 }
3100                         }
3101 next:
3102                         path->slots[0]++;
3103                 }
3104
3105                 btrfs_release_path(path);
3106
3107                 if (stop_loop)
3108                         break;
3109
3110                 logic_start += map->stripe_len;
3111         }
3112 out:
3113         if (ret < 0) {
3114                 ASSERT(logic_end - logic_start <= U32_MAX);
3115                 scrub_parity_mark_sectors_error(sparity, logic_start,
3116                                                 logic_end - logic_start);
3117         }
3118         scrub_parity_put(sparity);
3119         scrub_submit(sctx);
3120         mutex_lock(&sctx->wr_lock);
3121         scrub_wr_submit(sctx);
3122         mutex_unlock(&sctx->wr_lock);
3123
3124         btrfs_free_path(path);
3125         return ret < 0 ? ret : 0;
3126 }
3127
3128 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3129 {
3130         if (!btrfs_is_zoned(sctx->fs_info))
3131                 return;
3132
3133         sctx->flush_all_writes = true;
3134         scrub_submit(sctx);
3135         mutex_lock(&sctx->wr_lock);
3136         scrub_wr_submit(sctx);
3137         mutex_unlock(&sctx->wr_lock);
3138
3139         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3140 }
3141
3142 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3143                                         u64 physical, u64 physical_end)
3144 {
3145         struct btrfs_fs_info *fs_info = sctx->fs_info;
3146         int ret = 0;
3147
3148         if (!btrfs_is_zoned(fs_info))
3149                 return 0;
3150
3151         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3152
3153         mutex_lock(&sctx->wr_lock);
3154         if (sctx->write_pointer < physical_end) {
3155                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3156                                                     physical,
3157                                                     sctx->write_pointer);
3158                 if (ret)
3159                         btrfs_err(fs_info,
3160                                   "zoned: failed to recover write pointer");
3161         }
3162         mutex_unlock(&sctx->wr_lock);
3163         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3164
3165         return ret;
3166 }
3167
3168 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3169                                            struct btrfs_block_group *bg,
3170                                            struct map_lookup *map,
3171                                            struct btrfs_device *scrub_dev,
3172                                            int stripe_index, u64 dev_extent_len)
3173 {
3174         struct btrfs_path *path;
3175         struct btrfs_fs_info *fs_info = sctx->fs_info;
3176         struct btrfs_root *root;
3177         struct btrfs_root *csum_root;
3178         struct btrfs_extent_item *extent;
3179         struct blk_plug plug;
3180         const u64 chunk_logical = bg->start;
3181         u64 flags;
3182         int ret;
3183         int slot;
3184         u64 nstripes;
3185         struct extent_buffer *l;
3186         u64 physical;
3187         u64 logical;
3188         u64 logic_end;
3189         u64 physical_end;
3190         u64 generation;
3191         int mirror_num;
3192         struct btrfs_key key;
3193         u64 increment = map->stripe_len;
3194         u64 offset;
3195         u64 extent_logical;
3196         u64 extent_physical;
3197         /*
3198          * Unlike chunk length, extent length should never go beyond
3199          * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here.
3200          */
3201         u32 extent_len;
3202         u64 stripe_logical;
3203         u64 stripe_end;
3204         struct btrfs_device *extent_dev;
3205         int extent_mirror_num;
3206         int stop_loop = 0;
3207
3208         physical = map->stripes[stripe_index].physical;
3209         offset = 0;
3210         nstripes = div64_u64(dev_extent_len, map->stripe_len);
3211         mirror_num = 1;
3212         increment = map->stripe_len;
3213         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3214                 offset = map->stripe_len * stripe_index;
3215                 increment = map->stripe_len * map->num_stripes;
3216         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3217                 int factor = map->num_stripes / map->sub_stripes;
3218                 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3219                 increment = map->stripe_len * factor;
3220                 mirror_num = stripe_index % map->sub_stripes + 1;
3221         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3222                 mirror_num = stripe_index % map->num_stripes + 1;
3223         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3224                 mirror_num = stripe_index % map->num_stripes + 1;
3225         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3226                 get_raid56_logic_offset(physical, stripe_index, map, &offset,
3227                                         NULL);
3228                 increment = map->stripe_len * nr_data_stripes(map);
3229         }
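        /*
         * For illustration only (values assumed, not taken from this chunk):
         * with map->stripe_len = 64KiB, a RAID0 chunk with
         * map->num_stripes = 4 and stripe_index = 1 yields offset = 64KiB
         * and increment = 256KiB, i.e. this device holds the chunk-relative
         * logical ranges starting at 64KiB, 320KiB, 576KiB, ...  A RAID10
         * chunk with num_stripes = 4, sub_stripes = 2 and stripe_index = 3
         * yields offset = 64KiB, increment = 128KiB and mirror_num = 2
         * (the second copy of each sub-stripe).
         */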
3230
3231         path = btrfs_alloc_path();
3232         if (!path)
3233                 return -ENOMEM;
3234
3235         /*
3236          * Work on the commit root. The related disk blocks are static as
3237          * long as COW is applied. This means it is safe to rewrite
3238          * them to repair disk errors without any race conditions.
3239          */
3240         path->search_commit_root = 1;
3241         path->skip_locking = 1;
3242         path->reada = READA_FORWARD;
3243
3244         logical = chunk_logical + offset;
3245         physical_end = physical + nstripes * map->stripe_len;
3246         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3247                 get_raid56_logic_offset(physical_end, stripe_index,
3248                                         map, &logic_end, NULL);
3249                 logic_end += chunk_logical;
3250         } else {
3251                 logic_end = logical + increment * nstripes;
3252         }
3253         wait_event(sctx->list_wait,
3254                    atomic_read(&sctx->bios_in_flight) == 0);
3255         scrub_blocked_if_needed(fs_info);
3256
3257         root = btrfs_extent_root(fs_info, logical);
3258         csum_root = btrfs_csum_root(fs_info, logical);
3259
3260         /*
3261          * Collect all data csums for the stripe to avoid seeking during
3262          * the scrub. This might currently (crc32) end up being about 1MB.
3263          */
3264         blk_start_plug(&plug);
3265
3266         if (sctx->is_dev_replace &&
3267             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3268                 mutex_lock(&sctx->wr_lock);
3269                 sctx->write_pointer = physical;
3270                 mutex_unlock(&sctx->wr_lock);
3271                 sctx->flush_all_writes = true;
3272         }
3273
3274         /*
3275          * now find all extents for each stripe and scrub them
3276          */
3277         ret = 0;
3278         while (physical < physical_end) {
3279                 /*
3280                  * canceled?
3281                  */
3282                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3283                     atomic_read(&sctx->cancel_req)) {
3284                         ret = -ECANCELED;
3285                         goto out;
3286                 }
3287                 /*
3288                  * check to see if we have to pause
3289                  */
3290                 if (atomic_read(&fs_info->scrub_pause_req)) {
3291                         /* push queued extents */
3292                         sctx->flush_all_writes = true;
3293                         scrub_submit(sctx);
3294                         mutex_lock(&sctx->wr_lock);
3295                         scrub_wr_submit(sctx);
3296                         mutex_unlock(&sctx->wr_lock);
3297                         wait_event(sctx->list_wait,
3298                                    atomic_read(&sctx->bios_in_flight) == 0);
3299                         sctx->flush_all_writes = false;
3300                         scrub_blocked_if_needed(fs_info);
3301                 }
3302
3303                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3304                         ret = get_raid56_logic_offset(physical, stripe_index,
3305                                                       map, &logical,
3306                                                       &stripe_logical);
3307                         logical += chunk_logical;
3308                         if (ret) {
3309                                 /* It is a parity stripe */
3310                                 stripe_logical += chunk_logical;
3311                                 stripe_end = stripe_logical + increment;
3312                                 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3313                                                           stripe_logical,
3314                                                           stripe_end);
3315                                 if (ret)
3316                                         goto out;
3317                                 goto skip;
3318                         }
3319                 }
3320
3321                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3322                         key.type = BTRFS_METADATA_ITEM_KEY;
3323                 else
3324                         key.type = BTRFS_EXTENT_ITEM_KEY;
3325                 key.objectid = logical;
3326                 key.offset = (u64)-1;
3327
3328                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3329                 if (ret < 0)
3330                         goto out;
3331
3332                 if (ret > 0) {
3333                         ret = btrfs_previous_extent_item(root, path, 0);
3334                         if (ret < 0)
3335                                 goto out;
3336                         if (ret > 0) {
3337                                 /* there's no smaller item, so stick with the
3338                                  * larger one */
3339                                 btrfs_release_path(path);
3340                                 ret = btrfs_search_slot(NULL, root, &key,
3341                                                         path, 0, 0);
3342                                 if (ret < 0)
3343                                         goto out;
3344                         }
3345                 }
3346
3347                 stop_loop = 0;
3348                 while (1) {
3349                         u64 bytes;
3350
3351                         l = path->nodes[0];
3352                         slot = path->slots[0];
3353                         if (slot >= btrfs_header_nritems(l)) {
3354                                 ret = btrfs_next_leaf(root, path);
3355                                 if (ret == 0)
3356                                         continue;
3357                                 if (ret < 0)
3358                                         goto out;
3359
3360                                 stop_loop = 1;
3361                                 break;
3362                         }
3363                         btrfs_item_key_to_cpu(l, &key, slot);
3364
3365                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3366                             key.type != BTRFS_METADATA_ITEM_KEY)
3367                                 goto next;
3368
3369                         if (key.type == BTRFS_METADATA_ITEM_KEY)
3370                                 bytes = fs_info->nodesize;
3371                         else
3372                                 bytes = key.offset;
3373
3374                         if (key.objectid + bytes <= logical)
3375                                 goto next;
3376
3377                         if (key.objectid >= logical + map->stripe_len) {
3378                                 /* out of this device extent */
3379                                 if (key.objectid >= logic_end)
3380                                         stop_loop = 1;
3381                                 break;
3382                         }
3383
3384                         /*
3385                          * If our block group was removed in the meanwhile, just
3386                          * stop scrubbing since there is no point in continuing.
3387                          * Continuing would prevent reusing its device extents
3388                          * for new block groups for a long time.
3389                          */
3390                         spin_lock(&bg->lock);
3391                         if (bg->removed) {
3392                                 spin_unlock(&bg->lock);
3393                                 ret = 0;
3394                                 goto out;
3395                         }
3396                         spin_unlock(&bg->lock);
3397
3398                         extent = btrfs_item_ptr(l, slot,
3399                                                 struct btrfs_extent_item);
3400                         flags = btrfs_extent_flags(l, extent);
3401                         generation = btrfs_extent_generation(l, extent);
3402
3403                         if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3404                             (key.objectid < logical ||
3405                              key.objectid + bytes >
3406                              logical + map->stripe_len)) {
3407                                 btrfs_err(fs_info,
3408                                            "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3409                                        key.objectid, logical);
3410                                 spin_lock(&sctx->stat_lock);
3411                                 sctx->stat.uncorrectable_errors++;
3412                                 spin_unlock(&sctx->stat_lock);
3413                                 goto next;
3414                         }
3415
3416 again:
3417                         extent_logical = key.objectid;
3418                         ASSERT(bytes <= U32_MAX);
3419                         extent_len = bytes;
3420
3421                         /*
3422                          * trim extent to this stripe
3423                          */
3424                         if (extent_logical < logical) {
3425                                 extent_len -= logical - extent_logical;
3426                                 extent_logical = logical;
3427                         }
3428                         if (extent_logical + extent_len >
3429                             logical + map->stripe_len) {
3430                                 extent_len = logical + map->stripe_len -
3431                                              extent_logical;
3432                         }
3433
3434                         extent_physical = extent_logical - logical + physical;
3435                         extent_dev = scrub_dev;
3436                         extent_mirror_num = mirror_num;
3437                         if (sctx->is_dev_replace)
3438                                 scrub_remap_extent(fs_info, extent_logical,
3439                                                    extent_len, &extent_physical,
3440                                                    &extent_dev,
3441                                                    &extent_mirror_num);
3442
3443                         if (flags & BTRFS_EXTENT_FLAG_DATA) {
3444                                 ret = btrfs_lookup_csums_range(csum_root,
3445                                                 extent_logical,
3446                                                 extent_logical + extent_len - 1,
3447                                                 &sctx->csum_list, 1);
3448                                 if (ret)
3449                                         goto out;
3450                         }
3451
3452                         ret = scrub_extent(sctx, map, extent_logical, extent_len,
3453                                            extent_physical, extent_dev, flags,
3454                                            generation, extent_mirror_num,
3455                                            extent_logical - logical + physical);
3456
3457                         scrub_free_csums(sctx);
3458
3459                         if (ret)
3460                                 goto out;
3461
3462                         if (sctx->is_dev_replace)
3463                                 sync_replace_for_zoned(sctx);
3464
3465                         if (extent_logical + extent_len <
3466                             key.objectid + bytes) {
3467                                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3468                                         /*
3469                                          * Loop until we find the next data
3470                                          * stripe or we have finished all stripes.
3471                                          */
3472 loop:
3473                                         physical += map->stripe_len;
3474                                         ret = get_raid56_logic_offset(physical,
3475                                                         stripe_index, map,
3476                                                         &logical, &stripe_logical);
3477                                         logical += chunk_logical;
3478
3479                                         if (ret && physical < physical_end) {
3480                                                 stripe_logical += chunk_logical;
3481                                                 stripe_end = stripe_logical +
3482                                                                 increment;
3483                                                 ret = scrub_raid56_parity(sctx,
3484                                                         map, scrub_dev,
3485                                                         stripe_logical,
3486                                                         stripe_end);
3487                                                 if (ret)
3488                                                         goto out;
3489                                                 goto loop;
3490                                         }
3491                                 } else {
3492                                         physical += map->stripe_len;
3493                                         logical += increment;
3494                                 }
3495                                 if (logical < key.objectid + bytes) {
3496                                         cond_resched();
3497                                         goto again;
3498                                 }
3499
3500                                 if (physical >= physical_end) {
3501                                         stop_loop = 1;
3502                                         break;
3503                                 }
3504                         }
3505 next:
3506                         path->slots[0]++;
3507                 }
3508                 btrfs_release_path(path);
3509 skip:
3510                 logical += increment;
3511                 physical += map->stripe_len;
3512                 spin_lock(&sctx->stat_lock);
3513                 if (stop_loop)
3514                         sctx->stat.last_physical = map->stripes[stripe_index].physical +
3515                                                    dev_extent_len;
3516                 else
3517                         sctx->stat.last_physical = physical;
3518                 spin_unlock(&sctx->stat_lock);
3519                 if (stop_loop)
3520                         break;
3521         }
3522 out:
3523         /* push queued extents */
3524         scrub_submit(sctx);
3525         mutex_lock(&sctx->wr_lock);
3526         scrub_wr_submit(sctx);
3527         mutex_unlock(&sctx->wr_lock);
3528
3529         blk_finish_plug(&plug);
3530         btrfs_free_path(path);
3531
3532         if (sctx->is_dev_replace && ret >= 0) {
3533                 int ret2;
3534
3535                 ret2 = sync_write_pointer_for_zoned(sctx,
3536                                 chunk_logical + offset,
3537                                 map->stripes[stripe_index].physical,
3538                                 physical_end);
3539                 if (ret2)
3540                         ret = ret2;
3541         }
3542
3543         return ret < 0 ? ret : 0;
3544 }
3545
3546 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3547                                           struct btrfs_block_group *bg,
3548                                           struct btrfs_device *scrub_dev,
3549                                           u64 dev_offset,
3550                                           u64 dev_extent_len)
3551 {
3552         struct btrfs_fs_info *fs_info = sctx->fs_info;
3553         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3554         struct map_lookup *map;
3555         struct extent_map *em;
3556         int i;
3557         int ret = 0;
3558
3559         read_lock(&map_tree->lock);
3560         em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3561         read_unlock(&map_tree->lock);
3562
3563         if (!em) {
3564                 /*
3565                  * Might have been an unused block group deleted by the cleaner
3566                  * kthread or relocation.
3567                  */
3568                 spin_lock(&bg->lock);
3569                 if (!bg->removed)
3570                         ret = -EINVAL;
3571                 spin_unlock(&bg->lock);
3572
3573                 return ret;
3574         }
3575         if (em->start != bg->start)
3576                 goto out;
3577         if (em->len < dev_extent_len)
3578                 goto out;
3579
3580         map = em->map_lookup;
3581         for (i = 0; i < map->num_stripes; ++i) {
3582                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3583                     map->stripes[i].physical == dev_offset) {
3584                         ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
3585                                            dev_extent_len);
3586                         if (ret)
3587                                 goto out;
3588                 }
3589         }
3590 out:
3591         free_extent_map(em);
3592
3593         return ret;
3594 }
3595
3596 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3597                                           struct btrfs_block_group *cache)
3598 {
3599         struct btrfs_fs_info *fs_info = cache->fs_info;
3600         struct btrfs_trans_handle *trans;
3601
3602         if (!btrfs_is_zoned(fs_info))
3603                 return 0;
3604
3605         btrfs_wait_block_group_reservations(cache);
3606         btrfs_wait_nocow_writers(cache);
3607         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3608
3609         trans = btrfs_join_transaction(root);
3610         if (IS_ERR(trans))
3611                 return PTR_ERR(trans);
3612         return btrfs_commit_transaction(trans);
3613 }
3614
3615 static noinline_for_stack
3616 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3617                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3618 {
3619         struct btrfs_dev_extent *dev_extent = NULL;
3620         struct btrfs_path *path;
3621         struct btrfs_fs_info *fs_info = sctx->fs_info;
3622         struct btrfs_root *root = fs_info->dev_root;
3623         u64 chunk_offset;
3624         int ret = 0;
3625         int ro_set;
3626         int slot;
3627         struct extent_buffer *l;
3628         struct btrfs_key key;
3629         struct btrfs_key found_key;
3630         struct btrfs_block_group *cache;
3631         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3632
3633         path = btrfs_alloc_path();
3634         if (!path)
3635                 return -ENOMEM;
3636
3637         path->reada = READA_FORWARD;
3638         path->search_commit_root = 1;
3639         path->skip_locking = 1;
3640
3641         key.objectid = scrub_dev->devid;
3642         key.offset = 0ull;
3643         key.type = BTRFS_DEV_EXTENT_KEY;
3644
3645         while (1) {
3646                 u64 dev_extent_len;
3647
3648                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3649                 if (ret < 0)
3650                         break;
3651                 if (ret > 0) {
3652                         if (path->slots[0] >=
3653                             btrfs_header_nritems(path->nodes[0])) {
3654                                 ret = btrfs_next_leaf(root, path);
3655                                 if (ret < 0)
3656                                         break;
3657                                 if (ret > 0) {
3658                                         ret = 0;
3659                                         break;
3660                                 }
3661                         } else {
3662                                 ret = 0;
3663                         }
3664                 }
3665
3666                 l = path->nodes[0];
3667                 slot = path->slots[0];
3668
3669                 btrfs_item_key_to_cpu(l, &found_key, slot);
3670
3671                 if (found_key.objectid != scrub_dev->devid)
3672                         break;
3673
3674                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3675                         break;
3676
3677                 if (found_key.offset >= end)
3678                         break;
3679
3680                 if (found_key.offset < key.offset)
3681                         break;
3682
3683                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3684                 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3685
3686                 if (found_key.offset + dev_extent_len <= start)
3687                         goto skip;
3688
3689                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3690
3691                 /*
3692                  * get a reference on the corresponding block group to prevent
3693                  * the chunk from going away while we scrub it
3694                  */
3695                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3696
3697                 /* some chunks are removed but not committed to disk yet,
3698                  * continue scrubbing */
3699                 if (!cache)
3700                         goto skip;
3701
3702                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3703                         spin_lock(&cache->lock);
3704                         if (!cache->to_copy) {
3705                                 spin_unlock(&cache->lock);
3706                                 btrfs_put_block_group(cache);
3707                                 goto skip;
3708                         }
3709                         spin_unlock(&cache->lock);
3710                 }
3711
3712                 /*
3713                  * Make sure that while we are scrubbing the corresponding block
3714                  * group doesn't get its logical address and its device extents
3715                  * reused for another block group, which can possibly be of a
3716                  * different type and different profile. We do this to prevent
3717                  * false error detections and crashes due to bogus attempts to
3718                  * repair extents.
3719                  */
3720                 spin_lock(&cache->lock);
3721                 if (cache->removed) {
3722                         spin_unlock(&cache->lock);
3723                         btrfs_put_block_group(cache);
3724                         goto skip;
3725                 }
3726                 btrfs_freeze_block_group(cache);
3727                 spin_unlock(&cache->lock);
3728
3729                 /*
3730                  * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
3731                  * to avoid a deadlock caused by:
3732                  * btrfs_inc_block_group_ro()
3733                  * -> btrfs_wait_for_commit()
3734                  * -> btrfs_commit_transaction()
3735                  * -> btrfs_scrub_pause()
3736                  */
3737                 scrub_pause_on(fs_info);
3738
3739                 /*
3740                  * Don't do chunk preallocation for scrub.
3741                  *
3742                  * This is especially important for SYSTEM bgs, otherwise we
3743                  * can hit -EFBIG from btrfs_finish_chunk_alloc() like this:
3744                  * 1. The only SYSTEM bg is marked RO.
3745                  *    Since the SYSTEM bg is small, that's pretty common.
3746                  * 2. A new SYSTEM bg will be allocated,
3747                  *    because the regular path allocates a new chunk.
3748                  * 3. The new SYSTEM bg is empty and will get cleaned up.
3749                  *    Before cleanup really happens, it's marked RO again.
3750                  * 4. The empty SYSTEM bg gets scrubbed
3751                  *    and we go back to 2.
3752                  *
3753                  * This can easily inflate the number of SYSTEM chunks if the
3754                  * cleaner thread can't be triggered fast enough, and use up
3755                  * all the space of btrfs_super_block::sys_chunk_array.
3756                  *
3757                  * For dev replace, however, we need to try our best to mark
3758                  * the block group RO, to prevent a race between:
3759                  * - Write duplication
3760                  *   Contains the latest data
3761                  * - Scrub copy
3762                  *   Contains data from the commit tree
3763                  *
3764                  * If the target block group is not marked RO, nocow writes
3765                  * can be overwritten by the scrub copy, causing data corruption.
3766                  * So for dev-replace, it's not allowed to continue if a block
3767                  * group is not RO.
3768                  */
3769                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3770                 if (!ret && sctx->is_dev_replace) {
3771                         ret = finish_extent_writes_for_zoned(root, cache);
3772                         if (ret) {
3773                                 btrfs_dec_block_group_ro(cache);
3774                                 scrub_pause_off(fs_info);
3775                                 btrfs_put_block_group(cache);
3776                                 break;
3777                         }
3778                 }
3779
3780                 if (ret == 0) {
3781                         ro_set = 1;
3782                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3783                         /*
3784                          * btrfs_inc_block_group_ro() returns -ENOSPC when it
3785                          * fails to create a new chunk for metadata.
3786                          * That is not a problem for scrub, because
3787                          * metadata is always cowed, and our scrub pauses
3788                          * transaction commits.
3789                          */
3790                         ro_set = 0;
3791                 } else if (ret == -ETXTBSY) {
3792                         btrfs_warn(fs_info,
3793                    "skipping scrub of block group %llu due to active swapfile",
3794                                    cache->start);
3795                         scrub_pause_off(fs_info);
3796                         ret = 0;
3797                         goto skip_unfreeze;
3798                 } else {
3799                         btrfs_warn(fs_info,
3800                                    "failed setting block group ro: %d", ret);
3801                         btrfs_unfreeze_block_group(cache);
3802                         btrfs_put_block_group(cache);
3803                         scrub_pause_off(fs_info);
3804                         break;
3805                 }
3806
3807                 /*
3808                  * Now the target block is marked RO, wait for nocow writes to
3809                  * finish before dev-replace.
3810                  * COW is fine, as COW never overwrites extents in commit tree.
3811                  */
3812                 if (sctx->is_dev_replace) {
3813                         btrfs_wait_nocow_writers(cache);
3814                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3815                                         cache->length);
3816                 }
3817
3818                 scrub_pause_off(fs_info);
3819                 down_write(&dev_replace->rwsem);
3820                 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3821                 dev_replace->cursor_left = found_key.offset;
3822                 dev_replace->item_needs_writeback = 1;
3823                 up_write(&dev_replace->rwsem);
3824
3825                 ASSERT(cache->start == chunk_offset);
3826                 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3827                                   dev_extent_len);
3828
3829                 /*
3830                  * Flush and submit all pending read and write bios, then
3831                  * wait for them.
3832                  * Note that in the dev replace case, a read request causes
3833                  * write requests that are submitted in the read completion
3834                  * worker. Therefore in the current situation, it is required
3835                  * that all write requests are flushed, so that all read and
3836                  * write requests are really completed when bios_in_flight
3837                  * changes to 0.
3838                  */
3839                 sctx->flush_all_writes = true;
3840                 scrub_submit(sctx);
3841                 mutex_lock(&sctx->wr_lock);
3842                 scrub_wr_submit(sctx);
3843                 mutex_unlock(&sctx->wr_lock);
3844
3845                 wait_event(sctx->list_wait,
3846                            atomic_read(&sctx->bios_in_flight) == 0);
3847
3848                 scrub_pause_on(fs_info);
3849
3850                 /*
3851                  * Must be called before we decrease @scrub_paused.
3852                  * Make sure we don't block transaction commit while
3853                  * we are waiting for pending workers to finish.
3854                  */
3855                 wait_event(sctx->list_wait,
3856                            atomic_read(&sctx->workers_pending) == 0);
3857                 sctx->flush_all_writes = false;
3858
3859                 scrub_pause_off(fs_info);
3860
3861                 if (sctx->is_dev_replace &&
3862                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3863                                                       cache, found_key.offset))
3864                         ro_set = 0;
3865
3866                 down_write(&dev_replace->rwsem);
3867                 dev_replace->cursor_left = dev_replace->cursor_right;
3868                 dev_replace->item_needs_writeback = 1;
3869                 up_write(&dev_replace->rwsem);
3870
3871                 if (ro_set)
3872                         btrfs_dec_block_group_ro(cache);
3873
3874                 /*
3875                  * We might have prevented the cleaner kthread from deleting
3876                  * this block group if it was already unused because we raced
3877                  * and set it to RO mode first. So add it back to the unused
3878                  * list, otherwise it might not ever be deleted unless a manual
3879                  * balance is triggered or it becomes used and unused again.
3880                  */
3881                 spin_lock(&cache->lock);
3882                 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3883                     cache->used == 0) {
3884                         spin_unlock(&cache->lock);
3885                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3886                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
3887                                                          cache);
3888                         else
3889                                 btrfs_mark_bg_unused(cache);
3890                 } else {
3891                         spin_unlock(&cache->lock);
3892                 }
3893 skip_unfreeze:
3894                 btrfs_unfreeze_block_group(cache);
3895                 btrfs_put_block_group(cache);
3896                 if (ret)
3897                         break;
3898                 if (sctx->is_dev_replace &&
3899                     atomic64_read(&dev_replace->num_write_errors) > 0) {
3900                         ret = -EIO;
3901                         break;
3902                 }
3903                 if (sctx->stat.malloc_errors > 0) {
3904                         ret = -ENOMEM;
3905                         break;
3906                 }
3907 skip:
3908                 key.offset = found_key.offset + dev_extent_len;
3909                 btrfs_release_path(path);
3910         }
3911
3912         btrfs_free_path(path);
3913
3914         return ret;
3915 }
3916
3917 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3918                                            struct btrfs_device *scrub_dev)
3919 {
3920         int     i;
3921         u64     bytenr;
3922         u64     gen;
3923         int     ret;
3924         struct btrfs_fs_info *fs_info = sctx->fs_info;
3925
3926         if (BTRFS_FS_ERROR(fs_info))
3927                 return -EROFS;
3928
3929         /* Seed devices of a new filesystem have their own generation. */
3930         if (scrub_dev->fs_devices != fs_info->fs_devices)
3931                 gen = scrub_dev->generation;
3932         else
3933                 gen = fs_info->last_trans_committed;
3934
3935         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3936                 bytenr = btrfs_sb_offset(i);
3937                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
3938                     scrub_dev->commit_total_bytes)
3939                         break;
3940                 if (!btrfs_check_super_location(scrub_dev, bytenr))
3941                         continue;
3942
3943                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3944                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3945                                   NULL, bytenr);
3946                 if (ret)
3947                         return ret;
3948         }
3949         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3950
3951         return 0;
3952 }
3953
3954 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3955 {
3956         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3957                                         &fs_info->scrub_lock)) {
3958                 struct btrfs_workqueue *scrub_workers = NULL;
3959                 struct btrfs_workqueue *scrub_wr_comp = NULL;
3960                 struct btrfs_workqueue *scrub_parity = NULL;
3961
3962                 scrub_workers = fs_info->scrub_workers;
3963                 scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3964                 scrub_parity = fs_info->scrub_parity_workers;
3965
3966                 fs_info->scrub_workers = NULL;
3967                 fs_info->scrub_wr_completion_workers = NULL;
3968                 fs_info->scrub_parity_workers = NULL;
3969                 mutex_unlock(&fs_info->scrub_lock);
3970
3971                 btrfs_destroy_workqueue(scrub_workers);
3972                 btrfs_destroy_workqueue(scrub_wr_comp);
3973                 btrfs_destroy_workqueue(scrub_parity);
3974         }
3975 }
3976
3977 /*
3978  * Get a reference count on fs_info->scrub_workers. Start workers if necessary.
3979  */
3980 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3981                                                 int is_dev_replace)
3982 {
3983         struct btrfs_workqueue *scrub_workers = NULL;
3984         struct btrfs_workqueue *scrub_wr_comp = NULL;
3985         struct btrfs_workqueue *scrub_parity = NULL;
3986         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3987         int max_active = fs_info->thread_pool_size;
3988         int ret = -ENOMEM;
3989
3990         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
3991                 return 0;
3992
3993         scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
3994                                               is_dev_replace ? 1 : max_active, 4);
3995         if (!scrub_workers)
3996                 goto fail_scrub_workers;
3997
3998         scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3999                                               max_active, 2);
4000         if (!scrub_wr_comp)
4001                 goto fail_scrub_wr_completion_workers;
4002
4003         scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4004                                              max_active, 2);
4005         if (!scrub_parity)
4006                 goto fail_scrub_parity_workers;
4007
4008         mutex_lock(&fs_info->scrub_lock);
4009         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4010                 ASSERT(fs_info->scrub_workers == NULL &&
4011                        fs_info->scrub_wr_completion_workers == NULL &&
4012                        fs_info->scrub_parity_workers == NULL);
4013                 fs_info->scrub_workers = scrub_workers;
4014                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4015                 fs_info->scrub_parity_workers = scrub_parity;
4016                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4017                 mutex_unlock(&fs_info->scrub_lock);
4018                 return 0;
4019         }
4020         /* Another thread raced in and created the workers for us. */
4021         refcount_inc(&fs_info->scrub_workers_refcnt);
4022         mutex_unlock(&fs_info->scrub_lock);
4023
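        /*
         * We lost the race: drop the workqueues we allocated locally by
         * falling through all the error labels below, but still return 0
         * because usable workers were installed by the other thread.
         */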
4024         ret = 0;
4025         btrfs_destroy_workqueue(scrub_parity);
4026 fail_scrub_parity_workers:
4027         btrfs_destroy_workqueue(scrub_wr_comp);
4028 fail_scrub_wr_completion_workers:
4029         btrfs_destroy_workqueue(scrub_workers);
4030 fail_scrub_workers:
4031         return ret;
4032 }
4033
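/*
 * Scrub one device, either for a plain scrub or on behalf of dev-replace.
 *
 * @devid:          device to scrub
 * @start, @end:    logical byte range to scrub
 * @progress:       if not NULL, the accumulated statistics are copied here
 * @readonly:       report errors but do not repair them
 * @is_dev_replace: run in dev-replace mode
 *
 * Returns 0 on success or a negative errno, e.g. -EINPROGRESS if a scrub or
 * replace is already running on the device.
 */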
4034 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4035                     u64 end, struct btrfs_scrub_progress *progress,
4036                     int readonly, int is_dev_replace)
4037 {
4038         struct btrfs_dev_lookup_args args = { .devid = devid };
4039         struct scrub_ctx *sctx;
4040         int ret;
4041         struct btrfs_device *dev;
4042         unsigned int nofs_flag;
4043
4044         if (btrfs_fs_closing(fs_info))
4045                 return -EAGAIN;
4046
4047         if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4048                 /*
4049                  * The way scrub is implemented, it cannot calculate
4050                  * checksums for nodes larger than BTRFS_STRIPE_LEN, so do
4051                  * not handle this situation at all because it won't ever happen.
4052                  */
4053                 btrfs_err(fs_info,
4054                           "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4055                           fs_info->nodesize,
4056                           BTRFS_STRIPE_LEN);
4057                 return -EINVAL;
4058         }
4059
4060         if (fs_info->nodesize >
4061             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
4062             fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
4063                 /*
4064                  * would exhaust the array bounds of pagev member in
4065                  * struct scrub_block
4066                  */
4067                 btrfs_err(fs_info,
4068                           "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
4069                           fs_info->nodesize,
4070                           SCRUB_MAX_PAGES_PER_BLOCK,
4071                           fs_info->sectorsize,
4072                           SCRUB_MAX_PAGES_PER_BLOCK);
4073                 return -EINVAL;
4074         }
4075
4076         /* Allocate outside of device_list_mutex */
4077         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4078         if (IS_ERR(sctx))
4079                 return PTR_ERR(sctx);
4080
4081         ret = scrub_workers_get(fs_info, is_dev_replace);
4082         if (ret)
4083                 goto out_free_ctx;
4084
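        /*
         * Lock order below is device_list_mutex -> scrub_lock ->
         * dev_replace.rwsem; the device lookup and the state checks must all
         * happen with device_list_mutex held so the device cannot go away.
         */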
4085         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4086         dev = btrfs_find_device(fs_info->fs_devices, &args);
4087         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4088                      !is_dev_replace)) {
4089                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4090                 ret = -ENODEV;
4091                 goto out;
4092         }
4093
4094         if (!is_dev_replace && !readonly &&
4095             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4096                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4097                 btrfs_err_in_rcu(fs_info,
4098                         "scrub on devid %llu: filesystem on %s is not writable",
4099                         devid, rcu_str_deref(dev->name));
4100                 ret = -EROFS;
4101                 goto out;
4102         }
4103
4104         mutex_lock(&fs_info->scrub_lock);
4105         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4106             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4107                 mutex_unlock(&fs_info->scrub_lock);
4108                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4109                 ret = -EIO;
4110                 goto out;
4111         }
4112
4113         down_read(&fs_info->dev_replace.rwsem);
4114         if (dev->scrub_ctx ||
4115             (!is_dev_replace &&
4116              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4117                 up_read(&fs_info->dev_replace.rwsem);
4118                 mutex_unlock(&fs_info->scrub_lock);
4119                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4120                 ret = -EINPROGRESS;
4121                 goto out;
4122         }
4123         up_read(&fs_info->dev_replace.rwsem);
4124
4125         sctx->readonly = readonly;
4126         dev->scrub_ctx = sctx;
4127         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4128
4129         /*
4130          * By checking @scrub_pause_req here, we can avoid a race between
4131          * transaction commit and scrubbing.
4132          */
4133         __scrub_blocked_if_needed(fs_info);
4134         atomic_inc(&fs_info->scrubs_running);
4135         mutex_unlock(&fs_info->scrub_lock);
4136
4137         /*
4138          * In order to avoid deadlock with reclaim when there is a transaction
4139          * trying to pause scrub, make sure we use GFP_NOFS for all the
4140          * allocations done at scrub_pages() and scrub_pages_for_parity()
4141          * invoked by our callees. The pause request is made when the
4142          * transaction commit starts, and it blocks the transaction until scrub
4143          * is paused (done at specific points in scrub_stripe() or right above,
4144          * before incrementing fs_info->scrubs_running).
4145          */
4146         nofs_flag = memalloc_nofs_save();
4147         if (!is_dev_replace) {
4148                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4149                 /*
4150                  * Hold the device list mutex so that scrubbing the supers
4151                  * cannot race with the super block writes done by a log tree sync.
4152                  */
4153                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4154                 ret = scrub_supers(sctx, dev);
4155                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4156         }
4157
4158         if (!ret)
4159                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4160         memalloc_nofs_restore(nofs_flag);
4161
4162         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4163         atomic_dec(&fs_info->scrubs_running);
4164         wake_up(&fs_info->scrub_pause_wait);
4165
4166         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4167
4168         if (progress)
4169                 memcpy(progress, &sctx->stat, sizeof(*progress));
4170
4171         if (!is_dev_replace)
4172                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4173                         ret ? "not finished" : "finished", devid, ret);
4174
4175         mutex_lock(&fs_info->scrub_lock);
4176         dev->scrub_ctx = NULL;
4177         mutex_unlock(&fs_info->scrub_lock);
4178
4179         scrub_workers_put(fs_info);
4180         scrub_put_ctx(sctx);
4181
4182         return ret;
4183 out:
4184         scrub_workers_put(fs_info);
4185 out_free_ctx:
4186         scrub_free_ctx(sctx);
4187
4188         return ret;
4189 }
4190
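/*
 * Request all running scrubs to pause and wait until each of them has reached
 * a pause point. Used around transaction commit; undone by
 * btrfs_scrub_continue().
 */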
4191 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4192 {
4193         mutex_lock(&fs_info->scrub_lock);
4194         atomic_inc(&fs_info->scrub_pause_req);
4195         while (atomic_read(&fs_info->scrubs_paused) !=
4196                atomic_read(&fs_info->scrubs_running)) {
4197                 mutex_unlock(&fs_info->scrub_lock);
4198                 wait_event(fs_info->scrub_pause_wait,
4199                            atomic_read(&fs_info->scrubs_paused) ==
4200                            atomic_read(&fs_info->scrubs_running));
4201                 mutex_lock(&fs_info->scrub_lock);
4202         }
4203         mutex_unlock(&fs_info->scrub_lock);
4204 }
4205
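/* Drop the pause request taken by btrfs_scrub_pause() and wake up the scrubs. */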
4206 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4207 {
4208         atomic_dec(&fs_info->scrub_pause_req);
4209         wake_up(&fs_info->scrub_pause_wait);
4210 }
4211
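/*
 * Cancel all scrubs running on this filesystem and wait until they have
 * stopped. Returns -ENOTCONN if no scrub was running.
 */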
4212 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4213 {
4214         mutex_lock(&fs_info->scrub_lock);
4215         if (!atomic_read(&fs_info->scrubs_running)) {
4216                 mutex_unlock(&fs_info->scrub_lock);
4217                 return -ENOTCONN;
4218         }
4219
4220         atomic_inc(&fs_info->scrub_cancel_req);
4221         while (atomic_read(&fs_info->scrubs_running)) {
4222                 mutex_unlock(&fs_info->scrub_lock);
4223                 wait_event(fs_info->scrub_pause_wait,
4224                            atomic_read(&fs_info->scrubs_running) == 0);
4225                 mutex_lock(&fs_info->scrub_lock);
4226         }
4227         atomic_dec(&fs_info->scrub_cancel_req);
4228         mutex_unlock(&fs_info->scrub_lock);
4229
4230         return 0;
4231 }
4232
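/*
 * Cancel the scrub running on a single device and wait until its context has
 * been detached. Returns -ENOTCONN if no scrub was running on the device.
 */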
4233 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4234 {
4235         struct btrfs_fs_info *fs_info = dev->fs_info;
4236         struct scrub_ctx *sctx;
4237
4238         mutex_lock(&fs_info->scrub_lock);
4239         sctx = dev->scrub_ctx;
4240         if (!sctx) {
4241                 mutex_unlock(&fs_info->scrub_lock);
4242                 return -ENOTCONN;
4243         }
4244         atomic_inc(&sctx->cancel_req);
4245         while (dev->scrub_ctx) {
4246                 mutex_unlock(&fs_info->scrub_lock);
4247                 wait_event(fs_info->scrub_pause_wait,
4248                            dev->scrub_ctx == NULL);
4249                 mutex_lock(&fs_info->scrub_lock);
4250         }
4251         mutex_unlock(&fs_info->scrub_lock);
4252
4253         return 0;
4254 }
4255
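/*
 * Copy the statistics of the scrub running on @devid into @progress.
 * Returns -ENODEV if the device is not found, -ENOTCONN if no scrub is
 * running on it, 0 otherwise.
 */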
4256 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4257                          struct btrfs_scrub_progress *progress)
4258 {
4259         struct btrfs_dev_lookup_args args = { .devid = devid };
4260         struct btrfs_device *dev;
4261         struct scrub_ctx *sctx = NULL;
4262
4263         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4264         dev = btrfs_find_device(fs_info->fs_devices, &args);
4265         if (dev)
4266                 sctx = dev->scrub_ctx;
4267         if (sctx)
4268                 memcpy(progress, &sctx->stat, sizeof(*progress));
4269         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4270
4271         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4272 }
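
/*
 * A minimal userspace sketch (assuming the BTRFS_IOC_SCRUB_PROGRESS ioctl and
 * struct btrfs_ioctl_scrub_args from include/uapi/linux/btrfs.h) of how this
 * query is typically reached; an illustration only, not a definitive example:
 *
 *	struct btrfs_ioctl_scrub_args sa = { .devid = devid };
 *
 *	if (ioctl(fs_fd, BTRFS_IOC_SCRUB_PROGRESS, &sa) == 0)
 *		printf("data bytes scrubbed: %llu\n",
 *		       (unsigned long long)sa.progress.data_bytes_scrubbed);
 */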
4273
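/*
 * Map @extent_logical to its physical location on the first stripe returned
 * by btrfs_map_block(). The output parameters are left untouched if the
 * mapping fails or covers less than @extent_len bytes.
 */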
4274 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4275                                u64 extent_logical, u32 extent_len,
4276                                u64 *extent_physical,
4277                                struct btrfs_device **extent_dev,
4278                                int *extent_mirror_num)
4279 {
4280         u64 mapped_length;
4281         struct btrfs_io_context *bioc = NULL;
4282         int ret;
4283
4284         mapped_length = extent_len;
4285         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4286                               &mapped_length, &bioc, 0);
4287         if (ret || !bioc || mapped_length < extent_len ||
4288             !bioc->stripes[0].dev->bdev) {
4289                 btrfs_put_bioc(bioc);
4290                 return;
4291         }
4292
4293         *extent_physical = bioc->stripes[0].physical;
4294         *extent_mirror_num = bioc->mirror_num;
4295         *extent_dev = bioc->stripes[0].dev;
4296         btrfs_put_bioc(bioc);
4297 }