fs/btrfs/scrub.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   4  */
   5
   6 #include <linux/blkdev.h>
   7 #include <linux/ratelimit.h>
   8 #include <linux/sched/mm.h>
   9 #include <crypto/hash.h>
  10 #include "ctree.h"
  11 #include "discard.h"
  12 #include "volumes.h"
  13 #include "disk-io.h"
  14 #include "ordered-data.h"
  15 #include "transaction.h"
  16 #include "backref.h"
  17 #include "extent_io.h"
  18 #include "dev-replace.h"
  19 #include "check-integrity.h"
  20 #include "raid56.h"
  21 #include "block-group.h"
  22 #include "zoned.h"
  23 #include "fs.h"
  24 #include "accessors.h"
  25 #include "file-item.h"
  26 #include "scrub.h"
  27
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies their checksums. In case a bad
 * checksum is found or an extent cannot be read, good data will be written
 * back if any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
  40
  41 struct scrub_block;
  42 struct scrub_ctx;
  43
/*
 * The following two values only influence the performance.
 *
 * SCRUB_SECTORS_PER_BIO is an upper limit for the number of (dynamically
 * allocated) sectors that are added to a bio; it sizes scrub_bio::sectors[].
 *
 * SCRUB_BIOS_PER_SCTX configures the number of parallel and outstanding I/O
 * operations; it sizes scrub_ctx::bios[].
 */
#define SCRUB_SECTORS_PER_BIO   32      /* 128KiB per bio for 4KiB pages */
#define SCRUB_BIOS_PER_SCTX     64      /* 8MiB per device in flight for 4KiB pages */
  53
/*
 * Upper bound for the number of sectors in one scrub_block. It is the largest
 * supported node/leaf size divided by 4KiB (the divisor is SZ_4K, not
 * PAGE_SIZE), and sizes scrub_block::sectors[].
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK     (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

/*
 * Number of pages needed to hold the largest supported node/leaf, rounded up.
 * Sizes scrub_block::pages[].
 */
#define SCRUB_MAX_PAGES                 (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
  61
/*
 * Maximum number of mirrors that can be available for all profiles counting
 * the target device of dev-replace as one. During an active device replace
 * procedure, the target device of the copy operation is a mirror for the
 * filesystem data as well that can be used to read data in order to repair
 * read errors on other disks.
 *
 * Current value is derived from RAID1C4 with 4 copies, plus 1 for the
 * dev-replace target.
 */
#define BTRFS_MAX_MIRRORS (4 + 1)
  72
/*
 * Reference-counted recovery information that a scrub_sector can point to
 * through scrub_sector::recover.
 */
struct scrub_recover {
        refcount_t              refs;
        /* I/O context; its map_type is tested by scrub_is_page_on_raid56() */
        struct btrfs_io_context *bioc;
        u64                     map_length;
};
  78
/*
 * One sector tracked by scrub.
 *
 * A sector is attached to a parent scrub_block and does not carry its own
 * data page: scrub_sector_get_page() looks the page up in the parent's pages[]
 * array using @offset.
 */
struct scrub_sector {
        /* Parent block, set by alloc_scrub_sector() */
        struct scrub_block      *sblock;
        struct list_head        list;
        u64                     flags;  /* extent flags */
        u64                     generation;
        /* Offset in bytes to @sblock. */
        u32                     offset;
        /* Set to 1 by alloc_scrub_sector() */
        atomic_t                refs;
        unsigned int            have_csum:1;
        unsigned int            io_error:1;
        u8                      csum[BTRFS_CSUM_SIZE];

        /* May be NULL; scrub_is_page_on_raid56() checks before using it */
        struct scrub_recover    *recover;
};
  93
/*
 * Bookkeeping for one bio used by scrub.
 *
 * scrub_setup_ctx() preallocates SCRUB_BIOS_PER_SCTX of these in
 * scrub_ctx::bios[]; scrub_ctx::wr_curr_bio has this type as well.
 */
struct scrub_bio {
        /* Position of this entry in scrub_ctx::bios[] */
        int                     index;
        struct scrub_ctx        *sctx;
        struct btrfs_device     *dev;
        struct bio              *bio;
        blk_status_t            status;
        u64                     logical;
        u64                     physical;
        /* Sectors attached to this bio; the first @sector_count are in use */
        struct scrub_sector     *sectors[SCRUB_SECTORS_PER_BIO];
        int                     sector_count;
        /*
         * Index of the next entry in the chain of scrub_bios, or -1 for the
         * last one. scrub_setup_ctx() chains all entries in index order.
         */
        int                     next_free;
        /* Set up with scrub_bio_end_io_worker() for entries of bios[] */
        struct work_struct      work;
};
 107
/*
 * A run of sectors that scrub handles as one unit.
 *
 * Created by alloc_scrub_block() and filled one sector at a time by
 * alloc_scrub_sector(), which also allocates the backing pages on demand.
 */
struct scrub_block {
        /*
         * Each page will have its page::private used to record the logical
         * bytenr.
         */
        struct page             *pages[SCRUB_MAX_PAGES];
        /* Attached sectors; the first @sector_count slots are in use */
        struct scrub_sector     *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
        struct btrfs_device     *dev;
        /* Logical bytenr of the sblock */
        u64                     logical;
        u64                     physical;
        u64                     physical_for_dev_replace;
        /* Length of sblock in bytes */
        u32                     len;
        /* Grows by one for every alloc_scrub_sector() call that succeeds */
        int                     sector_count;
        int                     mirror_num;

        atomic_t                outstanding_sectors;
        refcount_t              refs; /* free mem on transition to zero */
        struct scrub_ctx        *sctx;
        struct scrub_parity     *sparity;
        struct {
                unsigned int    header_error:1;
                unsigned int    checksum_error:1;
                /* Starts out as 1 in alloc_scrub_block() */
                unsigned int    no_io_error_seen:1;
                unsigned int    generation_error:1; /* also sets header_error */

                /* The following is for the data used to check parity */
                /* It is for the data with checksum */
                unsigned int    data_corrected:1;
        };
        struct work_struct      work;
};
 141
/* Used for the chunks with parity stripes, such as RAID5/6 */
struct scrub_parity {
        struct scrub_ctx        *sctx;

        struct btrfs_device     *scrub_dev;

        u64                     logic_start;

        u64                     logic_end;

        int                     nsectors;

        u32                     stripe_len;

        refcount_t              refs;

        struct list_head        sectors_list;

        /* Work of parity check and repair */
        struct work_struct      work;

        /* Mark the parity blocks which have data */
        unsigned long           dbitmap;

        /*
         * Mark the parity blocks which have data, but where errors happened
         * when reading or checking that data
         */
        unsigned long           ebitmap;
};
 172
/*
 * Per-scrub context, created by scrub_setup_ctx() and released through
 * scrub_put_ctx() once the last reference is dropped.
 */
struct scrub_ctx {
        /* Preallocated scrub_bios, all allocated in scrub_setup_ctx() */
        struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
        struct btrfs_fs_info    *fs_info;
        /* Index into bios[] heading the next_free chain; initially 0 */
        int                     first_free;
        /*
         * Index into bios[] of the current bio, or -1 if there is none.
         * scrub_free_ctx() releases that bio when it is not -1.
         */
        int                     curr;
        atomic_t                bios_in_flight;
        atomic_t                workers_pending;
        spinlock_t              list_lock;
        wait_queue_head_t       list_wait;
        /* List of struct btrfs_ordered_sum, freed by scrub_free_csums() */
        struct list_head        csum_list;
        atomic_t                cancel_req;
        int                     readonly;
        /* Initialised to SCRUB_SECTORS_PER_BIO */
        int                     sectors_per_bio;

        /* State of IO submission throttling affecting the associated device */
        ktime_t                 throttle_deadline;
        u64                     throttle_sent;

        int                     is_dev_replace;
        u64                     write_pointer;

        struct scrub_bio        *wr_curr_bio;
        struct mutex            wr_lock;
        /* Set to fs_info->dev_replace.tgtdev when @is_dev_replace is set */
        struct btrfs_device     *wr_tgtdev;
        bool                    flush_all_writes;

        /*
         * statistics
         */
        struct btrfs_scrub_progress stat;
        spinlock_t              stat_lock;

        /*
         * Use a ref counter to avoid use-after-free issues. Scrub workers
         * decrement bios_in_flight and workers_pending and then do a wakeup
         * on the list_wait wait queue. We must ensure the main scrub task
         * doesn't free the scrub context before or while the workers are
         * doing the wakeup() call.
         */
        refcount_t              refs;
};
 214
/*
 * Context handed to scrub_print_warning_inode() as its warn_ctx argument.
 * The fields describe the location being reported in the warning message.
 */
struct scrub_warning {
        /* Path used (and released again) for the tree searches */
        struct btrfs_path       *path;
        u64                     extent_item_size;
        /* Text that starts the warning message */
        const char              *errstr;
        u64                     physical;
        u64                     logical;
        struct btrfs_device     *dev;
};
 223
/*
 * One entry of a block group's full stripe locks rb-tree.
 *
 * Entries are created by insert_full_stripe_lock() and freed by
 * unlock_full_stripe() when @refs drops to zero.
 */
struct full_stripe_lock {
        struct rb_node node;
        /* Search key of the tree: logical start of the full stripe */
        u64 logical;
        /* 1 at creation, +1 per further insert_full_stripe_lock() hit */
        u64 refs;
        /* Taken by lock_full_stripe(), released by unlock_full_stripe() */
        struct mutex mutex;
};
 230
#ifndef CONFIG_64BIT
/*
 * This structure is for architectures whose (void *) is smaller than u64.
 * It is allocated by attach_scrub_page_private() and freed by
 * detach_scrub_page_private().
 */
struct scrub_page_private {
        u64 logical;
};
#endif
 237
 238 static int attach_scrub_page_private(struct page *page, u64 logical)
 239 {
 240 #ifdef CONFIG_64BIT
 241         attach_page_private(page, (void *)logical);
 242         return 0;
 243 #else
 244         struct scrub_page_private *spp;
 245
 246         spp = kmalloc(sizeof(*spp), GFP_KERNEL);
 247         if (!spp)
 248                 return -ENOMEM;
 249         spp->logical = logical;
 250         attach_page_private(page, (void *)spp);
 251         return 0;
 252 #endif
 253 }
 254
 255 static void detach_scrub_page_private(struct page *page)
 256 {
 257 #ifdef CONFIG_64BIT
 258         detach_page_private(page);
 259         return;
 260 #else
 261         struct scrub_page_private *spp;
 262
 263         spp = detach_page_private(page);
 264         kfree(spp);
 265         return;
 266 #endif
 267 }
 268
 269 static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
 270                                              struct btrfs_device *dev,
 271                                              u64 logical, u64 physical,
 272                                              u64 physical_for_dev_replace,
 273                                              int mirror_num)
 274 {
 275         struct scrub_block *sblock;
 276
 277         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
 278         if (!sblock)
 279                 return NULL;
 280         refcount_set(&sblock->refs, 1);
 281         sblock->sctx = sctx;
 282         sblock->logical = logical;
 283         sblock->physical = physical;
 284         sblock->physical_for_dev_replace = physical_for_dev_replace;
 285         sblock->dev = dev;
 286         sblock->mirror_num = mirror_num;
 287         sblock->no_io_error_seen = 1;
 288         /*
 289          * Scrub_block::pages will be allocated at alloc_scrub_sector() when
 290          * the corresponding page is not allocated.
 291          */
 292         return sblock;
 293 }
 294
 295 /*
 296  * Allocate a new scrub sector and attach it to @sblock.
 297  *
 298  * Will also allocate new pages for @sblock if needed.
 299  */
 300 static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
 301                                                u64 logical)
 302 {
 303         const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
 304         struct scrub_sector *ssector;
 305
 306         /* We must never have scrub_block exceed U32_MAX in size. */
 307         ASSERT(logical - sblock->logical < U32_MAX);
 308
 309         ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
 310         if (!ssector)
 311                 return NULL;
 312
 313         /* Allocate a new page if the slot is not allocated */
 314         if (!sblock->pages[page_index]) {
 315                 int ret;
 316
 317                 sblock->pages[page_index] = alloc_page(GFP_KERNEL);
 318                 if (!sblock->pages[page_index]) {
 319                         kfree(ssector);
 320                         return NULL;
 321                 }
 322                 ret = attach_scrub_page_private(sblock->pages[page_index],
 323                                 sblock->logical + (page_index << PAGE_SHIFT));
 324                 if (ret < 0) {
 325                         kfree(ssector);
 326                         __free_page(sblock->pages[page_index]);
 327                         sblock->pages[page_index] = NULL;
 328                         return NULL;
 329                 }
 330         }
 331
 332         atomic_set(&ssector->refs, 1);
 333         ssector->sblock = sblock;
 334         /* The sector to be added should not be used */
 335         ASSERT(sblock->sectors[sblock->sector_count] == NULL);
 336         ssector->offset = logical - sblock->logical;
 337
 338         /* The sector count must be smaller than the limit */
 339         ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
 340
 341         sblock->sectors[sblock->sector_count] = ssector;
 342         sblock->sector_count++;
 343         sblock->len += sblock->sctx->fs_info->sectorsize;
 344
 345         return ssector;
 346 }
 347
 348 static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
 349 {
 350         struct scrub_block *sblock = ssector->sblock;
 351         pgoff_t index;
 352         /*
 353          * When calling this function, ssector must be alreaday attached to the
 354          * parent sblock.
 355          */
 356         ASSERT(sblock);
 357
 358         /* The range should be inside the sblock range */
 359         ASSERT(ssector->offset < sblock->len);
 360
 361         index = ssector->offset >> PAGE_SHIFT;
 362         ASSERT(index < SCRUB_MAX_PAGES);
 363         ASSERT(sblock->pages[index]);
 364         ASSERT(PagePrivate(sblock->pages[index]));
 365         return sblock->pages[index];
 366 }
 367
 368 static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
 369 {
 370         struct scrub_block *sblock = ssector->sblock;
 371
 372         /*
 373          * When calling this function, ssector must be already attached to the
 374          * parent sblock.
 375          */
 376         ASSERT(sblock);
 377
 378         /* The range should be inside the sblock range */
 379         ASSERT(ssector->offset < sblock->len);
 380
 381         return offset_in_page(ssector->offset);
 382 }
 383
 384 static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
 385 {
 386         return page_address(scrub_sector_get_page(ssector)) +
 387                scrub_sector_get_page_offset(ssector);
 388 }
 389
 390 static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
 391                                 unsigned int len)
 392 {
 393         return bio_add_page(bio, scrub_sector_get_page(ssector), len,
 394                             scrub_sector_get_page_offset(ssector));
 395 }
 396
 397 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 398                                      struct scrub_block *sblocks_for_recheck[]);
 399 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 400                                 struct scrub_block *sblock,
 401                                 int retry_failed_mirror);
 402 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
 403 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 404                                              struct scrub_block *sblock_good);
 405 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
 406                                             struct scrub_block *sblock_good,
 407                                             int sector_num, int force_write);
 408 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 409 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
 410                                              int sector_num);
 411 static int scrub_checksum_data(struct scrub_block *sblock);
 412 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 413 static int scrub_checksum_super(struct scrub_block *sblock);
 414 static void scrub_block_put(struct scrub_block *sblock);
 415 static void scrub_sector_get(struct scrub_sector *sector);
 416 static void scrub_sector_put(struct scrub_sector *sector);
 417 static void scrub_parity_get(struct scrub_parity *sparity);
 418 static void scrub_parity_put(struct scrub_parity *sparity);
 419 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
 420                          u64 physical, struct btrfs_device *dev, u64 flags,
 421                          u64 gen, int mirror_num, u8 *csum,
 422                          u64 physical_for_dev_replace);
 423 static void scrub_bio_end_io(struct bio *bio);
 424 static void scrub_bio_end_io_worker(struct work_struct *work);
 425 static void scrub_block_complete(struct scrub_block *sblock);
 426 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
 427                                  u64 extent_logical, u32 extent_len,
 428                                  u64 *extent_physical,
 429                                  struct btrfs_device **extent_dev,
 430                                  int *extent_mirror_num);
 431 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
 432                                       struct scrub_sector *sector);
 433 static void scrub_wr_submit(struct scrub_ctx *sctx);
 434 static void scrub_wr_bio_end_io(struct bio *bio);
 435 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
 436 static void scrub_put_ctx(struct scrub_ctx *sctx);
 437
 438 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
 439 {
 440         return sector->recover &&
 441                (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
 442 }
 443
/*
 * Account for one more bio in flight on @sctx.
 *
 * Also takes a reference on @sctx; scrub_pending_bio_dec() drops it again.
 */
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
        refcount_inc(&sctx->refs);
        atomic_inc(&sctx->bios_in_flight);
}
 449
/*
 * Account for one bio less in flight on @sctx and wake up waiters on
 * list_wait.
 *
 * The context reference is dropped last, after the wake_up() call, so @sctx
 * cannot be freed while it is still being used here.
 */
static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
        atomic_dec(&sctx->bios_in_flight);
        wake_up(&sctx->list_wait);
        scrub_put_ctx(sctx);
}
 456
/*
 * Wait until fs_info->scrub_pause_req is zero.
 *
 * Called with fs_info->scrub_lock held (as scrub_pause_off() does). The lock
 * is dropped while sleeping and taken again before the request counter is
 * rechecked, so it is held again on return.
 */
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
        while (atomic_read(&fs_info->scrub_pause_req)) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                   atomic_read(&fs_info->scrub_pause_req) == 0);
                mutex_lock(&fs_info->scrub_lock);
        }
}
 466
/*
 * Count the caller as a paused scrub and wake up everybody waiting on
 * fs_info->scrub_pause_wait.
 */
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
        atomic_inc(&fs_info->scrubs_paused);
        wake_up(&fs_info->scrub_pause_wait);
}
 472
/*
 * Leave the paused state entered with scrub_pause_on().
 *
 * Blocks for as long as a pause request is pending, then decrements
 * fs_info->scrubs_paused under scrub_lock and wakes up the waiters on
 * fs_info->scrub_pause_wait.
 */
static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->scrub_lock);
        __scrub_blocked_if_needed(fs_info);
        atomic_dec(&fs_info->scrubs_paused);
        mutex_unlock(&fs_info->scrub_lock);

        wake_up(&fs_info->scrub_pause_wait);
}
 482
/*
 * Enter the paused state and leave it again right away. Sleeps in between
 * only while fs_info->scrub_pause_req is non-zero.
 */
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
        scrub_pause_on(fs_info);
        scrub_pause_off(fs_info);
}
 488
 489 /*
 490  * Insert new full stripe lock into full stripe locks tree
 491  *
 492  * Return pointer to existing or newly inserted full_stripe_lock structure if
 493  * everything works well.
 494  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 495  *
 496  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 497  * function
 498  */
 499 static struct full_stripe_lock *insert_full_stripe_lock(
 500                 struct btrfs_full_stripe_locks_tree *locks_root,
 501                 u64 fstripe_logical)
 502 {
 503         struct rb_node **p;
 504         struct rb_node *parent = NULL;
 505         struct full_stripe_lock *entry;
 506         struct full_stripe_lock *ret;
 507
 508         lockdep_assert_held(&locks_root->lock);
 509
 510         p = &locks_root->root.rb_node;
 511         while (*p) {
 512                 parent = *p;
 513                 entry = rb_entry(parent, struct full_stripe_lock, node);
 514                 if (fstripe_logical < entry->logical) {
 515                         p = &(*p)->rb_left;
 516                 } else if (fstripe_logical > entry->logical) {
 517                         p = &(*p)->rb_right;
 518                 } else {
 519                         entry->refs++;
 520                         return entry;
 521                 }
 522         }
 523
 524         /*
 525          * Insert new lock.
 526          */
 527         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
 528         if (!ret)
 529                 return ERR_PTR(-ENOMEM);
 530         ret->logical = fstripe_logical;
 531         ret->refs = 1;
 532         mutex_init(&ret->mutex);
 533
 534         rb_link_node(&ret->node, parent, p);
 535         rb_insert_color(&ret->node, &locks_root->root);
 536         return ret;
 537 }
 538
 539 /*
 540  * Search for a full stripe lock of a block group
 541  *
 542  * Return pointer to existing full stripe lock if found
 543  * Return NULL if not found
 544  */
 545 static struct full_stripe_lock *search_full_stripe_lock(
 546                 struct btrfs_full_stripe_locks_tree *locks_root,
 547                 u64 fstripe_logical)
 548 {
 549         struct rb_node *node;
 550         struct full_stripe_lock *entry;
 551
 552         lockdep_assert_held(&locks_root->lock);
 553
 554         node = locks_root->root.rb_node;
 555         while (node) {
 556                 entry = rb_entry(node, struct full_stripe_lock, node);
 557                 if (fstripe_logical < entry->logical)
 558                         node = node->rb_left;
 559                 else if (fstripe_logical > entry->logical)
 560                         node = node->rb_right;
 561                 else
 562                         return entry;
 563         }
 564         return NULL;
 565 }
 566
 567 /*
 568  * Helper to get full stripe logical from a normal bytenr.
 569  *
 570  * Caller must ensure @cache is a RAID56 block group.
 571  */
 572 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
 573 {
 574         u64 ret;
 575
 576         /*
 577          * Due to chunk item size limit, full stripe length should not be
 578          * larger than U32_MAX. Just a sanity check here.
 579          */
 580         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
 581
 582         /*
 583          * round_down() can only handle power of 2, while RAID56 full
 584          * stripe length can be 64KiB * n, so we need to manually round down.
 585          */
 586         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
 587                         cache->full_stripe_len + cache->start;
 588         return ret;
 589 }
 590
 591 /*
 592  * Lock a full stripe to avoid concurrency of recovery and read
 593  *
 594  * It's only used for profiles with parities (RAID5/6), for other profiles it
 595  * does nothing.
 596  *
 597  * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
 598  * So caller must call unlock_full_stripe() at the same context.
 599  *
 600  * Return <0 if encounters error.
 601  */
 602 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 603                             bool *locked_ret)
 604 {
 605         struct btrfs_block_group *bg_cache;
 606         struct btrfs_full_stripe_locks_tree *locks_root;
 607         struct full_stripe_lock *existing;
 608         u64 fstripe_start;
 609         int ret = 0;
 610
 611         *locked_ret = false;
 612         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
 613         if (!bg_cache) {
 614                 ASSERT(0);
 615                 return -ENOENT;
 616         }
 617
 618         /* Profiles not based on parity don't need full stripe lock */
 619         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 620                 goto out;
 621         locks_root = &bg_cache->full_stripe_locks_root;
 622
 623         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
 624
 625         /* Now insert the full stripe lock */
 626         mutex_lock(&locks_root->lock);
 627         existing = insert_full_stripe_lock(locks_root, fstripe_start);
 628         mutex_unlock(&locks_root->lock);
 629         if (IS_ERR(existing)) {
 630                 ret = PTR_ERR(existing);
 631                 goto out;
 632         }
 633         mutex_lock(&existing->mutex);
 634         *locked_ret = true;
 635 out:
 636         btrfs_put_block_group(bg_cache);
 637         return ret;
 638 }
 639
 640 /*
 641  * Unlock a full stripe.
 642  *
 643  * NOTE: Caller must ensure it's the same context calling corresponding
 644  * lock_full_stripe().
 645  *
 646  * Return 0 if we unlock full stripe without problem.
 647  * Return <0 for error
 648  */
 649 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 650                               bool locked)
 651 {
 652         struct btrfs_block_group *bg_cache;
 653         struct btrfs_full_stripe_locks_tree *locks_root;
 654         struct full_stripe_lock *fstripe_lock;
 655         u64 fstripe_start;
 656         bool freeit = false;
 657         int ret = 0;
 658
 659         /* If we didn't acquire full stripe lock, no need to continue */
 660         if (!locked)
 661                 return 0;
 662
 663         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
 664         if (!bg_cache) {
 665                 ASSERT(0);
 666                 return -ENOENT;
 667         }
 668         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 669                 goto out;
 670
 671         locks_root = &bg_cache->full_stripe_locks_root;
 672         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
 673
 674         mutex_lock(&locks_root->lock);
 675         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
 676         /* Unpaired unlock_full_stripe() detected */
 677         if (!fstripe_lock) {
 678                 WARN_ON(1);
 679                 ret = -ENOENT;
 680                 mutex_unlock(&locks_root->lock);
 681                 goto out;
 682         }
 683
 684         if (fstripe_lock->refs == 0) {
 685                 WARN_ON(1);
 686                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
 687                         fstripe_lock->logical);
 688         } else {
 689                 fstripe_lock->refs--;
 690         }
 691
 692         if (fstripe_lock->refs == 0) {
 693                 rb_erase(&fstripe_lock->node, &locks_root->root);
 694                 freeit = true;
 695         }
 696         mutex_unlock(&locks_root->lock);
 697
 698         mutex_unlock(&fstripe_lock->mutex);
 699         if (freeit)
 700                 kfree(fstripe_lock);
 701 out:
 702         btrfs_put_block_group(bg_cache);
 703         return ret;
 704 }
 705
 706 static void scrub_free_csums(struct scrub_ctx *sctx)
 707 {
 708         while (!list_empty(&sctx->csum_list)) {
 709                 struct btrfs_ordered_sum *sum;
 710                 sum = list_first_entry(&sctx->csum_list,
 711                                        struct btrfs_ordered_sum, list);
 712                 list_del(&sum->list);
 713                 kfree(sum);
 714         }
 715 }
 716
 717 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 718 {
 719         int i;
 720
 721         if (!sctx)
 722                 return;
 723
 724         /* this can happen when scrub is cancelled */
 725         if (sctx->curr != -1) {
 726                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
 727
 728                 for (i = 0; i < sbio->sector_count; i++)
 729                         scrub_block_put(sbio->sectors[i]->sblock);
 730                 bio_put(sbio->bio);
 731         }
 732
 733         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 734                 struct scrub_bio *sbio = sctx->bios[i];
 735
 736                 if (!sbio)
 737                         break;
 738                 kfree(sbio);
 739         }
 740
 741         kfree(sctx->wr_curr_bio);
 742         scrub_free_csums(sctx);
 743         kfree(sctx);
 744 }
 745
 746 static void scrub_put_ctx(struct scrub_ctx *sctx)
 747 {
 748         if (refcount_dec_and_test(&sctx->refs))
 749                 scrub_free_ctx(sctx);
 750 }
 751
 752 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 753                 struct btrfs_fs_info *fs_info, int is_dev_replace)
 754 {
 755         struct scrub_ctx *sctx;
 756         int             i;
 757
 758         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
 759         if (!sctx)
 760                 goto nomem;
 761         refcount_set(&sctx->refs, 1);
 762         sctx->is_dev_replace = is_dev_replace;
 763         sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
 764         sctx->curr = -1;
 765         sctx->fs_info = fs_info;
 766         INIT_LIST_HEAD(&sctx->csum_list);
 767         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 768                 struct scrub_bio *sbio;
 769
 770                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
 771                 if (!sbio)
 772                         goto nomem;
 773                 sctx->bios[i] = sbio;
 774
 775                 sbio->index = i;
 776                 sbio->sctx = sctx;
 777                 sbio->sector_count = 0;
 778                 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
 779
 780                 if (i != SCRUB_BIOS_PER_SCTX - 1)
 781                         sctx->bios[i]->next_free = i + 1;
 782                 else
 783                         sctx->bios[i]->next_free = -1;
 784         }
 785         sctx->first_free = 0;
 786         atomic_set(&sctx->bios_in_flight, 0);
 787         atomic_set(&sctx->workers_pending, 0);
 788         atomic_set(&sctx->cancel_req, 0);
 789
 790         spin_lock_init(&sctx->list_lock);
 791         spin_lock_init(&sctx->stat_lock);
 792         init_waitqueue_head(&sctx->list_wait);
 793         sctx->throttle_deadline = 0;
 794
 795         WARN_ON(sctx->wr_curr_bio != NULL);
 796         mutex_init(&sctx->wr_lock);
 797         sctx->wr_curr_bio = NULL;
 798         if (is_dev_replace) {
 799                 WARN_ON(!fs_info->dev_replace.tgtdev);
 800                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
 801                 sctx->flush_all_writes = false;
 802         }
 803
 804         return sctx;
 805
 806 nomem:
 807         scrub_free_ctx(sctx);
 808         return ERR_PTR(-ENOMEM);
 809 }
 810
     /*
      * Callback handed to iterate_extent_inodes() by scrub_print_warning():
      * report one (root, inode, offset) reference to the damaged extent.
      *
      * @inum:      inode number that references the extent
      * @offset:    file offset of the reference, printed as-is
      * @num_bytes: not used by this function
      * @root:      id of the root that contains the inode
      * @warn_ctx:  the struct scrub_warning prepared by the caller; its path is
      *             used for the inode lookup and released again before returning
      *
      * One warning is printed per resolved file path.  If any lookup step fails
      * a single "path resolving failed" warning is printed instead.
      *
      * Return: always 0.
      */
 811 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
 812                                      u64 root, void *warn_ctx)
 813 {
 814         u32 nlink;
 815         int ret;
 816         int i;
 817         unsigned nofs_flag;
 818         struct extent_buffer *eb;
 819         struct btrfs_inode_item *inode_item;
 820         struct scrub_warning *swarn = warn_ctx;
 821         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
 822         struct inode_fs_paths *ipath = NULL;
 823         struct btrfs_root *local_root;
 824         struct btrfs_key key;
 825
 826         local_root = btrfs_get_fs_root(fs_info, root, true);
 827         if (IS_ERR(local_root)) {
 828                 ret = PTR_ERR(local_root);
 829                 goto err;
 830         }
 831
 832         /*
 833          * Position swarn->path at the inode item (inum INODE_ITEM 0).
 834          */
 835         key.objectid = inum;
 836         key.type = BTRFS_INODE_ITEM_KEY;
 837         key.offset = 0;
 838
 839         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
 840         if (ret) {
 841                 btrfs_release_path(swarn->path);
 842                 goto err_put_root;
 843         }
 844
 845         eb = swarn->path->nodes[0];
 846         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 847                                         struct btrfs_inode_item);
 848         nlink = btrfs_inode_nlink(eb, inode_item);
 849         btrfs_release_path(swarn->path);
 850
 851         /*
 852          * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
 853          * uses GFP_NOFS in this context, so we keep it consistent but it does
 854          * not seem to be strictly necessary.
 855          */
 856         nofs_flag = memalloc_nofs_save();
 857         ipath = init_ipath(4096, local_root, swarn->path);
 858         memalloc_nofs_restore(nofs_flag);
 859         if (IS_ERR(ipath)) {
 860                 ret = PTR_ERR(ipath);
 861                 ipath = NULL;
 862                 goto err_put_root;
 863         }
 864         ret = paths_from_inode(inum, ipath);
 865
 866         if (ret < 0)
 867                 goto err_put_root;
 868
 869         /*
 870          * we deliberately ignore the bit ipath might have been too small to
 871          * hold all of the paths here
 872          */
 873         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 874                 btrfs_warn_in_rcu(fs_info,
 875 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
 876                                   swarn->errstr, swarn->logical,
 877                                   btrfs_dev_name(swarn->dev),
 878                                   swarn->physical,
 879                                   root, inum, offset,
 880                                   fs_info->sectorsize, nlink,
 881                                   (char *)(unsigned long)ipath->fspath->val[i]);
 882
 883         btrfs_put_root(local_root);
 884         free_ipath(ipath);
 885         return 0;
 886
 887 err_put_root:
             /*
              * Shared exit for every failure that happens while the root
              * reference is held.  The paths_from_inode() failure used to jump
              * straight to err and leaked that reference.
              */
 888         btrfs_put_root(local_root);
 889 err:
 890         btrfs_warn_in_rcu(fs_info,
 891                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
 892                           swarn->errstr, swarn->logical,
 893                           btrfs_dev_name(swarn->dev),
 894                           swarn->physical,
 895                           root, inum, offset, ret);
 896
 897         free_ipath(ipath);
 898         return 0;
 899 }
 900
     /*
      * Log where a failing scrub block lives and, where possible, who owns it.
      *
      * @errstr: short description of the failure, used as the message prefix
      * @sblock: the failing block; its first sector's flags select the report
      *
      *  - super block:  one line with device and physical offset only
      *  - tree block:   one line per result of tree_backref_for_extent()
      *  - anything else: iterate_extent_inodes() calls
      *                  scrub_print_warning_inode() for each referencing inode
      *
      * Nothing is printed for non-super blocks when the path allocation or
      * extent_from_logical() fails.
      */
 901 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 902 {
 903         struct btrfs_device *dev;
 904         struct btrfs_fs_info *fs_info;
 905         struct btrfs_path *path;
 906         struct btrfs_key found_key;
 907         struct extent_buffer *eb;
 908         struct btrfs_extent_item *ei;
 909         struct scrub_warning swarn;
 910         unsigned long ptr = 0;
 911         u64 flags = 0;
             /*
              * Pre-initialised: the warning below reads ref_root whenever
              * ret >= 0, whether or not the helper stored a value.
              */
 912         u64 ref_root = 0;
 913         u32 item_size;
 914         u8 ref_level = 0;
 915         int ret;
 916
 917         WARN_ON(sblock->sector_count < 1);
 918         dev = sblock->dev;
 919         fs_info = sblock->sctx->fs_info;
 920
 921         /* Super block error, no need to search extent tree. */
 922         if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 923                 btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
 924                         errstr, btrfs_dev_name(dev), sblock->physical);
 925                 return;
 926         }
 927         path = btrfs_alloc_path();
 928         if (!path)
 929                 return;
 930
 931         swarn.physical = sblock->physical;
 932         swarn.logical = sblock->logical;
 933         swarn.errstr = errstr;
 934         swarn.dev = NULL;
 935
 936         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 937                                   &flags);
 938         if (ret < 0)
 939                 goto out;
 940
 941         swarn.extent_item_size = found_key.offset;
 942
 943         eb = path->nodes[0];
 944         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 945         item_size = btrfs_item_size(eb, path->slots[0]);
 946
 947         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                     /*
                      * Keep asking for the next backref only while the helper
                      * returns 0.  The loop used to exit on ret == 1 alone, so
                      * a negative return that repeats on the next call kept
                      * the loop (and the warning) going.  A failing call is
                      * still reported once, with level/tree printed as -1.
                      */
 948                 do {
 949                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 950                                                       item_size, &ref_root,
 951                                                       &ref_level);
 952                         btrfs_warn_in_rcu(fs_info,
 953 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
 954                                 errstr, swarn.logical,
 955                                 btrfs_dev_name(dev),
 956                                 swarn.physical,
 957                                 ref_level ? "node" : "leaf",
 958                                 ret < 0 ? -1 : ref_level,
 959                                 ret < 0 ? -1 : ref_root);
 960                 } while (ret == 0);
 961                 btrfs_release_path(path);
 962         } else {
 963                 struct btrfs_backref_walk_ctx ctx = { 0 };
 964
 965                 btrfs_release_path(path);
 966
 967                 ctx.bytenr = found_key.objectid;
 968                 ctx.extent_item_pos = swarn.logical - found_key.objectid;
 969                 ctx.fs_info = fs_info;
 970
                     /* The callback reuses this path and reads swarn.dev. */
 971                 swarn.path = path;
 972                 swarn.dev = dev;
 973
 974                 iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
 975         }
 976
 977 out:
 978         btrfs_free_path(path);
 979 }
 980
     /*
      * Take an additional reference on @recover (increments recover->refs);
      * scrub_put_recover() is the matching release.
      */
 981 static inline void scrub_get_recover(struct scrub_recover *recover)
 982 {
 983         refcount_inc(&recover->refs);
 984 }
 985
     /*
      * Drop one reference on @recover.
      *
      * When the count reaches zero the fs bio counter is decremented,
      * recover->bioc is put and @recover itself is freed, so the caller must
      * not touch @recover afterwards.
      */
 986 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
 987                                      struct scrub_recover *recover)
 988 {
 989         if (!refcount_dec_and_test(&recover->refs))
 990                 return;

             /* That was the last reference: tear everything down. */
 991         btrfs_bio_counter_dec(fs_info);
 992         btrfs_put_bioc(recover->bioc);
 993         kfree(recover);
 994 }
 995
 996 /*
 997  * scrub_handle_errored_block gets called when either verification of the
 998  * sectors failed or the bio failed to read, e.g. with EIO. In the latter
 999  * case, this function handles all sectors in the bio, even though only one
1000  * may be bad.
1001  * The goal of this function is to repair the errored block by using the
1002  * contents of one of the mirrors.
      *
      * Outline:
      *  1. A super block error is only reported and counted.
      *  2. Take the full stripe lock and allocate one recheck block per mirror.
      *  3. Re-read the failed mirror; if the error is gone, count it as
      *     "unverified" and stop.
      *  4. Otherwise account the error by kind (I/O, checksum, header) and, for
      *     a read-only scrub, stop.
      *  5. Look for another mirror that is entirely good and repair from it.
      *  6. Failing that, repair sector by sector and re-verify where a checksum
      *     or metadata header allows it.
      *
      * Return: 0, or the negative value returned by lock_full_stripe() or
      * unlock_full_stripe().  All other failures are only recorded in
      * sctx->stat and the device statistics.
1003  */
1004 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1005 {
1006         struct scrub_ctx *sctx = sblock_to_check->sctx;
1007         struct btrfs_device *dev = sblock_to_check->dev;
1008         struct btrfs_fs_info *fs_info;
1009         u64 logical;
1010         unsigned int failed_mirror_index;
1011         unsigned int is_metadata;
1012         unsigned int have_csum;
1013         /* One scrub_block for each mirror */
1014         struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
1015         struct scrub_block *sblock_bad;
1016         int ret;
1017         int mirror_index;
1018         int sector_num;
1019         int success;
1020         bool full_stripe_locked;
1021         unsigned int nofs_flag;
             /* Function-local static: one ratelimit state shared by all calls. */
1022         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
1023                                       DEFAULT_RATELIMIT_BURST);
1024
1025         BUG_ON(sblock_to_check->sector_count < 1);
1026         fs_info = sctx->fs_info;
1027         if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1028                 /*
1029                  * If we find an error in a super block, we just report it.
1030                  * They will get written with the next transaction commit
1031                  * anyway
1032                  */
1033                 scrub_print_warning("super block error", sblock_to_check);
1034                 spin_lock(&sctx->stat_lock);
1035                 ++sctx->stat.super_errors;
1036                 spin_unlock(&sctx->stat_lock);
1037                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
1038                 return 0;
1039         }
1040         logical = sblock_to_check->logical;
1041         ASSERT(sblock_to_check->mirror_num);
             /* mirror_num is 1-based, the recheck array is 0-based. */
1042         failed_mirror_index = sblock_to_check->mirror_num - 1;
1043         is_metadata = !(sblock_to_check->sectors[0]->flags &
1044                         BTRFS_EXTENT_FLAG_DATA);
1045         have_csum = sblock_to_check->sectors[0]->have_csum;
1046
             /*
              * Outside of dev-replace, nothing more is done here when
              * btrfs_repair_one_zone() returns non-zero.
              */
1047         if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
1048                 return 0;
1049
1050         /*
1051          * We must use GFP_NOFS because the scrub task might be waiting for a
1052          * worker task executing this function and in turn a transaction commit
1053          * might be waiting for the scrub task to pause (which needs to wait for
1054          * all the worker tasks to complete before pausing).
1055          * We do allocations in the workers through insert_full_stripe_lock()
1056          * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
1057          * this function.
1058          */
1059         nofs_flag = memalloc_nofs_save();
1060         /*
1061          * For RAID5/6, race can happen for a different device scrub thread.
1062          * For data corruption, Parity and Data threads will both try
1063          * to recover the data.
1064          * Race can lead to doubly added csum error, or even unrecoverable
1065          * error.
1066          */
1067         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1068         if (ret < 0) {
1069                 memalloc_nofs_restore(nofs_flag);
1070                 spin_lock(&sctx->stat_lock);
1071                 if (ret == -ENOMEM)
1072                         sctx->stat.malloc_errors++;
1073                 sctx->stat.read_errors++;
1074                 sctx->stat.uncorrectable_errors++;
1075                 spin_unlock(&sctx->stat_lock);
1076                 return ret;
1077         }
1078
1079         /*
1080          * read all mirrors one after the other. This includes to
1081          * re-read the extent or metadata block that failed (that was
1082          * the cause that this fixup code is called) another time,
1083          * sector by sector this time in order to know which sectors
1084          * caused I/O errors and which ones are good (for all mirrors).
1085          * It is the goal to handle the situation when more than one
1086          * mirror contains I/O errors, but the errors do not
1087          * overlap, i.e. the data can be repaired by selecting the
1088          * sectors from those mirrors without I/O error on the
1089          * particular sectors. One example (with blocks >= 2 * sectorsize)
1090          * would be that mirror #1 has an I/O error on the first sector,
1091          * the second sector is good, and mirror #2 has an I/O error on
1092          * the second sector, but the first sector is good.
1093          * Then the first sector of the first mirror can be repaired by
1094          * taking the first sector of the second mirror, and the
1095          * second sector of the second mirror can be repaired by
1096          * copying the contents of the 2nd sector of the 1st mirror.
1097          * One more note: if the sectors of one mirror contain I/O
1098          * errors, the checksum cannot be verified. In order to get
1099          * the best data for repairing, the first attempt is to find
1100          * a mirror without I/O errors and with a validated checksum.
1101          * Only if this is not possible, the sectors are picked from
1102          * mirrors with I/O errors without considering the checksum.
1103          * If the latter is the case, at the end, the checksum of the
1104          * repaired area is verified in order to correctly maintain
1105          * the statistics.
1106          */
1107         for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
1108                 /*
1109                  * Note: the two members refs and outstanding_sectors are not
1110                  * used in the blocks that are used for the recheck procedure.
1111                  *
1112                  * But alloc_scrub_block() will initialize sblock::ref anyway,
1113                  * so we can use scrub_block_put() to clean them up.
1114                  *
1115                  * And here we don't setup the physical/dev for the sblock yet,
1116                  * they will be correctly initialized in scrub_setup_recheck_block().
1117                  */
1118                 sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
1119                                                         logical, 0, 0, mirror_index);
1120                 if (!sblocks_for_recheck[mirror_index]) {
1121                         spin_lock(&sctx->stat_lock);
1122                         sctx->stat.malloc_errors++;
1123                         sctx->stat.read_errors++;
1124                         sctx->stat.uncorrectable_errors++;
1125                         spin_unlock(&sctx->stat_lock);
1126                         btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1127                         goto out;
1128                 }
1129         }
1130
1131         /* Setup the context, map the logical blocks and alloc the sectors */
1132         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1133         if (ret) {
1134                 spin_lock(&sctx->stat_lock);
1135                 sctx->stat.read_errors++;
1136                 sctx->stat.uncorrectable_errors++;
1137                 spin_unlock(&sctx->stat_lock);
1138                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1139                 goto out;
1140         }
1141         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1142         sblock_bad = sblocks_for_recheck[failed_mirror_index];
1143
1144         /* build and submit the bios for the failed mirror, check checksums */
1145         scrub_recheck_block(fs_info, sblock_bad, 1);
1146
1147         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1148             sblock_bad->no_io_error_seen) {
1149                 /*
1150                  * The error disappeared after reading sector by sector, or
1151                  * the area was part of a huge bio and other parts of the
1152                  * bio caused I/O errors, or the block layer merged several
1153                  * read requests into one and the error is caused by a
1154                  * different bio (usually one of the two latter cases is
1155                  * the cause)
1156                  */
1157                 spin_lock(&sctx->stat_lock);
1158                 sctx->stat.unverified_errors++;
1159                 sblock_to_check->data_corrected = 1;
1160                 spin_unlock(&sctx->stat_lock);
1161
1162                 if (sctx->is_dev_replace)
1163                         scrub_write_block_to_dev_replace(sblock_bad);
1164                 goto out;
1165         }
1166
             /*
              * Account the failure under exactly one kind, checked in the order
              * I/O error, checksum error, header error.
              */
1167         if (!sblock_bad->no_io_error_seen) {
1168                 spin_lock(&sctx->stat_lock);
1169                 sctx->stat.read_errors++;
1170                 spin_unlock(&sctx->stat_lock);
1171                 if (__ratelimit(&rs))
1172                         scrub_print_warning("i/o error", sblock_to_check);
1173                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1174         } else if (sblock_bad->checksum_error) {
1175                 spin_lock(&sctx->stat_lock);
1176                 sctx->stat.csum_errors++;
1177                 spin_unlock(&sctx->stat_lock);
1178                 if (__ratelimit(&rs))
1179                         scrub_print_warning("checksum error", sblock_to_check);
1180                 btrfs_dev_stat_inc_and_print(dev,
1181                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1182         } else if (sblock_bad->header_error) {
1183                 spin_lock(&sctx->stat_lock);
1184                 sctx->stat.verify_errors++;
1185                 spin_unlock(&sctx->stat_lock);
1186                 if (__ratelimit(&rs))
1187                         scrub_print_warning("checksum/header error",
1188                                             sblock_to_check);
1189                 if (sblock_bad->generation_error)
1190                         btrfs_dev_stat_inc_and_print(dev,
1191                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1192                 else
1193                         btrfs_dev_stat_inc_and_print(dev,
1194                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1195         }
1196
             /* Read-only scrub: the error is accounted above, nothing is rewritten. */
1197         if (sctx->readonly) {
1198                 ASSERT(!sctx->is_dev_replace);
1199                 goto out;
1200         }
1201
1202         /*
1203          * now build and submit the bios for the other mirrors, check
1204          * checksums.
1205          * First try to pick the mirror which is completely without I/O
1206          * errors and also does not have a checksum error.
1207          * If one is found, and if a checksum is present, the full block
1208          * that is known to contain an error is rewritten. Afterwards
1209          * the block is known to be corrected.
1210          * If a mirror is found which is completely correct, and no
1211          * checksum is present, only those sectors are rewritten that had
1212          * an I/O error in the block to be repaired, since it cannot be
1213          * determined, which copy of the other sectors is better (and it
1214          * could happen otherwise that a correct sector would be
1215          * overwritten by a bad one).
1216          */
             /* No loop condition: the walk ends through the break statements below. */
1217         for (mirror_index = 0; ;mirror_index++) {
1218                 struct scrub_block *sblock_other;
1219
1220                 if (mirror_index == failed_mirror_index)
1221                         continue;
1222
1223                 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1224                 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1225                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1226                                 break;
1227                         if (!sblocks_for_recheck[mirror_index]->sector_count)
1228                                 break;
1229
1230                         sblock_other = sblocks_for_recheck[mirror_index];
1231                 } else {
1232                         struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1233                         int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1234
1235                         if (mirror_index >= max_allowed)
1236                                 break;
1237                         if (!sblocks_for_recheck[1]->sector_count)
1238                                 break;
1239
                             /*
                              * The RAID5/6 path always reuses slot 1 and only
                              * changes its mirror_num for each attempt.
                              */
1240                         ASSERT(failed_mirror_index == 0);
1241                         sblock_other = sblocks_for_recheck[1];
1242                         sblock_other->mirror_num = 1 + mirror_index;
1243                 }
1244
1245                 /* build and submit the bios, check checksums */
1246                 scrub_recheck_block(fs_info, sblock_other, 0);
1247
1248                 if (!sblock_other->header_error &&
1249                     !sblock_other->checksum_error &&
1250                     sblock_other->no_io_error_seen) {
1251                         if (sctx->is_dev_replace) {
1252                                 scrub_write_block_to_dev_replace(sblock_other);
1253                                 goto corrected_error;
1254                         } else {
1255                                 ret = scrub_repair_block_from_good_copy(
1256                                                 sblock_bad, sblock_other);
1257                                 if (!ret)
1258                                         goto corrected_error;
1259                         }
1260                 }
1261         }
1262
1263         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1264                 goto did_not_correct_error;
1265
1266         /*
1267          * In case of I/O errors in the area that is supposed to be
1268          * repaired, continue by picking good copies of those sectors.
1269          * Select the good sectors from mirrors to rewrite bad sectors from
1270          * the area to fix. Afterwards verify the checksum of the block
1271          * that is supposed to be repaired. This verification step is
1272          * only done for the purpose of statistic counting and for the
1273          * final scrub report, whether errors remain.
1274          * A perfect algorithm could make use of the checksum and try
1275          * all possible combinations of sectors from the different mirrors
1276          * until the checksum verification succeeds. For example, when
1277          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1278          * of mirror #2 is readable but the final checksum test fails,
1279          * then the 2nd sector of mirror #3 could be tried, whether now
1280          * the final checksum succeeds. But this would be a rare
1281          * exception and is therefore not implemented. At least it is
1282          * avoided that the good copy is overwritten.
1283          * A more useful improvement would be to pick the sectors
1284          * without I/O error based on sector sizes (512 bytes on legacy
1285          * disks) instead of on sectorsize. Then maybe 512 byte of one
1286          * mirror could be repaired by taking 512 byte of a different
1287          * mirror, even if other 512 byte sectors in the same sectorsize
1288          * area are unreadable.
1289          */
1290         success = 1;
1291         for (sector_num = 0; sector_num < sblock_bad->sector_count;
1292              sector_num++) {
1293                 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1294                 struct scrub_block *sblock_other = NULL;
1295
1296                 /* Skip no-io-error sectors in scrub */
1297                 if (!sector_bad->io_error && !sctx->is_dev_replace)
1298                         continue;
1299
1300                 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1301                         /*
1302                          * In case of dev replace, if raid56 rebuild process
1303                          * didn't work out correct data, then copy the content
1304                          * in sblock_bad to make sure target device is identical
1305                          * to source device, instead of writing garbage data in
1306                          * sblock_for_recheck array to target device.
1307                          */
1308                         sblock_other = NULL;
1309                 } else if (sector_bad->io_error) {
1310                         /* Try to find no-io-error sector in mirrors */
1311                         for (mirror_index = 0;
1312                              mirror_index < BTRFS_MAX_MIRRORS &&
1313                              sblocks_for_recheck[mirror_index]->sector_count > 0;
1314                              mirror_index++) {
1315                                 if (!sblocks_for_recheck[mirror_index]->
1316                                     sectors[sector_num]->io_error) {
1317                                         sblock_other = sblocks_for_recheck[mirror_index];
1318                                         break;
1319                                 }
1320                         }
1321                         if (!sblock_other)
1322                                 success = 0;
1323                 }
1324
1325                 if (sctx->is_dev_replace) {
1326                         /*
1327                          * Did not find a mirror to fetch the sector from.
1328                          * scrub_write_sector_to_dev_replace() handles this
1329                          * case (sector->io_error), by filling the block with
1330                          * zeros before submitting the write request
1331                          */
1332                         if (!sblock_other)
1333                                 sblock_other = sblock_bad;
1334
1335                         if (scrub_write_sector_to_dev_replace(sblock_other,
1336                                                               sector_num) != 0) {
1337                                 atomic64_inc(
1338                                         &fs_info->dev_replace.num_write_errors);
1339                                 success = 0;
1340                         }
1341                 } else if (sblock_other) {
1342                         ret = scrub_repair_sector_from_good_copy(sblock_bad,
1343                                                                  sblock_other,
1344                                                                  sector_num, 0);
1345                         if (0 == ret)
1346                                 sector_bad->io_error = 0;
1347                         else
1348                                 success = 0;
1349                 }
1350         }
1351
1352         if (success && !sctx->is_dev_replace) {
1353                 if (is_metadata || have_csum) {
1354                         /*
1355                          * need to verify the checksum now that all
1356                          * sectors on disk are repaired (the write
1357                          * request for data to be repaired is on its way).
1358                          * Just be lazy and use scrub_recheck_block()
1359                          * which re-reads the data before the checksum
1360                          * is verified, but most likely the data comes out
1361                          * of the page cache.
1362                          */
1363                         scrub_recheck_block(fs_info, sblock_bad, 1);
1364                         if (!sblock_bad->header_error &&
1365                             !sblock_bad->checksum_error &&
1366                             sblock_bad->no_io_error_seen)
1367                                 goto corrected_error;
1368                         else
1369                                 goto did_not_correct_error;
1370                 } else {
                             /*
                              * Besides falling in here, this label is the goto
                              * target of the whole-mirror loop and of the
                              * re-verification just above.
                              */
1371 corrected_error:
1372                         spin_lock(&sctx->stat_lock);
1373                         sctx->stat.corrected_errors++;
1374                         sblock_to_check->data_corrected = 1;
1375                         spin_unlock(&sctx->stat_lock);
1376                         btrfs_err_rl_in_rcu(fs_info,
1377                                 "fixed up error at logical %llu on dev %s",
1378                                 logical, btrfs_dev_name(dev));
1379                 }
1380         } else {
                     /*
                      * Also the goto target for a failed re-verification and for
                      * a scrub (not dev-replace) whose bad block had no I/O
                      * error and no fully good mirror.
                      */
1381 did_not_correct_error:
1382                 spin_lock(&sctx->stat_lock);
1383                 sctx->stat.uncorrectable_errors++;
1384                 spin_unlock(&sctx->stat_lock);
1385                 btrfs_err_rl_in_rcu(fs_info,
1386                         "unable to fixup (regular) error at logical %llu on dev %s",
1387                         logical, btrfs_dev_name(dev));
1388         }
1389
             /*
              * Common cleanup.  Entries of sblocks_for_recheck[] may still be
              * NULL here when the allocation loop above bailed out early.
              */
1390 out:
1391         for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
1392                 struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
1393                 struct scrub_recover *recover;
1394                 int sector_index;
1395
1396                 /* Not allocated, continue checking the next mirror */
1397                 if (!sblock)
1398                         continue;
1399
1400                 for (sector_index = 0; sector_index < sblock->sector_count;
1401                      sector_index++) {
1402                         /*
1403                          * Here we just cleanup the recover, each sector will be
1404                          * properly cleaned up by later scrub_block_put()
1405                          */
1406                         recover = sblock->sectors[sector_index]->recover;
1407                         if (recover) {
1408                                 scrub_put_recover(fs_info, recover);
1409                                 sblock->sectors[sector_index]->recover = NULL;
1410                         }
1411                 }
1412                 scrub_block_put(sblock);
1413         }
1414
             /*
              * ret is reassigned here, so an earlier allocation or setup
              * failure is visible only through the statistics, not through the
              * return value.
              */
1415         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1416         memalloc_nofs_restore(nofs_flag);
1417         if (ret < 0)
1418                 return ret;
1419         return 0;
1420 }
1421
     /*
      * Number of mirrors the recheck code uses for @bioc: 2 when the RAID5 bit
      * is set in bioc->map_type, 3 when the RAID6 bit is set, otherwise
      * bioc->num_stripes.
      */
1422 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1423 {
1424         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1425                 return 2;
1426         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1427                 return 3;
1428
1429         return (int)bioc->num_stripes;
1430 }
1431
     /*
      * Work out which stripe of a mapping holds @logical and how far into that
      * stripe it lies.
      *
      * For RAID5/6 mappings @raid_map is scanned (entries equal to the P/Q
      * markers are skipped) for the stripe whose BTRFS_STRIPE_LEN range
      * contains @logical.  For every other profile the stripe index is simply
      * @mirror and the offset is 0.
      *
      * The results are stored through @stripe_index and @stripe_offset.
      * NOTE(review): if no RAID5/6 entry matches, the index ends up equal to
      * @nstripes and raid_map[nstripes] is read - callers presumably guarantee
      * a match; confirm.
      */
1432 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1433                                                  u64 *raid_map,
1434                                                  int nstripes, int mirror,
1435                                                  int *stripe_index,
1436                                                  u64 *stripe_offset)
1437 {
1438         int i;
1439
1440         if (!(map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
1441                 /* Not RAID5/6: the mirror number selects the stripe. */
1442                 *stripe_index = mirror;
1443                 *stripe_offset = 0;
1444                 return;
1445         }
1446
1447         /* RAID5/6: find the data stripe that covers @logical. */
1448         for (i = 0; i < nstripes; i++) {
1449                 const u64 start = raid_map[i];
1450
1451                 if (start != RAID6_Q_STRIPE &&
1452                     start != RAID5_P_STRIPE &&
1453                     logical >= start && logical < start + BTRFS_STRIPE_LEN)
1454                         break;
1455         }
1456
1457         *stripe_index = i;
1458         *stripe_offset = logical - raid_map[i];
1459 }
1460
1461 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1462                                      struct scrub_block *sblocks_for_recheck[])
1463 {
1464         struct scrub_ctx *sctx = original_sblock->sctx;
1465         struct btrfs_fs_info *fs_info = sctx->fs_info;
1466         u64 logical = original_sblock->logical;
1467         u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1468         u64 generation = original_sblock->sectors[0]->generation;
1469         u64 flags = original_sblock->sectors[0]->flags;
1470         u64 have_csum = original_sblock->sectors[0]->have_csum;
1471         struct scrub_recover *recover;
1472         struct btrfs_io_context *bioc;
1473         u64 sublen;
1474         u64 mapped_length;
1475         u64 stripe_offset;
1476         int stripe_index;
1477         int sector_index = 0;
1478         int mirror_index;
1479         int nmirrors;
1480         int ret;
1481
1482         while (length > 0) {
1483                 sublen = min_t(u64, length, fs_info->sectorsize);
1484                 mapped_length = sublen;
1485                 bioc = NULL;
1486
1487                 /*
1488                  * With a length of sectorsize, each returned stripe represents
1489                  * one mirror
1490                  */
1491                 btrfs_bio_counter_inc_blocked(fs_info);
1492                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1493                                        logical, &mapped_length, &bioc);
1494                 if (ret || !bioc || mapped_length < sublen) {
1495                         btrfs_put_bioc(bioc);
1496                         btrfs_bio_counter_dec(fs_info);
1497                         return -EIO;
1498                 }
1499
1500                 recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
1501                 if (!recover) {
1502                         btrfs_put_bioc(bioc);
1503                         btrfs_bio_counter_dec(fs_info);
1504                         return -ENOMEM;
1505                 }
1506
1507                 refcount_set(&recover->refs, 1);
1508                 recover->bioc = bioc;
1509                 recover->map_length = mapped_length;
1510
1511                 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1512
1513                 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1514
1515                 for (mirror_index = 0; mirror_index < nmirrors;
1516                      mirror_index++) {
1517                         struct scrub_block *sblock;
1518                         struct scrub_sector *sector;
1519
1520                         sblock = sblocks_for_recheck[mirror_index];
1521                         sblock->sctx = sctx;
1522
1523                         sector = alloc_scrub_sector(sblock, logical);
1524                         if (!sector) {
1525                                 spin_lock(&sctx->stat_lock);
1526                                 sctx->stat.malloc_errors++;
1527                                 spin_unlock(&sctx->stat_lock);
1528                                 scrub_put_recover(fs_info, recover);
1529                                 return -ENOMEM;
1530                         }
1531                         sector->flags = flags;
1532                         sector->generation = generation;
1533                         sector->have_csum = have_csum;
1534                         if (have_csum)
1535                                 memcpy(sector->csum,
1536                                        original_sblock->sectors[0]->csum,
1537                                        sctx->fs_info->csum_size);
1538
1539                         scrub_stripe_index_and_offset(logical,
1540                                                       bioc->map_type,
1541                                                       bioc->raid_map,
1542                                                       bioc->num_stripes -
1543                                                       bioc->num_tgtdevs,
1544                                                       mirror_index,
1545                                                       &stripe_index,
1546                                                       &stripe_offset);
1547                         /*
1548                          * We're at the first sector, also populate @sblock
1549                          * physical and dev.
1550                          */
1551                         if (sector_index == 0) {
1552                                 sblock->physical =
1553                                         bioc->stripes[stripe_index].physical +
1554                                         stripe_offset;
1555                                 sblock->dev = bioc->stripes[stripe_index].dev;
1556                                 sblock->physical_for_dev_replace =
1557                                         original_sblock->physical_for_dev_replace;
1558                         }
1559
1560                         BUG_ON(sector_index >= original_sblock->sector_count);
1561                         scrub_get_recover(recover);
1562                         sector->recover = recover;
1563                 }
1564                 scrub_put_recover(fs_info, recover);
1565                 length -= sublen;
1566                 logical += sublen;
1567                 sector_index++;
1568         }
1569
1570         return 0;
1571 }
1572
1573 static void scrub_bio_wait_endio(struct bio *bio)
1574 {
1575         complete(bio->bi_private);
1576 }
1577
1578 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1579                                         struct bio *bio,
1580                                         struct scrub_sector *sector)
1581 {
1582         DECLARE_COMPLETION_ONSTACK(done);
1583
1584         bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
1585                                  SECTOR_SHIFT;
1586         bio->bi_private = &done;
1587         bio->bi_end_io = scrub_bio_wait_endio;
1588         raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
1589
1590         wait_for_completion_io(&done);
1591         return blk_status_to_errno(bio->bi_status);
1592 }
1593
1594 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1595                                           struct scrub_block *sblock)
1596 {
1597         struct scrub_sector *first_sector = sblock->sectors[0];
1598         struct bio *bio;
1599         int i;
1600
1601         /* All sectors in sblock belong to the same stripe on the same device. */
1602         ASSERT(sblock->dev);
1603         if (!sblock->dev->bdev)
1604                 goto out;
1605
1606         bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1607
1608         for (i = 0; i < sblock->sector_count; i++) {
1609                 struct scrub_sector *sector = sblock->sectors[i];
1610
1611                 bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
1612         }
1613
1614         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1615                 bio_put(bio);
1616                 goto out;
1617         }
1618
1619         bio_put(bio);
1620
1621         scrub_recheck_block_checksum(sblock);
1622
1623         return;
1624 out:
1625         for (i = 0; i < sblock->sector_count; i++)
1626                 sblock->sectors[i]->io_error = 1;
1627
1628         sblock->no_io_error_seen = 0;
1629 }
1630
1631 /*
1632  * This function will check the on disk data for checksum errors, header errors
1633  * and read I/O errors. If any I/O errors happen, the exact sectors which are
1634  * errored are marked as being bad. The goal is to enable scrub to take those
1635  * sectors that are not errored from all the mirrors so that the sectors that
1636  * are errored in the just handled mirror can be repaired.
1637  */
1638 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1639                                 struct scrub_block *sblock,
1640                                 int retry_failed_mirror)
1641 {
1642         int i;
1643
1644         sblock->no_io_error_seen = 1;
1645
1646         /* short cut for raid56 */
1647         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1648                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1649
1650         for (i = 0; i < sblock->sector_count; i++) {
1651                 struct scrub_sector *sector = sblock->sectors[i];
1652                 struct bio bio;
1653                 struct bio_vec bvec;
1654
1655                 if (sblock->dev->bdev == NULL) {
1656                         sector->io_error = 1;
1657                         sblock->no_io_error_seen = 0;
1658                         continue;
1659                 }
1660
1661                 bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
1662                 bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
1663                 bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
1664                                         SECTOR_SHIFT;
1665
1666                 btrfsic_check_bio(&bio);
1667                 if (submit_bio_wait(&bio)) {
1668                         sector->io_error = 1;
1669                         sblock->no_io_error_seen = 0;
1670                 }
1671
1672                 bio_uninit(&bio);
1673         }
1674
1675         if (sblock->no_io_error_seen)
1676                 scrub_recheck_block_checksum(sblock);
1677 }
1678
1679 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1680 {
1681         struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
1682         int ret;
1683
1684         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1685         return !ret;
1686 }
1687
1688 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1689 {
1690         sblock->header_error = 0;
1691         sblock->checksum_error = 0;
1692         sblock->generation_error = 0;
1693
1694         if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1695                 scrub_checksum_data(sblock);
1696         else
1697                 scrub_checksum_tree_block(sblock);
1698 }
1699
1700 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1701                                              struct scrub_block *sblock_good)
1702 {
1703         int i;
1704         int ret = 0;
1705
1706         for (i = 0; i < sblock_bad->sector_count; i++) {
1707                 int ret_sub;
1708
1709                 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1710                                                              sblock_good, i, 1);
1711                 if (ret_sub)
1712                         ret = ret_sub;
1713         }
1714
1715         return ret;
1716 }
1717
1718 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1719                                               struct scrub_block *sblock_good,
1720                                               int sector_num, int force_write)
1721 {
1722         struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1723         struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1724         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1725         const u32 sectorsize = fs_info->sectorsize;
1726
1727         if (force_write || sblock_bad->header_error ||
1728             sblock_bad->checksum_error || sector_bad->io_error) {
1729                 struct bio bio;
1730                 struct bio_vec bvec;
1731                 int ret;
1732
1733                 if (!sblock_bad->dev->bdev) {
1734                         btrfs_warn_rl(fs_info,
1735                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1736                         return -EIO;
1737                 }
1738
1739                 bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1740                 bio.bi_iter.bi_sector = (sblock_bad->physical +
1741                                          sector_bad->offset) >> SECTOR_SHIFT;
1742                 ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
1743
1744                 btrfsic_check_bio(&bio);
1745                 ret = submit_bio_wait(&bio);
1746                 bio_uninit(&bio);
1747
1748                 if (ret) {
1749                         btrfs_dev_stat_inc_and_print(sblock_bad->dev,
1750                                 BTRFS_DEV_STAT_WRITE_ERRS);
1751                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1752                         return -EIO;
1753                 }
1754         }
1755
1756         return 0;
1757 }
1758
1759 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1760 {
1761         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1762         int i;
1763
1764         /*
1765          * This block is used for the check of the parity on the source device,
1766          * so the data needn't be written into the destination device.
1767          */
1768         if (sblock->sparity)
1769                 return;
1770
1771         for (i = 0; i < sblock->sector_count; i++) {
1772                 int ret;
1773
1774                 ret = scrub_write_sector_to_dev_replace(sblock, i);
1775                 if (ret)
1776                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1777         }
1778 }
1779
1780 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1781 {
1782         const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
1783         struct scrub_sector *sector = sblock->sectors[sector_num];
1784
1785         if (sector->io_error)
1786                 memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
1787
1788         return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1789 }
1790
1791 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1792 {
1793         int ret = 0;
1794         u64 length;
1795
1796         if (!btrfs_is_zoned(sctx->fs_info))
1797                 return 0;
1798
1799         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1800                 return 0;
1801
1802         if (sctx->write_pointer < physical) {
1803                 length = physical - sctx->write_pointer;
1804
1805                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1806                                                 sctx->write_pointer, length);
1807                 if (!ret)
1808                         sctx->write_pointer = physical;
1809         }
1810         return ret;
1811 }
1812
1813 static void scrub_block_get(struct scrub_block *sblock)
1814 {
1815         refcount_inc(&sblock->refs);
1816 }
1817
1818 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1819                                       struct scrub_sector *sector)
1820 {
1821         struct scrub_block *sblock = sector->sblock;
1822         struct scrub_bio *sbio;
1823         int ret;
1824         const u32 sectorsize = sctx->fs_info->sectorsize;
1825
1826         mutex_lock(&sctx->wr_lock);
1827 again:
1828         if (!sctx->wr_curr_bio) {
1829                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1830                                               GFP_KERNEL);
1831                 if (!sctx->wr_curr_bio) {
1832                         mutex_unlock(&sctx->wr_lock);
1833                         return -ENOMEM;
1834                 }
1835                 sctx->wr_curr_bio->sctx = sctx;
1836                 sctx->wr_curr_bio->sector_count = 0;
1837         }
1838         sbio = sctx->wr_curr_bio;
1839         if (sbio->sector_count == 0) {
1840                 ret = fill_writer_pointer_gap(sctx, sector->offset +
1841                                               sblock->physical_for_dev_replace);
1842                 if (ret) {
1843                         mutex_unlock(&sctx->wr_lock);
1844                         return ret;
1845                 }
1846
1847                 sbio->physical = sblock->physical_for_dev_replace + sector->offset;
1848                 sbio->logical = sblock->logical + sector->offset;
1849                 sbio->dev = sctx->wr_tgtdev;
1850                 if (!sbio->bio) {
1851                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1852                                               REQ_OP_WRITE, GFP_NOFS);
1853                 }
1854                 sbio->bio->bi_private = sbio;
1855                 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1856                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1857                 sbio->status = 0;
1858         } else if (sbio->physical + sbio->sector_count * sectorsize !=
1859                    sblock->physical_for_dev_replace + sector->offset ||
1860                    sbio->logical + sbio->sector_count * sectorsize !=
1861                    sblock->logical + sector->offset) {
1862                 scrub_wr_submit(sctx);
1863                 goto again;
1864         }
1865
1866         ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
1867         if (ret != sectorsize) {
1868                 if (sbio->sector_count < 1) {
1869                         bio_put(sbio->bio);
1870                         sbio->bio = NULL;
1871                         mutex_unlock(&sctx->wr_lock);
1872                         return -EIO;
1873                 }
1874                 scrub_wr_submit(sctx);
1875                 goto again;
1876         }
1877
1878         sbio->sectors[sbio->sector_count] = sector;
1879         scrub_sector_get(sector);
1880         /*
1881          * Since ssector no longer holds a page, but uses sblock::pages, we
1882          * have to ensure the sblock had not been freed before our write bio
1883          * finished.
1884          */
1885         scrub_block_get(sector->sblock);
1886
1887         sbio->sector_count++;
1888         if (sbio->sector_count == sctx->sectors_per_bio)
1889                 scrub_wr_submit(sctx);
1890         mutex_unlock(&sctx->wr_lock);
1891
1892         return 0;
1893 }
1894
1895 static void scrub_wr_submit(struct scrub_ctx *sctx)
1896 {
1897         struct scrub_bio *sbio;
1898
1899         if (!sctx->wr_curr_bio)
1900                 return;
1901
1902         sbio = sctx->wr_curr_bio;
1903         sctx->wr_curr_bio = NULL;
1904         scrub_pending_bio_inc(sctx);
1905         /* process all writes in a single worker thread. Then the block layer
1906          * orders the requests before sending them to the driver which
1907          * doubled the write performance on spinning disks when measured
1908          * with Linux 3.5 */
1909         btrfsic_check_bio(sbio->bio);
1910         submit_bio(sbio->bio);
1911
1912         if (btrfs_is_zoned(sctx->fs_info))
1913                 sctx->write_pointer = sbio->physical + sbio->sector_count *
1914                         sctx->fs_info->sectorsize;
1915 }
1916
1917 static void scrub_wr_bio_end_io(struct bio *bio)
1918 {
1919         struct scrub_bio *sbio = bio->bi_private;
1920         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1921
1922         sbio->status = bio->bi_status;
1923         sbio->bio = bio;
1924
1925         INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1926         queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1927 }
1928
1929 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1930 {
1931         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1932         struct scrub_ctx *sctx = sbio->sctx;
1933         int i;
1934
1935         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1936         if (sbio->status) {
1937                 struct btrfs_dev_replace *dev_replace =
1938                         &sbio->sctx->fs_info->dev_replace;
1939
1940                 for (i = 0; i < sbio->sector_count; i++) {
1941                         struct scrub_sector *sector = sbio->sectors[i];
1942
1943                         sector->io_error = 1;
1944                         atomic64_inc(&dev_replace->num_write_errors);
1945                 }
1946         }
1947
1948         /*
1949          * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
1950          * endio we should put the sblock.
1951          */
1952         for (i = 0; i < sbio->sector_count; i++) {
1953                 scrub_block_put(sbio->sectors[i]->sblock);
1954                 scrub_sector_put(sbio->sectors[i]);
1955         }
1956
1957         bio_put(sbio->bio);
1958         kfree(sbio);
1959         scrub_pending_bio_dec(sctx);
1960 }
1961
1962 static int scrub_checksum(struct scrub_block *sblock)
1963 {
1964         u64 flags;
1965         int ret;
1966
1967         /*
1968          * No need to initialize these stats currently,
1969          * because this function only use return value
1970          * instead of these stats value.
1971          *
1972          * Todo:
1973          * always use stats
1974          */
1975         sblock->header_error = 0;
1976         sblock->generation_error = 0;
1977         sblock->checksum_error = 0;
1978
1979         WARN_ON(sblock->sector_count < 1);
1980         flags = sblock->sectors[0]->flags;
1981         ret = 0;
1982         if (flags & BTRFS_EXTENT_FLAG_DATA)
1983                 ret = scrub_checksum_data(sblock);
1984         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1985                 ret = scrub_checksum_tree_block(sblock);
1986         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1987                 ret = scrub_checksum_super(sblock);
1988         else
1989                 WARN_ON(1);
1990         if (ret)
1991                 scrub_handle_errored_block(sblock);
1992
1993         return ret;
1994 }
1995
1996 static int scrub_checksum_data(struct scrub_block *sblock)
1997 {
1998         struct scrub_ctx *sctx = sblock->sctx;
1999         struct btrfs_fs_info *fs_info = sctx->fs_info;
2000         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
2001         u8 csum[BTRFS_CSUM_SIZE];
2002         struct scrub_sector *sector;
2003         char *kaddr;
2004
2005         BUG_ON(sblock->sector_count < 1);
2006         sector = sblock->sectors[0];
2007         if (!sector->have_csum)
2008                 return 0;
2009
2010         kaddr = scrub_sector_get_kaddr(sector);
2011
2012         shash->tfm = fs_info->csum_shash;
2013         crypto_shash_init(shash);
2014
2015         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
2016
2017         if (memcmp(csum, sector->csum, fs_info->csum_size))
2018                 sblock->checksum_error = 1;
2019         return sblock->checksum_error;
2020 }
2021
2022 static int scrub_checksum_tree_block(struct scrub_block *sblock)
2023 {
2024         struct scrub_ctx *sctx = sblock->sctx;
2025         struct btrfs_header *h;
2026         struct btrfs_fs_info *fs_info = sctx->fs_info;
2027         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
2028         u8 calculated_csum[BTRFS_CSUM_SIZE];
2029         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2030         /*
2031          * This is done in sectorsize steps even for metadata as there's a
2032          * constraint for nodesize to be aligned to sectorsize. This will need
2033          * to change so we don't misuse data and metadata units like that.
2034          */
2035         const u32 sectorsize = sctx->fs_info->sectorsize;
2036         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
2037         int i;
2038         struct scrub_sector *sector;
2039         char *kaddr;
2040
2041         BUG_ON(sblock->sector_count < 1);
2042
2043         /* Each member in sectors is just one sector */
2044         ASSERT(sblock->sector_count == num_sectors);
2045
2046         sector = sblock->sectors[0];
2047         kaddr = scrub_sector_get_kaddr(sector);
2048         h = (struct btrfs_header *)kaddr;
2049         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
2050
2051         /*
2052          * we don't use the getter functions here, as we
2053          * a) don't have an extent buffer and
2054          * b) the page is already kmapped
2055          */
2056         if (sblock->logical != btrfs_stack_header_bytenr(h)) {
2057                 sblock->header_error = 1;
2058                 btrfs_warn_rl(fs_info,
2059                 "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
2060                               sblock->logical, sblock->mirror_num,
2061                               btrfs_stack_header_bytenr(h),
2062                               sblock->logical);
2063                 goto out;
2064         }
2065
2066         if (!scrub_check_fsid(h->fsid, sector)) {
2067                 sblock->header_error = 1;
2068                 btrfs_warn_rl(fs_info,
2069                 "tree block %llu mirror %u has bad fsid, has %pU want %pU",
2070                               sblock->logical, sblock->mirror_num,
2071                               h->fsid, sblock->dev->fs_devices->fsid);
2072                 goto out;
2073         }
2074
2075         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) {
2076                 sblock->header_error = 1;
2077                 btrfs_warn_rl(fs_info,
2078                 "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
2079                               sblock->logical, sblock->mirror_num,
2080                               h->chunk_tree_uuid, fs_info->chunk_tree_uuid);
2081                 goto out;
2082         }
2083
2084         shash->tfm = fs_info->csum_shash;
2085         crypto_shash_init(shash);
2086         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
2087                             sectorsize - BTRFS_CSUM_SIZE);
2088
2089         for (i = 1; i < num_sectors; i++) {
2090                 kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
2091                 crypto_shash_update(shash, kaddr, sectorsize);
2092         }
2093
2094         crypto_shash_final(shash, calculated_csum);
2095         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) {
2096                 sblock->checksum_error = 1;
2097                 btrfs_warn_rl(fs_info,
2098                 "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
2099                               sblock->logical, sblock->mirror_num,
2100                               CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
2101                               CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
2102                 goto out;
2103         }
2104
2105         if (sector->generation != btrfs_stack_header_generation(h)) {
2106                 sblock->header_error = 1;
2107                 sblock->generation_error = 1;
2108                 btrfs_warn_rl(fs_info,
2109                 "tree block %llu mirror %u has bad generation, has %llu want %llu",
2110                               sblock->logical, sblock->mirror_num,
2111                               btrfs_stack_header_generation(h),
2112                               sector->generation);
2113         }
2114
2115 out:
2116         return sblock->header_error || sblock->checksum_error;
2117 }
2118
2119 static int scrub_checksum_super(struct scrub_block *sblock)
2120 {
2121         struct btrfs_super_block *s;
2122         struct scrub_ctx *sctx = sblock->sctx;
2123         struct btrfs_fs_info *fs_info = sctx->fs_info;
2124         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
2125         u8 calculated_csum[BTRFS_CSUM_SIZE];
2126         struct scrub_sector *sector;
2127         char *kaddr;
2128         int fail_gen = 0;
2129         int fail_cor = 0;
2130
2131         BUG_ON(sblock->sector_count < 1);
2132         sector = sblock->sectors[0];
2133         kaddr = scrub_sector_get_kaddr(sector);
2134         s = (struct btrfs_super_block *)kaddr;
2135
2136         if (sblock->logical != btrfs_super_bytenr(s))
2137                 ++fail_cor;
2138
2139         if (sector->generation != btrfs_super_generation(s))
2140                 ++fail_gen;
2141
2142         if (!scrub_check_fsid(s->fsid, sector))
2143                 ++fail_cor;
2144
2145         shash->tfm = fs_info->csum_shash;
2146         crypto_shash_init(shash);
2147         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
2148                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
2149
2150         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
2151                 ++fail_cor;
2152
2153         return fail_cor + fail_gen;
2154 }
2155
2156 static void scrub_block_put(struct scrub_block *sblock)
2157 {
2158         if (refcount_dec_and_test(&sblock->refs)) {
2159                 int i;
2160
2161                 if (sblock->sparity)
2162                         scrub_parity_put(sblock->sparity);
2163
2164                 for (i = 0; i < sblock->sector_count; i++)
2165                         scrub_sector_put(sblock->sectors[i]);
2166                 for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
2167                         if (sblock->pages[i]) {
2168                                 detach_scrub_page_private(sblock->pages[i]);
2169                                 __free_page(sblock->pages[i]);
2170                         }
2171                 }
2172                 kfree(sblock);
2173         }
2174 }
2175
2176 static void scrub_sector_get(struct scrub_sector *sector)
2177 {
2178         atomic_inc(&sector->refs);
2179 }
2180
2181 static void scrub_sector_put(struct scrub_sector *sector)
2182 {
2183         if (atomic_dec_and_test(&sector->refs))
2184                 kfree(sector);
2185 }
2186
2187 /*
2188  * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
2189  * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
2190  */
2191 static void scrub_throttle(struct scrub_ctx *sctx)
2192 {
2193         const int time_slice = 1000;
2194         struct scrub_bio *sbio;
2195         struct btrfs_device *device;
2196         s64 delta;
2197         ktime_t now;
2198         u32 div;
2199         u64 bwlimit;
2200
2201         sbio = sctx->bios[sctx->curr];
2202         device = sbio->dev;
2203         bwlimit = READ_ONCE(device->scrub_speed_max);
2204         if (bwlimit == 0)
2205                 return;
2206
2207         /*
2208          * Slice is divided into intervals when the IO is submitted, adjust by
2209          * bwlimit and maximum of 64 intervals.
2210          */
2211         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
2212         div = min_t(u32, 64, div);
2213
2214         /* Start new epoch, set deadline */
2215         now = ktime_get();
2216         if (sctx->throttle_deadline == 0) {
2217                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
2218                 sctx->throttle_sent = 0;
2219         }
2220
2221         /* Still in the time to send? */
2222         if (ktime_before(now, sctx->throttle_deadline)) {
2223                 /* If current bio is within the limit, send it */
2224                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2225                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2226                         return;
2227
2228                 /* We're over the limit, sleep until the rest of the slice */
2229                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2230         } else {
2231                 /* New request after deadline, start new epoch */
2232                 delta = 0;
2233         }
2234
2235         if (delta) {
2236                 long timeout;
2237
2238                 timeout = div_u64(delta * HZ, 1000);
2239                 schedule_timeout_interruptible(timeout);
2240         }
2241
2242         /* Next call will start the deadline period */
2243         sctx->throttle_deadline = 0;
2244 }
2245
2246 static void scrub_submit(struct scrub_ctx *sctx)
2247 {
2248         struct scrub_bio *sbio;
2249
2250         if (sctx->curr == -1)
2251                 return;
2252
2253         scrub_throttle(sctx);
2254
2255         sbio = sctx->bios[sctx->curr];
2256         sctx->curr = -1;
2257         scrub_pending_bio_inc(sctx);
2258         btrfsic_check_bio(sbio->bio);
2259         submit_bio(sbio->bio);
2260 }
2261
2262 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2263                                       struct scrub_sector *sector)
2264 {
2265         struct scrub_block *sblock = sector->sblock;
2266         struct scrub_bio *sbio;
2267         const u32 sectorsize = sctx->fs_info->sectorsize;
2268         int ret;
2269
2270 again:
2271         /*
2272          * grab a fresh bio or wait for one to become available
2273          */
2274         while (sctx->curr == -1) {
2275                 spin_lock(&sctx->list_lock);
2276                 sctx->curr = sctx->first_free;
2277                 if (sctx->curr != -1) {
2278                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2279                         sctx->bios[sctx->curr]->next_free = -1;
2280                         sctx->bios[sctx->curr]->sector_count = 0;
2281                         spin_unlock(&sctx->list_lock);
2282                 } else {
2283                         spin_unlock(&sctx->list_lock);
2284                         wait_event(sctx->list_wait, sctx->first_free != -1);
2285                 }
2286         }
2287         sbio = sctx->bios[sctx->curr];
2288         if (sbio->sector_count == 0) {
2289                 sbio->physical = sblock->physical + sector->offset;
2290                 sbio->logical = sblock->logical + sector->offset;
2291                 sbio->dev = sblock->dev;
2292                 if (!sbio->bio) {
2293                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2294                                               REQ_OP_READ, GFP_NOFS);
2295                 }
2296                 sbio->bio->bi_private = sbio;
2297                 sbio->bio->bi_end_io = scrub_bio_end_io;
2298                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2299                 sbio->status = 0;
2300         } else if (sbio->physical + sbio->sector_count * sectorsize !=
2301                    sblock->physical + sector->offset ||
2302                    sbio->logical + sbio->sector_count * sectorsize !=
2303                    sblock->logical + sector->offset ||
2304                    sbio->dev != sblock->dev) {
2305                 scrub_submit(sctx);
2306                 goto again;
2307         }
2308
2309         sbio->sectors[sbio->sector_count] = sector;
2310         ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
2311         if (ret != sectorsize) {
2312                 if (sbio->sector_count < 1) {
2313                         bio_put(sbio->bio);
2314                         sbio->bio = NULL;
2315                         return -EIO;
2316                 }
2317                 scrub_submit(sctx);
2318                 goto again;
2319         }
2320
2321         scrub_block_get(sblock); /* one for the page added to the bio */
2322         atomic_inc(&sblock->outstanding_sectors);
2323         sbio->sector_count++;
2324         if (sbio->sector_count == sctx->sectors_per_bio)
2325                 scrub_submit(sctx);
2326
2327         return 0;
2328 }
2329
2330 static void scrub_missing_raid56_end_io(struct bio *bio)
2331 {
2332         struct scrub_block *sblock = bio->bi_private;
2333         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2334
2335         btrfs_bio_counter_dec(fs_info);
2336         if (bio->bi_status)
2337                 sblock->no_io_error_seen = 0;
2338
2339         bio_put(bio);
2340
2341         queue_work(fs_info->scrub_workers, &sblock->work);
2342 }
2343
2344 static void scrub_missing_raid56_worker(struct work_struct *work)
2345 {
2346         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2347         struct scrub_ctx *sctx = sblock->sctx;
2348         struct btrfs_fs_info *fs_info = sctx->fs_info;
2349         u64 logical;
2350         struct btrfs_device *dev;
2351
2352         logical = sblock->logical;
2353         dev = sblock->dev;
2354
2355         if (sblock->no_io_error_seen)
2356                 scrub_recheck_block_checksum(sblock);
2357
2358         if (!sblock->no_io_error_seen) {
2359                 spin_lock(&sctx->stat_lock);
2360                 sctx->stat.read_errors++;
2361                 spin_unlock(&sctx->stat_lock);
2362                 btrfs_err_rl_in_rcu(fs_info,
2363                         "IO error rebuilding logical %llu for dev %s",
2364                         logical, btrfs_dev_name(dev));
2365         } else if (sblock->header_error || sblock->checksum_error) {
2366                 spin_lock(&sctx->stat_lock);
2367                 sctx->stat.uncorrectable_errors++;
2368                 spin_unlock(&sctx->stat_lock);
2369                 btrfs_err_rl_in_rcu(fs_info,
2370                         "failed to rebuild valid logical %llu for dev %s",
2371                         logical, btrfs_dev_name(dev));
2372         } else {
2373                 scrub_write_block_to_dev_replace(sblock);
2374         }
2375
2376         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2377                 mutex_lock(&sctx->wr_lock);
2378                 scrub_wr_submit(sctx);
2379                 mutex_unlock(&sctx->wr_lock);
2380         }
2381
2382         scrub_block_put(sblock);
2383         scrub_pending_bio_dec(sctx);
2384 }
2385
2386 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2387 {
2388         struct scrub_ctx *sctx = sblock->sctx;
2389         struct btrfs_fs_info *fs_info = sctx->fs_info;
2390         u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2391         u64 logical = sblock->logical;
2392         struct btrfs_io_context *bioc = NULL;
2393         struct bio *bio;
2394         struct btrfs_raid_bio *rbio;
2395         int ret;
2396         int i;
2397
2398         btrfs_bio_counter_inc_blocked(fs_info);
2399         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2400                                &length, &bioc);
2401         if (ret || !bioc || !bioc->raid_map)
2402                 goto bioc_out;
2403
2404         if (WARN_ON(!sctx->is_dev_replace ||
2405                     !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2406                 /*
2407                  * We shouldn't be scrubbing a missing device. Even for dev
2408                  * replace, we should only get here for RAID 5/6. We either
2409                  * managed to mount something with no mirrors remaining or
2410                  * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2411                  */
2412                 goto bioc_out;
2413         }
2414
2415         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2416         bio->bi_iter.bi_sector = logical >> 9;
2417         bio->bi_private = sblock;
2418         bio->bi_end_io = scrub_missing_raid56_end_io;
2419
2420         rbio = raid56_alloc_missing_rbio(bio, bioc);
2421         if (!rbio)
2422                 goto rbio_out;
2423
2424         for (i = 0; i < sblock->sector_count; i++) {
2425                 struct scrub_sector *sector = sblock->sectors[i];
2426
2427                 raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
2428                                        scrub_sector_get_page_offset(sector),
2429                                        sector->offset + sector->sblock->logical);
2430         }
2431
2432         INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2433         scrub_block_get(sblock);
2434         scrub_pending_bio_inc(sctx);
2435         raid56_submit_missing_rbio(rbio);
2436         btrfs_put_bioc(bioc);
2437         return;
2438
2439 rbio_out:
2440         bio_put(bio);
2441 bioc_out:
2442         btrfs_bio_counter_dec(fs_info);
2443         btrfs_put_bioc(bioc);
2444         spin_lock(&sctx->stat_lock);
2445         sctx->stat.malloc_errors++;
2446         spin_unlock(&sctx->stat_lock);
2447 }
2448
2449 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2450                        u64 physical, struct btrfs_device *dev, u64 flags,
2451                        u64 gen, int mirror_num, u8 *csum,
2452                        u64 physical_for_dev_replace)
2453 {
2454         struct scrub_block *sblock;
2455         const u32 sectorsize = sctx->fs_info->sectorsize;
2456         int index;
2457
2458         sblock = alloc_scrub_block(sctx, dev, logical, physical,
2459                                    physical_for_dev_replace, mirror_num);
2460         if (!sblock) {
2461                 spin_lock(&sctx->stat_lock);
2462                 sctx->stat.malloc_errors++;
2463                 spin_unlock(&sctx->stat_lock);
2464                 return -ENOMEM;
2465         }
2466
2467         for (index = 0; len > 0; index++) {
2468                 struct scrub_sector *sector;
2469                 /*
2470                  * Here we will allocate one page for one sector to scrub.
2471                  * This is fine if PAGE_SIZE == sectorsize, but will cost
2472                  * more memory for PAGE_SIZE > sectorsize case.
2473                  */
2474                 u32 l = min(sectorsize, len);
2475
2476                 sector = alloc_scrub_sector(sblock, logical);
2477                 if (!sector) {
2478                         spin_lock(&sctx->stat_lock);
2479                         sctx->stat.malloc_errors++;
2480                         spin_unlock(&sctx->stat_lock);
2481                         scrub_block_put(sblock);
2482                         return -ENOMEM;
2483                 }
2484                 sector->flags = flags;
2485                 sector->generation = gen;
2486                 if (csum) {
2487                         sector->have_csum = 1;
2488                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2489                 } else {
2490                         sector->have_csum = 0;
2491                 }
2492                 len -= l;
2493                 logical += l;
2494                 physical += l;
2495                 physical_for_dev_replace += l;
2496         }
2497
2498         WARN_ON(sblock->sector_count == 0);
2499         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2500                 /*
2501                  * This case should only be hit for RAID 5/6 device replace. See
2502                  * the comment in scrub_missing_raid56_pages() for details.
2503                  */
2504                 scrub_missing_raid56_pages(sblock);
2505         } else {
2506                 for (index = 0; index < sblock->sector_count; index++) {
2507                         struct scrub_sector *sector = sblock->sectors[index];
2508                         int ret;
2509
2510                         ret = scrub_add_sector_to_rd_bio(sctx, sector);
2511                         if (ret) {
2512                                 scrub_block_put(sblock);
2513                                 return ret;
2514                         }
2515                 }
2516
2517                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2518                         scrub_submit(sctx);
2519         }
2520
2521         /* last one frees, either here or in bio completion for last page */
2522         scrub_block_put(sblock);
2523         return 0;
2524 }
2525
2526 static void scrub_bio_end_io(struct bio *bio)
2527 {
2528         struct scrub_bio *sbio = bio->bi_private;
2529         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2530
2531         sbio->status = bio->bi_status;
2532         sbio->bio = bio;
2533
2534         queue_work(fs_info->scrub_workers, &sbio->work);
2535 }
2536
2537 static void scrub_bio_end_io_worker(struct work_struct *work)
2538 {
2539         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2540         struct scrub_ctx *sctx = sbio->sctx;
2541         int i;
2542
2543         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2544         if (sbio->status) {
2545                 for (i = 0; i < sbio->sector_count; i++) {
2546                         struct scrub_sector *sector = sbio->sectors[i];
2547
2548                         sector->io_error = 1;
2549                         sector->sblock->no_io_error_seen = 0;
2550                 }
2551         }
2552
2553         /* Now complete the scrub_block items that have all pages completed */
2554         for (i = 0; i < sbio->sector_count; i++) {
2555                 struct scrub_sector *sector = sbio->sectors[i];
2556                 struct scrub_block *sblock = sector->sblock;
2557
2558                 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2559                         scrub_block_complete(sblock);
2560                 scrub_block_put(sblock);
2561         }
2562
2563         bio_put(sbio->bio);
2564         sbio->bio = NULL;
2565         spin_lock(&sctx->list_lock);
2566         sbio->next_free = sctx->first_free;
2567         sctx->first_free = sbio->index;
2568         spin_unlock(&sctx->list_lock);
2569
2570         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2571                 mutex_lock(&sctx->wr_lock);
2572                 scrub_wr_submit(sctx);
2573                 mutex_unlock(&sctx->wr_lock);
2574         }
2575
2576         scrub_pending_bio_dec(sctx);
2577 }
2578
2579 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2580                                        unsigned long *bitmap,
2581                                        u64 start, u32 len)
2582 {
2583         u64 offset;
2584         u32 nsectors;
2585         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2586
2587         if (len >= sparity->stripe_len) {
2588                 bitmap_set(bitmap, 0, sparity->nsectors);
2589                 return;
2590         }
2591
2592         start -= sparity->logic_start;
2593         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2594         offset = offset >> sectorsize_bits;
2595         nsectors = len >> sectorsize_bits;
2596
2597         if (offset + nsectors <= sparity->nsectors) {
2598                 bitmap_set(bitmap, offset, nsectors);
2599                 return;
2600         }
2601
2602         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2603         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2604 }
2605
2606 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2607                                                    u64 start, u32 len)
2608 {
2609         __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2610 }
2611
2612 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2613                                                   u64 start, u32 len)
2614 {
2615         __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2616 }
2617
2618 static void scrub_block_complete(struct scrub_block *sblock)
2619 {
2620         int corrupted = 0;
2621
2622         if (!sblock->no_io_error_seen) {
2623                 corrupted = 1;
2624                 scrub_handle_errored_block(sblock);
2625         } else {
2626                 /*
2627                  * if has checksum error, write via repair mechanism in
2628                  * dev replace case, otherwise write here in dev replace
2629                  * case.
2630                  */
2631                 corrupted = scrub_checksum(sblock);
2632                 if (!corrupted && sblock->sctx->is_dev_replace)
2633                         scrub_write_block_to_dev_replace(sblock);
2634         }
2635
2636         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2637                 u64 start = sblock->logical;
2638                 u64 end = sblock->logical +
2639                           sblock->sectors[sblock->sector_count - 1]->offset +
2640                           sblock->sctx->fs_info->sectorsize;
2641
2642                 ASSERT(end - start <= U32_MAX);
2643                 scrub_parity_mark_sectors_error(sblock->sparity,
2644                                                 start, end - start);
2645         }
2646 }
2647
2648 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2649 {
2650         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2651         list_del(&sum->list);
2652         kfree(sum);
2653 }
2654
2655 /*
2656  * Find the desired csum for range [logical, logical + sectorsize), and store
2657  * the csum into @csum.
2658  *
2659  * The search source is sctx->csum_list, which is a pre-populated list
2660  * storing bytenr ordered csum ranges.  We're responsible to cleanup any range
2661  * that is before @logical.
2662  *
2663  * Return 0 if there is no csum for the range.
2664  * Return 1 if there is csum for the range and copied to @csum.
2665  */
2666 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2667 {
2668         bool found = false;
2669
2670         while (!list_empty(&sctx->csum_list)) {
2671                 struct btrfs_ordered_sum *sum = NULL;
2672                 unsigned long index;
2673                 unsigned long num_sectors;
2674
2675                 sum = list_first_entry(&sctx->csum_list,
2676                                        struct btrfs_ordered_sum, list);
2677                 /* The current csum range is beyond our range, no csum found */
2678                 if (sum->bytenr > logical)
2679                         break;
2680
2681                 /*
2682                  * The current sum is before our bytenr, since scrub is always
2683                  * done in bytenr order, the csum will never be used anymore,
2684                  * clean it up so that later calls won't bother with the range,
2685                  * and continue search the next range.
2686                  */
2687                 if (sum->bytenr + sum->len <= logical) {
2688                         drop_csum_range(sctx, sum);
2689                         continue;
2690                 }
2691
2692                 /* Now the csum range covers our bytenr, copy the csum */
2693                 found = true;
2694                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2695                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2696
2697                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2698                        sctx->fs_info->csum_size);
2699
2700                 /* Cleanup the range if we're at the end of the csum range */
2701                 if (index == num_sectors - 1)
2702                         drop_csum_range(sctx, sum);
2703                 break;
2704         }
2705         if (!found)
2706                 return 0;
2707         return 1;
2708 }
2709
2710 /* scrub extent tries to collect up to 64 kB for each bio */
2711 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2712                         u64 logical, u32 len,
2713                         u64 physical, struct btrfs_device *dev, u64 flags,
2714                         u64 gen, int mirror_num)
2715 {
2716         struct btrfs_device *src_dev = dev;
2717         u64 src_physical = physical;
2718         int src_mirror = mirror_num;
2719         int ret;
2720         u8 csum[BTRFS_CSUM_SIZE];
2721         u32 blocksize;
2722
2723         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2724                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2725                         blocksize = BTRFS_STRIPE_LEN;
2726                 else
2727                         blocksize = sctx->fs_info->sectorsize;
2728                 spin_lock(&sctx->stat_lock);
2729                 sctx->stat.data_extents_scrubbed++;
2730                 sctx->stat.data_bytes_scrubbed += len;
2731                 spin_unlock(&sctx->stat_lock);
2732         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2733                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2734                         blocksize = BTRFS_STRIPE_LEN;
2735                 else
2736                         blocksize = sctx->fs_info->nodesize;
2737                 spin_lock(&sctx->stat_lock);
2738                 sctx->stat.tree_extents_scrubbed++;
2739                 sctx->stat.tree_bytes_scrubbed += len;
2740                 spin_unlock(&sctx->stat_lock);
2741         } else {
2742                 blocksize = sctx->fs_info->sectorsize;
2743                 WARN_ON(1);
2744         }
2745
2746         /*
2747          * For dev-replace case, we can have @dev being a missing device.
2748          * Regular scrub will avoid its execution on missing device at all,
2749          * as that would trigger tons of read error.
2750          *
2751          * Reading from missing device will cause read error counts to
2752          * increase unnecessarily.
2753          * So here we change the read source to a good mirror.
2754          */
2755         if (sctx->is_dev_replace && !dev->bdev)
2756                 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2757                                      &src_dev, &src_mirror);
2758         while (len) {
2759                 u32 l = min(len, blocksize);
2760                 int have_csum = 0;
2761
2762                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2763                         /* push csums to sbio */
2764                         have_csum = scrub_find_csum(sctx, logical, csum);
2765                         if (have_csum == 0)
2766                                 ++sctx->stat.no_csum;
2767                 }
2768                 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2769                                     flags, gen, src_mirror,
2770                                     have_csum ? csum : NULL, physical);
2771                 if (ret)
2772                         return ret;
2773                 len -= l;
2774                 logical += l;
2775                 physical += l;
2776                 src_physical += l;
2777         }
2778         return 0;
2779 }
2780
2781 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2782                                   u64 logical, u32 len,
2783                                   u64 physical, struct btrfs_device *dev,
2784                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2785 {
2786         struct scrub_ctx *sctx = sparity->sctx;
2787         struct scrub_block *sblock;
2788         const u32 sectorsize = sctx->fs_info->sectorsize;
2789         int index;
2790
2791         ASSERT(IS_ALIGNED(len, sectorsize));
2792
2793         sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
2794         if (!sblock) {
2795                 spin_lock(&sctx->stat_lock);
2796                 sctx->stat.malloc_errors++;
2797                 spin_unlock(&sctx->stat_lock);
2798                 return -ENOMEM;
2799         }
2800
2801         sblock->sparity = sparity;
2802         scrub_parity_get(sparity);
2803
2804         for (index = 0; len > 0; index++) {
2805                 struct scrub_sector *sector;
2806
2807                 sector = alloc_scrub_sector(sblock, logical);
2808                 if (!sector) {
2809                         spin_lock(&sctx->stat_lock);
2810                         sctx->stat.malloc_errors++;
2811                         spin_unlock(&sctx->stat_lock);
2812                         scrub_block_put(sblock);
2813                         return -ENOMEM;
2814                 }
2815                 sblock->sectors[index] = sector;
2816                 /* For scrub parity */
2817                 scrub_sector_get(sector);
2818                 list_add_tail(&sector->list, &sparity->sectors_list);
2819                 sector->flags = flags;
2820                 sector->generation = gen;
2821                 if (csum) {
2822                         sector->have_csum = 1;
2823                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2824                 } else {
2825                         sector->have_csum = 0;
2826                 }
2827
2828                 /* Iterate over the stripe range in sectorsize steps */
2829                 len -= sectorsize;
2830                 logical += sectorsize;
2831                 physical += sectorsize;
2832         }
2833
2834         WARN_ON(sblock->sector_count == 0);
2835         for (index = 0; index < sblock->sector_count; index++) {
2836                 struct scrub_sector *sector = sblock->sectors[index];
2837                 int ret;
2838
2839                 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2840                 if (ret) {
2841                         scrub_block_put(sblock);
2842                         return ret;
2843                 }
2844         }
2845
2846         /* Last one frees, either here or in bio completion for last sector */
2847         scrub_block_put(sblock);
2848         return 0;
2849 }
2850
2851 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2852                                    u64 logical, u32 len,
2853                                    u64 physical, struct btrfs_device *dev,
2854                                    u64 flags, u64 gen, int mirror_num)
2855 {
2856         struct scrub_ctx *sctx = sparity->sctx;
2857         int ret;
2858         u8 csum[BTRFS_CSUM_SIZE];
2859         u32 blocksize;
2860
2861         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2862                 scrub_parity_mark_sectors_error(sparity, logical, len);
2863                 return 0;
2864         }
2865
2866         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2867                 blocksize = sparity->stripe_len;
2868         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2869                 blocksize = sparity->stripe_len;
2870         } else {
2871                 blocksize = sctx->fs_info->sectorsize;
2872                 WARN_ON(1);
2873         }
2874
2875         while (len) {
2876                 u32 l = min(len, blocksize);
2877                 int have_csum = 0;
2878
2879                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2880                         /* push csums to sbio */
2881                         have_csum = scrub_find_csum(sctx, logical, csum);
2882                         if (have_csum == 0)
2883                                 goto skip;
2884                 }
2885                 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2886                                              flags, gen, mirror_num,
2887                                              have_csum ? csum : NULL);
2888                 if (ret)
2889                         return ret;
2890 skip:
2891                 len -= l;
2892                 logical += l;
2893                 physical += l;
2894         }
2895         return 0;
2896 }
2897
2898 /*
2899  * Given a physical address, this will calculate it's
2900  * logical offset. if this is a parity stripe, it will return
2901  * the most left data stripe's logical offset.
2902  *
2903  * return 0 if it is a data stripe, 1 means parity stripe.
2904  */
2905 static int get_raid56_logic_offset(u64 physical, int num,
2906                                    struct map_lookup *map, u64 *offset,
2907                                    u64 *stripe_start)
2908 {
2909         int i;
2910         int j = 0;
2911         u64 last_offset;
2912         const int data_stripes = nr_data_stripes(map);
2913
2914         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2915         if (stripe_start)
2916                 *stripe_start = last_offset;
2917
2918         *offset = last_offset;
2919         for (i = 0; i < data_stripes; i++) {
2920                 u32 stripe_nr;
2921                 u32 stripe_index;
2922                 u32 rot;
2923
2924                 *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT);
2925
2926                 stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;
2927
2928                 /* Work out the disk rotation on this stripe-set */
2929                 rot = stripe_nr % map->num_stripes;
2930                 stripe_nr /= map->num_stripes;
2931                 /* calculate which stripe this data locates */
2932                 rot += i;
2933                 stripe_index = rot % map->num_stripes;
2934                 if (stripe_index == num)
2935                         return 0;
2936                 if (stripe_index < num)
2937                         j++;
2938         }
2939         *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT);
2940         return 1;
2941 }
2942
2943 static void scrub_free_parity(struct scrub_parity *sparity)
2944 {
2945         struct scrub_ctx *sctx = sparity->sctx;
2946         struct scrub_sector *curr, *next;
2947         int nbits;
2948
2949         nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2950         if (nbits) {
2951                 spin_lock(&sctx->stat_lock);
2952                 sctx->stat.read_errors += nbits;
2953                 sctx->stat.uncorrectable_errors += nbits;
2954                 spin_unlock(&sctx->stat_lock);
2955         }
2956
2957         list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2958                 list_del_init(&curr->list);
2959                 scrub_sector_put(curr);
2960         }
2961
2962         kfree(sparity);
2963 }
2964
2965 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2966 {
2967         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2968                                                     work);
2969         struct scrub_ctx *sctx = sparity->sctx;
2970
2971         btrfs_bio_counter_dec(sctx->fs_info);
2972         scrub_free_parity(sparity);
2973         scrub_pending_bio_dec(sctx);
2974 }
2975
2976 static void scrub_parity_bio_endio(struct bio *bio)
2977 {
2978         struct scrub_parity *sparity = bio->bi_private;
2979         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2980
2981         if (bio->bi_status)
2982                 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2983                           &sparity->dbitmap, sparity->nsectors);
2984
2985         bio_put(bio);
2986
2987         INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2988         queue_work(fs_info->scrub_parity_workers, &sparity->work);
2989 }
2990
2991 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2992 {
2993         struct scrub_ctx *sctx = sparity->sctx;
2994         struct btrfs_fs_info *fs_info = sctx->fs_info;
2995         struct bio *bio;
2996         struct btrfs_raid_bio *rbio;
2997         struct btrfs_io_context *bioc = NULL;
2998         u64 length;
2999         int ret;
3000
3001         if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
3002                            &sparity->ebitmap, sparity->nsectors))
3003                 goto out;
3004
3005         length = sparity->logic_end - sparity->logic_start;
3006
3007         btrfs_bio_counter_inc_blocked(fs_info);
3008         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
3009                                &length, &bioc);
3010         if (ret || !bioc || !bioc->raid_map)
3011                 goto bioc_out;
3012
3013         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
3014         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3015         bio->bi_private = sparity;
3016         bio->bi_end_io = scrub_parity_bio_endio;
3017
3018         rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
3019                                               sparity->scrub_dev,
3020                                               &sparity->dbitmap,
3021                                               sparity->nsectors);
3022         btrfs_put_bioc(bioc);
3023         if (!rbio)
3024                 goto rbio_out;
3025
3026         scrub_pending_bio_inc(sctx);
3027         raid56_parity_submit_scrub_rbio(rbio);
3028         return;
3029
3030 rbio_out:
3031         bio_put(bio);
3032 bioc_out:
3033         btrfs_bio_counter_dec(fs_info);
3034         bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
3035                   sparity->nsectors);
3036         spin_lock(&sctx->stat_lock);
3037         sctx->stat.malloc_errors++;
3038         spin_unlock(&sctx->stat_lock);
3039 out:
3040         scrub_free_parity(sparity);
3041 }
3042
3043 static void scrub_parity_get(struct scrub_parity *sparity)
3044 {
3045         refcount_inc(&sparity->refs);
3046 }
3047
3048 static void scrub_parity_put(struct scrub_parity *sparity)
3049 {
3050         if (!refcount_dec_and_test(&sparity->refs))
3051                 return;
3052
3053         scrub_parity_check_and_repair(sparity);
3054 }
3055
3056 /*
3057  * Return 0 if the extent item range covers any byte of the range.
3058  * Return <0 if the extent item is before @search_start.
3059  * Return >0 if the extent item is after @start_start + @search_len.
3060  */
3061 static int compare_extent_item_range(struct btrfs_path *path,
3062                                      u64 search_start, u64 search_len)
3063 {
3064         struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
3065         u64 len;
3066         struct btrfs_key key;
3067
3068         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3069         ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
3070                key.type == BTRFS_METADATA_ITEM_KEY);
3071         if (key.type == BTRFS_METADATA_ITEM_KEY)
3072                 len = fs_info->nodesize;
3073         else
3074                 len = key.offset;
3075
3076         if (key.objectid + len <= search_start)
3077                 return -1;
3078         if (key.objectid >= search_start + search_len)
3079                 return 1;
3080         return 0;
3081 }
3082
3083 /*
3084  * Locate one extent item which covers any byte in range
3085  * [@search_start, @search_start + @search_length)
3086  *
3087  * If the path is not initialized, we will initialize the search by doing
3088  * a btrfs_search_slot().
3089  * If the path is already initialized, we will use the path as the initial
3090  * slot, to avoid duplicated btrfs_search_slot() calls.
3091  *
3092  * NOTE: If an extent item starts before @search_start, we will still
3093  * return the extent item. This is for data extent crossing stripe boundary.
3094  *
3095  * Return 0 if we found such extent item, and @path will point to the extent item.
3096  * Return >0 if no such extent item can be found, and @path will be released.
3097  * Return <0 if hit fatal error, and @path will be released.
3098  */
3099 static int find_first_extent_item(struct btrfs_root *extent_root,
3100                                   struct btrfs_path *path,
3101                                   u64 search_start, u64 search_len)
3102 {
3103         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3104         struct btrfs_key key;
3105         int ret;
3106
3107         /* Continue using the existing path */
3108         if (path->nodes[0])
3109                 goto search_forward;
3110
3111         if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3112                 key.type = BTRFS_METADATA_ITEM_KEY;
3113         else
3114                 key.type = BTRFS_EXTENT_ITEM_KEY;
3115         key.objectid = search_start;
3116         key.offset = (u64)-1;
3117
3118         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3119         if (ret < 0)
3120                 return ret;
3121
3122         ASSERT(ret > 0);
3123         /*
3124          * Here we intentionally pass 0 as @min_objectid, as there could be
3125          * an extent item starting before @search_start.
3126          */
3127         ret = btrfs_previous_extent_item(extent_root, path, 0);
3128         if (ret < 0)
3129                 return ret;
3130         /*
3131          * No matter whether we have found an extent item, the next loop will
3132          * properly do every check on the key.
3133          */
3134 search_forward:
3135         while (true) {
3136                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3137                 if (key.objectid >= search_start + search_len)
3138                         break;
3139                 if (key.type != BTRFS_METADATA_ITEM_KEY &&
3140                     key.type != BTRFS_EXTENT_ITEM_KEY)
3141                         goto next;
3142
3143                 ret = compare_extent_item_range(path, search_start, search_len);
3144                 if (ret == 0)
3145                         return ret;
3146                 if (ret > 0)
3147                         break;
3148 next:
3149                 path->slots[0]++;
3150                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
3151                         ret = btrfs_next_leaf(extent_root, path);
3152                         if (ret) {
3153                                 /* Either no more item or fatal error */
3154                                 btrfs_release_path(path);
3155                                 return ret;
3156                         }
3157                 }
3158         }
3159         btrfs_release_path(path);
3160         return 1;
3161 }
3162
3163 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
3164                             u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
3165 {
3166         struct btrfs_key key;
3167         struct btrfs_extent_item *ei;
3168
3169         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3170         ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
3171                key.type == BTRFS_EXTENT_ITEM_KEY);
3172         *extent_start_ret = key.objectid;
3173         if (key.type == BTRFS_METADATA_ITEM_KEY)
3174                 *size_ret = path->nodes[0]->fs_info->nodesize;
3175         else
3176                 *size_ret = key.offset;
3177         ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
3178         *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
3179         *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
3180 }
3181
/*
 * Check whether the extent [@extent_start, @extent_start + @extent_len)
 * straddles either edge of the range
 * [@boundary_start, @boundary_start + @boudary_len).
 *
 * Return true if the extent starts before one of the two edges and ends
 * after that same edge, false otherwise (fully inside or fully outside).
 */
static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
				      u64 boundary_start, u64 boudary_len)
{
	const u64 extent_end = extent_start + extent_len;
	const u64 boundary_end = boundary_start + boudary_len;

	/* Straddles the start edge */
	if (extent_start < boundary_start && extent_end > boundary_start)
		return true;
	/* Straddles the end edge */
	if (extent_start < boundary_end && extent_end > boundary_end)
		return true;
	return false;
}
3190
3191 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
3192                                                struct scrub_parity *sparity,
3193                                                struct map_lookup *map,
3194                                                struct btrfs_device *sdev,
3195                                                struct btrfs_path *path,
3196                                                u64 logical)
3197 {
3198         struct btrfs_fs_info *fs_info = sctx->fs_info;
3199         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3200         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3201         u64 cur_logical = logical;
3202         int ret;
3203
3204         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3205
3206         /* Path must not be populated */
3207         ASSERT(!path->nodes[0]);
3208
3209         while (cur_logical < logical + BTRFS_STRIPE_LEN) {
3210                 struct btrfs_io_context *bioc = NULL;
3211                 struct btrfs_device *extent_dev;
3212                 u64 extent_start;
3213                 u64 extent_size;
3214                 u64 mapped_length;
3215                 u64 extent_flags;
3216                 u64 extent_gen;
3217                 u64 extent_physical;
3218                 u64 extent_mirror_num;
3219
3220                 ret = find_first_extent_item(extent_root, path, cur_logical,
3221                                              logical + BTRFS_STRIPE_LEN - cur_logical);
3222                 /* No more extent item in this data stripe */
3223                 if (ret > 0) {
3224                         ret = 0;
3225                         break;
3226                 }
3227                 if (ret < 0)
3228                         break;
3229                 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3230                                 &extent_gen);
3231
3232                 /* Metadata should not cross stripe boundaries */
3233                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3234                     does_range_cross_boundary(extent_start, extent_size,
3235                                               logical, BTRFS_STRIPE_LEN)) {
3236                         btrfs_err(fs_info,
3237         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3238                                   extent_start, logical);
3239                         spin_lock(&sctx->stat_lock);
3240                         sctx->stat.uncorrectable_errors++;
3241                         spin_unlock(&sctx->stat_lock);
3242                         cur_logical += extent_size;
3243                         continue;
3244                 }
3245
3246                 /* Skip hole range which doesn't have any extent */
3247                 cur_logical = max(extent_start, cur_logical);
3248
3249                 /* Truncate the range inside this data stripe */
3250                 extent_size = min(extent_start + extent_size,
3251                                   logical + BTRFS_STRIPE_LEN) - cur_logical;
3252                 extent_start = cur_logical;
3253                 ASSERT(extent_size <= U32_MAX);
3254
3255                 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3256
3257                 mapped_length = extent_size;
3258                 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3259                                       &mapped_length, &bioc, 0);
3260                 if (!ret && (!bioc || mapped_length < extent_size))
3261                         ret = -EIO;
3262                 if (ret) {
3263                         btrfs_put_bioc(bioc);
3264                         scrub_parity_mark_sectors_error(sparity, extent_start,
3265                                                         extent_size);
3266                         break;
3267                 }
3268                 extent_physical = bioc->stripes[0].physical;
3269                 extent_mirror_num = bioc->mirror_num;
3270                 extent_dev = bioc->stripes[0].dev;
3271                 btrfs_put_bioc(bioc);
3272
3273                 ret = btrfs_lookup_csums_list(csum_root, extent_start,
3274                                               extent_start + extent_size - 1,
3275                                               &sctx->csum_list, 1, false);
3276                 if (ret) {
3277                         scrub_parity_mark_sectors_error(sparity, extent_start,
3278                                                         extent_size);
3279                         break;
3280                 }
3281
3282                 ret = scrub_extent_for_parity(sparity, extent_start,
3283                                               extent_size, extent_physical,
3284                                               extent_dev, extent_flags,
3285                                               extent_gen, extent_mirror_num);
3286                 scrub_free_csums(sctx);
3287
3288                 if (ret) {
3289                         scrub_parity_mark_sectors_error(sparity, extent_start,
3290                                                         extent_size);
3291                         break;
3292                 }
3293
3294                 cond_resched();
3295                 cur_logical += extent_size;
3296         }
3297         btrfs_release_path(path);
3298         return ret;
3299 }
3300
3301 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3302                                                   struct map_lookup *map,
3303                                                   struct btrfs_device *sdev,
3304                                                   u64 logic_start,
3305                                                   u64 logic_end)
3306 {
3307         struct btrfs_fs_info *fs_info = sctx->fs_info;
3308         struct btrfs_path *path;
3309         u64 cur_logical;
3310         int ret;
3311         struct scrub_parity *sparity;
3312         int nsectors;
3313
3314         path = btrfs_alloc_path();
3315         if (!path) {
3316                 spin_lock(&sctx->stat_lock);
3317                 sctx->stat.malloc_errors++;
3318                 spin_unlock(&sctx->stat_lock);
3319                 return -ENOMEM;
3320         }
3321         path->search_commit_root = 1;
3322         path->skip_locking = 1;
3323
3324         nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
3325         ASSERT(nsectors <= BITS_PER_LONG);
3326         sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3327         if (!sparity) {
3328                 spin_lock(&sctx->stat_lock);
3329                 sctx->stat.malloc_errors++;
3330                 spin_unlock(&sctx->stat_lock);
3331                 btrfs_free_path(path);
3332                 return -ENOMEM;
3333         }
3334
3335         sparity->stripe_len = BTRFS_STRIPE_LEN;
3336         sparity->nsectors = nsectors;
3337         sparity->sctx = sctx;
3338         sparity->scrub_dev = sdev;
3339         sparity->logic_start = logic_start;
3340         sparity->logic_end = logic_end;
3341         refcount_set(&sparity->refs, 1);
3342         INIT_LIST_HEAD(&sparity->sectors_list);
3343
3344         ret = 0;
3345         for (cur_logical = logic_start; cur_logical < logic_end;
3346              cur_logical += BTRFS_STRIPE_LEN) {
3347                 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3348                                                           sdev, path, cur_logical);
3349                 if (ret < 0)
3350                         break;
3351         }
3352
3353         scrub_parity_put(sparity);
3354         scrub_submit(sctx);
3355         mutex_lock(&sctx->wr_lock);
3356         scrub_wr_submit(sctx);
3357         mutex_unlock(&sctx->wr_lock);
3358
3359         btrfs_free_path(path);
3360         return ret < 0 ? ret : 0;
3361 }
3362
3363 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3364 {
3365         if (!btrfs_is_zoned(sctx->fs_info))
3366                 return;
3367
3368         sctx->flush_all_writes = true;
3369         scrub_submit(sctx);
3370         mutex_lock(&sctx->wr_lock);
3371         scrub_wr_submit(sctx);
3372         mutex_unlock(&sctx->wr_lock);
3373
3374         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3375 }
3376
3377 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3378                                         u64 physical, u64 physical_end)
3379 {
3380         struct btrfs_fs_info *fs_info = sctx->fs_info;
3381         int ret = 0;
3382
3383         if (!btrfs_is_zoned(fs_info))
3384                 return 0;
3385
3386         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3387
3388         mutex_lock(&sctx->wr_lock);
3389         if (sctx->write_pointer < physical_end) {
3390                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3391                                                     physical,
3392                                                     sctx->write_pointer);
3393                 if (ret)
3394                         btrfs_err(fs_info,
3395                                   "zoned: failed to recover write pointer");
3396         }
3397         mutex_unlock(&sctx->wr_lock);
3398         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3399
3400         return ret;
3401 }
3402
3403 /*
3404  * Scrub one range which can only has simple mirror based profile.
3405  * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3406  *  RAID0/RAID10).
3407  *
3408  * Since we may need to handle a subset of block group, we need @logical_start
3409  * and @logical_length parameter.
3410  */
3411 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3412                                struct btrfs_root *extent_root,
3413                                struct btrfs_root *csum_root,
3414                                struct btrfs_block_group *bg,
3415                                struct map_lookup *map,
3416                                u64 logical_start, u64 logical_length,
3417                                struct btrfs_device *device,
3418                                u64 physical, int mirror_num)
3419 {
3420         struct btrfs_fs_info *fs_info = sctx->fs_info;
3421         const u64 logical_end = logical_start + logical_length;
3422         /* An artificial limit, inherit from old scrub behavior */
3423         const u32 max_length = SZ_64K;
3424         struct btrfs_path path = { 0 };
3425         u64 cur_logical = logical_start;
3426         int ret;
3427
3428         /* The range must be inside the bg */
3429         ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3430
3431         path.search_commit_root = 1;
3432         path.skip_locking = 1;
3433         /* Go through each extent items inside the logical range */
3434         while (cur_logical < logical_end) {
3435                 u64 extent_start;
3436                 u64 extent_len;
3437                 u64 extent_flags;
3438                 u64 extent_gen;
3439                 u64 scrub_len;
3440
3441                 /* Canceled? */
3442                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3443                     atomic_read(&sctx->cancel_req)) {
3444                         ret = -ECANCELED;
3445                         break;
3446                 }
3447                 /* Paused? */
3448                 if (atomic_read(&fs_info->scrub_pause_req)) {
3449                         /* Push queued extents */
3450                         sctx->flush_all_writes = true;
3451                         scrub_submit(sctx);
3452                         mutex_lock(&sctx->wr_lock);
3453                         scrub_wr_submit(sctx);
3454                         mutex_unlock(&sctx->wr_lock);
3455                         wait_event(sctx->list_wait,
3456                                    atomic_read(&sctx->bios_in_flight) == 0);
3457                         sctx->flush_all_writes = false;
3458                         scrub_blocked_if_needed(fs_info);
3459                 }
3460                 /* Block group removed? */
3461                 spin_lock(&bg->lock);
3462                 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
3463                         spin_unlock(&bg->lock);
3464                         ret = 0;
3465                         break;
3466                 }
3467                 spin_unlock(&bg->lock);
3468
3469                 ret = find_first_extent_item(extent_root, &path, cur_logical,
3470                                              logical_end - cur_logical);
3471                 if (ret > 0) {
3472                         /* No more extent, just update the accounting */
3473                         sctx->stat.last_physical = physical + logical_length;
3474                         ret = 0;
3475                         break;
3476                 }
3477                 if (ret < 0)
3478                         break;
3479                 get_extent_info(&path, &extent_start, &extent_len,
3480                                 &extent_flags, &extent_gen);
3481                 /* Skip hole range which doesn't have any extent */
3482                 cur_logical = max(extent_start, cur_logical);
3483
3484                 /*
3485                  * Scrub len has three limits:
3486                  * - Extent size limit
3487                  * - Scrub range limit
3488                  *   This is especially imporatant for RAID0/RAID10 to reuse
3489                  *   this function
3490                  * - Max scrub size limit
3491                  */
3492                 scrub_len = min(min(extent_start + extent_len,
3493                                     logical_end), cur_logical + max_length) -
3494                             cur_logical;
3495
3496                 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3497                         ret = btrfs_lookup_csums_list(csum_root, cur_logical,
3498                                         cur_logical + scrub_len - 1,
3499                                         &sctx->csum_list, 1, false);
3500                         if (ret)
3501                                 break;
3502                 }
3503                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3504                     does_range_cross_boundary(extent_start, extent_len,
3505                                               logical_start, logical_length)) {
3506                         btrfs_err(fs_info,
3507 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3508                                   extent_start, logical_start, logical_end);
3509                         spin_lock(&sctx->stat_lock);
3510                         sctx->stat.uncorrectable_errors++;
3511                         spin_unlock(&sctx->stat_lock);
3512                         cur_logical += scrub_len;
3513                         continue;
3514                 }
3515                 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3516                                    cur_logical - logical_start + physical,
3517                                    device, extent_flags, extent_gen,
3518                                    mirror_num);
3519                 scrub_free_csums(sctx);
3520                 if (ret)
3521                         break;
3522                 if (sctx->is_dev_replace)
3523                         sync_replace_for_zoned(sctx);
3524                 cur_logical += scrub_len;
3525                 /* Don't hold CPU for too long time */
3526                 cond_resched();
3527         }
3528         btrfs_release_path(&path);
3529         return ret;
3530 }
3531
3532 /* Calculate the full stripe length for simple stripe based profiles */
3533 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3534 {
3535         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3536                             BTRFS_BLOCK_GROUP_RAID10));
3537
3538         return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
3539 }
3540
3541 /* Get the logical bytenr for the stripe */
3542 static u64 simple_stripe_get_logical(struct map_lookup *map,
3543                                      struct btrfs_block_group *bg,
3544                                      int stripe_index)
3545 {
3546         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3547                             BTRFS_BLOCK_GROUP_RAID10));
3548         ASSERT(stripe_index < map->num_stripes);
3549
3550         /*
3551          * (stripe_index / sub_stripes) gives how many data stripes we need to
3552          * skip.
3553          */
3554         return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) +
3555                bg->start;
3556 }
3557
3558 /* Get the mirror number for the stripe */
3559 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3560 {
3561         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3562                             BTRFS_BLOCK_GROUP_RAID10));
3563         ASSERT(stripe_index < map->num_stripes);
3564
3565         /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
3566         return stripe_index % map->sub_stripes + 1;
3567 }
3568
3569 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3570                                struct btrfs_root *extent_root,
3571                                struct btrfs_root *csum_root,
3572                                struct btrfs_block_group *bg,
3573                                struct map_lookup *map,
3574                                struct btrfs_device *device,
3575                                int stripe_index)
3576 {
3577         const u64 logical_increment = simple_stripe_full_stripe_len(map);
3578         const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3579         const u64 orig_physical = map->stripes[stripe_index].physical;
3580         const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3581         u64 cur_logical = orig_logical;
3582         u64 cur_physical = orig_physical;
3583         int ret = 0;
3584
3585         while (cur_logical < bg->start + bg->length) {
3586                 /*
3587                  * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3588                  * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3589                  * this stripe.
3590                  */
3591                 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3592                                           cur_logical, BTRFS_STRIPE_LEN, device,
3593                                           cur_physical, mirror_num);
3594                 if (ret)
3595                         return ret;
3596                 /* Skip to next stripe which belongs to the target device */
3597                 cur_logical += logical_increment;
3598                 /* For physical offset, we just go to next stripe */
3599                 cur_physical += BTRFS_STRIPE_LEN;
3600         }
3601         return ret;
3602 }
3603
3604 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3605                                            struct btrfs_block_group *bg,
3606                                            struct extent_map *em,
3607                                            struct btrfs_device *scrub_dev,
3608                                            int stripe_index)
3609 {
3610         struct btrfs_path *path;
3611         struct btrfs_fs_info *fs_info = sctx->fs_info;
3612         struct btrfs_root *root;
3613         struct btrfs_root *csum_root;
3614         struct blk_plug plug;
3615         struct map_lookup *map = em->map_lookup;
3616         const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3617         const u64 chunk_logical = bg->start;
3618         int ret;
3619         u64 physical = map->stripes[stripe_index].physical;
3620         const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3621         const u64 physical_end = physical + dev_stripe_len;
3622         u64 logical;
3623         u64 logic_end;
3624         /* The logical increment after finishing one stripe */
3625         u64 increment;
3626         /* Offset inside the chunk */
3627         u64 offset;
3628         u64 stripe_logical;
3629         u64 stripe_end;
3630         int stop_loop = 0;
3631
3632         path = btrfs_alloc_path();
3633         if (!path)
3634                 return -ENOMEM;
3635
3636         /*
3637          * work on commit root. The related disk blocks are static as
3638          * long as COW is applied. This means, it is save to rewrite
3639          * them to repair disk errors without any race conditions
3640          */
3641         path->search_commit_root = 1;
3642         path->skip_locking = 1;
3643         path->reada = READA_FORWARD;
3644
3645         wait_event(sctx->list_wait,
3646                    atomic_read(&sctx->bios_in_flight) == 0);
3647         scrub_blocked_if_needed(fs_info);
3648
3649         root = btrfs_extent_root(fs_info, bg->start);
3650         csum_root = btrfs_csum_root(fs_info, bg->start);
3651
3652         /*
3653          * collect all data csums for the stripe to avoid seeking during
3654          * the scrub. This might currently (crc32) end up to be about 1MB
3655          */
3656         blk_start_plug(&plug);
3657
3658         if (sctx->is_dev_replace &&
3659             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3660                 mutex_lock(&sctx->wr_lock);
3661                 sctx->write_pointer = physical;
3662                 mutex_unlock(&sctx->wr_lock);
3663                 sctx->flush_all_writes = true;
3664         }
3665
3666         /*
3667          * There used to be a big double loop to handle all profiles using the
3668          * same routine, which grows larger and more gross over time.
3669          *
3670          * So here we handle each profile differently, so simpler profiles
3671          * have simpler scrubbing function.
3672          */
3673         if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3674                          BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3675                 /*
3676                  * Above check rules out all complex profile, the remaining
3677                  * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
3678                  * mirrored duplication without stripe.
3679                  *
3680                  * Only @physical and @mirror_num needs to calculated using
3681                  * @stripe_index.
3682                  */
3683                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3684                                 bg->start, bg->length, scrub_dev,
3685                                 map->stripes[stripe_index].physical,
3686                                 stripe_index + 1);
3687                 offset = 0;
3688                 goto out;
3689         }
3690         if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3691                 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3692                                           scrub_dev, stripe_index);
3693                 offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT;
3694                 goto out;
3695         }
3696
3697         /* Only RAID56 goes through the old code */
3698         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3699         ret = 0;
3700
3701         /* Calculate the logical end of the stripe */
3702         get_raid56_logic_offset(physical_end, stripe_index,
3703                                 map, &logic_end, NULL);
3704         logic_end += chunk_logical;
3705
3706         /* Initialize @offset in case we need to go to out: label */
3707         get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3708         increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
3709
3710         /*
3711          * Due to the rotation, for RAID56 it's better to iterate each stripe
3712          * using their physical offset.
3713          */
3714         while (physical < physical_end) {
3715                 ret = get_raid56_logic_offset(physical, stripe_index, map,
3716                                               &logical, &stripe_logical);
3717                 logical += chunk_logical;
3718                 if (ret) {
3719                         /* it is parity strip */
3720                         stripe_logical += chunk_logical;
3721                         stripe_end = stripe_logical + increment;
3722                         ret = scrub_raid56_parity(sctx, map, scrub_dev,
3723                                                   stripe_logical,
3724                                                   stripe_end);
3725                         if (ret)
3726                                 goto out;
3727                         goto next;
3728                 }
3729
3730                 /*
3731                  * Now we're at a data stripe, scrub each extents in the range.
3732                  *
3733                  * At this stage, if we ignore the repair part, inside each data
3734                  * stripe it is no different than SINGLE profile.
3735                  * We can reuse scrub_simple_mirror() here, as the repair part
3736                  * is still based on @mirror_num.
3737                  */
3738                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3739                                           logical, BTRFS_STRIPE_LEN,
3740                                           scrub_dev, physical, 1);
3741                 if (ret < 0)
3742                         goto out;
3743 next:
3744                 logical += increment;
3745                 physical += BTRFS_STRIPE_LEN;
3746                 spin_lock(&sctx->stat_lock);
3747                 if (stop_loop)
3748                         sctx->stat.last_physical =
3749                                 map->stripes[stripe_index].physical + dev_stripe_len;
3750                 else
3751                         sctx->stat.last_physical = physical;
3752                 spin_unlock(&sctx->stat_lock);
3753                 if (stop_loop)
3754                         break;
3755         }
3756 out:
3757         /* push queued extents */
3758         scrub_submit(sctx);
3759         mutex_lock(&sctx->wr_lock);
3760         scrub_wr_submit(sctx);
3761         mutex_unlock(&sctx->wr_lock);
3762
3763         blk_finish_plug(&plug);
3764         btrfs_free_path(path);
3765
3766         if (sctx->is_dev_replace && ret >= 0) {
3767                 int ret2;
3768
3769                 ret2 = sync_write_pointer_for_zoned(sctx,
3770                                 chunk_logical + offset,
3771                                 map->stripes[stripe_index].physical,
3772                                 physical_end);
3773                 if (ret2)
3774                         ret = ret2;
3775         }
3776
3777         return ret < 0 ? ret : 0;
3778 }
3779
3780 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3781                                           struct btrfs_block_group *bg,
3782                                           struct btrfs_device *scrub_dev,
3783                                           u64 dev_offset,
3784                                           u64 dev_extent_len)
3785 {
3786         struct btrfs_fs_info *fs_info = sctx->fs_info;
3787         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3788         struct map_lookup *map;
3789         struct extent_map *em;
3790         int i;
3791         int ret = 0;
3792
3793         read_lock(&map_tree->lock);
3794         em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3795         read_unlock(&map_tree->lock);
3796
3797         if (!em) {
3798                 /*
3799                  * Might have been an unused block group deleted by the cleaner
3800                  * kthread or relocation.
3801                  */
3802                 spin_lock(&bg->lock);
3803                 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
3804                         ret = -EINVAL;
3805                 spin_unlock(&bg->lock);
3806
3807                 return ret;
3808         }
3809         if (em->start != bg->start)
3810                 goto out;
3811         if (em->len < dev_extent_len)
3812                 goto out;
3813
3814         map = em->map_lookup;
3815         for (i = 0; i < map->num_stripes; ++i) {
3816                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3817                     map->stripes[i].physical == dev_offset) {
3818                         ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3819                         if (ret)
3820                                 goto out;
3821                 }
3822         }
3823 out:
3824         free_extent_map(em);
3825
3826         return ret;
3827 }
3828
3829 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3830                                           struct btrfs_block_group *cache)
3831 {
3832         struct btrfs_fs_info *fs_info = cache->fs_info;
3833         struct btrfs_trans_handle *trans;
3834
3835         if (!btrfs_is_zoned(fs_info))
3836                 return 0;
3837
3838         btrfs_wait_block_group_reservations(cache);
3839         btrfs_wait_nocow_writers(cache);
3840         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3841
3842         trans = btrfs_join_transaction(root);
3843         if (IS_ERR(trans))
3844                 return PTR_ERR(trans);
3845         return btrfs_commit_transaction(trans);
3846 }
3847
3848 static noinline_for_stack
3849 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3850                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3851 {
3852         struct btrfs_dev_extent *dev_extent = NULL;
3853         struct btrfs_path *path;
3854         struct btrfs_fs_info *fs_info = sctx->fs_info;
3855         struct btrfs_root *root = fs_info->dev_root;
3856         u64 chunk_offset;
3857         int ret = 0;
3858         int ro_set;
3859         int slot;
3860         struct extent_buffer *l;
3861         struct btrfs_key key;
3862         struct btrfs_key found_key;
3863         struct btrfs_block_group *cache;
3864         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3865
3866         path = btrfs_alloc_path();
3867         if (!path)
3868                 return -ENOMEM;
3869
3870         path->reada = READA_FORWARD;
3871         path->search_commit_root = 1;
3872         path->skip_locking = 1;
3873
3874         key.objectid = scrub_dev->devid;
3875         key.offset = 0ull;
3876         key.type = BTRFS_DEV_EXTENT_KEY;
3877
3878         while (1) {
3879                 u64 dev_extent_len;
3880
3881                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3882                 if (ret < 0)
3883                         break;
3884                 if (ret > 0) {
3885                         if (path->slots[0] >=
3886                             btrfs_header_nritems(path->nodes[0])) {
3887                                 ret = btrfs_next_leaf(root, path);
3888                                 if (ret < 0)
3889                                         break;
3890                                 if (ret > 0) {
3891                                         ret = 0;
3892                                         break;
3893                                 }
3894                         } else {
3895                                 ret = 0;
3896                         }
3897                 }
3898
3899                 l = path->nodes[0];
3900                 slot = path->slots[0];
3901
3902                 btrfs_item_key_to_cpu(l, &found_key, slot);
3903
3904                 if (found_key.objectid != scrub_dev->devid)
3905                         break;
3906
3907                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3908                         break;
3909
3910                 if (found_key.offset >= end)
3911                         break;
3912
3913                 if (found_key.offset < key.offset)
3914                         break;
3915
3916                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3917                 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3918
3919                 if (found_key.offset + dev_extent_len <= start)
3920                         goto skip;
3921
3922                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3923
3924                 /*
3925                  * get a reference on the corresponding block group to prevent
3926                  * the chunk from going away while we scrub it
3927                  */
3928                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3929
3930                 /* some chunks are removed but not committed to disk yet,
3931                  * continue scrubbing */
3932                 if (!cache)
3933                         goto skip;
3934
3935                 ASSERT(cache->start <= chunk_offset);
3936                 /*
3937                  * We are using the commit root to search for device extents, so
3938                  * that means we could have found a device extent item from a
3939                  * block group that was deleted in the current transaction. The
3940                  * logical start offset of the deleted block group, stored at
3941                  * @chunk_offset, might be part of the logical address range of
3942                  * a new block group (which uses different physical extents).
3943                  * In this case btrfs_lookup_block_group() has returned the new
3944                  * block group, and its start address is less than @chunk_offset.
3945                  *
3946                  * We skip such new block groups, because it's pointless to
3947                  * process them, as we won't find their extents because we search
3948                  * for them using the commit root of the extent tree. For a device
3949                  * replace it's also fine to skip it, we won't miss copying them
3950                  * to the target device because we have the write duplication
3951                  * setup through the regular write path (by btrfs_map_block()),
3952                  * and we have committed a transaction when we started the device
3953                  * replace, right after setting up the device replace state.
3954                  */
3955                 if (cache->start < chunk_offset) {
3956                         btrfs_put_block_group(cache);
3957                         goto skip;
3958                 }
3959
3960                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3961                         if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
3962                                 btrfs_put_block_group(cache);
3963                                 goto skip;
3964                         }
3965                 }
3966
3967                 /*
3968                  * Make sure that while we are scrubbing the corresponding block
3969                  * group doesn't get its logical address and its device extents
3970                  * reused for another block group, which can possibly be of a
3971                  * different type and different profile. We do this to prevent
3972                  * false error detections and crashes due to bogus attempts to
3973                  * repair extents.
3974                  */
3975                 spin_lock(&cache->lock);
3976                 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
3977                         spin_unlock(&cache->lock);
3978                         btrfs_put_block_group(cache);
3979                         goto skip;
3980                 }
3981                 btrfs_freeze_block_group(cache);
3982                 spin_unlock(&cache->lock);
3983
3984                 /*
3985                  * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3986                  * to avoid deadlock caused by:
3987                  * btrfs_inc_block_group_ro()
3988                  * -> btrfs_wait_for_commit()
3989                  * -> btrfs_commit_transaction()
3990                  * -> btrfs_scrub_pause()
3991                  */
3992                 scrub_pause_on(fs_info);
3993
3994                 /*
3995                  * Don't do chunk preallocation for scrub.
3996                  *
3997                  * This is especially important for SYSTEM bgs, or we can hit
3998                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3999                  * 1. The only SYSTEM bg is marked RO.
4000                  *    Since SYSTEM bg is small, that's pretty common.
4001                  * 2. New SYSTEM bg will be allocated
4002                  *    Due to regular version will allocate new chunk.
4003                  * 3. New SYSTEM bg is empty and will get cleaned up
4004                  *    Before cleanup really happens, it's marked RO again.
4005                  * 4. Empty SYSTEM bg get scrubbed
4006                  *    We go back to 2.
4007                  *
4008                  * This can easily boost the amount of SYSTEM chunks if cleaner
4009                  * thread can't be triggered fast enough, and use up all space
4010                  * of btrfs_super_block::sys_chunk_array
4011                  *
4012                  * While for dev replace, we need to try our best to mark block
4013                  * group RO, to prevent race between:
4014                  * - Write duplication
4015                  *   Contains latest data
4016                  * - Scrub copy
4017                  *   Contains data from commit tree
4018                  *
4019                  * If target block group is not marked RO, nocow writes can
4020                  * be overwritten by scrub copy, causing data corruption.
4021                  * So for dev-replace, it's not allowed to continue if a block
4022                  * group is not RO.
4023                  */
4024                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
4025                 if (!ret && sctx->is_dev_replace) {
4026                         ret = finish_extent_writes_for_zoned(root, cache);
4027                         if (ret) {
4028                                 btrfs_dec_block_group_ro(cache);
4029                                 scrub_pause_off(fs_info);
4030                                 btrfs_put_block_group(cache);
4031                                 break;
4032                         }
4033                 }
4034
4035                 if (ret == 0) {
4036                         ro_set = 1;
4037                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
4038                         /*
4039                          * btrfs_inc_block_group_ro return -ENOSPC when it
4040                          * failed in creating new chunk for metadata.
4041                          * It is not a problem for scrub, because
4042                          * metadata are always cowed, and our scrub paused
4043                          * commit_transactions.
4044                          */
4045                         ro_set = 0;
4046                 } else if (ret == -ETXTBSY) {
4047                         btrfs_warn(fs_info,
4048                    "skipping scrub of block group %llu due to active swapfile",
4049                                    cache->start);
4050                         scrub_pause_off(fs_info);
4051                         ret = 0;
4052                         goto skip_unfreeze;
4053                 } else {
4054                         btrfs_warn(fs_info,
4055                                    "failed setting block group ro: %d", ret);
4056                         btrfs_unfreeze_block_group(cache);
4057                         btrfs_put_block_group(cache);
4058                         scrub_pause_off(fs_info);
4059                         break;
4060                 }
4061
4062                 /*
4063                  * Now the target block is marked RO, wait for nocow writes to
4064                  * finish before dev-replace.
4065                  * COW is fine, as COW never overwrites extents in commit tree.
4066                  */
4067                 if (sctx->is_dev_replace) {
4068                         btrfs_wait_nocow_writers(cache);
4069                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
4070                                         cache->length);
4071                 }
4072
4073                 scrub_pause_off(fs_info);
4074                 down_write(&dev_replace->rwsem);
4075                 dev_replace->cursor_right = found_key.offset + dev_extent_len;
4076                 dev_replace->cursor_left = found_key.offset;
4077                 dev_replace->item_needs_writeback = 1;
4078                 up_write(&dev_replace->rwsem);
4079
4080                 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
4081                                   dev_extent_len);
4082
4083                 /*
4084                  * flush, submit all pending read and write bios, afterwards
4085                  * wait for them.
4086                  * Note that in the dev replace case, a read request causes
4087                  * write requests that are submitted in the read completion
4088                  * worker. Therefore in the current situation, it is required
4089                  * that all write requests are flushed, so that all read and
4090                  * write requests are really completed when bios_in_flight
4091                  * changes to 0.
4092                  */
4093                 sctx->flush_all_writes = true;
4094                 scrub_submit(sctx);
4095                 mutex_lock(&sctx->wr_lock);
4096                 scrub_wr_submit(sctx);
4097                 mutex_unlock(&sctx->wr_lock);
4098
4099                 wait_event(sctx->list_wait,
4100                            atomic_read(&sctx->bios_in_flight) == 0);
4101
4102                 scrub_pause_on(fs_info);
4103
4104                 /*
4105                  * must be called before we decrease @scrub_paused.
4106                  * make sure we don't block transaction commit while
4107                  * we are waiting pending workers finished.
4108                  */
4109                 wait_event(sctx->list_wait,
4110                            atomic_read(&sctx->workers_pending) == 0);
4111                 sctx->flush_all_writes = false;
4112
4113                 scrub_pause_off(fs_info);
4114
4115                 if (sctx->is_dev_replace &&
4116                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
4117                                                       cache, found_key.offset))
4118                         ro_set = 0;
4119
4120                 down_write(&dev_replace->rwsem);
4121                 dev_replace->cursor_left = dev_replace->cursor_right;
4122                 dev_replace->item_needs_writeback = 1;
4123                 up_write(&dev_replace->rwsem);
4124
4125                 if (ro_set)
4126                         btrfs_dec_block_group_ro(cache);
4127
4128                 /*
4129                  * We might have prevented the cleaner kthread from deleting
4130                  * this block group if it was already unused because we raced
4131                  * and set it to RO mode first. So add it back to the unused
4132                  * list, otherwise it might not ever be deleted unless a manual
4133                  * balance is triggered or it becomes used and unused again.
4134                  */
4135                 spin_lock(&cache->lock);
4136                 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
4137                     !cache->ro && cache->reserved == 0 && cache->used == 0) {
4138                         spin_unlock(&cache->lock);
4139                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
4140                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
4141                                                          cache);
4142                         else
4143                                 btrfs_mark_bg_unused(cache);
4144                 } else {
4145                         spin_unlock(&cache->lock);
4146                 }
4147 skip_unfreeze:
4148                 btrfs_unfreeze_block_group(cache);
4149                 btrfs_put_block_group(cache);
4150                 if (ret)
4151                         break;
4152                 if (sctx->is_dev_replace &&
4153                     atomic64_read(&dev_replace->num_write_errors) > 0) {
4154                         ret = -EIO;
4155                         break;
4156                 }
4157                 if (sctx->stat.malloc_errors > 0) {
4158                         ret = -ENOMEM;
4159                         break;
4160                 }
4161 skip:
4162                 key.offset = found_key.offset + dev_extent_len;
4163                 btrfs_release_path(path);
4164         }
4165
4166         btrfs_free_path(path);
4167
4168         return ret;
4169 }
4170
4171 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4172                                            struct btrfs_device *scrub_dev)
4173 {
4174         int     i;
4175         u64     bytenr;
4176         u64     gen;
4177         int     ret;
4178         struct btrfs_fs_info *fs_info = sctx->fs_info;
4179
4180         if (BTRFS_FS_ERROR(fs_info))
4181                 return -EROFS;
4182
4183         /* Seed devices of a new filesystem has their own generation. */
4184         if (scrub_dev->fs_devices != fs_info->fs_devices)
4185                 gen = scrub_dev->generation;
4186         else
4187                 gen = fs_info->last_trans_committed;
4188
4189         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4190                 bytenr = btrfs_sb_offset(i);
4191                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4192                     scrub_dev->commit_total_bytes)
4193                         break;
4194                 if (!btrfs_check_super_location(scrub_dev, bytenr))
4195                         continue;
4196
4197                 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4198                                     scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4199                                     NULL, bytenr);
4200                 if (ret)
4201                         return ret;
4202         }
4203         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4204
4205         return 0;
4206 }
4207
4208 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4209 {
4210         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4211                                         &fs_info->scrub_lock)) {
4212                 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4213                 struct workqueue_struct *scrub_wr_comp =
4214                                                 fs_info->scrub_wr_completion_workers;
4215                 struct workqueue_struct *scrub_parity =
4216                                                 fs_info->scrub_parity_workers;
4217
4218                 fs_info->scrub_workers = NULL;
4219                 fs_info->scrub_wr_completion_workers = NULL;
4220                 fs_info->scrub_parity_workers = NULL;
4221                 mutex_unlock(&fs_info->scrub_lock);
4222
4223                 if (scrub_workers)
4224                         destroy_workqueue(scrub_workers);
4225                 if (scrub_wr_comp)
4226                         destroy_workqueue(scrub_wr_comp);
4227                 if (scrub_parity)
4228                         destroy_workqueue(scrub_parity);
4229         }
4230 }
4231
4232 /*
4233  * get a reference count on fs_info->scrub_workers. start worker if necessary
4234  */
4235 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4236                                                 int is_dev_replace)
4237 {
4238         struct workqueue_struct *scrub_workers = NULL;
4239         struct workqueue_struct *scrub_wr_comp = NULL;
4240         struct workqueue_struct *scrub_parity = NULL;
4241         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4242         int max_active = fs_info->thread_pool_size;
4243         int ret = -ENOMEM;
4244
4245         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4246                 return 0;
4247
4248         scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4249                                         is_dev_replace ? 1 : max_active);
4250         if (!scrub_workers)
4251                 goto fail_scrub_workers;
4252
4253         scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4254         if (!scrub_wr_comp)
4255                 goto fail_scrub_wr_completion_workers;
4256
4257         scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4258         if (!scrub_parity)
4259                 goto fail_scrub_parity_workers;
4260
4261         mutex_lock(&fs_info->scrub_lock);
4262         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4263                 ASSERT(fs_info->scrub_workers == NULL &&
4264                        fs_info->scrub_wr_completion_workers == NULL &&
4265                        fs_info->scrub_parity_workers == NULL);
4266                 fs_info->scrub_workers = scrub_workers;
4267                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4268                 fs_info->scrub_parity_workers = scrub_parity;
4269                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4270                 mutex_unlock(&fs_info->scrub_lock);
4271                 return 0;
4272         }
4273         /* Other thread raced in and created the workers for us */
4274         refcount_inc(&fs_info->scrub_workers_refcnt);
4275         mutex_unlock(&fs_info->scrub_lock);
4276
4277         ret = 0;
4278         destroy_workqueue(scrub_parity);
4279 fail_scrub_parity_workers:
4280         destroy_workqueue(scrub_wr_comp);
4281 fail_scrub_wr_completion_workers:
4282         destroy_workqueue(scrub_workers);
4283 fail_scrub_workers:
4284         return ret;
4285 }
4286
4287 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4288                     u64 end, struct btrfs_scrub_progress *progress,
4289                     int readonly, int is_dev_replace)
4290 {
4291         struct btrfs_dev_lookup_args args = { .devid = devid };
4292         struct scrub_ctx *sctx;
4293         int ret;
4294         struct btrfs_device *dev;
4295         unsigned int nofs_flag;
4296         bool need_commit = false;
4297
4298         if (btrfs_fs_closing(fs_info))
4299                 return -EAGAIN;
4300
4301         /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
4302         ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
4303
4304         /*
4305          * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
4306          * value (max nodesize / min sectorsize), thus nodesize should always
4307          * be fine.
4308          */
4309         ASSERT(fs_info->nodesize <=
4310                SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
4311
4312         /* Allocate outside of device_list_mutex */
4313         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4314         if (IS_ERR(sctx))
4315                 return PTR_ERR(sctx);
4316
4317         ret = scrub_workers_get(fs_info, is_dev_replace);
4318         if (ret)
4319                 goto out_free_ctx;
4320
4321         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4322         dev = btrfs_find_device(fs_info->fs_devices, &args);
4323         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4324                      !is_dev_replace)) {
4325                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4326                 ret = -ENODEV;
4327                 goto out;
4328         }
4329
4330         if (!is_dev_replace && !readonly &&
4331             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4332                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4333                 btrfs_err_in_rcu(fs_info,
4334                         "scrub on devid %llu: filesystem on %s is not writable",
4335                                  devid, btrfs_dev_name(dev));
4336                 ret = -EROFS;
4337                 goto out;
4338         }
4339
4340         mutex_lock(&fs_info->scrub_lock);
4341         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4342             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4343                 mutex_unlock(&fs_info->scrub_lock);
4344                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4345                 ret = -EIO;
4346                 goto out;
4347         }
4348
4349         down_read(&fs_info->dev_replace.rwsem);
4350         if (dev->scrub_ctx ||
4351             (!is_dev_replace &&
4352              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4353                 up_read(&fs_info->dev_replace.rwsem);
4354                 mutex_unlock(&fs_info->scrub_lock);
4355                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4356                 ret = -EINPROGRESS;
4357                 goto out;
4358         }
4359         up_read(&fs_info->dev_replace.rwsem);
4360
4361         sctx->readonly = readonly;
4362         dev->scrub_ctx = sctx;
4363         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4364
4365         /*
4366          * checking @scrub_pause_req here, we can avoid
4367          * race between committing transaction and scrubbing.
4368          */
4369         __scrub_blocked_if_needed(fs_info);
4370         atomic_inc(&fs_info->scrubs_running);
4371         mutex_unlock(&fs_info->scrub_lock);
4372
4373         /*
4374          * In order to avoid deadlock with reclaim when there is a transaction
4375          * trying to pause scrub, make sure we use GFP_NOFS for all the
4376          * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
4377          * invoked by our callees. The pausing request is done when the
4378          * transaction commit starts, and it blocks the transaction until scrub
4379          * is paused (done at specific points at scrub_stripe() or right above
4380          * before incrementing fs_info->scrubs_running).
4381          */
4382         nofs_flag = memalloc_nofs_save();
4383         if (!is_dev_replace) {
4384                 u64 old_super_errors;
4385
4386                 spin_lock(&sctx->stat_lock);
4387                 old_super_errors = sctx->stat.super_errors;
4388                 spin_unlock(&sctx->stat_lock);
4389
4390                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4391                 /*
4392                  * by holding device list mutex, we can
4393                  * kick off writing super in log tree sync.
4394                  */
4395                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4396                 ret = scrub_supers(sctx, dev);
4397                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4398
4399                 spin_lock(&sctx->stat_lock);
4400                 /*
4401                  * Super block errors found, but we can not commit transaction
4402                  * at current context, since btrfs_commit_transaction() needs
4403                  * to pause the current running scrub (hold by ourselves).
4404                  */
4405                 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
4406                         need_commit = true;
4407                 spin_unlock(&sctx->stat_lock);
4408         }
4409
4410         if (!ret)
4411                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4412         memalloc_nofs_restore(nofs_flag);
4413
4414         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4415         atomic_dec(&fs_info->scrubs_running);
4416         wake_up(&fs_info->scrub_pause_wait);
4417
4418         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4419
4420         if (progress)
4421                 memcpy(progress, &sctx->stat, sizeof(*progress));
4422
4423         if (!is_dev_replace)
4424                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4425                         ret ? "not finished" : "finished", devid, ret);
4426
4427         mutex_lock(&fs_info->scrub_lock);
4428         dev->scrub_ctx = NULL;
4429         mutex_unlock(&fs_info->scrub_lock);
4430
4431         scrub_workers_put(fs_info);
4432         scrub_put_ctx(sctx);
4433
4434         /*
4435          * We found some super block errors before, now try to force a
4436          * transaction commit, as scrub has finished.
4437          */
4438         if (need_commit) {
4439                 struct btrfs_trans_handle *trans;
4440
4441                 trans = btrfs_start_transaction(fs_info->tree_root, 0);
4442                 if (IS_ERR(trans)) {
4443                         ret = PTR_ERR(trans);
4444                         btrfs_err(fs_info,
4445         "scrub: failed to start transaction to fix super block errors: %d", ret);
4446                         return ret;
4447                 }
4448                 ret = btrfs_commit_transaction(trans);
4449                 if (ret < 0)
4450                         btrfs_err(fs_info,
4451         "scrub: failed to commit transaction to fix super block errors: %d", ret);
4452         }
4453         return ret;
4454 out:
4455         scrub_workers_put(fs_info);
4456 out_free_ctx:
4457         scrub_free_ctx(sctx);
4458
4459         return ret;
4460 }
4461
4462 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4463 {
4464         mutex_lock(&fs_info->scrub_lock);
4465         atomic_inc(&fs_info->scrub_pause_req);
4466         while (atomic_read(&fs_info->scrubs_paused) !=
4467                atomic_read(&fs_info->scrubs_running)) {
4468                 mutex_unlock(&fs_info->scrub_lock);
4469                 wait_event(fs_info->scrub_pause_wait,
4470                            atomic_read(&fs_info->scrubs_paused) ==
4471                            atomic_read(&fs_info->scrubs_running));
4472                 mutex_lock(&fs_info->scrub_lock);
4473         }
4474         mutex_unlock(&fs_info->scrub_lock);
4475 }
4476
4477 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4478 {
4479         atomic_dec(&fs_info->scrub_pause_req);
4480         wake_up(&fs_info->scrub_pause_wait);
4481 }
4482
4483 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4484 {
4485         mutex_lock(&fs_info->scrub_lock);
4486         if (!atomic_read(&fs_info->scrubs_running)) {
4487                 mutex_unlock(&fs_info->scrub_lock);
4488                 return -ENOTCONN;
4489         }
4490
4491         atomic_inc(&fs_info->scrub_cancel_req);
4492         while (atomic_read(&fs_info->scrubs_running)) {
4493                 mutex_unlock(&fs_info->scrub_lock);
4494                 wait_event(fs_info->scrub_pause_wait,
4495                            atomic_read(&fs_info->scrubs_running) == 0);
4496                 mutex_lock(&fs_info->scrub_lock);
4497         }
4498         atomic_dec(&fs_info->scrub_cancel_req);
4499         mutex_unlock(&fs_info->scrub_lock);
4500
4501         return 0;
4502 }
4503
4504 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4505 {
4506         struct btrfs_fs_info *fs_info = dev->fs_info;
4507         struct scrub_ctx *sctx;
4508
4509         mutex_lock(&fs_info->scrub_lock);
4510         sctx = dev->scrub_ctx;
4511         if (!sctx) {
4512                 mutex_unlock(&fs_info->scrub_lock);
4513                 return -ENOTCONN;
4514         }
4515         atomic_inc(&sctx->cancel_req);
4516         while (dev->scrub_ctx) {
4517                 mutex_unlock(&fs_info->scrub_lock);
4518                 wait_event(fs_info->scrub_pause_wait,
4519                            dev->scrub_ctx == NULL);
4520                 mutex_lock(&fs_info->scrub_lock);
4521         }
4522         mutex_unlock(&fs_info->scrub_lock);
4523
4524         return 0;
4525 }
4526
4527 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4528                          struct btrfs_scrub_progress *progress)
4529 {
4530         struct btrfs_dev_lookup_args args = { .devid = devid };
4531         struct btrfs_device *dev;
4532         struct scrub_ctx *sctx = NULL;
4533
4534         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4535         dev = btrfs_find_device(fs_info->fs_devices, &args);
4536         if (dev)
4537                 sctx = dev->scrub_ctx;
4538         if (sctx)
4539                 memcpy(progress, &sctx->stat, sizeof(*progress));
4540         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4541
4542         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4543 }
4544
4545 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4546                                  u64 extent_logical, u32 extent_len,
4547                                  u64 *extent_physical,
4548                                  struct btrfs_device **extent_dev,
4549                                  int *extent_mirror_num)
4550 {
4551         u64 mapped_length;
4552         struct btrfs_io_context *bioc = NULL;
4553         int ret;
4554
4555         mapped_length = extent_len;
4556         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4557                               &mapped_length, &bioc, 0);
4558         if (ret || !bioc || mapped_length < extent_len ||
4559             !bioc->stripes[0].dev->bdev) {
4560                 btrfs_put_bioc(bioc);
4561                 return;
4562         }
4563
4564         *extent_physical = bioc->stripes[0].physical;
4565         *extent_mirror_num = bioc->mirror_num;
4566         *extent_dev = bioc->stripes[0].dev;
4567         btrfs_put_bioc(bioc);
4568 }