fs/btrfs/scrub.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   4  */
   5
   6 #include <linux/blkdev.h>
   7 #include <linux/ratelimit.h>
   8 #include <linux/sched/mm.h>
   9 #include <crypto/hash.h>
  10 #include "ctree.h"
  11 #include "discard.h"
  12 #include "volumes.h"
  13 #include "disk-io.h"
  14 #include "ordered-data.h"
  15 #include "transaction.h"
  16 #include "backref.h"
  17 #include "extent_io.h"
  18 #include "dev-replace.h"
  19 #include "check-integrity.h"
  20 #include "rcu-string.h"
  21 #include "raid56.h"
  22 #include "block-group.h"
  23 #include "zoned.h"
  24 #include "fs.h"
  25 #include "accessors.h"
  26 #include "file-item.h"
  27 #include "scrub.h"
  28
  29 /*
  30  * This is only the first step towards a full-featured scrub. It reads all
  31  * extents and super blocks and verifies the checksums. In case a bad checksum
  32  * is found or the extent cannot be read, good data will be written back if
  33  * any can be found.
  34  *
  35  * Future enhancements:
  36  *  - In case an unrepairable extent is encountered, track which files are
  37  *    affected and report them
  38  *  - track and record media errors, throw out bad devices
  39  *  - add a mode to also read unallocated space
  40  */
  41
  42 struct scrub_block;
  43 struct scrub_ctx;
  44
  45 /*
  46  * The following three values only influence the performance.
  47  *
  48  * The last one configures the number of parallel and outstanding I/O
  49  * operations. The first one configures an upper limit for the number
  50  * of (dynamically allocated) pages that are added to a bio.
  51  */
  52 #define SCRUB_SECTORS_PER_BIO   32      /* 128KiB per bio for 4KiB pages */
  53 #define SCRUB_BIOS_PER_SCTX     64      /* 8MiB per device in flight for 4KiB pages */
  54
  55 /*
  56  * The following value times PAGE_SIZE needs to be large enough to match the
  57  * largest node/leaf/sector size that shall be supported.
  58  */
  59 #define SCRUB_MAX_SECTORS_PER_BLOCK     (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
  60
  61 #define SCRUB_MAX_PAGES                 (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
  62
  63 /*
  64  * Maximum number of mirrors that can be available for all profiles counting
  65  * the target device of dev-replace as one. During an active device replace
  66  * procedure, the target device of the copy operation is a mirror for the
  67  * filesystem data as well that can be used to read data in order to repair
  68  * read errors on other disks.
  69  *
  70  * Current value is derived from RAID1C4 with 4 copies.
  71  */
  72 #define BTRFS_MAX_MIRRORS (4 + 1)
  73
  74 struct scrub_recover {
  75         refcount_t              refs;
  76         struct btrfs_io_context *bioc;
  77         u64                     map_length;
  78 };
  79
  80 struct scrub_sector {
  81         struct scrub_block      *sblock;
  82         struct list_head        list;
  83         u64                     flags;  /* extent flags */
  84         u64                     generation;
  85         /* Offset in bytes to @sblock. */
  86         u32                     offset;
  87         atomic_t                refs;
  88         unsigned int            have_csum:1;
  89         unsigned int            io_error:1;
  90         u8                      csum[BTRFS_CSUM_SIZE];
  91
  92         struct scrub_recover    *recover;
  93 };
  94
  95 struct scrub_bio {
  96         int                     index;
  97         struct scrub_ctx        *sctx;
  98         struct btrfs_device     *dev;
  99         struct bio              *bio;
 100         blk_status_t            status;
 101         u64                     logical;
 102         u64                     physical;
 103         struct scrub_sector     *sectors[SCRUB_SECTORS_PER_BIO];
 104         int                     sector_count;
 105         int                     next_free;
 106         struct work_struct      work;
 107 };
 108
 109 struct scrub_block {
 110         /*
 111          * Each page will have its page::private used to record the logical
 112          * bytenr.
 113          */
 114         struct page             *pages[SCRUB_MAX_PAGES];
 115         struct scrub_sector     *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
 116         struct btrfs_device     *dev;
 117         /* Logical bytenr of the sblock */
 118         u64                     logical;
 119         u64                     physical;
 120         u64                     physical_for_dev_replace;
 121         /* Length of sblock in bytes */
 122         u32                     len;
 123         int                     sector_count;
 124         int                     mirror_num;
 125
 126         atomic_t                outstanding_sectors;
 127         refcount_t              refs; /* free mem on transition to zero */
 128         struct scrub_ctx        *sctx;
 129         struct scrub_parity     *sparity;
 130         struct {
 131                 unsigned int    header_error:1;
 132                 unsigned int    checksum_error:1;
 133                 unsigned int    no_io_error_seen:1;
 134                 unsigned int    generation_error:1; /* also sets header_error */
 135
 136                 /* The following is for the data used to check parity */
 137                 /* It is for the data with checksum */
 138                 unsigned int    data_corrected:1;
 139         };
 140         struct work_struct      work;
 141 };
 142
 143 /* Used for the chunks with parity stripe such RAID5/6 */
 144 struct scrub_parity {
 145         struct scrub_ctx        *sctx;
 146
 147         struct btrfs_device     *scrub_dev;
 148
 149         u64                     logic_start;
 150
 151         u64                     logic_end;
 152
 153         int                     nsectors;
 154
 155         u32                     stripe_len;
 156
 157         refcount_t              refs;
 158
 159         struct list_head        sectors_list;
 160
 161         /* Work of parity check and repair */
 162         struct work_struct      work;
 163
 164         /* Mark the parity blocks which have data */
 165         unsigned long           dbitmap;
 166
 167         /*
 168          * Mark the parity blocks which have data, but errors happen when
 169          * read data or check data
 170          */
 171         unsigned long           ebitmap;
 172 };
 173
 174 struct scrub_ctx {
 175         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
 176         struct btrfs_fs_info    *fs_info;
 177         int                     first_free;
 178         int                     curr;
 179         atomic_t                bios_in_flight;
 180         atomic_t                workers_pending;
 181         spinlock_t              list_lock;
 182         wait_queue_head_t       list_wait;
 183         struct list_head        csum_list;
 184         atomic_t                cancel_req;
 185         int                     readonly;
 186         int                     sectors_per_bio;
 187
 188         /* State of IO submission throttling affecting the associated device */
 189         ktime_t                 throttle_deadline;
 190         u64                     throttle_sent;
 191
 192         int                     is_dev_replace;
 193         u64                     write_pointer;
 194
 195         struct scrub_bio        *wr_curr_bio;
 196         struct mutex            wr_lock;
 197         struct btrfs_device     *wr_tgtdev;
 198         bool                    flush_all_writes;
 199
 200         /*
 201          * statistics
 202          */
 203         struct btrfs_scrub_progress stat;
 204         spinlock_t              stat_lock;
 205
 206         /*
 207          * Use a ref counter to avoid use-after-free issues. Scrub workers
 208          * decrement bios_in_flight and workers_pending and then do a wakeup
 209          * on the list_wait wait queue. We must ensure the main scrub task
 210          * doesn't free the scrub context before or while the workers are
 211          * doing the wakeup() call.
 212          */
 213         refcount_t              refs;
 214 };
 215
 216 struct scrub_warning {
 217         struct btrfs_path       *path;
 218         u64                     extent_item_size;
 219         const char              *errstr;
 220         u64                     physical;
 221         u64                     logical;
 222         struct btrfs_device     *dev;
 223 };
 224
 225 struct full_stripe_lock {
 226         struct rb_node node;
 227         u64 logical;
 228         u64 refs;
 229         struct mutex mutex;
 230 };
 231
 232 #ifndef CONFIG_64BIT
 233 /* This structure is for archtectures whose (void *) is smaller than u64 */
 234 struct scrub_page_private {
 235         u64 logical;
 236 };
 237 #endif
 238
 239 static int attach_scrub_page_private(struct page *page, u64 logical)
 240 {
 241 #ifdef CONFIG_64BIT
 242         attach_page_private(page, (void *)logical);
 243         return 0;
 244 #else
 245         struct scrub_page_private *spp;
 246
 247         spp = kmalloc(sizeof(*spp), GFP_KERNEL);
 248         if (!spp)
 249                 return -ENOMEM;
 250         spp->logical = logical;
 251         attach_page_private(page, (void *)spp);
 252         return 0;
 253 #endif
 254 }
 255
 256 static void detach_scrub_page_private(struct page *page)
 257 {
 258 #ifdef CONFIG_64BIT
 259         detach_page_private(page);
 260         return;
 261 #else
 262         struct scrub_page_private *spp;
 263
 264         spp = detach_page_private(page);
 265         kfree(spp);
 266         return;
 267 #endif
 268 }
 269
 270 static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
 271                                              struct btrfs_device *dev,
 272                                              u64 logical, u64 physical,
 273                                              u64 physical_for_dev_replace,
 274                                              int mirror_num)
 275 {
 276         struct scrub_block *sblock;
 277
 278         sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
 279         if (!sblock)
 280                 return NULL;
 281         refcount_set(&sblock->refs, 1);
 282         sblock->sctx = sctx;
 283         sblock->logical = logical;
 284         sblock->physical = physical;
 285         sblock->physical_for_dev_replace = physical_for_dev_replace;
 286         sblock->dev = dev;
 287         sblock->mirror_num = mirror_num;
 288         sblock->no_io_error_seen = 1;
 289         /*
 290          * Scrub_block::pages will be allocated at alloc_scrub_sector() when
 291          * the corresponding page is not allocated.
 292          */
 293         return sblock;
 294 }
 295
 296 /*
 297  * Allocate a new scrub sector and attach it to @sblock.
 298  *
 299  * Will also allocate new pages for @sblock if needed.
 300  */
 301 static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
 302                                                u64 logical)
 303 {
 304         const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
 305         struct scrub_sector *ssector;
 306
 307         /* We must never have scrub_block exceed U32_MAX in size. */
 308         ASSERT(logical - sblock->logical < U32_MAX);
 309
 310         ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
 311         if (!ssector)
 312                 return NULL;
 313
 314         /* Allocate a new page if the slot is not allocated */
 315         if (!sblock->pages[page_index]) {
 316                 int ret;
 317
 318                 sblock->pages[page_index] = alloc_page(GFP_KERNEL);
 319                 if (!sblock->pages[page_index]) {
 320                         kfree(ssector);
 321                         return NULL;
 322                 }
 323                 ret = attach_scrub_page_private(sblock->pages[page_index],
 324                                 sblock->logical + (page_index << PAGE_SHIFT));
 325                 if (ret < 0) {
 326                         kfree(ssector);
 327                         __free_page(sblock->pages[page_index]);
 328                         sblock->pages[page_index] = NULL;
 329                         return NULL;
 330                 }
 331         }
 332
 333         atomic_set(&ssector->refs, 1);
 334         ssector->sblock = sblock;
 335         /* The sector to be added should not be used */
 336         ASSERT(sblock->sectors[sblock->sector_count] == NULL);
 337         ssector->offset = logical - sblock->logical;
 338
 339         /* The sector count must be smaller than the limit */
 340         ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
 341
 342         sblock->sectors[sblock->sector_count] = ssector;
 343         sblock->sector_count++;
 344         sblock->len += sblock->sctx->fs_info->sectorsize;
 345
 346         return ssector;
 347 }
 348
 349 static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
 350 {
 351         struct scrub_block *sblock = ssector->sblock;
 352         pgoff_t index;
 353         /*
 354          * When calling this function, ssector must be alreaday attached to the
 355          * parent sblock.
 356          */
 357         ASSERT(sblock);
 358
 359         /* The range should be inside the sblock range */
 360         ASSERT(ssector->offset < sblock->len);
 361
 362         index = ssector->offset >> PAGE_SHIFT;
 363         ASSERT(index < SCRUB_MAX_PAGES);
 364         ASSERT(sblock->pages[index]);
 365         ASSERT(PagePrivate(sblock->pages[index]));
 366         return sblock->pages[index];
 367 }
 368
 369 static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
 370 {
 371         struct scrub_block *sblock = ssector->sblock;
 372
 373         /*
 374          * When calling this function, ssector must be already attached to the
 375          * parent sblock.
 376          */
 377         ASSERT(sblock);
 378
 379         /* The range should be inside the sblock range */
 380         ASSERT(ssector->offset < sblock->len);
 381
 382         return offset_in_page(ssector->offset);
 383 }
 384
 385 static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
 386 {
 387         return page_address(scrub_sector_get_page(ssector)) +
 388                scrub_sector_get_page_offset(ssector);
 389 }
 390
 391 static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
 392                                 unsigned int len)
 393 {
 394         return bio_add_page(bio, scrub_sector_get_page(ssector), len,
 395                             scrub_sector_get_page_offset(ssector));
 396 }
 397
 398 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 399                                      struct scrub_block *sblocks_for_recheck[]);
 400 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 401                                 struct scrub_block *sblock,
 402                                 int retry_failed_mirror);
 403 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
 404 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 405                                              struct scrub_block *sblock_good);
 406 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
 407                                             struct scrub_block *sblock_good,
 408                                             int sector_num, int force_write);
 409 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 410 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
 411                                              int sector_num);
 412 static int scrub_checksum_data(struct scrub_block *sblock);
 413 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 414 static int scrub_checksum_super(struct scrub_block *sblock);
 415 static void scrub_block_put(struct scrub_block *sblock);
 416 static void scrub_sector_get(struct scrub_sector *sector);
 417 static void scrub_sector_put(struct scrub_sector *sector);
 418 static void scrub_parity_get(struct scrub_parity *sparity);
 419 static void scrub_parity_put(struct scrub_parity *sparity);
 420 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
 421                          u64 physical, struct btrfs_device *dev, u64 flags,
 422                          u64 gen, int mirror_num, u8 *csum,
 423                          u64 physical_for_dev_replace);
 424 static void scrub_bio_end_io(struct bio *bio);
 425 static void scrub_bio_end_io_worker(struct work_struct *work);
 426 static void scrub_block_complete(struct scrub_block *sblock);
 427 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
 428                                  u64 extent_logical, u32 extent_len,
 429                                  u64 *extent_physical,
 430                                  struct btrfs_device **extent_dev,
 431                                  int *extent_mirror_num);
 432 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
 433                                       struct scrub_sector *sector);
 434 static void scrub_wr_submit(struct scrub_ctx *sctx);
 435 static void scrub_wr_bio_end_io(struct bio *bio);
 436 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
 437 static void scrub_put_ctx(struct scrub_ctx *sctx);
 438
 439 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
 440 {
 441         return sector->recover &&
 442                (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
 443 }
 444
 445 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 446 {
 447         refcount_inc(&sctx->refs);
 448         atomic_inc(&sctx->bios_in_flight);
 449 }
 450
 451 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 452 {
 453         atomic_dec(&sctx->bios_in_flight);
 454         wake_up(&sctx->list_wait);
 455         scrub_put_ctx(sctx);
 456 }
 457
 458 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 459 {
 460         while (atomic_read(&fs_info->scrub_pause_req)) {
 461                 mutex_unlock(&fs_info->scrub_lock);
 462                 wait_event(fs_info->scrub_pause_wait,
 463                    atomic_read(&fs_info->scrub_pause_req) == 0);
 464                 mutex_lock(&fs_info->scrub_lock);
 465         }
 466 }
 467
 468 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
 469 {
 470         atomic_inc(&fs_info->scrubs_paused);
 471         wake_up(&fs_info->scrub_pause_wait);
 472 }
 473
 474 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
 475 {
 476         mutex_lock(&fs_info->scrub_lock);
 477         __scrub_blocked_if_needed(fs_info);
 478         atomic_dec(&fs_info->scrubs_paused);
 479         mutex_unlock(&fs_info->scrub_lock);
 480
 481         wake_up(&fs_info->scrub_pause_wait);
 482 }
 483
 484 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 485 {
 486         scrub_pause_on(fs_info);
 487         scrub_pause_off(fs_info);
 488 }
 489
 490 /*
 491  * Insert new full stripe lock into full stripe locks tree
 492  *
 493  * Return pointer to existing or newly inserted full_stripe_lock structure if
 494  * everything works well.
 495  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 496  *
 497  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 498  * function
 499  */
 500 static struct full_stripe_lock *insert_full_stripe_lock(
 501                 struct btrfs_full_stripe_locks_tree *locks_root,
 502                 u64 fstripe_logical)
 503 {
 504         struct rb_node **p;
 505         struct rb_node *parent = NULL;
 506         struct full_stripe_lock *entry;
 507         struct full_stripe_lock *ret;
 508
 509         lockdep_assert_held(&locks_root->lock);
 510
 511         p = &locks_root->root.rb_node;
 512         while (*p) {
 513                 parent = *p;
 514                 entry = rb_entry(parent, struct full_stripe_lock, node);
 515                 if (fstripe_logical < entry->logical) {
 516                         p = &(*p)->rb_left;
 517                 } else if (fstripe_logical > entry->logical) {
 518                         p = &(*p)->rb_right;
 519                 } else {
 520                         entry->refs++;
 521                         return entry;
 522                 }
 523         }
 524
 525         /*
 526          * Insert new lock.
 527          */
 528         ret = kmalloc(sizeof(*ret), GFP_KERNEL);
 529         if (!ret)
 530                 return ERR_PTR(-ENOMEM);
 531         ret->logical = fstripe_logical;
 532         ret->refs = 1;
 533         mutex_init(&ret->mutex);
 534
 535         rb_link_node(&ret->node, parent, p);
 536         rb_insert_color(&ret->node, &locks_root->root);
 537         return ret;
 538 }
 539
 540 /*
 541  * Search for a full stripe lock of a block group
 542  *
 543  * Return pointer to existing full stripe lock if found
 544  * Return NULL if not found
 545  */
 546 static struct full_stripe_lock *search_full_stripe_lock(
 547                 struct btrfs_full_stripe_locks_tree *locks_root,
 548                 u64 fstripe_logical)
 549 {
 550         struct rb_node *node;
 551         struct full_stripe_lock *entry;
 552
 553         lockdep_assert_held(&locks_root->lock);
 554
 555         node = locks_root->root.rb_node;
 556         while (node) {
 557                 entry = rb_entry(node, struct full_stripe_lock, node);
 558                 if (fstripe_logical < entry->logical)
 559                         node = node->rb_left;
 560                 else if (fstripe_logical > entry->logical)
 561                         node = node->rb_right;
 562                 else
 563                         return entry;
 564         }
 565         return NULL;
 566 }
 567
 568 /*
 569  * Helper to get full stripe logical from a normal bytenr.
 570  *
 571  * Caller must ensure @cache is a RAID56 block group.
 572  */
 573 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
 574 {
 575         u64 ret;
 576
 577         /*
 578          * Due to chunk item size limit, full stripe length should not be
 579          * larger than U32_MAX. Just a sanity check here.
 580          */
 581         WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
 582
 583         /*
 584          * round_down() can only handle power of 2, while RAID56 full
 585          * stripe length can be 64KiB * n, so we need to manually round down.
 586          */
 587         ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
 588                         cache->full_stripe_len + cache->start;
 589         return ret;
 590 }
 591
 592 /*
 593  * Lock a full stripe to avoid concurrency of recovery and read
 594  *
 595  * It's only used for profiles with parities (RAID5/6), for other profiles it
 596  * does nothing.
 597  *
 598  * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
 599  * So caller must call unlock_full_stripe() at the same context.
 600  *
 601  * Return <0 if encounters error.
 602  */
 603 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 604                             bool *locked_ret)
 605 {
 606         struct btrfs_block_group *bg_cache;
 607         struct btrfs_full_stripe_locks_tree *locks_root;
 608         struct full_stripe_lock *existing;
 609         u64 fstripe_start;
 610         int ret = 0;
 611
 612         *locked_ret = false;
 613         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
 614         if (!bg_cache) {
 615                 ASSERT(0);
 616                 return -ENOENT;
 617         }
 618
 619         /* Profiles not based on parity don't need full stripe lock */
 620         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 621                 goto out;
 622         locks_root = &bg_cache->full_stripe_locks_root;
 623
 624         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
 625
 626         /* Now insert the full stripe lock */
 627         mutex_lock(&locks_root->lock);
 628         existing = insert_full_stripe_lock(locks_root, fstripe_start);
 629         mutex_unlock(&locks_root->lock);
 630         if (IS_ERR(existing)) {
 631                 ret = PTR_ERR(existing);
 632                 goto out;
 633         }
 634         mutex_lock(&existing->mutex);
 635         *locked_ret = true;
 636 out:
 637         btrfs_put_block_group(bg_cache);
 638         return ret;
 639 }
 640
 641 /*
 642  * Unlock a full stripe.
 643  *
 644  * NOTE: Caller must ensure it's the same context calling corresponding
 645  * lock_full_stripe().
 646  *
 647  * Return 0 if we unlock full stripe without problem.
 648  * Return <0 for error
 649  */
 650 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 651                               bool locked)
 652 {
 653         struct btrfs_block_group *bg_cache;
 654         struct btrfs_full_stripe_locks_tree *locks_root;
 655         struct full_stripe_lock *fstripe_lock;
 656         u64 fstripe_start;
 657         bool freeit = false;
 658         int ret = 0;
 659
 660         /* If we didn't acquire full stripe lock, no need to continue */
 661         if (!locked)
 662                 return 0;
 663
 664         bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
 665         if (!bg_cache) {
 666                 ASSERT(0);
 667                 return -ENOENT;
 668         }
 669         if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 670                 goto out;
 671
 672         locks_root = &bg_cache->full_stripe_locks_root;
 673         fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
 674
 675         mutex_lock(&locks_root->lock);
 676         fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
 677         /* Unpaired unlock_full_stripe() detected */
 678         if (!fstripe_lock) {
 679                 WARN_ON(1);
 680                 ret = -ENOENT;
 681                 mutex_unlock(&locks_root->lock);
 682                 goto out;
 683         }
 684
 685         if (fstripe_lock->refs == 0) {
 686                 WARN_ON(1);
 687                 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
 688                         fstripe_lock->logical);
 689         } else {
 690                 fstripe_lock->refs--;
 691         }
 692
 693         if (fstripe_lock->refs == 0) {
 694                 rb_erase(&fstripe_lock->node, &locks_root->root);
 695                 freeit = true;
 696         }
 697         mutex_unlock(&locks_root->lock);
 698
 699         mutex_unlock(&fstripe_lock->mutex);
 700         if (freeit)
 701                 kfree(fstripe_lock);
 702 out:
 703         btrfs_put_block_group(bg_cache);
 704         return ret;
 705 }
 706
 707 static void scrub_free_csums(struct scrub_ctx *sctx)
 708 {
 709         while (!list_empty(&sctx->csum_list)) {
 710                 struct btrfs_ordered_sum *sum;
 711                 sum = list_first_entry(&sctx->csum_list,
 712                                        struct btrfs_ordered_sum, list);
 713                 list_del(&sum->list);
 714                 kfree(sum);
 715         }
 716 }
 717
 718 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 719 {
 720         int i;
 721
 722         if (!sctx)
 723                 return;
 724
 725         /* this can happen when scrub is cancelled */
 726         if (sctx->curr != -1) {
 727                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
 728
 729                 for (i = 0; i < sbio->sector_count; i++)
 730                         scrub_block_put(sbio->sectors[i]->sblock);
 731                 bio_put(sbio->bio);
 732         }
 733
 734         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 735                 struct scrub_bio *sbio = sctx->bios[i];
 736
 737                 if (!sbio)
 738                         break;
 739                 kfree(sbio);
 740         }
 741
 742         kfree(sctx->wr_curr_bio);
 743         scrub_free_csums(sctx);
 744         kfree(sctx);
 745 }
 746
 747 static void scrub_put_ctx(struct scrub_ctx *sctx)
 748 {
 749         if (refcount_dec_and_test(&sctx->refs))
 750                 scrub_free_ctx(sctx);
 751 }
 752
 753 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
 754                 struct btrfs_fs_info *fs_info, int is_dev_replace)
 755 {
 756         struct scrub_ctx *sctx;
 757         int             i;
 758
 759         sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
 760         if (!sctx)
 761                 goto nomem;
 762         refcount_set(&sctx->refs, 1);
 763         sctx->is_dev_replace = is_dev_replace;
 764         sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
 765         sctx->curr = -1;
 766         sctx->fs_info = fs_info;
 767         INIT_LIST_HEAD(&sctx->csum_list);
 768         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 769                 struct scrub_bio *sbio;
 770
 771                 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
 772                 if (!sbio)
 773                         goto nomem;
 774                 sctx->bios[i] = sbio;
 775
 776                 sbio->index = i;
 777                 sbio->sctx = sctx;
 778                 sbio->sector_count = 0;
 779                 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
 780
 781                 if (i != SCRUB_BIOS_PER_SCTX - 1)
 782                         sctx->bios[i]->next_free = i + 1;
 783                 else
 784                         sctx->bios[i]->next_free = -1;
 785         }
 786         sctx->first_free = 0;
 787         atomic_set(&sctx->bios_in_flight, 0);
 788         atomic_set(&sctx->workers_pending, 0);
 789         atomic_set(&sctx->cancel_req, 0);
 790
 791         spin_lock_init(&sctx->list_lock);
 792         spin_lock_init(&sctx->stat_lock);
 793         init_waitqueue_head(&sctx->list_wait);
 794         sctx->throttle_deadline = 0;
 795
 796         WARN_ON(sctx->wr_curr_bio != NULL);
 797         mutex_init(&sctx->wr_lock);
 798         sctx->wr_curr_bio = NULL;
 799         if (is_dev_replace) {
 800                 WARN_ON(!fs_info->dev_replace.tgtdev);
 801                 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
 802                 sctx->flush_all_writes = false;
 803         }
 804
 805         return sctx;
 806
 807 nomem:
 808         scrub_free_ctx(sctx);
 809         return ERR_PTR(-ENOMEM);
 810 }
 811
 812 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
 813                                      u64 root, void *warn_ctx)
 814 {
 815         u32 nlink;
 816         int ret;
 817         int i;
 818         unsigned nofs_flag;
 819         struct extent_buffer *eb;
 820         struct btrfs_inode_item *inode_item;
 821         struct scrub_warning *swarn = warn_ctx;
 822         struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
 823         struct inode_fs_paths *ipath = NULL;
 824         struct btrfs_root *local_root;
 825         struct btrfs_key key;
 826
 827         local_root = btrfs_get_fs_root(fs_info, root, true);
 828         if (IS_ERR(local_root)) {
 829                 ret = PTR_ERR(local_root);
 830                 goto err;
 831         }
 832
 833         /*
 834          * this makes the path point to (inum INODE_ITEM ioff)
 835          */
 836         key.objectid = inum;
 837         key.type = BTRFS_INODE_ITEM_KEY;
 838         key.offset = 0;
 839
 840         ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
 841         if (ret) {
 842                 btrfs_put_root(local_root);
 843                 btrfs_release_path(swarn->path);
 844                 goto err;
 845         }
 846
 847         eb = swarn->path->nodes[0];
 848         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 849                                         struct btrfs_inode_item);
 850         nlink = btrfs_inode_nlink(eb, inode_item);
 851         btrfs_release_path(swarn->path);
 852
 853         /*
 854          * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
 855          * uses GFP_NOFS in this context, so we keep it consistent but it does
 856          * not seem to be strictly necessary.
 857          */
 858         nofs_flag = memalloc_nofs_save();
 859         ipath = init_ipath(4096, local_root, swarn->path);
 860         memalloc_nofs_restore(nofs_flag);
 861         if (IS_ERR(ipath)) {
 862                 btrfs_put_root(local_root);
 863                 ret = PTR_ERR(ipath);
 864                 ipath = NULL;
 865                 goto err;
 866         }
 867         ret = paths_from_inode(inum, ipath);
 868
 869         if (ret < 0)
 870                 goto err;
 871
 872         /*
 873          * we deliberately ignore the bit ipath might have been too small to
 874          * hold all of the paths here
 875          */
 876         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 877                 btrfs_warn_in_rcu(fs_info,
 878 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
 879                                   swarn->errstr, swarn->logical,
 880                                   rcu_str_deref(swarn->dev->name),
 881                                   swarn->physical,
 882                                   root, inum, offset,
 883                                   fs_info->sectorsize, nlink,
 884                                   (char *)(unsigned long)ipath->fspath->val[i]);
 885
 886         btrfs_put_root(local_root);
 887         free_ipath(ipath);
 888         return 0;
 889
 890 err:
 891         btrfs_warn_in_rcu(fs_info,
 892                           "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
 893                           swarn->errstr, swarn->logical,
 894                           rcu_str_deref(swarn->dev->name),
 895                           swarn->physical,
 896                           root, inum, offset, ret);
 897
 898         free_ipath(ipath);
 899         return 0;
 900 }
 901
 902 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 903 {
 904         struct btrfs_device *dev;
 905         struct btrfs_fs_info *fs_info;
 906         struct btrfs_path *path;
 907         struct btrfs_key found_key;
 908         struct extent_buffer *eb;
 909         struct btrfs_extent_item *ei;
 910         struct scrub_warning swarn;
 911         unsigned long ptr = 0;
 912         u64 extent_item_pos;
 913         u64 flags = 0;
 914         u64 ref_root;
 915         u32 item_size;
 916         u8 ref_level = 0;
 917         int ret;
 918
 919         WARN_ON(sblock->sector_count < 1);
 920         dev = sblock->dev;
 921         fs_info = sblock->sctx->fs_info;
 922
 923         /* Super block error, no need to search extent tree. */
 924         if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 925                 btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
 926                         errstr, rcu_str_deref(dev->name),
 927                         sblock->physical);
 928                 return;
 929         }
 930         path = btrfs_alloc_path();
 931         if (!path)
 932                 return;
 933
 934         swarn.physical = sblock->physical;
 935         swarn.logical = sblock->logical;
 936         swarn.errstr = errstr;
 937         swarn.dev = NULL;
 938
 939         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 940                                   &flags);
 941         if (ret < 0)
 942                 goto out;
 943
 944         extent_item_pos = swarn.logical - found_key.objectid;
 945         swarn.extent_item_size = found_key.offset;
 946
 947         eb = path->nodes[0];
 948         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 949         item_size = btrfs_item_size(eb, path->slots[0]);
 950
 951         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 952                 do {
 953                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 954                                                       item_size, &ref_root,
 955                                                       &ref_level);
 956                         btrfs_warn_in_rcu(fs_info,
 957 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
 958                                 errstr, swarn.logical,
 959                                 rcu_str_deref(dev->name),
 960                                 swarn.physical,
 961                                 ref_level ? "node" : "leaf",
 962                                 ret < 0 ? -1 : ref_level,
 963                                 ret < 0 ? -1 : ref_root);
 964                 } while (ret != 1);
 965                 btrfs_release_path(path);
 966         } else {
 967                 btrfs_release_path(path);
 968                 swarn.path = path;
 969                 swarn.dev = dev;
 970                 iterate_extent_inodes(fs_info, found_key.objectid,
 971                                         extent_item_pos, 1,
 972                                         scrub_print_warning_inode, &swarn);
 973         }
 974
 975 out:
 976         btrfs_free_path(path);
 977 }
 978
 979 static inline void scrub_get_recover(struct scrub_recover *recover)
 980 {
 981         refcount_inc(&recover->refs);
 982 }
 983
 984 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
 985                                      struct scrub_recover *recover)
 986 {
 987         if (refcount_dec_and_test(&recover->refs)) {
 988                 btrfs_bio_counter_dec(fs_info);
 989                 btrfs_put_bioc(recover->bioc);
 990                 kfree(recover);
 991         }
 992 }
 993
 994 /*
 995  * scrub_handle_errored_block gets called when either verification of the
 996  * sectors failed or the bio failed to read, e.g. with EIO. In the latter
 997  * case, this function handles all sectors in the bio, even though only one
 998  * may be bad.
 999  * The goal of this function is to repair the errored block by using the
1000  * contents of one of the mirrors.
      *
      * Returns 0 in most cases, including when the block could not be
      * repaired: the outcome is recorded in sctx->stat and in the device
      * statistics instead. A negative value is returned only when
      * lock_full_stripe() or unlock_full_stripe() fails.
1001  */
1002 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1003 {
1004         struct scrub_ctx *sctx = sblock_to_check->sctx;
1005         struct btrfs_device *dev = sblock_to_check->dev;
1006         struct btrfs_fs_info *fs_info;
1007         u64 logical;
1008         unsigned int failed_mirror_index;
1009         unsigned int is_metadata;
1010         unsigned int have_csum;
1011         /* One scrub_block for each mirror */
1012         struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
1013         struct scrub_block *sblock_bad;
1014         int ret;
1015         int mirror_index;
1016         int sector_num;
1017         int success;
1018         bool full_stripe_locked;
1019         unsigned int nofs_flag;
             /*
              * Static, hence one state shared by every invocation; it throttles
              * the scrub_print_warning() calls further down.
              */
1020         static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
1021                                       DEFAULT_RATELIMIT_BURST);
1022
1023         BUG_ON(sblock_to_check->sector_count < 1);
1024         fs_info = sctx->fs_info;
1025         if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1026                 /*
1027                  * If we find an error in a super block, we just report it.
1028                  * They will get written with the next transaction commit
1029                  * anyway
1030                  */
1031                 scrub_print_warning("super block error", sblock_to_check);
1032                 spin_lock(&sctx->stat_lock);
1033                 ++sctx->stat.super_errors;
1034                 spin_unlock(&sctx->stat_lock);
1035                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
1036                 return 0;
1037         }
1038         logical = sblock_to_check->logical;
1039         ASSERT(sblock_to_check->mirror_num);
             /* mirror_num is 1-based, the recheck array is 0-based. */
1040         failed_mirror_index = sblock_to_check->mirror_num - 1;
1041         is_metadata = !(sblock_to_check->sectors[0]->flags &
1042                         BTRFS_EXTENT_FLAG_DATA);
1043         have_csum = sblock_to_check->sectors[0]->have_csum;
1044
             /*
              * Outside of dev-replace, a true return from
              * btrfs_repair_one_zone() ends the handling here.
              * NOTE(review): presumably the zoned code then takes over the
              * repair of the whole zone -- confirm in zoned.c.
              */
1045         if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
1046                 return 0;
1047
1048         /*
1049          * We must use GFP_NOFS because the scrub task might be waiting for a
1050          * worker task executing this function and in turn a transaction commit
1051          * might be waiting the scrub task to pause (which needs to wait for all
1052          * the worker tasks to complete before pausing).
1053          * We do allocations in the workers through insert_full_stripe_lock()
1054          * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
1055          * this function.
1056          */
1057         nofs_flag = memalloc_nofs_save();
1058         /*
1059          * For RAID5/6, race can happen for a different device scrub thread.
1060          * For data corruption, Parity and Data threads will both try
1061          * to recovery the data.
1062          * Race can lead to doubly added csum error, or even unrecoverable
1063          * error.
1064          */
1065         ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1066         if (ret < 0) {
1067                 memalloc_nofs_restore(nofs_flag);
1068                 spin_lock(&sctx->stat_lock);
1069                 if (ret == -ENOMEM)
1070                         sctx->stat.malloc_errors++;
1071                 sctx->stat.read_errors++;
1072                 sctx->stat.uncorrectable_errors++;
1073                 spin_unlock(&sctx->stat_lock);
1074                 return ret;
1075         }
1076
1077         /*
1078          * read all mirrors one after the other. This includes to
1079          * re-read the extent or metadata block that failed (that was
1080          * the cause that this fixup code is called) another time,
1081          * sector by sector this time in order to know which sectors
1082          * caused I/O errors and which ones are good (for all mirrors).
1083          * It is the goal to handle the situation when more than one
1084          * mirror contains I/O errors, but the errors do not
1085          * overlap, i.e. the data can be repaired by selecting the
1086          * sectors from those mirrors without I/O error on the
1087          * particular sectors. One example (with blocks >= 2 * sectorsize)
1088          * would be that mirror #1 has an I/O error on the first sector,
1089          * the second sector is good, and mirror #2 has an I/O error on
1090          * the second sector, but the first sector is good.
1091          * Then the first sector of the first mirror can be repaired by
1092          * taking the first sector of the second mirror, and the
1093          * second sector of the second mirror can be repaired by
1094          * copying the contents of the 2nd sector of the 1st mirror.
1095          * One more note: if the sectors of one mirror contain I/O
1096          * errors, the checksum cannot be verified. In order to get
1097          * the best data for repairing, the first attempt is to find
1098          * a mirror without I/O errors and with a validated checksum.
1099          * Only if this is not possible, the sectors are picked from
1100          * mirrors with I/O errors without considering the checksum.
1101          * If the latter is the case, at the end, the checksum of the
1102          * repaired area is verified in order to correctly maintain
1103          * the statistics.
1104          */
1105         for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
1106                 /*
1107                  * Note: the two members refs and outstanding_sectors are not
1108                  * used in the blocks that are used for the recheck procedure.
1109                  *
1110                  * But alloc_scrub_block() will initialize sblock::ref anyway,
1111                  * so we can use scrub_block_put() to clean them up.
1112                  *
1113                  * And here we don't setup the physical/dev for the sblock yet,
1114                  * they will be correctly initialized in scrub_setup_recheck_block().
1115                  */
1116                 sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
1117                                                         logical, 0, 0, mirror_index);
1118                 if (!sblocks_for_recheck[mirror_index]) {
1119                         spin_lock(&sctx->stat_lock);
1120                         sctx->stat.malloc_errors++;
1121                         sctx->stat.read_errors++;
1122                         sctx->stat.uncorrectable_errors++;
1123                         spin_unlock(&sctx->stat_lock);
1124                         btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1125                         goto out;
1126                 }
1127         }
1128
1129         /* Setup the context, map the logical blocks and alloc the sectors */
1130         ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1131         if (ret) {
1132                 spin_lock(&sctx->stat_lock);
1133                 sctx->stat.read_errors++;
1134                 sctx->stat.uncorrectable_errors++;
1135                 spin_unlock(&sctx->stat_lock);
1136                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1137                 goto out;
1138         }
1139         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
             /* The recheck block standing for the mirror that originally failed. */
1140         sblock_bad = sblocks_for_recheck[failed_mirror_index];
1141
1142         /* build and submit the bios for the failed mirror, check checksums */
1143         scrub_recheck_block(fs_info, sblock_bad, 1);
1144
1145         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1146             sblock_bad->no_io_error_seen) {
1147                 /*
1148                  * The error disappeared after reading sector by sector, or
1149                  * the area was part of a huge bio and other parts of the
1150                  * bio caused I/O errors, or the block layer merged several
1151                  * read requests into one and the error is caused by a
1152                  * different bio (usually one of the two latter cases is
1153                  * the cause)
1154                  */
1155                 spin_lock(&sctx->stat_lock);
1156                 sctx->stat.unverified_errors++;
1157                 sblock_to_check->data_corrected = 1;
1158                 spin_unlock(&sctx->stat_lock);
1159
1160                 if (sctx->is_dev_replace)
1161                         scrub_write_block_to_dev_replace(sblock_bad);
1162                 goto out;
1163         }
1164
             /*
              * The error is confirmed. Account exactly one kind of error: an
              * I/O error takes precedence over a checksum error, which takes
              * precedence over a header error.
              */
1165         if (!sblock_bad->no_io_error_seen) {
1166                 spin_lock(&sctx->stat_lock);
1167                 sctx->stat.read_errors++;
1168                 spin_unlock(&sctx->stat_lock);
1169                 if (__ratelimit(&rs))
1170                         scrub_print_warning("i/o error", sblock_to_check);
1171                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1172         } else if (sblock_bad->checksum_error) {
1173                 spin_lock(&sctx->stat_lock);
1174                 sctx->stat.csum_errors++;
1175                 spin_unlock(&sctx->stat_lock);
1176                 if (__ratelimit(&rs))
1177                         scrub_print_warning("checksum error", sblock_to_check);
1178                 btrfs_dev_stat_inc_and_print(dev,
1179                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
1180         } else if (sblock_bad->header_error) {
1181                 spin_lock(&sctx->stat_lock);
1182                 sctx->stat.verify_errors++;
1183                 spin_unlock(&sctx->stat_lock);
1184                 if (__ratelimit(&rs))
1185                         scrub_print_warning("checksum/header error",
1186                                             sblock_to_check);
1187                 if (sblock_bad->generation_error)
1188                         btrfs_dev_stat_inc_and_print(dev,
1189                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1190                 else
1191                         btrfs_dev_stat_inc_and_print(dev,
1192                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1193         }
1194
             /* Read-only scrub: the error is accounted, no repair is attempted. */
1195         if (sctx->readonly) {
1196                 ASSERT(!sctx->is_dev_replace);
1197                 goto out;
1198         }
1199
1200         /*
1201          * now build and submit the bios for the other mirrors, check
1202          * checksums.
1203          * First try to pick the mirror which is completely without I/O
1204          * errors and also does not have a checksum error.
1205          * If one is found, and if a checksum is present, the full block
1206          * that is known to contain an error is rewritten. Afterwards
1207          * the block is known to be corrected.
1208          * If a mirror is found which is completely correct, and no
1209          * checksum is present, only those sectors are rewritten that had
1210          * an I/O error in the block to be repaired, since it cannot be
1211          * determined, which copy of the other sectors is better (and it
1212          * could happen otherwise that a correct sector would be
1213          * overwritten by a bad one).
1214          */
1215         for (mirror_index = 0; ;mirror_index++) {
1216                 struct scrub_block *sblock_other;
1217
1218                 if (mirror_index == failed_mirror_index)
1219                         continue;
1220
1221                 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1222                 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1223                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1224                                 break;
1225                         if (!sblocks_for_recheck[mirror_index]->sector_count)
1226                                 break;
1227
1228                         sblock_other = sblocks_for_recheck[mirror_index];
1229                 } else {
                             /*
                              * RAID56: the recheck block at index 1 is reused for
                              * every candidate, only its mirror_num is changed on
                              * each iteration.
                              */
1230                         struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1231                         int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1232
1233                         if (mirror_index >= max_allowed)
1234                                 break;
1235                         if (!sblocks_for_recheck[1]->sector_count)
1236                                 break;
1237
1238                         ASSERT(failed_mirror_index == 0);
1239                         sblock_other = sblocks_for_recheck[1];
1240                         sblock_other->mirror_num = 1 + mirror_index;
1241                 }
1242
1243                 /* build and submit the bios, check checksums */
1244                 scrub_recheck_block(fs_info, sblock_other, 0);
1245
1246                 if (!sblock_other->header_error &&
1247                     !sblock_other->checksum_error &&
1248                     sblock_other->no_io_error_seen) {
1249                         if (sctx->is_dev_replace) {
1250                                 scrub_write_block_to_dev_replace(sblock_other);
1251                                 goto corrected_error;
1252                         } else {
1253                                 ret = scrub_repair_block_from_good_copy(
1254                                                 sblock_bad, sblock_other);
1255                                 if (!ret)
1256                                         goto corrected_error;
1257                         }
1258                 }
1259         }
1260
             /*
              * No fully good mirror could be used. In plain scrub the per-sector
              * pass below only handles sectors with I/O errors, so without any
              * I/O error on the bad copy there is nothing left to try.
              */
1261         if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1262                 goto did_not_correct_error;
1263
1264         /*
1265          * In case of I/O errors in the area that is supposed to be
1266          * repaired, continue by picking good copies of those sectors.
1267          * Select the good sectors from mirrors to rewrite bad sectors from
1268          * the area to fix. Afterwards verify the checksum of the block
1269          * that is supposed to be repaired. This verification step is
1270          * only done for the purpose of statistic counting and for the
1271          * final scrub report, whether errors remain.
1272          * A perfect algorithm could make use of the checksum and try
1273          * all possible combinations of sectors from the different mirrors
1274          * until the checksum verification succeeds. For example, when
1275          * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1276          * of mirror #2 is readable but the final checksum test fails,
1277          * then the 2nd sector of mirror #3 could be tried, whether now
1278          * the final checksum succeeds. But this would be a rare
1279          * exception and is therefore not implemented. At least it is
1280          * avoided that the good copy is overwritten.
1281          * A more useful improvement would be to pick the sectors
1282          * without I/O error based on sector sizes (512 bytes on legacy
1283          * disks) instead of on sectorsize. Then maybe 512 byte of one
1284          * mirror could be repaired by taking 512 byte of a different
1285          * mirror, even if other 512 byte sectors in the same sectorsize
1286          * area are unreadable.
1287          */
1288         success = 1;
1289         for (sector_num = 0; sector_num < sblock_bad->sector_count;
1290              sector_num++) {
1291                 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1292                 struct scrub_block *sblock_other = NULL;
1293
1294                 /* Skip no-io-error sectors in scrub */
1295                 if (!sector_bad->io_error && !sctx->is_dev_replace)
1296                         continue;
1297
1298                 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1299                         /*
1300                          * In case of dev replace, if raid56 rebuild process
1301                          * didn't work out correct data, then copy the content
1302                          * in sblock_bad to make sure target device is identical
1303                          * to source device, instead of writing garbage data in
1304                          * sblock_for_recheck array to target device.
1305                          */
1306                         sblock_other = NULL;
1307                 } else if (sector_bad->io_error) {
1308                         /* Try to find no-io-error sector in mirrors */
1309                         for (mirror_index = 0;
1310                              mirror_index < BTRFS_MAX_MIRRORS &&
1311                              sblocks_for_recheck[mirror_index]->sector_count > 0;
1312                              mirror_index++) {
1313                                 if (!sblocks_for_recheck[mirror_index]->
1314                                     sectors[sector_num]->io_error) {
1315                                         sblock_other = sblocks_for_recheck[mirror_index];
1316                                         break;
1317                                 }
1318                         }
1319                         if (!sblock_other)
1320                                 success = 0;
1321                 }
1322
1323                 if (sctx->is_dev_replace) {
1324                         /*
1325                          * Did not find a mirror to fetch the sector from.
1326                          * scrub_write_sector_to_dev_replace() handles this
1327                          * case (sector->io_error), by filling the block with
1328                          * zeros before submitting the write request
1329                          */
1330                         if (!sblock_other)
1331                                 sblock_other = sblock_bad;
1332
1333                         if (scrub_write_sector_to_dev_replace(sblock_other,
1334                                                               sector_num) != 0) {
1335                                 atomic64_inc(
1336                                         &fs_info->dev_replace.num_write_errors);
1337                                 success = 0;
1338                         }
1339                 } else if (sblock_other) {
1340                         ret = scrub_repair_sector_from_good_copy(sblock_bad,
1341                                                                  sblock_other,
1342                                                                  sector_num, 0);
1343                         if (0 == ret)
1344                                 sector_bad->io_error = 0;
1345                         else
1346                                 success = 0;
1347                 }
1348         }
1349
1350         if (success && !sctx->is_dev_replace) {
1351                 if (is_metadata || have_csum) {
1352                         /*
1353                          * need to verify the checksum now that all
1354                          * sectors on disk are repaired (the write
1355                          * request for data to be repaired is on its way).
1356                          * Just be lazy and use scrub_recheck_block()
1357                          * which re-reads the data before the checksum
1358                          * is verified, but most likely the data comes out
1359                          * of the page cache.
1360                          */
1361                         scrub_recheck_block(fs_info, sblock_bad, 1);
1362                         if (!sblock_bad->header_error &&
1363                             !sblock_bad->checksum_error &&
1364                             sblock_bad->no_io_error_seen)
1365                                 goto corrected_error;
1366                         else
1367                                 goto did_not_correct_error;
1368                 } else {
1369 corrected_error:
                             /*
                              * Reached by falling through (data without checksum)
                              * and by goto from the mirror loop and from the
                              * recheck just above.
                              */
1370                         spin_lock(&sctx->stat_lock);
1371                         sctx->stat.corrected_errors++;
1372                         sblock_to_check->data_corrected = 1;
1373                         spin_unlock(&sctx->stat_lock);
1374                         btrfs_err_rl_in_rcu(fs_info,
1375                                 "fixed up error at logical %llu on dev %s",
1376                                 logical, rcu_str_deref(dev->name));
1377                 }
1378         } else {
1379 did_not_correct_error:
                     /*
                      * Reached by falling through (per-sector pass failed, or
                      * dev-replace) and by goto from the two places above.
                      */
1380                 spin_lock(&sctx->stat_lock);
1381                 sctx->stat.uncorrectable_errors++;
1382                 spin_unlock(&sctx->stat_lock);
1383                 btrfs_err_rl_in_rcu(fs_info,
1384                         "unable to fixup (regular) error at logical %llu on dev %s",
1385                         logical, rcu_str_deref(dev->name));
1386         }
1387
1388 out:
             /* Common cleanup; entries that were never allocated are NULL. */
1389         for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
1390                 struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
1391                 struct scrub_recover *recover;
1392                 int sector_index;
1393
1394                 /* Not allocated, continue checking the next mirror */
1395                 if (!sblock)
1396                         continue;
1397
1398                 for (sector_index = 0; sector_index < sblock->sector_count;
1399                      sector_index++) {
1400                         /*
1401                          * Here we just cleanup the recover, each sector will be
1402                          * properly cleaned up by later scrub_block_put()
1403                          */
1404                         recover = sblock->sectors[sector_index]->recover;
1405                         if (recover) {
1406                                 scrub_put_recover(fs_info, recover);
1407                                 sblock->sectors[sector_index]->recover = NULL;
1408                         }
1409                 }
1410                 scrub_block_put(sblock);
1411         }
1412
             /*
              * The NOFS scope is left whether or not the unlock succeeds; a
              * negative unlock result is the only error reported from here.
              */
1413         ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1414         memalloc_nofs_restore(nofs_flag);
1415         if (ret < 0)
1416                 return ret;
1417         return 0;
1418 }
1419
1420 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1421 {
1422         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1423                 return 2;
1424         else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1425                 return 3;
1426         else
1427                 return (int)bioc->num_stripes;
1428 }
1429
1430 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1431                                                  u64 *raid_map,
1432                                                  int nstripes, int mirror,
1433                                                  int *stripe_index,
1434                                                  u64 *stripe_offset)
1435 {
1436         int i;
1437
1438         if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1439                 /* RAID5/6 */
1440                 for (i = 0; i < nstripes; i++) {
1441                         if (raid_map[i] == RAID6_Q_STRIPE ||
1442                             raid_map[i] == RAID5_P_STRIPE)
1443                                 continue;
1444
1445                         if (logical >= raid_map[i] &&
1446                             logical < raid_map[i] + BTRFS_STRIPE_LEN)
1447                                 break;
1448                 }
1449
1450                 *stripe_index = i;
1451                 *stripe_offset = logical - raid_map[i];
1452         } else {
1453                 /* The other RAID type */
1454                 *stripe_index = mirror;
1455                 *stripe_offset = 0;
1456         }
1457 }
1458
1459 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1460                                      struct scrub_block *sblocks_for_recheck[])
1461 {
1462         struct scrub_ctx *sctx = original_sblock->sctx;
1463         struct btrfs_fs_info *fs_info = sctx->fs_info;
1464         u64 logical = original_sblock->logical;
1465         u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1466         u64 generation = original_sblock->sectors[0]->generation;
1467         u64 flags = original_sblock->sectors[0]->flags;
1468         u64 have_csum = original_sblock->sectors[0]->have_csum;
1469         struct scrub_recover *recover;
1470         struct btrfs_io_context *bioc;
1471         u64 sublen;
1472         u64 mapped_length;
1473         u64 stripe_offset;
1474         int stripe_index;
1475         int sector_index = 0;
1476         int mirror_index;
1477         int nmirrors;
1478         int ret;
1479
1480         while (length > 0) {
1481                 sublen = min_t(u64, length, fs_info->sectorsize);
1482                 mapped_length = sublen;
1483                 bioc = NULL;
1484
1485                 /*
1486                  * With a length of sectorsize, each returned stripe represents
1487                  * one mirror
1488                  */
1489                 btrfs_bio_counter_inc_blocked(fs_info);
1490                 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1491                                        logical, &mapped_length, &bioc);
1492                 if (ret || !bioc || mapped_length < sublen) {
1493                         btrfs_put_bioc(bioc);
1494                         btrfs_bio_counter_dec(fs_info);
1495                         return -EIO;
1496                 }
1497
1498                 recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
1499                 if (!recover) {
1500                         btrfs_put_bioc(bioc);
1501                         btrfs_bio_counter_dec(fs_info);
1502                         return -ENOMEM;
1503                 }
1504
1505                 refcount_set(&recover->refs, 1);
1506                 recover->bioc = bioc;
1507                 recover->map_length = mapped_length;
1508
1509                 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1510
1511                 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1512
1513                 for (mirror_index = 0; mirror_index < nmirrors;
1514                      mirror_index++) {
1515                         struct scrub_block *sblock;
1516                         struct scrub_sector *sector;
1517
1518                         sblock = sblocks_for_recheck[mirror_index];
1519                         sblock->sctx = sctx;
1520
1521                         sector = alloc_scrub_sector(sblock, logical);
1522                         if (!sector) {
1523                                 spin_lock(&sctx->stat_lock);
1524                                 sctx->stat.malloc_errors++;
1525                                 spin_unlock(&sctx->stat_lock);
1526                                 scrub_put_recover(fs_info, recover);
1527                                 return -ENOMEM;
1528                         }
1529                         sector->flags = flags;
1530                         sector->generation = generation;
1531                         sector->have_csum = have_csum;
1532                         if (have_csum)
1533                                 memcpy(sector->csum,
1534                                        original_sblock->sectors[0]->csum,
1535                                        sctx->fs_info->csum_size);
1536
1537                         scrub_stripe_index_and_offset(logical,
1538                                                       bioc->map_type,
1539                                                       bioc->raid_map,
1540                                                       bioc->num_stripes -
1541                                                       bioc->num_tgtdevs,
1542                                                       mirror_index,
1543                                                       &stripe_index,
1544                                                       &stripe_offset);
1545                         /*
1546                          * We're at the first sector, also populate @sblock
1547                          * physical and dev.
1548                          */
1549                         if (sector_index == 0) {
1550                                 sblock->physical =
1551                                         bioc->stripes[stripe_index].physical +
1552                                         stripe_offset;
1553                                 sblock->dev = bioc->stripes[stripe_index].dev;
1554                                 sblock->physical_for_dev_replace =
1555                                         original_sblock->physical_for_dev_replace;
1556                         }
1557
1558                         BUG_ON(sector_index >= original_sblock->sector_count);
1559                         scrub_get_recover(recover);
1560                         sector->recover = recover;
1561                 }
1562                 scrub_put_recover(fs_info, recover);
1563                 length -= sublen;
1564                 logical += sublen;
1565                 sector_index++;
1566         }
1567
1568         return 0;
1569 }
1570
1571 static void scrub_bio_wait_endio(struct bio *bio)
1572 {
1573         complete(bio->bi_private);
1574 }
1575
1576 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1577                                         struct bio *bio,
1578                                         struct scrub_sector *sector)
1579 {
1580         DECLARE_COMPLETION_ONSTACK(done);
1581
1582         bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
1583                                  SECTOR_SHIFT;
1584         bio->bi_private = &done;
1585         bio->bi_end_io = scrub_bio_wait_endio;
1586         raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
1587
1588         wait_for_completion_io(&done);
1589         return blk_status_to_errno(bio->bi_status);
1590 }
1591
1592 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1593                                           struct scrub_block *sblock)
1594 {
1595         struct scrub_sector *first_sector = sblock->sectors[0];
1596         struct bio *bio;
1597         int i;
1598
1599         /* All sectors in sblock belong to the same stripe on the same device. */
1600         ASSERT(sblock->dev);
1601         if (!sblock->dev->bdev)
1602                 goto out;
1603
1604         bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1605
1606         for (i = 0; i < sblock->sector_count; i++) {
1607                 struct scrub_sector *sector = sblock->sectors[i];
1608
1609                 bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
1610         }
1611
1612         if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1613                 bio_put(bio);
1614                 goto out;
1615         }
1616
1617         bio_put(bio);
1618
1619         scrub_recheck_block_checksum(sblock);
1620
1621         return;
1622 out:
1623         for (i = 0; i < sblock->sector_count; i++)
1624                 sblock->sectors[i]->io_error = 1;
1625
1626         sblock->no_io_error_seen = 0;
1627 }
1628
1629 /*
1630  * This function will check the on disk data for checksum errors, header errors
1631  * and read I/O errors. If any I/O errors happen, the exact sectors which are
1632  * errored are marked as being bad. The goal is to enable scrub to take those
1633  * sectors that are not errored from all the mirrors so that the sectors that
1634  * are errored in the just handled mirror can be repaired.
1635  */
1636 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1637                                 struct scrub_block *sblock,
1638                                 int retry_failed_mirror)
1639 {
1640         int i;
1641
1642         sblock->no_io_error_seen = 1;
1643
1644         /* short cut for raid56 */
1645         if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1646                 return scrub_recheck_block_on_raid56(fs_info, sblock);
1647
1648         for (i = 0; i < sblock->sector_count; i++) {
1649                 struct scrub_sector *sector = sblock->sectors[i];
1650                 struct bio bio;
1651                 struct bio_vec bvec;
1652
1653                 if (sblock->dev->bdev == NULL) {
1654                         sector->io_error = 1;
1655                         sblock->no_io_error_seen = 0;
1656                         continue;
1657                 }
1658
1659                 bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
1660                 bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
1661                 bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
1662                                         SECTOR_SHIFT;
1663
1664                 btrfsic_check_bio(&bio);
1665                 if (submit_bio_wait(&bio)) {
1666                         sector->io_error = 1;
1667                         sblock->no_io_error_seen = 0;
1668                 }
1669
1670                 bio_uninit(&bio);
1671         }
1672
1673         if (sblock->no_io_error_seen)
1674                 scrub_recheck_block_checksum(sblock);
1675 }
1676
1677 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1678 {
1679         struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
1680         int ret;
1681
1682         ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1683         return !ret;
1684 }
1685
1686 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1687 {
1688         sblock->header_error = 0;
1689         sblock->checksum_error = 0;
1690         sblock->generation_error = 0;
1691
1692         if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1693                 scrub_checksum_data(sblock);
1694         else
1695                 scrub_checksum_tree_block(sblock);
1696 }
1697
1698 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1699                                              struct scrub_block *sblock_good)
1700 {
1701         int i;
1702         int ret = 0;
1703
1704         for (i = 0; i < sblock_bad->sector_count; i++) {
1705                 int ret_sub;
1706
1707                 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1708                                                              sblock_good, i, 1);
1709                 if (ret_sub)
1710                         ret = ret_sub;
1711         }
1712
1713         return ret;
1714 }
1715
1716 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1717                                               struct scrub_block *sblock_good,
1718                                               int sector_num, int force_write)
1719 {
1720         struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1721         struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1722         struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1723         const u32 sectorsize = fs_info->sectorsize;
1724
1725         if (force_write || sblock_bad->header_error ||
1726             sblock_bad->checksum_error || sector_bad->io_error) {
1727                 struct bio bio;
1728                 struct bio_vec bvec;
1729                 int ret;
1730
1731                 if (!sblock_bad->dev->bdev) {
1732                         btrfs_warn_rl(fs_info,
1733                                 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1734                         return -EIO;
1735                 }
1736
1737                 bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1738                 bio.bi_iter.bi_sector = (sblock_bad->physical +
1739                                          sector_bad->offset) >> SECTOR_SHIFT;
1740                 ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
1741
1742                 btrfsic_check_bio(&bio);
1743                 ret = submit_bio_wait(&bio);
1744                 bio_uninit(&bio);
1745
1746                 if (ret) {
1747                         btrfs_dev_stat_inc_and_print(sblock_bad->dev,
1748                                 BTRFS_DEV_STAT_WRITE_ERRS);
1749                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1750                         return -EIO;
1751                 }
1752         }
1753
1754         return 0;
1755 }
1756
1757 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1758 {
1759         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1760         int i;
1761
1762         /*
1763          * This block is used for the check of the parity on the source device,
1764          * so the data needn't be written into the destination device.
1765          */
1766         if (sblock->sparity)
1767                 return;
1768
1769         for (i = 0; i < sblock->sector_count; i++) {
1770                 int ret;
1771
1772                 ret = scrub_write_sector_to_dev_replace(sblock, i);
1773                 if (ret)
1774                         atomic64_inc(&fs_info->dev_replace.num_write_errors);
1775         }
1776 }
1777
1778 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1779 {
1780         const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
1781         struct scrub_sector *sector = sblock->sectors[sector_num];
1782
1783         if (sector->io_error)
1784                 memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
1785
1786         return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1787 }
1788
1789 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1790 {
1791         int ret = 0;
1792         u64 length;
1793
1794         if (!btrfs_is_zoned(sctx->fs_info))
1795                 return 0;
1796
1797         if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1798                 return 0;
1799
1800         if (sctx->write_pointer < physical) {
1801                 length = physical - sctx->write_pointer;
1802
1803                 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1804                                                 sctx->write_pointer, length);
1805                 if (!ret)
1806                         sctx->write_pointer = physical;
1807         }
1808         return ret;
1809 }
1810
1811 static void scrub_block_get(struct scrub_block *sblock)
1812 {
1813         refcount_inc(&sblock->refs);
1814 }
1815
1816 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1817                                       struct scrub_sector *sector)
1818 {
1819         struct scrub_block *sblock = sector->sblock;
1820         struct scrub_bio *sbio;
1821         int ret;
1822         const u32 sectorsize = sctx->fs_info->sectorsize;
1823
1824         mutex_lock(&sctx->wr_lock);
1825 again:
1826         if (!sctx->wr_curr_bio) {
1827                 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1828                                               GFP_KERNEL);
1829                 if (!sctx->wr_curr_bio) {
1830                         mutex_unlock(&sctx->wr_lock);
1831                         return -ENOMEM;
1832                 }
1833                 sctx->wr_curr_bio->sctx = sctx;
1834                 sctx->wr_curr_bio->sector_count = 0;
1835         }
1836         sbio = sctx->wr_curr_bio;
1837         if (sbio->sector_count == 0) {
1838                 ret = fill_writer_pointer_gap(sctx, sector->offset +
1839                                               sblock->physical_for_dev_replace);
1840                 if (ret) {
1841                         mutex_unlock(&sctx->wr_lock);
1842                         return ret;
1843                 }
1844
1845                 sbio->physical = sblock->physical_for_dev_replace + sector->offset;
1846                 sbio->logical = sblock->logical + sector->offset;
1847                 sbio->dev = sctx->wr_tgtdev;
1848                 if (!sbio->bio) {
1849                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1850                                               REQ_OP_WRITE, GFP_NOFS);
1851                 }
1852                 sbio->bio->bi_private = sbio;
1853                 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1854                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1855                 sbio->status = 0;
1856         } else if (sbio->physical + sbio->sector_count * sectorsize !=
1857                    sblock->physical_for_dev_replace + sector->offset ||
1858                    sbio->logical + sbio->sector_count * sectorsize !=
1859                    sblock->logical + sector->offset) {
1860                 scrub_wr_submit(sctx);
1861                 goto again;
1862         }
1863
1864         ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
1865         if (ret != sectorsize) {
1866                 if (sbio->sector_count < 1) {
1867                         bio_put(sbio->bio);
1868                         sbio->bio = NULL;
1869                         mutex_unlock(&sctx->wr_lock);
1870                         return -EIO;
1871                 }
1872                 scrub_wr_submit(sctx);
1873                 goto again;
1874         }
1875
1876         sbio->sectors[sbio->sector_count] = sector;
1877         scrub_sector_get(sector);
1878         /*
1879          * Since ssector no longer holds a page, but uses sblock::pages, we
1880          * have to ensure the sblock had not been freed before our write bio
1881          * finished.
1882          */
1883         scrub_block_get(sector->sblock);
1884
1885         sbio->sector_count++;
1886         if (sbio->sector_count == sctx->sectors_per_bio)
1887                 scrub_wr_submit(sctx);
1888         mutex_unlock(&sctx->wr_lock);
1889
1890         return 0;
1891 }
1892
1893 static void scrub_wr_submit(struct scrub_ctx *sctx)
1894 {
1895         struct scrub_bio *sbio;
1896
1897         if (!sctx->wr_curr_bio)
1898                 return;
1899
1900         sbio = sctx->wr_curr_bio;
1901         sctx->wr_curr_bio = NULL;
1902         scrub_pending_bio_inc(sctx);
1903         /* process all writes in a single worker thread. Then the block layer
1904          * orders the requests before sending them to the driver which
1905          * doubled the write performance on spinning disks when measured
1906          * with Linux 3.5 */
1907         btrfsic_check_bio(sbio->bio);
1908         submit_bio(sbio->bio);
1909
1910         if (btrfs_is_zoned(sctx->fs_info))
1911                 sctx->write_pointer = sbio->physical + sbio->sector_count *
1912                         sctx->fs_info->sectorsize;
1913 }
1914
1915 static void scrub_wr_bio_end_io(struct bio *bio)
1916 {
1917         struct scrub_bio *sbio = bio->bi_private;
1918         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1919
1920         sbio->status = bio->bi_status;
1921         sbio->bio = bio;
1922
1923         INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1924         queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1925 }
1926
1927 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1928 {
1929         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1930         struct scrub_ctx *sctx = sbio->sctx;
1931         int i;
1932
1933         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1934         if (sbio->status) {
1935                 struct btrfs_dev_replace *dev_replace =
1936                         &sbio->sctx->fs_info->dev_replace;
1937
1938                 for (i = 0; i < sbio->sector_count; i++) {
1939                         struct scrub_sector *sector = sbio->sectors[i];
1940
1941                         sector->io_error = 1;
1942                         atomic64_inc(&dev_replace->num_write_errors);
1943                 }
1944         }
1945
1946         /*
1947          * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
1948          * endio we should put the sblock.
1949          */
1950         for (i = 0; i < sbio->sector_count; i++) {
1951                 scrub_block_put(sbio->sectors[i]->sblock);
1952                 scrub_sector_put(sbio->sectors[i]);
1953         }
1954
1955         bio_put(sbio->bio);
1956         kfree(sbio);
1957         scrub_pending_bio_dec(sctx);
1958 }
1959
1960 static int scrub_checksum(struct scrub_block *sblock)
1961 {
1962         u64 flags;
1963         int ret;
1964
1965         /*
1966          * No need to initialize these stats currently,
1967          * because this function only use return value
1968          * instead of these stats value.
1969          *
1970          * Todo:
1971          * always use stats
1972          */
1973         sblock->header_error = 0;
1974         sblock->generation_error = 0;
1975         sblock->checksum_error = 0;
1976
1977         WARN_ON(sblock->sector_count < 1);
1978         flags = sblock->sectors[0]->flags;
1979         ret = 0;
1980         if (flags & BTRFS_EXTENT_FLAG_DATA)
1981                 ret = scrub_checksum_data(sblock);
1982         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1983                 ret = scrub_checksum_tree_block(sblock);
1984         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1985                 ret = scrub_checksum_super(sblock);
1986         else
1987                 WARN_ON(1);
1988         if (ret)
1989                 scrub_handle_errored_block(sblock);
1990
1991         return ret;
1992 }
1993
1994 static int scrub_checksum_data(struct scrub_block *sblock)
1995 {
1996         struct scrub_ctx *sctx = sblock->sctx;
1997         struct btrfs_fs_info *fs_info = sctx->fs_info;
1998         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1999         u8 csum[BTRFS_CSUM_SIZE];
2000         struct scrub_sector *sector;
2001         char *kaddr;
2002
2003         BUG_ON(sblock->sector_count < 1);
2004         sector = sblock->sectors[0];
2005         if (!sector->have_csum)
2006                 return 0;
2007
2008         kaddr = scrub_sector_get_kaddr(sector);
2009
2010         shash->tfm = fs_info->csum_shash;
2011         crypto_shash_init(shash);
2012
2013         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
2014
2015         if (memcmp(csum, sector->csum, fs_info->csum_size))
2016                 sblock->checksum_error = 1;
2017         return sblock->checksum_error;
2018 }
2019
2020 static int scrub_checksum_tree_block(struct scrub_block *sblock)
2021 {
2022         struct scrub_ctx *sctx = sblock->sctx;
2023         struct btrfs_header *h;
2024         struct btrfs_fs_info *fs_info = sctx->fs_info;
2025         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
2026         u8 calculated_csum[BTRFS_CSUM_SIZE];
2027         u8 on_disk_csum[BTRFS_CSUM_SIZE];
2028         /*
2029          * This is done in sectorsize steps even for metadata as there's a
2030          * constraint for nodesize to be aligned to sectorsize. This will need
2031          * to change so we don't misuse data and metadata units like that.
2032          */
2033         const u32 sectorsize = sctx->fs_info->sectorsize;
2034         const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
2035         int i;
2036         struct scrub_sector *sector;
2037         char *kaddr;
2038
2039         BUG_ON(sblock->sector_count < 1);
2040
2041         /* Each member in sectors is just one sector */
2042         ASSERT(sblock->sector_count == num_sectors);
2043
2044         sector = sblock->sectors[0];
2045         kaddr = scrub_sector_get_kaddr(sector);
2046         h = (struct btrfs_header *)kaddr;
2047         memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
2048
2049         /*
2050          * we don't use the getter functions here, as we
2051          * a) don't have an extent buffer and
2052          * b) the page is already kmapped
2053          */
2054         if (sblock->logical != btrfs_stack_header_bytenr(h))
2055                 sblock->header_error = 1;
2056
2057         if (sector->generation != btrfs_stack_header_generation(h)) {
2058                 sblock->header_error = 1;
2059                 sblock->generation_error = 1;
2060         }
2061
2062         if (!scrub_check_fsid(h->fsid, sector))
2063                 sblock->header_error = 1;
2064
2065         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2066                    BTRFS_UUID_SIZE))
2067                 sblock->header_error = 1;
2068
2069         shash->tfm = fs_info->csum_shash;
2070         crypto_shash_init(shash);
2071         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
2072                             sectorsize - BTRFS_CSUM_SIZE);
2073
2074         for (i = 1; i < num_sectors; i++) {
2075                 kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
2076                 crypto_shash_update(shash, kaddr, sectorsize);
2077         }
2078
2079         crypto_shash_final(shash, calculated_csum);
2080         if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
2081                 sblock->checksum_error = 1;
2082
2083         return sblock->header_error || sblock->checksum_error;
2084 }
2085
2086 static int scrub_checksum_super(struct scrub_block *sblock)
2087 {
2088         struct btrfs_super_block *s;
2089         struct scrub_ctx *sctx = sblock->sctx;
2090         struct btrfs_fs_info *fs_info = sctx->fs_info;
2091         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
2092         u8 calculated_csum[BTRFS_CSUM_SIZE];
2093         struct scrub_sector *sector;
2094         char *kaddr;
2095         int fail_gen = 0;
2096         int fail_cor = 0;
2097
2098         BUG_ON(sblock->sector_count < 1);
2099         sector = sblock->sectors[0];
2100         kaddr = scrub_sector_get_kaddr(sector);
2101         s = (struct btrfs_super_block *)kaddr;
2102
2103         if (sblock->logical != btrfs_super_bytenr(s))
2104                 ++fail_cor;
2105
2106         if (sector->generation != btrfs_super_generation(s))
2107                 ++fail_gen;
2108
2109         if (!scrub_check_fsid(s->fsid, sector))
2110                 ++fail_cor;
2111
2112         shash->tfm = fs_info->csum_shash;
2113         crypto_shash_init(shash);
2114         crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
2115                         BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
2116
2117         if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
2118                 ++fail_cor;
2119
2120         return fail_cor + fail_gen;
2121 }
2122
2123 static void scrub_block_put(struct scrub_block *sblock)
2124 {
2125         if (refcount_dec_and_test(&sblock->refs)) {
2126                 int i;
2127
2128                 if (sblock->sparity)
2129                         scrub_parity_put(sblock->sparity);
2130
2131                 for (i = 0; i < sblock->sector_count; i++)
2132                         scrub_sector_put(sblock->sectors[i]);
2133                 for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
2134                         if (sblock->pages[i]) {
2135                                 detach_scrub_page_private(sblock->pages[i]);
2136                                 __free_page(sblock->pages[i]);
2137                         }
2138                 }
2139                 kfree(sblock);
2140         }
2141 }
2142
2143 static void scrub_sector_get(struct scrub_sector *sector)
2144 {
2145         atomic_inc(&sector->refs);
2146 }
2147
2148 static void scrub_sector_put(struct scrub_sector *sector)
2149 {
2150         if (atomic_dec_and_test(&sector->refs))
2151                 kfree(sector);
2152 }
2153
2154 /*
2155  * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
2156  * second.  Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
2157  */
2158 static void scrub_throttle(struct scrub_ctx *sctx)
2159 {
2160         const int time_slice = 1000;
2161         struct scrub_bio *sbio;
2162         struct btrfs_device *device;
2163         s64 delta;
2164         ktime_t now;
2165         u32 div;
2166         u64 bwlimit;
2167
2168         sbio = sctx->bios[sctx->curr];
2169         device = sbio->dev;
2170         bwlimit = READ_ONCE(device->scrub_speed_max);
2171         if (bwlimit == 0)
2172                 return;
2173
2174         /*
2175          * Slice is divided into intervals when the IO is submitted, adjust by
2176          * bwlimit and maximum of 64 intervals.
2177          */
2178         div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
2179         div = min_t(u32, 64, div);
2180
2181         /* Start new epoch, set deadline */
2182         now = ktime_get();
2183         if (sctx->throttle_deadline == 0) {
2184                 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
2185                 sctx->throttle_sent = 0;
2186         }
2187
2188         /* Still in the time to send? */
2189         if (ktime_before(now, sctx->throttle_deadline)) {
2190                 /* If current bio is within the limit, send it */
2191                 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2192                 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2193                         return;
2194
2195                 /* We're over the limit, sleep until the rest of the slice */
2196                 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2197         } else {
2198                 /* New request after deadline, start new epoch */
2199                 delta = 0;
2200         }
2201
2202         if (delta) {
2203                 long timeout;
2204
2205                 timeout = div_u64(delta * HZ, 1000);
2206                 schedule_timeout_interruptible(timeout);
2207         }
2208
2209         /* Next call will start the deadline period */
2210         sctx->throttle_deadline = 0;
2211 }
2212
2213 static void scrub_submit(struct scrub_ctx *sctx)
2214 {
2215         struct scrub_bio *sbio;
2216
2217         if (sctx->curr == -1)
2218                 return;
2219
2220         scrub_throttle(sctx);
2221
2222         sbio = sctx->bios[sctx->curr];
2223         sctx->curr = -1;
2224         scrub_pending_bio_inc(sctx);
2225         btrfsic_check_bio(sbio->bio);
2226         submit_bio(sbio->bio);
2227 }
2228
2229 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2230                                       struct scrub_sector *sector)
2231 {
2232         struct scrub_block *sblock = sector->sblock;
2233         struct scrub_bio *sbio;
2234         const u32 sectorsize = sctx->fs_info->sectorsize;
2235         int ret;
2236
2237 again:
2238         /*
2239          * grab a fresh bio or wait for one to become available
2240          */
2241         while (sctx->curr == -1) {
2242                 spin_lock(&sctx->list_lock);
2243                 sctx->curr = sctx->first_free;
2244                 if (sctx->curr != -1) {
2245                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
2246                         sctx->bios[sctx->curr]->next_free = -1;
2247                         sctx->bios[sctx->curr]->sector_count = 0;
2248                         spin_unlock(&sctx->list_lock);
2249                 } else {
2250                         spin_unlock(&sctx->list_lock);
2251                         wait_event(sctx->list_wait, sctx->first_free != -1);
2252                 }
2253         }
2254         sbio = sctx->bios[sctx->curr];
2255         if (sbio->sector_count == 0) {
2256                 sbio->physical = sblock->physical + sector->offset;
2257                 sbio->logical = sblock->logical + sector->offset;
2258                 sbio->dev = sblock->dev;
2259                 if (!sbio->bio) {
2260                         sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2261                                               REQ_OP_READ, GFP_NOFS);
2262                 }
2263                 sbio->bio->bi_private = sbio;
2264                 sbio->bio->bi_end_io = scrub_bio_end_io;
2265                 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2266                 sbio->status = 0;
2267         } else if (sbio->physical + sbio->sector_count * sectorsize !=
2268                    sblock->physical + sector->offset ||
2269                    sbio->logical + sbio->sector_count * sectorsize !=
2270                    sblock->logical + sector->offset ||
2271                    sbio->dev != sblock->dev) {
2272                 scrub_submit(sctx);
2273                 goto again;
2274         }
2275
2276         sbio->sectors[sbio->sector_count] = sector;
2277         ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
2278         if (ret != sectorsize) {
2279                 if (sbio->sector_count < 1) {
2280                         bio_put(sbio->bio);
2281                         sbio->bio = NULL;
2282                         return -EIO;
2283                 }
2284                 scrub_submit(sctx);
2285                 goto again;
2286         }
2287
2288         scrub_block_get(sblock); /* one for the page added to the bio */
2289         atomic_inc(&sblock->outstanding_sectors);
2290         sbio->sector_count++;
2291         if (sbio->sector_count == sctx->sectors_per_bio)
2292                 scrub_submit(sctx);
2293
2294         return 0;
2295 }
2296
2297 static void scrub_missing_raid56_end_io(struct bio *bio)
2298 {
2299         struct scrub_block *sblock = bio->bi_private;
2300         struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2301
2302         btrfs_bio_counter_dec(fs_info);
2303         if (bio->bi_status)
2304                 sblock->no_io_error_seen = 0;
2305
2306         bio_put(bio);
2307
2308         queue_work(fs_info->scrub_workers, &sblock->work);
2309 }
2310
2311 static void scrub_missing_raid56_worker(struct work_struct *work)
2312 {
2313         struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2314         struct scrub_ctx *sctx = sblock->sctx;
2315         struct btrfs_fs_info *fs_info = sctx->fs_info;
2316         u64 logical;
2317         struct btrfs_device *dev;
2318
2319         logical = sblock->logical;
2320         dev = sblock->dev;
2321
2322         if (sblock->no_io_error_seen)
2323                 scrub_recheck_block_checksum(sblock);
2324
2325         if (!sblock->no_io_error_seen) {
2326                 spin_lock(&sctx->stat_lock);
2327                 sctx->stat.read_errors++;
2328                 spin_unlock(&sctx->stat_lock);
2329                 btrfs_err_rl_in_rcu(fs_info,
2330                         "IO error rebuilding logical %llu for dev %s",
2331                         logical, rcu_str_deref(dev->name));
2332         } else if (sblock->header_error || sblock->checksum_error) {
2333                 spin_lock(&sctx->stat_lock);
2334                 sctx->stat.uncorrectable_errors++;
2335                 spin_unlock(&sctx->stat_lock);
2336                 btrfs_err_rl_in_rcu(fs_info,
2337                         "failed to rebuild valid logical %llu for dev %s",
2338                         logical, rcu_str_deref(dev->name));
2339         } else {
2340                 scrub_write_block_to_dev_replace(sblock);
2341         }
2342
2343         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2344                 mutex_lock(&sctx->wr_lock);
2345                 scrub_wr_submit(sctx);
2346                 mutex_unlock(&sctx->wr_lock);
2347         }
2348
2349         scrub_block_put(sblock);
2350         scrub_pending_bio_dec(sctx);
2351 }
2352
2353 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2354 {
2355         struct scrub_ctx *sctx = sblock->sctx;
2356         struct btrfs_fs_info *fs_info = sctx->fs_info;
2357         u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2358         u64 logical = sblock->logical;
2359         struct btrfs_io_context *bioc = NULL;
2360         struct bio *bio;
2361         struct btrfs_raid_bio *rbio;
2362         int ret;
2363         int i;
2364
2365         btrfs_bio_counter_inc_blocked(fs_info);
2366         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2367                                &length, &bioc);
2368         if (ret || !bioc || !bioc->raid_map)
2369                 goto bioc_out;
2370
2371         if (WARN_ON(!sctx->is_dev_replace ||
2372                     !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2373                 /*
2374                  * We shouldn't be scrubbing a missing device. Even for dev
2375                  * replace, we should only get here for RAID 5/6. We either
2376                  * managed to mount something with no mirrors remaining or
2377                  * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2378                  */
2379                 goto bioc_out;
2380         }
2381
2382         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2383         bio->bi_iter.bi_sector = logical >> 9;
2384         bio->bi_private = sblock;
2385         bio->bi_end_io = scrub_missing_raid56_end_io;
2386
2387         rbio = raid56_alloc_missing_rbio(bio, bioc);
2388         if (!rbio)
2389                 goto rbio_out;
2390
2391         for (i = 0; i < sblock->sector_count; i++) {
2392                 struct scrub_sector *sector = sblock->sectors[i];
2393
2394                 raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
2395                                        scrub_sector_get_page_offset(sector),
2396                                        sector->offset + sector->sblock->logical);
2397         }
2398
2399         INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2400         scrub_block_get(sblock);
2401         scrub_pending_bio_inc(sctx);
2402         raid56_submit_missing_rbio(rbio);
2403         btrfs_put_bioc(bioc);
2404         return;
2405
2406 rbio_out:
2407         bio_put(bio);
2408 bioc_out:
2409         btrfs_bio_counter_dec(fs_info);
2410         btrfs_put_bioc(bioc);
2411         spin_lock(&sctx->stat_lock);
2412         sctx->stat.malloc_errors++;
2413         spin_unlock(&sctx->stat_lock);
2414 }
2415
2416 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2417                        u64 physical, struct btrfs_device *dev, u64 flags,
2418                        u64 gen, int mirror_num, u8 *csum,
2419                        u64 physical_for_dev_replace)
2420 {
2421         struct scrub_block *sblock;
2422         const u32 sectorsize = sctx->fs_info->sectorsize;
2423         int index;
2424
2425         sblock = alloc_scrub_block(sctx, dev, logical, physical,
2426                                    physical_for_dev_replace, mirror_num);
2427         if (!sblock) {
2428                 spin_lock(&sctx->stat_lock);
2429                 sctx->stat.malloc_errors++;
2430                 spin_unlock(&sctx->stat_lock);
2431                 return -ENOMEM;
2432         }
2433
2434         for (index = 0; len > 0; index++) {
2435                 struct scrub_sector *sector;
2436                 /*
2437                  * Here we will allocate one page for one sector to scrub.
2438                  * This is fine if PAGE_SIZE == sectorsize, but will cost
2439                  * more memory for PAGE_SIZE > sectorsize case.
2440                  */
2441                 u32 l = min(sectorsize, len);
2442
2443                 sector = alloc_scrub_sector(sblock, logical);
2444                 if (!sector) {
2445                         spin_lock(&sctx->stat_lock);
2446                         sctx->stat.malloc_errors++;
2447                         spin_unlock(&sctx->stat_lock);
2448                         scrub_block_put(sblock);
2449                         return -ENOMEM;
2450                 }
2451                 sector->flags = flags;
2452                 sector->generation = gen;
2453                 if (csum) {
2454                         sector->have_csum = 1;
2455                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2456                 } else {
2457                         sector->have_csum = 0;
2458                 }
2459                 len -= l;
2460                 logical += l;
2461                 physical += l;
2462                 physical_for_dev_replace += l;
2463         }
2464
2465         WARN_ON(sblock->sector_count == 0);
2466         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2467                 /*
2468                  * This case should only be hit for RAID 5/6 device replace. See
2469                  * the comment in scrub_missing_raid56_pages() for details.
2470                  */
2471                 scrub_missing_raid56_pages(sblock);
2472         } else {
2473                 for (index = 0; index < sblock->sector_count; index++) {
2474                         struct scrub_sector *sector = sblock->sectors[index];
2475                         int ret;
2476
2477                         ret = scrub_add_sector_to_rd_bio(sctx, sector);
2478                         if (ret) {
2479                                 scrub_block_put(sblock);
2480                                 return ret;
2481                         }
2482                 }
2483
2484                 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2485                         scrub_submit(sctx);
2486         }
2487
2488         /* last one frees, either here or in bio completion for last page */
2489         scrub_block_put(sblock);
2490         return 0;
2491 }
2492
2493 static void scrub_bio_end_io(struct bio *bio)
2494 {
2495         struct scrub_bio *sbio = bio->bi_private;
2496         struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2497
2498         sbio->status = bio->bi_status;
2499         sbio->bio = bio;
2500
2501         queue_work(fs_info->scrub_workers, &sbio->work);
2502 }
2503
2504 static void scrub_bio_end_io_worker(struct work_struct *work)
2505 {
2506         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2507         struct scrub_ctx *sctx = sbio->sctx;
2508         int i;
2509
2510         ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2511         if (sbio->status) {
2512                 for (i = 0; i < sbio->sector_count; i++) {
2513                         struct scrub_sector *sector = sbio->sectors[i];
2514
2515                         sector->io_error = 1;
2516                         sector->sblock->no_io_error_seen = 0;
2517                 }
2518         }
2519
2520         /* Now complete the scrub_block items that have all pages completed */
2521         for (i = 0; i < sbio->sector_count; i++) {
2522                 struct scrub_sector *sector = sbio->sectors[i];
2523                 struct scrub_block *sblock = sector->sblock;
2524
2525                 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2526                         scrub_block_complete(sblock);
2527                 scrub_block_put(sblock);
2528         }
2529
2530         bio_put(sbio->bio);
2531         sbio->bio = NULL;
2532         spin_lock(&sctx->list_lock);
2533         sbio->next_free = sctx->first_free;
2534         sctx->first_free = sbio->index;
2535         spin_unlock(&sctx->list_lock);
2536
2537         if (sctx->is_dev_replace && sctx->flush_all_writes) {
2538                 mutex_lock(&sctx->wr_lock);
2539                 scrub_wr_submit(sctx);
2540                 mutex_unlock(&sctx->wr_lock);
2541         }
2542
2543         scrub_pending_bio_dec(sctx);
2544 }
2545
2546 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2547                                        unsigned long *bitmap,
2548                                        u64 start, u32 len)
2549 {
2550         u64 offset;
2551         u32 nsectors;
2552         u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2553
2554         if (len >= sparity->stripe_len) {
2555                 bitmap_set(bitmap, 0, sparity->nsectors);
2556                 return;
2557         }
2558
2559         start -= sparity->logic_start;
2560         start = div64_u64_rem(start, sparity->stripe_len, &offset);
2561         offset = offset >> sectorsize_bits;
2562         nsectors = len >> sectorsize_bits;
2563
2564         if (offset + nsectors <= sparity->nsectors) {
2565                 bitmap_set(bitmap, offset, nsectors);
2566                 return;
2567         }
2568
2569         bitmap_set(bitmap, offset, sparity->nsectors - offset);
2570         bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2571 }
2572
2573 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2574                                                    u64 start, u32 len)
2575 {
2576         __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
2577 }
2578
2579 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2580                                                   u64 start, u32 len)
2581 {
2582         __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
2583 }
2584
2585 static void scrub_block_complete(struct scrub_block *sblock)
2586 {
2587         int corrupted = 0;
2588
2589         if (!sblock->no_io_error_seen) {
2590                 corrupted = 1;
2591                 scrub_handle_errored_block(sblock);
2592         } else {
2593                 /*
2594                  * if has checksum error, write via repair mechanism in
2595                  * dev replace case, otherwise write here in dev replace
2596                  * case.
2597                  */
2598                 corrupted = scrub_checksum(sblock);
2599                 if (!corrupted && sblock->sctx->is_dev_replace)
2600                         scrub_write_block_to_dev_replace(sblock);
2601         }
2602
2603         if (sblock->sparity && corrupted && !sblock->data_corrected) {
2604                 u64 start = sblock->logical;
2605                 u64 end = sblock->logical +
2606                           sblock->sectors[sblock->sector_count - 1]->offset +
2607                           sblock->sctx->fs_info->sectorsize;
2608
2609                 ASSERT(end - start <= U32_MAX);
2610                 scrub_parity_mark_sectors_error(sblock->sparity,
2611                                                 start, end - start);
2612         }
2613 }
2614
2615 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2616 {
2617         sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2618         list_del(&sum->list);
2619         kfree(sum);
2620 }
2621
2622 /*
2623  * Find the desired csum for range [logical, logical + sectorsize), and store
2624  * the csum into @csum.
2625  *
2626  * The search source is sctx->csum_list, which is a pre-populated list
2627  * storing bytenr ordered csum ranges.  We're responsible to cleanup any range
2628  * that is before @logical.
2629  *
2630  * Return 0 if there is no csum for the range.
2631  * Return 1 if there is csum for the range and copied to @csum.
2632  */
2633 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2634 {
2635         bool found = false;
2636
2637         while (!list_empty(&sctx->csum_list)) {
2638                 struct btrfs_ordered_sum *sum = NULL;
2639                 unsigned long index;
2640                 unsigned long num_sectors;
2641
2642                 sum = list_first_entry(&sctx->csum_list,
2643                                        struct btrfs_ordered_sum, list);
2644                 /* The current csum range is beyond our range, no csum found */
2645                 if (sum->bytenr > logical)
2646                         break;
2647
2648                 /*
2649                  * The current sum is before our bytenr, since scrub is always
2650                  * done in bytenr order, the csum will never be used anymore,
2651                  * clean it up so that later calls won't bother with the range,
2652                  * and continue search the next range.
2653                  */
2654                 if (sum->bytenr + sum->len <= logical) {
2655                         drop_csum_range(sctx, sum);
2656                         continue;
2657                 }
2658
2659                 /* Now the csum range covers our bytenr, copy the csum */
2660                 found = true;
2661                 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2662                 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2663
2664                 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2665                        sctx->fs_info->csum_size);
2666
2667                 /* Cleanup the range if we're at the end of the csum range */
2668                 if (index == num_sectors - 1)
2669                         drop_csum_range(sctx, sum);
2670                 break;
2671         }
2672         if (!found)
2673                 return 0;
2674         return 1;
2675 }
2676
2677 /* scrub extent tries to collect up to 64 kB for each bio */
2678 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2679                         u64 logical, u32 len,
2680                         u64 physical, struct btrfs_device *dev, u64 flags,
2681                         u64 gen, int mirror_num)
2682 {
2683         struct btrfs_device *src_dev = dev;
2684         u64 src_physical = physical;
2685         int src_mirror = mirror_num;
2686         int ret;
2687         u8 csum[BTRFS_CSUM_SIZE];
2688         u32 blocksize;
2689
2690         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2691                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2692                         blocksize = map->stripe_len;
2693                 else
2694                         blocksize = sctx->fs_info->sectorsize;
2695                 spin_lock(&sctx->stat_lock);
2696                 sctx->stat.data_extents_scrubbed++;
2697                 sctx->stat.data_bytes_scrubbed += len;
2698                 spin_unlock(&sctx->stat_lock);
2699         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2700                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2701                         blocksize = map->stripe_len;
2702                 else
2703                         blocksize = sctx->fs_info->nodesize;
2704                 spin_lock(&sctx->stat_lock);
2705                 sctx->stat.tree_extents_scrubbed++;
2706                 sctx->stat.tree_bytes_scrubbed += len;
2707                 spin_unlock(&sctx->stat_lock);
2708         } else {
2709                 blocksize = sctx->fs_info->sectorsize;
2710                 WARN_ON(1);
2711         }
2712
2713         /*
2714          * For dev-replace case, we can have @dev being a missing device.
2715          * Regular scrub will avoid its execution on missing device at all,
2716          * as that would trigger tons of read error.
2717          *
2718          * Reading from missing device will cause read error counts to
2719          * increase unnecessarily.
2720          * So here we change the read source to a good mirror.
2721          */
2722         if (sctx->is_dev_replace && !dev->bdev)
2723                 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2724                                      &src_dev, &src_mirror);
2725         while (len) {
2726                 u32 l = min(len, blocksize);
2727                 int have_csum = 0;
2728
2729                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2730                         /* push csums to sbio */
2731                         have_csum = scrub_find_csum(sctx, logical, csum);
2732                         if (have_csum == 0)
2733                                 ++sctx->stat.no_csum;
2734                 }
2735                 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2736                                     flags, gen, src_mirror,
2737                                     have_csum ? csum : NULL, physical);
2738                 if (ret)
2739                         return ret;
2740                 len -= l;
2741                 logical += l;
2742                 physical += l;
2743                 src_physical += l;
2744         }
2745         return 0;
2746 }
2747
2748 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2749                                   u64 logical, u32 len,
2750                                   u64 physical, struct btrfs_device *dev,
2751                                   u64 flags, u64 gen, int mirror_num, u8 *csum)
2752 {
2753         struct scrub_ctx *sctx = sparity->sctx;
2754         struct scrub_block *sblock;
2755         const u32 sectorsize = sctx->fs_info->sectorsize;
2756         int index;
2757
2758         ASSERT(IS_ALIGNED(len, sectorsize));
2759
2760         sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
2761         if (!sblock) {
2762                 spin_lock(&sctx->stat_lock);
2763                 sctx->stat.malloc_errors++;
2764                 spin_unlock(&sctx->stat_lock);
2765                 return -ENOMEM;
2766         }
2767
2768         sblock->sparity = sparity;
2769         scrub_parity_get(sparity);
2770
2771         for (index = 0; len > 0; index++) {
2772                 struct scrub_sector *sector;
2773
2774                 sector = alloc_scrub_sector(sblock, logical);
2775                 if (!sector) {
2776                         spin_lock(&sctx->stat_lock);
2777                         sctx->stat.malloc_errors++;
2778                         spin_unlock(&sctx->stat_lock);
2779                         scrub_block_put(sblock);
2780                         return -ENOMEM;
2781                 }
2782                 sblock->sectors[index] = sector;
2783                 /* For scrub parity */
2784                 scrub_sector_get(sector);
2785                 list_add_tail(&sector->list, &sparity->sectors_list);
2786                 sector->flags = flags;
2787                 sector->generation = gen;
2788                 if (csum) {
2789                         sector->have_csum = 1;
2790                         memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2791                 } else {
2792                         sector->have_csum = 0;
2793                 }
2794
2795                 /* Iterate over the stripe range in sectorsize steps */
2796                 len -= sectorsize;
2797                 logical += sectorsize;
2798                 physical += sectorsize;
2799         }
2800
2801         WARN_ON(sblock->sector_count == 0);
2802         for (index = 0; index < sblock->sector_count; index++) {
2803                 struct scrub_sector *sector = sblock->sectors[index];
2804                 int ret;
2805
2806                 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2807                 if (ret) {
2808                         scrub_block_put(sblock);
2809                         return ret;
2810                 }
2811         }
2812
2813         /* Last one frees, either here or in bio completion for last sector */
2814         scrub_block_put(sblock);
2815         return 0;
2816 }
2817
2818 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2819                                    u64 logical, u32 len,
2820                                    u64 physical, struct btrfs_device *dev,
2821                                    u64 flags, u64 gen, int mirror_num)
2822 {
2823         struct scrub_ctx *sctx = sparity->sctx;
2824         int ret;
2825         u8 csum[BTRFS_CSUM_SIZE];
2826         u32 blocksize;
2827
2828         if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2829                 scrub_parity_mark_sectors_error(sparity, logical, len);
2830                 return 0;
2831         }
2832
2833         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2834                 blocksize = sparity->stripe_len;
2835         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2836                 blocksize = sparity->stripe_len;
2837         } else {
2838                 blocksize = sctx->fs_info->sectorsize;
2839                 WARN_ON(1);
2840         }
2841
2842         while (len) {
2843                 u32 l = min(len, blocksize);
2844                 int have_csum = 0;
2845
2846                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2847                         /* push csums to sbio */
2848                         have_csum = scrub_find_csum(sctx, logical, csum);
2849                         if (have_csum == 0)
2850                                 goto skip;
2851                 }
2852                 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2853                                              flags, gen, mirror_num,
2854                                              have_csum ? csum : NULL);
2855                 if (ret)
2856                         return ret;
2857 skip:
2858                 len -= l;
2859                 logical += l;
2860                 physical += l;
2861         }
2862         return 0;
2863 }
2864
2865 /*
2866  * Given a physical address, this will calculate it's
2867  * logical offset. if this is a parity stripe, it will return
2868  * the most left data stripe's logical offset.
2869  *
2870  * return 0 if it is a data stripe, 1 means parity stripe.
2871  */
2872 static int get_raid56_logic_offset(u64 physical, int num,
2873                                    struct map_lookup *map, u64 *offset,
2874                                    u64 *stripe_start)
2875 {
2876         int i;
2877         int j = 0;
2878         u64 stripe_nr;
2879         u64 last_offset;
2880         u32 stripe_index;
2881         u32 rot;
2882         const int data_stripes = nr_data_stripes(map);
2883
2884         last_offset = (physical - map->stripes[num].physical) * data_stripes;
2885         if (stripe_start)
2886                 *stripe_start = last_offset;
2887
2888         *offset = last_offset;
2889         for (i = 0; i < data_stripes; i++) {
2890                 *offset = last_offset + i * map->stripe_len;
2891
2892                 stripe_nr = div64_u64(*offset, map->stripe_len);
2893                 stripe_nr = div_u64(stripe_nr, data_stripes);
2894
2895                 /* Work out the disk rotation on this stripe-set */
2896                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2897                 /* calculate which stripe this data locates */
2898                 rot += i;
2899                 stripe_index = rot % map->num_stripes;
2900                 if (stripe_index == num)
2901                         return 0;
2902                 if (stripe_index < num)
2903                         j++;
2904         }
2905         *offset = last_offset + j * map->stripe_len;
2906         return 1;
2907 }
2908
2909 static void scrub_free_parity(struct scrub_parity *sparity)
2910 {
2911         struct scrub_ctx *sctx = sparity->sctx;
2912         struct scrub_sector *curr, *next;
2913         int nbits;
2914
2915         nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2916         if (nbits) {
2917                 spin_lock(&sctx->stat_lock);
2918                 sctx->stat.read_errors += nbits;
2919                 sctx->stat.uncorrectable_errors += nbits;
2920                 spin_unlock(&sctx->stat_lock);
2921         }
2922
2923         list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2924                 list_del_init(&curr->list);
2925                 scrub_sector_put(curr);
2926         }
2927
2928         kfree(sparity);
2929 }
2930
2931 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2932 {
2933         struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2934                                                     work);
2935         struct scrub_ctx *sctx = sparity->sctx;
2936
2937         btrfs_bio_counter_dec(sctx->fs_info);
2938         scrub_free_parity(sparity);
2939         scrub_pending_bio_dec(sctx);
2940 }
2941
2942 static void scrub_parity_bio_endio(struct bio *bio)
2943 {
2944         struct scrub_parity *sparity = bio->bi_private;
2945         struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2946
2947         if (bio->bi_status)
2948                 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2949                           &sparity->dbitmap, sparity->nsectors);
2950
2951         bio_put(bio);
2952
2953         INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2954         queue_work(fs_info->scrub_parity_workers, &sparity->work);
2955 }
2956
2957 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2958 {
2959         struct scrub_ctx *sctx = sparity->sctx;
2960         struct btrfs_fs_info *fs_info = sctx->fs_info;
2961         struct bio *bio;
2962         struct btrfs_raid_bio *rbio;
2963         struct btrfs_io_context *bioc = NULL;
2964         u64 length;
2965         int ret;
2966
2967         if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2968                            &sparity->ebitmap, sparity->nsectors))
2969                 goto out;
2970
2971         length = sparity->logic_end - sparity->logic_start;
2972
2973         btrfs_bio_counter_inc_blocked(fs_info);
2974         ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2975                                &length, &bioc);
2976         if (ret || !bioc || !bioc->raid_map)
2977                 goto bioc_out;
2978
2979         bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2980         bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2981         bio->bi_private = sparity;
2982         bio->bi_end_io = scrub_parity_bio_endio;
2983
2984         rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
2985                                               sparity->scrub_dev,
2986                                               &sparity->dbitmap,
2987                                               sparity->nsectors);
2988         btrfs_put_bioc(bioc);
2989         if (!rbio)
2990                 goto rbio_out;
2991
2992         scrub_pending_bio_inc(sctx);
2993         raid56_parity_submit_scrub_rbio(rbio);
2994         return;
2995
2996 rbio_out:
2997         bio_put(bio);
2998 bioc_out:
2999         btrfs_bio_counter_dec(fs_info);
3000         bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
3001                   sparity->nsectors);
3002         spin_lock(&sctx->stat_lock);
3003         sctx->stat.malloc_errors++;
3004         spin_unlock(&sctx->stat_lock);
3005 out:
3006         scrub_free_parity(sparity);
3007 }
3008
3009 static void scrub_parity_get(struct scrub_parity *sparity)
3010 {
3011         refcount_inc(&sparity->refs);
3012 }
3013
3014 static void scrub_parity_put(struct scrub_parity *sparity)
3015 {
3016         if (!refcount_dec_and_test(&sparity->refs))
3017                 return;
3018
3019         scrub_parity_check_and_repair(sparity);
3020 }
3021
3022 /*
3023  * Return 0 if the extent item range covers any byte of the range.
3024  * Return <0 if the extent item is before @search_start.
3025  * Return >0 if the extent item is after @start_start + @search_len.
3026  */
3027 static int compare_extent_item_range(struct btrfs_path *path,
3028                                      u64 search_start, u64 search_len)
3029 {
3030         struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
3031         u64 len;
3032         struct btrfs_key key;
3033
3034         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3035         ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
3036                key.type == BTRFS_METADATA_ITEM_KEY);
3037         if (key.type == BTRFS_METADATA_ITEM_KEY)
3038                 len = fs_info->nodesize;
3039         else
3040                 len = key.offset;
3041
3042         if (key.objectid + len <= search_start)
3043                 return -1;
3044         if (key.objectid >= search_start + search_len)
3045                 return 1;
3046         return 0;
3047 }
3048
3049 /*
3050  * Locate one extent item which covers any byte in range
3051  * [@search_start, @search_start + @search_length)
3052  *
3053  * If the path is not initialized, we will initialize the search by doing
3054  * a btrfs_search_slot().
3055  * If the path is already initialized, we will use the path as the initial
3056  * slot, to avoid duplicated btrfs_search_slot() calls.
3057  *
3058  * NOTE: If an extent item starts before @search_start, we will still
3059  * return the extent item. This is for data extent crossing stripe boundary.
3060  *
3061  * Return 0 if we found such extent item, and @path will point to the extent item.
3062  * Return >0 if no such extent item can be found, and @path will be released.
3063  * Return <0 if hit fatal error, and @path will be released.
3064  */
3065 static int find_first_extent_item(struct btrfs_root *extent_root,
3066                                   struct btrfs_path *path,
3067                                   u64 search_start, u64 search_len)
3068 {
3069         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3070         struct btrfs_key key;
3071         int ret;
3072
3073         /* Continue using the existing path */
3074         if (path->nodes[0])
3075                 goto search_forward;
3076
3077         if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3078                 key.type = BTRFS_METADATA_ITEM_KEY;
3079         else
3080                 key.type = BTRFS_EXTENT_ITEM_KEY;
3081         key.objectid = search_start;
3082         key.offset = (u64)-1;
3083
3084         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3085         if (ret < 0)
3086                 return ret;
3087
3088         ASSERT(ret > 0);
3089         /*
3090          * Here we intentionally pass 0 as @min_objectid, as there could be
3091          * an extent item starting before @search_start.
3092          */
3093         ret = btrfs_previous_extent_item(extent_root, path, 0);
3094         if (ret < 0)
3095                 return ret;
3096         /*
3097          * No matter whether we have found an extent item, the next loop will
3098          * properly do every check on the key.
3099          */
3100 search_forward:
3101         while (true) {
3102                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3103                 if (key.objectid >= search_start + search_len)
3104                         break;
3105                 if (key.type != BTRFS_METADATA_ITEM_KEY &&
3106                     key.type != BTRFS_EXTENT_ITEM_KEY)
3107                         goto next;
3108
3109                 ret = compare_extent_item_range(path, search_start, search_len);
3110                 if (ret == 0)
3111                         return ret;
3112                 if (ret > 0)
3113                         break;
3114 next:
3115                 path->slots[0]++;
3116                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
3117                         ret = btrfs_next_leaf(extent_root, path);
3118                         if (ret) {
3119                                 /* Either no more item or fatal error */
3120                                 btrfs_release_path(path);
3121                                 return ret;
3122                         }
3123                 }
3124         }
3125         btrfs_release_path(path);
3126         return 1;
3127 }
3128
3129 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
3130                             u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
3131 {
3132         struct btrfs_key key;
3133         struct btrfs_extent_item *ei;
3134
3135         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3136         ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
3137                key.type == BTRFS_EXTENT_ITEM_KEY);
3138         *extent_start_ret = key.objectid;
3139         if (key.type == BTRFS_METADATA_ITEM_KEY)
3140                 *size_ret = path->nodes[0]->fs_info->nodesize;
3141         else
3142                 *size_ret = key.offset;
3143         ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
3144         *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
3145         *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
3146 }
3147
/*
 * Tell whether the extent [@extent_start, @extent_start + @extent_len)
 * straddles either edge of the range
 * [@boundary_start, @boundary_start + @boudary_len).
 *
 * An extent fully inside or fully outside the range does not cross it.
 */
static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
				      u64 boundary_start, u64 boudary_len)
{
	const u64 extent_end = extent_start + extent_len;
	const u64 boundary_end = boundary_start + boudary_len;

	/* Straddles the start of the range */
	if (extent_start < boundary_start && extent_end > boundary_start)
		return true;
	/* Straddles the end of the range */
	if (extent_start < boundary_end && extent_end > boundary_end)
		return true;
	return false;
}
3156
3157 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
3158                                                struct scrub_parity *sparity,
3159                                                struct map_lookup *map,
3160                                                struct btrfs_device *sdev,
3161                                                struct btrfs_path *path,
3162                                                u64 logical)
3163 {
3164         struct btrfs_fs_info *fs_info = sctx->fs_info;
3165         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3166         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3167         u64 cur_logical = logical;
3168         int ret;
3169
3170         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3171
3172         /* Path must not be populated */
3173         ASSERT(!path->nodes[0]);
3174
3175         while (cur_logical < logical + map->stripe_len) {
3176                 struct btrfs_io_context *bioc = NULL;
3177                 struct btrfs_device *extent_dev;
3178                 u64 extent_start;
3179                 u64 extent_size;
3180                 u64 mapped_length;
3181                 u64 extent_flags;
3182                 u64 extent_gen;
3183                 u64 extent_physical;
3184                 u64 extent_mirror_num;
3185
3186                 ret = find_first_extent_item(extent_root, path, cur_logical,
3187                                              logical + map->stripe_len - cur_logical);
3188                 /* No more extent item in this data stripe */
3189                 if (ret > 0) {
3190                         ret = 0;
3191                         break;
3192                 }
3193                 if (ret < 0)
3194                         break;
3195                 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3196                                 &extent_gen);
3197
3198                 /* Metadata should not cross stripe boundaries */
3199                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3200                     does_range_cross_boundary(extent_start, extent_size,
3201                                               logical, map->stripe_len)) {
3202                         btrfs_err(fs_info,
3203         "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3204                                   extent_start, logical);
3205                         spin_lock(&sctx->stat_lock);
3206                         sctx->stat.uncorrectable_errors++;
3207                         spin_unlock(&sctx->stat_lock);
3208                         cur_logical += extent_size;
3209                         continue;
3210                 }
3211
3212                 /* Skip hole range which doesn't have any extent */
3213                 cur_logical = max(extent_start, cur_logical);
3214
3215                 /* Truncate the range inside this data stripe */
3216                 extent_size = min(extent_start + extent_size,
3217                                   logical + map->stripe_len) - cur_logical;
3218                 extent_start = cur_logical;
3219                 ASSERT(extent_size <= U32_MAX);
3220
3221                 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3222
3223                 mapped_length = extent_size;
3224                 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3225                                       &mapped_length, &bioc, 0);
3226                 if (!ret && (!bioc || mapped_length < extent_size))
3227                         ret = -EIO;
3228                 if (ret) {
3229                         btrfs_put_bioc(bioc);
3230                         scrub_parity_mark_sectors_error(sparity, extent_start,
3231                                                         extent_size);
3232                         break;
3233                 }
3234                 extent_physical = bioc->stripes[0].physical;
3235                 extent_mirror_num = bioc->mirror_num;
3236                 extent_dev = bioc->stripes[0].dev;
3237                 btrfs_put_bioc(bioc);
3238
3239                 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3240                                                extent_start + extent_size - 1,
3241                                                &sctx->csum_list, 1, false);
3242                 if (ret) {
3243                         scrub_parity_mark_sectors_error(sparity, extent_start,
3244                                                         extent_size);
3245                         break;
3246                 }
3247
3248                 ret = scrub_extent_for_parity(sparity, extent_start,
3249                                               extent_size, extent_physical,
3250                                               extent_dev, extent_flags,
3251                                               extent_gen, extent_mirror_num);
3252                 scrub_free_csums(sctx);
3253
3254                 if (ret) {
3255                         scrub_parity_mark_sectors_error(sparity, extent_start,
3256                                                         extent_size);
3257                         break;
3258                 }
3259
3260                 cond_resched();
3261                 cur_logical += extent_size;
3262         }
3263         btrfs_release_path(path);
3264         return ret;
3265 }
3266
3267 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3268                                                   struct map_lookup *map,
3269                                                   struct btrfs_device *sdev,
3270                                                   u64 logic_start,
3271                                                   u64 logic_end)
3272 {
3273         struct btrfs_fs_info *fs_info = sctx->fs_info;
3274         struct btrfs_path *path;
3275         u64 cur_logical;
3276         int ret;
3277         struct scrub_parity *sparity;
3278         int nsectors;
3279
3280         path = btrfs_alloc_path();
3281         if (!path) {
3282                 spin_lock(&sctx->stat_lock);
3283                 sctx->stat.malloc_errors++;
3284                 spin_unlock(&sctx->stat_lock);
3285                 return -ENOMEM;
3286         }
3287         path->search_commit_root = 1;
3288         path->skip_locking = 1;
3289
3290         ASSERT(map->stripe_len <= U32_MAX);
3291         nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3292         ASSERT(nsectors <= BITS_PER_LONG);
3293         sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3294         if (!sparity) {
3295                 spin_lock(&sctx->stat_lock);
3296                 sctx->stat.malloc_errors++;
3297                 spin_unlock(&sctx->stat_lock);
3298                 btrfs_free_path(path);
3299                 return -ENOMEM;
3300         }
3301
3302         ASSERT(map->stripe_len <= U32_MAX);
3303         sparity->stripe_len = map->stripe_len;
3304         sparity->nsectors = nsectors;
3305         sparity->sctx = sctx;
3306         sparity->scrub_dev = sdev;
3307         sparity->logic_start = logic_start;
3308         sparity->logic_end = logic_end;
3309         refcount_set(&sparity->refs, 1);
3310         INIT_LIST_HEAD(&sparity->sectors_list);
3311
3312         ret = 0;
3313         for (cur_logical = logic_start; cur_logical < logic_end;
3314              cur_logical += map->stripe_len) {
3315                 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3316                                                           sdev, path, cur_logical);
3317                 if (ret < 0)
3318                         break;
3319         }
3320
3321         scrub_parity_put(sparity);
3322         scrub_submit(sctx);
3323         mutex_lock(&sctx->wr_lock);
3324         scrub_wr_submit(sctx);
3325         mutex_unlock(&sctx->wr_lock);
3326
3327         btrfs_free_path(path);
3328         return ret < 0 ? ret : 0;
3329 }
3330
3331 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3332 {
3333         if (!btrfs_is_zoned(sctx->fs_info))
3334                 return;
3335
3336         sctx->flush_all_writes = true;
3337         scrub_submit(sctx);
3338         mutex_lock(&sctx->wr_lock);
3339         scrub_wr_submit(sctx);
3340         mutex_unlock(&sctx->wr_lock);
3341
3342         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3343 }
3344
3345 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3346                                         u64 physical, u64 physical_end)
3347 {
3348         struct btrfs_fs_info *fs_info = sctx->fs_info;
3349         int ret = 0;
3350
3351         if (!btrfs_is_zoned(fs_info))
3352                 return 0;
3353
3354         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3355
3356         mutex_lock(&sctx->wr_lock);
3357         if (sctx->write_pointer < physical_end) {
3358                 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3359                                                     physical,
3360                                                     sctx->write_pointer);
3361                 if (ret)
3362                         btrfs_err(fs_info,
3363                                   "zoned: failed to recover write pointer");
3364         }
3365         mutex_unlock(&sctx->wr_lock);
3366         btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3367
3368         return ret;
3369 }
3370
3371 /*
3372  * Scrub one range which can only has simple mirror based profile.
3373  * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3374  *  RAID0/RAID10).
3375  *
3376  * Since we may need to handle a subset of block group, we need @logical_start
3377  * and @logical_length parameter.
3378  */
3379 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3380                                struct btrfs_root *extent_root,
3381                                struct btrfs_root *csum_root,
3382                                struct btrfs_block_group *bg,
3383                                struct map_lookup *map,
3384                                u64 logical_start, u64 logical_length,
3385                                struct btrfs_device *device,
3386                                u64 physical, int mirror_num)
3387 {
3388         struct btrfs_fs_info *fs_info = sctx->fs_info;
3389         const u64 logical_end = logical_start + logical_length;
3390         /* An artificial limit, inherit from old scrub behavior */
3391         const u32 max_length = SZ_64K;
3392         struct btrfs_path path = { 0 };
3393         u64 cur_logical = logical_start;
3394         int ret;
3395
3396         /* The range must be inside the bg */
3397         ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3398
3399         path.search_commit_root = 1;
3400         path.skip_locking = 1;
3401         /* Go through each extent items inside the logical range */
3402         while (cur_logical < logical_end) {
3403                 u64 extent_start;
3404                 u64 extent_len;
3405                 u64 extent_flags;
3406                 u64 extent_gen;
3407                 u64 scrub_len;
3408
3409                 /* Canceled? */
3410                 if (atomic_read(&fs_info->scrub_cancel_req) ||
3411                     atomic_read(&sctx->cancel_req)) {
3412                         ret = -ECANCELED;
3413                         break;
3414                 }
3415                 /* Paused? */
3416                 if (atomic_read(&fs_info->scrub_pause_req)) {
3417                         /* Push queued extents */
3418                         sctx->flush_all_writes = true;
3419                         scrub_submit(sctx);
3420                         mutex_lock(&sctx->wr_lock);
3421                         scrub_wr_submit(sctx);
3422                         mutex_unlock(&sctx->wr_lock);
3423                         wait_event(sctx->list_wait,
3424                                    atomic_read(&sctx->bios_in_flight) == 0);
3425                         sctx->flush_all_writes = false;
3426                         scrub_blocked_if_needed(fs_info);
3427                 }
3428                 /* Block group removed? */
3429                 spin_lock(&bg->lock);
3430                 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
3431                         spin_unlock(&bg->lock);
3432                         ret = 0;
3433                         break;
3434                 }
3435                 spin_unlock(&bg->lock);
3436
3437                 ret = find_first_extent_item(extent_root, &path, cur_logical,
3438                                              logical_end - cur_logical);
3439                 if (ret > 0) {
3440                         /* No more extent, just update the accounting */
3441                         sctx->stat.last_physical = physical + logical_length;
3442                         ret = 0;
3443                         break;
3444                 }
3445                 if (ret < 0)
3446                         break;
3447                 get_extent_info(&path, &extent_start, &extent_len,
3448                                 &extent_flags, &extent_gen);
3449                 /* Skip hole range which doesn't have any extent */
3450                 cur_logical = max(extent_start, cur_logical);
3451
3452                 /*
3453                  * Scrub len has three limits:
3454                  * - Extent size limit
3455                  * - Scrub range limit
3456                  *   This is especially imporatant for RAID0/RAID10 to reuse
3457                  *   this function
3458                  * - Max scrub size limit
3459                  */
3460                 scrub_len = min(min(extent_start + extent_len,
3461                                     logical_end), cur_logical + max_length) -
3462                             cur_logical;
3463
3464                 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3465                         ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3466                                         cur_logical + scrub_len - 1,
3467                                         &sctx->csum_list, 1, false);
3468                         if (ret)
3469                                 break;
3470                 }
3471                 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3472                     does_range_cross_boundary(extent_start, extent_len,
3473                                               logical_start, logical_length)) {
3474                         btrfs_err(fs_info,
3475 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3476                                   extent_start, logical_start, logical_end);
3477                         spin_lock(&sctx->stat_lock);
3478                         sctx->stat.uncorrectable_errors++;
3479                         spin_unlock(&sctx->stat_lock);
3480                         cur_logical += scrub_len;
3481                         continue;
3482                 }
3483                 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3484                                    cur_logical - logical_start + physical,
3485                                    device, extent_flags, extent_gen,
3486                                    mirror_num);
3487                 scrub_free_csums(sctx);
3488                 if (ret)
3489                         break;
3490                 if (sctx->is_dev_replace)
3491                         sync_replace_for_zoned(sctx);
3492                 cur_logical += scrub_len;
3493                 /* Don't hold CPU for too long time */
3494                 cond_resched();
3495         }
3496         btrfs_release_path(&path);
3497         return ret;
3498 }
3499
3500 /* Calculate the full stripe length for simple stripe based profiles */
3501 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3502 {
3503         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3504                             BTRFS_BLOCK_GROUP_RAID10));
3505
3506         return map->num_stripes / map->sub_stripes * map->stripe_len;
3507 }
3508
3509 /* Get the logical bytenr for the stripe */
3510 static u64 simple_stripe_get_logical(struct map_lookup *map,
3511                                      struct btrfs_block_group *bg,
3512                                      int stripe_index)
3513 {
3514         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3515                             BTRFS_BLOCK_GROUP_RAID10));
3516         ASSERT(stripe_index < map->num_stripes);
3517
3518         /*
3519          * (stripe_index / sub_stripes) gives how many data stripes we need to
3520          * skip.
3521          */
3522         return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3523 }
3524
3525 /* Get the mirror number for the stripe */
3526 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3527 {
3528         ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3529                             BTRFS_BLOCK_GROUP_RAID10));
3530         ASSERT(stripe_index < map->num_stripes);
3531
3532         /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */
3533         return stripe_index % map->sub_stripes + 1;
3534 }
3535
3536 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3537                                struct btrfs_root *extent_root,
3538                                struct btrfs_root *csum_root,
3539                                struct btrfs_block_group *bg,
3540                                struct map_lookup *map,
3541                                struct btrfs_device *device,
3542                                int stripe_index)
3543 {
3544         const u64 logical_increment = simple_stripe_full_stripe_len(map);
3545         const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3546         const u64 orig_physical = map->stripes[stripe_index].physical;
3547         const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3548         u64 cur_logical = orig_logical;
3549         u64 cur_physical = orig_physical;
3550         int ret = 0;
3551
3552         while (cur_logical < bg->start + bg->length) {
3553                 /*
3554                  * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3555                  * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3556                  * this stripe.
3557                  */
3558                 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3559                                           cur_logical, map->stripe_len, device,
3560                                           cur_physical, mirror_num);
3561                 if (ret)
3562                         return ret;
3563                 /* Skip to next stripe which belongs to the target device */
3564                 cur_logical += logical_increment;
3565                 /* For physical offset, we just go to next stripe */
3566                 cur_physical += map->stripe_len;
3567         }
3568         return ret;
3569 }
3570
3571 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3572                                            struct btrfs_block_group *bg,
3573                                            struct extent_map *em,
3574                                            struct btrfs_device *scrub_dev,
3575                                            int stripe_index)
3576 {
3577         struct btrfs_path *path;
3578         struct btrfs_fs_info *fs_info = sctx->fs_info;
3579         struct btrfs_root *root;
3580         struct btrfs_root *csum_root;
3581         struct blk_plug plug;
3582         struct map_lookup *map = em->map_lookup;
3583         const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3584         const u64 chunk_logical = bg->start;
3585         int ret;
3586         u64 physical = map->stripes[stripe_index].physical;
3587         const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3588         const u64 physical_end = physical + dev_stripe_len;
3589         u64 logical;
3590         u64 logic_end;
3591         /* The logical increment after finishing one stripe */
3592         u64 increment;
3593         /* Offset inside the chunk */
3594         u64 offset;
3595         u64 stripe_logical;
3596         u64 stripe_end;
3597         int stop_loop = 0;
3598
3599         path = btrfs_alloc_path();
3600         if (!path)
3601                 return -ENOMEM;
3602
3603         /*
3604          * work on commit root. The related disk blocks are static as
3605          * long as COW is applied. This means, it is save to rewrite
3606          * them to repair disk errors without any race conditions
3607          */
3608         path->search_commit_root = 1;
3609         path->skip_locking = 1;
3610         path->reada = READA_FORWARD;
3611
3612         wait_event(sctx->list_wait,
3613                    atomic_read(&sctx->bios_in_flight) == 0);
3614         scrub_blocked_if_needed(fs_info);
3615
3616         root = btrfs_extent_root(fs_info, bg->start);
3617         csum_root = btrfs_csum_root(fs_info, bg->start);
3618
3619         /*
3620          * collect all data csums for the stripe to avoid seeking during
3621          * the scrub. This might currently (crc32) end up to be about 1MB
3622          */
3623         blk_start_plug(&plug);
3624
3625         if (sctx->is_dev_replace &&
3626             btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3627                 mutex_lock(&sctx->wr_lock);
3628                 sctx->write_pointer = physical;
3629                 mutex_unlock(&sctx->wr_lock);
3630                 sctx->flush_all_writes = true;
3631         }
3632
3633         /*
3634          * There used to be a big double loop to handle all profiles using the
3635          * same routine, which grows larger and more gross over time.
3636          *
3637          * So here we handle each profile differently, so simpler profiles
3638          * have simpler scrubbing function.
3639          */
3640         if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3641                          BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3642                 /*
3643                  * Above check rules out all complex profile, the remaining
3644                  * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
3645                  * mirrored duplication without stripe.
3646                  *
3647                  * Only @physical and @mirror_num needs to calculated using
3648                  * @stripe_index.
3649                  */
3650                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3651                                 bg->start, bg->length, scrub_dev,
3652                                 map->stripes[stripe_index].physical,
3653                                 stripe_index + 1);
3654                 offset = 0;
3655                 goto out;
3656         }
3657         if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3658                 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3659                                           scrub_dev, stripe_index);
3660                 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3661                 goto out;
3662         }
3663
3664         /* Only RAID56 goes through the old code */
3665         ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3666         ret = 0;
3667
3668         /* Calculate the logical end of the stripe */
3669         get_raid56_logic_offset(physical_end, stripe_index,
3670                                 map, &logic_end, NULL);
3671         logic_end += chunk_logical;
3672
3673         /* Initialize @offset in case we need to go to out: label */
3674         get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3675         increment = map->stripe_len * nr_data_stripes(map);
3676
3677         /*
3678          * Due to the rotation, for RAID56 it's better to iterate each stripe
3679          * using their physical offset.
3680          */
3681         while (physical < physical_end) {
3682                 ret = get_raid56_logic_offset(physical, stripe_index, map,
3683                                               &logical, &stripe_logical);
3684                 logical += chunk_logical;
3685                 if (ret) {
3686                         /* it is parity strip */
3687                         stripe_logical += chunk_logical;
3688                         stripe_end = stripe_logical + increment;
3689                         ret = scrub_raid56_parity(sctx, map, scrub_dev,
3690                                                   stripe_logical,
3691                                                   stripe_end);
3692                         if (ret)
3693                                 goto out;
3694                         goto next;
3695                 }
3696
3697                 /*
3698                  * Now we're at a data stripe, scrub each extents in the range.
3699                  *
3700                  * At this stage, if we ignore the repair part, inside each data
3701                  * stripe it is no different than SINGLE profile.
3702                  * We can reuse scrub_simple_mirror() here, as the repair part
3703                  * is still based on @mirror_num.
3704                  */
3705                 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3706                                           logical, map->stripe_len,
3707                                           scrub_dev, physical, 1);
3708                 if (ret < 0)
3709                         goto out;
3710 next:
3711                 logical += increment;
3712                 physical += map->stripe_len;
3713                 spin_lock(&sctx->stat_lock);
3714                 if (stop_loop)
3715                         sctx->stat.last_physical =
3716                                 map->stripes[stripe_index].physical + dev_stripe_len;
3717                 else
3718                         sctx->stat.last_physical = physical;
3719                 spin_unlock(&sctx->stat_lock);
3720                 if (stop_loop)
3721                         break;
3722         }
3723 out:
3724         /* push queued extents */
3725         scrub_submit(sctx);
3726         mutex_lock(&sctx->wr_lock);
3727         scrub_wr_submit(sctx);
3728         mutex_unlock(&sctx->wr_lock);
3729
3730         blk_finish_plug(&plug);
3731         btrfs_free_path(path);
3732
3733         if (sctx->is_dev_replace && ret >= 0) {
3734                 int ret2;
3735
3736                 ret2 = sync_write_pointer_for_zoned(sctx,
3737                                 chunk_logical + offset,
3738                                 map->stripes[stripe_index].physical,
3739                                 physical_end);
3740                 if (ret2)
3741                         ret = ret2;
3742         }
3743
3744         return ret < 0 ? ret : 0;
3745 }
3746
3747 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3748                                           struct btrfs_block_group *bg,
3749                                           struct btrfs_device *scrub_dev,
3750                                           u64 dev_offset,
3751                                           u64 dev_extent_len)
3752 {
3753         struct btrfs_fs_info *fs_info = sctx->fs_info;
3754         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3755         struct map_lookup *map;
3756         struct extent_map *em;
3757         int i;
3758         int ret = 0;
3759
3760         read_lock(&map_tree->lock);
3761         em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3762         read_unlock(&map_tree->lock);
3763
3764         if (!em) {
3765                 /*
3766                  * Might have been an unused block group deleted by the cleaner
3767                  * kthread or relocation.
3768                  */
3769                 spin_lock(&bg->lock);
3770                 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
3771                         ret = -EINVAL;
3772                 spin_unlock(&bg->lock);
3773
3774                 return ret;
3775         }
3776         if (em->start != bg->start)
3777                 goto out;
3778         if (em->len < dev_extent_len)
3779                 goto out;
3780
3781         map = em->map_lookup;
3782         for (i = 0; i < map->num_stripes; ++i) {
3783                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3784                     map->stripes[i].physical == dev_offset) {
3785                         ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3786                         if (ret)
3787                                 goto out;
3788                 }
3789         }
3790 out:
3791         free_extent_map(em);
3792
3793         return ret;
3794 }
3795
3796 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3797                                           struct btrfs_block_group *cache)
3798 {
3799         struct btrfs_fs_info *fs_info = cache->fs_info;
3800         struct btrfs_trans_handle *trans;
3801
3802         if (!btrfs_is_zoned(fs_info))
3803                 return 0;
3804
3805         btrfs_wait_block_group_reservations(cache);
3806         btrfs_wait_nocow_writers(cache);
3807         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3808
3809         trans = btrfs_join_transaction(root);
3810         if (IS_ERR(trans))
3811                 return PTR_ERR(trans);
3812         return btrfs_commit_transaction(trans);
3813 }
3814
3815 static noinline_for_stack
3816 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3817                            struct btrfs_device *scrub_dev, u64 start, u64 end)
3818 {
3819         struct btrfs_dev_extent *dev_extent = NULL;
3820         struct btrfs_path *path;
3821         struct btrfs_fs_info *fs_info = sctx->fs_info;
3822         struct btrfs_root *root = fs_info->dev_root;
3823         u64 chunk_offset;
3824         int ret = 0;
3825         int ro_set;
3826         int slot;
3827         struct extent_buffer *l;
3828         struct btrfs_key key;
3829         struct btrfs_key found_key;
3830         struct btrfs_block_group *cache;
3831         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3832
3833         path = btrfs_alloc_path();
3834         if (!path)
3835                 return -ENOMEM;
3836
3837         path->reada = READA_FORWARD;
3838         path->search_commit_root = 1;
3839         path->skip_locking = 1;
3840
3841         key.objectid = scrub_dev->devid;
3842         key.offset = 0ull;
3843         key.type = BTRFS_DEV_EXTENT_KEY;
3844
3845         while (1) {
3846                 u64 dev_extent_len;
3847
3848                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3849                 if (ret < 0)
3850                         break;
3851                 if (ret > 0) {
3852                         if (path->slots[0] >=
3853                             btrfs_header_nritems(path->nodes[0])) {
3854                                 ret = btrfs_next_leaf(root, path);
3855                                 if (ret < 0)
3856                                         break;
3857                                 if (ret > 0) {
3858                                         ret = 0;
3859                                         break;
3860                                 }
3861                         } else {
3862                                 ret = 0;
3863                         }
3864                 }
3865
3866                 l = path->nodes[0];
3867                 slot = path->slots[0];
3868
3869                 btrfs_item_key_to_cpu(l, &found_key, slot);
3870
3871                 if (found_key.objectid != scrub_dev->devid)
3872                         break;
3873
3874                 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3875                         break;
3876
3877                 if (found_key.offset >= end)
3878                         break;
3879
3880                 if (found_key.offset < key.offset)
3881                         break;
3882
3883                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3884                 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3885
3886                 if (found_key.offset + dev_extent_len <= start)
3887                         goto skip;
3888
3889                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3890
3891                 /*
3892                  * get a reference on the corresponding block group to prevent
3893                  * the chunk from going away while we scrub it
3894                  */
3895                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3896
3897                 /* some chunks are removed but not committed to disk yet,
3898                  * continue scrubbing */
3899                 if (!cache)
3900                         goto skip;
3901
3902                 ASSERT(cache->start <= chunk_offset);
3903                 /*
3904                  * We are using the commit root to search for device extents, so
3905                  * that means we could have found a device extent item from a
3906                  * block group that was deleted in the current transaction. The
3907                  * logical start offset of the deleted block group, stored at
3908                  * @chunk_offset, might be part of the logical address range of
3909                  * a new block group (which uses different physical extents).
3910                  * In this case btrfs_lookup_block_group() has returned the new
3911                  * block group, and its start address is less than @chunk_offset.
3912                  *
3913                  * We skip such new block groups, because it's pointless to
3914                  * process them, as we won't find their extents because we search
3915                  * for them using the commit root of the extent tree. For a device
3916                  * replace it's also fine to skip it, we won't miss copying them
3917                  * to the target device because we have the write duplication
3918                  * setup through the regular write path (by btrfs_map_block()),
3919                  * and we have committed a transaction when we started the device
3920                  * replace, right after setting up the device replace state.
3921                  */
3922                 if (cache->start < chunk_offset) {
3923                         btrfs_put_block_group(cache);
3924                         goto skip;
3925                 }
3926
3927                 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3928                         if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
3929                                 btrfs_put_block_group(cache);
3930                                 goto skip;
3931                         }
3932                 }
3933
3934                 /*
3935                  * Make sure that while we are scrubbing the corresponding block
3936                  * group doesn't get its logical address and its device extents
3937                  * reused for another block group, which can possibly be of a
3938                  * different type and different profile. We do this to prevent
3939                  * false error detections and crashes due to bogus attempts to
3940                  * repair extents.
3941                  */
3942                 spin_lock(&cache->lock);
3943                 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
3944                         spin_unlock(&cache->lock);
3945                         btrfs_put_block_group(cache);
3946                         goto skip;
3947                 }
3948                 btrfs_freeze_block_group(cache);
3949                 spin_unlock(&cache->lock);
3950
3951                 /*
3952                  * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3953                  * to avoid deadlock caused by:
3954                  * btrfs_inc_block_group_ro()
3955                  * -> btrfs_wait_for_commit()
3956                  * -> btrfs_commit_transaction()
3957                  * -> btrfs_scrub_pause()
3958                  */
3959                 scrub_pause_on(fs_info);
3960
3961                 /*
3962                  * Don't do chunk preallocation for scrub.
3963                  *
3964                  * This is especially important for SYSTEM bgs, or we can hit
3965                  * -EFBIG from btrfs_finish_chunk_alloc() like:
3966                  * 1. The only SYSTEM bg is marked RO.
3967                  *    Since SYSTEM bg is small, that's pretty common.
3968                  * 2. New SYSTEM bg will be allocated
3969                  *    Due to regular version will allocate new chunk.
3970                  * 3. New SYSTEM bg is empty and will get cleaned up
3971                  *    Before cleanup really happens, it's marked RO again.
3972                  * 4. Empty SYSTEM bg get scrubbed
3973                  *    We go back to 2.
3974                  *
3975                  * This can easily boost the amount of SYSTEM chunks if cleaner
3976                  * thread can't be triggered fast enough, and use up all space
3977                  * of btrfs_super_block::sys_chunk_array
3978                  *
3979                  * While for dev replace, we need to try our best to mark block
3980                  * group RO, to prevent race between:
3981                  * - Write duplication
3982                  *   Contains latest data
3983                  * - Scrub copy
3984                  *   Contains data from commit tree
3985                  *
3986                  * If target block group is not marked RO, nocow writes can
3987                  * be overwritten by scrub copy, causing data corruption.
3988                  * So for dev-replace, it's not allowed to continue if a block
3989                  * group is not RO.
3990                  */
3991                 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3992                 if (!ret && sctx->is_dev_replace) {
3993                         ret = finish_extent_writes_for_zoned(root, cache);
3994                         if (ret) {
3995                                 btrfs_dec_block_group_ro(cache);
3996                                 scrub_pause_off(fs_info);
3997                                 btrfs_put_block_group(cache);
3998                                 break;
3999                         }
4000                 }
4001
4002                 if (ret == 0) {
4003                         ro_set = 1;
4004                 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
4005                         /*
4006                          * btrfs_inc_block_group_ro return -ENOSPC when it
4007                          * failed in creating new chunk for metadata.
4008                          * It is not a problem for scrub, because
4009                          * metadata are always cowed, and our scrub paused
4010                          * commit_transactions.
4011                          */
4012                         ro_set = 0;
4013                 } else if (ret == -ETXTBSY) {
4014                         btrfs_warn(fs_info,
4015                    "skipping scrub of block group %llu due to active swapfile",
4016                                    cache->start);
4017                         scrub_pause_off(fs_info);
4018                         ret = 0;
4019                         goto skip_unfreeze;
4020                 } else {
4021                         btrfs_warn(fs_info,
4022                                    "failed setting block group ro: %d", ret);
4023                         btrfs_unfreeze_block_group(cache);
4024                         btrfs_put_block_group(cache);
4025                         scrub_pause_off(fs_info);
4026                         break;
4027                 }
4028
4029                 /*
4030                  * Now the target block is marked RO, wait for nocow writes to
4031                  * finish before dev-replace.
4032                  * COW is fine, as COW never overwrites extents in commit tree.
4033                  */
4034                 if (sctx->is_dev_replace) {
4035                         btrfs_wait_nocow_writers(cache);
4036                         btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
4037                                         cache->length);
4038                 }
4039
4040                 scrub_pause_off(fs_info);
4041                 down_write(&dev_replace->rwsem);
4042                 dev_replace->cursor_right = found_key.offset + dev_extent_len;
4043                 dev_replace->cursor_left = found_key.offset;
4044                 dev_replace->item_needs_writeback = 1;
4045                 up_write(&dev_replace->rwsem);
4046
4047                 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
4048                                   dev_extent_len);
4049
4050                 /*
4051                  * flush, submit all pending read and write bios, afterwards
4052                  * wait for them.
4053                  * Note that in the dev replace case, a read request causes
4054                  * write requests that are submitted in the read completion
4055                  * worker. Therefore in the current situation, it is required
4056                  * that all write requests are flushed, so that all read and
4057                  * write requests are really completed when bios_in_flight
4058                  * changes to 0.
4059                  */
4060                 sctx->flush_all_writes = true;
4061                 scrub_submit(sctx);
4062                 mutex_lock(&sctx->wr_lock);
4063                 scrub_wr_submit(sctx);
4064                 mutex_unlock(&sctx->wr_lock);
4065
4066                 wait_event(sctx->list_wait,
4067                            atomic_read(&sctx->bios_in_flight) == 0);
4068
4069                 scrub_pause_on(fs_info);
4070
4071                 /*
4072                  * must be called before we decrease @scrub_paused.
4073                  * make sure we don't block transaction commit while
4074                  * we are waiting pending workers finished.
4075                  */
4076                 wait_event(sctx->list_wait,
4077                            atomic_read(&sctx->workers_pending) == 0);
4078                 sctx->flush_all_writes = false;
4079
4080                 scrub_pause_off(fs_info);
4081
4082                 if (sctx->is_dev_replace &&
4083                     !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
4084                                                       cache, found_key.offset))
4085                         ro_set = 0;
4086
4087                 down_write(&dev_replace->rwsem);
4088                 dev_replace->cursor_left = dev_replace->cursor_right;
4089                 dev_replace->item_needs_writeback = 1;
4090                 up_write(&dev_replace->rwsem);
4091
4092                 if (ro_set)
4093                         btrfs_dec_block_group_ro(cache);
4094
4095                 /*
4096                  * We might have prevented the cleaner kthread from deleting
4097                  * this block group if it was already unused because we raced
4098                  * and set it to RO mode first. So add it back to the unused
4099                  * list, otherwise it might not ever be deleted unless a manual
4100                  * balance is triggered or it becomes used and unused again.
4101                  */
4102                 spin_lock(&cache->lock);
4103                 if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
4104                     !cache->ro && cache->reserved == 0 && cache->used == 0) {
4105                         spin_unlock(&cache->lock);
4106                         if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
4107                                 btrfs_discard_queue_work(&fs_info->discard_ctl,
4108                                                          cache);
4109                         else
4110                                 btrfs_mark_bg_unused(cache);
4111                 } else {
4112                         spin_unlock(&cache->lock);
4113                 }
4114 skip_unfreeze:
4115                 btrfs_unfreeze_block_group(cache);
4116                 btrfs_put_block_group(cache);
4117                 if (ret)
4118                         break;
4119                 if (sctx->is_dev_replace &&
4120                     atomic64_read(&dev_replace->num_write_errors) > 0) {
4121                         ret = -EIO;
4122                         break;
4123                 }
4124                 if (sctx->stat.malloc_errors > 0) {
4125                         ret = -ENOMEM;
4126                         break;
4127                 }
4128 skip:
4129                 key.offset = found_key.offset + dev_extent_len;
4130                 btrfs_release_path(path);
4131         }
4132
4133         btrfs_free_path(path);
4134
4135         return ret;
4136 }
4137
4138 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4139                                            struct btrfs_device *scrub_dev)
4140 {
4141         int     i;
4142         u64     bytenr;
4143         u64     gen;
4144         int     ret;
4145         struct btrfs_fs_info *fs_info = sctx->fs_info;
4146
4147         if (BTRFS_FS_ERROR(fs_info))
4148                 return -EROFS;
4149
4150         /* Seed devices of a new filesystem has their own generation. */
4151         if (scrub_dev->fs_devices != fs_info->fs_devices)
4152                 gen = scrub_dev->generation;
4153         else
4154                 gen = fs_info->last_trans_committed;
4155
4156         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4157                 bytenr = btrfs_sb_offset(i);
4158                 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4159                     scrub_dev->commit_total_bytes)
4160                         break;
4161                 if (!btrfs_check_super_location(scrub_dev, bytenr))
4162                         continue;
4163
4164                 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4165                                     scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4166                                     NULL, bytenr);
4167                 if (ret)
4168                         return ret;
4169         }
4170         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4171
4172         return 0;
4173 }
4174
4175 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4176 {
4177         if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4178                                         &fs_info->scrub_lock)) {
4179                 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4180                 struct workqueue_struct *scrub_wr_comp =
4181                                                 fs_info->scrub_wr_completion_workers;
4182                 struct workqueue_struct *scrub_parity =
4183                                                 fs_info->scrub_parity_workers;
4184
4185                 fs_info->scrub_workers = NULL;
4186                 fs_info->scrub_wr_completion_workers = NULL;
4187                 fs_info->scrub_parity_workers = NULL;
4188                 mutex_unlock(&fs_info->scrub_lock);
4189
4190                 if (scrub_workers)
4191                         destroy_workqueue(scrub_workers);
4192                 if (scrub_wr_comp)
4193                         destroy_workqueue(scrub_wr_comp);
4194                 if (scrub_parity)
4195                         destroy_workqueue(scrub_parity);
4196         }
4197 }
4198
4199 /*
4200  * get a reference count on fs_info->scrub_workers. start worker if necessary
4201  */
4202 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4203                                                 int is_dev_replace)
4204 {
4205         struct workqueue_struct *scrub_workers = NULL;
4206         struct workqueue_struct *scrub_wr_comp = NULL;
4207         struct workqueue_struct *scrub_parity = NULL;
4208         unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4209         int max_active = fs_info->thread_pool_size;
4210         int ret = -ENOMEM;
4211
4212         if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4213                 return 0;
4214
4215         scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4216                                         is_dev_replace ? 1 : max_active);
4217         if (!scrub_workers)
4218                 goto fail_scrub_workers;
4219
4220         scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4221         if (!scrub_wr_comp)
4222                 goto fail_scrub_wr_completion_workers;
4223
4224         scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4225         if (!scrub_parity)
4226                 goto fail_scrub_parity_workers;
4227
4228         mutex_lock(&fs_info->scrub_lock);
4229         if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4230                 ASSERT(fs_info->scrub_workers == NULL &&
4231                        fs_info->scrub_wr_completion_workers == NULL &&
4232                        fs_info->scrub_parity_workers == NULL);
4233                 fs_info->scrub_workers = scrub_workers;
4234                 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4235                 fs_info->scrub_parity_workers = scrub_parity;
4236                 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4237                 mutex_unlock(&fs_info->scrub_lock);
4238                 return 0;
4239         }
4240         /* Other thread raced in and created the workers for us */
4241         refcount_inc(&fs_info->scrub_workers_refcnt);
4242         mutex_unlock(&fs_info->scrub_lock);
4243
4244         ret = 0;
4245         destroy_workqueue(scrub_parity);
4246 fail_scrub_parity_workers:
4247         destroy_workqueue(scrub_wr_comp);
4248 fail_scrub_wr_completion_workers:
4249         destroy_workqueue(scrub_workers);
4250 fail_scrub_workers:
4251         return ret;
4252 }
4253
4254 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4255                     u64 end, struct btrfs_scrub_progress *progress,
4256                     int readonly, int is_dev_replace)
4257 {
4258         struct btrfs_dev_lookup_args args = { .devid = devid };
4259         struct scrub_ctx *sctx;
4260         int ret;
4261         struct btrfs_device *dev;
4262         unsigned int nofs_flag;
4263         bool need_commit = false;
4264
4265         if (btrfs_fs_closing(fs_info))
4266                 return -EAGAIN;
4267
4268         /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
4269         ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
4270
4271         /*
4272          * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
4273          * value (max nodesize / min sectorsize), thus nodesize should always
4274          * be fine.
4275          */
4276         ASSERT(fs_info->nodesize <=
4277                SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
4278
4279         /* Allocate outside of device_list_mutex */
4280         sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4281         if (IS_ERR(sctx))
4282                 return PTR_ERR(sctx);
4283
4284         ret = scrub_workers_get(fs_info, is_dev_replace);
4285         if (ret)
4286                 goto out_free_ctx;
4287
4288         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4289         dev = btrfs_find_device(fs_info->fs_devices, &args);
4290         if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4291                      !is_dev_replace)) {
4292                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4293                 ret = -ENODEV;
4294                 goto out;
4295         }
4296
4297         if (!is_dev_replace && !readonly &&
4298             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4299                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4300                 btrfs_err_in_rcu(fs_info,
4301                         "scrub on devid %llu: filesystem on %s is not writable",
4302                                  devid, rcu_str_deref(dev->name));
4303                 ret = -EROFS;
4304                 goto out;
4305         }
4306
4307         mutex_lock(&fs_info->scrub_lock);
4308         if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4309             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4310                 mutex_unlock(&fs_info->scrub_lock);
4311                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4312                 ret = -EIO;
4313                 goto out;
4314         }
4315
4316         down_read(&fs_info->dev_replace.rwsem);
4317         if (dev->scrub_ctx ||
4318             (!is_dev_replace &&
4319              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4320                 up_read(&fs_info->dev_replace.rwsem);
4321                 mutex_unlock(&fs_info->scrub_lock);
4322                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4323                 ret = -EINPROGRESS;
4324                 goto out;
4325         }
4326         up_read(&fs_info->dev_replace.rwsem);
4327
4328         sctx->readonly = readonly;
4329         dev->scrub_ctx = sctx;
4330         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4331
4332         /*
4333          * checking @scrub_pause_req here, we can avoid
4334          * race between committing transaction and scrubbing.
4335          */
4336         __scrub_blocked_if_needed(fs_info);
4337         atomic_inc(&fs_info->scrubs_running);
4338         mutex_unlock(&fs_info->scrub_lock);
4339
4340         /*
4341          * In order to avoid deadlock with reclaim when there is a transaction
4342          * trying to pause scrub, make sure we use GFP_NOFS for all the
4343          * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
4344          * invoked by our callees. The pausing request is done when the
4345          * transaction commit starts, and it blocks the transaction until scrub
4346          * is paused (done at specific points at scrub_stripe() or right above
4347          * before incrementing fs_info->scrubs_running).
4348          */
4349         nofs_flag = memalloc_nofs_save();
4350         if (!is_dev_replace) {
4351                 u64 old_super_errors;
4352
4353                 spin_lock(&sctx->stat_lock);
4354                 old_super_errors = sctx->stat.super_errors;
4355                 spin_unlock(&sctx->stat_lock);
4356
4357                 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4358                 /*
4359                  * by holding device list mutex, we can
4360                  * kick off writing super in log tree sync.
4361                  */
4362                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4363                 ret = scrub_supers(sctx, dev);
4364                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4365
4366                 spin_lock(&sctx->stat_lock);
4367                 /*
4368                  * Super block errors found, but we can not commit transaction
4369                  * at current context, since btrfs_commit_transaction() needs
4370                  * to pause the current running scrub (hold by ourselves).
4371                  */
4372                 if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
4373                         need_commit = true;
4374                 spin_unlock(&sctx->stat_lock);
4375         }
4376
4377         if (!ret)
4378                 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4379         memalloc_nofs_restore(nofs_flag);
4380
4381         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4382         atomic_dec(&fs_info->scrubs_running);
4383         wake_up(&fs_info->scrub_pause_wait);
4384
4385         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4386
4387         if (progress)
4388                 memcpy(progress, &sctx->stat, sizeof(*progress));
4389
4390         if (!is_dev_replace)
4391                 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4392                         ret ? "not finished" : "finished", devid, ret);
4393
4394         mutex_lock(&fs_info->scrub_lock);
4395         dev->scrub_ctx = NULL;
4396         mutex_unlock(&fs_info->scrub_lock);
4397
4398         scrub_workers_put(fs_info);
4399         scrub_put_ctx(sctx);
4400
4401         /*
4402          * We found some super block errors before, now try to force a
4403          * transaction commit, as scrub has finished.
4404          */
4405         if (need_commit) {
4406                 struct btrfs_trans_handle *trans;
4407
4408                 trans = btrfs_start_transaction(fs_info->tree_root, 0);
4409                 if (IS_ERR(trans)) {
4410                         ret = PTR_ERR(trans);
4411                         btrfs_err(fs_info,
4412         "scrub: failed to start transaction to fix super block errors: %d", ret);
4413                         return ret;
4414                 }
4415                 ret = btrfs_commit_transaction(trans);
4416                 if (ret < 0)
4417                         btrfs_err(fs_info,
4418         "scrub: failed to commit transaction to fix super block errors: %d", ret);
4419         }
4420         return ret;
4421 out:
4422         scrub_workers_put(fs_info);
4423 out_free_ctx:
4424         scrub_free_ctx(sctx);
4425
4426         return ret;
4427 }
4428
4429 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4430 {
4431         mutex_lock(&fs_info->scrub_lock);
4432         atomic_inc(&fs_info->scrub_pause_req);
4433         while (atomic_read(&fs_info->scrubs_paused) !=
4434                atomic_read(&fs_info->scrubs_running)) {
4435                 mutex_unlock(&fs_info->scrub_lock);
4436                 wait_event(fs_info->scrub_pause_wait,
4437                            atomic_read(&fs_info->scrubs_paused) ==
4438                            atomic_read(&fs_info->scrubs_running));
4439                 mutex_lock(&fs_info->scrub_lock);
4440         }
4441         mutex_unlock(&fs_info->scrub_lock);
4442 }
4443
4444 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4445 {
4446         atomic_dec(&fs_info->scrub_pause_req);
4447         wake_up(&fs_info->scrub_pause_wait);
4448 }
4449
4450 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4451 {
4452         mutex_lock(&fs_info->scrub_lock);
4453         if (!atomic_read(&fs_info->scrubs_running)) {
4454                 mutex_unlock(&fs_info->scrub_lock);
4455                 return -ENOTCONN;
4456         }
4457
4458         atomic_inc(&fs_info->scrub_cancel_req);
4459         while (atomic_read(&fs_info->scrubs_running)) {
4460                 mutex_unlock(&fs_info->scrub_lock);
4461                 wait_event(fs_info->scrub_pause_wait,
4462                            atomic_read(&fs_info->scrubs_running) == 0);
4463                 mutex_lock(&fs_info->scrub_lock);
4464         }
4465         atomic_dec(&fs_info->scrub_cancel_req);
4466         mutex_unlock(&fs_info->scrub_lock);
4467
4468         return 0;
4469 }
4470
4471 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4472 {
4473         struct btrfs_fs_info *fs_info = dev->fs_info;
4474         struct scrub_ctx *sctx;
4475
4476         mutex_lock(&fs_info->scrub_lock);
4477         sctx = dev->scrub_ctx;
4478         if (!sctx) {
4479                 mutex_unlock(&fs_info->scrub_lock);
4480                 return -ENOTCONN;
4481         }
4482         atomic_inc(&sctx->cancel_req);
4483         while (dev->scrub_ctx) {
4484                 mutex_unlock(&fs_info->scrub_lock);
4485                 wait_event(fs_info->scrub_pause_wait,
4486                            dev->scrub_ctx == NULL);
4487                 mutex_lock(&fs_info->scrub_lock);
4488         }
4489         mutex_unlock(&fs_info->scrub_lock);
4490
4491         return 0;
4492 }
4493
4494 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4495                          struct btrfs_scrub_progress *progress)
4496 {
4497         struct btrfs_dev_lookup_args args = { .devid = devid };
4498         struct btrfs_device *dev;
4499         struct scrub_ctx *sctx = NULL;
4500
4501         mutex_lock(&fs_info->fs_devices->device_list_mutex);
4502         dev = btrfs_find_device(fs_info->fs_devices, &args);
4503         if (dev)
4504                 sctx = dev->scrub_ctx;
4505         if (sctx)
4506                 memcpy(progress, &sctx->stat, sizeof(*progress));
4507         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4508
4509         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4510 }
4511
4512 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4513                                  u64 extent_logical, u32 extent_len,
4514                                  u64 *extent_physical,
4515                                  struct btrfs_device **extent_dev,
4516                                  int *extent_mirror_num)
4517 {
4518         u64 mapped_length;
4519         struct btrfs_io_context *bioc = NULL;
4520         int ret;
4521
4522         mapped_length = extent_len;
4523         ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4524                               &mapped_length, &bioc, 0);
4525         if (ret || !bioc || mapped_length < extent_len ||
4526             !bioc->stripes[0].dev->bdev) {
4527                 btrfs_put_bioc(bioc);
4528                 return;
4529         }
4530
4531         *extent_physical = bioc->stripes[0].physical;
4532         *extent_mirror_num = bioc->mirror_num;
4533         *extent_dev = bioc->stripes[0].dev;
4534         btrfs_put_bioc(bioc);
4535 }