fs/btrfs/block-group.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include <linux/sizes.h>
   4 #include <linux/list_sort.h>
   5 #include "misc.h"
   6 #include "ctree.h"
   7 #include "block-group.h"
   8 #include "space-info.h"
   9 #include "disk-io.h"
  10 #include "free-space-cache.h"
  11 #include "free-space-tree.h"
  12 #include "volumes.h"
  13 #include "transaction.h"
  14 #include "ref-verify.h"
  15 #include "sysfs.h"
  16 #include "tree-log.h"
  17 #include "delalloc-space.h"
  18 #include "discard.h"
  19 #include "raid56.h"
  20 #include "zoned.h"
  21 #include "fs.h"
  22 #include "accessors.h"
  23 #include "extent-tree.h"
  24
  25 #ifdef CONFIG_BTRFS_DEBUG
  26 int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
  27 {
  28         struct btrfs_fs_info *fs_info = block_group->fs_info;
  29
  30         return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
  31                 block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
  32                (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
  33                 block_group->flags &  BTRFS_BLOCK_GROUP_DATA);
  34 }
  35 #endif
  36
  37 /*
  38  * Return target flags in extended format or 0 if restripe for this chunk_type
  39  * is not in progress
  40  *
  41  * Should be called with balance_lock held
  42  */
  43 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  44 {
  45         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  46         u64 target = 0;
  47
  48         if (!bctl)
  49                 return 0;
  50
  51         if (flags & BTRFS_BLOCK_GROUP_DATA &&
  52             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  53                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
  54         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
  55                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  56                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
  57         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
  58                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  59                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
  60         }
  61
  62         return target;
  63 }
  64
  65 /*
  66  * @flags: available profiles in extended format (see ctree.h)
  67  *
  68  * Return reduced profile in chunk format.  If profile changing is in progress
  69  * (either running or paused) picks the target profile (if it's already
  70  * available), otherwise falls back to plain reducing.
  71  */
  72 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
  73 {
  74         u64 num_devices = fs_info->fs_devices->rw_devices;
  75         u64 target;
  76         u64 raid_type;
  77         u64 allowed = 0;
  78
  79         /*
  80          * See if restripe for this chunk_type is in progress, if so try to
  81          * reduce to the target profile
  82          */
  83         spin_lock(&fs_info->balance_lock);
  84         target = get_restripe_target(fs_info, flags);
  85         if (target) {
  86                 spin_unlock(&fs_info->balance_lock);
  87                 return extended_to_chunk(target);
  88         }
  89         spin_unlock(&fs_info->balance_lock);
  90
  91         /* First, mask out the RAID levels which aren't possible */
  92         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
  93                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
  94                         allowed |= btrfs_raid_array[raid_type].bg_flag;
  95         }
  96         allowed &= flags;
  97
  98         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
  99                 allowed = BTRFS_BLOCK_GROUP_RAID6;
 100         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
 101                 allowed = BTRFS_BLOCK_GROUP_RAID5;
 102         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
 103                 allowed = BTRFS_BLOCK_GROUP_RAID10;
 104         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
 105                 allowed = BTRFS_BLOCK_GROUP_RAID1;
 106         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
 107                 allowed = BTRFS_BLOCK_GROUP_RAID0;
 108
 109         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
 110
 111         return extended_to_chunk(flags | allowed);
 112 }
 113
 114 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 115 {
 116         unsigned seq;
 117         u64 flags;
 118
 119         do {
 120                 flags = orig_flags;
 121                 seq = read_seqbegin(&fs_info->profiles_lock);
 122
 123                 if (flags & BTRFS_BLOCK_GROUP_DATA)
 124                         flags |= fs_info->avail_data_alloc_bits;
 125                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 126                         flags |= fs_info->avail_system_alloc_bits;
 127                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 128                         flags |= fs_info->avail_metadata_alloc_bits;
 129         } while (read_seqretry(&fs_info->profiles_lock, seq));
 130
 131         return btrfs_reduce_alloc_profile(fs_info, flags);
 132 }
 133
 134 void btrfs_get_block_group(struct btrfs_block_group *cache)
 135 {
 136         refcount_inc(&cache->refs);
 137 }
 138
 139 void btrfs_put_block_group(struct btrfs_block_group *cache)
 140 {
 141         if (refcount_dec_and_test(&cache->refs)) {
 142                 WARN_ON(cache->pinned > 0);
 143                 /*
 144                  * If there was a failure to cleanup a log tree, very likely due
 145                  * to an IO failure on a writeback attempt of one or more of its
 146                  * extent buffers, we could not do proper (and cheap) unaccounting
 147                  * of their reserved space, so don't warn on reserved > 0 in that
 148                  * case.
 149                  */
 150                 if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
 151                     !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
 152                         WARN_ON(cache->reserved > 0);
 153
 154                 /*
 155                  * A block_group shouldn't be on the discard_list anymore.
 156                  * Remove the block_group from the discard_list to prevent us
 157                  * from causing a panic due to NULL pointer dereference.
 158                  */
 159                 if (WARN_ON(!list_empty(&cache->discard_list)))
 160                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 161                                                   cache);
 162
 163                 kfree(cache->free_space_ctl);
 164                 kfree(cache->physical_map);
 165                 kfree(cache);
 166         }
 167 }
 168
 169 /*
 170  * This adds the block group to the fs_info rb tree for the block group cache
 171  */
 172 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 173                                        struct btrfs_block_group *block_group)
 174 {
 175         struct rb_node **p;
 176         struct rb_node *parent = NULL;
 177         struct btrfs_block_group *cache;
 178         bool leftmost = true;
 179
 180         ASSERT(block_group->length != 0);
 181
 182         write_lock(&info->block_group_cache_lock);
 183         p = &info->block_group_cache_tree.rb_root.rb_node;
 184
 185         while (*p) {
 186                 parent = *p;
 187                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
 188                 if (block_group->start < cache->start) {
 189                         p = &(*p)->rb_left;
 190                 } else if (block_group->start > cache->start) {
 191                         p = &(*p)->rb_right;
 192                         leftmost = false;
 193                 } else {
 194                         write_unlock(&info->block_group_cache_lock);
 195                         return -EEXIST;
 196                 }
 197         }
 198
 199         rb_link_node(&block_group->cache_node, parent, p);
 200         rb_insert_color_cached(&block_group->cache_node,
 201                                &info->block_group_cache_tree, leftmost);
 202
 203         write_unlock(&info->block_group_cache_lock);
 204
 205         return 0;
 206 }
 207
 208 /*
 209  * This will return the block group at or after bytenr if contains is 0, else
 210  * it will return the block group that contains the bytenr
 211  */
 212 static struct btrfs_block_group *block_group_cache_tree_search(
 213                 struct btrfs_fs_info *info, u64 bytenr, int contains)
 214 {
 215         struct btrfs_block_group *cache, *ret = NULL;
 216         struct rb_node *n;
 217         u64 end, start;
 218
 219         read_lock(&info->block_group_cache_lock);
 220         n = info->block_group_cache_tree.rb_root.rb_node;
 221
 222         while (n) {
 223                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
 224                 end = cache->start + cache->length - 1;
 225                 start = cache->start;
 226
 227                 if (bytenr < start) {
 228                         if (!contains && (!ret || start < ret->start))
 229                                 ret = cache;
 230                         n = n->rb_left;
 231                 } else if (bytenr > start) {
 232                         if (contains && bytenr <= end) {
 233                                 ret = cache;
 234                                 break;
 235                         }
 236                         n = n->rb_right;
 237                 } else {
 238                         ret = cache;
 239                         break;
 240                 }
 241         }
 242         if (ret)
 243                 btrfs_get_block_group(ret);
 244         read_unlock(&info->block_group_cache_lock);
 245
 246         return ret;
 247 }
 248
 249 /*
 250  * Return the block group that starts at or after bytenr
 251  */
 252 struct btrfs_block_group *btrfs_lookup_first_block_group(
 253                 struct btrfs_fs_info *info, u64 bytenr)
 254 {
 255         return block_group_cache_tree_search(info, bytenr, 0);
 256 }
 257
 258 /*
 259  * Return the block group that contains the given bytenr
 260  */
 261 struct btrfs_block_group *btrfs_lookup_block_group(
 262                 struct btrfs_fs_info *info, u64 bytenr)
 263 {
 264         return block_group_cache_tree_search(info, bytenr, 1);
 265 }
 266
 267 struct btrfs_block_group *btrfs_next_block_group(
 268                 struct btrfs_block_group *cache)
 269 {
 270         struct btrfs_fs_info *fs_info = cache->fs_info;
 271         struct rb_node *node;
 272
 273         read_lock(&fs_info->block_group_cache_lock);
 274
 275         /* If our block group was removed, we need a full search. */
 276         if (RB_EMPTY_NODE(&cache->cache_node)) {
 277                 const u64 next_bytenr = cache->start + cache->length;
 278
 279                 read_unlock(&fs_info->block_group_cache_lock);
 280                 btrfs_put_block_group(cache);
 281                 return btrfs_lookup_first_block_group(fs_info, next_bytenr);
 282         }
 283         node = rb_next(&cache->cache_node);
 284         btrfs_put_block_group(cache);
 285         if (node) {
 286                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
 287                 btrfs_get_block_group(cache);
 288         } else
 289                 cache = NULL;
 290         read_unlock(&fs_info->block_group_cache_lock);
 291         return cache;
 292 }
 293
 294 /*
 295  * Check if we can do a NOCOW write for a given extent.
 296  *
 297  * @fs_info:       The filesystem information object.
 298  * @bytenr:        Logical start address of the extent.
 299  *
 300  * Check if we can do a NOCOW write for the given extent, and increments the
 301  * number of NOCOW writers in the block group that contains the extent, as long
 302  * as the block group exists and it's currently not in read-only mode.
 303  *
 304  * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 305  *          is responsible for calling btrfs_dec_nocow_writers() later.
 306  *
 307  *          Or NULL if we can not do a NOCOW write
 308  */
 309 struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
 310                                                   u64 bytenr)
 311 {
 312         struct btrfs_block_group *bg;
 313         bool can_nocow = true;
 314
 315         bg = btrfs_lookup_block_group(fs_info, bytenr);
 316         if (!bg)
 317                 return NULL;
 318
 319         spin_lock(&bg->lock);
 320         if (bg->ro)
 321                 can_nocow = false;
 322         else
 323                 atomic_inc(&bg->nocow_writers);
 324         spin_unlock(&bg->lock);
 325
 326         if (!can_nocow) {
 327                 btrfs_put_block_group(bg);
 328                 return NULL;
 329         }
 330
 331         /* No put on block group, done by btrfs_dec_nocow_writers(). */
 332         return bg;
 333 }
 334
 335 /*
 336  * Decrement the number of NOCOW writers in a block group.
 337  *
 338  * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 339  * and on the block group returned by that call. Typically this is called after
 340  * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 341  * relocation.
 342  *
 343  * After this call, the caller should not use the block group anymore. It it wants
 344  * to use it, then it should get a reference on it before calling this function.
 345  */
 346 void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
 347 {
 348         if (atomic_dec_and_test(&bg->nocow_writers))
 349                 wake_up_var(&bg->nocow_writers);
 350
 351         /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
 352         btrfs_put_block_group(bg);
 353 }
 354
 355 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
 356 {
 357         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 358 }
 359
 360 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 361                                         const u64 start)
 362 {
 363         struct btrfs_block_group *bg;
 364
 365         bg = btrfs_lookup_block_group(fs_info, start);
 366         ASSERT(bg);
 367         if (atomic_dec_and_test(&bg->reservations))
 368                 wake_up_var(&bg->reservations);
 369         btrfs_put_block_group(bg);
 370 }
 371
 372 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
 373 {
 374         struct btrfs_space_info *space_info = bg->space_info;
 375
 376         ASSERT(bg->ro);
 377
 378         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
 379                 return;
 380
 381         /*
 382          * Our block group is read only but before we set it to read only,
 383          * some task might have had allocated an extent from it already, but it
 384          * has not yet created a respective ordered extent (and added it to a
 385          * root's list of ordered extents).
 386          * Therefore wait for any task currently allocating extents, since the
 387          * block group's reservations counter is incremented while a read lock
 388          * on the groups' semaphore is held and decremented after releasing
 389          * the read access on that semaphore and creating the ordered extent.
 390          */
 391         down_write(&space_info->groups_sem);
 392         up_write(&space_info->groups_sem);
 393
 394         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 395 }
 396
 397 struct btrfs_caching_control *btrfs_get_caching_control(
 398                 struct btrfs_block_group *cache)
 399 {
 400         struct btrfs_caching_control *ctl;
 401
 402         spin_lock(&cache->lock);
 403         if (!cache->caching_ctl) {
 404                 spin_unlock(&cache->lock);
 405                 return NULL;
 406         }
 407
 408         ctl = cache->caching_ctl;
 409         refcount_inc(&ctl->count);
 410         spin_unlock(&cache->lock);
 411         return ctl;
 412 }
 413
 414 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 415 {
 416         if (refcount_dec_and_test(&ctl->count))
 417                 kfree(ctl);
 418 }
 419
 420 /*
 421  * When we wait for progress in the block group caching, its because our
 422  * allocation attempt failed at least once.  So, we must sleep and let some
 423  * progress happen before we try again.
 424  *
 425  * This function will sleep at least once waiting for new free space to show
 426  * up, and then it will check the block group free space numbers for our min
 427  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 428  * a free extent of a given size, but this is a good start.
 429  *
 430  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 431  * any of the information in this block group.
 432  */
 433 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
 434                                            u64 num_bytes)
 435 {
 436         struct btrfs_caching_control *caching_ctl;
 437
 438         caching_ctl = btrfs_get_caching_control(cache);
 439         if (!caching_ctl)
 440                 return;
 441
 442         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
 443                    (cache->free_space_ctl->free_space >= num_bytes));
 444
 445         btrfs_put_caching_control(caching_ctl);
 446 }
 447
 448 static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
 449                                        struct btrfs_caching_control *caching_ctl)
 450 {
 451         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
 452         return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
 453 }
 454
 455 static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
 456 {
 457         struct btrfs_caching_control *caching_ctl;
 458         int ret;
 459
 460         caching_ctl = btrfs_get_caching_control(cache);
 461         if (!caching_ctl)
 462                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 463         ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
 464         btrfs_put_caching_control(caching_ctl);
 465         return ret;
 466 }
 467
 468 #ifdef CONFIG_BTRFS_DEBUG
 469 static void fragment_free_space(struct btrfs_block_group *block_group)
 470 {
 471         struct btrfs_fs_info *fs_info = block_group->fs_info;
 472         u64 start = block_group->start;
 473         u64 len = block_group->length;
 474         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 475                 fs_info->nodesize : fs_info->sectorsize;
 476         u64 step = chunk << 1;
 477
 478         while (len > chunk) {
 479                 btrfs_remove_free_space(block_group, start, chunk);
 480                 start += step;
 481                 if (len < step)
 482                         len = 0;
 483                 else
 484                         len -= step;
 485         }
 486 }
 487 #endif
 488
 489 /*
 490  * This is only called by btrfs_cache_block_group, since we could have freed
 491  * extents we need to check the pinned_extents for any extents that can't be
 492  * used yet since their free space will be released as soon as the transaction
 493  * commits.
 494  */
 495 u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
 496 {
 497         struct btrfs_fs_info *info = block_group->fs_info;
 498         u64 extent_start, extent_end, size, total_added = 0;
 499         int ret;
 500
 501         while (start < end) {
 502                 ret = find_first_extent_bit(&info->excluded_extents, start,
 503                                             &extent_start, &extent_end,
 504                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 505                                             NULL);
 506                 if (ret)
 507                         break;
 508
 509                 if (extent_start <= start) {
 510                         start = extent_end + 1;
 511                 } else if (extent_start > start && extent_start < end) {
 512                         size = extent_start - start;
 513                         total_added += size;
 514                         ret = btrfs_add_free_space_async_trimmed(block_group,
 515                                                                  start, size);
 516                         BUG_ON(ret); /* -ENOMEM or logic error */
 517                         start = extent_end + 1;
 518                 } else {
 519                         break;
 520                 }
 521         }
 522
 523         if (start < end) {
 524                 size = end - start;
 525                 total_added += size;
 526                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
 527                                                          size);
 528                 BUG_ON(ret); /* -ENOMEM or logic error */
 529         }
 530
 531         return total_added;
 532 }
 533
 534 /*
 535  * Get an arbitrary extent item index / max_index through the block group
 536  *
 537  * @block_group   the block group to sample from
 538  * @index:        the integral step through the block group to grab from
 539  * @max_index:    the granularity of the sampling
 540  * @key:          return value parameter for the item we find
 541  *
 542  * Pre-conditions on indices:
 543  * 0 <= index <= max_index
 544  * 0 < max_index
 545  *
 546  * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
 547  * error code on error.
 548  */
 549 static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
 550                                           struct btrfs_block_group *block_group,
 551                                           int index, int max_index,
 552                                           struct btrfs_key *found_key)
 553 {
 554         struct btrfs_fs_info *fs_info = block_group->fs_info;
 555         struct btrfs_root *extent_root;
 556         u64 search_offset;
 557         u64 search_end = block_group->start + block_group->length;
 558         struct btrfs_path *path;
 559         struct btrfs_key search_key;
 560         int ret = 0;
 561
 562         ASSERT(index >= 0);
 563         ASSERT(index <= max_index);
 564         ASSERT(max_index > 0);
 565         lockdep_assert_held(&caching_ctl->mutex);
 566         lockdep_assert_held_read(&fs_info->commit_root_sem);
 567
 568         path = btrfs_alloc_path();
 569         if (!path)
 570                 return -ENOMEM;
 571
 572         extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
 573                                                        BTRFS_SUPER_INFO_OFFSET));
 574
 575         path->skip_locking = 1;
 576         path->search_commit_root = 1;
 577         path->reada = READA_FORWARD;
 578
 579         search_offset = index * div_u64(block_group->length, max_index);
 580         search_key.objectid = block_group->start + search_offset;
 581         search_key.type = BTRFS_EXTENT_ITEM_KEY;
 582         search_key.offset = 0;
 583
 584         btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
 585                 /* Success; sampled an extent item in the block group */
 586                 if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
 587                     found_key->objectid >= block_group->start &&
 588                     found_key->objectid + found_key->offset <= search_end)
 589                         break;
 590
 591                 /* We can't possibly find a valid extent item anymore */
 592                 if (found_key->objectid >= search_end) {
 593                         ret = 1;
 594                         break;
 595                 }
 596         }
 597
 598         lockdep_assert_held(&caching_ctl->mutex);
 599         lockdep_assert_held_read(&fs_info->commit_root_sem);
 600         btrfs_free_path(path);
 601         return ret;
 602 }
 603
 604 /*
 605  * Best effort attempt to compute a block group's size class while caching it.
 606  *
 607  * @block_group: the block group we are caching
 608  *
 609  * We cannot infer the size class while adding free space extents, because that
 610  * logic doesn't care about contiguous file extents (it doesn't differentiate
 611  * between a 100M extent and 100 contiguous 1M extents). So we need to read the
 612  * file extent items. Reading all of them is quite wasteful, because usually
 613  * only a handful are enough to give a good answer. Therefore, we just grab 5 of
 614  * them at even steps through the block group and pick the smallest size class
 615  * we see. Since size class is best effort, and not guaranteed in general,
 616  * inaccuracy is acceptable.
 617  *
 618  * To be more explicit about why this algorithm makes sense:
 619  *
 620  * If we are caching in a block group from disk, then there are three major cases
 621  * to consider:
 622  * 1. the block group is well behaved and all extents in it are the same size
 623  *    class.
 624  * 2. the block group is mostly one size class with rare exceptions for last
 625  *    ditch allocations
 626  * 3. the block group was populated before size classes and can have a totally
 627  *    arbitrary mix of size classes.
 628  *
 629  * In case 1, looking at any extent in the block group will yield the correct
 630  * result. For the mixed cases, taking the minimum size class seems like a good
 631  * approximation, since gaps from frees will be usable to the size class. For
 632  * 2., a small handful of file extents is likely to yield the right answer. For
 633  * 3, we can either read every file extent, or admit that this is best effort
 634  * anyway and try to stay fast.
 635  *
 636  * Returns: 0 on success, negative error code on error.
 637  */
 638 static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
 639                                        struct btrfs_block_group *block_group)
 640 {
 641         struct btrfs_fs_info *fs_info = block_group->fs_info;
 642         struct btrfs_key key;
 643         int i;
 644         u64 min_size = block_group->length;
 645         enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
 646         int ret;
 647
 648         if (!btrfs_block_group_should_use_size_class(block_group))
 649                 return 0;
 650
 651         lockdep_assert_held(&caching_ctl->mutex);
 652         lockdep_assert_held_read(&fs_info->commit_root_sem);
 653         for (i = 0; i < 5; ++i) {
 654                 ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
 655                 if (ret < 0)
 656                         goto out;
 657                 if (ret > 0)
 658                         continue;
 659                 min_size = min_t(u64, min_size, key.offset);
 660                 size_class = btrfs_calc_block_group_size_class(min_size);
 661         }
 662         if (size_class != BTRFS_BG_SZ_NONE) {
 663                 spin_lock(&block_group->lock);
 664                 block_group->size_class = size_class;
 665                 spin_unlock(&block_group->lock);
 666         }
 667 out:
 668         return ret;
 669 }
 670
/*
 * Build the free space of a block group by walking the extent tree.
 *
 * @caching_ctl: caching control; supplies the block group and the wait queue
 *               that is woken as free space gets added
 *
 * Walks the EXTENT_ITEM/METADATA_ITEM keys of the commit root that fall inside
 * the block group and hands every gap between two consecutive extents, plus
 * the tail after the last one, to add_new_free_space().
 *
 * The reschedule path below releases and re-acquires caching_ctl->mutex and
 * fs_info->commit_root_sem (read), so the caller is expected to hold both on
 * entry.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the negative
 * error returned by btrfs_search_slot() / btrfs_next_leaf().
 */
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
        struct btrfs_block_group *block_group = caching_ctl->block_group;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret;
        bool wakeup = true;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * 'last' is the end of the most recent extent seen so far, i.e. the
         * start of the next potential free range.  It never starts below
         * BTRFS_SUPER_INFO_OFFSET.
         */
        last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
        extent_root = btrfs_extent_root(fs_info, last);

#ifdef CONFIG_BTRFS_DEBUG
        /*
         * If we're fragmenting we don't want to make anybody think we can
         * allocate from this block group until we've had a chance to fragment
         * the free space.
         */
        if (btrfs_should_fragment_free_space(block_group))
                wakeup = false;
#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since its read-only
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = READA_FORWARD;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;

        /* (Re)start the search at 'key'; also the re-entry point after yielding. */
next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                /*
                 * The filesystem is closing: stop scanning.  Setting 'last' to
                 * the maximum u64 makes the final add_new_free_space() call
                 * below a no-op, since its start is then never below its end.
                 */
                if (btrfs_fs_closing(fs_info) > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        /* Ran off the current leaf: find where to continue. */
                        ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
                        if (ret)
                                break;

                        /*
                         * Yield if we need to reschedule or somebody is
                         * waiting on commit_root_sem: drop the path and both
                         * locks, reschedule, re-take the locks and search
                         * again from the key found above.
                         */
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
                                mutex_lock(&caching_ctl->mutex);
                                down_read(&fs_info->commit_root_sem);
                                goto next;
                        }

                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
                                goto out;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        continue;
                }

                /* Key lies before the point already processed: re-search at 'last'. */
                if (key.objectid < last) {
                        key.objectid = last;
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        btrfs_release_path(path);
                        goto next;
                }

                /* Not yet inside the block group, skip the item. */
                if (key.objectid < block_group->start) {
                        path->slots[0]++;
                        continue;
                }

                /* Past the end of the block group, we are done scanning. */
                if (key.objectid >= block_group->start + block_group->length)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        /* The gap between 'last' and this extent is free. */
                        total_found += add_new_free_space(block_group, last,
                                                          key.objectid);
                        /*
                         * Advance 'last' past this extent: nodesize is used as
                         * the length for METADATA_ITEM keys, key.offset for
                         * EXTENT_ITEM keys.
                         */
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                last = key.objectid +
                                        fs_info->nodesize;
                        else
                                last = key.objectid + key.offset;

                        /* Wake waiters each time enough new free space piled up. */
                        if (total_found > CACHING_CTL_WAKE_UP) {
                                total_found = 0;
                                if (wakeup)
                                        wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        /* Free space between the last extent and the end of the block group. */
        total_found += add_new_free_space(block_group, last,
                                block_group->start + block_group->length);

out:
        btrfs_free_path(path);
        return ret;
}
 800
 801 static noinline void caching_thread(struct btrfs_work *work)
 802 {
 803         struct btrfs_block_group *block_group;
 804         struct btrfs_fs_info *fs_info;
 805         struct btrfs_caching_control *caching_ctl;
 806         int ret;
 807
 808         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 809         block_group = caching_ctl->block_group;
 810         fs_info = block_group->fs_info;
 811
 812         mutex_lock(&caching_ctl->mutex);
 813         down_read(&fs_info->commit_root_sem);
 814
 815         load_block_group_size_class(caching_ctl, block_group);
 816         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 817                 ret = load_free_space_cache(block_group);
 818                 if (ret == 1) {
 819                         ret = 0;
 820                         goto done;
 821                 }
 822
 823                 /*
 824                  * We failed to load the space cache, set ourselves to
 825                  * CACHE_STARTED and carry on.
 826                  */
 827                 spin_lock(&block_group->lock);
 828                 block_group->cached = BTRFS_CACHE_STARTED;
 829                 spin_unlock(&block_group->lock);
 830                 wake_up(&caching_ctl->wait);
 831         }
 832
 833         /*
 834          * If we are in the transaction that populated the free space tree we
 835          * can't actually cache from the free space tree as our commit root and
 836          * real root are the same, so we could change the contents of the blocks
 837          * while caching.  Instead do the slow caching in this case, and after
 838          * the transaction has committed we will be safe.
 839          */
 840         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
 841             !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
 842                 ret = load_free_space_tree(caching_ctl);
 843         else
 844                 ret = load_extent_tree_free(caching_ctl);
 845 done:
 846         spin_lock(&block_group->lock);
 847         block_group->caching_ctl = NULL;
 848         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 849         spin_unlock(&block_group->lock);
 850
 851 #ifdef CONFIG_BTRFS_DEBUG
 852         if (btrfs_should_fragment_free_space(block_group)) {
 853                 u64 bytes_used;
 854
 855                 spin_lock(&block_group->space_info->lock);
 856                 spin_lock(&block_group->lock);
 857                 bytes_used = block_group->length - block_group->used;
 858                 block_group->space_info->bytes_used += bytes_used >> 1;
 859                 spin_unlock(&block_group->lock);
 860                 spin_unlock(&block_group->space_info->lock);
 861                 fragment_free_space(block_group);
 862         }
 863 #endif
 864
 865         up_read(&fs_info->commit_root_sem);
 866         btrfs_free_excluded_extents(block_group);
 867         mutex_unlock(&caching_ctl->mutex);
 868
 869         wake_up(&caching_ctl->wait);
 870
 871         btrfs_put_caching_control(caching_ctl);
 872         btrfs_put_block_group(block_group);
 873 }
 874
 875 int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
 876 {
 877         struct btrfs_fs_info *fs_info = cache->fs_info;
 878         struct btrfs_caching_control *caching_ctl = NULL;
 879         int ret = 0;
 880
 881         /* Allocator for zoned filesystems does not use the cache at all */
 882         if (btrfs_is_zoned(fs_info))
 883                 return 0;
 884
 885         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 886         if (!caching_ctl)
 887                 return -ENOMEM;
 888
 889         INIT_LIST_HEAD(&caching_ctl->list);
 890         mutex_init(&caching_ctl->mutex);
 891         init_waitqueue_head(&caching_ctl->wait);
 892         caching_ctl->block_group = cache;
 893         refcount_set(&caching_ctl->count, 2);
 894         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 895
 896         spin_lock(&cache->lock);
 897         if (cache->cached != BTRFS_CACHE_NO) {
 898                 kfree(caching_ctl);
 899
 900                 caching_ctl = cache->caching_ctl;
 901                 if (caching_ctl)
 902                         refcount_inc(&caching_ctl->count);
 903                 spin_unlock(&cache->lock);
 904                 goto out;
 905         }
 906         WARN_ON(cache->caching_ctl);
 907         cache->caching_ctl = caching_ctl;
 908         cache->cached = BTRFS_CACHE_STARTED;
 909         spin_unlock(&cache->lock);
 910
 911         write_lock(&fs_info->block_group_cache_lock);
 912         refcount_inc(&caching_ctl->count);
 913         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 914         write_unlock(&fs_info->block_group_cache_lock);
 915
 916         btrfs_get_block_group(cache);
 917
 918         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 919 out:
 920         if (wait && caching_ctl)
 921                 ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
 922         if (caching_ctl)
 923                 btrfs_put_caching_control(caching_ctl);
 924
 925         return ret;
 926 }
 927
 928 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 929 {
 930         u64 extra_flags = chunk_to_extended(flags) &
 931                                 BTRFS_EXTENDED_PROFILE_MASK;
 932
 933         write_seqlock(&fs_info->profiles_lock);
 934         if (flags & BTRFS_BLOCK_GROUP_DATA)
 935                 fs_info->avail_data_alloc_bits &= ~extra_flags;
 936         if (flags & BTRFS_BLOCK_GROUP_METADATA)
 937                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 938         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 939                 fs_info->avail_system_alloc_bits &= ~extra_flags;
 940         write_sequnlock(&fs_info->profiles_lock);
 941 }
 942
 943 /*
 944  * Clear incompat bits for the following feature(s):
 945  *
 946  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 947  *            in the whole filesystem
 948  *
 949  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 950  */
 951 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
 952 {
 953         bool found_raid56 = false;
 954         bool found_raid1c34 = false;
 955
 956         if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
 957             (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
 958             (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
 959                 struct list_head *head = &fs_info->space_info;
 960                 struct btrfs_space_info *sinfo;
 961
 962                 list_for_each_entry_rcu(sinfo, head, list) {
 963                         down_read(&sinfo->groups_sem);
 964                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
 965                                 found_raid56 = true;
 966                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
 967                                 found_raid56 = true;
 968                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
 969                                 found_raid1c34 = true;
 970                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
 971                                 found_raid1c34 = true;
 972                         up_read(&sinfo->groups_sem);
 973                 }
 974                 if (!found_raid56)
 975                         btrfs_clear_fs_incompat(fs_info, RAID56);
 976                 if (!found_raid1c34)
 977                         btrfs_clear_fs_incompat(fs_info, RAID1C34);
 978         }
 979 }
 980
 981 static int remove_block_group_item(struct btrfs_trans_handle *trans,
 982                                    struct btrfs_path *path,
 983                                    struct btrfs_block_group *block_group)
 984 {
 985         struct btrfs_fs_info *fs_info = trans->fs_info;
 986         struct btrfs_root *root;
 987         struct btrfs_key key;
 988         int ret;
 989
 990         root = btrfs_block_group_root(fs_info);
 991         key.objectid = block_group->start;
 992         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 993         key.offset = block_group->length;
 994
 995         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 996         if (ret > 0)
 997                 ret = -ENOENT;
 998         if (ret < 0)
 999                 return ret;
1000
1001         ret = btrfs_del_item(trans, root, path);
1002         return ret;
1003 }
1004
1005 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1006                              u64 group_start, struct extent_map *em)
1007 {
1008         struct btrfs_fs_info *fs_info = trans->fs_info;
1009         struct btrfs_path *path;
1010         struct btrfs_block_group *block_group;
1011         struct btrfs_free_cluster *cluster;
1012         struct inode *inode;
1013         struct kobject *kobj = NULL;
1014         int ret;
1015         int index;
1016         int factor;
1017         struct btrfs_caching_control *caching_ctl = NULL;
1018         bool remove_em;
1019         bool remove_rsv = false;
1020
1021         block_group = btrfs_lookup_block_group(fs_info, group_start);
1022         BUG_ON(!block_group);
1023         BUG_ON(!block_group->ro);
1024
1025         trace_btrfs_remove_block_group(block_group);
1026         /*
1027          * Free the reserved super bytes from this block group before
1028          * remove it.
1029          */
1030         btrfs_free_excluded_extents(block_group);
1031         btrfs_free_ref_tree_range(fs_info, block_group->start,
1032                                   block_group->length);
1033
1034         index = btrfs_bg_flags_to_raid_index(block_group->flags);
1035         factor = btrfs_bg_type_to_factor(block_group->flags);
1036
1037         /* make sure this block group isn't part of an allocation cluster */
1038         cluster = &fs_info->data_alloc_cluster;
1039         spin_lock(&cluster->refill_lock);
1040         btrfs_return_cluster_to_free_space(block_group, cluster);
1041         spin_unlock(&cluster->refill_lock);
1042
1043         /*
1044          * make sure this block group isn't part of a metadata
1045          * allocation cluster
1046          */
1047         cluster = &fs_info->meta_alloc_cluster;
1048         spin_lock(&cluster->refill_lock);
1049         btrfs_return_cluster_to_free_space(block_group, cluster);
1050         spin_unlock(&cluster->refill_lock);
1051
1052         btrfs_clear_treelog_bg(block_group);
1053         btrfs_clear_data_reloc_bg(block_group);
1054
1055         path = btrfs_alloc_path();
1056         if (!path) {
1057                 ret = -ENOMEM;
1058                 goto out;
1059         }
1060
1061         /*
1062          * get the inode first so any iput calls done for the io_list
1063          * aren't the final iput (no unlinks allowed now)
1064          */
1065         inode = lookup_free_space_inode(block_group, path);
1066
1067         mutex_lock(&trans->transaction->cache_write_mutex);
1068         /*
1069          * Make sure our free space cache IO is done before removing the
1070          * free space inode
1071          */
1072         spin_lock(&trans->transaction->dirty_bgs_lock);
1073         if (!list_empty(&block_group->io_list)) {
1074                 list_del_init(&block_group->io_list);
1075
1076                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
1077
1078                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1079                 btrfs_wait_cache_io(trans, block_group, path);
1080                 btrfs_put_block_group(block_group);
1081                 spin_lock(&trans->transaction->dirty_bgs_lock);
1082         }
1083
1084         if (!list_empty(&block_group->dirty_list)) {
1085                 list_del_init(&block_group->dirty_list);
1086                 remove_rsv = true;
1087                 btrfs_put_block_group(block_group);
1088         }
1089         spin_unlock(&trans->transaction->dirty_bgs_lock);
1090         mutex_unlock(&trans->transaction->cache_write_mutex);
1091
1092         ret = btrfs_remove_free_space_inode(trans, inode, block_group);
1093         if (ret)
1094                 goto out;
1095
1096         write_lock(&fs_info->block_group_cache_lock);
1097         rb_erase_cached(&block_group->cache_node,
1098                         &fs_info->block_group_cache_tree);
1099         RB_CLEAR_NODE(&block_group->cache_node);
1100
1101         /* Once for the block groups rbtree */
1102         btrfs_put_block_group(block_group);
1103
1104         write_unlock(&fs_info->block_group_cache_lock);
1105
1106         down_write(&block_group->space_info->groups_sem);
1107         /*
1108          * we must use list_del_init so people can check to see if they
1109          * are still on the list after taking the semaphore
1110          */
1111         list_del_init(&block_group->list);
1112         if (list_empty(&block_group->space_info->block_groups[index])) {
1113                 kobj = block_group->space_info->block_group_kobjs[index];
1114                 block_group->space_info->block_group_kobjs[index] = NULL;
1115                 clear_avail_alloc_bits(fs_info, block_group->flags);
1116         }
1117         up_write(&block_group->space_info->groups_sem);
1118         clear_incompat_bg_bits(fs_info, block_group->flags);
1119         if (kobj) {
1120                 kobject_del(kobj);
1121                 kobject_put(kobj);
1122         }
1123
1124         if (block_group->cached == BTRFS_CACHE_STARTED)
1125                 btrfs_wait_block_group_cache_done(block_group);
1126
1127         write_lock(&fs_info->block_group_cache_lock);
1128         caching_ctl = btrfs_get_caching_control(block_group);
1129         if (!caching_ctl) {
1130                 struct btrfs_caching_control *ctl;
1131
1132                 list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
1133                         if (ctl->block_group == block_group) {
1134                                 caching_ctl = ctl;
1135                                 refcount_inc(&caching_ctl->count);
1136                                 break;
1137                         }
1138                 }
1139         }
1140         if (caching_ctl)
1141                 list_del_init(&caching_ctl->list);
1142         write_unlock(&fs_info->block_group_cache_lock);
1143
1144         if (caching_ctl) {
1145                 /* Once for the caching bgs list and once for us. */
1146                 btrfs_put_caching_control(caching_ctl);
1147                 btrfs_put_caching_control(caching_ctl);
1148         }
1149
1150         spin_lock(&trans->transaction->dirty_bgs_lock);
1151         WARN_ON(!list_empty(&block_group->dirty_list));
1152         WARN_ON(!list_empty(&block_group->io_list));
1153         spin_unlock(&trans->transaction->dirty_bgs_lock);
1154
1155         btrfs_remove_free_space_cache(block_group);
1156
1157         spin_lock(&block_group->space_info->lock);
1158         list_del_init(&block_group->ro_list);
1159
1160         if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1161                 WARN_ON(block_group->space_info->total_bytes
1162                         < block_group->length);
1163                 WARN_ON(block_group->space_info->bytes_readonly
1164                         < block_group->length - block_group->zone_unusable);
1165                 WARN_ON(block_group->space_info->bytes_zone_unusable
1166                         < block_group->zone_unusable);
1167                 WARN_ON(block_group->space_info->disk_total
1168                         < block_group->length * factor);
1169         }
1170         block_group->space_info->total_bytes -= block_group->length;
1171         block_group->space_info->bytes_readonly -=
1172                 (block_group->length - block_group->zone_unusable);
1173         block_group->space_info->bytes_zone_unusable -=
1174                 block_group->zone_unusable;
1175         block_group->space_info->disk_total -= block_group->length * factor;
1176
1177         spin_unlock(&block_group->space_info->lock);
1178
1179         /*
1180          * Remove the free space for the block group from the free space tree
1181          * and the block group's item from the extent tree before marking the
1182          * block group as removed. This is to prevent races with tasks that
1183          * freeze and unfreeze a block group, this task and another task
1184          * allocating a new block group - the unfreeze task ends up removing
1185          * the block group's extent map before the task calling this function
1186          * deletes the block group item from the extent tree, allowing for
1187          * another task to attempt to create another block group with the same
1188          * item key (and failing with -EEXIST and a transaction abort).
1189          */
1190         ret = remove_block_group_free_space(trans, block_group);
1191         if (ret)
1192                 goto out;
1193
1194         ret = remove_block_group_item(trans, path, block_group);
1195         if (ret < 0)
1196                 goto out;
1197
1198         spin_lock(&block_group->lock);
1199         set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
1200
1201         /*
1202          * At this point trimming or scrub can't start on this block group,
1203          * because we removed the block group from the rbtree
1204          * fs_info->block_group_cache_tree so no one can't find it anymore and
1205          * even if someone already got this block group before we removed it
1206          * from the rbtree, they have already incremented block_group->frozen -
1207          * if they didn't, for the trimming case they won't find any free space
1208          * entries because we already removed them all when we called
1209          * btrfs_remove_free_space_cache().
1210          *
1211          * And we must not remove the extent map from the fs_info->mapping_tree
1212          * to prevent the same logical address range and physical device space
1213          * ranges from being reused for a new block group. This is needed to
1214          * avoid races with trimming and scrub.
1215          *
1216          * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1217          * completely transactionless, so while it is trimming a range the
1218          * currently running transaction might finish and a new one start,
1219          * allowing for new block groups to be created that can reuse the same
1220          * physical device locations unless we take this special care.
1221          *
1222          * There may also be an implicit trim operation if the file system
1223          * is mounted with -odiscard. The same protections must remain
1224          * in place until the extents have been discarded completely when
1225          * the transaction commit has completed.
1226          */
1227         remove_em = (atomic_read(&block_group->frozen) == 0);
1228         spin_unlock(&block_group->lock);
1229
1230         if (remove_em) {
1231                 struct extent_map_tree *em_tree;
1232
1233                 em_tree = &fs_info->mapping_tree;
1234                 write_lock(&em_tree->lock);
1235                 remove_extent_mapping(em_tree, em);
1236                 write_unlock(&em_tree->lock);
1237                 /* once for the tree */
1238                 free_extent_map(em);
1239         }
1240
1241 out:
1242         /* Once for the lookup reference */
1243         btrfs_put_block_group(block_group);
1244         if (remove_rsv)
1245                 btrfs_delayed_refs_rsv_release(fs_info, 1);
1246         btrfs_free_path(path);
1247         return ret;
1248 }
1249
1250 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1251                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1252 {
1253         struct btrfs_root *root = btrfs_block_group_root(fs_info);
1254         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1255         struct extent_map *em;
1256         struct map_lookup *map;
1257         unsigned int num_items;
1258
1259         read_lock(&em_tree->lock);
1260         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1261         read_unlock(&em_tree->lock);
1262         ASSERT(em && em->start == chunk_offset);
1263
1264         /*
1265          * We need to reserve 3 + N units from the metadata space info in order
1266          * to remove a block group (done at btrfs_remove_chunk() and at
1267          * btrfs_remove_block_group()), which are used for:
1268          *
1269          * 1 unit for adding the free space inode's orphan (located in the tree
1270          * of tree roots).
1271          * 1 unit for deleting the block group item (located in the extent
1272          * tree).
1273          * 1 unit for deleting the free space item (located in tree of tree
1274          * roots).
1275          * N units for deleting N device extent items corresponding to each
1276          * stripe (located in the device tree).
1277          *
1278          * In order to remove a block group we also need to reserve units in the
1279          * system space info in order to update the chunk tree (update one or
1280          * more device items and remove one chunk item), but this is done at
1281          * btrfs_remove_chunk() through a call to check_system_chunk().
1282          */
1283         map = em->map_lookup;
1284         num_items = 3 + map->num_stripes;
1285         free_extent_map(em);
1286
1287         return btrfs_start_transaction_fallback_global_rsv(root, num_items);
1288 }
1289
1290 /*
1291  * Mark block group @cache read-only, so later write won't happen to block
1292  * group @cache.
1293  *
1294  * If @force is not set, this function will only mark the block group readonly
1295  * if we have enough free space (1M) in other metadata/system block groups.
1296  * If @force is not set, this function will mark the block group readonly
1297  * without checking free space.
1298  *
1299  * NOTE: This function doesn't care if other block groups can contain all the
1300  * data in this block group. That check should be done by relocation routine,
1301  * not this function.
1302  */
1303 static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
1304 {
1305         struct btrfs_space_info *sinfo = cache->space_info;
1306         u64 num_bytes;
1307         int ret = -ENOSPC;
1308
1309         spin_lock(&sinfo->lock);
1310         spin_lock(&cache->lock);
1311
1312         if (cache->swap_extents) {
1313                 ret = -ETXTBSY;
1314                 goto out;
1315         }
1316
1317         if (cache->ro) {
1318                 cache->ro++;
1319                 ret = 0;
1320                 goto out;
1321         }
1322
1323         num_bytes = cache->length - cache->reserved - cache->pinned -
1324                     cache->bytes_super - cache->zone_unusable - cache->used;
1325
1326         /*
1327          * Data never overcommits, even in mixed mode, so do just the straight
1328          * check of left over space in how much we have allocated.
1329          */
1330         if (force) {
1331                 ret = 0;
1332         } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1333                 u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1334
1335                 /*
1336                  * Here we make sure if we mark this bg RO, we still have enough
1337                  * free space as buffer.
1338                  */
1339                 if (sinfo_used + num_bytes <= sinfo->total_bytes)
1340                         ret = 0;
1341         } else {
1342                 /*
1343                  * We overcommit metadata, so we need to do the
1344                  * btrfs_can_overcommit check here, and we need to pass in
1345                  * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1346                  * leeway to allow us to mark this block group as read only.
1347                  */
1348                 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
1349                                          BTRFS_RESERVE_NO_FLUSH))
1350                         ret = 0;
1351         }
1352
1353         if (!ret) {
1354                 sinfo->bytes_readonly += num_bytes;
1355                 if (btrfs_is_zoned(cache->fs_info)) {
1356                         /* Migrate zone_unusable bytes to readonly */
1357                         sinfo->bytes_readonly += cache->zone_unusable;
1358                         sinfo->bytes_zone_unusable -= cache->zone_unusable;
1359                         cache->zone_unusable = 0;
1360                 }
1361                 cache->ro++;
1362                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1363         }
1364 out:
1365         spin_unlock(&cache->lock);
1366         spin_unlock(&sinfo->lock);
1367         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1368                 btrfs_info(cache->fs_info,
1369                         "unable to make block group %llu ro", cache->start);
1370                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
1371         }
1372         return ret;
1373 }
1374
1375 static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1376                                  struct btrfs_block_group *bg)
1377 {
1378         struct btrfs_fs_info *fs_info = bg->fs_info;
1379         struct btrfs_transaction *prev_trans = NULL;
1380         const u64 start = bg->start;
1381         const u64 end = start + bg->length - 1;
1382         int ret;
1383
1384         spin_lock(&fs_info->trans_lock);
1385         if (trans->transaction->list.prev != &fs_info->trans_list) {
1386                 prev_trans = list_last_entry(&trans->transaction->list,
1387                                              struct btrfs_transaction, list);
1388                 refcount_inc(&prev_trans->use_count);
1389         }
1390         spin_unlock(&fs_info->trans_lock);
1391
1392         /*
1393          * Hold the unused_bg_unpin_mutex lock to avoid racing with
1394          * btrfs_finish_extent_commit(). If we are at transaction N, another
1395          * task might be running finish_extent_commit() for the previous
1396          * transaction N - 1, and have seen a range belonging to the block
1397          * group in pinned_extents before we were able to clear the whole block
1398          * group range from pinned_extents. This means that task can lookup for
1399          * the block group after we unpinned it from pinned_extents and removed
1400          * it, leading to a BUG_ON() at unpin_extent_range().
1401          */
1402         mutex_lock(&fs_info->unused_bg_unpin_mutex);
1403         if (prev_trans) {
1404                 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
1405                                         EXTENT_DIRTY);
1406                 if (ret)
1407                         goto out;
1408         }
1409
1410         ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
1411                                 EXTENT_DIRTY);
1412 out:
1413         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1414         if (prev_trans)
1415                 btrfs_put_transaction(prev_trans);
1416
1417         return ret == 0;
1418 }
1419
1420 /*
1421  * Process the unused_bgs list and remove any that don't have any allocated
1422  * space inside of them.
1423  */
1424 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1425 {
1426         struct btrfs_block_group *block_group;
1427         struct btrfs_space_info *space_info;
1428         struct btrfs_trans_handle *trans;
1429         const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1430         int ret = 0;
1431
1432         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1433                 return;
1434
1435         if (btrfs_fs_closing(fs_info))
1436                 return;
1437
1438         /*
1439          * Long running balances can keep us blocked here for eternity, so
1440          * simply skip deletion if we're unable to get the mutex.
1441          */
1442         if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
1443                 return;
1444
1445         spin_lock(&fs_info->unused_bgs_lock);
1446         while (!list_empty(&fs_info->unused_bgs)) {
1447                 int trimming;
1448
1449                 block_group = list_first_entry(&fs_info->unused_bgs,
1450                                                struct btrfs_block_group,
1451                                                bg_list);
1452                 list_del_init(&block_group->bg_list);
1453
1454                 space_info = block_group->space_info;
1455
1456                 if (ret || btrfs_mixed_space_info(space_info)) {
1457                         btrfs_put_block_group(block_group);
1458                         continue;
1459                 }
1460                 spin_unlock(&fs_info->unused_bgs_lock);
1461
1462                 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1463
1464                 /* Don't want to race with allocators so take the groups_sem */
1465                 down_write(&space_info->groups_sem);
1466
1467                 /*
1468                  * Async discard moves the final block group discard to be prior
1469                  * to the unused_bgs code path.  Therefore, if it's not fully
1470                  * trimmed, punt it back to the async discard lists.
1471                  */
1472                 if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1473                     !btrfs_is_free_space_trimmed(block_group)) {
1474                         trace_btrfs_skip_unused_block_group(block_group);
1475                         up_write(&space_info->groups_sem);
1476                         /* Requeue if we failed because of async discard */
1477                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1478                                                  block_group);
1479                         goto next;
1480                 }
1481
1482                 spin_lock(&block_group->lock);
1483                 if (block_group->reserved || block_group->pinned ||
1484                     block_group->used || block_group->ro ||
1485                     list_is_singular(&block_group->list)) {
1486                         /*
1487                          * We want to bail if we made new allocations or have
1488                          * outstanding allocations in this block group.  We do
1489                          * the ro check in case balance is currently acting on
1490                          * this block group.
1491                          */
1492                         trace_btrfs_skip_unused_block_group(block_group);
1493                         spin_unlock(&block_group->lock);
1494                         up_write(&space_info->groups_sem);
1495                         goto next;
1496                 }
1497                 spin_unlock(&block_group->lock);
1498
1499                 /* We don't want to force the issue, only flip if it's ok. */
1500                 ret = inc_block_group_ro(block_group, 0);
1501                 up_write(&space_info->groups_sem);
1502                 if (ret < 0) {
1503                         ret = 0;
1504                         goto next;
1505                 }
1506
1507                 ret = btrfs_zone_finish(block_group);
1508                 if (ret < 0) {
1509                         btrfs_dec_block_group_ro(block_group);
1510                         if (ret == -EAGAIN)
1511                                 ret = 0;
1512                         goto next;
1513                 }
1514
1515                 /*
1516                  * Want to do this before we do anything else so we can recover
1517                  * properly if we fail to join the transaction.
1518                  */
1519                 trans = btrfs_start_trans_remove_block_group(fs_info,
1520                                                      block_group->start);
1521                 if (IS_ERR(trans)) {
1522                         btrfs_dec_block_group_ro(block_group);
1523                         ret = PTR_ERR(trans);
1524                         goto next;
1525                 }
1526
1527                 /*
1528                  * We could have pending pinned extents for this block group,
1529                  * just delete them, we don't care about them anymore.
1530                  */
1531                 if (!clean_pinned_extents(trans, block_group)) {
1532                         btrfs_dec_block_group_ro(block_group);
1533                         goto end_trans;
1534                 }
1535
1536                 /*
1537                  * At this point, the block_group is read only and should fail
1538                  * new allocations.  However, btrfs_finish_extent_commit() can
1539                  * cause this block_group to be placed back on the discard
1540                  * lists because now the block_group isn't fully discarded.
1541                  * Bail here and try again later after discarding everything.
1542                  */
1543                 spin_lock(&fs_info->discard_ctl.lock);
1544                 if (!list_empty(&block_group->discard_list)) {
1545                         spin_unlock(&fs_info->discard_ctl.lock);
1546                         btrfs_dec_block_group_ro(block_group);
1547                         btrfs_discard_queue_work(&fs_info->discard_ctl,
1548                                                  block_group);
1549                         goto end_trans;
1550                 }
1551                 spin_unlock(&fs_info->discard_ctl.lock);
1552
1553                 /* Reset pinned so btrfs_put_block_group doesn't complain */
1554                 spin_lock(&space_info->lock);
1555                 spin_lock(&block_group->lock);
1556
1557                 btrfs_space_info_update_bytes_pinned(fs_info, space_info,
1558                                                      -block_group->pinned);
1559                 space_info->bytes_readonly += block_group->pinned;
1560                 block_group->pinned = 0;
1561
1562                 spin_unlock(&block_group->lock);
1563                 spin_unlock(&space_info->lock);
1564
1565                 /*
1566                  * The normal path here is an unused block group is passed here,
1567                  * then trimming is handled in the transaction commit path.
1568                  * Async discard interposes before this to do the trimming
1569                  * before coming down the unused block group path as trimming
1570                  * will no longer be done later in the transaction commit path.
1571                  */
1572                 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1573                         goto flip_async;
1574
1575                 /*
1576                  * DISCARD can flip during remount. On zoned filesystems, we
1577                  * need to reset sequential-required zones.
1578                  */
1579                 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
1580                                 btrfs_is_zoned(fs_info);
1581
1582                 /* Implicit trim during transaction commit. */
1583                 if (trimming)
1584                         btrfs_freeze_block_group(block_group);
1585
1586                 /*
1587                  * btrfs_remove_chunk will abort the transaction if things go
1588                  * horribly wrong.
1589                  */
1590                 ret = btrfs_remove_chunk(trans, block_group->start);
1591
1592                 if (ret) {
1593                         if (trimming)
1594                                 btrfs_unfreeze_block_group(block_group);
1595                         goto end_trans;
1596                 }
1597
1598                 /*
1599                  * If we're not mounted with -odiscard, we can just forget
1600                  * about this block group. Otherwise we'll need to wait
1601                  * until transaction commit to do the actual discard.
1602                  */
1603                 if (trimming) {
1604                         spin_lock(&fs_info->unused_bgs_lock);
1605                         /*
1606                          * A concurrent scrub might have added us to the list
1607                          * fs_info->unused_bgs, so use a list_move operation
1608                          * to add the block group to the deleted_bgs list.
1609                          */
1610                         list_move(&block_group->bg_list,
1611                                   &trans->transaction->deleted_bgs);
1612                         spin_unlock(&fs_info->unused_bgs_lock);
1613                         btrfs_get_block_group(block_group);
1614                 }
1615 end_trans:
1616                 btrfs_end_transaction(trans);
1617 next:
1618                 btrfs_put_block_group(block_group);
1619                 spin_lock(&fs_info->unused_bgs_lock);
1620         }
1621         spin_unlock(&fs_info->unused_bgs_lock);
1622         mutex_unlock(&fs_info->reclaim_bgs_lock);
1623         return;
1624
1625 flip_async:
1626         btrfs_end_transaction(trans);
1627         mutex_unlock(&fs_info->reclaim_bgs_lock);
1628         btrfs_put_block_group(block_group);
1629         btrfs_discard_punt_unused_bgs_list(fs_info);
1630 }
1631
1632 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1633 {
1634         struct btrfs_fs_info *fs_info = bg->fs_info;
1635
1636         spin_lock(&fs_info->unused_bgs_lock);
1637         if (list_empty(&bg->bg_list)) {
1638                 btrfs_get_block_group(bg);
1639                 trace_btrfs_add_unused_block_group(bg);
1640                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1641         }
1642         spin_unlock(&fs_info->unused_bgs_lock);
1643 }
1644
1645 /*
1646  * We want block groups with a low number of used bytes to be in the beginning
1647  * of the list, so they will get reclaimed first.
1648  */
1649 static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1650                            const struct list_head *b)
1651 {
1652         const struct btrfs_block_group *bg1, *bg2;
1653
1654         bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1655         bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1656
1657         return bg1->used > bg2->used;
1658 }
1659
1660 static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
1661 {
1662         if (btrfs_is_zoned(fs_info))
1663                 return btrfs_zoned_should_reclaim(fs_info);
1664         return true;
1665 }
1666
1667 static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
1668 {
1669         const struct btrfs_space_info *space_info = bg->space_info;
1670         const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
1671         const u64 new_val = bg->used;
1672         const u64 old_val = new_val + bytes_freed;
1673         u64 thresh;
1674
1675         if (reclaim_thresh == 0)
1676                 return false;
1677
1678         thresh = mult_perc(bg->length, reclaim_thresh);
1679
1680         /*
1681          * If we were below the threshold before don't reclaim, we are likely a
1682          * brand new block group and we don't want to relocate new block groups.
1683          */
1684         if (old_val < thresh)
1685                 return false;
1686         if (new_val >= thresh)
1687                 return false;
1688         return true;
1689 }
1690
1691 void btrfs_reclaim_bgs_work(struct work_struct *work)
1692 {
1693         struct btrfs_fs_info *fs_info =
1694                 container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1695         struct btrfs_block_group *bg;
1696         struct btrfs_space_info *space_info;
1697
1698         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1699                 return;
1700
1701         if (btrfs_fs_closing(fs_info))
1702                 return;
1703
1704         if (!btrfs_should_reclaim(fs_info))
1705                 return;
1706
1707         sb_start_write(fs_info->sb);
1708
1709         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
1710                 sb_end_write(fs_info->sb);
1711                 return;
1712         }
1713
1714         /*
1715          * Long running balances can keep us blocked here for eternity, so
1716          * simply skip reclaim if we're unable to get the mutex.
1717          */
1718         if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1719                 btrfs_exclop_finish(fs_info);
1720                 sb_end_write(fs_info->sb);
1721                 return;
1722         }
1723
1724         spin_lock(&fs_info->unused_bgs_lock);
1725         /*
1726          * Sort happens under lock because we can't simply splice it and sort.
1727          * The block groups might still be in use and reachable via bg_list,
1728          * and their presence in the reclaim_bgs list must be preserved.
1729          */
1730         list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
1731         while (!list_empty(&fs_info->reclaim_bgs)) {
1732                 u64 zone_unusable;
1733                 int ret = 0;
1734
1735                 bg = list_first_entry(&fs_info->reclaim_bgs,
1736                                       struct btrfs_block_group,
1737                                       bg_list);
1738                 list_del_init(&bg->bg_list);
1739
1740                 space_info = bg->space_info;
1741                 spin_unlock(&fs_info->unused_bgs_lock);
1742
1743                 /* Don't race with allocators so take the groups_sem */
1744                 down_write(&space_info->groups_sem);
1745
1746                 spin_lock(&bg->lock);
1747                 if (bg->reserved || bg->pinned || bg->ro) {
1748                         /*
1749                          * We want to bail if we made new allocations or have
1750                          * outstanding allocations in this block group.  We do
1751                          * the ro check in case balance is currently acting on
1752                          * this block group.
1753                          */
1754                         spin_unlock(&bg->lock);
1755                         up_write(&space_info->groups_sem);
1756                         goto next;
1757                 }
1758                 if (bg->used == 0) {
1759                         /*
1760                          * It is possible that we trigger relocation on a block
1761                          * group as its extents are deleted and it first goes
1762                          * below the threshold, then shortly after goes empty.
1763                          *
1764                          * In this case, relocating it does delete it, but has
1765                          * some overhead in relocation specific metadata, looking
1766                          * for the non-existent extents and running some extra
1767                          * transactions, which we can avoid by using one of the
1768                          * other mechanisms for dealing with empty block groups.
1769                          */
1770                         if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
1771                                 btrfs_mark_bg_unused(bg);
1772                         spin_unlock(&bg->lock);
1773                         up_write(&space_info->groups_sem);
1774                         goto next;
1775
1776                 }
1777                 /*
1778                  * The block group might no longer meet the reclaim condition by
1779                  * the time we get around to reclaiming it, so to avoid
1780                  * reclaiming overly full block_groups, skip reclaiming them.
1781                  *
1782                  * Since the decision making process also depends on the amount
1783                  * being freed, pass in a fake giant value to skip that extra
1784                  * check, which is more meaningful when adding to the list in
1785                  * the first place.
1786                  */
1787                 if (!should_reclaim_block_group(bg, bg->length)) {
1788                         spin_unlock(&bg->lock);
1789                         up_write(&space_info->groups_sem);
1790                         goto next;
1791                 }
1792                 spin_unlock(&bg->lock);
1793
1794                 /* Get out fast, in case we're unmounting the filesystem */
1795                 if (btrfs_fs_closing(fs_info)) {
1796                         up_write(&space_info->groups_sem);
1797                         goto next;
1798                 }
1799
1800                 /*
1801                  * Cache the zone_unusable value before turning the block group
1802                  * to read only. As soon as the blog group is read only it's
1803                  * zone_unusable value gets moved to the block group's read-only
1804                  * bytes and isn't available for calculations anymore.
1805                  */
1806                 zone_unusable = bg->zone_unusable;
1807                 ret = inc_block_group_ro(bg, 0);
1808                 up_write(&space_info->groups_sem);
1809                 if (ret < 0)
1810                         goto next;
1811
1812                 btrfs_info(fs_info,
1813                         "reclaiming chunk %llu with %llu%% used %llu%% unusable",
1814                                 bg->start,
1815                                 div64_u64(bg->used * 100, bg->length),
1816                                 div64_u64(zone_unusable * 100, bg->length));
1817                 trace_btrfs_reclaim_block_group(bg);
1818                 ret = btrfs_relocate_chunk(fs_info, bg->start);
1819                 if (ret) {
1820                         btrfs_dec_block_group_ro(bg);
1821                         btrfs_err(fs_info, "error relocating chunk %llu",
1822                                   bg->start);
1823                 }
1824
1825 next:
1826                 btrfs_put_block_group(bg);
1827                 spin_lock(&fs_info->unused_bgs_lock);
1828         }
1829         spin_unlock(&fs_info->unused_bgs_lock);
1830         mutex_unlock(&fs_info->reclaim_bgs_lock);
1831         btrfs_exclop_finish(fs_info);
1832         sb_end_write(fs_info->sb);
1833 }
1834
1835 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
1836 {
1837         spin_lock(&fs_info->unused_bgs_lock);
1838         if (!list_empty(&fs_info->reclaim_bgs))
1839                 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
1840         spin_unlock(&fs_info->unused_bgs_lock);
1841 }
1842
1843 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
1844 {
1845         struct btrfs_fs_info *fs_info = bg->fs_info;
1846
1847         spin_lock(&fs_info->unused_bgs_lock);
1848         if (list_empty(&bg->bg_list)) {
1849                 btrfs_get_block_group(bg);
1850                 trace_btrfs_add_reclaim_block_group(bg);
1851                 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
1852         }
1853         spin_unlock(&fs_info->unused_bgs_lock);
1854 }
1855
1856 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
1857                            struct btrfs_path *path)
1858 {
1859         struct extent_map_tree *em_tree;
1860         struct extent_map *em;
1861         struct btrfs_block_group_item bg;
1862         struct extent_buffer *leaf;
1863         int slot;
1864         u64 flags;
1865         int ret = 0;
1866
1867         slot = path->slots[0];
1868         leaf = path->nodes[0];
1869
1870         em_tree = &fs_info->mapping_tree;
1871         read_lock(&em_tree->lock);
1872         em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
1873         read_unlock(&em_tree->lock);
1874         if (!em) {
1875                 btrfs_err(fs_info,
1876                           "logical %llu len %llu found bg but no related chunk",
1877                           key->objectid, key->offset);
1878                 return -ENOENT;
1879         }
1880
1881         if (em->start != key->objectid || em->len != key->offset) {
1882                 btrfs_err(fs_info,
1883                         "block group %llu len %llu mismatch with chunk %llu len %llu",
1884                         key->objectid, key->offset, em->start, em->len);
1885                 ret = -EUCLEAN;
1886                 goto out_free_em;
1887         }
1888
1889         read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
1890                            sizeof(bg));
1891         flags = btrfs_stack_block_group_flags(&bg) &
1892                 BTRFS_BLOCK_GROUP_TYPE_MASK;
1893
1894         if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1895                 btrfs_err(fs_info,
1896 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1897                           key->objectid, key->offset, flags,
1898                           (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
1899                 ret = -EUCLEAN;
1900         }
1901
1902 out_free_em:
1903         free_extent_map(em);
1904         return ret;
1905 }
1906
1907 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1908                                   struct btrfs_path *path,
1909                                   struct btrfs_key *key)
1910 {
1911         struct btrfs_root *root = btrfs_block_group_root(fs_info);
1912         int ret;
1913         struct btrfs_key found_key;
1914
1915         btrfs_for_each_slot(root, key, &found_key, path, ret) {
1916                 if (found_key.objectid >= key->objectid &&
1917                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1918                         return read_bg_from_eb(fs_info, &found_key, path);
1919                 }
1920         }
1921         return ret;
1922 }
1923
1924 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1925 {
1926         u64 extra_flags = chunk_to_extended(flags) &
1927                                 BTRFS_EXTENDED_PROFILE_MASK;
1928
1929         write_seqlock(&fs_info->profiles_lock);
1930         if (flags & BTRFS_BLOCK_GROUP_DATA)
1931                 fs_info->avail_data_alloc_bits |= extra_flags;
1932         if (flags & BTRFS_BLOCK_GROUP_METADATA)
1933                 fs_info->avail_metadata_alloc_bits |= extra_flags;
1934         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1935                 fs_info->avail_system_alloc_bits |= extra_flags;
1936         write_sequnlock(&fs_info->profiles_lock);
1937 }
1938
1939 /*
1940  * Map a physical disk address to a list of logical addresses.
1941  *
1942  * @fs_info:       the filesystem
1943  * @chunk_start:   logical address of block group
1944  * @physical:      physical address to map to logical addresses
1945  * @logical:       return array of logical addresses which map to @physical
1946  * @naddrs:        length of @logical
1947  * @stripe_len:    size of IO stripe for the given block group
1948  *
1949  * Maps a particular @physical disk address to a list of @logical addresses.
1950  * Used primarily to exclude those portions of a block group that contain super
1951  * block copies.
1952  */
1953 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1954                      u64 physical, u64 **logical, int *naddrs, int *stripe_len)
1955 {
1956         struct extent_map *em;
1957         struct map_lookup *map;
1958         u64 *buf;
1959         u64 bytenr;
1960         u64 data_stripe_length;
1961         u64 io_stripe_size;
1962         int i, nr = 0;
1963         int ret = 0;
1964
1965         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1966         if (IS_ERR(em))
1967                 return -EIO;
1968
1969         map = em->map_lookup;
1970         data_stripe_length = em->orig_block_len;
1971         io_stripe_size = BTRFS_STRIPE_LEN;
1972         chunk_start = em->start;
1973
1974         /* For RAID5/6 adjust to a full IO stripe length */
1975         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
1976                 io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT;
1977
1978         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1979         if (!buf) {
1980                 ret = -ENOMEM;
1981                 goto out;
1982         }
1983
1984         for (i = 0; i < map->num_stripes; i++) {
1985                 bool already_inserted = false;
1986                 u32 stripe_nr;
1987                 u32 offset;
1988                 int j;
1989
1990                 if (!in_range(physical, map->stripes[i].physical,
1991                               data_stripe_length))
1992                         continue;
1993
1994                 stripe_nr = (physical - map->stripes[i].physical) >>
1995                             BTRFS_STRIPE_LEN_SHIFT;
1996                 offset = (physical - map->stripes[i].physical) &
1997                          BTRFS_STRIPE_LEN_MASK;
1998
1999                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2000                                  BTRFS_BLOCK_GROUP_RAID10))
2001                         stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
2002                                             map->sub_stripes);
2003                 /*
2004                  * The remaining case would be for RAID56, multiply by
2005                  * nr_data_stripes().  Alternatively, just use rmap_len below
2006                  * instead of map->stripe_len
2007                  */
2008                 bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
2009
2010                 /* Ensure we don't add duplicate addresses */
2011                 for (j = 0; j < nr; j++) {
2012                         if (buf[j] == bytenr) {
2013                                 already_inserted = true;
2014                                 break;
2015                         }
2016                 }
2017
2018                 if (!already_inserted)
2019                         buf[nr++] = bytenr;
2020         }
2021
2022         *logical = buf;
2023         *naddrs = nr;
2024         *stripe_len = io_stripe_size;
2025 out:
2026         free_extent_map(em);
2027         return ret;
2028 }
2029
2030 static int exclude_super_stripes(struct btrfs_block_group *cache)
2031 {
2032         struct btrfs_fs_info *fs_info = cache->fs_info;
2033         const bool zoned = btrfs_is_zoned(fs_info);
2034         u64 bytenr;
2035         u64 *logical;
2036         int stripe_len;
2037         int i, nr, ret;
2038
2039         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
2040                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
2041                 cache->bytes_super += stripe_len;
2042                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
2043                                                 stripe_len);
2044                 if (ret)
2045                         return ret;
2046         }
2047
2048         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2049                 bytenr = btrfs_sb_offset(i);
2050                 ret = btrfs_rmap_block(fs_info, cache->start,
2051                                        bytenr, &logical, &nr, &stripe_len);
2052                 if (ret)
2053                         return ret;
2054
2055                 /* Shouldn't have super stripes in sequential zones */
2056                 if (zoned && nr) {
2057                         btrfs_err(fs_info,
2058                         "zoned: block group %llu must not contain super block",
2059                                   cache->start);
2060                         return -EUCLEAN;
2061                 }
2062
2063                 while (nr--) {
2064                         u64 len = min_t(u64, stripe_len,
2065                                 cache->start + cache->length - logical[nr]);
2066
2067                         cache->bytes_super += len;
2068                         ret = btrfs_add_excluded_extent(fs_info, logical[nr],
2069                                                         len);
2070                         if (ret) {
2071                                 kfree(logical);
2072                                 return ret;
2073                         }
2074                 }
2075
2076                 kfree(logical);
2077         }
2078         return 0;
2079 }
2080
2081 static struct btrfs_block_group *btrfs_create_block_group_cache(
2082                 struct btrfs_fs_info *fs_info, u64 start)
2083 {
2084         struct btrfs_block_group *cache;
2085
2086         cache = kzalloc(sizeof(*cache), GFP_NOFS);
2087         if (!cache)
2088                 return NULL;
2089
2090         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
2091                                         GFP_NOFS);
2092         if (!cache->free_space_ctl) {
2093                 kfree(cache);
2094                 return NULL;
2095         }
2096
2097         cache->start = start;
2098
2099         cache->fs_info = fs_info;
2100         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
2101
2102         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
2103
2104         refcount_set(&cache->refs, 1);
2105         spin_lock_init(&cache->lock);
2106         init_rwsem(&cache->data_rwsem);
2107         INIT_LIST_HEAD(&cache->list);
2108         INIT_LIST_HEAD(&cache->cluster_list);
2109         INIT_LIST_HEAD(&cache->bg_list);
2110         INIT_LIST_HEAD(&cache->ro_list);
2111         INIT_LIST_HEAD(&cache->discard_list);
2112         INIT_LIST_HEAD(&cache->dirty_list);
2113         INIT_LIST_HEAD(&cache->io_list);
2114         INIT_LIST_HEAD(&cache->active_bg_list);
2115         btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
2116         atomic_set(&cache->frozen, 0);
2117         mutex_init(&cache->free_space_lock);
2118
2119         return cache;
2120 }
2121
2122 /*
2123  * Iterate all chunks and verify that each of them has the corresponding block
2124  * group
2125  */
2126 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
2127 {
2128         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
2129         struct extent_map *em;
2130         struct btrfs_block_group *bg;
2131         u64 start = 0;
2132         int ret = 0;
2133
2134         while (1) {
2135                 read_lock(&map_tree->lock);
2136                 /*
2137                  * lookup_extent_mapping will return the first extent map
2138                  * intersecting the range, so setting @len to 1 is enough to
2139                  * get the first chunk.
2140                  */
2141                 em = lookup_extent_mapping(map_tree, start, 1);
2142                 read_unlock(&map_tree->lock);
2143                 if (!em)
2144                         break;
2145
2146                 bg = btrfs_lookup_block_group(fs_info, em->start);
2147                 if (!bg) {
2148                         btrfs_err(fs_info,
2149         "chunk start=%llu len=%llu doesn't have corresponding block group",
2150                                      em->start, em->len);
2151                         ret = -EUCLEAN;
2152                         free_extent_map(em);
2153                         break;
2154                 }
2155                 if (bg->start != em->start || bg->length != em->len ||
2156                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
2157                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
2158                         btrfs_err(fs_info,
2159 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
2160                                 em->start, em->len,
2161                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
2162                                 bg->start, bg->length,
2163                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
2164                         ret = -EUCLEAN;
2165                         free_extent_map(em);
2166                         btrfs_put_block_group(bg);
2167                         break;
2168                 }
2169                 start = em->start + em->len;
2170                 free_extent_map(em);
2171                 btrfs_put_block_group(bg);
2172         }
2173         return ret;
2174 }
2175
2176 static int read_one_block_group(struct btrfs_fs_info *info,
2177                                 struct btrfs_block_group_item *bgi,
2178                                 const struct btrfs_key *key,
2179                                 int need_clear)
2180 {
2181         struct btrfs_block_group *cache;
2182         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
2183         int ret;
2184
2185         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
2186
2187         cache = btrfs_create_block_group_cache(info, key->objectid);
2188         if (!cache)
2189                 return -ENOMEM;
2190
2191         cache->length = key->offset;
2192         cache->used = btrfs_stack_block_group_used(bgi);
2193         cache->commit_used = cache->used;
2194         cache->flags = btrfs_stack_block_group_flags(bgi);
2195         cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
2196
2197         set_free_space_tree_thresholds(cache);
2198
2199         if (need_clear) {
2200                 /*
2201                  * When we mount with old space cache, we need to
2202                  * set BTRFS_DC_CLEAR and set dirty flag.
2203                  *
2204                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2205                  *    truncate the old free space cache inode and
2206                  *    setup a new one.
2207                  * b) Setting 'dirty flag' makes sure that we flush
2208                  *    the new space cache info onto disk.
2209                  */
2210                 if (btrfs_test_opt(info, SPACE_CACHE))
2211                         cache->disk_cache_state = BTRFS_DC_CLEAR;
2212         }
2213         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2214             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2215                         btrfs_err(info,
2216 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2217                                   cache->start);
2218                         ret = -EINVAL;
2219                         goto error;
2220         }
2221
2222         ret = btrfs_load_block_group_zone_info(cache, false);
2223         if (ret) {
2224                 btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2225                           cache->start);
2226                 goto error;
2227         }
2228
2229         /*
2230          * We need to exclude the super stripes now so that the space info has
2231          * super bytes accounted for, otherwise we'll think we have more space
2232          * than we actually do.
2233          */
2234         ret = exclude_super_stripes(cache);
2235         if (ret) {
2236                 /* We may have excluded something, so call this just in case. */
2237                 btrfs_free_excluded_extents(cache);
2238                 goto error;
2239         }
2240
2241         /*
2242          * For zoned filesystem, space after the allocation offset is the only
2243          * free space for a block group. So, we don't need any caching work.
2244          * btrfs_calc_zone_unusable() will set the amount of free space and
2245          * zone_unusable space.
2246          *
2247          * For regular filesystem, check for two cases, either we are full, and
2248          * therefore don't need to bother with the caching work since we won't
2249          * find any space, or we are empty, and we can just add all the space
2250          * in and be done with it.  This saves us _a_lot_ of time, particularly
2251          * in the full case.
2252          */
2253         if (btrfs_is_zoned(info)) {
2254                 btrfs_calc_zone_unusable(cache);
2255                 /* Should not have any excluded extents. Just in case, though. */
2256                 btrfs_free_excluded_extents(cache);
2257         } else if (cache->length == cache->used) {
2258                 cache->cached = BTRFS_CACHE_FINISHED;
2259                 btrfs_free_excluded_extents(cache);
2260         } else if (cache->used == 0) {
2261                 cache->cached = BTRFS_CACHE_FINISHED;
2262                 add_new_free_space(cache, cache->start,
2263                                    cache->start + cache->length);
2264                 btrfs_free_excluded_extents(cache);
2265         }
2266
2267         ret = btrfs_add_block_group_cache(info, cache);
2268         if (ret) {
2269                 btrfs_remove_free_space_cache(cache);
2270                 goto error;
2271         }
2272         trace_btrfs_add_block_group(info, cache, 0);
2273         btrfs_add_bg_to_space_info(info, cache);
2274
2275         set_avail_alloc_bits(info, cache->flags);
2276         if (btrfs_chunk_writeable(info, cache->start)) {
2277                 if (cache->used == 0) {
2278                         ASSERT(list_empty(&cache->bg_list));
2279                         if (btrfs_test_opt(info, DISCARD_ASYNC))
2280                                 btrfs_discard_queue_work(&info->discard_ctl, cache);
2281                         else
2282                                 btrfs_mark_bg_unused(cache);
2283                 }
2284         } else {
2285                 inc_block_group_ro(cache, 1);
2286         }
2287
2288         return 0;
2289 error:
2290         btrfs_put_block_group(cache);
2291         return ret;
2292 }
2293
2294 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2295 {
2296         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
2297         struct rb_node *node;
2298         int ret = 0;
2299
2300         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
2301                 struct extent_map *em;
2302                 struct map_lookup *map;
2303                 struct btrfs_block_group *bg;
2304
2305                 em = rb_entry(node, struct extent_map, rb_node);
2306                 map = em->map_lookup;
2307                 bg = btrfs_create_block_group_cache(fs_info, em->start);
2308                 if (!bg) {
2309                         ret = -ENOMEM;
2310                         break;
2311                 }
2312
2313                 /* Fill dummy cache as FULL */
2314                 bg->length = em->len;
2315                 bg->flags = map->type;
2316                 bg->cached = BTRFS_CACHE_FINISHED;
2317                 bg->used = em->len;
2318                 bg->flags = map->type;
2319                 ret = btrfs_add_block_group_cache(fs_info, bg);
2320                 /*
2321                  * We may have some valid block group cache added already, in
2322                  * that case we skip to the next one.
2323                  */
2324                 if (ret == -EEXIST) {
2325                         ret = 0;
2326                         btrfs_put_block_group(bg);
2327                         continue;
2328                 }
2329
2330                 if (ret) {
2331                         btrfs_remove_free_space_cache(bg);
2332                         btrfs_put_block_group(bg);
2333                         break;
2334                 }
2335
2336                 btrfs_add_bg_to_space_info(fs_info, bg);
2337
2338                 set_avail_alloc_bits(fs_info, bg->flags);
2339         }
2340         if (!ret)
2341                 btrfs_init_global_block_rsv(fs_info);
2342         return ret;
2343 }
2344
2345 int btrfs_read_block_groups(struct btrfs_fs_info *info)
2346 {
2347         struct btrfs_root *root = btrfs_block_group_root(info);
2348         struct btrfs_path *path;
2349         int ret;
2350         struct btrfs_block_group *cache;
2351         struct btrfs_space_info *space_info;
2352         struct btrfs_key key;
2353         int need_clear = 0;
2354         u64 cache_gen;
2355
2356         /*
2357          * Either no extent root (with ibadroots rescue option) or we have
2358          * unsupported RO options. The fs can never be mounted read-write, so no
2359          * need to waste time searching block group items.
2360          *
2361          * This also allows new extent tree related changes to be RO compat,
2362          * no need for a full incompat flag.
2363          */
2364         if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
2365                       ~BTRFS_FEATURE_COMPAT_RO_SUPP))
2366                 return fill_dummy_bgs(info);
2367
2368         key.objectid = 0;
2369         key.offset = 0;
2370         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2371         path = btrfs_alloc_path();
2372         if (!path)
2373                 return -ENOMEM;
2374
2375         cache_gen = btrfs_super_cache_generation(info->super_copy);
2376         if (btrfs_test_opt(info, SPACE_CACHE) &&
2377             btrfs_super_generation(info->super_copy) != cache_gen)
2378                 need_clear = 1;
2379         if (btrfs_test_opt(info, CLEAR_CACHE))
2380                 need_clear = 1;
2381
2382         while (1) {
2383                 struct btrfs_block_group_item bgi;
2384                 struct extent_buffer *leaf;
2385                 int slot;
2386
2387                 ret = find_first_block_group(info, path, &key);
2388                 if (ret > 0)
2389                         break;
2390                 if (ret != 0)
2391                         goto error;
2392
2393                 leaf = path->nodes[0];
2394                 slot = path->slots[0];
2395
2396                 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2397                                    sizeof(bgi));
2398
2399                 btrfs_item_key_to_cpu(leaf, &key, slot);
2400                 btrfs_release_path(path);
2401                 ret = read_one_block_group(info, &bgi, &key, need_clear);
2402                 if (ret < 0)
2403                         goto error;
2404                 key.objectid += key.offset;
2405                 key.offset = 0;
2406         }
2407         btrfs_release_path(path);
2408
2409         list_for_each_entry(space_info, &info->space_info, list) {
2410                 int i;
2411
2412                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2413                         if (list_empty(&space_info->block_groups[i]))
2414                                 continue;
2415                         cache = list_first_entry(&space_info->block_groups[i],
2416                                                  struct btrfs_block_group,
2417                                                  list);
2418                         btrfs_sysfs_add_block_group_type(cache);
2419                 }
2420
2421                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2422                       (BTRFS_BLOCK_GROUP_RAID10 |
2423                        BTRFS_BLOCK_GROUP_RAID1_MASK |
2424                        BTRFS_BLOCK_GROUP_RAID56_MASK |
2425                        BTRFS_BLOCK_GROUP_DUP)))
2426                         continue;
2427                 /*
2428                  * Avoid allocating from un-mirrored block group if there are
2429                  * mirrored block groups.
2430                  */
2431                 list_for_each_entry(cache,
2432                                 &space_info->block_groups[BTRFS_RAID_RAID0],
2433                                 list)
2434                         inc_block_group_ro(cache, 1);
2435                 list_for_each_entry(cache,
2436                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
2437                                 list)
2438                         inc_block_group_ro(cache, 1);
2439         }
2440
2441         btrfs_init_global_block_rsv(info);
2442         ret = check_chunk_block_group_mappings(info);
2443 error:
2444         btrfs_free_path(path);
2445         /*
2446          * We've hit some error while reading the extent tree, and have
2447          * rescue=ibadroots mount option.
2448          * Try to fill the tree using dummy block groups so that the user can
2449          * continue to mount and grab their data.
2450          */
2451         if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2452                 ret = fill_dummy_bgs(info);
2453         return ret;
2454 }
2455
2456 /*
2457  * This function, insert_block_group_item(), belongs to the phase 2 of chunk
2458  * allocation.
2459  *
2460  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2461  * phases.
2462  */
2463 static int insert_block_group_item(struct btrfs_trans_handle *trans,
2464                                    struct btrfs_block_group *block_group)
2465 {
2466         struct btrfs_fs_info *fs_info = trans->fs_info;
2467         struct btrfs_block_group_item bgi;
2468         struct btrfs_root *root = btrfs_block_group_root(fs_info);
2469         struct btrfs_key key;
2470         u64 old_commit_used;
2471         int ret;
2472
2473         spin_lock(&block_group->lock);
2474         btrfs_set_stack_block_group_used(&bgi, block_group->used);
2475         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2476                                                    block_group->global_root_id);
2477         btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2478         old_commit_used = block_group->commit_used;
2479         block_group->commit_used = block_group->used;
2480         key.objectid = block_group->start;
2481         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2482         key.offset = block_group->length;
2483         spin_unlock(&block_group->lock);
2484
2485         ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2486         if (ret < 0) {
2487                 spin_lock(&block_group->lock);
2488                 block_group->commit_used = old_commit_used;
2489                 spin_unlock(&block_group->lock);
2490         }
2491
2492         return ret;
2493 }
2494
2495 static int insert_dev_extent(struct btrfs_trans_handle *trans,
2496                             struct btrfs_device *device, u64 chunk_offset,
2497                             u64 start, u64 num_bytes)
2498 {
2499         struct btrfs_fs_info *fs_info = device->fs_info;
2500         struct btrfs_root *root = fs_info->dev_root;
2501         struct btrfs_path *path;
2502         struct btrfs_dev_extent *extent;
2503         struct extent_buffer *leaf;
2504         struct btrfs_key key;
2505         int ret;
2506
2507         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2508         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2509         path = btrfs_alloc_path();
2510         if (!path)
2511                 return -ENOMEM;
2512
2513         key.objectid = device->devid;
2514         key.type = BTRFS_DEV_EXTENT_KEY;
2515         key.offset = start;
2516         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2517         if (ret)
2518                 goto out;
2519
2520         leaf = path->nodes[0];
2521         extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2522         btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2523         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2524                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2525         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2526
2527         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2528         btrfs_mark_buffer_dirty(leaf);
2529 out:
2530         btrfs_free_path(path);
2531         return ret;
2532 }
2533
2534 /*
2535  * This function belongs to phase 2.
2536  *
2537  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2538  * phases.
2539  */
2540 static int insert_dev_extents(struct btrfs_trans_handle *trans,
2541                                    u64 chunk_offset, u64 chunk_size)
2542 {
2543         struct btrfs_fs_info *fs_info = trans->fs_info;
2544         struct btrfs_device *device;
2545         struct extent_map *em;
2546         struct map_lookup *map;
2547         u64 dev_offset;
2548         u64 stripe_size;
2549         int i;
2550         int ret = 0;
2551
2552         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2553         if (IS_ERR(em))
2554                 return PTR_ERR(em);
2555
2556         map = em->map_lookup;
2557         stripe_size = em->orig_block_len;
2558
2559         /*
2560          * Take the device list mutex to prevent races with the final phase of
2561          * a device replace operation that replaces the device object associated
2562          * with the map's stripes, because the device object's id can change
2563          * at any time during that final phase of the device replace operation
2564          * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2565          * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2566          * resulting in persisting a device extent item with such ID.
2567          */
2568         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2569         for (i = 0; i < map->num_stripes; i++) {
2570                 device = map->stripes[i].dev;
2571                 dev_offset = map->stripes[i].physical;
2572
2573                 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2574                                        stripe_size);
2575                 if (ret)
2576                         break;
2577         }
2578         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2579
2580         free_extent_map(em);
2581         return ret;
2582 }
2583
2584 /*
2585  * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2586  * chunk allocation.
2587  *
2588  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2589  * phases.
2590  */
2591 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2592 {
2593         struct btrfs_fs_info *fs_info = trans->fs_info;
2594         struct btrfs_block_group *block_group;
2595         int ret = 0;
2596
2597         while (!list_empty(&trans->new_bgs)) {
2598                 int index;
2599
2600                 block_group = list_first_entry(&trans->new_bgs,
2601                                                struct btrfs_block_group,
2602                                                bg_list);
2603                 if (ret)
2604                         goto next;
2605
2606                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
2607
2608                 ret = insert_block_group_item(trans, block_group);
2609                 if (ret)
2610                         btrfs_abort_transaction(trans, ret);
2611                 if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
2612                               &block_group->runtime_flags)) {
2613                         mutex_lock(&fs_info->chunk_mutex);
2614                         ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2615                         mutex_unlock(&fs_info->chunk_mutex);
2616                         if (ret)
2617                                 btrfs_abort_transaction(trans, ret);
2618                 }
2619                 ret = insert_dev_extents(trans, block_group->start,
2620                                          block_group->length);
2621                 if (ret)
2622                         btrfs_abort_transaction(trans, ret);
2623                 add_block_group_free_space(trans, block_group);
2624
2625                 /*
2626                  * If we restriped during balance, we may have added a new raid
2627                  * type, so now add the sysfs entries when it is safe to do so.
2628                  * We don't have to worry about locking here as it's handled in
2629                  * btrfs_sysfs_add_block_group_type.
2630                  */
2631                 if (block_group->space_info->block_group_kobjs[index] == NULL)
2632                         btrfs_sysfs_add_block_group_type(block_group);
2633
2634                 /* Already aborted the transaction if it failed. */
2635 next:
2636                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2637                 list_del_init(&block_group->bg_list);
2638         }
2639         btrfs_trans_release_chunk_metadata(trans);
2640 }
2641
2642 /*
2643  * For extent tree v2 we use the block_group_item->chunk_offset to point at our
2644  * global root id.  For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2645  */
2646 static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
2647 {
2648         u64 div = SZ_1G;
2649         u64 index;
2650
2651         if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2652                 return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2653
2654         /* If we have a smaller fs index based on 128MiB. */
2655         if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2656                 div = SZ_128M;
2657
2658         offset = div64_u64(offset, div);
2659         div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2660         return index;
2661 }
2662
2663 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2664                                                  u64 type,
2665                                                  u64 chunk_offset, u64 size)
2666 {
2667         struct btrfs_fs_info *fs_info = trans->fs_info;
2668         struct btrfs_block_group *cache;
2669         int ret;
2670
2671         btrfs_set_log_full_commit(trans);
2672
2673         cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2674         if (!cache)
2675                 return ERR_PTR(-ENOMEM);
2676
2677         cache->length = size;
2678         set_free_space_tree_thresholds(cache);
2679         cache->flags = type;
2680         cache->cached = BTRFS_CACHE_FINISHED;
2681         cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2682
2683         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2684                 set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
2685
2686         ret = btrfs_load_block_group_zone_info(cache, true);
2687         if (ret) {
2688                 btrfs_put_block_group(cache);
2689                 return ERR_PTR(ret);
2690         }
2691
2692         ret = exclude_super_stripes(cache);
2693         if (ret) {
2694                 /* We may have excluded something, so call this just in case */
2695                 btrfs_free_excluded_extents(cache);
2696                 btrfs_put_block_group(cache);
2697                 return ERR_PTR(ret);
2698         }
2699
2700         add_new_free_space(cache, chunk_offset, chunk_offset + size);
2701
2702         btrfs_free_excluded_extents(cache);
2703
2704         /*
2705          * Ensure the corresponding space_info object is created and
2706          * assigned to our block group. We want our bg to be added to the rbtree
2707          * with its ->space_info set.
2708          */
2709         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2710         ASSERT(cache->space_info);
2711
2712         ret = btrfs_add_block_group_cache(fs_info, cache);
2713         if (ret) {
2714                 btrfs_remove_free_space_cache(cache);
2715                 btrfs_put_block_group(cache);
2716                 return ERR_PTR(ret);
2717         }
2718
2719         /*
2720          * Now that our block group has its ->space_info set and is inserted in
2721          * the rbtree, update the space info's counters.
2722          */
2723         trace_btrfs_add_block_group(fs_info, cache, 1);
2724         btrfs_add_bg_to_space_info(fs_info, cache);
2725         btrfs_update_global_block_rsv(fs_info);
2726
2727 #ifdef CONFIG_BTRFS_DEBUG
2728         if (btrfs_should_fragment_free_space(cache)) {
2729                 cache->space_info->bytes_used += size >> 1;
2730                 fragment_free_space(cache);
2731         }
2732 #endif
2733
2734         list_add_tail(&cache->bg_list, &trans->new_bgs);
2735         trans->delayed_ref_updates++;
2736         btrfs_update_delayed_refs_rsv(trans);
2737
2738         set_avail_alloc_bits(fs_info, type);
2739         return cache;
2740 }
2741
2742 /*
2743  * Mark one block group RO, can be called several times for the same block
2744  * group.
2745  *
2746  * @cache:              the destination block group
2747  * @do_chunk_alloc:     whether need to do chunk pre-allocation, this is to
2748  *                      ensure we still have some free space after marking this
2749  *                      block group RO.
2750  */
2751 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2752                              bool do_chunk_alloc)
2753 {
2754         struct btrfs_fs_info *fs_info = cache->fs_info;
2755         struct btrfs_trans_handle *trans;
2756         struct btrfs_root *root = btrfs_block_group_root(fs_info);
2757         u64 alloc_flags;
2758         int ret;
2759         bool dirty_bg_running;
2760
2761         /*
2762          * This can only happen when we are doing read-only scrub on read-only
2763          * mount.
2764          * In that case we should not start a new transaction on read-only fs.
2765          * Thus here we skip all chunk allocations.
2766          */
2767         if (sb_rdonly(fs_info->sb)) {
2768                 mutex_lock(&fs_info->ro_block_group_mutex);
2769                 ret = inc_block_group_ro(cache, 0);
2770                 mutex_unlock(&fs_info->ro_block_group_mutex);
2771                 return ret;
2772         }
2773
2774         do {
2775                 trans = btrfs_join_transaction(root);
2776                 if (IS_ERR(trans))
2777                         return PTR_ERR(trans);
2778
2779                 dirty_bg_running = false;
2780
2781                 /*
2782                  * We're not allowed to set block groups readonly after the dirty
2783                  * block group cache has started writing.  If it already started,
2784                  * back off and let this transaction commit.
2785                  */
2786                 mutex_lock(&fs_info->ro_block_group_mutex);
2787                 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2788                         u64 transid = trans->transid;
2789
2790                         mutex_unlock(&fs_info->ro_block_group_mutex);
2791                         btrfs_end_transaction(trans);
2792
2793                         ret = btrfs_wait_for_commit(fs_info, transid);
2794                         if (ret)
2795                                 return ret;
2796                         dirty_bg_running = true;
2797                 }
2798         } while (dirty_bg_running);
2799
2800         if (do_chunk_alloc) {
2801                 /*
2802                  * If we are changing raid levels, try to allocate a
2803                  * corresponding block group with the new raid level.
2804                  */
2805                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2806                 if (alloc_flags != cache->flags) {
2807                         ret = btrfs_chunk_alloc(trans, alloc_flags,
2808                                                 CHUNK_ALLOC_FORCE);
2809                         /*
2810                          * ENOSPC is allowed here, we may have enough space
2811                          * already allocated at the new raid level to carry on
2812                          */
2813                         if (ret == -ENOSPC)
2814                                 ret = 0;
2815                         if (ret < 0)
2816                                 goto out;
2817                 }
2818         }
2819
2820         ret = inc_block_group_ro(cache, 0);
2821         if (!do_chunk_alloc || ret == -ETXTBSY)
2822                 goto unlock_out;
2823         if (!ret)
2824                 goto out;
2825         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2826         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2827         if (ret < 0)
2828                 goto out;
2829         /*
2830          * We have allocated a new chunk. We also need to activate that chunk to
2831          * grant metadata tickets for zoned filesystem.
2832          */
2833         ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
2834         if (ret < 0)
2835                 goto out;
2836
2837         ret = inc_block_group_ro(cache, 0);
2838         if (ret == -ETXTBSY)
2839                 goto unlock_out;
2840 out:
2841         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2842                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2843                 mutex_lock(&fs_info->chunk_mutex);
2844                 check_system_chunk(trans, alloc_flags);
2845                 mutex_unlock(&fs_info->chunk_mutex);
2846         }
2847 unlock_out:
2848         mutex_unlock(&fs_info->ro_block_group_mutex);
2849
2850         btrfs_end_transaction(trans);
2851         return ret;
2852 }
2853
2854 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
2855 {
2856         struct btrfs_space_info *sinfo = cache->space_info;
2857         u64 num_bytes;
2858
2859         BUG_ON(!cache->ro);
2860
2861         spin_lock(&sinfo->lock);
2862         spin_lock(&cache->lock);
2863         if (!--cache->ro) {
2864                 if (btrfs_is_zoned(cache->fs_info)) {
2865                         /* Migrate zone_unusable bytes back */
2866                         cache->zone_unusable =
2867                                 (cache->alloc_offset - cache->used) +
2868                                 (cache->length - cache->zone_capacity);
2869                         sinfo->bytes_zone_unusable += cache->zone_unusable;
2870                         sinfo->bytes_readonly -= cache->zone_unusable;
2871                 }
2872                 num_bytes = cache->length - cache->reserved -
2873                             cache->pinned - cache->bytes_super -
2874                             cache->zone_unusable - cache->used;
2875                 sinfo->bytes_readonly -= num_bytes;
2876                 list_del_init(&cache->ro_list);
2877         }
2878         spin_unlock(&cache->lock);
2879         spin_unlock(&sinfo->lock);
2880 }
2881
2882 static int update_block_group_item(struct btrfs_trans_handle *trans,
2883                                    struct btrfs_path *path,
2884                                    struct btrfs_block_group *cache)
2885 {
2886         struct btrfs_fs_info *fs_info = trans->fs_info;
2887         int ret;
2888         struct btrfs_root *root = btrfs_block_group_root(fs_info);
2889         unsigned long bi;
2890         struct extent_buffer *leaf;
2891         struct btrfs_block_group_item bgi;
2892         struct btrfs_key key;
2893         u64 old_commit_used;
2894         u64 used;
2895
2896         /*
2897          * Block group items update can be triggered out of commit transaction
2898          * critical section, thus we need a consistent view of used bytes.
2899          * We cannot use cache->used directly outside of the spin lock, as it
2900          * may be changed.
2901          */
2902         spin_lock(&cache->lock);
2903         old_commit_used = cache->commit_used;
2904         used = cache->used;
2905         /* No change in used bytes, can safely skip it. */
2906         if (cache->commit_used == used) {
2907                 spin_unlock(&cache->lock);
2908                 return 0;
2909         }
2910         cache->commit_used = used;
2911         spin_unlock(&cache->lock);
2912
2913         key.objectid = cache->start;
2914         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2915         key.offset = cache->length;
2916
2917         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2918         if (ret) {
2919                 if (ret > 0)
2920                         ret = -ENOENT;
2921                 goto fail;
2922         }
2923
2924         leaf = path->nodes[0];
2925         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2926         btrfs_set_stack_block_group_used(&bgi, used);
2927         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2928                                                    cache->global_root_id);
2929         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
2930         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
2931         btrfs_mark_buffer_dirty(leaf);
2932 fail:
2933         btrfs_release_path(path);
2934         /* We didn't update the block group item, need to revert @commit_used. */
2935         if (ret < 0) {
2936                 spin_lock(&cache->lock);
2937                 cache->commit_used = old_commit_used;
2938                 spin_unlock(&cache->lock);
2939         }
2940         return ret;
2941
2942 }
2943
2944 static int cache_save_setup(struct btrfs_block_group *block_group,
2945                             struct btrfs_trans_handle *trans,
2946                             struct btrfs_path *path)
2947 {
2948         struct btrfs_fs_info *fs_info = block_group->fs_info;
2949         struct btrfs_root *root = fs_info->tree_root;
2950         struct inode *inode = NULL;
2951         struct extent_changeset *data_reserved = NULL;
2952         u64 alloc_hint = 0;
2953         int dcs = BTRFS_DC_ERROR;
2954         u64 cache_size = 0;
2955         int retries = 0;
2956         int ret = 0;
2957
2958         if (!btrfs_test_opt(fs_info, SPACE_CACHE))
2959                 return 0;
2960
2961         /*
2962          * If this block group is smaller than 100 megs don't bother caching the
2963          * block group.
2964          */
2965         if (block_group->length < (100 * SZ_1M)) {
2966                 spin_lock(&block_group->lock);
2967                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2968                 spin_unlock(&block_group->lock);
2969                 return 0;
2970         }
2971
2972         if (TRANS_ABORTED(trans))
2973                 return 0;
2974 again:
2975         inode = lookup_free_space_inode(block_group, path);
2976         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2977                 ret = PTR_ERR(inode);
2978                 btrfs_release_path(path);
2979                 goto out;
2980         }
2981
2982         if (IS_ERR(inode)) {
2983                 BUG_ON(retries);
2984                 retries++;
2985
2986                 if (block_group->ro)
2987                         goto out_free;
2988
2989                 ret = create_free_space_inode(trans, block_group, path);
2990                 if (ret)
2991                         goto out_free;
2992                 goto again;
2993         }
2994
2995         /*
2996          * We want to set the generation to 0, that way if anything goes wrong
2997          * from here on out we know not to trust this cache when we load up next
2998          * time.
2999          */
3000         BTRFS_I(inode)->generation = 0;
3001         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3002         if (ret) {
3003                 /*
3004                  * So theoretically we could recover from this, simply set the
3005                  * super cache generation to 0 so we know to invalidate the
3006                  * cache, but then we'd have to keep track of the block groups
3007                  * that fail this way so we know we _have_ to reset this cache
3008                  * before the next commit or risk reading stale cache.  So to
3009                  * limit our exposure to horrible edge cases lets just abort the
3010                  * transaction, this only happens in really bad situations
3011                  * anyway.
3012                  */
3013                 btrfs_abort_transaction(trans, ret);
3014                 goto out_put;
3015         }
3016         WARN_ON(ret);
3017
3018         /* We've already setup this transaction, go ahead and exit */
3019         if (block_group->cache_generation == trans->transid &&
3020             i_size_read(inode)) {
3021                 dcs = BTRFS_DC_SETUP;
3022                 goto out_put;
3023         }
3024
3025         if (i_size_read(inode) > 0) {
3026                 ret = btrfs_check_trunc_cache_free_space(fs_info,
3027                                         &fs_info->global_block_rsv);
3028                 if (ret)
3029                         goto out_put;
3030
3031                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3032                 if (ret)
3033                         goto out_put;
3034         }
3035
3036         spin_lock(&block_group->lock);
3037         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3038             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3039                 /*
3040                  * don't bother trying to write stuff out _if_
3041                  * a) we're not cached,
3042                  * b) we're with nospace_cache mount option,
3043                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
3044                  */
3045                 dcs = BTRFS_DC_WRITTEN;
3046                 spin_unlock(&block_group->lock);
3047                 goto out_put;
3048         }
3049         spin_unlock(&block_group->lock);
3050
3051         /*
3052          * We hit an ENOSPC when setting up the cache in this transaction, just
3053          * skip doing the setup, we've already cleared the cache so we're safe.
3054          */
3055         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3056                 ret = -ENOSPC;
3057                 goto out_put;
3058         }
3059
3060         /*
3061          * Try to preallocate enough space based on how big the block group is.
3062          * Keep in mind this has to include any pinned space which could end up
3063          * taking up quite a bit since it's not folded into the other space
3064          * cache.
3065          */
3066         cache_size = div_u64(block_group->length, SZ_256M);
3067         if (!cache_size)
3068                 cache_size = 1;
3069
3070         cache_size *= 16;
3071         cache_size *= fs_info->sectorsize;
3072
3073         ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
3074                                           cache_size, false);
3075         if (ret)
3076                 goto out_put;
3077
3078         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
3079                                               cache_size, cache_size,
3080                                               &alloc_hint);
3081         /*
3082          * Our cache requires contiguous chunks so that we don't modify a bunch
3083          * of metadata or split extents when writing the cache out, which means
3084          * we can enospc if we are heavily fragmented in addition to just normal
3085          * out of space conditions.  So if we hit this just skip setting up any
3086          * other block groups for this transaction, maybe we'll unpin enough
3087          * space the next time around.
3088          */
3089         if (!ret)
3090                 dcs = BTRFS_DC_SETUP;
3091         else if (ret == -ENOSPC)
3092                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3093
3094 out_put:
3095         iput(inode);
3096 out_free:
3097         btrfs_release_path(path);
3098 out:
3099         spin_lock(&block_group->lock);
3100         if (!ret && dcs == BTRFS_DC_SETUP)
3101                 block_group->cache_generation = trans->transid;
3102         block_group->disk_cache_state = dcs;
3103         spin_unlock(&block_group->lock);
3104
3105         extent_changeset_free(data_reserved);
3106         return ret;
3107 }
3108
3109 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3110 {
3111         struct btrfs_fs_info *fs_info = trans->fs_info;
3112         struct btrfs_block_group *cache, *tmp;
3113         struct btrfs_transaction *cur_trans = trans->transaction;
3114         struct btrfs_path *path;
3115
3116         if (list_empty(&cur_trans->dirty_bgs) ||
3117             !btrfs_test_opt(fs_info, SPACE_CACHE))
3118                 return 0;
3119
3120         path = btrfs_alloc_path();
3121         if (!path)
3122                 return -ENOMEM;
3123
3124         /* Could add new block groups, use _safe just in case */
3125         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3126                                  dirty_list) {
3127                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3128                         cache_save_setup(cache, trans, path);
3129         }
3130
3131         btrfs_free_path(path);
3132         return 0;
3133 }
3134
/*
 * Transaction commit does final block group cache writeback during a critical
 * section where nothing is allowed to change the FS.  This is required in
 * order for the cache to actually match the block group, but can introduce a
 * lot of latency into the commit.
 *
 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
 * There's a chance we'll have to redo some of it if the block group changes
 * again during the commit, but it greatly reduces the commit latency by
 * getting rid of the easy block groups while we're still allowing others to
 * join the commit.
 *
 * Return: 0 on success or a negative errno.  On failure, block groups still on
 * the local list are spliced back onto the transaction's dirty list and
 * btrfs_cleanup_dirty_bgs() is called.
 */
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
        int should_put;
        struct btrfs_path *path = NULL;
        LIST_HEAD(dirty);
        struct list_head *io = &cur_trans->io_bgs;
        int loops = 0;

        /*
         * Take over the current dirty list.  Block groups dirtied from now on
         * land on cur_trans->dirty_bgs again and are picked up by the second
         * pass at the bottom of this function.
         */
        spin_lock(&cur_trans->dirty_bgs_lock);
        if (list_empty(&cur_trans->dirty_bgs)) {
                spin_unlock(&cur_trans->dirty_bgs_lock);
                return 0;
        }
        list_splice_init(&cur_trans->dirty_bgs, &dirty);
        spin_unlock(&cur_trans->dirty_bgs_lock);

again:
        /* Make sure all the block groups on our dirty list actually exist */
        btrfs_create_pending_block_groups(trans);

        /* The path is allocated once and reused for the second pass. */
        if (!path) {
                path = btrfs_alloc_path();
                if (!path) {
                        ret = -ENOMEM;
                        goto out;
                }
        }

        /*
         * cache_write_mutex is here only to save us from balance or automatic
         * removal of empty block groups deleting this block group while we are
         * writing out the cache
         */
        mutex_lock(&trans->transaction->cache_write_mutex);
        while (!list_empty(&dirty)) {
                /*
                 * Cleared only when the block group is put back on the dirty
                 * list below, in which case its reserve is kept.
                 */
                bool drop_reserve = true;

                cache = list_first_entry(&dirty, struct btrfs_block_group,
                                         dirty_list);
                /*
                 * This can happen if something re-dirties a block group that
                 * is already under IO.  Just wait for it to finish and then do
                 * it all again
                 */
                if (!list_empty(&cache->io_list)) {
                        list_del_init(&cache->io_list);
                        btrfs_wait_cache_io(trans, cache, path);
                        btrfs_put_block_group(cache);
                }


                /*
                 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
                 * it should update the cache_state.  Don't delete until after
                 * we wait.
                 *
                 * Since we're not running in the commit critical section
                 * we need the dirty_bgs_lock to protect from update_block_group
                 */
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_del_init(&cache->dirty_list);
                spin_unlock(&cur_trans->dirty_bgs_lock);

                /* Stays 1 unless the block group is moved to the io list. */
                should_put = 1;

                /* Result is read from cache->disk_cache_state below. */
                cache_save_setup(cache, trans, path);

                if (cache->disk_cache_state == BTRFS_DC_SETUP) {
                        /* Lets us tell below whether IO was actually started. */
                        cache->io_ctl.inode = NULL;
                        ret = btrfs_write_out_cache(trans, cache, path);
                        if (ret == 0 && cache->io_ctl.inode) {
                                should_put = 0;

                                /*
                                 * The cache_write_mutex is protecting the
                                 * io_list, also refer to the definition of
                                 * btrfs_transaction::io_bgs for more details
                                 */
                                list_add_tail(&cache->io_list, io);
                        } else {
                                /*
                                 * If we failed to write the cache, the
                                 * generation will be bad and life goes on
                                 */
                                ret = 0;
                        }
                }
                if (!ret) {
                        ret = update_block_group_item(trans, path, cache);
                        /*
                         * Our block group might still be attached to the list
                         * of new block groups in the transaction handle of some
                         * other task (struct btrfs_trans_handle->new_bgs). This
                         * means its block group item isn't yet in the extent
                         * tree. If this happens ignore the error, as we will
                         * try again later in the critical section of the
                         * transaction commit.
                         */
                        if (ret == -ENOENT) {
                                ret = 0;
                                spin_lock(&cur_trans->dirty_bgs_lock);
                                /*
                                 * Re-queue it (with a new reference) unless
                                 * somebody else already did.
                                 */
                                if (list_empty(&cache->dirty_list)) {
                                        list_add_tail(&cache->dirty_list,
                                                      &cur_trans->dirty_bgs);
                                        btrfs_get_block_group(cache);
                                        drop_reserve = false;
                                }
                                spin_unlock(&cur_trans->dirty_bgs_lock);
                        } else if (ret) {
                                btrfs_abort_transaction(trans, ret);
                        }
                }

                /* If it's not on the io list, we need to put the block group */
                if (should_put)
                        btrfs_put_block_group(cache);
                if (drop_reserve)
                        btrfs_delayed_refs_rsv_release(fs_info, 1);
                /*
                 * Avoid blocking other tasks for too long. It might even save
                 * us from writing caches for block groups that are going to be
                 * removed.
                 */
                mutex_unlock(&trans->transaction->cache_write_mutex);
                /* Bail out with the mutex already dropped. */
                if (ret)
                        goto out;
                mutex_lock(&trans->transaction->cache_write_mutex);
        }
        mutex_unlock(&trans->transaction->cache_write_mutex);

        /*
         * Go through delayed refs for all the stuff we've just kicked off
         * and then loop back (just once)
         */
        if (!ret)
                ret = btrfs_run_delayed_refs(trans, 0);
        if (!ret && loops == 0) {
                loops++;
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_splice_init(&cur_trans->dirty_bgs, &dirty);
                /*
                 * dirty_bgs_lock protects us from concurrent block group
                 * deletes too (not just cache_write_mutex).
                 */
                if (!list_empty(&dirty)) {
                        spin_unlock(&cur_trans->dirty_bgs_lock);
                        goto again;
                }
                spin_unlock(&cur_trans->dirty_bgs_lock);
        }
out:
        if (ret < 0) {
                /* Hand back what we did not process before cleaning up. */
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_splice_init(&dirty, &cur_trans->dirty_bgs);
                spin_unlock(&cur_trans->dirty_bgs_lock);
                btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
        }

        btrfs_free_path(path);
        return ret;
}
3312
/*
 * Process every block group on the transaction's dirty list: set up and write
 * out its space cache, update its block group item, and finally wait for all
 * cache IO queued on the transaction's io_bgs list.
 *
 * The dirty list is always drained completely.  Once an error has been
 * recorded in ret, later iterations skip running delayed refs, cache writeout
 * and the item update, but still drop the block group reference and release
 * one unit of the delayed refs reserve.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, or the first
 * error hit while processing the list.
 */
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
        int should_put;
        struct btrfs_path *path;
        struct list_head *io = &cur_trans->io_bgs;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * Even though we are in the critical section of the transaction commit,
         * we can still have concurrent tasks adding elements to this
         * transaction's list of dirty block groups. These tasks correspond to
         * endio free space workers started when writeback finishes for a
         * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
         * allocate new block groups as a result of COWing nodes of the root
         * tree when updating the free space inode. The writeback for the space
         * caches is triggered by an earlier call to
         * btrfs_start_dirty_block_groups() and iterations of the following
         * loop.
         * Also we want to do the cache_save_setup first and then run the
         * delayed refs to make sure we have the best chance at doing this all
         * in one shot.
         */
        spin_lock(&cur_trans->dirty_bgs_lock);
        while (!list_empty(&cur_trans->dirty_bgs)) {
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group,
                                         dirty_list);

                /*
                 * This can happen if cache_save_setup re-dirties a block group
                 * that is already under IO.  Just wait for it to finish and
                 * then do it all again
                 */
                if (!list_empty(&cache->io_list)) {
                        /* The wait is done with the spinlock dropped. */
                        spin_unlock(&cur_trans->dirty_bgs_lock);
                        list_del_init(&cache->io_list);
                        btrfs_wait_cache_io(trans, cache, path);
                        btrfs_put_block_group(cache);
                        spin_lock(&cur_trans->dirty_bgs_lock);
                }

                /*
                 * Don't remove from the dirty list until after we've waited on
                 * any pending IO
                 */
                list_del_init(&cache->dirty_list);
                spin_unlock(&cur_trans->dirty_bgs_lock);
                /* Stays 1 unless the block group is moved to the io list. */
                should_put = 1;

                /* Result is read from cache->disk_cache_state below. */
                cache_save_setup(cache, trans, path);

                if (!ret)
                        ret = btrfs_run_delayed_refs(trans,
                                                     (unsigned long) -1);

                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
                        /* Lets us tell below whether IO was actually started. */
                        cache->io_ctl.inode = NULL;
                        ret = btrfs_write_out_cache(trans, cache, path);
                        if (ret == 0 && cache->io_ctl.inode) {
                                should_put = 0;
                                list_add_tail(&cache->io_list, io);
                        } else {
                                /*
                                 * If we failed to write the cache, the
                                 * generation will be bad and life goes on
                                 */
                                ret = 0;
                        }
                }
                if (!ret) {
                        ret = update_block_group_item(trans, path, cache);
                        /*
                         * One of the free space endio workers might have
                         * created a new block group while updating a free space
                         * cache's inode (at inode.c:btrfs_finish_ordered_io())
                         * and hasn't released its transaction handle yet, in
                         * which case the new block group is still attached to
                         * its transaction handle and its creation has not
                         * finished yet (no block group item in the extent tree
                         * yet, etc). If this is the case, wait for all free
                         * space endio workers to finish and retry. This is a
                         * very rare case so no need for a more efficient and
                         * complex approach.
                         */
                        if (ret == -ENOENT) {
                                wait_event(cur_trans->writer_wait,
                                   atomic_read(&cur_trans->num_writers) == 1);
                                ret = update_block_group_item(trans, path, cache);
                        }
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
                }

                /* If its not on the io list, we need to put the block group */
                if (should_put)
                        btrfs_put_block_group(cache);
                btrfs_delayed_refs_rsv_release(fs_info, 1);
                /* Retake the lock for the loop condition. */
                spin_lock(&cur_trans->dirty_bgs_lock);
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);

        /*
         * Refer to the definition of io_bgs member for details why it's safe
         * to use it without any locking
         */
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group,
                                         io_list);
                list_del_init(&cache->io_list);
                btrfs_wait_cache_io(trans, cache, path);
                btrfs_put_block_group(cache);
        }

        btrfs_free_path(path);
        return ret;
}
3436
/*
 * Account for a byte range being allocated or freed in the super block, in the
 * block group(s) containing it and in their space info.
 *
 * @trans:      transaction handle; every touched block group is queued on the
 *              dirty_bgs list of its transaction
 * @bytenr:     start of the byte range
 * @num_bytes:  length of the byte range
 * @alloc:      true to account an allocation, false to account a free
 *
 * The range is handled one block group at a time, so it may cross block group
 * boundaries.
 *
 * Return: 0 on success, -ENOENT if btrfs_lookup_block_group() finds no block
 * group for (the remainder of) the range.
 */
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
                             u64 bytenr, u64 num_bytes, bool alloc)
{
        struct btrfs_fs_info *info = trans->fs_info;
        struct btrfs_block_group *cache = NULL;
        u64 total = num_bytes;
        u64 old_val;
        u64 byte_in_group;
        int factor;
        int ret = 0;

        /*
         * Block accounting for super block
         *
         * NOTE(review): this is adjusted by the full @num_bytes up front and
         * is not rolled back if the loop below stops early with -ENOENT.
         */
        spin_lock(&info->delalloc_root_lock);
        old_val = btrfs_super_bytes_used(info->super_copy);
        if (alloc)
                old_val += num_bytes;
        else
                old_val -= num_bytes;
        btrfs_set_super_bytes_used(info->super_copy, old_val);
        spin_unlock(&info->delalloc_root_lock);

        /* Each iteration handles the part of the range in one block group. */
        while (total) {
                struct btrfs_space_info *space_info;
                bool reclaim = false;

                cache = btrfs_lookup_block_group(info, bytenr);
                if (!cache) {
                        ret = -ENOENT;
                        break;
                }
                space_info = cache->space_info;
                factor = btrfs_bg_type_to_factor(cache->flags);

                /*
                 * If this block group has free space cache written out, we
                 * need to make sure to load it if we are removing space.  This
                 * is because we need the unpinning stage to actually add the
                 * space back to the block group, otherwise we will leak space.
                 */
                if (!alloc && !btrfs_block_group_done(cache))
                        btrfs_cache_block_group(cache, true);

                byte_in_group = bytenr - cache->start;
                WARN_ON(byte_in_group > cache->length);

                /* Lock order: space_info->lock first, then cache->lock. */
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);

                if (btrfs_test_opt(info, SPACE_CACHE) &&
                    cache->disk_cache_state < BTRFS_DC_CLEAR)
                        cache->disk_cache_state = BTRFS_DC_CLEAR;

                /* From here on old_val tracks this block group's used bytes. */
                old_val = cache->used;
                /* Clamp to what is left of this block group. */
                num_bytes = min(total, cache->length - byte_in_group);
                if (alloc) {
                        /* Reserved bytes become used bytes. */
                        old_val += num_bytes;
                        cache->used = old_val;
                        cache->reserved -= num_bytes;
                        space_info->bytes_reserved -= num_bytes;
                        space_info->bytes_used += num_bytes;
                        space_info->disk_used += num_bytes * factor;
                        spin_unlock(&cache->lock);
                        spin_unlock(&space_info->lock);
                } else {
                        /* Used bytes become pinned bytes. */
                        old_val -= num_bytes;
                        cache->used = old_val;
                        cache->pinned += num_bytes;
                        btrfs_space_info_update_bytes_pinned(info, space_info,
                                                             num_bytes);
                        space_info->bytes_used -= num_bytes;
                        space_info->disk_used -= num_bytes * factor;

                        /* Evaluated while both locks are still held. */
                        reclaim = should_reclaim_block_group(cache, num_bytes);

                        spin_unlock(&cache->lock);
                        spin_unlock(&space_info->lock);

                        /* Record the range in the transaction's pinned extents. */
                        set_extent_dirty(&trans->transaction->pinned_extents,
                                         bytenr, bytenr + num_bytes - 1,
                                         GFP_NOFS | __GFP_NOFAIL);
                }

                /*
                 * Queue the block group on the transaction's dirty list unless
                 * it is already on a dirty list.  The list entry holds its own
                 * reference to the block group.
                 */
                spin_lock(&trans->transaction->dirty_bgs_lock);
                if (list_empty(&cache->dirty_list)) {
                        list_add_tail(&cache->dirty_list,
                                      &trans->transaction->dirty_bgs);
                        trans->delayed_ref_updates++;
                        btrfs_get_block_group(cache);
                }
                spin_unlock(&trans->transaction->dirty_bgs_lock);

                /*
                 * No longer have used bytes in this block group, queue it for
                 * deletion. We do this after adding the block group to the
                 * dirty list to avoid races between cleaner kthread and space
                 * cache writeout.
                 */
                if (!alloc && old_val == 0) {
                        if (!btrfs_test_opt(info, DISCARD_ASYNC))
                                btrfs_mark_bg_unused(cache);
                } else if (!alloc && reclaim) {
                        btrfs_mark_bg_to_reclaim(cache);
                }

                /* Drop the reference taken by the lookup and move on. */
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
        }

        /* Modified block groups are accounted for in the delayed_refs_rsv. */
        btrfs_update_delayed_refs_rsv(trans);
        return ret;
}
3550
3551 /*
3552  * Update the block_group and space info counters.
3553  *
3554  * @cache:      The cache we are manipulating
3555  * @ram_bytes:  The number of bytes of file content, and will be same to
3556  *              @num_bytes except for the compress path.
3557  * @num_bytes:  The number of bytes in question
3558  * @delalloc:   The blocks are allocated for the delalloc write
3559  *
3560  * This is called by the allocator when it reserves space. If this is a
3561  * reservation and the block group has become read only we cannot make the
3562  * reservation and return -EAGAIN, otherwise this function always succeeds.
3563  */
3564 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3565                              u64 ram_bytes, u64 num_bytes, int delalloc,
3566                              bool force_wrong_size_class)
3567 {
3568         struct btrfs_space_info *space_info = cache->space_info;
3569         enum btrfs_block_group_size_class size_class;
3570         int ret = 0;
3571
3572         spin_lock(&space_info->lock);
3573         spin_lock(&cache->lock);
3574         if (cache->ro) {
3575                 ret = -EAGAIN;
3576                 goto out;
3577         }
3578
3579         if (btrfs_block_group_should_use_size_class(cache)) {
3580                 size_class = btrfs_calc_block_group_size_class(num_bytes);
3581                 ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
3582                 if (ret)
3583                         goto out;
3584         }
3585         cache->reserved += num_bytes;
3586         space_info->bytes_reserved += num_bytes;
3587         trace_btrfs_space_reservation(cache->fs_info, "space_info",
3588                                       space_info->flags, num_bytes, 1);
3589         btrfs_space_info_update_bytes_may_use(cache->fs_info,
3590                                               space_info, -ram_bytes);
3591         if (delalloc)
3592                 cache->delalloc_bytes += num_bytes;
3593
3594         /*
3595          * Compression can use less space than we reserved, so wake tickets if
3596          * that happens.
3597          */
3598         if (num_bytes < ram_bytes)
3599                 btrfs_try_granting_tickets(cache->fs_info, space_info);
3600 out:
3601         spin_unlock(&cache->lock);
3602         spin_unlock(&space_info->lock);
3603         return ret;
3604 }
3605
3606 /*
3607  * Update the block_group and space info counters.
3608  *
3609  * @cache:      The cache we are manipulating
3610  * @num_bytes:  The number of bytes in question
3611  * @delalloc:   The blocks are allocated for the delalloc write
3612  *
3613  * This is called by somebody who is freeing space that was never actually used
3614  * on disk.  For example if you reserve some space for a new leaf in transaction
3615  * A and before transaction A commits you free that leaf, you call this with
3616  * reserve set to 0 in order to clear the reservation.
3617  */
3618 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
3619                                u64 num_bytes, int delalloc)
3620 {
3621         struct btrfs_space_info *space_info = cache->space_info;
3622
3623         spin_lock(&space_info->lock);
3624         spin_lock(&cache->lock);
3625         if (cache->ro)
3626                 space_info->bytes_readonly += num_bytes;
3627         cache->reserved -= num_bytes;
3628         space_info->bytes_reserved -= num_bytes;
3629         space_info->max_extent_size = 0;
3630
3631         if (delalloc)
3632                 cache->delalloc_bytes -= num_bytes;
3633         spin_unlock(&cache->lock);
3634
3635         btrfs_try_granting_tickets(cache->fs_info, space_info);
3636         spin_unlock(&space_info->lock);
3637 }
3638
3639 static void force_metadata_allocation(struct btrfs_fs_info *info)
3640 {
3641         struct list_head *head = &info->space_info;
3642         struct btrfs_space_info *found;
3643
3644         list_for_each_entry(found, head, list) {
3645                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3646                         found->force_alloc = CHUNK_ALLOC_FORCE;
3647         }
3648 }
3649
3650 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3651                               struct btrfs_space_info *sinfo, int force)
3652 {
3653         u64 bytes_used = btrfs_space_info_used(sinfo, false);
3654         u64 thresh;
3655
3656         if (force == CHUNK_ALLOC_FORCE)
3657                 return 1;
3658
3659         /*
3660          * in limited mode, we want to have some free space up to
3661          * about 1% of the FS size.
3662          */
3663         if (force == CHUNK_ALLOC_LIMITED) {
3664                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3665                 thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
3666
3667                 if (sinfo->total_bytes - bytes_used < thresh)
3668                         return 1;
3669         }
3670
3671         if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
3672                 return 0;
3673         return 1;
3674 }
3675
3676 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3677 {
3678         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3679
3680         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3681 }
3682
3683 static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
3684 {
3685         struct btrfs_block_group *bg;
3686         int ret;
3687
3688         /*
3689          * Check if we have enough space in the system space info because we
3690          * will need to update device items in the chunk btree and insert a new
3691          * chunk item in the chunk btree as well. This will allocate a new
3692          * system block group if needed.
3693          */
3694         check_system_chunk(trans, flags);
3695
3696         bg = btrfs_create_chunk(trans, flags);
3697         if (IS_ERR(bg)) {
3698                 ret = PTR_ERR(bg);
3699                 goto out;
3700         }
3701
3702         ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3703         /*
3704          * Normally we are not expected to fail with -ENOSPC here, since we have
3705          * previously reserved space in the system space_info and allocated one
3706          * new system chunk if necessary. However there are three exceptions:
3707          *
3708          * 1) We may have enough free space in the system space_info but all the
3709          *    existing system block groups have a profile which can not be used
3710          *    for extent allocation.
3711          *
3712          *    This happens when mounting in degraded mode. For example we have a
3713          *    RAID1 filesystem with 2 devices, lose one device and mount the fs
3714          *    using the other device in degraded mode. If we then allocate a chunk,
3715          *    we may have enough free space in the existing system space_info, but
3716          *    none of the block groups can be used for extent allocation since they
3717          *    have a RAID1 profile, and because we are in degraded mode with a
3718          *    single device, we are forced to allocate a new system chunk with a
3719          *    SINGLE profile. Making check_system_chunk() iterate over all system
3720          *    block groups and check if they have a usable profile and enough space
3721          *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
3722          *    try again after forcing allocation of a new system chunk. Like this
3723          *    we avoid paying the cost of that search in normal circumstances, when
3724          *    we were not mounted in degraded mode;
3725          *
3726          * 2) We had enough free space info the system space_info, and one suitable
3727          *    block group to allocate from when we called check_system_chunk()
3728          *    above. However right after we called it, the only system block group
3729          *    with enough free space got turned into RO mode by a running scrub,
3730          *    and in this case we have to allocate a new one and retry. We only
3731          *    need do this allocate and retry once, since we have a transaction
3732          *    handle and scrub uses the commit root to search for block groups;
3733          *
3734          * 3) We had one system block group with enough free space when we called
3735          *    check_system_chunk(), but after that, right before we tried to
3736          *    allocate the last extent buffer we needed, a discard operation came
3737          *    in and it temporarily removed the last free space entry from the
3738          *    block group (discard removes a free space entry, discards it, and
3739          *    then adds back the entry to the block group cache).
3740          */
3741         if (ret == -ENOSPC) {
3742                 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3743                 struct btrfs_block_group *sys_bg;
3744
3745                 sys_bg = btrfs_create_chunk(trans, sys_flags);
3746                 if (IS_ERR(sys_bg)) {
3747                         ret = PTR_ERR(sys_bg);
3748                         btrfs_abort_transaction(trans, ret);
3749                         goto out;
3750                 }
3751
3752                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3753                 if (ret) {
3754                         btrfs_abort_transaction(trans, ret);
3755                         goto out;
3756                 }
3757
3758                 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3759                 if (ret) {
3760                         btrfs_abort_transaction(trans, ret);
3761                         goto out;
3762                 }
3763         } else if (ret) {
3764                 btrfs_abort_transaction(trans, ret);
3765                 goto out;
3766         }
3767 out:
3768         btrfs_trans_release_chunk_metadata(trans);
3769
3770         if (ret)
3771                 return ERR_PTR(ret);
3772
3773         btrfs_get_block_group(bg);
3774         return bg;
3775 }
3776
3777 /*
3778  * Chunk allocation is done in 2 phases:
3779  *
3780  * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
3781  *    the chunk, the chunk mapping, create its block group and add the items
3782  *    that belong in the chunk btree to it - more specifically, we need to
3783  *    update device items in the chunk btree and add a new chunk item to it.
3784  *
3785  * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
3786  *    group item to the extent btree and the device extent items to the devices
3787  *    btree.
3788  *
3789  * This is done to prevent deadlocks. For example when COWing a node from the
3790  * extent btree we are holding a write lock on the node's parent and if we
3791  * trigger chunk allocation and attempted to insert the new block group item
3792  * in the extent btree right away, we could deadlock because the path for the
3793  * insertion can include that parent node. At first glance it seems impossible
3794  * to trigger chunk allocation after starting a transaction since tasks should
3795  * reserve enough transaction units (metadata space), however while that is true
3796  * most of the time, chunk allocation may still be triggered for several reasons:
3797  *
3798  * 1) When reserving metadata, we check if there is enough free space in the
3799  *    metadata space_info and therefore don't trigger allocation of a new chunk.
3800  *    However later when the task actually tries to COW an extent buffer from
3801  *    the extent btree or from the device btree for example, it is forced to
3802  *    allocate a new block group (chunk) because the only one that had enough
3803  *    free space was just turned to RO mode by a running scrub for example (or
3804  *    device replace, block group reclaim thread, etc), so we can not use it
3805  *    for allocating an extent and end up being forced to allocate a new one;
3806  *
3807  * 2) Because we only check that the metadata space_info has enough free bytes,
3808  *    we end up not allocating a new metadata chunk in that case. However if
3809  *    the filesystem was mounted in degraded mode, none of the existing block
3810  *    groups might be suitable for extent allocation due to their incompatible
3811  *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
3812  *    use a RAID1 profile, in degraded mode using a single device). In this case
3813  *    when the task attempts to COW some extent buffer of the extent btree for
3814  *    example, it will trigger allocation of a new metadata block group with a
3815  *    suitable profile (SINGLE profile in the example of the degraded mount of
3816  *    the RAID1 filesystem);
3817  *
3818  * 3) The task has reserved enough transaction units / metadata space, but when
3819  *    it attempts to COW an extent buffer from the extent or device btree for
3820  *    example, it does not find any free extent in any metadata block group,
3821  *    therefore forced to try to allocate a new metadata block group.
3822  *    This is because some other task allocated all available extents in the
3823  *    meanwhile - this typically happens with tasks that don't reserve space
3824  *    properly, either intentionally or as a bug. One example where this is
3825  *    done intentionally is fsync, as it does not reserve any transaction units
3826  *    and ends up allocating a variable number of metadata extents for log
3827  *    tree extent buffers;
3828  *
3829  * 4) The task has reserved enough transaction units / metadata space, but right
3830  *    before it tries to allocate the last extent buffer it needs, a discard
3831  *    operation comes in and, temporarily, removes the last free space entry from
3832  *    the only metadata block group that had free space (discard starts by
3833  *    removing a free space entry from a block group, then does the discard
3834  *    operation and, once it's done, it adds back the free space entry to the
3835  *    block group).
3836  *
3837  * We also need this 2 phases setup when adding a device to a filesystem with
3838  * a seed device - we must create new metadata and system chunks without adding
3839  * any of the block group items to the chunk, extent and device btrees. If we
3840  * did not do it this way, we would get ENOSPC when attempting to update those
3841  * btrees, since all the chunks from the seed device are read-only.
3842  *
3843  * Phase 1 does the updates and insertions to the chunk btree because if we had
3844  * it done in phase 2 and have a thundering herd of tasks allocating chunks in
3845  * parallel, we risk having too many system chunks allocated by many tasks if
3846  * many tasks reach phase 1 without the previous ones completing phase 2. In the
3847  * extreme case this leads to exhaustion of the system chunk array in the
3848  * superblock. This is easier to trigger if using a btree node/leaf size of 64K
3849  * and with RAID filesystems (so we have more device items in the chunk btree).
3850  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
3851  * the system chunk array due to concurrent allocations") provides more details.
3852  *
3853  * Allocation of system chunks does not happen through this function. A task that
3854  * needs to update the chunk btree (the only btree that uses system chunks), must
3855  * preallocate chunk space by calling either check_system_chunk() or
3856  * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
3857  * metadata chunk or when removing a chunk, while the latter is used before doing
3858  * a modification to the chunk btree - use cases for the latter are adding,
3859  * removing and resizing a device as well as relocation of a system chunk.
3860  * See the comment below for more details.
3861  *
3862  * The reservation of system space, done through check_system_chunk(), as well
3863  * as all the updates and insertions into the chunk btree must be done while
3864  * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
3865  * an extent buffer from the chunks btree we never trigger allocation of a new
3866  * system chunk, which would result in a deadlock (trying to lock twice an
3867  * extent buffer of the chunk btree, first time before triggering the chunk
3868  * allocation and the second time during chunk allocation while attempting to
3869  * update the chunks btree). The system chunk array is also updated while holding
3870  * that mutex. The same logic applies to removing chunks - we must reserve system
3871  * space, update the chunk btree and the system chunk array in the superblock
3872  * while holding fs_info->chunk_mutex.
3873  *
3874  * This function, btrfs_chunk_alloc(), belongs to phase 1.
3875  *
3876  * If @force is CHUNK_ALLOC_FORCE:
3877  *    - return 1 if it successfully allocates a chunk,
3878  *    - return errors including -ENOSPC otherwise.
3879  * If @force is NOT CHUNK_ALLOC_FORCE:
3880  *    - return 0 if it doesn't need to allocate a new chunk,
3881  *    - return 1 if it successfully allocates a chunk,
3882  *    - return errors including -ENOSPC otherwise.
3883  */
3884 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3885                       enum btrfs_chunk_alloc_enum force)
3886 {
3887         struct btrfs_fs_info *fs_info = trans->fs_info;
3888         struct btrfs_space_info *space_info;
3889         struct btrfs_block_group *ret_bg;
3890         bool wait_for_alloc = false;
3891         bool should_alloc = false;
3892         bool from_extent_allocation = false;
3893         int ret = 0;
3894
3895         if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
3896                 from_extent_allocation = true;
3897                 force = CHUNK_ALLOC_FORCE;
3898         }
3899
3900         /* Don't re-enter if we're already allocating a chunk */
3901         if (trans->allocating_chunk)
3902                 return -ENOSPC;
3903         /*
3904          * Allocation of system chunks can not happen through this path, as we
3905          * could end up in a deadlock if we are allocating a data or metadata
3906          * chunk and there is another task modifying the chunk btree.
3907          *
3908          * This is because while we are holding the chunk mutex, we will attempt
3909          * to add the new chunk item to the chunk btree or update an existing
3910          * device item in the chunk btree, while the other task that is modifying
3911          * the chunk btree is attempting to COW an extent buffer while holding a
3912          * lock on it and on its parent - if the COW operation triggers a system
3913          * chunk allocation, then we can deadlock because we are holding the
3914          * chunk mutex and we may need to access that extent buffer or its parent
3915          * in order to add the chunk item or update a device item.
3916          *
3917          * Tasks that want to modify the chunk tree should reserve system space
3918          * before updating the chunk btree, by calling either
3919          * btrfs_reserve_chunk_metadata() or check_system_chunk().
3920          * It's possible that after a task reserves the space, it still ends up
3921          * here - this happens in the cases described above at do_chunk_alloc().
3922          * The task will have to either retry or fail.
3923          */
3924         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3925                 return -ENOSPC;
3926
3927         space_info = btrfs_find_space_info(fs_info, flags);
3928         ASSERT(space_info);
3929
3930         do {
3931                 spin_lock(&space_info->lock);
3932                 if (force < space_info->force_alloc)
3933                         force = space_info->force_alloc;
3934                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
3935                 if (space_info->full) {
3936                         /* No more free physical space */
3937                         if (should_alloc)
3938                                 ret = -ENOSPC;
3939                         else
3940                                 ret = 0;
3941                         spin_unlock(&space_info->lock);
3942                         return ret;
3943                 } else if (!should_alloc) {
3944                         spin_unlock(&space_info->lock);
3945                         return 0;
3946                 } else if (space_info->chunk_alloc) {
3947                         /*
3948                          * Someone is already allocating, so we need to block
3949                          * until this someone is finished and then loop to
3950                          * recheck if we should continue with our allocation
3951                          * attempt.
3952                          */
3953                         wait_for_alloc = true;
3954                         force = CHUNK_ALLOC_NO_FORCE;
3955                         spin_unlock(&space_info->lock);
3956                         mutex_lock(&fs_info->chunk_mutex);
3957                         mutex_unlock(&fs_info->chunk_mutex);
3958                 } else {
3959                         /* Proceed with allocation */
3960                         space_info->chunk_alloc = 1;
3961                         wait_for_alloc = false;
3962                         spin_unlock(&space_info->lock);
3963                 }
3964
3965                 cond_resched();
3966         } while (wait_for_alloc);
3967
3968         mutex_lock(&fs_info->chunk_mutex);
3969         trans->allocating_chunk = true;
3970
3971         /*
3972          * If we have mixed data/metadata chunks we want to make sure we keep
3973          * allocating mixed chunks instead of individual chunks.
3974          */
3975         if (btrfs_mixed_space_info(space_info))
3976                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3977
3978         /*
3979          * if we're doing a data chunk, go ahead and make sure that
3980          * we keep a reasonable number of metadata chunks allocated in the
3981          * FS as well.
3982          */
3983         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3984                 fs_info->data_chunk_allocations++;
3985                 if (!(fs_info->data_chunk_allocations %
3986                       fs_info->metadata_ratio))
3987                         force_metadata_allocation(fs_info);
3988         }
3989
3990         ret_bg = do_chunk_alloc(trans, flags);
3991         trans->allocating_chunk = false;
3992
3993         if (IS_ERR(ret_bg)) {
3994                 ret = PTR_ERR(ret_bg);
3995         } else if (from_extent_allocation) {
3996                 /*
3997                  * New block group is likely to be used soon. Try to activate
3998                  * it now. Failure is OK for now.
3999                  */
4000                 btrfs_zone_activate(ret_bg);
4001         }
4002
4003         if (!ret)
4004                 btrfs_put_block_group(ret_bg);
4005
4006         spin_lock(&space_info->lock);
4007         if (ret < 0) {
4008                 if (ret == -ENOSPC)
4009                         space_info->full = 1;
4010                 else
4011                         goto out;
4012         } else {
4013                 ret = 1;
4014                 space_info->max_extent_size = 0;
4015         }
4016
4017         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4018 out:
4019         space_info->chunk_alloc = 0;
4020         spin_unlock(&space_info->lock);
4021         mutex_unlock(&fs_info->chunk_mutex);
4022
4023         return ret;
4024 }
4025
4026 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4027 {
4028         u64 num_dev;
4029
4030         num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4031         if (!num_dev)
4032                 num_dev = fs_info->fs_devices->rw_devices;
4033
4034         return num_dev;
4035 }
4036
4037 static void reserve_chunk_space(struct btrfs_trans_handle *trans,
4038                                 u64 bytes,
4039                                 u64 type)
4040 {
4041         struct btrfs_fs_info *fs_info = trans->fs_info;
4042         struct btrfs_space_info *info;
4043         u64 left;
4044         int ret = 0;
4045
4046         /*
4047          * Needed because we can end up allocating a system chunk and for an
4048          * atomic and race free space reservation in the chunk block reserve.
4049          */
4050         lockdep_assert_held(&fs_info->chunk_mutex);
4051
4052         info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4053         spin_lock(&info->lock);
4054         left = info->total_bytes - btrfs_space_info_used(info, true);
4055         spin_unlock(&info->lock);
4056
4057         if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4058                 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4059                            left, bytes, type);
4060                 btrfs_dump_space_info(fs_info, info, 0, 0);
4061         }
4062
4063         if (left < bytes) {
4064                 u64 flags = btrfs_system_alloc_profile(fs_info);
4065                 struct btrfs_block_group *bg;
4066
4067                 /*
4068                  * Ignore failure to create system chunk. We might end up not
4069                  * needing it, as we might not need to COW all nodes/leafs from
4070                  * the paths we visit in the chunk tree (they were already COWed
4071                  * or created in the current transaction for example).
4072                  */
4073                 bg = btrfs_create_chunk(trans, flags);
4074                 if (IS_ERR(bg)) {
4075                         ret = PTR_ERR(bg);
4076                 } else {
4077                         /*
4078                          * We have a new chunk. We also need to activate it for
4079                          * zoned filesystem.
4080                          */
4081                         ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
4082                         if (ret < 0)
4083                                 return;
4084
4085                         /*
4086                          * If we fail to add the chunk item here, we end up
4087                          * trying again at phase 2 of chunk allocation, at
4088                          * btrfs_create_pending_block_groups(). So ignore
4089                          * any error here. An ENOSPC here could happen, due to
4090                          * the cases described at do_chunk_alloc() - the system
4091                          * block group we just created was just turned into RO
4092                          * mode by a scrub for example, or a running discard
4093                          * temporarily removed its free space entries, etc.
4094                          */
4095                         btrfs_chunk_alloc_add_chunk_item(trans, bg);
4096                 }
4097         }
4098
4099         if (!ret) {
4100                 ret = btrfs_block_rsv_add(fs_info,
4101                                           &fs_info->chunk_block_rsv,
4102                                           bytes, BTRFS_RESERVE_NO_FLUSH);
4103                 if (!ret)
4104                         trans->chunk_bytes_reserved += bytes;
4105         }
4106 }
4107
4108 /*
4109  * Reserve space in the system space for allocating or removing a chunk.
4110  * The caller must be holding fs_info->chunk_mutex.
4111  */
4112 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4113 {
4114         struct btrfs_fs_info *fs_info = trans->fs_info;
4115         const u64 num_devs = get_profile_num_devs(fs_info, type);
4116         u64 bytes;
4117
4118         /* num_devs device items to update and 1 chunk item to add or remove. */
4119         bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
4120                 btrfs_calc_insert_metadata_size(fs_info, 1);
4121
4122         reserve_chunk_space(trans, bytes, type);
4123 }
4124
4125 /*
4126  * Reserve space in the system space, if needed, for doing a modification to the
4127  * chunk btree.
4128  *
4129  * @trans:              A transaction handle.
4130  * @is_item_insertion:  Indicate if the modification is for inserting a new item
4131  *                      in the chunk btree or if it's for the deletion or update
4132  *                      of an existing item.
4133  *
4134  * This is used in a context where we need to update the chunk btree outside
4135  * block group allocation and removal, to avoid a deadlock with a concurrent
4136  * task that is allocating a metadata or data block group and therefore needs to
4137  * update the chunk btree while holding the chunk mutex. After the update to the
4138  * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
4139  *
4140  */
4141 void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
4142                                   bool is_item_insertion)
4143 {
4144         struct btrfs_fs_info *fs_info = trans->fs_info;
4145         u64 bytes;
4146
4147         if (is_item_insertion)
4148                 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
4149         else
4150                 bytes = btrfs_calc_metadata_size(fs_info, 1);
4151
4152         mutex_lock(&fs_info->chunk_mutex);
4153         reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
4154         mutex_unlock(&fs_info->chunk_mutex);
4155 }
4156
4157 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
4158 {
4159         struct btrfs_block_group *block_group;
4160
4161         block_group = btrfs_lookup_first_block_group(info, 0);
4162         while (block_group) {
4163                 btrfs_wait_block_group_cache_done(block_group);
4164                 spin_lock(&block_group->lock);
4165                 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
4166                                        &block_group->runtime_flags)) {
4167                         struct inode *inode = block_group->inode;
4168
4169                         block_group->inode = NULL;
4170                         spin_unlock(&block_group->lock);
4171
4172                         ASSERT(block_group->io_ctl.inode == NULL);
4173                         iput(inode);
4174                 } else {
4175                         spin_unlock(&block_group->lock);
4176                 }
4177                 block_group = btrfs_next_block_group(block_group);
4178         }
4179 }
4180
4181 /*
4182  * Must be called only after stopping all workers, since we could have block
4183  * group caching kthreads running, and therefore they could race with us if we
4184  * freed the block groups before stopping them.
4185  */
4186 int btrfs_free_block_groups(struct btrfs_fs_info *info)
4187 {
4188         struct btrfs_block_group *block_group;
4189         struct btrfs_space_info *space_info;
4190         struct btrfs_caching_control *caching_ctl;
4191         struct rb_node *n;
4192
4193         write_lock(&info->block_group_cache_lock);
4194         while (!list_empty(&info->caching_block_groups)) {
4195                 caching_ctl = list_entry(info->caching_block_groups.next,
4196                                          struct btrfs_caching_control, list);
4197                 list_del(&caching_ctl->list);
4198                 btrfs_put_caching_control(caching_ctl);
4199         }
4200         write_unlock(&info->block_group_cache_lock);
4201
4202         spin_lock(&info->unused_bgs_lock);
4203         while (!list_empty(&info->unused_bgs)) {
4204                 block_group = list_first_entry(&info->unused_bgs,
4205                                                struct btrfs_block_group,
4206                                                bg_list);
4207                 list_del_init(&block_group->bg_list);
4208                 btrfs_put_block_group(block_group);
4209         }
4210
4211         while (!list_empty(&info->reclaim_bgs)) {
4212                 block_group = list_first_entry(&info->reclaim_bgs,
4213                                                struct btrfs_block_group,
4214                                                bg_list);
4215                 list_del_init(&block_group->bg_list);
4216                 btrfs_put_block_group(block_group);
4217         }
4218         spin_unlock(&info->unused_bgs_lock);
4219
4220         spin_lock(&info->zone_active_bgs_lock);
4221         while (!list_empty(&info->zone_active_bgs)) {
4222                 block_group = list_first_entry(&info->zone_active_bgs,
4223                                                struct btrfs_block_group,
4224                                                active_bg_list);
4225                 list_del_init(&block_group->active_bg_list);
4226                 btrfs_put_block_group(block_group);
4227         }
4228         spin_unlock(&info->zone_active_bgs_lock);
4229
4230         write_lock(&info->block_group_cache_lock);
4231         while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
4232                 block_group = rb_entry(n, struct btrfs_block_group,
4233                                        cache_node);
4234                 rb_erase_cached(&block_group->cache_node,
4235                                 &info->block_group_cache_tree);
4236                 RB_CLEAR_NODE(&block_group->cache_node);
4237                 write_unlock(&info->block_group_cache_lock);
4238
4239                 down_write(&block_group->space_info->groups_sem);
4240                 list_del(&block_group->list);
4241                 up_write(&block_group->space_info->groups_sem);
4242
4243                 /*
4244                  * We haven't cached this block group, which means we could
4245                  * possibly have excluded extents on this block group.
4246                  */
4247                 if (block_group->cached == BTRFS_CACHE_NO ||
4248                     block_group->cached == BTRFS_CACHE_ERROR)
4249                         btrfs_free_excluded_extents(block_group);
4250
4251                 btrfs_remove_free_space_cache(block_group);
4252                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4253                 ASSERT(list_empty(&block_group->dirty_list));
4254                 ASSERT(list_empty(&block_group->io_list));
4255                 ASSERT(list_empty(&block_group->bg_list));
4256                 ASSERT(refcount_read(&block_group->refs) == 1);
4257                 ASSERT(block_group->swap_extents == 0);
4258                 btrfs_put_block_group(block_group);
4259
4260                 write_lock(&info->block_group_cache_lock);
4261         }
4262         write_unlock(&info->block_group_cache_lock);
4263
4264         btrfs_release_global_block_rsv(info);
4265
4266         while (!list_empty(&info->space_info)) {
4267                 space_info = list_entry(info->space_info.next,
4268                                         struct btrfs_space_info,
4269                                         list);
4270
4271                 /*
4272                  * Do not hide this behind enospc_debug, this is actually
4273                  * important and indicates a real bug if this happens.
4274                  */
4275                 if (WARN_ON(space_info->bytes_pinned > 0 ||
4276                             space_info->bytes_may_use > 0))
4277                         btrfs_dump_space_info(info, space_info, 0, 0);
4278
4279                 /*
4280                  * If there was a failure to cleanup a log tree, very likely due
4281                  * to an IO failure on a writeback attempt of one or more of its
4282                  * extent buffers, we could not do proper (and cheap) unaccounting
4283                  * of their reserved space, so don't warn on bytes_reserved > 0 in
4284                  * that case.
4285                  */
4286                 if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
4287                     !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4288                         if (WARN_ON(space_info->bytes_reserved > 0))
4289                                 btrfs_dump_space_info(info, space_info, 0, 0);
4290                 }
4291
4292                 WARN_ON(space_info->reclaim_size > 0);
4293                 list_del(&space_info->list);
4294                 btrfs_sysfs_remove_space_info(space_info);
4295         }
4296         return 0;
4297 }
4298
4299 void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4300 {
4301         atomic_inc(&cache->frozen);
4302 }
4303
4304 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4305 {
4306         struct btrfs_fs_info *fs_info = block_group->fs_info;
4307         struct extent_map_tree *em_tree;
4308         struct extent_map *em;
4309         bool cleanup;
4310
4311         spin_lock(&block_group->lock);
4312         cleanup = (atomic_dec_and_test(&block_group->frozen) &&
4313                    test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
4314         spin_unlock(&block_group->lock);
4315
4316         if (cleanup) {
4317                 em_tree = &fs_info->mapping_tree;
4318                 write_lock(&em_tree->lock);
4319                 em = lookup_extent_mapping(em_tree, block_group->start,
4320                                            1);
4321                 BUG_ON(!em); /* logic error, can't happen */
4322                 remove_extent_mapping(em_tree, em);
4323                 write_unlock(&em_tree->lock);
4324
4325                 /* once for us and once for the tree */
4326                 free_extent_map(em);
4327                 free_extent_map(em);
4328
4329                 /*
4330                  * We may have left one free space entry and other possible
4331                  * tasks trimming this block group have left 1 entry each one.
4332                  * Free them if any.
4333                  */
4334                 btrfs_remove_free_space_cache(block_group);
4335         }
4336 }
4337
4338 bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4339 {
4340         bool ret = true;
4341
4342         spin_lock(&bg->lock);
4343         if (bg->ro)
4344                 ret = false;
4345         else
4346                 bg->swap_extents++;
4347         spin_unlock(&bg->lock);
4348
4349         return ret;
4350 }
4351
4352 void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4353 {
4354         spin_lock(&bg->lock);
4355         ASSERT(!bg->ro);
4356         ASSERT(bg->swap_extents >= amount);
4357         bg->swap_extents -= amount;
4358         spin_unlock(&bg->lock);
4359 }
4360
4361 enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
4362 {
4363         if (size <= SZ_128K)
4364                 return BTRFS_BG_SZ_SMALL;
4365         if (size <= SZ_8M)
4366                 return BTRFS_BG_SZ_MEDIUM;
4367         return BTRFS_BG_SZ_LARGE;
4368 }
4369
4370 /*
4371  * Handle a block group allocating an extent in a size class
4372  *
4373  * @bg:                         The block group we allocated in.
4374  * @size_class:                 The size class of the allocation.
4375  * @force_wrong_size_class:     Whether we are desperate enough to allow
4376  *                              mismatched size classes.
4377  *
4378  * Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
4379  * case of a race that leads to the wrong size class without
4380  * force_wrong_size_class set.
4381  *
4382  * find_free_extent will skip block groups with a mismatched size class until
4383  * it really needs to avoid ENOSPC. In that case it will set
4384  * force_wrong_size_class. However, if a block group is newly allocated and
4385  * doesn't yet have a size class, then it is possible for two allocations of
4386  * different sizes to race and both try to use it. The loser is caught here and
4387  * has to retry.
4388  */
4389 int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
4390                                      enum btrfs_block_group_size_class size_class,
4391                                      bool force_wrong_size_class)
4392 {
4393         ASSERT(size_class != BTRFS_BG_SZ_NONE);
4394
4395         /* The new allocation is in the right size class, do nothing */
4396         if (bg->size_class == size_class)
4397                 return 0;
4398         /*
4399          * The new allocation is in a mismatched size class.
4400          * This means one of two things:
4401          *
4402          * 1. Two tasks in find_free_extent for different size_classes raced
4403          *    and hit the same empty block_group. Make the loser try again.
4404          * 2. A call to find_free_extent got desperate enough to set
4405          *    'force_wrong_slab'. Don't change the size_class, but allow the
4406          *    allocation.
4407          */
4408         if (bg->size_class != BTRFS_BG_SZ_NONE) {
4409                 if (force_wrong_size_class)
4410                         return 0;
4411                 return -EAGAIN;
4412         }
4413         /*
4414          * The happy new block group case: the new allocation is the first
4415          * one in the block_group so we set size_class.
4416          */
4417         bg->size_class = size_class;
4418
4419         return 0;
4420 }
4421
4422 bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
4423 {
4424         if (btrfs_is_zoned(bg->fs_info))
4425                 return false;
4426         if (!btrfs_is_block_group_data_only(bg))
4427                 return false;
4428         return true;
4429 }