1 // SPDX-License-Identifier: GPL-2.0
4 #include <linux/slab.h>
5 #include <linux/spinlock.h>
8 #include "extent_map.h"
9 #include "compression.h"
10 #include "btrfs_inode.h"
14 static struct kmem_cache *extent_map_cache;
16 int __init extent_map_init(void)
18 extent_map_cache = kmem_cache_create("btrfs_extent_map",
19 sizeof(struct extent_map), 0, 0, NULL);
20 if (!extent_map_cache)
25 void __cold extent_map_exit(void)
27 kmem_cache_destroy(extent_map_cache);
31 * Initialize the extent tree @tree. Should be called for each new inode or
32 * other user of the extent_map interface.
34 void extent_map_tree_init(struct extent_map_tree *tree)
36 tree->map = RB_ROOT_CACHED;
37 INIT_LIST_HEAD(&tree->modified_extents);
38 rwlock_init(&tree->lock);
42 * Allocate a new extent_map structure. The new structure is returned with a
43 * reference count of one and needs to be freed using free_extent_map()
45 struct extent_map *alloc_extent_map(void)
47 struct extent_map *em;
48 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
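/*
 * Note: GFP_NOFS is presumably used here because callers can run in contexts
 * such as writeback or ordered extent completion, where memory reclaim must
 * not recurse back into the filesystem.
 */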
51 RB_CLEAR_NODE(&em->rb_node);
52 refcount_set(&em->refs, 1);
53 INIT_LIST_HEAD(&em->list);
Drop the reference on @em by one and free the structure if the reference
count hits zero.
61 void free_extent_map(struct extent_map *em)
65 if (refcount_dec_and_test(&em->refs)) {
66 WARN_ON(extent_map_in_tree(em));
67 WARN_ON(!list_empty(&em->list));
68 kmem_cache_free(extent_map_cache, em);
72 /* Do the math around the end of an extent, handling wrapping. */
73 static u64 range_end(u64 start, u64 len)
75 if (start + len < start)
80 static void dec_evictable_extent_maps(struct btrfs_inode *inode)
82 struct btrfs_fs_info *fs_info = inode->root->fs_info;
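/*
 * Only extent maps of subvolume (fs) trees are accounted as evictable, and
 * only outside of self-tests; this counter feeds the extent map shrinker.
 */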
84 if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
85 percpu_counter_dec(&fs_info->evictable_extent_maps);
88 static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
90 struct rb_node **p = &root->rb_root.rb_node;
91 struct rb_node *parent = NULL;
92 struct extent_map *entry = NULL;
93 struct rb_node *orig_parent = NULL;
94 u64 end = range_end(em->start, em->len);
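/*
 * Standard cached-rbtree insertion keyed on em->start: descend the tree and
 * fail with -EEXIST if @em->start lands inside an existing extent map, then
 * also check the in-order neighbours so an extent map whose range spans into
 * them is rejected before the new node is linked and rebalanced.
 */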
99 entry = rb_entry(parent, struct extent_map, rb_node);
101 if (em->start < entry->start) {
103 } else if (em->start >= extent_map_end(entry)) {
111 orig_parent = parent;
112 while (parent && em->start >= extent_map_end(entry)) {
113 parent = rb_next(parent);
114 entry = rb_entry(parent, struct extent_map, rb_node);
117 if (end > entry->start && em->start < extent_map_end(entry))
120 parent = orig_parent;
121 entry = rb_entry(parent, struct extent_map, rb_node);
122 while (parent && em->start < entry->start) {
123 parent = rb_prev(parent);
124 entry = rb_entry(parent, struct extent_map, rb_node);
127 if (end > entry->start && em->start < extent_map_end(entry))
130 rb_link_node(&em->rb_node, orig_parent, p);
131 rb_insert_color_cached(&em->rb_node, root, leftmost);
136 * Search through the tree for an extent_map with a given offset. If it can't
137 * be found, try to find some neighboring extents
139 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
140 struct rb_node **prev_or_next_ret)
142 struct rb_node *n = root->rb_node;
143 struct rb_node *prev = NULL;
144 struct rb_node *orig_prev = NULL;
145 struct extent_map *entry;
146 struct extent_map *prev_entry = NULL;
148 ASSERT(prev_or_next_ret);
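/*
 * Walk down the tree: if a node's range contains @offset return it, otherwise
 * remember the last visited node so that a neighbour (previous or next extent
 * map) can be returned through @prev_or_next_ret.
 */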
151 entry = rb_entry(n, struct extent_map, rb_node);
155 if (offset < entry->start)
157 else if (offset >= extent_map_end(entry))
164 while (prev && offset >= extent_map_end(prev_entry)) {
165 prev = rb_next(prev);
166 prev_entry = rb_entry(prev, struct extent_map, rb_node);
170 * Previous extent map found, return as in this case the caller does not
171 * care about the next one.
174 *prev_or_next_ret = prev;
179 prev_entry = rb_entry(prev, struct extent_map, rb_node);
180 while (prev && offset < prev_entry->start) {
181 prev = rb_prev(prev);
182 prev_entry = rb_entry(prev, struct extent_map, rb_node);
184 *prev_or_next_ret = prev;
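/*
 * Return the exclusive end of the extent on disk, saturating to (u64)-1 on
 * overflow (which can happen for the special EXTENT_MAP_* sentinel values
 * stored in block_start).
 */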
189 static inline u64 extent_map_block_end(const struct extent_map *em)
191 if (em->block_start + em->block_len < em->block_start)
193 return em->block_start + em->block_len;
196 static bool can_merge_extent_map(const struct extent_map *em)
198 if (em->flags & EXTENT_FLAG_PINNED)
201 /* Don't merge compressed extents, we need to know their actual size. */
202 if (extent_map_is_compressed(em))
205 if (em->flags & EXTENT_FLAG_LOGGING)
We don't want to merge stuff that hasn't been written to the log yet
since it may not reflect exactly what is on disk, and that would be bad.
213 if (!list_empty(&em->list))
219 /* Check to see if two extent_map structs are adjacent and safe to merge. */
220 static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
222 if (extent_map_end(prev) != next->start)
225 if (prev->flags != next->flags)
228 if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
229 return next->block_start == extent_map_block_end(prev);
231 /* HOLES and INLINE extents. */
232 return next->block_start == prev->block_start;
235 static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
237 struct extent_map_tree *tree = &inode->extent_tree;
238 struct extent_map *merge = NULL;
242 * We can't modify an extent map that is in the tree and that is being
243 * used by another task, as it can cause that other task to see it in
244 * inconsistent state during the merging. We always have 1 reference for
245 * the tree and 1 for this task (which is unpinning the extent map or
clearing the logging flag), so anything > 2 means it's being used by other tasks too.
249 if (refcount_read(&em->refs) > 2)
252 if (!can_merge_extent_map(em))
255 if (em->start != 0) {
256 rb = rb_prev(&em->rb_node);
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
260 em->start = merge->start;
261 em->orig_start = merge->orig_start;
262 em->len += merge->len;
263 em->block_len += merge->block_len;
264 em->block_start = merge->block_start;
265 em->generation = max(em->generation, merge->generation);
266 em->flags |= EXTENT_FLAG_MERGED;
268 rb_erase_cached(&merge->rb_node, &tree->map);
269 RB_CLEAR_NODE(&merge->rb_node);
270 free_extent_map(merge);
271 dec_evictable_extent_maps(inode);
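/* Now try to merge with the next extent map in the tree, if any. */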
275 rb = rb_next(&em->rb_node);
277 merge = rb_entry(rb, struct extent_map, rb_node);
278 if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
279 em->len += merge->len;
280 em->block_len += merge->block_len;
281 rb_erase_cached(&merge->rb_node, &tree->map);
282 RB_CLEAR_NODE(&merge->rb_node);
283 em->generation = max(em->generation, merge->generation);
284 em->flags |= EXTENT_FLAG_MERGED;
285 free_extent_map(merge);
286 dec_evictable_extent_maps(inode);
291 * Unpin an extent from the cache.
293 * @inode: the inode from which we are unpinning an extent range
294 * @start: logical offset in the file
295 * @len: length of the extent
296 * @gen: generation that this extent has been modified in
298 * Called after an extent has been written to disk properly. Set the generation
299 * to the generation that actually added the file item to the inode so we know
300 * we need to sync this extent when we call fsync().
302 * Returns: 0 on success
303 * -ENOENT when the extent is not found in the tree
304 * -EUCLEAN if the found extent does not match the expected start
306 int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
308 struct btrfs_fs_info *fs_info = inode->root->fs_info;
309 struct extent_map_tree *tree = &inode->extent_tree;
311 struct extent_map *em;
313 write_lock(&tree->lock);
314 em = lookup_extent_mapping(tree, start, len);
318 "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
319 btrfs_ino(inode), btrfs_root_id(inode->root),
320 start, start + len, gen);
325 if (WARN_ON(em->start != start)) {
327 "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
328 btrfs_ino(inode), btrfs_root_id(inode->root),
329 em->start, start, start + len, gen);
334 em->generation = gen;
335 em->flags &= ~EXTENT_FLAG_PINNED;
337 try_merge_map(inode, em);
340 write_unlock(&tree->lock);
346 void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
348 lockdep_assert_held_write(&inode->extent_tree.lock);
350 em->flags &= ~EXTENT_FLAG_LOGGING;
351 if (extent_map_in_tree(em))
352 try_merge_map(inode, em);
355 static inline void setup_extent_mapping(struct btrfs_inode *inode,
356 struct extent_map *em,
359 refcount_inc(&em->refs);
361 ASSERT(list_empty(&em->list));
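/*
 * A modified (new) extent map is linked into the inode's list of modified
 * extents so that a fast fsync can find and log it later.
 */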
364 list_add(&em->list, &inode->extent_tree.modified_extents);
366 try_merge_map(inode, em);
370 * Add a new extent map to an inode's extent map tree.
372 * @inode: the target inode
374 * @modified: indicate whether the given @em should be added to the
375 * modified list, which indicates the extent needs to be logged
377 * Insert @em into the @inode's extent map tree or perform a simple
378 * forward/backward merge with existing mappings. The extent_map struct passed
379 * in will be inserted into the tree directly, with an additional reference
380 * taken, or a reference dropped if the merge attempt was successful.
382 static int add_extent_mapping(struct btrfs_inode *inode,
383 struct extent_map *em, int modified)
385 struct extent_map_tree *tree = &inode->extent_tree;
386 struct btrfs_root *root = inode->root;
387 struct btrfs_fs_info *fs_info = root->fs_info;
390 lockdep_assert_held_write(&tree->lock);
392 ret = tree_insert(&tree->map, em);
396 setup_extent_mapping(inode, em, modified);
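/*
 * Account the new extent map as evictable so the extent map shrinker knows
 * how much memory it could reclaim (subvolume trees only).
 */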
398 if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
399 percpu_counter_inc(&fs_info->evictable_extent_maps);
404 static struct extent_map *
405 __lookup_extent_mapping(struct extent_map_tree *tree,
406 u64 start, u64 len, int strict)
408 struct extent_map *em;
409 struct rb_node *rb_node;
410 struct rb_node *prev_or_next = NULL;
411 u64 end = range_end(start, len);
413 rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
416 rb_node = prev_or_next;
421 em = rb_entry(rb_node, struct extent_map, rb_node);
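/*
 * In strict mode, only return the extent map if it really intersects the
 * requested [start, start + len) range; otherwise any nearby extent map found
 * by the tree search is acceptable.
 */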
423 if (strict && !(end > em->start && start < extent_map_end(em)))
426 refcount_inc(&em->refs);
431 * Lookup extent_map that intersects @start + @len range.
433 * @tree: tree to lookup in
434 * @start: byte offset to start the search
435 * @len: length of the lookup range
437 * Find and return the first extent_map struct in @tree that intersects the
[start, start + len) range. There may be additional objects in the tree that
439 * intersect, so check the object returned carefully to make sure that no
440 * additional lookups are needed.
442 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
445 return __lookup_extent_mapping(tree, start, len, 1);
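/*
 * Typical caller pattern (a sketch; unpin_extent_cache() in this file does
 * the same under the write lock): take the tree lock, look up, use the extent
 * map and then drop the lookup reference:
 *
 *	read_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	read_unlock(&tree->lock);
 *	if (em) {
 *		... use em->start, em->len, em->block_start ...
 *		free_extent_map(em);
 *	}
 */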
449 * Find a nearby extent map intersecting @start + @len (not an exact search).
451 * @tree: tree to lookup in
452 * @start: byte offset to start the search
453 * @len: length of the lookup range
Find and return the first extent_map struct in @tree that intersects the
[start, start + len) range.

If one can't be found, any nearby extent may be returned.
460 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
463 return __lookup_extent_mapping(tree, start, len, 0);
467 * Remove an extent_map from its inode's extent tree.
469 * @inode: the inode the extent map belongs to
470 * @em: extent map being removed
472 * Remove @em from the extent tree of @inode. No reference counts are dropped,
473 * and no checks are done to see if the range is in use.
475 void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
477 struct extent_map_tree *tree = &inode->extent_tree;
479 lockdep_assert_held_write(&tree->lock);
481 WARN_ON(em->flags & EXTENT_FLAG_PINNED);
482 rb_erase_cached(&em->rb_node, &tree->map);
483 if (!(em->flags & EXTENT_FLAG_LOGGING))
484 list_del_init(&em->list);
485 RB_CLEAR_NODE(&em->rb_node);
487 dec_evictable_extent_maps(inode);
490 static void replace_extent_mapping(struct btrfs_inode *inode,
491 struct extent_map *cur,
492 struct extent_map *new,
495 struct extent_map_tree *tree = &inode->extent_tree;
497 lockdep_assert_held_write(&tree->lock);
499 WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
500 ASSERT(extent_map_in_tree(cur));
501 if (!(cur->flags & EXTENT_FLAG_LOGGING))
502 list_del_init(&cur->list);
503 rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
504 RB_CLEAR_NODE(&cur->rb_node);
506 setup_extent_mapping(inode, new, modified);
509 static struct extent_map *next_extent_map(const struct extent_map *em)
511 struct rb_node *next;
513 next = rb_next(&em->rb_node);
516 return container_of(next, struct extent_map, rb_node);
519 static struct extent_map *prev_extent_map(struct extent_map *em)
521 struct rb_node *prev;
523 prev = rb_prev(&em->rb_node);
526 return container_of(prev, struct extent_map, rb_node);
Helper for btrfs_get_extent. Given an existing extent in the tree (the one
nearest to map_start) and a new extent that we want to insert, deal with any
overlap and insert the best-fitting new extent into the tree.
535 static noinline int merge_extent_mapping(struct btrfs_inode *inode,
536 struct extent_map *existing,
537 struct extent_map *em,
540 struct extent_map *prev;
541 struct extent_map *next;
546 if (map_start < em->start || map_start >= extent_map_end(em))
549 if (existing->start > map_start) {
551 prev = prev_extent_map(next);
554 next = next_extent_map(prev);
557 start = prev ? extent_map_end(prev) : em->start;
558 start = max_t(u64, start, em->start);
559 end = next ? next->start : extent_map_end(em);
560 end = min_t(u64, end, extent_map_end(em));
561 start_diff = start - em->start;
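/*
 * Adjust the file range to the clamped [start, end) and, for regular
 * uncompressed extents, shift and shrink the disk range by the same amount.
 */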
563 em->len = end - start;
564 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
565 !extent_map_is_compressed(em)) {
566 em->block_start += start_diff;
567 em->block_len = em->len;
569 return add_extent_mapping(inode, em, 0);
573 * Add extent mapping into an inode's extent map tree.
575 * @inode: target inode
576 * @em_in: extent we are inserting
577 * @start: start of the logical range btrfs_get_extent() is requesting
578 * @len: length of the logical range btrfs_get_extent() is requesting
Note that @em_in's range may be different from [start, start + len),
but they must overlap.
583 * Insert @em_in into the inode's extent map tree. In case there is an
584 * overlapping range, handle the -EEXIST by either:
a) Returning the existing extent in @em_in if @start is within the existing extent map's range.
587 * b) Merge the existing extent with @em_in passed in.
589 * Return 0 on success, otherwise -EEXIST.
592 int btrfs_add_extent_mapping(struct btrfs_inode *inode,
593 struct extent_map **em_in, u64 start, u64 len)
596 struct extent_map *em = *em_in;
597 struct btrfs_fs_info *fs_info = inode->root->fs_info;
600 * Tree-checker should have rejected any inline extent with non-zero
601 * file offset. Here just do a sanity check.
603 if (em->block_start == EXTENT_MAP_INLINE)
604 ASSERT(em->start == 0);
606 ret = add_extent_mapping(inode, em, 0);
/*
 * It is possible that someone inserted the extent into the tree while we had
 * the lock dropped. It is also possible that an overlapping map exists in
 * the tree.
 */
611 if (ret == -EEXIST) {
612 struct extent_map *existing;
614 existing = search_extent_mapping(&inode->extent_tree, start, len);
616 trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
619 * existing will always be non-NULL, since there must be
an extent causing the -EEXIST.
622 if (start >= existing->start &&
623 start < extent_map_end(existing)) {
628 u64 orig_start = em->start;
629 u64 orig_len = em->len;
632 * The existing extent map is the one nearest to
the [start, start + len) range which overlaps it.
635 ret = merge_extent_mapping(inode, existing, em, start);
640 "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
641 existing->start, extent_map_end(existing),
642 orig_start, orig_start + orig_len, start);
644 free_extent_map(existing);
648 ASSERT(ret == 0 || ret == -EEXIST);
653 * Drop all extent maps from a tree in the fastest possible way, rescheduling
654 * if needed. This avoids searching the tree, from the root down to the first
655 * extent map, before each deletion.
657 static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
659 struct extent_map_tree *tree = &inode->extent_tree;
661 write_lock(&tree->lock);
662 while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
663 struct extent_map *em;
664 struct rb_node *node;
666 node = rb_first_cached(&tree->map);
667 em = rb_entry(node, struct extent_map, rb_node);
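/*
 * Clear the pinned and logging flags unconditionally: we are dropping every
 * extent map, and remove_extent_mapping() warns on pinned ones and skips the
 * modified list removal for logged ones.
 */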
668 em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
669 remove_extent_mapping(inode, em);
671 cond_resched_rwlock_write(&tree->lock);
673 write_unlock(&tree->lock);
677 * Drop all extent maps in a given range.
679 * @inode: The target inode.
680 * @start: Start offset of the range.
681 * @end: End offset of the range (inclusive value).
682 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
684 * This drops all the extent maps that intersect the given range [@start, @end].
Extent maps that partially overlap the range and extend behind or beyond it are split.
687 * The caller should have locked an appropriate file range in the inode's io
688 * tree before calling this function.
690 void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
693 struct extent_map *split;
694 struct extent_map *split2;
695 struct extent_map *em;
696 struct extent_map_tree *em_tree = &inode->extent_tree;
697 u64 len = end - start + 1;
699 WARN_ON(end < start);
700 if (end == (u64)-1) {
701 if (start == 0 && !skip_pinned) {
702 drop_all_extent_maps_fast(inode);
707 /* Make end offset exclusive for use in the loop below. */
712 * It's ok if we fail to allocate the extent maps, see the comment near
713 * the bottom of the loop below. We only need two spare extent maps in
714 * the worst case, where the first extent map that intersects our range
715 * starts before the range and the last extent map that intersects our
716 * range ends after our range (and they might be the same extent map),
717 * because we need to split those two extent maps at the boundaries.
719 split = alloc_extent_map();
720 split2 = alloc_extent_map();
722 write_lock(&em_tree->lock);
723 em = lookup_extent_mapping(em_tree, start, len);
726 /* extent_map_end() returns exclusive value (last byte + 1). */
727 const u64 em_end = extent_map_end(em);
728 struct extent_map *next_em = NULL;
735 next_em = next_extent_map(em);
737 if (next_em->start < end)
738 refcount_inc(&next_em->refs);
744 if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
751 * In case we split the extent map, we want to preserve the
752 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
753 * it on the new extent maps.
755 em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
756 modified = !list_empty(&em->list);
759 * The extent map does not cross our target range, so no need to
760 * split it, we can remove it directly.
762 if (em->start >= start && em_end <= end)
765 gen = em->generation;
766 compressed = extent_map_is_compressed(em);
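/*
 * The extent map starts before our range: split off the front part
 * [em->start, start) into a new extent map that replaces the original in the
 * tree.
 */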
768 if (em->start < start) {
775 split->start = em->start;
776 split->len = start - em->start;
778 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
779 split->orig_start = em->orig_start;
780 split->block_start = em->block_start;
783 split->block_len = em->block_len;
785 split->block_len = split->len;
786 split->orig_block_len = max(split->block_len,
788 split->ram_bytes = em->ram_bytes;
790 split->orig_start = split->start;
791 split->block_len = 0;
792 split->block_start = em->block_start;
793 split->orig_block_len = 0;
794 split->ram_bytes = split->len;
797 split->generation = gen;
798 split->flags = flags;
799 replace_extent_mapping(inode, em, split, modified);
800 free_extent_map(split);
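/*
 * The extent map ends beyond our range: build a tail extent map for
 * [end, em_end) and either replace the original in the tree with it or, if
 * the original was already replaced above, add it as a new entry.
 */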
812 split->len = em_end - end;
813 split->block_start = em->block_start;
814 split->flags = flags;
815 split->generation = gen;
817 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
818 split->orig_block_len = max(em->block_len,
821 split->ram_bytes = em->ram_bytes;
823 split->block_len = em->block_len;
824 split->orig_start = em->orig_start;
826 const u64 diff = end - em->start;
828 split->block_len = split->len;
829 split->block_start += diff;
830 split->orig_start = em->orig_start;
833 split->ram_bytes = split->len;
834 split->orig_start = split->start;
835 split->block_len = 0;
836 split->orig_block_len = 0;
839 if (extent_map_in_tree(em)) {
840 replace_extent_mapping(inode, em, split, modified);
844 ret = add_extent_mapping(inode, split, modified);
845 /* Logic error, shouldn't happen. */
847 if (WARN_ON(ret != 0) && modified)
848 btrfs_set_inode_full_sync(inode);
850 free_extent_map(split);
854 if (extent_map_in_tree(em)) {
856 * If the extent map is still in the tree it means that
857 * either of the following is true:
859 * 1) It fits entirely in our range (doesn't end beyond
860 * it or starts before it);
862 * 2) It starts before our range and/or ends after our
863 * range, and we were not able to allocate the extent
864 * maps for split operations, @split and @split2.
866 * If we are at case 2) then we just remove the entire
extent map - this is fine since anyone who needs to
access the subranges outside our range will just
load it again from the subvolume tree's file extent
870 * item. However if the extent map was in the list of
871 * modified extents, then we must mark the inode for a
872 * full fsync, otherwise a fast fsync will miss this
873 * extent if it's new and needs to be logged.
875 if ((em->start < start || em_end > end) && modified) {
877 btrfs_set_inode_full_sync(inode);
879 remove_extent_mapping(inode, em);
883 * Once for the tree reference (we replaced or removed the
884 * extent map from the tree).
888 /* Once for us (for our lookup reference). */
894 write_unlock(&em_tree->lock);
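/*
 * Any unused (or already consumed and NULLed) split extent maps are released
 * here; free_extent_map() accepts a NULL pointer.
 */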
896 free_extent_map(split);
897 free_extent_map(split2);
901 * Replace a range in the inode's extent map tree with a new extent map.
903 * @inode: The target inode.
904 * @new_em: The new extent map to add to the inode's extent map tree.
905 * @modified: Indicate if the new extent map should be added to the list of
906 * modified extents (for fast fsync tracking).
908 * Drops all the extent maps in the inode's extent map tree that intersect the
909 * range of the new extent map and adds the new extent map to the tree.
910 * The caller should have locked an appropriate file range in the inode's io
911 * tree before calling this function.
913 int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
914 struct extent_map *new_em,
917 const u64 end = new_em->start + new_em->len - 1;
918 struct extent_map_tree *tree = &inode->extent_tree;
921 ASSERT(!extent_map_in_tree(new_em));
924 * The caller has locked an appropriate file range in the inode's io
925 * tree, but getting -EEXIST when adding the new extent map can still
926 * happen in case there are extents that partially cover the range, and
927 * this is due to two tasks operating on different parts of the extent.
928 * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
929 * btrfs_get_extent") for an example and details.
932 btrfs_drop_extent_map_range(inode, new_em->start, end, false);
933 write_lock(&tree->lock);
934 ret = add_extent_mapping(inode, new_em, modified);
935 write_unlock(&tree->lock);
936 } while (ret == -EEXIST);
Split off the first @pre bytes from the extent map at [start, start + len),
and set the block_start for it to @new_logical.
945 * This function is used when an ordered_extent needs to be split.
947 int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
950 struct extent_map_tree *em_tree = &inode->extent_tree;
951 struct extent_map *em;
952 struct extent_map *split_pre = NULL;
953 struct extent_map *split_mid = NULL;
960 split_pre = alloc_extent_map();
963 split_mid = alloc_extent_map();
969 lock_extent(&inode->io_tree, start, start + len - 1, NULL);
970 write_lock(&em_tree->lock);
971 em = lookup_extent_mapping(em_tree, start, len);
977 ASSERT(em->len == len);
978 ASSERT(!extent_map_is_compressed(em));
979 ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
980 ASSERT(em->flags & EXTENT_FLAG_PINNED);
981 ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
982 ASSERT(!list_empty(&em->list));
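/*
 * Clear the pinned flag on the original extent map so that replacing it in
 * the tree below does not trigger the pinned warning.
 */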
985 em->flags &= ~EXTENT_FLAG_PINNED;
/* First, replace the em with a new extent_map starting from em->start. */
988 split_pre->start = em->start;
989 split_pre->len = pre;
990 split_pre->orig_start = split_pre->start;
991 split_pre->block_start = new_logical;
992 split_pre->block_len = split_pre->len;
993 split_pre->orig_block_len = split_pre->block_len;
994 split_pre->ram_bytes = split_pre->len;
995 split_pre->flags = flags;
996 split_pre->generation = em->generation;
998 replace_extent_mapping(inode, em, split_pre, 1);
1001 * Now we only have an extent_map at:
1002 * [em->start, em->start + pre]
1005 /* Insert the middle extent_map. */
1006 split_mid->start = em->start + pre;
1007 split_mid->len = em->len - pre;
1008 split_mid->orig_start = split_mid->start;
1009 split_mid->block_start = em->block_start + pre;
1010 split_mid->block_len = split_mid->len;
1011 split_mid->orig_block_len = split_mid->block_len;
1012 split_mid->ram_bytes = split_mid->len;
1013 split_mid->flags = flags;
1014 split_mid->generation = em->generation;
1015 add_extent_mapping(inode, split_mid, 1);
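/* Once for us (the lookup reference taken above). */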
1018 free_extent_map(em);
1019 /* Once for the tree */
1020 free_extent_map(em);
1023 write_unlock(&em_tree->lock);
1024 unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
1025 free_extent_map(split_mid);
1027 free_extent_map(split_pre);
1031 static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
1033 const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
1034 struct extent_map_tree *tree = &inode->extent_tree;
1035 long nr_dropped = 0;
1036 struct rb_node *node;
1039 * Take the mmap lock so that we serialize with the inode logging phase
1040 * of fsync because we may need to set the full sync flag on the inode,
1041 * in case we have to remove extent maps in the tree's list of modified
1042 * extents. If we set the full sync flag in the inode while an fsync is
1043 * in progress, we may risk missing new extents because before the flag
1044 * is set, fsync decides to only wait for writeback to complete and then
1045 * during inode logging it sees the flag set and uses the subvolume tree
1046 * to find new extents, which may not be there yet because ordered
1047 * extents haven't completed yet.
1049 * We also do a try lock because otherwise we could deadlock. This is
1050 * because the shrinker for this filesystem may be invoked while we are
1051 * in a path that is holding the mmap lock in write mode. For example in
1052 * a reflink operation while COWing an extent buffer, when allocating
1053 * pages for a new extent buffer and under memory pressure, the shrinker
1054 * may be invoked, and therefore we would deadlock by attempting to read
lock the mmap lock while we are already holding a write lock on it.
1057 if (!down_read_trylock(&inode->i_mmap_lock))
1060 write_lock(&tree->lock);
1061 node = rb_first_cached(&tree->map);
1063 struct extent_map *em;
1065 em = rb_entry(node, struct extent_map, rb_node);
1066 node = rb_next(node);
1069 if (em->flags & EXTENT_FLAG_PINNED)
If the extent map is in the list of modified extents (new) and its
generation is the same as (or greater than) the current fs
generation, it means it was not yet persisted, so we have to
set the full sync flag so that the next fsync will not miss it.
1079 if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
1080 btrfs_set_inode_full_sync(inode);
1082 remove_extent_mapping(inode, em);
1083 trace_btrfs_extent_map_shrinker_remove_em(inode, em);
1084 /* Drop the reference for the tree. */
1085 free_extent_map(em);
1088 if (*scanned >= nr_to_scan)
1092 * Restart if we had to reschedule, and any extent maps that were
1093 * pinned before may have become unpinned after we released the
1094 * lock and took it again.
1096 if (cond_resched_rwlock_write(&tree->lock))
1097 node = rb_first_cached(&tree->map);
1099 write_unlock(&tree->lock);
1100 up_read(&inode->i_mmap_lock);
1105 static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
1107 struct btrfs_fs_info *fs_info = root->fs_info;
1108 struct btrfs_inode *inode;
1109 long nr_dropped = 0;
1110 u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
1112 inode = btrfs_find_first_inode(root, min_ino);
1114 nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
1116 min_ino = btrfs_ino(inode) + 1;
1117 fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
1118 iput(&inode->vfs_inode);
1120 if (*scanned >= nr_to_scan)
1124 inode = btrfs_find_first_inode(root, min_ino);
1129 * There are still inodes in this root or we happened to process
1130 * the last one and reached the scan limit. In either case set
1131 * the current root to this one, so we'll resume from the next
1132 * inode if there is one or we will find out this was the last
1133 * one and move to the next root.
1135 fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
1138 * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
1139 * that when processing the next root we start from its first inode.
1141 fs_info->extent_map_shrinker_last_ino = 0;
1142 fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
1148 long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
1150 const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
1151 u64 next_root_id = start_root_id;
1152 bool cycled = false;
1153 long nr_dropped = 0;
1156 if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
1157 s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
1159 trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
1162 while (scanned < nr_to_scan) {
1163 struct btrfs_root *root;
1164 unsigned long count;
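/*
 * Look up the next root at or after next_root_id in the radix tree of fs
 * roots; if nothing is found and we did not start from the first root, wrap
 * around once and retry from the beginning.
 */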
1166 spin_lock(&fs_info->fs_roots_radix_lock);
1167 count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1169 (unsigned long)next_root_id, 1);
1171 spin_unlock(&fs_info->fs_roots_radix_lock);
1172 if (start_root_id > 0 && !cycled) {
1174 fs_info->extent_map_shrinker_last_root = 0;
1175 fs_info->extent_map_shrinker_last_ino = 0;
1181 next_root_id = btrfs_root_id(root) + 1;
1182 root = btrfs_grab_root(root);
1183 spin_unlock(&fs_info->fs_roots_radix_lock);
1188 if (is_fstree(btrfs_root_id(root)))
1189 nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
1191 btrfs_put_root(root);
1194 if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
1195 s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
1197 trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);