1 // SPDX-License-Identifier: GPL-2.0
4 #include <linux/slab.h>
5 #include <linux/spinlock.h>
8 #include "extent_map.h"
9 #include "compression.h"
10 #include "btrfs_inode.h"
14 static struct kmem_cache *extent_map_cache;
16 int __init extent_map_init(void)
18 extent_map_cache = kmem_cache_create("btrfs_extent_map",
19 sizeof(struct extent_map), 0, 0, NULL);
20 if (!extent_map_cache)
25 void __cold extent_map_exit(void)
27 kmem_cache_destroy(extent_map_cache);
31 * Initialize the extent tree @tree. Should be called for each new inode or
32 * other user of the extent_map interface.
34 void extent_map_tree_init(struct extent_map_tree *tree)
36 tree->map = RB_ROOT_CACHED;
37 INIT_LIST_HEAD(&tree->modified_extents);
38 rwlock_init(&tree->lock);
42 * Allocate a new extent_map structure. The new structure is returned with a
43 * reference count of one and needs to be freed using free_extent_map()
45 struct extent_map *alloc_extent_map(void)
47 struct extent_map *em;
48 em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
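/*
 * Note: GFP_NOFS is presumably used here because callers can run in contexts
 * such as writeback or ordered extent completion, where memory reclaim must
 * not recurse back into the filesystem.
 */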
51 RB_CLEAR_NODE(&em->rb_node);
52 refcount_set(&em->refs, 1);
53 INIT_LIST_HEAD(&em->list);
Drop the reference on @em by one and free the structure if the reference
count hits zero.
61 void free_extent_map(struct extent_map *em)
65 if (refcount_dec_and_test(&em->refs)) {
66 WARN_ON(extent_map_in_tree(em));
67 WARN_ON(!list_empty(&em->list));
68 kmem_cache_free(extent_map_cache, em);
72 /* Do the math around the end of an extent, handling wrapping. */
73 static u64 range_end(u64 start, u64 len)
75 if (start + len < start)
80 static void dec_evictable_extent_maps(struct btrfs_inode *inode)
82 struct btrfs_fs_info *fs_info = inode->root->fs_info;
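/*
 * Only extent maps of subvolume (fs) trees are accounted as evictable, and
 * only outside of self-tests; this counter feeds the extent map shrinker.
 */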
84 if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
85 percpu_counter_dec(&fs_info->evictable_extent_maps);
88 static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
90 struct rb_node **p = &root->rb_root.rb_node;
91 struct rb_node *parent = NULL;
92 struct extent_map *entry = NULL;
93 struct rb_node *orig_parent = NULL;
94 u64 end = range_end(em->start, em->len);
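/*
 * Standard cached-rbtree insertion keyed on em->start: descend the tree and
 * fail with -EEXIST if @em->start lands inside an existing extent map, then
 * also check the in-order neighbours so an extent map whose range spans into
 * them is rejected before the new node is linked and rebalanced.
 */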
99 entry = rb_entry(parent, struct extent_map, rb_node);
101 if (em->start < entry->start) {
103 } else if (em->start >= extent_map_end(entry)) {
111 orig_parent = parent;
112 while (parent && em->start >= extent_map_end(entry)) {
113 parent = rb_next(parent);
114 entry = rb_entry(parent, struct extent_map, rb_node);
117 if (end > entry->start && em->start < extent_map_end(entry))
120 parent = orig_parent;
121 entry = rb_entry(parent, struct extent_map, rb_node);
122 while (parent && em->start < entry->start) {
123 parent = rb_prev(parent);
124 entry = rb_entry(parent, struct extent_map, rb_node);
127 if (end > entry->start && em->start < extent_map_end(entry))
130 rb_link_node(&em->rb_node, orig_parent, p);
131 rb_insert_color_cached(&em->rb_node, root, leftmost);
136 * Search through the tree for an extent_map with a given offset. If it can't
137 * be found, try to find some neighboring extents
139 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
140 struct rb_node **prev_or_next_ret)
142 struct rb_node *n = root->rb_node;
143 struct rb_node *prev = NULL;
144 struct rb_node *orig_prev = NULL;
145 struct extent_map *entry;
146 struct extent_map *prev_entry = NULL;
148 ASSERT(prev_or_next_ret);
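/*
 * Walk down the tree: if a node's range contains @offset return it, otherwise
 * remember the last visited node so that a neighbour (previous or next extent
 * map) can be returned through @prev_or_next_ret.
 */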
151 entry = rb_entry(n, struct extent_map, rb_node);
155 if (offset < entry->start)
157 else if (offset >= extent_map_end(entry))
164 while (prev && offset >= extent_map_end(prev_entry)) {
165 prev = rb_next(prev);
166 prev_entry = rb_entry(prev, struct extent_map, rb_node);
170 * Previous extent map found, return as in this case the caller does not
171 * care about the next one.
174 *prev_or_next_ret = prev;
179 prev_entry = rb_entry(prev, struct extent_map, rb_node);
180 while (prev && offset < prev_entry->start) {
181 prev = rb_prev(prev);
182 prev_entry = rb_entry(prev, struct extent_map, rb_node);
184 *prev_or_next_ret = prev;
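/*
 * Return the exclusive end of the extent on disk, saturating to (u64)-1 on
 * overflow (which can happen for the special EXTENT_MAP_* sentinel values
 * stored in block_start).
 */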
189 static inline u64 extent_map_block_end(const struct extent_map *em)
191 if (em->block_start + em->block_len < em->block_start)
193 return em->block_start + em->block_len;
196 static bool can_merge_extent_map(const struct extent_map *em)
198 if (em->flags & EXTENT_FLAG_PINNED)
201 /* Don't merge compressed extents, we need to know their actual size. */
202 if (extent_map_is_compressed(em))
205 if (em->flags & EXTENT_FLAG_LOGGING)
We don't want to merge stuff that hasn't been written to the log yet
since it may not reflect exactly what is on disk, and that would be bad.
213 if (!list_empty(&em->list))
219 /* Check to see if two extent_map structs are adjacent and safe to merge. */
220 static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
222 if (extent_map_end(prev) != next->start)
225 if (prev->flags != next->flags)
228 if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
229 return next->block_start == extent_map_block_end(prev);
231 /* HOLES and INLINE extents. */
232 return next->block_start == prev->block_start;
235 static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
237 struct extent_map_tree *tree = &inode->extent_tree;
238 struct extent_map *merge = NULL;
242 * We can't modify an extent map that is in the tree and that is being
243 * used by another task, as it can cause that other task to see it in
244 * inconsistent state during the merging. We always have 1 reference for
245 * the tree and 1 for this task (which is unpinning the extent map or
clearing the logging flag), so anything > 2 means it's being used by other tasks too.
249 if (refcount_read(&em->refs) > 2)
252 if (!can_merge_extent_map(em))
255 if (em->start != 0) {
256 rb = rb_prev(&em->rb_node);
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
260 em->start = merge->start;
261 em->orig_start = merge->orig_start;
262 em->len += merge->len;
263 em->block_len += merge->block_len;
264 em->block_start = merge->block_start;
265 em->generation = max(em->generation, merge->generation);
266 em->flags |= EXTENT_FLAG_MERGED;
268 rb_erase_cached(&merge->rb_node, &tree->map);
269 RB_CLEAR_NODE(&merge->rb_node);
270 free_extent_map(merge);
271 dec_evictable_extent_maps(inode);
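/* Now try to merge with the next extent map in the tree, if any. */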
275 rb = rb_next(&em->rb_node);
277 merge = rb_entry(rb, struct extent_map, rb_node);
278 if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
279 em->len += merge->len;
280 em->block_len += merge->block_len;
281 rb_erase_cached(&merge->rb_node, &tree->map);
282 RB_CLEAR_NODE(&merge->rb_node);
283 em->generation = max(em->generation, merge->generation);
284 em->flags |= EXTENT_FLAG_MERGED;
285 free_extent_map(merge);
286 dec_evictable_extent_maps(inode);
291 * Unpin an extent from the cache.
293 * @inode: the inode from which we are unpinning an extent range
294 * @start: logical offset in the file
295 * @len: length of the extent
296 * @gen: generation that this extent has been modified in
298 * Called after an extent has been written to disk properly. Set the generation
299 * to the generation that actually added the file item to the inode so we know
300 * we need to sync this extent when we call fsync().
302 * Returns: 0 on success
303 * -ENOENT when the extent is not found in the tree
304 * -EUCLEAN if the found extent does not match the expected start
306 int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
308 struct btrfs_fs_info *fs_info = inode->root->fs_info;
309 struct extent_map_tree *tree = &inode->extent_tree;
311 struct extent_map *em;
313 write_lock(&tree->lock);
314 em = lookup_extent_mapping(tree, start, len);
318 "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
319 btrfs_ino(inode), btrfs_root_id(inode->root),
320 start, start + len, gen);
325 if (WARN_ON(em->start != start)) {
327 "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
328 btrfs_ino(inode), btrfs_root_id(inode->root),
329 em->start, start, start + len, gen);
334 em->generation = gen;
335 em->flags &= ~EXTENT_FLAG_PINNED;
337 try_merge_map(inode, em);
340 write_unlock(&tree->lock);
346 void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
348 lockdep_assert_held_write(&inode->extent_tree.lock);
350 em->flags &= ~EXTENT_FLAG_LOGGING;
351 if (extent_map_in_tree(em))
352 try_merge_map(inode, em);
355 static inline void setup_extent_mapping(struct btrfs_inode *inode,
356 struct extent_map *em,
359 refcount_inc(&em->refs);
361 ASSERT(list_empty(&em->list));
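/*
 * A modified (new) extent map is linked into the inode's list of modified
 * extents so that a fast fsync can find and log it later.
 */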
364 list_add(&em->list, &inode->extent_tree.modified_extents);
366 try_merge_map(inode, em);
370 * Add a new extent map to an inode's extent map tree.
372 * @inode: the target inode
374 * @modified: indicate whether the given @em should be added to the
375 * modified list, which indicates the extent needs to be logged
377 * Insert @em into the @inode's extent map tree or perform a simple
378 * forward/backward merge with existing mappings. The extent_map struct passed
379 * in will be inserted into the tree directly, with an additional reference
380 * taken, or a reference dropped if the merge attempt was successful.
382 static int add_extent_mapping(struct btrfs_inode *inode,
383 struct extent_map *em, int modified)
385 struct extent_map_tree *tree = &inode->extent_tree;
386 struct btrfs_root *root = inode->root;
387 struct btrfs_fs_info *fs_info = root->fs_info;
390 lockdep_assert_held_write(&tree->lock);
392 ret = tree_insert(&tree->map, em);
396 setup_extent_mapping(inode, em, modified);
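/*
 * Account the new extent map as evictable so the extent map shrinker knows
 * how much memory it could reclaim (subvolume trees only).
 */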
398 if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
399 percpu_counter_inc(&fs_info->evictable_extent_maps);
404 static struct extent_map *
405 __lookup_extent_mapping(struct extent_map_tree *tree,
406 u64 start, u64 len, int strict)
408 struct extent_map *em;
409 struct rb_node *rb_node;
410 struct rb_node *prev_or_next = NULL;
411 u64 end = range_end(start, len);
413 rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
416 rb_node = prev_or_next;
421 em = rb_entry(rb_node, struct extent_map, rb_node);
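/*
 * In strict mode, only return the extent map if it really intersects the
 * requested [start, start + len) range; otherwise any nearby extent map found
 * by the tree search is acceptable.
 */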
423 if (strict && !(end > em->start && start < extent_map_end(em)))
426 refcount_inc(&em->refs);
431 * Lookup extent_map that intersects @start + @len range.
433 * @tree: tree to lookup in
434 * @start: byte offset to start the search
435 * @len: length of the lookup range
437 * Find and return the first extent_map struct in @tree that intersects the
[start, start + len) range. There may be additional objects in the tree that
439 * intersect, so check the object returned carefully to make sure that no
440 * additional lookups are needed.
442 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
445 return __lookup_extent_mapping(tree, start, len, 1);
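/*
 * Typical caller pattern (a sketch; unpin_extent_cache() in this file does
 * the same under the write lock): take the tree lock, look up, use the extent
 * map and then drop the lookup reference:
 *
 *	read_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	read_unlock(&tree->lock);
 *	if (em) {
 *		... use em->start, em->len, em->block_start ...
 *		free_extent_map(em);
 *	}
 */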
449 * Find a nearby extent map intersecting @start + @len (not an exact search).
451 * @tree: tree to lookup in
452 * @start: byte offset to start the search
453 * @len: length of the lookup range
Find and return the first extent_map struct in @tree that intersects the
[start, start + len) range.

If one can't be found, any nearby extent may be returned.
460 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
463 return __lookup_extent_mapping(tree, start, len, 0);
467 * Remove an extent_map from its inode's extent tree.
469 * @inode: the inode the extent map belongs to
470 * @em: extent map being removed
472 * Remove @em from the extent tree of @inode. No reference counts are dropped,
473 * and no checks are done to see if the range is in use.
475 void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
477 struct extent_map_tree *tree = &inode->extent_tree;
479 lockdep_assert_held_write(&tree->lock);
481 WARN_ON(em->flags & EXTENT_FLAG_PINNED);
482 rb_erase_cached(&em->rb_node, &tree->map);
483 if (!(em->flags & EXTENT_FLAG_LOGGING))
484 list_del_init(&em->list);
485 RB_CLEAR_NODE(&em->rb_node);
487 dec_evictable_extent_maps(inode);
490 static void replace_extent_mapping(struct btrfs_inode *inode,
491 struct extent_map *cur,
492 struct extent_map *new,
495 struct extent_map_tree *tree = &inode->extent_tree;
497 lockdep_assert_held_write(&tree->lock);
499 WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
500 ASSERT(extent_map_in_tree(cur));
501 if (!(cur->flags & EXTENT_FLAG_LOGGING))
502 list_del_init(&cur->list);
503 rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
504 RB_CLEAR_NODE(&cur->rb_node);
506 setup_extent_mapping(inode, new, modified);
509 static struct extent_map *next_extent_map(const struct extent_map *em)
511 struct rb_node *next;
513 next = rb_next(&em->rb_node);
516 return container_of(next, struct extent_map, rb_node);
519 static struct extent_map *prev_extent_map(struct extent_map *em)
521 struct rb_node *prev;
523 prev = rb_prev(&em->rb_node);
526 return container_of(prev, struct extent_map, rb_node);
Helper for btrfs_get_extent. Given an existing extent in the tree (the one
nearest to map_start) and a new extent that we want to insert, deal with any
overlap and insert the best-fitting new extent into the tree.
535 static noinline int merge_extent_mapping(struct btrfs_inode *inode,
536 struct extent_map *existing,
537 struct extent_map *em,
540 struct extent_map *prev;
541 struct extent_map *next;
546 if (map_start < em->start || map_start >= extent_map_end(em))
549 if (existing->start > map_start) {
551 prev = prev_extent_map(next);
554 next = next_extent_map(prev);
557 start = prev ? extent_map_end(prev) : em->start;
558 start = max_t(u64, start, em->start);
559 end = next ? next->start : extent_map_end(em);
560 end = min_t(u64, end, extent_map_end(em));
561 start_diff = start - em->start;
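/*
 * Adjust the file range to the clamped [start, end) and, for regular
 * uncompressed extents, shift and shrink the disk range by the same amount.
 */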
563 em->len = end - start;
564 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
565 !extent_map_is_compressed(em)) {
566 em->block_start += start_diff;
567 em->block_len = em->len;
569 return add_extent_mapping(inode, em, 0);
573 * Add extent mapping into an inode's extent map tree.
575 * @inode: target inode
576 * @em_in: extent we are inserting
577 * @start: start of the logical range btrfs_get_extent() is requesting
578 * @len: length of the logical range btrfs_get_extent() is requesting
Note that @em_in's range may be different from [start, start + len),
but they must overlap.
583 * Insert @em_in into the inode's extent map tree. In case there is an
584 * overlapping range, handle the -EEXIST by either:
a) Returning the existing extent in @em_in if @start is within the existing extent map's range.
587 * b) Merge the existing extent with @em_in passed in.
589 * Return 0 on success, otherwise -EEXIST.
592 int btrfs_add_extent_mapping(struct btrfs_inode *inode,
593 struct extent_map **em_in, u64 start, u64 len)
596 struct extent_map *em = *em_in;
597 struct btrfs_fs_info *fs_info = inode->root->fs_info;
600 * Tree-checker should have rejected any inline extent with non-zero
601 * file offset. Here just do a sanity check.
603 if (em->block_start == EXTENT_MAP_INLINE)
604 ASSERT(em->start == 0);
606 ret = add_extent_mapping(inode, em, 0);
/*
 * It is possible that someone inserted the extent into the tree while we had
 * the lock dropped. It is also possible that an overlapping map exists in
 * the tree.
 */
611 if (ret == -EEXIST) {
612 struct extent_map *existing;
614 existing = search_extent_mapping(&inode->extent_tree, start, len);
616 trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
619 * existing will always be non-NULL, since there must be
an extent causing the -EEXIST.
622 if (start >= existing->start &&
623 start < extent_map_end(existing)) {
628 u64 orig_start = em->start;
629 u64 orig_len = em->len;
632 * The existing extent map is the one nearest to
the [start, start + len) range which overlaps it.
635 ret = merge_extent_mapping(inode, existing, em, start);
640 "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
641 existing->start, extent_map_end(existing),
642 orig_start, orig_start + orig_len, start);
644 free_extent_map(existing);
648 ASSERT(ret == 0 || ret == -EEXIST);
653 * Drop all extent maps from a tree in the fastest possible way, rescheduling
654 * if needed. This avoids searching the tree, from the root down to the first
655 * extent map, before each deletion.
657 static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
659 struct extent_map_tree *tree = &inode->extent_tree;
661 write_lock(&tree->lock);
662 while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
663 struct extent_map *em;
664 struct rb_node *node;
666 node = rb_first_cached(&tree->map);
667 em = rb_entry(node, struct extent_map, rb_node);
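/*
 * Clear the pinned and logging flags unconditionally: we are dropping every
 * extent map, and remove_extent_mapping() warns on pinned ones and skips the
 * modified list removal for logged ones.
 */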
668 em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
669 remove_extent_mapping(inode, em);
671 cond_resched_rwlock_write(&tree->lock);
673 write_unlock(&tree->lock);
677 * Drop all extent maps in a given range.
679 * @inode: The target inode.
680 * @start: Start offset of the range.
681 * @end: End offset of the range (inclusive value).
682 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
684 * This drops all the extent maps that intersect the given range [@start, @end].
Extent maps that partially overlap the range and extend behind or beyond it are split.
687 * The caller should have locked an appropriate file range in the inode's io
688 * tree before calling this function.
690 void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
693 struct extent_map *split;
694 struct extent_map *split2;
695 struct extent_map *em;
696 struct extent_map_tree *em_tree = &inode->extent_tree;
697 u64 len = end - start + 1;
699 WARN_ON(end < start);
700 if (end == (u64)-1) {
701 if (start == 0 && !skip_pinned) {
702 drop_all_extent_maps_fast(inode);
707 /* Make end offset exclusive for use in the loop below. */
712 * It's ok if we fail to allocate the extent maps, see the comment near
713 * the bottom of the loop below. We only need two spare extent maps in
714 * the worst case, where the first extent map that intersects our range
715 * starts before the range and the last extent map that intersects our
716 * range ends after our range (and they might be the same extent map),
717 * because we need to split those two extent maps at the boundaries.
719 split = alloc_extent_map();
720 split2 = alloc_extent_map();
722 write_lock(&em_tree->lock);
723 em = lookup_extent_mapping(em_tree, start, len);
726 /* extent_map_end() returns exclusive value (last byte + 1). */
727 const u64 em_end = extent_map_end(em);
728 struct extent_map *next_em = NULL;
735 next_em = next_extent_map(em);
737 if (next_em->start < end)
738 refcount_inc(&next_em->refs);
744 if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
751 * In case we split the extent map, we want to preserve the
752 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
753 * it on the new extent maps.
755 em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
756 modified = !list_empty(&em->list);
759 * The extent map does not cross our target range, so no need to
760 * split it, we can remove it directly.
762 if (em->start >= start && em_end <= end)
765 gen = em->generation;
766 compressed = extent_map_is_compressed(em);
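/*
 * The extent map starts before our range: split off the front part
 * [em->start, start) into a new extent map that replaces the original in the
 * tree.
 */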
768 if (em->start < start) {
775 split->start = em->start;
776 split->len = start - em->start;
778 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
779 split->orig_start = em->orig_start;
780 split->block_start = em->block_start;
783 split->block_len = em->block_len;
785 split->block_len = split->len;
786 split->orig_block_len = max(split->block_len,
788 split->ram_bytes = em->ram_bytes;
790 split->orig_start = split->start;
791 split->block_len = 0;
792 split->block_start = em->block_start;
793 split->orig_block_len = 0;
794 split->ram_bytes = split->len;
797 split->generation = gen;
798 split->flags = flags;
799 replace_extent_mapping(inode, em, split, modified);
800 free_extent_map(split);
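/*
 * The extent map ends beyond our range: build a tail extent map for
 * [end, em_end) and either replace the original in the tree with it or, if
 * the original was already replaced above, add it as a new entry.
 */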
812 split->len = em_end - end;
813 split->block_start = em->block_start;
814 split->flags = flags;
815 split->generation = gen;
817 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
818 split->orig_block_len = max(em->block_len,
821 split->ram_bytes = em->ram_bytes;
823 split->block_len = em->block_len;
824 split->orig_start = em->orig_start;
826 const u64 diff = end - em->start;
828 split->block_len = split->len;
829 split->block_start += diff;
830 split->orig_start = em->orig_start;
833 split->ram_bytes = split->len;
834 split->orig_start = split->start;
835 split->block_len = 0;
836 split->orig_block_len = 0;
839 if (extent_map_in_tree(em)) {
840 replace_extent_mapping(inode, em, split, modified);
844 ret = add_extent_mapping(inode, split, modified);
845 /* Logic error, shouldn't happen. */
847 if (WARN_ON(ret != 0) && modified)
848 btrfs_set_inode_full_sync(inode);
850 free_extent_map(split);
854 if (extent_map_in_tree(em)) {
856 * If the extent map is still in the tree it means that
857 * either of the following is true:
859 * 1) It fits entirely in our range (doesn't end beyond
860 * it or starts before it);
862 * 2) It starts before our range and/or ends after our
863 * range, and we were not able to allocate the extent
864 * maps for split operations, @split and @split2.
866 * If we are at case 2) then we just remove the entire
extent map - this is fine since anyone who needs to
access the subranges outside our range will just
load it again from the subvolume tree's file extent
870 * item. However if the extent map was in the list of
871 * modified extents, then we must mark the inode for a
872 * full fsync, otherwise a fast fsync will miss this
873 * extent if it's new and needs to be logged.
875 if ((em->start < start || em_end > end) && modified) {
877 btrfs_set_inode_full_sync(inode);
879 remove_extent_mapping(inode, em);
883 * Once for the tree reference (we replaced or removed the
884 * extent map from the tree).
888 /* Once for us (for our lookup reference). */
894 write_unlock(&em_tree->lock);
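/*
 * Any unused (or already consumed and NULLed) split extent maps are released
 * here; free_extent_map() accepts a NULL pointer.
 */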
896 free_extent_map(split);
897 free_extent_map(split2);
901 * Replace a range in the inode's extent map tree with a new extent map.
903 * @inode: The target inode.
904 * @new_em: The new extent map to add to the inode's extent map tree.
905 * @modified: Indicate if the new extent map should be added to the list of
906 * modified extents (for fast fsync tracking).
908 * Drops all the extent maps in the inode's extent map tree that intersect the
909 * range of the new extent map and adds the new extent map to the tree.
910 * The caller should have locked an appropriate file range in the inode's io
911 * tree before calling this function.
913 int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
914 struct extent_map *new_em,
917 const u64 end = new_em->start + new_em->len - 1;
918 struct extent_map_tree *tree = &inode->extent_tree;
921 ASSERT(!extent_map_in_tree(new_em));
924 * The caller has locked an appropriate file range in the inode's io
925 * tree, but getting -EEXIST when adding the new extent map can still
926 * happen in case there are extents that partially cover the range, and
927 * this is due to two tasks operating on different parts of the extent.
928 * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
929 * btrfs_get_extent") for an example and details.
932 btrfs_drop_extent_map_range(inode, new_em->start, end, false);
933 write_lock(&tree->lock);
934 ret = add_extent_mapping(inode, new_em, modified);
935 write_unlock(&tree->lock);
936 } while (ret == -EEXIST);
Split off the first @pre bytes from the extent map at [start, start + len),
and set the block_start for it to @new_logical.
945 * This function is used when an ordered_extent needs to be split.
947 int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
950 struct extent_map_tree *em_tree = &inode->extent_tree;
951 struct extent_map *em;
952 struct extent_map *split_pre = NULL;
953 struct extent_map *split_mid = NULL;
960 split_pre = alloc_extent_map();
963 split_mid = alloc_extent_map();
969 lock_extent(&inode->io_tree, start, start + len - 1, NULL);
970 write_lock(&em_tree->lock);
971 em = lookup_extent_mapping(em_tree, start, len);
977 ASSERT(em->len == len);
978 ASSERT(!extent_map_is_compressed(em));
979 ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
980 ASSERT(em->flags & EXTENT_FLAG_PINNED);
981 ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
982 ASSERT(!list_empty(&em->list));
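/*
 * Clear the pinned flag on the original extent map so that replacing it in
 * the tree below does not trigger the pinned warning.
 */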
985 em->flags &= ~EXTENT_FLAG_PINNED;
/* First, replace the em with a new extent_map starting from em->start. */
988 split_pre->start = em->start;
989 split_pre->len = pre;
990 split_pre->orig_start = split_pre->start;
991 split_pre->block_start = new_logical;
992 split_pre->block_len = split_pre->len;
993 split_pre->orig_block_len = split_pre->block_len;
994 split_pre->ram_bytes = split_pre->len;
995 split_pre->flags = flags;
996 split_pre->generation = em->generation;
998 replace_extent_mapping(inode, em, split_pre, 1);
1001 * Now we only have an extent_map at:
1002 * [em->start, em->start + pre]
1005 /* Insert the middle extent_map. */
1006 split_mid->start = em->start + pre;
1007 split_mid->len = em->len - pre;
1008 split_mid->orig_start = split_mid->start;
1009 split_mid->block_start = em->block_start + pre;
1010 split_mid->block_len = split_mid->len;
1011 split_mid->orig_block_len = split_mid->block_len;
1012 split_mid->ram_bytes = split_mid->len;
1013 split_mid->flags = flags;
1014 split_mid->generation = em->generation;
1015 add_extent_mapping(inode, split_mid, 1);
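/* Once for us (the lookup reference taken above). */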
1018 free_extent_map(em);
1019 /* Once for the tree */
1020 free_extent_map(em);
1023 write_unlock(&em_tree->lock);
1024 unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
1025 free_extent_map(split_mid);
1027 free_extent_map(split_pre);
1031 static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
1033 const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
1034 struct extent_map_tree *tree = &inode->extent_tree;
1035 long nr_dropped = 0;
1036 struct rb_node *node;
1039 * Take the mmap lock so that we serialize with the inode logging phase
1040 * of fsync because we may need to set the full sync flag on the inode,
1041 * in case we have to remove extent maps in the tree's list of modified
1042 * extents. If we set the full sync flag in the inode while an fsync is
1043 * in progress, we may risk missing new extents because before the flag
1044 * is set, fsync decides to only wait for writeback to complete and then
1045 * during inode logging it sees the flag set and uses the subvolume tree
1046 * to find new extents, which may not be there yet because ordered
1047 * extents haven't completed yet.
1049 * We also do a try lock because otherwise we could deadlock. This is
1050 * because the shrinker for this filesystem may be invoked while we are
1051 * in a path that is holding the mmap lock in write mode. For example in
1052 * a reflink operation while COWing an extent buffer, when allocating
1053 * pages for a new extent buffer and under memory pressure, the shrinker
1054 * may be invoked, and therefore we would deadlock by attempting to read
lock the mmap lock while we are already holding a write lock on it.
1057 if (!down_read_trylock(&inode->i_mmap_lock))
1060 write_lock(&tree->lock);
1061 node = rb_first_cached(&tree->map);
1063 struct extent_map *em;
1065 em = rb_entry(node, struct extent_map, rb_node);
1066 node = rb_next(node);
1069 if (em->flags & EXTENT_FLAG_PINNED)
If the extent map is in the list of modified extents (new) and its
generation is the same as (or greater than) the current fs
generation, it means it was not yet persisted, so we have to
set the full sync flag so that the next fsync will not miss it.
1079 if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
1080 btrfs_set_inode_full_sync(inode);
1082 remove_extent_mapping(inode, em);
1083 trace_btrfs_extent_map_shrinker_remove_em(inode, em);
1084 /* Drop the reference for the tree. */
1085 free_extent_map(em);
1088 if (*scanned >= nr_to_scan)
1092 * Restart if we had to reschedule, and any extent maps that were
1093 * pinned before may have become unpinned after we released the
1094 * lock and took it again.
1096 if (cond_resched_rwlock_write(&tree->lock))
1097 node = rb_first_cached(&tree->map);
1099 write_unlock(&tree->lock);
1100 up_read(&inode->i_mmap_lock);
1105 static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
1107 struct btrfs_fs_info *fs_info = root->fs_info;
1108 struct btrfs_inode *inode;
1109 long nr_dropped = 0;
1110 u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
1112 inode = btrfs_find_first_inode(root, min_ino);
1114 nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
1116 min_ino = btrfs_ino(inode) + 1;
1117 fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
1118 iput(&inode->vfs_inode);
1120 if (*scanned >= nr_to_scan)
1124 inode = btrfs_find_first_inode(root, min_ino);
1129 * There are still inodes in this root or we happened to process
1130 * the last one and reached the scan limit. In either case set
1131 * the current root to this one, so we'll resume from the next
1132 * inode if there is one or we will find out this was the last
1133 * one and move to the next root.
1135 fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
1138 * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
1139 * that when processing the next root we start from its first inode.
1141 fs_info->extent_map_shrinker_last_ino = 0;
1142 fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
1148 long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
1150 const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
1151 u64 next_root_id = start_root_id;
1152 bool cycled = false;
1153 long nr_dropped = 0;
1156 if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
1157 s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
1159 trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
1162 while (scanned < nr_to_scan) {
1163 struct btrfs_root *root;
1164 unsigned long count;
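/*
 * Look up the next root at or after next_root_id in the radix tree of fs
 * roots; if nothing is found and we did not start from the first root, wrap
 * around once and retry from the beginning.
 */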
1166 spin_lock(&fs_info->fs_roots_radix_lock);
1167 count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
1169 (unsigned long)next_root_id, 1);
1171 spin_unlock(&fs_info->fs_roots_radix_lock);
1172 if (start_root_id > 0 && !cycled) {
1174 fs_info->extent_map_shrinker_last_root = 0;
1175 fs_info->extent_map_shrinker_last_ino = 0;
1181 next_root_id = btrfs_root_id(root) + 1;
1182 root = btrfs_grab_root(root);
1183 spin_unlock(&fs_info->fs_roots_radix_lock);
1188 if (is_fstree(btrfs_root_id(root)))
1189 nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
1191 btrfs_put_root(root);
1194 if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
1195 s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
1197 trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);