From: Linus Torvalds Date: Tue, 5 Apr 2022 15:59:37 +0000 (-0700) Subject: Merge tag 'for-5.18-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave... X-Git-Url: https://repo.jachan.dev/J-linux.git/commitdiff_plain/ce4c854ee8681bc66c1c369518b6594e93b11ee5?hp=-c Merge tag 'for-5.18-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux Pull btrfs fixes from David Sterba: - prevent deleting subvolume with active swapfile - fix qgroup reserve limit calculation overflow - remove device count in superblock and its item in one transaction so they can't get out of sync - skip defragmenting an isolated sector, this could cause some extra IO - unify handling of mtime/permissions in hole punch with fallocate - zoned mode fixes: - remove assert checking for only single mode, we have the DUP mode implemented - fix potential lockdep warning while traversing devices when checking for zone activation * tag 'for-5.18-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: prevent subvol with swapfile from being deleted btrfs: do not warn for free space inode in cow_file_range btrfs: avoid defragging extents whose next extents are not targets btrfs: fix fallocate to use file_modified to update permissions consistently btrfs: remove device item and update super block in the same transaction btrfs: fix qgroup reserve overflow the qgroup limit btrfs: zoned: remove left over ASSERT checking for single profile btrfs: zoned: traverse devices under chunk_mutex in btrfs_can_activate_zone --- ce4c854ee8681bc66c1c369518b6594e93b11ee5 diff --combined fs/btrfs/inode.c index 6bfc4343c98d,5aab6af88349..17d5557f98ec --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@@ -1128,7 -1128,6 +1128,6 @@@ static noinline int cow_file_range(stru int ret = 0; if (btrfs_is_free_space_inode(inode)) { - WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; } @@@ -4488,6 -4487,13 +4487,13 @@@ int btrfs_delete_subvolume(struct inod dest->root_key.objectid); return -EPERM; 
} + if (atomic_read(&dest->nr_swapfiles)) { + spin_unlock(&dest->root_item_lock); + btrfs_warn(fs_info, + "attempt to delete subvolume %llu with active swapfile", + root->root_key.objectid); + return -EPERM; + } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); @@@ -5080,17 -5086,16 +5086,17 @@@ static int btrfs_setattr(struct user_na } /* - * While truncating the inode pages during eviction, we get the VFS calling - * btrfs_invalidatepage() against each page of the inode. This is slow because - * the calls to btrfs_invalidatepage() result in a huge amount of calls to - * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting - * extent_state structures over and over, wasting lots of time. + * While truncating the inode pages during eviction, we get the VFS + * calling btrfs_invalidate_folio() against each folio of the inode. This + * is slow because the calls to btrfs_invalidate_folio() result in a + * huge amount of calls to lock_extent_bits() and clear_extent_bit(), + * which keep merging and splitting extent_state structures over and over, + * wasting lots of time. * - * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all - * those expensive operations on a per page basis and do only the ordered io - * finishing, while we release here the extent_map and extent_state structures, - * without the excessive merging and splitting. + * Therefore if the inode is being evicted, let btrfs_invalidate_folio() + * skip all those expensive operations on a per folio basis and do only + * the ordered io finishing, while we release here the extent_map and + * extent_state structures, without the excessive merging and splitting. */ static void evict_inode_truncate_pages(struct inode *inode) { @@@ -5156,7 -5161,7 +5162,7 @@@ * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. 
* So we need to free its reserved space here. - * (Refer to comment in btrfs_invalidatepage, case 2) + * (Refer to comment in btrfs_invalidate_folio, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here. */ @@@ -8179,8 -8184,8 +8185,8 @@@ static void btrfs_readahead(struct read } /* - * For releasepage() and invalidatepage() we have a race window where - * end_page_writeback() is called but the subpage spinlock is not yet released. + * For releasepage() and invalidate_folio() we have a race window where + * folio_end_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. @@@ -8256,48 -8261,48 +8262,48 @@@ static int btrfs_migratepage(struct add } #endif -static void btrfs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btrfs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; - u64 page_start = page_offset(page); - u64 page_end = page_start + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = page_start + folio_size(folio) - 1; u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* - * We have page locked so no new ordered extent can be created on this - * page, nor bio can be submitted for this page. + * We have folio locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this folio. * - * But already submitted bio can still be finished on this page. - * Furthermore, endio function won't skip page which has Ordered + * But already submitted bio can still be finished on this folio. 
+ * Furthermore, endio function won't skip folio which has Ordered * (Private2) already cleared, so it's possible for endio and - * invalidatepage to do the same ordered extent accounting twice - * on one page. + * invalidate_folio to do the same ordered extent accounting twice + * on one folio. * * So here we wait for any submitted bios to finish, so that we won't - * do double ordered extent accounting on the same page. + * do double ordered extent accounting on the same folio. */ - wait_on_page_writeback(page); - wait_subpage_spinlock(page); + folio_wait_writeback(folio); + wait_subpage_spinlock(&folio->page); /* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. - * If the range doesn't cover the full page, we don't need to and - * shouldn't clear page extent mapped, as page->private can still + * If the range doesn't cover the full folio, we don't need to and + * shouldn't clear page extent mapped, as folio->private can still * record subpage dirty bits for other part of the range. * - * For cases that can invalidate the full even the range doesn't - * cover the full page, like invalidating the last page, we're + * For cases that invalidate the full folio even the range doesn't + * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish. */ - if (!(offset == 0 && length == PAGE_SIZE)) { - btrfs_releasepage(page, GFP_NOFS); + if (!(offset == 0 && length == folio_size(folio))) { + btrfs_releasepage(&folio->page, GFP_NOFS); return; } @@@ -8338,7 -8343,7 +8344,7 @@@ page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. 
@@@ -8348,7 -8353,7 +8354,7 @@@ delete_states = false; goto next; } - btrfs_page_clear_ordered(fs_info, page, cur, range_len); + btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); /* * IO on this page will never be started, so we need to account @@@ -8418,11 -8423,11 +8424,11 @@@ next * should not have Ordered (Private2) anymore, or the above iteration * did something wrong. */ - ASSERT(!PageOrdered(page)); - btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); + ASSERT(!folio_test_ordered(folio)); + btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); if (!inode_evicting) - __btrfs_releasepage(page, GFP_NOFS); - clear_page_extent_mapped(page); + __btrfs_releasepage(&folio->page, GFP_NOFS); + clear_page_extent_mapped(&folio->page); } /* @@@ -8820,7 -8825,7 +8826,7 @@@ struct inode *btrfs_alloc_inode(struct struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@@ -10057,6 -10062,11 +10063,6 @@@ int btrfs_prealloc_file_range_trans(str min_size, actual_len, alloc_hint, trans); } -static int btrfs_set_page_dirty(struct page *page) -{ - return __set_page_dirty_nobuffers(page); -} - static int btrfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { @@@ -11107,8 -11117,23 +11113,23 @@@ static int btrfs_swap_activate(struct s * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. + * + * It is possible that subvolume is marked for deletion but still not + * removed yet. To prevent this race, we check the root status before + * activating the swapfile. 
*/ + spin_lock(&root->root_item_lock); + if (btrfs_root_dead(root)) { + spin_unlock(&root->root_item_lock); + + btrfs_exclop_finish(fs_info); + btrfs_warn(fs_info, + "cannot activate swapfile because subvolume %llu is being deleted", + root->root_key.objectid); + return -EPERM; + } atomic_inc(&root->nr_swapfiles); + spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); @@@ -11355,12 -11380,12 +11376,12 @@@ static const struct address_space_opera .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, - .invalidatepage = btrfs_invalidatepage, + .invalidate_folio = btrfs_invalidate_folio, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION .migratepage = btrfs_migratepage, #endif - .set_page_dirty = btrfs_set_page_dirty, + .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate,