From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 5 Apr 2022 15:59:37 +0000 (-0700)
Subject: Merge tag 'for-5.18-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave... 
X-Git-Url: https://repo.jachan.dev/J-linux.git/commitdiff_plain/ce4c854ee8681bc66c1c369518b6594e93b11ee5?hp=-c

Merge tag 'for-5.18-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - prevent deleting subvolume with active swapfile

 - fix qgroup reserve limit calculation overflow

 - remove device count in superblock and its item in one transaction so
   they can't get out of sync

 - skip defragmenting an isolated sector, as this could cause some extra IO

 - unify handling of mtime/permissions in hole punch with fallocate

 - zoned mode fixes:
     - remove assert checking for only single mode, since the DUP mode
       is now implemented
     - fix potential lockdep warning while traversing devices
       when checking for zone activation

* tag 'for-5.18-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: prevent subvol with swapfile from being deleted
  btrfs: do not warn for free space inode in cow_file_range
  btrfs: avoid defragging extents whose next extents are not targets
  btrfs: fix fallocate to use file_modified to update permissions consistently
  btrfs: remove device item and update super block in the same transaction
  btrfs: fix qgroup reserve overflow the qgroup limit
  btrfs: zoned: remove left over ASSERT checking for single profile
  btrfs: zoned: traverse devices under chunk_mutex in btrfs_can_activate_zone
---

ce4c854ee8681bc66c1c369518b6594e93b11ee5
diff --combined fs/btrfs/inode.c
index 6bfc4343c98d,5aab6af88349..17d5557f98ec
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -1128,7 -1128,6 +1128,6 @@@ static noinline int cow_file_range(stru
  	int ret = 0;
  
  	if (btrfs_is_free_space_inode(inode)) {
- 		WARN_ON_ONCE(1);
  		ret = -EINVAL;
  		goto out_unlock;
  	}
@@@ -4488,6 -4487,13 +4487,13 @@@ int btrfs_delete_subvolume(struct inod
  			   dest->root_key.objectid);
  		return -EPERM;
  	}
+ 	if (atomic_read(&dest->nr_swapfiles)) {
+ 		spin_unlock(&dest->root_item_lock);
+ 		btrfs_warn(fs_info,
+ 			   "attempt to delete subvolume %llu with active swapfile",
+ 			   root->root_key.objectid);
+ 		return -EPERM;
+ 	}
  	root_flags = btrfs_root_flags(&dest->root_item);
  	btrfs_set_root_flags(&dest->root_item,
  			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
@@@ -5080,17 -5086,16 +5086,17 @@@ static int btrfs_setattr(struct user_na
  }
  
  /*
 - * While truncating the inode pages during eviction, we get the VFS calling
 - * btrfs_invalidatepage() against each page of the inode. This is slow because
 - * the calls to btrfs_invalidatepage() result in a huge amount of calls to
 - * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
 - * extent_state structures over and over, wasting lots of time.
 + * While truncating the inode pages during eviction, we get the VFS
 + * calling btrfs_invalidate_folio() against each folio of the inode. This
 + * is slow because the calls to btrfs_invalidate_folio() result in a
 + * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
 + * which keep merging and splitting extent_state structures over and over,
 + * wasting lots of time.
   *
 - * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
 - * those expensive operations on a per page basis and do only the ordered io
 - * finishing, while we release here the extent_map and extent_state structures,
 - * without the excessive merging and splitting.
 + * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
 + * skip all those expensive operations on a per folio basis and do only
 + * the ordered io finishing, while we release here the extent_map and
 + * extent_state structures, without the excessive merging and splitting.
   */
  static void evict_inode_truncate_pages(struct inode *inode)
  {
@@@ -5156,7 -5161,7 +5162,7 @@@
  		 * If still has DELALLOC flag, the extent didn't reach disk,
  		 * and its reserved space won't be freed by delayed_ref.
  		 * So we need to free its reserved space here.
 -		 * (Refer to comment in btrfs_invalidatepage, case 2)
 +		 * (Refer to comment in btrfs_invalidate_folio, case 2)
  		 *
  		 * Note, end is the bytenr of last byte, so we need + 1 here.
  		 */
@@@ -8179,8 -8184,8 +8185,8 @@@ static void btrfs_readahead(struct read
  }
  
  /*
 - * For releasepage() and invalidatepage() we have a race window where
 - * end_page_writeback() is called but the subpage spinlock is not yet released.
 + * For releasepage() and invalidate_folio() we have a race window where
 + * folio_end_writeback() is called but the subpage spinlock is not yet released.
   * If we continue to release/invalidate the page, we could cause use-after-free
   * for subpage spinlock.  So this function is to spin and wait for subpage
   * spinlock.
@@@ -8256,48 -8261,48 +8262,48 @@@ static int btrfs_migratepage(struct add
  }
  #endif
  
 -static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 -				 unsigned int length)
 +static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
 +				 size_t length)
  {
 -	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
 +	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
  	struct extent_io_tree *tree = &inode->io_tree;
  	struct extent_state *cached_state = NULL;
 -	u64 page_start = page_offset(page);
 -	u64 page_end = page_start + PAGE_SIZE - 1;
 +	u64 page_start = folio_pos(folio);
 +	u64 page_end = page_start + folio_size(folio) - 1;
  	u64 cur;
  	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
  
  	/*
 -	 * We have page locked so no new ordered extent can be created on this
 -	 * page, nor bio can be submitted for this page.
 +	 * We have folio locked so no new ordered extent can be created on this
 +	 * page, nor bio can be submitted for this folio.
  	 *
 -	 * But already submitted bio can still be finished on this page.
 -	 * Furthermore, endio function won't skip page which has Ordered
 +	 * But already submitted bio can still be finished on this folio.
 +	 * Furthermore, endio function won't skip folio which has Ordered
  	 * (Private2) already cleared, so it's possible for endio and
 -	 * invalidatepage to do the same ordered extent accounting twice
 -	 * on one page.
 +	 * invalidate_folio to do the same ordered extent accounting twice
 +	 * on one folio.
  	 *
  	 * So here we wait for any submitted bios to finish, so that we won't
 -	 * do double ordered extent accounting on the same page.
 +	 * do double ordered extent accounting on the same folio.
  	 */
 -	wait_on_page_writeback(page);
 -	wait_subpage_spinlock(page);
 +	folio_wait_writeback(folio);
 +	wait_subpage_spinlock(&folio->page);
  
  	/*
  	 * For subpage case, we have call sites like
  	 * btrfs_punch_hole_lock_range() which passes range not aligned to
  	 * sectorsize.
 -	 * If the range doesn't cover the full page, we don't need to and
 -	 * shouldn't clear page extent mapped, as page->private can still
 +	 * If the range doesn't cover the full folio, we don't need to and
 +	 * shouldn't clear page extent mapped, as folio->private can still
  	 * record subpage dirty bits for other part of the range.
  	 *
 -	 * For cases that can invalidate the full even the range doesn't
 -	 * cover the full page, like invalidating the last page, we're
 +	 * For cases that invalidate the full folio even the range doesn't
 +	 * cover the full folio, like invalidating the last folio, we're
  	 * still safe to wait for ordered extent to finish.
  	 */
 -	if (!(offset == 0 && length == PAGE_SIZE)) {
 -		btrfs_releasepage(page, GFP_NOFS);
 +	if (!(offset == 0 && length == folio_size(folio))) {
 +		btrfs_releasepage(&folio->page, GFP_NOFS);
  		return;
  	}
  
@@@ -8338,7 -8343,7 +8344,7 @@@
  				page_end);
  		ASSERT(range_end + 1 - cur < U32_MAX);
  		range_len = range_end + 1 - cur;
 -		if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
 +		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
  			/*
  			 * If Ordered (Private2) is cleared, it means endio has
  			 * already been executed for the range.
@@@ -8348,7 -8353,7 +8354,7 @@@
  			delete_states = false;
  			goto next;
  		}
 -		btrfs_page_clear_ordered(fs_info, page, cur, range_len);
 +		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
  
  		/*
  		 * IO on this page will never be started, so we need to account
@@@ -8418,11 -8423,11 +8424,11 @@@ next
  	 * should not have Ordered (Private2) anymore, or the above iteration
  	 * did something wrong.
  	 */
 -	ASSERT(!PageOrdered(page));
 -	btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
 +	ASSERT(!folio_test_ordered(folio));
 +	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
  	if (!inode_evicting)
 -		__btrfs_releasepage(page, GFP_NOFS);
 -	clear_page_extent_mapped(page);
 +		__btrfs_releasepage(&folio->page, GFP_NOFS);
 +	clear_page_extent_mapped(&folio->page);
  }
  
  /*
@@@ -8820,7 -8825,7 +8826,7 @@@ struct inode *btrfs_alloc_inode(struct 
  	struct btrfs_inode *ei;
  	struct inode *inode;
  
 -	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
 +	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
  	if (!ei)
  		return NULL;
  
@@@ -10057,6 -10062,11 +10063,6 @@@ int btrfs_prealloc_file_range_trans(str
  					   min_size, actual_len, alloc_hint, trans);
  }
  
 -static int btrfs_set_page_dirty(struct page *page)
 -{
 -	return __set_page_dirty_nobuffers(page);
 -}
 -
  static int btrfs_permission(struct user_namespace *mnt_userns,
  			    struct inode *inode, int mask)
  {
@@@ -11107,8 -11117,23 +11113,23 @@@ static int btrfs_swap_activate(struct s
  	 * set. We use this counter to prevent snapshots. We must increment it
  	 * before walking the extents because we don't want a concurrent
  	 * snapshot to run after we've already checked the extents.
+ 	 *
+ 	 * It is possible that subvolume is marked for deletion but still not
+ 	 * removed yet. To prevent this race, we check the root status before
+ 	 * activating the swapfile.
  	 */
+ 	spin_lock(&root->root_item_lock);
+ 	if (btrfs_root_dead(root)) {
+ 		spin_unlock(&root->root_item_lock);
+ 
+ 		btrfs_exclop_finish(fs_info);
+ 		btrfs_warn(fs_info,
+ 		"cannot activate swapfile because subvolume %llu is being deleted",
+ 			root->root_key.objectid);
+ 		return -EPERM;
+ 	}
  	atomic_inc(&root->nr_swapfiles);
+ 	spin_unlock(&root->root_item_lock);
  
  	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
  
@@@ -11355,12 -11380,12 +11376,12 @@@ static const struct address_space_opera
  	.writepages	= btrfs_writepages,
  	.readahead	= btrfs_readahead,
  	.direct_IO	= noop_direct_IO,
 -	.invalidatepage = btrfs_invalidatepage,
 +	.invalidate_folio = btrfs_invalidate_folio,
  	.releasepage	= btrfs_releasepage,
  #ifdef CONFIG_MIGRATION
  	.migratepage	= btrfs_migratepage,
  #endif
 -	.set_page_dirty	= btrfs_set_page_dirty,
 +	.dirty_folio	= filemap_dirty_folio,
  	.error_remove_page = generic_error_remove_page,
  	.swap_activate	= btrfs_swap_activate,
  	.swap_deactivate = btrfs_swap_deactivate,