Merge tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <[email protected]>
Wed, 25 May 2022 01:52:35 +0000 (18:52 -0700)
committer Linus Torvalds <[email protected]>
Wed, 25 May 2022 01:52:35 +0000 (18:52 -0700)
Pull btrfs updates from David Sterba:
 "Features:

   - subpage:
      - support for PAGE_SIZE > 4K (previously only 64K)
      - make it work with raid56

   - repair super block num_devices automatically if it does not match
     the number of device items

   - defrag can convert inline extents to regular extents; up to now
     inline files were skipped, but the setting of the max_inline mount
     option could affect the decision logic

   - zoned:
      - minimal accepted zone size is explicitly set to 4MiB
      - make zone reclaim less aggressive and don't reclaim if there are
        enough free zones
      - add per-profile sysfs tunable of the reclaim threshold

   - allow automatic block group reclaim for non-zoned filesystems, with
     sysfs tunables

   - tree-checker: new check, compare extent buffer owner against owner
     rootid

  Performance:

   - avoid blocking on space reservation when doing nowait direct io
     writes (+7% throughput for reads and writes)

   - NOCOW write throughput improvement due to refined locking (+3%)

   - send: reduce pressure on the page cache by dropping extent pages
     right after they're processed

  Core:

   - convert all radix trees to xarray

   - add iterators for b-tree node items

   - support printk message index

   - use bulk page allocation for extent buffers

   - switch to bio_alloc API, use on-stack bios where convenient, other
     bio cleanups

   - use rw lock for block groups to favor concurrent reads

   - simplify workqueues, don't allocate high priority threads for all
     normal queues as we need only one

   - refactor scrub, process chunks based on their constraints and
     similarity

   - allocate direct io structures on the stack and pass around only
     pointers, which avoids an allocation and simplifies error handling

  Fixes:

   - fix count of reserved transaction items for various inode
     operations

   - fix deadlock between concurrent dio writes when low on free data
     space

   - fix a few cases when zones need to be finished

  VFS, iomap:

   - add helper to check if sb write has started (usable for assertions)

   - new helper iomap_dio_alloc_bio, export iomap_dio_bio_end_io"
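
The "convert all radix trees to xarray" item under Core is visible in the
fs/btrfs/disk-io.c diff further down, where fs_roots_radix becomes the XArray
fs_info->fs_roots. Below is a minimal standalone sketch of the same conversion
pattern using the generic XArray API; the my_root structure and function names
are made up for illustration and are not the real btrfs types:

  #include <linux/types.h>
  #include <linux/xarray.h>
  #include <linux/slab.h>

  /* Hypothetical record type standing in for struct btrfs_root. */
  struct my_root {
          u64 objectid;
  };

  static DEFINE_XARRAY(roots);    /* replaces INIT_RADIX_TREE() */

  /*
   * Insertion: xa_insert() allocates internally and fails with -EBUSY if
   * the index is already occupied, so no radix_tree_preload() dance is
   * needed anymore.
   */
  static int register_root(struct my_root *root)
  {
          return xa_insert(&roots, (unsigned long)root->objectid, root,
                           GFP_NOFS);
  }

  /* Lookup replaces radix_tree_lookup(). */
  static struct my_root *lookup_root(u64 objectid)
  {
          return xa_load(&roots, (unsigned long)objectid);
  }

  /* Iteration replaces the radix_tree_gang_lookup() loops. */
  static void drop_all_roots(void)
  {
          struct my_root *root;
          unsigned long index;

          xa_for_each(&roots, index, root) {
                  xa_erase(&roots, index);
                  kfree(root);
          }
  }

As the diff shows, the real code additionally takes fs_info->fs_roots_lock
around insert and erase so the BTRFS_ROOT_REGISTERED bit changes together with
the entry's presence in the array.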

* tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (173 commits)
  btrfs: zoned: introduce a minimal zone size 4M and reject mount
  btrfs: allow defrag to convert inline extents to regular extents
  btrfs: add "0x" prefix for unsupported optional features
  btrfs: do not account twice for inode ref when reserving metadata units
  btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer
  btrfs: send: avoid trashing the page cache
  btrfs: send: keep the current inode open while processing it
  btrfs: allocate the btrfs_dio_private as part of the iomap dio bio
  btrfs: move struct btrfs_dio_private to inode.c
  btrfs: remove the disk_bytenr in struct btrfs_dio_private
  btrfs: allocate dio_data on stack
  iomap: add per-iomap_iter private data
  iomap: allow the file system to provide a bio_set for direct I/O
  btrfs: add a btrfs_dio_rw wrapper
  btrfs: zoned: zone finish unused block group
  btrfs: zoned: properly finish block group on metadata write
  btrfs: zoned: finish block group when there are no more allocatable bytes left
  btrfs: zoned: consolidate zone finish functions
  btrfs: zoned: introduce btrfs_zoned_bg_is_full
  btrfs: improve error reporting in lookup_inline_extent_backref
  ...

fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/ioctl.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/zoned.c
fs/erofs/data.c
fs/f2fs/file.c
fs/iomap/direct-io.c
fs/zonefs/super.c
include/linux/fs.h

diff --combined fs/btrfs/disk-io.c
index 84795d831282b3152440354db4a2ddc3e0b594e7,f33093513360dc988b1d190827b479f50d45c150..14f8a90df3217b6c6cee48ae21a00a7657ccc64b
@@@ -5,7 -5,6 +5,6 @@@
  
  #include <linux/fs.h>
  #include <linux/blkdev.h>
- #include <linux/radix-tree.h>
  #include <linux/writeback.h>
  #include <linux/workqueue.h>
  #include <linux/kthread.h>
@@@ -374,9 -373,9 +373,9 @@@ int btrfs_verify_level_key(struct exten
   * @level:            expected level, mandatory check
   * @first_key:                expected key of first slot, skip check if NULL
   */
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
-                                         u64 parent_transid, int level,
-                                         struct btrfs_key *first_key)
int btrfs_read_extent_buffer(struct extent_buffer *eb,
+                            u64 parent_transid, int level,
+                            struct btrfs_key *first_key)
  {
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct extent_io_tree *io_tree;
@@@ -486,7 -485,7 +485,7 @@@ static int csum_dirty_subpage_buffers(s
                uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
                                                       fs_info->nodesize);
  
-               /* A dirty eb shouldn't disappear from buffer_radix */
+               /* A dirty eb shouldn't disappear from extent_buffers */
                if (WARN_ON(!eb))
                        return -EUCLEAN;
  
@@@ -519,7 -518,7 +518,7 @@@ static int csum_dirty_buffer(struct btr
        u64 found_start;
        struct extent_buffer *eb;
  
-       if (fs_info->sectorsize < PAGE_SIZE)
+       if (fs_info->nodesize < PAGE_SIZE)
                return csum_dirty_subpage_buffers(fs_info, bvec);
  
        eb = (struct extent_buffer *)page->private;
@@@ -704,7 -703,7 +703,7 @@@ int btrfs_validate_metadata_buffer(stru
  
        ASSERT(page->private);
  
-       if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+       if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
                return validate_subpage_buffer(page, start, end, mirror);
  
        eb = (struct extent_buffer *)page->private;
@@@ -850,8 -849,7 +849,7 @@@ static void run_one_async_free(struct b
  }
  
  blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
-                                int mirror_num, unsigned long bio_flags,
-                                u64 dio_file_offset,
+                                int mirror_num, u64 dio_file_offset,
                                 extent_submit_bio_start_t *submit_bio_start)
  {
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        async->status = 0;
  
        if (op_is_sync(bio->bi_opf))
-               btrfs_set_work_high_priority(&async->work);
-       btrfs_queue_work(fs_info->workers, &async->work);
+               btrfs_queue_work(fs_info->hipri_workers, &async->work);
+       else
+               btrfs_queue_work(fs_info->workers, &async->work);
        return 0;
  }
  
@@@ -920,8 -918,7 +918,7 @@@ static bool should_async_write(struct b
        return true;
  }
  
- blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
-                                      int mirror_num, unsigned long bio_flags)
+ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        blk_status_t ret;
                 */
                ret = btrfs_bio_wq_end_io(fs_info, bio,
                                          BTRFS_WQ_ENDIO_METADATA);
-               if (ret)
-                       goto out_w_error;
-               ret = btrfs_map_bio(fs_info, bio, mirror_num);
+               if (!ret)
+                       ret = btrfs_map_bio(fs_info, bio, mirror_num);
        } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
                ret = btree_csum_one_bio(bio);
-               if (ret)
-                       goto out_w_error;
-               ret = btrfs_map_bio(fs_info, bio, mirror_num);
+               if (!ret)
+                       ret = btrfs_map_bio(fs_info, bio, mirror_num);
        } else {
                /*
                 * kthread helpers are used to submit writes so that
                 * checksumming can happen in parallel across all CPUs
                 */
                ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
-                                         0, btree_submit_bio_start);
+                                         btree_submit_bio_start);
        }
  
-       if (ret)
-               goto out_w_error;
-       return 0;
- out_w_error:
-       bio->bi_status = ret;
-       bio_endio(bio);
-       return ret;
+       if (ret) {
+               bio->bi_status = ret;
+               bio_endio(bio);
+       }
  }
  
  #ifdef CONFIG_MIGRATION
@@@ -1118,12 -1109,15 +1109,15 @@@ struct extent_buffer *read_tree_block(s
        if (IS_ERR(buf))
                return buf;
  
-       ret = btree_read_extent_buffer_pages(buf, parent_transid,
-                                            level, first_key);
+       ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
        if (ret) {
                free_extent_buffer_stale(buf);
                return ERR_PTR(ret);
        }
+       if (btrfs_check_eb_owner(buf, owner_root)) {
+               free_extent_buffer_stale(buf);
+               return ERR_PTR(-EUCLEAN);
+       }
        return buf;
  
  }
@@@ -1164,7 -1158,7 +1158,7 @@@ static void __setup_root(struct btrfs_r
        root->nr_delalloc_inodes = 0;
        root->nr_ordered_extents = 0;
        root->inode_tree = RB_ROOT;
-       INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+       xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
  
        btrfs_init_root_block_rsv(root);
  
        btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
  #ifdef CONFIG_BTRFS_DEBUG
        INIT_LIST_HEAD(&root->leak_list);
-       spin_lock(&fs_info->fs_roots_radix_lock);
+       spin_lock(&fs_info->fs_roots_lock);
        list_add_tail(&root->leak_list, &fs_info->allocated_roots);
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       spin_unlock(&fs_info->fs_roots_lock);
  #endif
  }
  
@@@ -1563,6 -1557,23 +1557,23 @@@ static struct btrfs_root *read_tree_roo
                ret = -EIO;
                goto fail;
        }
+       /*
+        * For real fs, and not log/reloc trees, root owner must
+        * match its root node owner
+        */
+       if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
+           root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+           root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+           root->root_key.objectid != btrfs_header_owner(root->node)) {
+               btrfs_crit(fs_info,
+ "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
+                          root->root_key.objectid, root->node->start,
+                          btrfs_header_owner(root->node),
+                          root->root_key.objectid);
+               ret = -EUCLEAN;
+               goto fail;
+       }
        root->commit_root = btrfs_root_node(root);
        return root;
  fail:
@@@ -1648,12 -1659,11 +1659,11 @@@ static struct btrfs_root *btrfs_lookup_
  {
        struct btrfs_root *root;
  
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       root = radix_tree_lookup(&fs_info->fs_roots_radix,
-                                (unsigned long)root_id);
+       spin_lock(&fs_info->fs_roots_lock);
+       root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
        if (root)
                root = btrfs_grab_root(root);
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       spin_unlock(&fs_info->fs_roots_lock);
        return root;
  }
  
@@@ -1695,20 -1705,14 +1705,14 @@@ int btrfs_insert_fs_root(struct btrfs_f
  {
        int ret;
  
-       ret = radix_tree_preload(GFP_NOFS);
-       if (ret)
-               return ret;
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       ret = radix_tree_insert(&fs_info->fs_roots_radix,
-                               (unsigned long)root->root_key.objectid,
-                               root);
+       spin_lock(&fs_info->fs_roots_lock);
+       ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
+                       root, GFP_NOFS);
        if (ret == 0) {
                btrfs_grab_root(root);
-               set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+               set_bit(BTRFS_ROOT_REGISTERED, &root->state);
        }
-       spin_unlock(&fs_info->fs_roots_radix_lock);
-       radix_tree_preload_end();
+       spin_unlock(&fs_info->fs_roots_lock);
  
        return ret;
  }
@@@ -1964,7 -1968,7 +1968,7 @@@ static void end_workqueue_fn(struct btr
  
  static int cleaner_kthread(void *arg)
  {
-       struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
+       struct btrfs_fs_info *fs_info = arg;
        int again;
  
        while (1) {
@@@ -2266,10 -2270,12 +2270,12 @@@ static void btrfs_stop_all_workers(stru
  {
        btrfs_destroy_workqueue(fs_info->fixup_workers);
        btrfs_destroy_workqueue(fs_info->delalloc_workers);
+       btrfs_destroy_workqueue(fs_info->hipri_workers);
        btrfs_destroy_workqueue(fs_info->workers);
        btrfs_destroy_workqueue(fs_info->endio_workers);
        btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
-       btrfs_destroy_workqueue(fs_info->rmw_workers);
+       if (fs_info->rmw_workers)
+               destroy_workqueue(fs_info->rmw_workers);
        btrfs_destroy_workqueue(fs_info->endio_write_workers);
        btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
        btrfs_destroy_workqueue(fs_info->delayed_workers);
@@@ -2336,9 -2342,9 +2342,9 @@@ void btrfs_put_root(struct btrfs_root *
                btrfs_drew_lock_destroy(&root->snapshot_lock);
                free_root_extent_buffers(root);
  #ifdef CONFIG_BTRFS_DEBUG
-               spin_lock(&root->fs_info->fs_roots_radix_lock);
+               spin_lock(&root->fs_info->fs_roots_lock);
                list_del_init(&root->leak_list);
-               spin_unlock(&root->fs_info->fs_roots_radix_lock);
+               spin_unlock(&root->fs_info->fs_roots_lock);
  #endif
                kfree(root);
        }
  
  void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
  {
-       int ret;
-       struct btrfs_root *gang[8];
-       int i;
+       struct btrfs_root *root;
+       unsigned long index = 0;
  
        while (!list_empty(&fs_info->dead_roots)) {
-               gang[0] = list_entry(fs_info->dead_roots.next,
-                                    struct btrfs_root, root_list);
-               list_del(&gang[0]->root_list);
+               root = list_entry(fs_info->dead_roots.next,
+                                 struct btrfs_root, root_list);
+               list_del(&root->root_list);
  
-               if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
-                       btrfs_drop_and_free_fs_root(fs_info, gang[0]);
-               btrfs_put_root(gang[0]);
+               if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+                       btrfs_drop_and_free_fs_root(fs_info, root);
+               btrfs_put_root(root);
        }
  
-       while (1) {
-               ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
-                                            (void **)gang, 0,
-                                            ARRAY_SIZE(gang));
-               if (!ret)
-                       break;
-               for (i = 0; i < ret; i++)
-                       btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+       xa_for_each(&fs_info->fs_roots, index, root) {
+               btrfs_drop_and_free_fs_root(fs_info, root);
        }
  }
  
@@@ -2444,7 -2443,9 +2443,9 @@@ static int btrfs_init_workqueues(struc
        unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
  
        fs_info->workers =
-               btrfs_alloc_workqueue(fs_info, "worker",
+               btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
+       fs_info->hipri_workers =
+               btrfs_alloc_workqueue(fs_info, "worker-high",
                                      flags | WQ_HIGHPRI, max_active, 16);
  
        fs_info->delalloc_workers =
        fs_info->endio_raid56_workers =
                btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
                                      max_active, 4);
-       fs_info->rmw_workers =
-               btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
+       fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
        fs_info->endio_write_workers =
                btrfs_alloc_workqueue(fs_info, "endio-write", flags,
                                      max_active, 2);
        fs_info->discard_ctl.discard_workers =
                alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
  
-       if (!(fs_info->workers && fs_info->delalloc_workers &&
-             fs_info->flush_workers &&
+       if (!(fs_info->workers && fs_info->hipri_workers &&
+             fs_info->delalloc_workers && fs_info->flush_workers &&
              fs_info->endio_workers && fs_info->endio_meta_workers &&
              fs_info->endio_meta_write_workers &&
              fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
@@@ -2815,12 -2815,14 +2815,14 @@@ static int validate_super(struct btrfs_
        }
  
        /*
-        * For 4K page size, we only support 4K sector size.
-        * For 64K page size, we support 64K and 4K sector sizes.
+        * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+        *
+        * We can support 16K sectorsize with 64K page size without problem,
+        * but such sectorsize/pagesize combination doesn't make much sense.
+        * 4K will be our future standard, PAGE_SIZE is supported from the very
+        * beginning.
         */
-       if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
-           (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
-                                    sectorsize != SZ_64K))) {
+       if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
                btrfs_err(fs_info,
                        "sectorsize %llu not yet supported for page size %lu",
                        sectorsize, PAGE_SIZE);
@@@ -3132,8 -3134,8 +3134,8 @@@ static int __cold init_tree_roots(struc
  
  void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
  {
-       INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-       INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+       xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
+       xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_root_lock);
        spin_lock_init(&fs_info->trans_lock);
-       spin_lock_init(&fs_info->fs_roots_radix_lock);
+       spin_lock_init(&fs_info->fs_roots_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
        spin_lock_init(&fs_info->super_lock);
        btrfs_init_balance(fs_info);
        btrfs_init_async_reclaim_work(fs_info);
  
-       spin_lock_init(&fs_info->block_group_cache_lock);
-       fs_info->block_group_cache_tree = RB_ROOT;
-       fs_info->first_logical_byte = (u64)-1;
+       rwlock_init(&fs_info->block_group_cache_lock);
+       fs_info->block_group_cache_tree = RB_ROOT_CACHED;
  
        extent_io_tree_init(fs_info, &fs_info->excluded_extents,
                            IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
@@@ -3295,7 -3296,7 +3296,7 @@@ static int init_mount_fs_info(struct bt
  
  static int btrfs_uuid_rescan_kthread(void *data)
  {
-       struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+       struct btrfs_fs_info *fs_info = data;
        int ret;
  
        /*
@@@ -3373,7 -3374,7 +3374,7 @@@ int btrfs_start_pre_rw_mount(struct btr
        /*
         * btrfs_find_orphan_roots() is responsible for finding all the dead
         * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
-        * them into the fs_info->fs_roots_radix tree. This must be done before
+        * them into the fs_info->fs_roots. This must be done before
         * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
         * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
         * item before the root's tree is deleted - this means that if we unmount
@@@ -3611,7 -3612,7 +3612,7 @@@ int __cold open_ctree(struct super_bloc
                ~BTRFS_FEATURE_INCOMPAT_SUPP;
        if (features) {
                btrfs_err(fs_info,
-                   "cannot mount because of unsupported optional features (%llx)",
+                   "cannot mount because of unsupported optional features (0x%llx)",
                    features);
                err = -EINVAL;
                goto fail_alloc;
                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
        if (!sb_rdonly(sb) && features) {
                btrfs_err(fs_info,
-       "cannot mount read-write because of unsupported optional features (%llx)",
+       "cannot mount read-write because of unsupported optional features (0x%llx)",
                       features);
                err = -EINVAL;
                goto fail_alloc;
                btrfs_warn(fs_info,
                "read-write for sector size %u with page size %lu is experimental",
                           sectorsize, PAGE_SIZE);
-               if (btrfs_super_incompat_flags(fs_info->super_copy) &
-                       BTRFS_FEATURE_INCOMPAT_RAID56) {
-                       btrfs_err(fs_info,
-               "RAID56 is not yet supported for sector size %u with page size %lu",
-                               sectorsize, PAGE_SIZE);
-                       err = -EINVAL;
-                       goto fail_alloc;
-               }
                subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
                if (!subpage_info)
                        goto fail_alloc;
@@@ -4157,7 -4150,8 +4150,8 @@@ static int write_dev_supers(struct btrf
                if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
                        bio->bi_opf |= REQ_FUA;
  
-               btrfsic_submit_bio(bio);
+               btrfsic_check_bio(bio);
+               submit_bio(bio);
  
                if (btrfs_advance_sb_log(device, i))
                        errors++;
@@@ -4238,7 -4232,6 +4232,7 @@@ static int wait_dev_supers(struct btrfs
   */
  static void btrfs_end_empty_barrier(struct bio *bio)
  {
 +      bio_uninit(bio);
        complete(bio->bi_private);
  }
  
   */
  static void write_dev_flush(struct btrfs_device *device)
  {
 -      struct bio *bio = device->flush_bio;
 +      struct bio *bio = &device->flush_bio;
  
  #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        /*
         * of simplicity, since this is a debug tool and not meant for use in
         * non-debug builds.
         */
 -      struct request_queue *q = bdev_get_queue(device->bdev);
 -      if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
 +      if (!bdev_write_cache(device->bdev))
                return;
  #endif
  
 -      bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
 +      bio_init(bio, device->bdev, NULL, 0,
 +               REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
        bio->bi_end_io = btrfs_end_empty_barrier;
        init_completion(&device->flush_wait);
        bio->bi_private = &device->flush_wait;
  
-       btrfsic_submit_bio(bio);
+       btrfsic_check_bio(bio);
+       submit_bio(bio);
        set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
  }
  
   */
  static blk_status_t wait_dev_flush(struct btrfs_device *device)
  {
 -      struct bio *bio = device->flush_bio;
 +      struct bio *bio = &device->flush_bio;
  
        if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
                return BLK_STS_OK;
@@@ -4504,12 -4498,11 +4499,11 @@@ void btrfs_drop_and_free_fs_root(struc
  {
        bool drop_ref = false;
  
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       radix_tree_delete(&fs_info->fs_roots_radix,
-                         (unsigned long)root->root_key.objectid);
-       if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+       spin_lock(&fs_info->fs_roots_lock);
+       xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
+       if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
                drop_ref = true;
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       spin_unlock(&fs_info->fs_roots_lock);
  
        if (BTRFS_FS_ERROR(fs_info)) {
                ASSERT(root->log_root == NULL);
  
  int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
  {
-       u64 root_objectid = 0;
-       struct btrfs_root *gang[8];
-       int i = 0;
+       struct btrfs_root *roots[8];
+       unsigned long index = 0;
+       int i;
        int err = 0;
-       unsigned int ret = 0;
+       int grabbed;
  
        while (1) {
-               spin_lock(&fs_info->fs_roots_radix_lock);
-               ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
-                                            (void **)gang, root_objectid,
-                                            ARRAY_SIZE(gang));
-               if (!ret) {
-                       spin_unlock(&fs_info->fs_roots_radix_lock);
-                       break;
+               struct btrfs_root *root;
+               spin_lock(&fs_info->fs_roots_lock);
+               if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
+                       spin_unlock(&fs_info->fs_roots_lock);
+                       return err;
                }
-               root_objectid = gang[ret - 1]->root_key.objectid + 1;
  
-               for (i = 0; i < ret; i++) {
-                       /* Avoid to grab roots in dead_roots */
-                       if (btrfs_root_refs(&gang[i]->root_item) == 0) {
-                               gang[i] = NULL;
-                               continue;
-                       }
-                       /* grab all the search result for later use */
-                       gang[i] = btrfs_grab_root(gang[i]);
+               grabbed = 0;
+               xa_for_each_start(&fs_info->fs_roots, index, root, index) {
+                       /* Avoid grabbing roots in dead_roots */
+                       if (btrfs_root_refs(&root->root_item) > 0)
+                               roots[grabbed++] = btrfs_grab_root(root);
+                       if (grabbed >= ARRAY_SIZE(roots))
+                               break;
                }
-               spin_unlock(&fs_info->fs_roots_radix_lock);
+               spin_unlock(&fs_info->fs_roots_lock);
  
-               for (i = 0; i < ret; i++) {
-                       if (!gang[i])
+               for (i = 0; i < grabbed; i++) {
+                       if (!roots[i])
                                continue;
-                       root_objectid = gang[i]->root_key.objectid;
-                       err = btrfs_orphan_cleanup(gang[i]);
+                       index = roots[i]->root_key.objectid;
+                       err = btrfs_orphan_cleanup(roots[i]);
                        if (err)
-                               break;
-                       btrfs_put_root(gang[i]);
+                               goto out;
+                       btrfs_put_root(roots[i]);
                }
-               root_objectid++;
+               index++;
        }
  
-       /* release the uncleaned roots due to error */
-       for (; i < ret; i++) {
-               if (gang[i])
-                       btrfs_put_root(gang[i]);
+ out:
+       /* Release the roots that remain uncleaned due to error */
+       for (; i < grabbed; i++) {
+               if (roots[i])
+                       btrfs_put_root(roots[i]);
        }
        return err;
  }
@@@ -4863,13 -4854,6 +4855,6 @@@ void btrfs_btree_balance_dirty_nodelay(
        __btrfs_btree_balance_dirty(fs_info, 0);
  }
  
- int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
-                     struct btrfs_key *first_key)
- {
-       return btree_read_extent_buffer_pages(buf, parent_transid,
-                                             level, first_key);
- }
  static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
  {
        /* cleanup FS via transaction */
  
  static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_root *gang[8];
-       u64 root_objectid = 0;
-       int ret;
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
-                                            (void **)gang, root_objectid,
-                                            ARRAY_SIZE(gang))) != 0) {
-               int i;
-               for (i = 0; i < ret; i++)
-                       gang[i] = btrfs_grab_root(gang[i]);
-               spin_unlock(&fs_info->fs_roots_radix_lock);
-               for (i = 0; i < ret; i++) {
-                       if (!gang[i])
+       unsigned long index = 0;
+       int grabbed = 0;
+       struct btrfs_root *roots[8];
+       spin_lock(&fs_info->fs_roots_lock);
+       while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
+                                    ULONG_MAX, 8, XA_PRESENT))) {
+               for (int i = 0; i < grabbed; i++)
+                       roots[i] = btrfs_grab_root(roots[i]);
+               spin_unlock(&fs_info->fs_roots_lock);
+               for (int i = 0; i < grabbed; i++) {
+                       if (!roots[i])
                                continue;
-                       root_objectid = gang[i]->root_key.objectid;
-                       btrfs_free_log(NULL, gang[i]);
-                       btrfs_put_root(gang[i]);
+                       index = roots[i]->root_key.objectid;
+                       btrfs_free_log(NULL, roots[i]);
+                       btrfs_put_root(roots[i]);
                }
-               root_objectid++;
-               spin_lock(&fs_info->fs_roots_radix_lock);
+               index++;
+               spin_lock(&fs_info->fs_roots_lock);
        }
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       spin_unlock(&fs_info->fs_roots_lock);
        btrfs_free_log_root_tree(NULL, fs_info);
  }
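
The write_dev_flush()/wait_dev_flush() hunks above also illustrate the "use
on-stack bios where convenient" item from Core: device->flush_bio becomes a
struct bio embedded in the device and is re-initialized with bio_init()
instead of being allocated. A simplified sketch of that pattern, assuming a
hypothetical my_device structure rather than the real struct btrfs_device:

  #include <linux/bio.h>
  #include <linux/blkdev.h>
  #include <linux/completion.h>

  struct my_device {
          struct block_device *bdev;
          struct bio flush_bio;           /* embedded, never allocated */
          struct completion flush_wait;
  };

  static void flush_end_io(struct bio *bio)
  {
          bio_uninit(bio);                /* pairs with bio_init() below */
          complete(bio->bi_private);
  }

  /* Issue an empty preflush; returns false if there is no write cache. */
  static bool send_empty_flush(struct my_device *dev)
  {
          struct bio *bio = &dev->flush_bio;

          if (!bdev_write_cache(dev->bdev))
                  return false;           /* no volatile cache to flush */

          bio_init(bio, dev->bdev, NULL, 0,
                   REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
          bio->bi_end_io = flush_end_io;
          init_completion(&dev->flush_wait);
          bio->bi_private = &dev->flush_wait;
          submit_bio(bio);
          return true;
  }

  static void wait_empty_flush(struct my_device *dev)
  {
          wait_for_completion_io(&dev->flush_wait);
  }

A caller would only wait when send_empty_flush() returned true; the real code
tracks this with the BTRFS_DEV_STATE_FLUSH_SENT bit instead of a return value.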
  
diff --combined fs/btrfs/extent-tree.c
index 6260784e74b5ae66b7ef6559ac67ba936376d2c6,fb367689d9d20e6b7c5e1c6e0a4298873735bf52..0867c5cd6e017a79865cb2f7334709021469bf1e
@@@ -895,7 -895,13 +895,13 @@@ again
        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
-                       WARN_ON(ptr > end);
+                       if (ptr > end) {
+                               err = -EUCLEAN;
+                               btrfs_print_leaf(path->nodes[0]);
+                               btrfs_crit(fs_info,
+ "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+                                       path->slots[0], root_objectid, owner, offset, parent);
+                       }
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
@@@ -1239,7 -1245,7 +1245,7 @@@ static int btrfs_issue_discard(struct b
  
                if (size) {
                        ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
 -                                                 GFP_NOFS, 0);
 +                                                 GFP_NOFS);
                        if (!ret)
                                *discarded_bytes += size;
                        else if (ret != -EOPNOTSUPP)
  
        if (bytes_left) {
                ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
 -                                         GFP_NOFS, 0);
 +                                         GFP_NOFS);
                if (!ret)
                        *discarded_bytes += bytes_left;
        }
@@@ -1291,7 -1297,7 +1297,7 @@@ static int do_discard_extent(struct btr
                ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
                                              &discarded);
                discarded += src_disc;
 -      } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
 +      } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
                ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
        } else {
                ret = 0;
@@@ -1577,12 -1583,12 +1583,12 @@@ static int run_delayed_extent_op(struc
        u32 item_size;
        int ret;
        int err = 0;
-       int metadata = !extent_op->is_data;
+       int metadata = 1;
  
        if (TRANS_ABORTED(trans))
                return 0;
  
-       if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+       if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
                metadata = 0;
  
        path = btrfs_alloc_path();
@@@ -2180,7 -2186,7 +2186,7 @@@ out
  
  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct extent_buffer *eb, u64 flags,
-                               int level, int is_data)
+                               int level)
  {
        struct btrfs_delayed_extent_op *extent_op;
        int ret;
        extent_op->flags_to_set = flags;
        extent_op->update_flags = true;
        extent_op->update_key = false;
-       extent_op->is_data = is_data ? true : false;
        extent_op->level = level;
  
        ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
  }
  
  int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
-                         u64 bytenr, bool strict)
+                         u64 bytenr, bool strict, struct btrfs_path *path)
  {
-       struct btrfs_path *path;
        int ret;
  
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
        do {
                ret = check_committed_ref(root, path, objectid,
                                          offset, bytenr, strict);
        } while (ret == -EAGAIN);
  
  out:
-       btrfs_free_path(path);
+       btrfs_release_path(path);
        if (btrfs_is_data_reloc_root(root))
                WARN_ON(ret > 0);
        return ret;
@@@ -2497,24 -2497,21 +2497,21 @@@ static u64 get_alloc_profile_by_root(st
        return ret;
  }
  
- static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
+ static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_block_group *cache;
-       u64 bytenr;
-       spin_lock(&fs_info->block_group_cache_lock);
-       bytenr = fs_info->first_logical_byte;
-       spin_unlock(&fs_info->block_group_cache_lock);
-       if (bytenr < (u64)-1)
-               return bytenr;
+       struct rb_node *leftmost;
+       u64 bytenr = 0;
  
-       cache = btrfs_lookup_first_block_group(fs_info, search_start);
-       if (!cache)
-               return 0;
+       read_lock(&fs_info->block_group_cache_lock);
+       /* Get the block group with the lowest logical start address. */
+       leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
+       if (leftmost) {
+               struct btrfs_block_group *bg;
  
-       bytenr = cache->start;
-       btrfs_put_block_group(cache);
+               bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
+               bytenr = bg->start;
+       }
+       read_unlock(&fs_info->block_group_cache_lock);
  
        return bytenr;
  }
@@@ -3803,8 -3800,7 +3800,7 @@@ static int do_allocation_zoned(struct b
  
        /* Check RO and no space case before trying to activate it */
        spin_lock(&block_group->lock);
-       if (block_group->ro ||
-           block_group->alloc_offset == block_group->zone_capacity) {
+       if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
                ret = 1;
                /*
                 * May need to clear fs_info->{treelog,data_reloc}_bg.
@@@ -4272,7 -4268,7 +4268,7 @@@ static noinline int find_free_extent(st
                return ret;
  
        ffe_ctl->search_start = max(ffe_ctl->search_start,
-                                   first_logical_byte(fs_info, 0));
+                                   first_logical_byte(fs_info));
        ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
        if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
                block_group = btrfs_lookup_block_group(fs_info,
@@@ -4959,7 -4955,6 +4955,6 @@@ struct extent_buffer *btrfs_alloc_tree_
                extent_op->flags_to_set = flags;
                extent_op->update_key = skinny_metadata ? false : true;
                extent_op->update_flags = true;
-               extent_op->is_data = false;
                extent_op->level = level;
  
                btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
@@@ -5144,7 -5139,7 +5139,7 @@@ static noinline int walk_down_proc(stru
                ret = btrfs_dec_ref(trans, root, eb, 0);
                BUG_ON(ret); /* -ENOMEM */
                ret = btrfs_set_disk_extent_flags(trans, eb, flag,
-                                                 btrfs_header_level(eb), 0);
+                                                 btrfs_header_level(eb));
                BUG_ON(ret); /* -ENOMEM */
                wc->flags[level] |= flag;
        }
@@@ -5818,7 -5813,7 +5813,7 @@@ int btrfs_drop_snapshot(struct btrfs_ro
        btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
        btrfs_qgroup_free_meta_all_pertrans(root);
  
-       if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+       if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
                btrfs_add_dropped_root(trans, root);
        else
                btrfs_put_root(root);
@@@ -5987,7 -5982,7 +5982,7 @@@ static int btrfs_trim_free_extents(stru
        *trimmed = 0;
  
        /* Discard not supported = nothing to do. */
 -      if (!blk_queue_discard(bdev_get_queue(device->bdev)))
 +      if (!bdev_max_discard_sectors(device->bdev))
                return 0;
  
        /* Not writable = nothing to do. */
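
The do_discard_extent() and btrfs_trim_free_extents() hunks above, like the
FITRIM hunk in the fs/btrfs/ioctl.c diff below, replace request-queue flag
checks with block-device helpers: blk_queue_discard(bdev_get_queue(bdev))
becomes bdev_max_discard_sectors(bdev), and the granularity is read with
bdev_discard_granularity(). A small sketch of a FITRIM-style capability check
using the new helpers; the function name is made up for illustration:

  #include <linux/blkdev.h>

  /*
   * Report whether a device supports discard and fold its granularity
   * into the minimum trim length the caller should accept.
   */
  static bool device_can_trim(struct block_device *bdev, u64 *minlen)
  {
          if (!bdev_max_discard_sectors(bdev))
                  return false;           /* discard not supported */

          *minlen = min_t(u64, bdev_discard_granularity(bdev), *minlen);
          return true;
  }
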
diff --combined fs/btrfs/ioctl.c
index b2c692b2fd8d35d1e631ae5d1ea8b103acf2f31f,fdc23d1b72162b47a2433022d3c4cec4529e8bbd..43b6f23bbd8926a6518636f6ac407131b709c9d9
@@@ -468,6 -468,7 +468,6 @@@ static noinline int btrfs_ioctl_fitrim(
                                        void __user *arg)
  {
        struct btrfs_device *device;
 -      struct request_queue *q;
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
        rcu_read_lock();
        list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
                                dev_list) {
 -              if (!device->bdev)
 +              if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
                        continue;
 -              q = bdev_get_queue(device->bdev);
 -              if (blk_queue_discard(q)) {
 -                      num_devices++;
 -                      minlen = min_t(u64, q->limits.discard_granularity,
 -                                   minlen);
 -              }
 +              num_devices++;
 +              minlen = min_t(u64, bdev_discard_granularity(device->bdev),
 +                                  minlen);
        }
        rcu_read_unlock();
  
@@@ -540,9 -544,35 +540,35 @@@ int __pure btrfs_is_empty_uuid(u8 *uuid
        return 1;
  }
  
+ /*
+  * Calculate the number of transaction items to reserve for creating a subvolume
+  * or snapshot, not including the inode, directory entries, or parent directory.
+  */
+ static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+ {
+       /*
+        * 1 to add root block
+        * 1 to add root item
+        * 1 to add root ref
+        * 1 to add root backref
+        * 1 to add UUID item
+        * 1 to add qgroup info
+        * 1 to add qgroup limit
+        *
+        * Ideally the last two would only be accounted if qgroups are enabled,
+        * but that can change between now and the time we would insert them.
+        */
+       unsigned int num_items = 7;
+       if (inherit) {
+               /* 2 to add qgroup relations for each inherited qgroup */
+               num_items += 2 * inherit->num_qgroups;
+       }
+       return num_items;
+ }
  static noinline int create_subvol(struct user_namespace *mnt_userns,
                                  struct inode *dir, struct dentry *dentry,
-                                 const char *name, int namelen,
                                  struct btrfs_qgroup_inherit *inherit)
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct btrfs_root *new_root;
        struct btrfs_block_rsv block_rsv;
        struct timespec64 cur_time = current_time(dir);
-       struct inode *inode;
+       struct btrfs_new_inode_args new_inode_args = {
+               .dir = dir,
+               .dentry = dentry,
+               .subvol = true,
+       };
+       unsigned int trans_num_items;
        int ret;
-       dev_t anon_dev = 0;
+       dev_t anon_dev;
        u64 objectid;
-       u64 index = 0;
  
        root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
        if (!root_item)
  
        ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
        if (ret)
-               goto fail_free;
-       ret = get_anon_bdev(&anon_dev);
-       if (ret < 0)
-               goto fail_free;
+               goto out_root_item;
  
        /*
         * Don't create subvolume whose level is not zero. Or qgroup will be
         */
        if (btrfs_qgroup_level(objectid)) {
                ret = -ENOSPC;
-               goto fail_free;
+               goto out_root_item;
        }
  
+       ret = get_anon_bdev(&anon_dev);
+       if (ret < 0)
+               goto out_root_item;
+       new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
+       if (!new_inode_args.inode) {
+               ret = -ENOMEM;
+               goto out_anon_dev;
+       }
+       ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+       if (ret)
+               goto out_inode;
+       trans_num_items += create_subvol_num_items(inherit);
        btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
-       /*
-        * The same as the snapshot creation, please see the comment
-        * of create_snapshot().
-        */
-       ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
+       ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+                                              trans_num_items, false);
        if (ret)
-               goto fail_free;
+               goto out_new_inode_args;
  
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                btrfs_subvolume_release_metadata(root, &block_rsv);
-               goto fail_free;
+               goto out_new_inode_args;
        }
        trans->block_rsv = &block_rsv;
        trans->bytes_reserved = block_rsv.size;
  
        ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
        if (ret)
-               goto fail;
+               goto out;
  
        leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
                                      BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
-               goto fail;
+               goto out;
        }
  
        btrfs_mark_buffer_dirty(leaf);
                btrfs_tree_unlock(leaf);
                btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
                free_extent_buffer(leaf);
-               goto fail;
+               goto out;
        }
  
        free_extent_buffer(leaf);
        leaf = NULL;
  
-       key.offset = (u64)-1;
        new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
        if (IS_ERR(new_root)) {
-               free_anon_bdev(anon_dev);
                ret = PTR_ERR(new_root);
                btrfs_abort_transaction(trans, ret);
-               goto fail;
+               goto out;
        }
-       /* Freeing will be done in btrfs_put_root() of new_root */
+       /* anon_dev is owned by new_root now. */
        anon_dev = 0;
+       BTRFS_I(new_inode_args.inode)->root = new_root;
+       /* ... and new_root is owned by new_inode_args.inode now. */
  
        ret = btrfs_record_root_in_trans(trans, new_root);
        if (ret) {
-               btrfs_put_root(new_root);
-               btrfs_abort_transaction(trans, ret);
-               goto fail;
-       }
-       ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
-       btrfs_put_root(new_root);
-       if (ret) {
-               /* We potentially lose an unused inode item here */
                btrfs_abort_transaction(trans, ret);
-               goto fail;
-       }
-       /*
-        * insert the directory item
-        */
-       ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
-       if (ret) {
-               btrfs_abort_transaction(trans, ret);
-               goto fail;
-       }
-       ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
-                                   BTRFS_FT_DIR, index);
-       if (ret) {
-               btrfs_abort_transaction(trans, ret);
-               goto fail;
+               goto out;
        }
  
-       btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
-       ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+       ret = btrfs_uuid_tree_add(trans, root_item->uuid,
+                                 BTRFS_UUID_KEY_SUBVOL, objectid);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
-               goto fail;
+               goto out;
        }
  
-       ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
-                                btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+       ret = btrfs_create_new_inode(trans, &new_inode_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
-               goto fail;
+               goto out;
        }
  
-       ret = btrfs_uuid_tree_add(trans, root_item->uuid,
-                                 BTRFS_UUID_KEY_SUBVOL, objectid);
-       if (ret)
-               btrfs_abort_transaction(trans, ret);
+       d_instantiate_new(dentry, new_inode_args.inode);
+       new_inode_args.inode = NULL;
  
- fail:
-       kfree(root_item);
+ out:
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
        btrfs_subvolume_release_metadata(root, &block_rsv);
                btrfs_end_transaction(trans);
        else
                ret = btrfs_commit_transaction(trans);
-       if (!ret) {
-               inode = btrfs_lookup_dentry(dir, dentry);
-               if (IS_ERR(inode))
-                       return PTR_ERR(inode);
-               d_instantiate(dentry, inode);
-       }
-       return ret;
- fail_free:
+ out_new_inode_args:
+       btrfs_new_inode_args_destroy(&new_inode_args);
+ out_inode:
+       iput(new_inode_args.inode);
+ out_anon_dev:
        if (anon_dev)
                free_anon_bdev(anon_dev);
+ out_root_item:
        kfree(root_item);
        return ret;
  }
@@@ -763,6 -771,7 +767,7 @@@ static int create_snapshot(struct btrfs
        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
+       unsigned int trans_num_items;
        struct btrfs_trans_handle *trans;
        int ret;
  
        btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                             BTRFS_BLOCK_RSV_TEMP);
        /*
-        * 1 - parent dir inode
-        * 2 - dir entries
-        * 1 - root item
-        * 2 - root ref/backref
-        * 1 - root of snapshot
-        * 1 - UUID item
+        * 1 to add dir item
+        * 1 to add dir index
+        * 1 to update parent inode item
         */
+       trans_num_items = create_subvol_num_items(inherit) + 3;
        ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
-                                       &pending_snapshot->block_rsv, 8,
-                                       false);
+                                              &pending_snapshot->block_rsv,
+                                              trans_num_items, false);
        if (ret)
                goto free_pending;
  
@@@ -979,7 -986,7 +982,7 @@@ static noinline int btrfs_mksubvol(cons
        if (snap_src)
                error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
        else
-               error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
+               error = create_subvol(mnt_userns, dir, dentry, inherit);
  
        if (!error)
                fsnotify_mkdir(dir, dentry);
@@@ -1413,8 -1420,19 +1416,19 @@@ static int defrag_collect_targets(struc
                if (!em)
                        break;
  
-               /* Skip hole/inline/preallocated extents */
-               if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+               /*
+                * If the file extent is an inlined one, we may still want to
+                * defrag it (fallthrough) if it will cause a regular extent.
+                * This is for users who want to convert inline extents to
+                * regular ones through max_inline= mount option.
+                */
+               if (em->block_start == EXTENT_MAP_INLINE &&
+                   em->len <= inode->root->fs_info->max_inline)
+                       goto next;
+               /* Skip hole/delalloc/preallocated extents */
+               if (em->block_start == EXTENT_MAP_HOLE ||
+                   em->block_start == EXTENT_MAP_DELALLOC ||
                    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        goto next;
  
                if (em->len >= get_extent_max_capacity(em))
                        goto next;
  
+               /*
+                * Normally there are no more extents after an inline one, thus
+                * @next_mergeable will normally be false and not defragged.
+                * So if an inline extent passed all above checks, just add it
+                * for defrag, and be converted to regular extents.
+                */
+               if (em->block_start == EXTENT_MAP_INLINE)
+                       goto add;
                next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
                                                extent_thresh, newer_than, locked);
                if (!next_mergeable) {
@@@ -2561,12 -2588,7 +2584,12 @@@ static noinline int search_ioctl(struc
  
        while (1) {
                ret = -EFAULT;
 -              if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
 +              /*
 +               * Ensure that the whole user buffer is faulted in at sub-page
 +               * granularity, otherwise the loop may live-lock.
 +               */
 +              if (fault_in_subpage_writeable(ubuf + sk_offset,
 +                                             *buf_size - sk_offset))
                        break;
  
                ret = btrfs_search_forward(root, &key, path, sk->min_transid);
@@@ -2594,7 -2616,7 +2617,7 @@@ err
  static noinline int btrfs_ioctl_tree_search(struct inode *inode,
                                            void __user *argp)
  {
-       struct btrfs_ioctl_search_args __user *uargs;
+       struct btrfs_ioctl_search_args __user *uargs = argp;
        struct btrfs_ioctl_search_key sk;
        int ret;
        size_t buf_size;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       uargs = (struct btrfs_ioctl_search_args __user *)argp;
        if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
                return -EFAULT;
  
  static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
                                               void __user *argp)
  {
-       struct btrfs_ioctl_search_args_v2 __user *uarg;
+       struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
        struct btrfs_ioctl_search_args_v2 args;
        int ret;
        size_t buf_size;
                return -EPERM;
  
        /* copy search header and buffer size */
-       uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
        if (copy_from_user(&args, uarg, sizeof(args)))
                return -EFAULT;
  
@@@ -4344,10 -4363,6 +4364,6 @@@ static long btrfs_ioctl_balance(struct 
        bool need_unlock; /* for mut. excl. ops lock */
        int ret;
  
-       if (!arg)
-               btrfs_warn(fs_info,
-       "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
        if (ret)
                return ret;
  
+       bargs = memdup_user(arg, sizeof(*bargs));
+       if (IS_ERR(bargs)) {
+               ret = PTR_ERR(bargs);
+               bargs = NULL;
+               goto out;
+       }
  again:
        if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
                mutex_lock(&fs_info->balance_mutex);
        }
  
  locked:
-       if (arg) {
-               bargs = memdup_user(arg, sizeof(*bargs));
-               if (IS_ERR(bargs)) {
-                       ret = PTR_ERR(bargs);
+       if (bargs->flags & BTRFS_BALANCE_RESUME) {
+               if (!fs_info->balance_ctl) {
+                       ret = -ENOTCONN;
                        goto out_unlock;
                }
  
-               if (bargs->flags & BTRFS_BALANCE_RESUME) {
-                       if (!fs_info->balance_ctl) {
-                               ret = -ENOTCONN;
-                               goto out_bargs;
-                       }
+               bctl = fs_info->balance_ctl;
+               spin_lock(&fs_info->balance_lock);
+               bctl->flags |= BTRFS_BALANCE_RESUME;
+               spin_unlock(&fs_info->balance_lock);
+               btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
  
-                       bctl = fs_info->balance_ctl;
-                       spin_lock(&fs_info->balance_lock);
-                       bctl->flags |= BTRFS_BALANCE_RESUME;
-                       spin_unlock(&fs_info->balance_lock);
-                       btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
+               goto do_balance;
+       }
  
-                       goto do_balance;
-               }
-       } else {
-               bargs = NULL;
+       if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+               ret = -EINVAL;
+               goto out_unlock;
        }
  
        if (fs_info->balance_ctl) {
                ret = -EINPROGRESS;
-               goto out_bargs;
+               goto out_unlock;
        }
  
        bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
        if (!bctl) {
                ret = -ENOMEM;
-               goto out_bargs;
-       }
-       if (arg) {
-               memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
-               memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
-               memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
-               bctl->flags = bargs->flags;
-       } else {
-               /* balance everything - no filters */
-               bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+               goto out_unlock;
        }
  
-       if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
-               ret = -EINVAL;
-               goto out_bctl;
-       }
+       memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+       memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+       memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
  
+       bctl->flags = bargs->flags;
  do_balance:
        /*
         * Ownership of bctl and exclusive operation goes to btrfs_balance.
        ret = btrfs_balance(fs_info, bctl, bargs);
        bctl = NULL;
  
-       if ((ret == 0 || ret == -ECANCELED) && arg) {
+       if (ret == 0 || ret == -ECANCELED) {
                if (copy_to_user(arg, bargs, sizeof(*bargs)))
                        ret = -EFAULT;
        }
  
- out_bctl:
        kfree(bctl);
- out_bargs:
-       kfree(bargs);
  out_unlock:
        mutex_unlock(&fs_info->balance_mutex);
        if (need_unlock)
                btrfs_exclop_finish(fs_info);
  out:
        mnt_drop_write_file(file);
+       kfree(bargs);
        return ret;
  }
  
diff --combined fs/btrfs/volumes.c
index b6b00338037c49b56d27af50ed7668763b89bbcf,58f3eece8a48c4fb67ffd354ff63015157201839..9c20049d1fecf3927b5ffaa908c67f769ece77c1
@@@ -164,24 -164,12 +164,12 @@@ const struct btrfs_raid_attr btrfs_raid
   */
  enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
  {
-       if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               return BTRFS_RAID_RAID10;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               return BTRFS_RAID_RAID1;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
-               return BTRFS_RAID_RAID1C3;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
-               return BTRFS_RAID_RAID1C4;
-       else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               return BTRFS_RAID_DUP;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               return BTRFS_RAID_RAID0;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-               return BTRFS_RAID_RAID5;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-               return BTRFS_RAID_RAID6;
-       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+       const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+       if (!profile)
+               return BTRFS_RAID_SINGLE;
+       return BTRFS_BG_FLAG_TO_INDEX(profile);
  }
  
  const char *btrfs_bg_type_to_raid_name(u64 flags)
@@@ -405,6 -393,7 +393,6 @@@ void btrfs_free_device(struct btrfs_dev
        WARN_ON(!list_empty(&device->post_commit_list));
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
 -      bio_put(device->flush_bio);
        btrfs_destroy_dev_zone_info(device);
        kfree(device);
  }
@@@ -642,7 -631,7 +630,7 @@@ static int btrfs_open_one_device(struc
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }
  
 -      if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 +      if (!bdev_nonrot(bdev))
                fs_devices->rotating = true;
  
        device->bdev = bdev;
@@@ -2705,7 -2694,7 +2693,7 @@@ int btrfs_init_new_device(struct btrfs_
  
        atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
  
 -      if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 +      if (!bdev_nonrot(bdev))
                fs_devices->rotating = true;
  
        orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@@ -4062,13 -4051,6 +4050,6 @@@ static inline int validate_convert_prof
        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
                return true;
  
-       if (fs_info->sectorsize < PAGE_SIZE &&
-               bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-               btrfs_err(fs_info,
-               "RAID56 is not yet supported for sectorsize %u with page size %lu",
-                         fs_info->sectorsize, PAGE_SIZE);
-               return false;
-       }
        /* Profile is valid and does not have bits outside of the allowed set */
        if (alloc_profile_is_valid(bargs->target, 1) &&
            (bargs->target & ~allowed) == 0)
@@@ -6312,7 -6294,7 +6293,7 @@@ int btrfs_get_io_geometry(struct btrfs_
        u64 offset;
        u64 stripe_offset;
        u64 stripe_nr;
-       u64 stripe_len;
+       u32 stripe_len;
        u64 raid56_full_stripe_start = (u64)-1;
        int data_stripes;
  
        offset = logical - em->start;
        /* Len of a stripe in a chunk */
        stripe_len = map->stripe_len;
-       /* Stripe where this block falls in */
-       stripe_nr = div64_u64(offset, stripe_len);
-       /* Offset of stripe in the chunk */
-       stripe_offset = stripe_nr * stripe_len;
-       if (offset < stripe_offset) {
-               btrfs_crit(fs_info,
- "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
-                       stripe_offset, offset, em->start, logical, stripe_len);
-               return -EINVAL;
-       }
+       /*
+        * Stripe_nr is where this block falls in
+        * stripe_offset is the offset of this block in its stripe.
+        */
+       stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
+       ASSERT(stripe_offset < U32_MAX);
  
-       /* stripe_offset is the offset of this block in its stripe */
-       stripe_offset = offset - stripe_offset;
        data_stripes = nr_data_stripes(map);
  
        /* Only stripe based profiles needs to check against stripe length. */
@@@ -6737,11 -6713,11 +6712,11 @@@ static void submit_stripe_bio(struct bt
                bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
                dev->devid, bio->bi_iter.bi_size);
-       bio_set_dev(bio, dev->bdev);
  
        btrfs_bio_counter_inc_noblocked(fs_info);
  
-       btrfsic_submit_bio(bio);
+       btrfsic_check_bio(bio);
+       submit_bio(bio);
  }
  
  static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
@@@ -6823,10 -6799,12 +6798,12 @@@ blk_status_t btrfs_map_bio(struct btrfs
                        continue;
                }
  
-               if (dev_nr < total_devs - 1)
-                       bio = btrfs_bio_clone(first_bio);
-               else
+               if (dev_nr < total_devs - 1) {
+                       bio = btrfs_bio_clone(dev->bdev, first_bio);
+               } else {
                        bio = first_bio;
+                       bio_set_dev(bio, dev->bdev);
+               }
  
                submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
        }
@@@ -6948,6 -6926,16 +6925,6 @@@ struct btrfs_device *btrfs_alloc_device
        if (!dev)
                return ERR_PTR(-ENOMEM);
  
 -      /*
 -       * Preallocate a bio that's always going to be used for flushing device
 -       * barriers and matches the device lifespan
 -       */
 -      dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
 -      if (!dev->flush_bio) {
 -              kfree(dev);
 -              return ERR_PTR(-ENOMEM);
 -      }
 -
        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->post_commit_list);
@@@ -7359,7 -7347,6 +7336,6 @@@ static int read_one_dev(struct extent_b
  
  int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_key key;
  
        ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
        /*
-        * This will create extent buffer of nodesize, superblock size is
-        * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
-        * overallocate but we can keep it as-is, only the first page is used.
+        * We allocated a dummy extent, just to use extent buffer accessors.
+        * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
+        * that's fine, we will not go beyond system chunk array anyway.
         */
-       sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
-                                         root->root_key.objectid, 0);
-       if (IS_ERR(sb))
-               return PTR_ERR(sb);
+       sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
+       if (!sb)
+               return -ENOMEM;
        set_extent_buffer_uptodate(sb);
-       /*
-        * The sb extent buffer is artificial and just used to read the system array.
-        * set_extent_buffer_uptodate() call does not properly mark all it's
-        * pages up-to-date when the page is larger: extent does not cover the
-        * whole page and consequently check_page_uptodate does not find all
-        * the page's extents up-to-date (the hole beyond sb),
-        * write_extent_buffer then triggers a WARN_ON.
-        *
-        * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
-        * but sb spans only this function. Add an explicit SetPageUptodate call
-        * to silence the warning eg. on PowerPC 64.
-        */
-       if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
-               SetPageUptodate(sb->pages[0]);
  
        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);
@@@ -7561,6 -7534,7 +7523,7 @@@ int btrfs_read_chunk_tree(struct btrfs_
        struct btrfs_key found_key;
        int ret;
        int slot;
+       int iter_ret = 0;
        u64 total_dev = 0;
        u64 last_ra_node = 0;
  
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = 0;
-       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-       if (ret < 0)
-               goto error;
-       while (1) {
-               struct extent_buffer *node;
+       btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+               struct extent_buffer *node = path->nodes[1];
  
                leaf = path->nodes[0];
                slot = path->slots[0];
-               if (slot >= btrfs_header_nritems(leaf)) {
-                       ret = btrfs_next_leaf(root, path);
-                       if (ret == 0)
-                               continue;
-                       if (ret < 0)
-                               goto error;
-                       break;
-               }
-               node = path->nodes[1];
                if (node) {
                        if (last_ra_node != node->start) {
                                readahead_tree_node_children(node);
                                last_ra_node = node->start;
                        }
                }
-               btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
                        struct btrfs_dev_item *dev_item;
                        dev_item = btrfs_item_ptr(leaf, slot,
                        if (ret)
                                goto error;
                }
-               path->slots[0]++;
+       }
+       /* Catch error found during iteration */
+       if (iter_ret < 0) {
+               ret = iter_ret;
+               goto error;
        }
  
        /*
         * do another round of validation checks.
         */
        if (total_dev != fs_info->fs_devices->total_devices) {
-               btrfs_err(fs_info,
         "super_num_devices %llu mismatch with num_devices %llu found here",
+               btrfs_warn(fs_info,
"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
                          btrfs_super_num_devices(fs_info->super_copy),
                          total_dev);
-               ret = -EINVAL;
-               goto error;
+               fs_info->fs_devices->total_devices = total_dev;
+               btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
        }
        if (btrfs_super_total_bytes(fs_info->super_copy) <
            fs_info->fs_devices->total_rw_bytes) {
@@@ -8277,7 -8243,7 +8232,7 @@@ bool btrfs_pinned_by_swapfile(struct bt
  
  static int relocating_repair_kthread(void *data)
  {
-       struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+       struct btrfs_block_group *cache = data;
        struct btrfs_fs_info *fs_info = cache->fs_info;
        u64 target;
        int ret = 0;
diff --combined fs/btrfs/volumes.h
index b11c563d2025e52d1e1ca2ddffde4df55af08f98,12b2af9260e92a5af40f5b7255107f7b795d41a2..6721002000ee0fc0cb69ec727c19189783cea40f
@@@ -17,17 -17,51 +17,51 @@@ extern struct mutex uuid_mutex
  
  #define BTRFS_STRIPE_LEN      SZ_64K
  
+ /* Used by sanity check for btrfs_raid_types. */
+ #define const_ffs(n) (__builtin_ctzll(n) + 1)
+ /*
+  * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
+  * RAID0 always to be the lowest profile bit.
+  * Although it's part of on-disk format and should never change, do extra
+  * compile-time sanity checks.
+  */
+ static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
+             const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
+ static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
+             ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+ /* ilog2() can handle both constants and variables */
+ #define BTRFS_BG_FLAG_TO_INDEX(profile)                                       \
+       ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))
+ enum btrfs_raid_types {
+       /* SINGLE is the special one as it doesn't have on-disk bit. */
+       BTRFS_RAID_SINGLE  = 0,
+       BTRFS_RAID_RAID0   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
+       BTRFS_RAID_RAID1   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
+       BTRFS_RAID_DUP     = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
+       BTRFS_RAID_RAID10  = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
+       BTRFS_RAID_RAID5   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
+       BTRFS_RAID_RAID6   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
+       BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
+       BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),
+       BTRFS_NR_RAID_TYPES
+ };
  struct btrfs_io_geometry {
        /* remaining bytes before crossing a stripe */
        u64 len;
        /* offset of logical address in chunk */
        u64 offset;
        /* length of single IO stripe */
-       u64 stripe_len;
+       u32 stripe_len;
+       /* offset of address in stripe */
+       u32 stripe_offset;
        /* number of stripe where address falls */
        u64 stripe_nr;
-       /* offset of address in stripe */
-       u64 stripe_offset;
        /* offset of raid56 stripe into the chunk */
        u64 raid56_stripe_offset;
  };
@@@ -121,8 -155,8 +155,8 @@@ struct btrfs_device 
        /* bytes used on the current transaction */
        u64 commit_bytes_used;
  
 -      /* for sending down flush barriers */
 -      struct bio *flush_bio;
 +      /* Bio used for flushing device barriers */
 +      struct bio flush_bio;
        struct completion flush_wait;
  
        /* per-device scrub information */
@@@ -430,7 -464,7 +464,7 @@@ struct map_lookup 
        u64 type;
        int io_align;
        int io_width;
-       u64 stripe_len;
+       u32 stripe_len;
        int num_stripes;
        int sub_stripes;
        int verified_stripes; /* For mount time dev extent verification */
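
The new BTRFS_BG_FLAG_TO_INDEX() above converts a block group profile bit straight into an enum btrfs_raid_types value by shifting it down and taking its bit position, which is what lets btrfs_bg_flags_to_raid_index() in volumes.c drop its if/else chain. A rough userspace sketch of that mapping follows; the BTRFS_BLOCK_GROUP_* bit positions are reproduced here only for the example (RAID0 at bit 3 and the later profiles on the following bits, matching the on-disk format headers), and ilog2() is emulated with a compiler builtin:

#include <stdint.h>
#include <stdio.h>

/* Block group profile bits, quoted from the btrfs on-disk format for illustration. */
#define BG_RAID0	(1ULL << 3)
#define BG_RAID1	(1ULL << 4)
#define BG_DUP		(1ULL << 5)
#define BG_RAID10	(1ULL << 6)
#define BG_RAID5	(1ULL << 7)
#define BG_RAID6	(1ULL << 8)
#define BG_RAID1C3	(1ULL << 9)
#define BG_RAID1C4	(1ULL << 10)

/* ilog2() for a value with a single bit set: the position of that bit. */
static int ilog2_u64(uint64_t v)
{
	return 63 - __builtin_clzll(v);
}

/* Same shape as BTRFS_BG_FLAG_TO_INDEX(); SINGLE has no profile bit and maps to 0. */
static int bg_flag_to_index(uint64_t profile)
{
	if (!profile)
		return 0;	/* BTRFS_RAID_SINGLE */
	return ilog2_u64(profile >> (ilog2_u64(BG_RAID0) - 1));
}

int main(void)
{
	printf("RAID0   -> %d\n", bg_flag_to_index(BG_RAID0));		/* 1 */
	printf("RAID1   -> %d\n", bg_flag_to_index(BG_RAID1));		/* 2 */
	printf("RAID1C4 -> %d\n", bg_flag_to_index(BG_RAID1C4));	/* 8 */
	printf("SINGLE  -> %d\n", bg_flag_to_index(0));			/* 0 */
	return 0;
}
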
diff --combined fs/btrfs/zoned.c
index 29b54fd9c128dffdb1bff294d8e41e102df466c4,057babaa3e05c10812edf9b1a516cda239ffbdbb..11237a913beed1ee8b3fb0b877b587d1d13c4b0c
  #define BTRFS_MIN_ACTIVE_ZONES                (BTRFS_SUPER_MIRROR_MAX + 5)
  
  /*
-  * Maximum supported zone size. Currently, SMR disks have a zone size of
-  * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
-  * expect the zone size to become larger than 8GiB in the near future.
+  * Minimum / maximum supported zone size. Currently, SMR disks have a zone
+  * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
+  * We do not expect the zone size to become larger than 8GiB or smaller than
+  * 4MiB in the near future.
   */
  #define BTRFS_MAX_ZONE_SIZE           SZ_8G
+ #define BTRFS_MIN_ZONE_SIZE           SZ_4M
  
  #define SUPER_INFO_SECTORS    ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
  
@@@ -350,6 -352,7 +352,6 @@@ int btrfs_get_dev_zone_info(struct btrf
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_zoned_device_info *zone_info = NULL;
        struct block_device *bdev = device->bdev;
 -      struct request_queue *queue = bdev_get_queue(bdev);
        unsigned int max_active_zones;
        unsigned int nactive;
        sector_t nr_sectors;
                                 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
                ret = -EINVAL;
                goto out;
+       } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
+               btrfs_err_in_rcu(fs_info,
+               "zoned: %s: zone size %llu smaller than supported minimum %u",
+                                rcu_str_deref(device->name),
+                                zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
+               ret = -EINVAL;
+               goto out;
        }
  
        nr_sectors = bdev_nr_sectors(bdev);
        if (!IS_ALIGNED(nr_sectors, zone_sectors))
                zone_info->nr_zones++;
  
 -      max_active_zones = queue_max_active_zones(queue);
 +      max_active_zones = bdev_max_active_zones(bdev);
        if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
                btrfs_err_in_rcu(fs_info,
  "zoned: %s: max active zones %u is too small, need at least %u active zones",
@@@ -1835,7 -1845,7 +1844,7 @@@ bool btrfs_zone_activate(struct btrfs_b
        }
  
        /* No space left */
-       if (block_group->alloc_offset == block_group->zone_capacity) {
+       if (btrfs_zoned_bg_is_full(block_group)) {
                ret = false;
                goto out_unlock;
        }
@@@ -1872,20 -1882,14 +1881,14 @@@ out_unlock
        return ret;
  }
  
- int btrfs_zone_finish(struct btrfs_block_group *block_group)
+ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
  {
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct map_lookup *map;
-       struct btrfs_device *device;
-       u64 physical;
+       bool need_zone_finish;
        int ret = 0;
        int i;
  
-       if (!btrfs_is_zoned(fs_info))
-               return 0;
-       map = block_group->physical_map;
        spin_lock(&block_group->lock);
        if (!block_group->zone_is_active) {
                spin_unlock(&block_group->lock);
        /* Check if we have unwritten allocated space */
        if ((block_group->flags &
             (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
-           block_group->alloc_offset > block_group->meta_write_pointer) {
+           block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
                spin_unlock(&block_group->lock);
                return -EAGAIN;
        }
-       spin_unlock(&block_group->lock);
-       ret = btrfs_inc_block_group_ro(block_group, false);
-       if (ret)
-               return ret;
-       /* Ensure all writes in this block group finish */
-       btrfs_wait_block_group_reservations(block_group);
-       /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
-       btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
-                                block_group->length);
-       spin_lock(&block_group->lock);
  
        /*
-        * Bail out if someone already deactivated the block group, or
-        * allocated space is left in the block group.
+        * If we are sure that the block group is full (= no more room left for
+        * new allocation) and the IO for the last usable block is completed, we
+        * don't need to wait for the other IOs. This holds because we ensure
+        * the sequential IO submissions using the ZONE_APPEND command for data
+        * and block_group->meta_write_pointer for metadata.
         */
-       if (!block_group->zone_is_active) {
+       if (!fully_written) {
                spin_unlock(&block_group->lock);
-               btrfs_dec_block_group_ro(block_group);
-               return 0;
-       }
  
-       if (block_group->reserved) {
-               spin_unlock(&block_group->lock);
-               btrfs_dec_block_group_ro(block_group);
-               return -EAGAIN;
+               ret = btrfs_inc_block_group_ro(block_group, false);
+               if (ret)
+                       return ret;
+               /* Ensure all writes in this block group finish */
+               btrfs_wait_block_group_reservations(block_group);
+               /* No need to wait for NOCOW writers. Zoned mode does not allow that */
+               btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+                                        block_group->length);
+               spin_lock(&block_group->lock);
+               /*
+                * Bail out if someone already deactivated the block group, or
+                * allocated space is left in the block group.
+                */
+               if (!block_group->zone_is_active) {
+                       spin_unlock(&block_group->lock);
+                       btrfs_dec_block_group_ro(block_group);
+                       return 0;
+               }
+               if (block_group->reserved) {
+                       spin_unlock(&block_group->lock);
+                       btrfs_dec_block_group_ro(block_group);
+                       return -EAGAIN;
+               }
        }
  
+       /*
+        * The block group is not fully allocated, so not fully written yet. We
+        * need to send ZONE_FINISH command to free up an active zone.
+        */
+       need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
        block_group->zone_is_active = 0;
        block_group->alloc_offset = block_group->zone_capacity;
        block_group->free_space_ctl->free_space = 0;
        btrfs_clear_data_reloc_bg(block_group);
        spin_unlock(&block_group->lock);
  
+       map = block_group->physical_map;
        for (i = 0; i < map->num_stripes; i++) {
-               device = map->stripes[i].dev;
-               physical = map->stripes[i].physical;
+               struct btrfs_device *device = map->stripes[i].dev;
+               const u64 physical = map->stripes[i].physical;
  
                if (device->zone_info->max_active_zones == 0)
                        continue;
  
-               ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
-                                      physical >> SECTOR_SHIFT,
-                                      device->zone_info->zone_size >> SECTOR_SHIFT,
-                                      GFP_NOFS);
+               if (need_zone_finish) {
+                       ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+                                              physical >> SECTOR_SHIFT,
+                                              device->zone_info->zone_size >> SECTOR_SHIFT,
+                                              GFP_NOFS);
  
-               if (ret)
-                       return ret;
+                       if (ret)
+                               return ret;
+               }
  
                btrfs_dev_clear_active_zone(device, physical);
        }
-       btrfs_dec_block_group_ro(block_group);
+       if (!fully_written)
+               btrfs_dec_block_group_ro(block_group);
  
        spin_lock(&fs_info->zone_active_bgs_lock);
        ASSERT(!list_empty(&block_group->active_bg_list));
        return 0;
  }
  
+ int btrfs_zone_finish(struct btrfs_block_group *block_group)
+ {
+       if (!btrfs_is_zoned(block_group->fs_info))
+               return 0;
+       return do_zone_finish(block_group, false);
+ }
  bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
  {
        struct btrfs_fs_info *fs_info = fs_devices->fs_info;
  void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
  {
        struct btrfs_block_group *block_group;
-       struct map_lookup *map;
-       struct btrfs_device *device;
-       u64 physical;
+       u64 min_alloc_bytes;
  
        if (!btrfs_is_zoned(fs_info))
                return;
        block_group = btrfs_lookup_block_group(fs_info, logical);
        ASSERT(block_group);
  
-       if (logical + length < block_group->start + block_group->zone_capacity)
-               goto out;
-       spin_lock(&block_group->lock);
+       /* No MIXED_BG on zoned btrfs. */
+       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+               min_alloc_bytes = fs_info->sectorsize;
+       else
+               min_alloc_bytes = fs_info->nodesize;
  
-       if (!block_group->zone_is_active) {
-               spin_unlock(&block_group->lock);
+       /* Bail out if we can allocate more data from this block group. */
+       if (logical + length + min_alloc_bytes <=
+           block_group->start + block_group->zone_capacity)
                goto out;
-       }
  
-       block_group->zone_is_active = 0;
-       /* We should have consumed all the free space */
-       ASSERT(block_group->alloc_offset == block_group->zone_capacity);
-       ASSERT(block_group->free_space_ctl->free_space == 0);
-       btrfs_clear_treelog_bg(block_group);
-       btrfs_clear_data_reloc_bg(block_group);
-       spin_unlock(&block_group->lock);
+       do_zone_finish(block_group, true);
  
-       map = block_group->physical_map;
-       device = map->stripes[0].dev;
-       physical = map->stripes[0].physical;
+ out:
+       btrfs_put_block_group(block_group);
+ }
  
-       if (!device->zone_info->max_active_zones)
-               goto out;
+ static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+ {
+       struct btrfs_block_group *bg =
+               container_of(work, struct btrfs_block_group, zone_finish_work);
  
-       btrfs_dev_clear_active_zone(device, physical);
+       wait_on_extent_buffer_writeback(bg->last_eb);
+       free_extent_buffer(bg->last_eb);
+       btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+       btrfs_put_block_group(bg);
+ }
  
-       spin_lock(&fs_info->zone_active_bgs_lock);
-       ASSERT(!list_empty(&block_group->active_bg_list));
-       list_del_init(&block_group->active_bg_list);
-       spin_unlock(&fs_info->zone_active_bgs_lock);
+ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                  struct extent_buffer *eb)
+ {
+       if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+               return;
  
-       btrfs_put_block_group(block_group);
+       if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
+               btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
+                         bg->start);
+               return;
+       }
  
- out:
-       btrfs_put_block_group(block_group);
+       /* For the work */
+       btrfs_get_block_group(bg);
+       atomic_inc(&eb->refs);
+       bg->last_eb = eb;
+       INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
+       queue_work(system_unbound_wq, &bg->zone_finish_work);
  }
  
  void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@@ -2072,3 -2113,30 +2112,30 @@@ void btrfs_free_zone_cache(struct btrfs
        }
        mutex_unlock(&fs_devices->device_list_mutex);
  }
+ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+ {
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
+       u64 used = 0;
+       u64 total = 0;
+       u64 factor;
+       ASSERT(btrfs_is_zoned(fs_info));
+       if (fs_info->bg_reclaim_threshold == 0)
+               return false;
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+               if (!device->bdev)
+                       continue;
+               total += device->disk_total_bytes;
+               used += device->bytes_used;
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
+       factor = div64_u64(used * 100, total);
+       return factor >= fs_info->bg_reclaim_threshold;
+ }
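
btrfs_zoned_should_reclaim() above sums disk_total_bytes and bytes_used across all devices and asks for block group reclaim once the used percentage reaches fs_info->bg_reclaim_threshold (a value of 0 disables it). A minimal userspace sketch of just that decision, with invented device sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_device {
	uint64_t disk_total_bytes;
	uint64_t bytes_used;
};

/* Mirror of the threshold check: reclaim when used% >= threshold, 0 disables it. */
static bool should_reclaim(const struct fake_device *devs, int nr_devs,
			   unsigned int bg_reclaim_threshold)
{
	uint64_t used = 0;
	uint64_t total = 0;

	if (bg_reclaim_threshold == 0)
		return false;

	for (int i = 0; i < nr_devs; i++) {
		total += devs[i].disk_total_bytes;
		used += devs[i].bytes_used;
	}

	return used * 100 / total >= bg_reclaim_threshold;
}

int main(void)
{
	struct fake_device devs[2] = {
		{ .disk_total_bytes = 100ULL << 30, .bytes_used = 80ULL << 30 },
		{ .disk_total_bytes = 100ULL << 30, .bytes_used = 75ULL << 30 },
	};

	/* 155 GiB of 200 GiB used is 77%, so a 75% threshold triggers reclaim. */
	printf("reclaim at 75%%: %d\n", should_reclaim(devs, 2, 75));	/* 1 */
	printf("reclaim at 90%%: %d\n", should_reclaim(devs, 2, 90));	/* 0 */
	return 0;
}
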
diff --combined fs/erofs/data.c
index bb9c1fd48c1936cc6ddabc26b33cd99bc11d7e8a,91c11d5bb9990d83d1b5f5022292feae8d7e7ce0..252f4ee977d56965ce463f917e9a36456bccc023
@@@ -6,7 -6,6 +6,7 @@@
   */
  #include "internal.h"
  #include <linux/prefetch.h>
 +#include <linux/sched/mm.h>
  #include <linux/dax.h>
  #include <trace/events/erofs.h>
  
@@@ -36,20 -35,14 +36,20 @@@ void *erofs_bread(struct erofs_buf *buf
        erofs_off_t offset = blknr_to_addr(blkaddr);
        pgoff_t index = offset >> PAGE_SHIFT;
        struct page *page = buf->page;
 +      struct folio *folio;
 +      unsigned int nofs_flag;
  
        if (!page || page->index != index) {
                erofs_put_metabuf(buf);
 -              page = read_cache_page_gfp(mapping, index,
 -                              mapping_gfp_constraint(mapping, ~__GFP_FS));
 -              if (IS_ERR(page))
 -                      return page;
 +
 +              nofs_flag = memalloc_nofs_save();
 +              folio = read_cache_folio(mapping, index, NULL, NULL);
 +              memalloc_nofs_restore(nofs_flag);
 +              if (IS_ERR(folio))
 +                      return folio;
 +
                /* should already be PageUptodate, no need to lock page */
 +              page = folio_file_page(folio, index);
                buf->page = page;
        }
        if (buf->kmap_type == EROFS_NO_KMAP) {
  void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
                         erofs_blk_t blkaddr, enum erofs_kmap_type type)
  {
 +      if (erofs_is_fscache_mode(sb))
 +              return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode,
 +                                 blkaddr, type);
 +
        return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
  }
  
@@@ -121,8 -110,8 +121,8 @@@ static int erofs_map_blocks_flatmode(st
        return 0;
  }
  
 -static int erofs_map_blocks(struct inode *inode,
 -                          struct erofs_map_blocks *map, int flags)
 +int erofs_map_blocks(struct inode *inode,
 +                   struct erofs_map_blocks *map, int flags)
  {
        struct super_block *sb = inode->i_sb;
        struct erofs_inode *vi = EROFS_I(inode);
@@@ -210,7 -199,6 +210,7 @@@ int erofs_map_dev(struct super_block *s
        map->m_bdev = sb->s_bdev;
        map->m_daxdev = EROFS_SB(sb)->dax_dev;
        map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
 +      map->m_fscache = EROFS_SB(sb)->s_fscache;
  
        if (map->m_deviceid) {
                down_read(&devs->rwsem);
                map->m_bdev = dif->bdev;
                map->m_daxdev = dif->dax_dev;
                map->m_dax_part_off = dif->dax_part_off;
 +              map->m_fscache = dif->fscache;
                up_read(&devs->rwsem);
        } else if (devs->extra_devices) {
                down_read(&devs->rwsem);
                                map->m_bdev = dif->bdev;
                                map->m_daxdev = dif->dax_dev;
                                map->m_dax_part_off = dif->dax_part_off;
 +                              map->m_fscache = dif->fscache;
                                break;
                        }
                }
@@@ -399,7 -385,7 +399,7 @@@ static ssize_t erofs_file_read_iter(str
  
                if (!err)
                        return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
-                                           NULL, 0, 0);
+                                           NULL, 0, NULL, 0);
                if (err < 0)
                        return err;
        }
diff --combined fs/f2fs/file.c
index 35b6c720c2bc155211ea024ee1c05a4175350e1d,04bc8709314bf22ecbdfc54f0529ac9a60873262..100637b1adb3646c8f69e1b8ba1214898e225c2d
@@@ -2285,6 -2285,7 +2285,6 @@@ static int f2fs_ioc_fitrim(struct file 
  {
        struct inode *inode = file_inode(filp);
        struct super_block *sb = inode->i_sb;
 -      struct request_queue *q = bdev_get_queue(sb->s_bdev);
        struct fstrim_range range;
        int ret;
  
                return ret;
  
        range.minlen = max((unsigned int)range.minlen,
 -                              q->limits.discard_granularity);
 +                         bdev_discard_granularity(sb->s_bdev));
        ret = f2fs_trim_fs(F2FS_SB(sb), &range);
        mnt_drop_write_file(filp);
        if (ret < 0)
  static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode,
                pgoff_t off, block_t block, block_t len, u32 flags)
  {
 -      struct request_queue *q = bdev_get_queue(bdev);
        sector_t sector = SECTOR_FROM_BLOCK(block);
        sector_t nr_sects = SECTOR_FROM_BLOCK(len);
        int ret = 0;
  
 -      if (!q)
 -              return -ENXIO;
 -
 -      if (flags & F2FS_TRIM_FILE_DISCARD)
 -              ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
 -                                              blk_queue_secure_erase(q) ?
 -                                              BLKDEV_DISCARD_SECURE : 0);
 +      if (flags & F2FS_TRIM_FILE_DISCARD) {
 +              if (bdev_max_secure_erase_sectors(bdev))
 +                      ret = blkdev_issue_secure_erase(bdev, sector, nr_sects,
 +                                      GFP_NOFS);
 +              else
 +                      ret = blkdev_issue_discard(bdev, sector, nr_sects,
 +                                      GFP_NOFS);
 +      }
  
        if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) {
                if (IS_ENCRYPTED(inode))
@@@ -4308,7 -4309,7 +4308,7 @@@ static ssize_t f2fs_dio_read_iter(struc
         */
        inc_page_count(sbi, F2FS_DIO_READ);
        dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops,
-                            &f2fs_iomap_dio_read_ops, 0, 0);
+                            &f2fs_iomap_dio_read_ops, 0, NULL, 0);
        if (IS_ERR_OR_NULL(dio)) {
                ret = PTR_ERR_OR_ZERO(dio);
                if (ret != -EIOCBQUEUED)
@@@ -4526,7 -4527,7 +4526,7 @@@ static ssize_t f2fs_dio_write_iter(stru
        if (pos + count > inode->i_size)
                dio_flags |= IOMAP_DIO_FORCE_WAIT;
        dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
-                            &f2fs_iomap_dio_write_ops, dio_flags, 0);
+                            &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0);
        if (IS_ERR_OR_NULL(dio)) {
                ret = PTR_ERR_OR_ZERO(dio);
                if (ret == -ENOTBLK)
diff --combined fs/iomap/direct-io.c
index 80f9b047aa1b6298523daf76638ed67ae823bc24,cf224a8bb31150b9b5a1e37d51f4bcbc4f3859a0..370c3241618a091bb5800a75dc638ef024d54a17
@@@ -51,13 -51,21 +51,22 @@@ struct iomap_dio 
        };
  };
  
+ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
+               struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf)
+ {
+       if (dio->dops && dio->dops->bio_set)
+               return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
+                                       GFP_KERNEL, dio->dops->bio_set);
+       return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
+ }
  static void iomap_dio_submit_bio(const struct iomap_iter *iter,
                struct iomap_dio *dio, struct bio *bio, loff_t pos)
  {
        atomic_inc(&dio->ref);
  
 -      if (dio->iocb->ki_flags & IOCB_HIPRI) {
 +      /* Sync dio can't be polled reliably */
 +      if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
                bio_set_polled(bio, dio->iocb);
                dio->submit.poll_bio = bio;
        }
@@@ -145,7 -153,7 +154,7 @@@ static inline void iomap_dio_set_error(
        cmpxchg(&dio->error, 0, ret);
  }
  
- static void iomap_dio_bio_end_io(struct bio *bio)
+ void iomap_dio_bio_end_io(struct bio *bio)
  {
        struct iomap_dio *dio = bio->bi_private;
        bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
                bio_put(bio);
        }
  }
+ EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
  
  static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
                loff_t pos, unsigned len)
  {
        struct inode *inode = file_inode(dio->iocb->ki_filp);
        struct page *page = ZERO_PAGE(0);
-       int flags = REQ_SYNC | REQ_IDLE;
        struct bio *bio;
  
-       bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+       bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
        fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
                                  GFP_KERNEL);
        bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
@@@ -266,7 -274,8 +275,7 @@@ static loff_t iomap_dio_bio_iter(const 
                 * cache flushes on IO completion.
                 */
                if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
 -                  (dio->flags & IOMAP_DIO_WRITE_FUA) &&
 -                  blk_queue_fua(bdev_get_queue(iomap->bdev)))
 +                  (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
                        use_fua = true;
        }
  
                        goto out;
                }
  
-               bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+               bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
                fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
                                          GFP_KERNEL);
                bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
@@@ -474,7 -483,7 +483,7 @@@ static loff_t iomap_dio_iter(const stru
  struct iomap_dio *
  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-               unsigned int dio_flags, size_t done_before)
+               unsigned int dio_flags, void *private, size_t done_before)
  {
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = file_inode(iocb->ki_filp);
                .pos            = iocb->ki_pos,
                .len            = iov_iter_count(iter),
                .flags          = IOMAP_DIRECT,
+               .private        = private,
        };
        loff_t end = iomi.pos + iomi.len - 1, ret = 0;
        bool wait_for_completion =
                        if (!READ_ONCE(dio->submit.waiter))
                                break;
  
 -                      if (!dio->submit.poll_bio ||
 -                          !bio_poll(dio->submit.poll_bio, NULL, 0))
 -                              blk_io_schedule();
 +                      blk_io_schedule();
                }
                __set_current_state(TASK_RUNNING);
        }
@@@ -672,11 -684,12 +682,12 @@@ EXPORT_SYMBOL_GPL(__iomap_dio_rw)
  ssize_t
  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-               unsigned int dio_flags, size_t done_before)
+               unsigned int dio_flags, void *private, size_t done_before)
  {
        struct iomap_dio *dio;
  
-       dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
+       dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
+                            done_before);
        if (IS_ERR_OR_NULL(dio))
                return PTR_ERR_OR_ZERO(dio);
        return iomap_dio_complete(dio);
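
The iomap changes above let a filesystem pass its own bio_set through struct iomap_dio_ops; iomap_dio_alloc_bio() then allocates from that set and falls back to the generic bio_alloc() only when none was supplied. The toy userspace program below illustrates the same "prefer the caller's preallocated pool, otherwise use the default allocator" pattern; every name in it is invented for the example and none of it is kernel API:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for a bio and a bio_set-like preallocated pool. */
struct toy_bio { int from_pool; };

struct toy_pool {
	struct toy_bio slots[4];
	int next;
};

struct dio_ops {
	struct toy_pool *pool;	/* optional: when non-NULL, allocate from here */
};

static struct toy_bio *alloc_bio(const struct dio_ops *ops)
{
	if (ops && ops->pool && ops->pool->next < 4) {
		struct toy_bio *bio = &ops->pool->slots[ops->pool->next++];

		bio->from_pool = 1;
		return bio;
	}

	/* Fallback: plain heap allocation, like the generic allocation path. */
	return calloc(1, sizeof(struct toy_bio));
}

int main(void)
{
	struct toy_pool pool = { .next = 0 };
	struct dio_ops with_pool = { .pool = &pool };
	struct dio_ops without_pool = { .pool = NULL };

	struct toy_bio *a = alloc_bio(&with_pool);
	struct toy_bio *b = alloc_bio(&without_pool);

	printf("a from pool: %d\n", a->from_pool);	/* 1 */
	printf("b from pool: %d\n", b->from_pool);	/* 0 */
	free(b);
	return 0;
}
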
diff --combined fs/zonefs/super.c
index 652752df1a2f478ffbb733d042b67f82d96bb6aa,777fe626c2b38efe7d698bab01adc4c18e3b0fa7..8f306485c9538516ef20189812aba58b37b94b18
  #define CREATE_TRACE_POINTS
  #include "trace.h"
  
 +/*
 + * Manage the active zone count. Called with zi->i_truncate_mutex held.
 + */
 +static void zonefs_account_active(struct inode *inode)
 +{
 +      struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 +      struct zonefs_inode_info *zi = ZONEFS_I(inode);
 +
 +      lockdep_assert_held(&zi->i_truncate_mutex);
 +
 +      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
 +              return;
 +
 +      /*
 +       * If the zone is active, that is, if it is explicitly open or
 +       * partially written, check if it was already accounted as active.
 +       */
 +      if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
 +          (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
 +              if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
 +                      zi->i_flags |= ZONEFS_ZONE_ACTIVE;
 +                      atomic_inc(&sbi->s_active_seq_files);
 +              }
 +              return;
 +      }
 +
 +      /* The zone is not active. If it was, update the active count */
 +      if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
 +              zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
 +              atomic_dec(&sbi->s_active_seq_files);
 +      }
 +}
 +
  static inline int zonefs_zone_mgmt(struct inode *inode,
                                   enum req_opf op)
  {
@@@ -101,13 -68,8 +101,13 @@@ static inline void zonefs_i_size_write(
         * A full zone is no longer open/active and does not need
         * explicit closing.
         */
 -      if (isize >= zi->i_max_size)
 -              zi->i_flags &= ~ZONEFS_ZONE_OPEN;
 +      if (isize >= zi->i_max_size) {
 +              struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 +
 +              if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
 +                      atomic_dec(&sbi->s_active_seq_files);
 +              zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
 +      }
  }
  
  static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@@ -435,7 -397,6 +435,7 @@@ static int zonefs_io_error_cb(struct bl
        zonefs_update_stats(inode, data_size);
        zonefs_i_size_write(inode, data_size);
        zi->i_wpoffset = data_size;
 +      zonefs_account_active(inode);
  
        return 0;
  }
@@@ -547,7 -508,6 +547,7 @@@ static int zonefs_file_truncate(struct 
        zonefs_update_stats(inode, isize);
        truncate_setsize(inode, isize);
        zi->i_wpoffset = isize;
 +      zonefs_account_active(inode);
  
  unlock:
        mutex_unlock(&zi->i_truncate_mutex);
@@@ -729,12 -689,13 +729,12 @@@ static ssize_t zonefs_file_dio_append(s
        struct inode *inode = file_inode(iocb->ki_filp);
        struct zonefs_inode_info *zi = ZONEFS_I(inode);
        struct block_device *bdev = inode->i_sb->s_bdev;
 -      unsigned int max;
 +      unsigned int max = bdev_max_zone_append_sectors(bdev);
        struct bio *bio;
        ssize_t size;
        int nr_pages;
        ssize_t ret;
  
 -      max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
        max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
        iov_iter_truncate(from, max);
  
@@@ -900,20 -861,13 +900,20 @@@ static ssize_t zonefs_file_dio_write(st
                ret = zonefs_file_dio_append(iocb, from);
        else
                ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
-                                  &zonefs_write_dio_ops, 0, 0);
+                                  &zonefs_write_dio_ops, 0, NULL, 0);
        if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
            (ret > 0 || ret == -EIOCBQUEUED)) {
                if (ret > 0)
                        count = ret;
 +
 +              /*
 +               * Update the zone write pointer offset assuming the write
 +               * operation succeeded. If it did not, the error recovery path
 +               * will correct it. Also do active seq file accounting.
 +               */
                mutex_lock(&zi->i_truncate_mutex);
                zi->i_wpoffset += count;
 +              zonefs_account_active(inode);
                mutex_unlock(&zi->i_truncate_mutex);
        }
  
@@@ -1042,7 -996,7 +1042,7 @@@ static ssize_t zonefs_file_read_iter(st
                }
                file_accessed(iocb->ki_filp);
                ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
-                                  &zonefs_read_dio_ops, 0, 0);
+                                  &zonefs_read_dio_ops, 0, NULL, 0);
        } else {
                ret = generic_file_read_iter(iocb, to);
                if (ret == -EIO)
@@@ -1055,13 -1009,13 +1055,13 @@@ inode_unlock
        return ret;
  }
  
 -static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
 +/*
 + * Write open accounting is done only for sequential files.
 + */
 +static inline bool zonefs_seq_file_need_wro(struct inode *inode,
 +                                          struct file *file)
  {
        struct zonefs_inode_info *zi = ZONEFS_I(inode);
 -      struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 -
 -      if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
 -              return false;
  
        if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
                return false;
        return true;
  }
  
 -static int zonefs_open_zone(struct inode *inode)
 +static int zonefs_seq_file_write_open(struct inode *inode)
  {
        struct zonefs_inode_info *zi = ZONEFS_I(inode);
 -      struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
        int ret = 0;
  
        mutex_lock(&zi->i_truncate_mutex);
  
        if (!zi->i_wr_refcnt) {
 -              if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
 -                      atomic_dec(&sbi->s_open_zones);
 -                      ret = -EBUSY;
 -                      goto unlock;
 -              }
 +              struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 +              unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
  
 -              if (i_size_read(inode) < zi->i_max_size) {
 -                      ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 -                      if (ret) {
 -                              atomic_dec(&sbi->s_open_zones);
 +              if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
 +
 +                      if (wro > sbi->s_max_wro_seq_files) {
 +                              atomic_dec(&sbi->s_wro_seq_files);
 +                              ret = -EBUSY;
                                goto unlock;
                        }
 -                      zi->i_flags |= ZONEFS_ZONE_OPEN;
 +
 +                      if (i_size_read(inode) < zi->i_max_size) {
 +                              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 +                              if (ret) {
 +                                      atomic_dec(&sbi->s_wro_seq_files);
 +                                      goto unlock;
 +                              }
 +                              zi->i_flags |= ZONEFS_ZONE_OPEN;
 +                              zonefs_account_active(inode);
 +                      }
                }
        }
  
@@@ -1119,31 -1067,30 +1119,31 @@@ static int zonefs_file_open(struct inod
        if (ret)
                return ret;
  
 -      if (zonefs_file_use_exp_open(inode, file))
 -              return zonefs_open_zone(inode);
 +      if (zonefs_seq_file_need_wro(inode, file))
 +              return zonefs_seq_file_write_open(inode);
  
        return 0;
  }
  
 -static void zonefs_close_zone(struct inode *inode)
 +static void zonefs_seq_file_write_close(struct inode *inode)
  {
        struct zonefs_inode_info *zi = ZONEFS_I(inode);
 +      struct super_block *sb = inode->i_sb;
 +      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
        int ret = 0;
  
        mutex_lock(&zi->i_truncate_mutex);
 -      zi->i_wr_refcnt--;
 -      if (!zi->i_wr_refcnt) {
 -              struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 -              struct super_block *sb = inode->i_sb;
  
 -              /*
 -               * If the file zone is full, it is not open anymore and we only
 -               * need to decrement the open count.
 -               */
 -              if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
 -                      goto dec;
 +      zi->i_wr_refcnt--;
 +      if (zi->i_wr_refcnt)
 +              goto unlock;
  
 +      /*
 +       * The file zone may not be open anymore (e.g. the file was truncated to
 +       * its maximum size or it was fully written). For this case, we only
 +       * need to decrement the write open count.
 +       */
 +      if (zi->i_flags & ZONEFS_ZONE_OPEN) {
                ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
                if (ret) {
                        __zonefs_io_error(inode, false);
                         */
                        if (zi->i_flags & ZONEFS_ZONE_OPEN &&
                            !(sb->s_flags & SB_RDONLY)) {
 -                              zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
 +                              zonefs_warn(sb,
 +                                      "closing zone at %llu failed %d\n",
 +                                      zi->i_zsector, ret);
 +                              zonefs_warn(sb,
 +                                      "remounting filesystem read-only\n");
                                sb->s_flags |= SB_RDONLY;
                        }
 +                      goto unlock;
                }
 +
                zi->i_flags &= ~ZONEFS_ZONE_OPEN;
 -dec:
 -              atomic_dec(&sbi->s_open_zones);
 +              zonefs_account_active(inode);
        }
 +
 +      atomic_dec(&sbi->s_wro_seq_files);
 +
 +unlock:
        mutex_unlock(&zi->i_truncate_mutex);
  }
  
@@@ -1183,8 -1121,8 +1183,8 @@@ static int zonefs_file_release(struct i
         * the zone has gone offline or read-only). Make sure we don't fail the
         * close(2) for user-space.
         */
 -      if (zonefs_file_use_exp_open(inode, file))
 -              zonefs_close_zone(inode);
 +      if (zonefs_seq_file_need_wro(inode, file))
 +              zonefs_seq_file_write_close(inode);
  
        return 0;
  }
@@@ -1399,8 -1337,6 +1399,8 @@@ static int zonefs_init_file_inode(struc
        sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
        sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
  
 +      mutex_lock(&zi->i_truncate_mutex);
 +
        /*
         * For sequential zones, make sure that any open zone is closed first
         * to ensure that the initial number of open zones is 0, in sync with
        if (type == ZONEFS_ZTYPE_SEQ &&
            (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
             zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
 -              mutex_lock(&zi->i_truncate_mutex);
                ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
 -              mutex_unlock(&zi->i_truncate_mutex);
 +              if (ret)
 +                      goto unlock;
        }
  
 +      zonefs_account_active(inode);
 +
 +unlock:
 +      mutex_unlock(&zi->i_truncate_mutex);
 +
        return ret;
  }
  
@@@ -1757,18 -1688,14 +1757,18 @@@ static int zonefs_fill_super(struct sup
        sbi->s_gid = GLOBAL_ROOT_GID;
        sbi->s_perm = 0640;
        sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
 -      sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
 -      atomic_set(&sbi->s_open_zones, 0);
 -      if (!sbi->s_max_open_zones &&
 +
 +      atomic_set(&sbi->s_wro_seq_files, 0);
 +      sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
 +      if (!sbi->s_max_wro_seq_files &&
            sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
                zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
                sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
        }
  
 +      atomic_set(&sbi->s_active_seq_files, 0);
 +      sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
 +
        ret = zonefs_read_super(sb);
        if (ret)
                return ret;
        if (ret)
                goto cleanup;
  
 +      ret = zonefs_sysfs_register(sb);
 +      if (ret)
 +              goto cleanup;
 +
        zonefs_info(sb, "Mounting %u zones",
                    blkdev_nr_zones(sb->s_bdev->bd_disk));
  
@@@ -1832,8 -1755,6 +1832,8 @@@ static void zonefs_kill_super(struct su
  
        if (sb->s_root)
                d_genocide(sb->s_root);
 +
 +      zonefs_sysfs_unregister(sb);
        kill_block_super(sb);
        kfree(sbi);
  }
@@@ -1881,26 -1802,16 +1881,26 @@@ static int __init zonefs_init(void
                return ret;
  
        ret = register_filesystem(&zonefs_type);
 -      if (ret) {
 -              zonefs_destroy_inodecache();
 -              return ret;
 -      }
 +      if (ret)
 +              goto destroy_inodecache;
 +
 +      ret = zonefs_sysfs_init();
 +      if (ret)
 +              goto unregister_fs;
  
        return 0;
 +
 +unregister_fs:
 +      unregister_filesystem(&zonefs_type);
 +destroy_inodecache:
 +      zonefs_destroy_inodecache();
 +
 +      return ret;
  }
  
  static void __exit zonefs_exit(void)
  {
 +      zonefs_sysfs_exit();
        zonefs_destroy_inodecache();
        unregister_filesystem(&zonefs_type);
  }
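
zonefs_account_active() above counts a sequential zone file as active when its zone is explicitly open or partially written (write pointer strictly between zero and the maximum size) and keeps s_active_seq_files in step with that state. A small userspace sketch of the predicate and counter, using invented field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ZONE_OPEN	(1u << 0)
#define ZONE_ACTIVE	(1u << 1)

struct toy_zone {
	unsigned int flags;
	uint64_t wpoffset;	/* bytes written so far */
	uint64_t max_size;	/* zone capacity in bytes */
};

/* Keep the per-zone "active" flag and a shared active counter consistent. */
static void account_active(struct toy_zone *z, int *nr_active)
{
	bool active = (z->flags & ZONE_OPEN) ||
		      (z->wpoffset > 0 && z->wpoffset < z->max_size);

	if (active && !(z->flags & ZONE_ACTIVE)) {
		z->flags |= ZONE_ACTIVE;
		(*nr_active)++;
	} else if (!active && (z->flags & ZONE_ACTIVE)) {
		z->flags &= ~ZONE_ACTIVE;
		(*nr_active)--;
	}
}

int main(void)
{
	int nr_active = 0;
	struct toy_zone z = { .flags = 0, .wpoffset = 0, .max_size = 256 << 20 };

	account_active(&z, &nr_active);	/* empty and closed: stays inactive */
	z.wpoffset = 4096;
	account_active(&z, &nr_active);	/* partially written: becomes active */
	z.wpoffset = z.max_size;
	account_active(&z, &nr_active);	/* full: no longer active */

	printf("nr_active=%d\n", nr_active);	/* 0 */
	return 0;
}
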
diff --combined include/linux/fs.h
index 87b5af1d9fbe037dbcbe404547fac061544c3abb,01d61984ce7ae975e120247789252dd4137595b0..02e7f60638b847a60abc9a6fa687dda8a246e29f
@@@ -1708,6 -1708,11 +1708,11 @@@ static inline bool __sb_start_write_try
  #define __sb_writers_release(sb, lev) \
        percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
  
+ static inline bool sb_write_started(const struct super_block *sb)
+ {
+       return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1);
+ }
  /**
   * sb_end_write - drop write access to a superblock
   * @sb: the super we wrote to
@@@ -1953,7 -1958,6 +1958,7 @@@ struct dir_context 
  #define REMAP_FILE_ADVISORY           (REMAP_FILE_CAN_SHORTEN)
  
  struct iov_iter;
 +struct io_uring_cmd;
  
  struct file_operations {
        struct module *owner;
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
 +      int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
  } __randomize_layout;
  
  struct inode_operations {