Merge tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <[email protected]>
Wed, 25 May 2022 01:52:35 +0000 (18:52 -0700)
committer Linus Torvalds <[email protected]>
Wed, 25 May 2022 01:52:35 +0000 (18:52 -0700)
Pull btrfs updates from David Sterba:
 "Features:

   - subpage:
      - support for PAGE_SIZE > 4K (previously only 64K)
      - make it work with raid56

   - repair super block num_devices automatically if it does not match
     the number of device items

   - defrag can convert inline extents to regular extents; up to now
     inline files were skipped, but the setting of the max_inline mount
     option could affect the decision logic

   - zoned:
      - minimal accepted zone size is explicitly set to 4MiB
      - make zone reclaim less aggressive and don't reclaim if there are
        enough free zones
      - add per-profile sysfs tunable of the reclaim threshold

   - allow automatic block group reclaim for non-zoned filesystems, with
     sysfs tunables

   - tree-checker: new check, compare extent buffer owner against owner
     rootid

  Performance:

   - avoid blocking on space reservation when doing nowait direct io
     writes (+7% throughput for reads and writes)

   - NOCOW write throughput improvement due to refined locking (+3%)

   - send: reduce pressure on the page cache by dropping extent pages
     right after they're processed

  Core:

   - convert all radix trees to xarray

   - add iterators for b-tree node items

   - support printk message index

   - use bulk page allocation for extent buffers

   - switch to bio_alloc API, use on-stack bios where convenient, other
     bio cleanups

   - use rw lock for block groups to favor concurrent reads

   - simplify workqueues, don't allocate high priority threads for all
     normal queues as we need only one

   - refactor scrub, process chunks based on their constraints and
     similarity

   - allocate direct io structures on the stack and pass around only
     pointers, which avoids an allocation and simplifies error handling

  Fixes:

   - fix count of reserved transaction items for various inode
     operations

   - fix deadlock between concurrent dio writes when low on free data
     space

   - fix a few cases when zones need to be finished

  VFS, iomap:

   - add helper to check if sb write has started (usable for assertions)

   - new helper iomap_dio_alloc_bio, export iomap_dio_bio_end_io"
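
The "convert all radix trees to xarray" item under Core is visible in the
fs/btrfs/disk-io.c diff further down, where fs_roots_radix becomes the XArray
fs_info->fs_roots. Below is a minimal standalone sketch of the same conversion
pattern using the generic XArray API; the my_root structure and function names
are made up for illustration and are not the real btrfs types:

  #include <linux/types.h>
  #include <linux/xarray.h>
  #include <linux/slab.h>

  /* Hypothetical record type standing in for struct btrfs_root. */
  struct my_root {
          u64 objectid;
  };

  static DEFINE_XARRAY(roots);    /* replaces INIT_RADIX_TREE() */

  /*
   * Insertion: xa_insert() allocates internally and fails with -EBUSY if
   * the index is already occupied, so no radix_tree_preload() dance is
   * needed anymore.
   */
  static int register_root(struct my_root *root)
  {
          return xa_insert(&roots, (unsigned long)root->objectid, root,
                           GFP_NOFS);
  }

  /* Lookup replaces radix_tree_lookup(). */
  static struct my_root *lookup_root(u64 objectid)
  {
          return xa_load(&roots, (unsigned long)objectid);
  }

  /* Iteration replaces the radix_tree_gang_lookup() loops. */
  static void drop_all_roots(void)
  {
          struct my_root *root;
          unsigned long index;

          xa_for_each(&roots, index, root) {
                  xa_erase(&roots, index);
                  kfree(root);
          }
  }

As the diff shows, the real code additionally takes fs_info->fs_roots_lock
around insert and erase so the BTRFS_ROOT_REGISTERED bit changes together with
the entry's presence in the array.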

* tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (173 commits)
  btrfs: zoned: introduce a minimal zone size 4M and reject mount
  btrfs: allow defrag to convert inline extents to regular extents
  btrfs: add "0x" prefix for unsupported optional features
  btrfs: do not account twice for inode ref when reserving metadata units
  btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer
  btrfs: send: avoid trashing the page cache
  btrfs: send: keep the current inode open while processing it
  btrfs: allocate the btrfs_dio_private as part of the iomap dio bio
  btrfs: move struct btrfs_dio_private to inode.c
  btrfs: remove the disk_bytenr in struct btrfs_dio_private
  btrfs: allocate dio_data on stack
  iomap: add per-iomap_iter private data
  iomap: allow the file system to provide a bio_set for direct I/O
  btrfs: add a btrfs_dio_rw wrapper
  btrfs: zoned: zone finish unused block group
  btrfs: zoned: properly finish block group on metadata write
  btrfs: zoned: finish block group when there are no more allocatable bytes left
  btrfs: zoned: consolidate zone finish functions
  btrfs: zoned: introduce btrfs_zoned_bg_is_full
  btrfs: improve error reporting in lookup_inline_extent_backref
  ...

fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/ioctl.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/zoned.c
fs/erofs/data.c
fs/f2fs/file.c
fs/iomap/direct-io.c
fs/zonefs/super.c
include/linux/fs.h

diff --combined fs/btrfs/disk-io.c
index 84795d831282b3152440354db4a2ddc3e0b594e7,f33093513360dc988b1d190827b479f50d45c150..14f8a90df3217b6c6cee48ae21a00a7657ccc64b
@@@ -5,7 -5,6 +5,6 @@@
  
  #include <linux/fs.h>
  #include <linux/blkdev.h>
- #include <linux/radix-tree.h>
  #include <linux/writeback.h>
  #include <linux/workqueue.h>
  #include <linux/kthread.h>
@@@ -374,9 -373,9 +373,9 @@@ int btrfs_verify_level_key(struct exten
   * @level:            expected level, mandatory check
   * @first_key:                expected key of first slot, skip check if NULL
   */
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
-                                         u64 parent_transid, int level,
-                                         struct btrfs_key *first_key)
int btrfs_read_extent_buffer(struct extent_buffer *eb,
+                            u64 parent_transid, int level,
+                            struct btrfs_key *first_key)
  {
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct extent_io_tree *io_tree;
@@@ -486,7 -485,7 +485,7 @@@ static int csum_dirty_subpage_buffers(s
                uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
                                                       fs_info->nodesize);
  
-               /* A dirty eb shouldn't disappear from buffer_radix */
+               /* A dirty eb shouldn't disappear from extent_buffers */
                if (WARN_ON(!eb))
                        return -EUCLEAN;
  
@@@ -519,7 -518,7 +518,7 @@@ static int csum_dirty_buffer(struct btr
        u64 found_start;
        struct extent_buffer *eb;
  
-       if (fs_info->sectorsize < PAGE_SIZE)
+       if (fs_info->nodesize < PAGE_SIZE)
                return csum_dirty_subpage_buffers(fs_info, bvec);
  
        eb = (struct extent_buffer *)page->private;
@@@ -704,7 -703,7 +703,7 @@@ int btrfs_validate_metadata_buffer(stru
  
        ASSERT(page->private);
  
-       if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+       if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
                return validate_subpage_buffer(page, start, end, mirror);
  
        eb = (struct extent_buffer *)page->private;
@@@ -850,8 -849,7 +849,7 @@@ static void run_one_async_free(struct b
  }
  
  blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
-                                int mirror_num, unsigned long bio_flags,
-                                u64 dio_file_offset,
+                                int mirror_num, u64 dio_file_offset,
                                 extent_submit_bio_start_t *submit_bio_start)
  {
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        async->status = 0;
  
        if (op_is_sync(bio->bi_opf))
-               btrfs_set_work_high_priority(&async->work);
-       btrfs_queue_work(fs_info->workers, &async->work);
+               btrfs_queue_work(fs_info->hipri_workers, &async->work);
+       else
+               btrfs_queue_work(fs_info->workers, &async->work);
        return 0;
  }
  
@@@ -920,8 -918,7 +918,7 @@@ static bool should_async_write(struct b
        return true;
  }
  
- blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
-                                      int mirror_num, unsigned long bio_flags)
+ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        blk_status_t ret;
                 */
                ret = btrfs_bio_wq_end_io(fs_info, bio,
                                          BTRFS_WQ_ENDIO_METADATA);
-               if (ret)
-                       goto out_w_error;
-               ret = btrfs_map_bio(fs_info, bio, mirror_num);
+               if (!ret)
+                       ret = btrfs_map_bio(fs_info, bio, mirror_num);
        } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
                ret = btree_csum_one_bio(bio);
-               if (ret)
-                       goto out_w_error;
-               ret = btrfs_map_bio(fs_info, bio, mirror_num);
+               if (!ret)
+                       ret = btrfs_map_bio(fs_info, bio, mirror_num);
        } else {
                /*
                 * kthread helpers are used to submit writes so that
                 * checksumming can happen in parallel across all CPUs
                 */
                ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
-                                         0, btree_submit_bio_start);
+                                         btree_submit_bio_start);
        }
  
-       if (ret)
-               goto out_w_error;
-       return 0;
- out_w_error:
-       bio->bi_status = ret;
-       bio_endio(bio);
-       return ret;
+       if (ret) {
+               bio->bi_status = ret;
+               bio_endio(bio);
+       }
  }
  
  #ifdef CONFIG_MIGRATION
@@@ -1118,12 -1109,15 +1109,15 @@@ struct extent_buffer *read_tree_block(s
        if (IS_ERR(buf))
                return buf;
  
-       ret = btree_read_extent_buffer_pages(buf, parent_transid,
-                                            level, first_key);
+       ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
        if (ret) {
                free_extent_buffer_stale(buf);
                return ERR_PTR(ret);
        }
+       if (btrfs_check_eb_owner(buf, owner_root)) {
+               free_extent_buffer_stale(buf);
+               return ERR_PTR(-EUCLEAN);
+       }
        return buf;
  
  }
@@@ -1164,7 -1158,7 +1158,7 @@@ static void __setup_root(struct btrfs_r
        root->nr_delalloc_inodes = 0;
        root->nr_ordered_extents = 0;
        root->inode_tree = RB_ROOT;
-       INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+       xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
  
        btrfs_init_root_block_rsv(root);
  
        btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
  #ifdef CONFIG_BTRFS_DEBUG
        INIT_LIST_HEAD(&root->leak_list);
-       spin_lock(&fs_info->fs_roots_radix_lock);
+       spin_lock(&fs_info->fs_roots_lock);
        list_add_tail(&root->leak_list, &fs_info->allocated_roots);
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       spin_unlock(&fs_info->fs_roots_lock);
  #endif
  }
  
@@@ -1563,6 -1557,23 +1557,23 @@@ static struct btrfs_root *read_tree_roo
                ret = -EIO;
                goto fail;
        }
+       /*
+        * For real fs, and not log/reloc trees, root owner must
+        * match its root node owner
+        */
+       if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
+           root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+           root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+           root->root_key.objectid != btrfs_header_owner(root->node)) {
+               btrfs_crit(fs_info,
+ "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
+                          root->root_key.objectid, root->node->start,
+                          btrfs_header_owner(root->node),
+                          root->root_key.objectid);
+               ret = -EUCLEAN;
+               goto fail;
+       }
        root->commit_root = btrfs_root_node(root);
        return root;
  fail:
@@@ -1648,12 -1659,11 +1659,11 @@@ static struct btrfs_root *btrfs_lookup_
  {
        struct btrfs_root *root;
  
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       root = radix_tree_lookup(&fs_info->fs_roots_radix,
-                                (unsigned long)root_id);
+       spin_lock(&fs_info->fs_roots_lock);
+       root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
        if (root)
                root = btrfs_grab_root(root);
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       spin_unlock(&fs_info->fs_roots_lock);
        return root;
  }
  
@@@ -1695,20 -1705,14 +1705,14 @@@ int btrfs_insert_fs_root(struct btrfs_f
  {
        int ret;
  
-       ret = radix_tree_preload(GFP_NOFS);
-       if (ret)
-               return ret;
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       ret = radix_tree_insert(&fs_info->fs_roots_radix,
-                               (unsigned long)root->root_key.objectid,
-                               root);
+       spin_lock(&fs_info->fs_roots_lock);
+       ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
+                       root, GFP_NOFS);
        if (ret == 0) {
                btrfs_grab_root(root);
-               set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
+               set_bit(BTRFS_ROOT_REGISTERED, &root->state);
        }
-       spin_unlock(&fs_info->fs_roots_radix_lock);
-       radix_tree_preload_end();
+       spin_unlock(&fs_info->fs_roots_lock);
  
        return ret;
  }
@@@ -1964,7 -1968,7 +1968,7 @@@ static void end_workqueue_fn(struct btr
  
  static int cleaner_kthread(void *arg)
  {
-       struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
+       struct btrfs_fs_info *fs_info = arg;
        int again;
  
        while (1) {
@@@ -2266,10 -2270,12 +2270,12 @@@ static void btrfs_stop_all_workers(stru
  {
        btrfs_destroy_workqueue(fs_info->fixup_workers);
        btrfs_destroy_workqueue(fs_info->delalloc_workers);
+       btrfs_destroy_workqueue(fs_info->hipri_workers);
        btrfs_destroy_workqueue(fs_info->workers);
        btrfs_destroy_workqueue(fs_info->endio_workers);
        btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
-       btrfs_destroy_workqueue(fs_info->rmw_workers);
+       if (fs_info->rmw_workers)
+               destroy_workqueue(fs_info->rmw_workers);
        btrfs_destroy_workqueue(fs_info->endio_write_workers);
        btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
        btrfs_destroy_workqueue(fs_info->delayed_workers);
@@@ -2336,9 -2342,9 +2342,9 @@@ void btrfs_put_root(struct btrfs_root *
                btrfs_drew_lock_destroy(&root->snapshot_lock);
                free_root_extent_buffers(root);
  #ifdef CONFIG_BTRFS_DEBUG
-               spin_lock(&root->fs_info->fs_roots_radix_lock);
+               spin_lock(&root->fs_info->fs_roots_lock);
                list_del_init(&root->leak_list);
-               spin_unlock(&root->fs_info->fs_roots_radix_lock);
+               spin_unlock(&root->fs_info->fs_roots_lock);
  #endif
                kfree(root);
        }
  
  void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
  {
-       int ret;
-       struct btrfs_root *gang[8];
-       int i;
+       struct btrfs_root *root;
+       unsigned long index = 0;
  
        while (!list_empty(&fs_info->dead_roots)) {
-               gang[0] = list_entry(fs_info->dead_roots.next,
-                                    struct btrfs_root, root_list);
-               list_del(&gang[0]->root_list);
+               root = list_entry(fs_info->dead_roots.next,
+                                 struct btrfs_root, root_list);
+               list_del(&root->root_list);
  
-               if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
-                       btrfs_drop_and_free_fs_root(fs_info, gang[0]);
-               btrfs_put_root(gang[0]);
+               if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+                       btrfs_drop_and_free_fs_root(fs_info, root);
+               btrfs_put_root(root);
        }
  
-       while (1) {
-               ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
-                                            (void **)gang, 0,
-                                            ARRAY_SIZE(gang));
-               if (!ret)
-                       break;
-               for (i = 0; i < ret; i++)
-                       btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+       xa_for_each(&fs_info->fs_roots, index, root) {
+               btrfs_drop_and_free_fs_root(fs_info, root);
        }
  }
  
@@@ -2444,7 -2443,9 +2443,9 @@@ static int btrfs_init_workqueues(struc
        unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
  
        fs_info->workers =
-               btrfs_alloc_workqueue(fs_info, "worker",
+               btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
+       fs_info->hipri_workers =
+               btrfs_alloc_workqueue(fs_info, "worker-high",
                                      flags | WQ_HIGHPRI, max_active, 16);
  
        fs_info->delalloc_workers =
        fs_info->endio_raid56_workers =
                btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
                                      max_active, 4);
-       fs_info->rmw_workers =
-               btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
+       fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
        fs_info->endio_write_workers =
                btrfs_alloc_workqueue(fs_info, "endio-write", flags,
                                      max_active, 2);
        fs_info->discard_ctl.discard_workers =
                alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
  
-       if (!(fs_info->workers && fs_info->delalloc_workers &&
-             fs_info->flush_workers &&
+       if (!(fs_info->workers && fs_info->hipri_workers &&
+             fs_info->delalloc_workers && fs_info->flush_workers &&
              fs_info->endio_workers && fs_info->endio_meta_workers &&
              fs_info->endio_meta_write_workers &&
              fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
@@@ -2815,12 -2815,14 +2815,14 @@@ static int validate_super(struct btrfs_
        }
  
        /*
-        * For 4K page size, we only support 4K sector size.
-        * For 64K page size, we support 64K and 4K sector sizes.
+        * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+        *
+        * We can support 16K sectorsize with 64K page size without problem,
+        * but such sectorsize/pagesize combination doesn't make much sense.
+        * 4K will be our future standard, PAGE_SIZE is supported from the very
+        * beginning.
         */
-       if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
-           (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
-                                    sectorsize != SZ_64K))) {
+       if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
                btrfs_err(fs_info,
                        "sectorsize %llu not yet supported for page size %lu",
                        sectorsize, PAGE_SIZE);
@@@ -3132,8 -3134,8 +3134,8 @@@ static int __cold init_tree_roots(struc
  
  void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
  {
-       INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-       INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+       xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
+       xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_root_lock);
        spin_lock_init(&fs_info->trans_lock);
-       spin_lock_init(&fs_info->fs_roots_radix_lock);
+       spin_lock_init(&fs_info->fs_roots_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
        spin_lock_init(&fs_info->super_lock);
        btrfs_init_balance(fs_info);
        btrfs_init_async_reclaim_work(fs_info);
  
-       spin_lock_init(&fs_info->block_group_cache_lock);
-       fs_info->block_group_cache_tree = RB_ROOT;
-       fs_info->first_logical_byte = (u64)-1;
+       rwlock_init(&fs_info->block_group_cache_lock);
+       fs_info->block_group_cache_tree = RB_ROOT_CACHED;
  
        extent_io_tree_init(fs_info, &fs_info->excluded_extents,
                            IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
@@@ -3295,7 -3296,7 +3296,7 @@@ static int init_mount_fs_info(struct bt
  
  static int btrfs_uuid_rescan_kthread(void *data)
  {
-       struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+       struct btrfs_fs_info *fs_info = data;
        int ret;
  
        /*
@@@ -3373,7 -3374,7 +3374,7 @@@ int btrfs_start_pre_rw_mount(struct btr
        /*
         * btrfs_find_orphan_roots() is responsible for finding all the dead
         * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
-        * them into the fs_info->fs_roots_radix tree. This must be done before
+        * them into the fs_info->fs_roots. This must be done before
         * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
         * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
         * item before the root's tree is deleted - this means that if we unmount
@@@ -3611,7 -3612,7 +3612,7 @@@ int __cold open_ctree(struct super_bloc
                ~BTRFS_FEATURE_INCOMPAT_SUPP;
        if (features) {
                btrfs_err(fs_info,
-                   "cannot mount because of unsupported optional features (%llx)",
+                   "cannot mount because of unsupported optional features (0x%llx)",
                    features);
                err = -EINVAL;
                goto fail_alloc;
                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
        if (!sb_rdonly(sb) && features) {
                btrfs_err(fs_info,
-       "cannot mount read-write because of unsupported optional features (%llx)",
+       "cannot mount read-write because of unsupported optional features (0x%llx)",
                       features);
                err = -EINVAL;
                goto fail_alloc;
                btrfs_warn(fs_info,
                "read-write for sector size %u with page size %lu is experimental",
                           sectorsize, PAGE_SIZE);
-               if (btrfs_super_incompat_flags(fs_info->super_copy) &
-                       BTRFS_FEATURE_INCOMPAT_RAID56) {
-                       btrfs_err(fs_info,
-               "RAID56 is not yet supported for sector size %u with page size %lu",
-                               sectorsize, PAGE_SIZE);
-                       err = -EINVAL;
-                       goto fail_alloc;
-               }
                subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
                if (!subpage_info)
                        goto fail_alloc;
@@@ -4157,7 -4150,8 +4150,8 @@@ static int write_dev_supers(struct btrf
                if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
                        bio->bi_opf |= REQ_FUA;
  
-               btrfsic_submit_bio(bio);
+               btrfsic_check_bio(bio);
+               submit_bio(bio);
  
                if (btrfs_advance_sb_log(device, i))
                        errors++;
@@@ -4238,7 -4232,6 +4232,7 @@@ static int wait_dev_supers(struct btrfs
   */
  static void btrfs_end_empty_barrier(struct bio *bio)
  {
 +      bio_uninit(bio);
        complete(bio->bi_private);
  }
  
   */
  static void write_dev_flush(struct btrfs_device *device)
  {
 -      struct bio *bio = device->flush_bio;
 +      struct bio *bio = &device->flush_bio;
  
  #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        /*
         * of simplicity, since this is a debug tool and not meant for use in
         * non-debug builds.
         */
 -      struct request_queue *q = bdev_get_queue(device->bdev);
 -      if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
 +      if (!bdev_write_cache(device->bdev))
                return;
  #endif
  
 -      bio_reset(bio, device->bdev, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
 +      bio_init(bio, device->bdev, NULL, 0,
 +               REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
        bio->bi_end_io = btrfs_end_empty_barrier;
        init_completion(&device->flush_wait);
        bio->bi_private = &device->flush_wait;
  
-       btrfsic_submit_bio(bio);
+       btrfsic_check_bio(bio);
+       submit_bio(bio);
        set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
  }
  
   */
  static blk_status_t wait_dev_flush(struct btrfs_device *device)
  {
 -      struct bio *bio = device->flush_bio;
 +      struct bio *bio = &device->flush_bio;
  
        if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
                return BLK_STS_OK;
@@@ -4504,12 -4498,11 +4499,11 @@@ void btrfs_drop_and_free_fs_root(struc
  {
        bool drop_ref = false;
  
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       radix_tree_delete(&fs_info->fs_roots_radix,
-                         (unsigned long)root->root_key.objectid);
-       if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+       spin_lock(&fs_info->fs_roots_lock);
+       xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
+       if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
                drop_ref = true;
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       spin_unlock(&fs_info->fs_roots_lock);
  
        if (BTRFS_FS_ERROR(fs_info)) {
                ASSERT(root->log_root == NULL);
  
  int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
  {
-       u64 root_objectid = 0;
-       struct btrfs_root *gang[8];
-       int i = 0;
+       struct btrfs_root *roots[8];
+       unsigned long index = 0;
+       int i;
        int err = 0;
-       unsigned int ret = 0;
+       int grabbed;
  
        while (1) {
-               spin_lock(&fs_info->fs_roots_radix_lock);
-               ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
-                                            (void **)gang, root_objectid,
-                                            ARRAY_SIZE(gang));
-               if (!ret) {
-                       spin_unlock(&fs_info->fs_roots_radix_lock);
-                       break;
+               struct btrfs_root *root;
+               spin_lock(&fs_info->fs_roots_lock);
+               if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
+                       spin_unlock(&fs_info->fs_roots_lock);
+                       return err;
                }
-               root_objectid = gang[ret - 1]->root_key.objectid + 1;
  
-               for (i = 0; i < ret; i++) {
-                       /* Avoid to grab roots in dead_roots */
-                       if (btrfs_root_refs(&gang[i]->root_item) == 0) {
-                               gang[i] = NULL;
-                               continue;
-                       }
-                       /* grab all the search result for later use */
-                       gang[i] = btrfs_grab_root(gang[i]);
+               grabbed = 0;
+               xa_for_each_start(&fs_info->fs_roots, index, root, index) {
+                       /* Avoid grabbing roots in dead_roots */
+                       if (btrfs_root_refs(&root->root_item) > 0)
+                               roots[grabbed++] = btrfs_grab_root(root);
+                       if (grabbed >= ARRAY_SIZE(roots))
+                               break;
                }
-               spin_unlock(&fs_info->fs_roots_radix_lock);
+               spin_unlock(&fs_info->fs_roots_lock);
  
-               for (i = 0; i < ret; i++) {
-                       if (!gang[i])
+               for (i = 0; i < grabbed; i++) {
+                       if (!roots[i])
                                continue;
-                       root_objectid = gang[i]->root_key.objectid;
-                       err = btrfs_orphan_cleanup(gang[i]);
+                       index = roots[i]->root_key.objectid;
+                       err = btrfs_orphan_cleanup(roots[i]);
                        if (err)
-                               break;
-                       btrfs_put_root(gang[i]);
+                               goto out;
+                       btrfs_put_root(roots[i]);
                }
-               root_objectid++;
+               index++;
        }
  
-       /* release the uncleaned roots due to error */
-       for (; i < ret; i++) {
-               if (gang[i])
-                       btrfs_put_root(gang[i]);
+ out:
+       /* Release the roots that remain uncleaned due to error */
+       for (; i < grabbed; i++) {
+               if (roots[i])
+                       btrfs_put_root(roots[i]);
        }
        return err;
  }
@@@ -4863,13 -4854,6 +4855,6 @@@ void btrfs_btree_balance_dirty_nodelay(
        __btrfs_btree_balance_dirty(fs_info, 0);
  }
  
- int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
-                     struct btrfs_key *first_key)
- {
-       return btree_read_extent_buffer_pages(buf, parent_transid,
-                                             level, first_key);
- }
  static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
  {
        /* cleanup FS via transaction */
  
  static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_root *gang[8];
-       u64 root_objectid = 0;
-       int ret;
-       spin_lock(&fs_info->fs_roots_radix_lock);
-       while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
-                                            (void **)gang, root_objectid,
-                                            ARRAY_SIZE(gang))) != 0) {
-               int i;
-               for (i = 0; i < ret; i++)
-                       gang[i] = btrfs_grab_root(gang[i]);
-               spin_unlock(&fs_info->fs_roots_radix_lock);
-               for (i = 0; i < ret; i++) {
-                       if (!gang[i])
+       unsigned long index = 0;
+       int grabbed = 0;
+       struct btrfs_root *roots[8];
+       spin_lock(&fs_info->fs_roots_lock);
+       while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
+                                    ULONG_MAX, 8, XA_PRESENT))) {
+               for (int i = 0; i < grabbed; i++)
+                       roots[i] = btrfs_grab_root(roots[i]);
+               spin_unlock(&fs_info->fs_roots_lock);
+               for (int i = 0; i < grabbed; i++) {
+                       if (!roots[i])
                                continue;
-                       root_objectid = gang[i]->root_key.objectid;
-                       btrfs_free_log(NULL, gang[i]);
-                       btrfs_put_root(gang[i]);
+                       index = roots[i]->root_key.objectid;
+                       btrfs_free_log(NULL, roots[i]);
+                       btrfs_put_root(roots[i]);
                }
-               root_objectid++;
-               spin_lock(&fs_info->fs_roots_radix_lock);
+               index++;
+               spin_lock(&fs_info->fs_roots_lock);
        }
-       spin_unlock(&fs_info->fs_roots_radix_lock);
+       spin_unlock(&fs_info->fs_roots_lock);
        btrfs_free_log_root_tree(NULL, fs_info);
  }
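
The write_dev_flush()/wait_dev_flush() hunks above also illustrate the "use
on-stack bios where convenient" item from Core: device->flush_bio becomes a
struct bio embedded in the device and is re-initialized with bio_init()
instead of being allocated. A simplified sketch of that pattern, assuming a
hypothetical my_device structure rather than the real struct btrfs_device:

  #include <linux/bio.h>
  #include <linux/blkdev.h>
  #include <linux/completion.h>

  struct my_device {
          struct block_device *bdev;
          struct bio flush_bio;           /* embedded, never allocated */
          struct completion flush_wait;
  };

  static void flush_end_io(struct bio *bio)
  {
          bio_uninit(bio);                /* pairs with bio_init() below */
          complete(bio->bi_private);
  }

  /* Issue an empty preflush; returns false if there is no write cache. */
  static bool send_empty_flush(struct my_device *dev)
  {
          struct bio *bio = &dev->flush_bio;

          if (!bdev_write_cache(dev->bdev))
                  return false;           /* no volatile cache to flush */

          bio_init(bio, dev->bdev, NULL, 0,
                   REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
          bio->bi_end_io = flush_end_io;
          init_completion(&dev->flush_wait);
          bio->bi_private = &dev->flush_wait;
          submit_bio(bio);
          return true;
  }

  static void wait_empty_flush(struct my_device *dev)
  {
          wait_for_completion_io(&dev->flush_wait);
  }

A caller would only wait when send_empty_flush() returned true; the real code
tracks this with the BTRFS_DEV_STATE_FLUSH_SENT bit instead of a return value.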
  
diff --combined fs/btrfs/extent-tree.c
index 6260784e74b5ae66b7ef6559ac67ba936376d2c6,fb367689d9d20e6b7c5e1c6e0a4298873735bf52..0867c5cd6e017a79865cb2f7334709021469bf1e
@@@ -895,7 -895,13 +895,13 @@@ again
        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
-                       WARN_ON(ptr > end);
+                       if (ptr > end) {
+                               err = -EUCLEAN;
+                               btrfs_print_leaf(path->nodes[0]);
+                               btrfs_crit(fs_info,
+ "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
+                                       path->slots[0], root_objectid, owner, offset, parent);
+                       }
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
@@@ -1239,7 -1245,7 +1245,7 @@@ static int btrfs_issue_discard(struct b
  
                if (size) {
                        ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
 -                                                 GFP_NOFS, 0);
 +                                                 GFP_NOFS);
                        if (!ret)
                                *discarded_bytes += size;
                        else if (ret != -EOPNOTSUPP)
  
        if (bytes_left) {
                ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
 -                                         GFP_NOFS, 0);
 +                                         GFP_NOFS);
                if (!ret)
                        *discarded_bytes += bytes_left;
        }
@@@ -1291,7 -1297,7 +1297,7 @@@ static int do_discard_extent(struct btr
                ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
                                              &discarded);
                discarded += src_disc;
 -      } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
 +      } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
                ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
        } else {
                ret = 0;
@@@ -1577,12 -1583,12 +1583,12 @@@ static int run_delayed_extent_op(struc
        u32 item_size;
        int ret;
        int err = 0;
-       int metadata = !extent_op->is_data;
+       int metadata = 1;
  
        if (TRANS_ABORTED(trans))
                return 0;
  
-       if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+       if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
                metadata = 0;
  
        path = btrfs_alloc_path();
@@@ -2180,7 -2186,7 +2186,7 @@@ out
  
  int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct extent_buffer *eb, u64 flags,
-                               int level, int is_data)
+                               int level)
  {
        struct btrfs_delayed_extent_op *extent_op;
        int ret;
        extent_op->flags_to_set = flags;
        extent_op->update_flags = true;
        extent_op->update_key = false;
-       extent_op->is_data = is_data ? true : false;
        extent_op->level = level;
  
        ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
  }
  
  int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
-                         u64 bytenr, bool strict)
+                         u64 bytenr, bool strict, struct btrfs_path *path)
  {
-       struct btrfs_path *path;
        int ret;
  
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
        do {
                ret = check_committed_ref(root, path, objectid,
                                          offset, bytenr, strict);
        } while (ret == -EAGAIN);
  
  out:
-       btrfs_free_path(path);
+       btrfs_release_path(path);
        if (btrfs_is_data_reloc_root(root))
                WARN_ON(ret > 0);
        return ret;
@@@ -2497,24 -2497,21 +2497,21 @@@ static u64 get_alloc_profile_by_root(st
        return ret;
  }
  
- static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
+ static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_block_group *cache;
-       u64 bytenr;
-       spin_lock(&fs_info->block_group_cache_lock);
-       bytenr = fs_info->first_logical_byte;
-       spin_unlock(&fs_info->block_group_cache_lock);
-       if (bytenr < (u64)-1)
-               return bytenr;
+       struct rb_node *leftmost;
+       u64 bytenr = 0;
  
-       cache = btrfs_lookup_first_block_group(fs_info, search_start);
-       if (!cache)
-               return 0;
+       read_lock(&fs_info->block_group_cache_lock);
+       /* Get the block group with the lowest logical start address. */
+       leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
+       if (leftmost) {
+               struct btrfs_block_group *bg;
  
-       bytenr = cache->start;
-       btrfs_put_block_group(cache);
+               bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
+               bytenr = bg->start;
+       }
+       read_unlock(&fs_info->block_group_cache_lock);
  
        return bytenr;
  }
@@@ -3803,8 -3800,7 +3800,7 @@@ static int do_allocation_zoned(struct b
  
        /* Check RO and no space case before trying to activate it */
        spin_lock(&block_group->lock);
-       if (block_group->ro ||
-           block_group->alloc_offset == block_group->zone_capacity) {
+       if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
                ret = 1;
                /*
                 * May need to clear fs_info->{treelog,data_reloc}_bg.
@@@ -4272,7 -4268,7 +4268,7 @@@ static noinline int find_free_extent(st
                return ret;
  
        ffe_ctl->search_start = max(ffe_ctl->search_start,
-                                   first_logical_byte(fs_info, 0));
+                                   first_logical_byte(fs_info));
        ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
        if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
                block_group = btrfs_lookup_block_group(fs_info,
@@@ -4959,7 -4955,6 +4955,6 @@@ struct extent_buffer *btrfs_alloc_tree_
                extent_op->flags_to_set = flags;
                extent_op->update_key = skinny_metadata ? false : true;
                extent_op->update_flags = true;
-               extent_op->is_data = false;
                extent_op->level = level;
  
                btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
@@@ -5144,7 -5139,7 +5139,7 @@@ static noinline int walk_down_proc(stru
                ret = btrfs_dec_ref(trans, root, eb, 0);
                BUG_ON(ret); /* -ENOMEM */
                ret = btrfs_set_disk_extent_flags(trans, eb, flag,
-                                                 btrfs_header_level(eb), 0);
+                                                 btrfs_header_level(eb));
                BUG_ON(ret); /* -ENOMEM */
                wc->flags[level] |= flag;
        }
@@@ -5818,7 -5813,7 +5813,7 @@@ int btrfs_drop_snapshot(struct btrfs_ro
        btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
        btrfs_qgroup_free_meta_all_pertrans(root);
  
-       if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
+       if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
                btrfs_add_dropped_root(trans, root);
        else
                btrfs_put_root(root);
@@@ -5987,7 -5982,7 +5982,7 @@@ static int btrfs_trim_free_extents(stru
        *trimmed = 0;
  
        /* Discard not supported = nothing to do. */
 -      if (!blk_queue_discard(bdev_get_queue(device->bdev)))
 +      if (!bdev_max_discard_sectors(device->bdev))
                return 0;
  
        /* Not writable = nothing to do. */
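
The do_discard_extent() and btrfs_trim_free_extents() hunks above, like the
FITRIM hunk in the fs/btrfs/ioctl.c diff below, replace request-queue flag
checks with block-device helpers: blk_queue_discard(bdev_get_queue(bdev))
becomes bdev_max_discard_sectors(bdev), and the granularity is read with
bdev_discard_granularity(). A small sketch of a FITRIM-style capability check
using the new helpers; the function name is made up for illustration:

  #include <linux/blkdev.h>

  /*
   * Report whether a device supports discard and fold its granularity
   * into the minimum trim length the caller should accept.
   */
  static bool device_can_trim(struct block_device *bdev, u64 *minlen)
  {
          if (!bdev_max_discard_sectors(bdev))
                  return false;           /* discard not supported */

          *minlen = min_t(u64, bdev_discard_granularity(bdev), *minlen);
          return true;
  }
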
diff --combined fs/btrfs/ioctl.c
index b2c692b2fd8d35d1e631ae5d1ea8b103acf2f31f,fdc23d1b72162b47a2433022d3c4cec4529e8bbd..43b6f23bbd8926a6518636f6ac407131b709c9d9
@@@ -468,6 -468,7 +468,6 @@@ static noinline int btrfs_ioctl_fitrim(
                                        void __user *arg)
  {
        struct btrfs_device *device;
 -      struct request_queue *q;
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
        rcu_read_lock();
        list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
                                dev_list) {
 -              if (!device->bdev)
 +              if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
                        continue;
 -              q = bdev_get_queue(device->bdev);
 -              if (blk_queue_discard(q)) {
 -                      num_devices++;
 -                      minlen = min_t(u64, q->limits.discard_granularity,
 -                                   minlen);
 -              }
 +              num_devices++;
 +              minlen = min_t(u64, bdev_discard_granularity(device->bdev),
 +                                  minlen);
        }
        rcu_read_unlock();
  
@@@ -540,9 -544,35 +540,35 @@@ int __pure btrfs_is_empty_uuid(u8 *uuid
        return 1;
  }
  
+ /*
+  * Calculate the number of transaction items to reserve for creating a subvolume
+  * or snapshot, not including the inode, directory entries, or parent directory.
+  */
+ static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+ {
+       /*
+        * 1 to add root block
+        * 1 to add root item
+        * 1 to add root ref
+        * 1 to add root backref
+        * 1 to add UUID item
+        * 1 to add qgroup info
+        * 1 to add qgroup limit
+        *
+        * Ideally the last two would only be accounted if qgroups are enabled,
+        * but that can change between now and the time we would insert them.
+        */
+       unsigned int num_items = 7;
+       if (inherit) {
+               /* 2 to add qgroup relations for each inherited qgroup */
+               num_items += 2 * inherit->num_qgroups;
+       }
+       return num_items;
+ }
  static noinline int create_subvol(struct user_namespace *mnt_userns,
                                  struct inode *dir, struct dentry *dentry,
-                                 const char *name, int namelen,
                                  struct btrfs_qgroup_inherit *inherit)
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct btrfs_root *new_root;
        struct btrfs_block_rsv block_rsv;
        struct timespec64 cur_time = current_time(dir);
-       struct inode *inode;
+       struct btrfs_new_inode_args new_inode_args = {
+               .dir = dir,
+               .dentry = dentry,
+               .subvol = true,
+       };
+       unsigned int trans_num_items;
        int ret;
-       dev_t anon_dev = 0;
+       dev_t anon_dev;
        u64 objectid;
-       u64 index = 0;
  
        root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
        if (!root_item)
  
        ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
        if (ret)
-               goto fail_free;
-       ret = get_anon_bdev(&anon_dev);
-       if (ret < 0)
-               goto fail_free;
+               goto out_root_item;
  
        /*
         * Don't create subvolume whose level is not zero. Or qgroup will be
         */
        if (btrfs_qgroup_level(objectid)) {
                ret = -ENOSPC;
-               goto fail_free;
+               goto out_root_item;
        }
  
+       ret = get_anon_bdev(&anon_dev);
+       if (ret < 0)
+               goto out_root_item;
+       new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
+       if (!new_inode_args.inode) {
+               ret = -ENOMEM;
+               goto out_anon_dev;
+       }
+       ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+       if (ret)
+               goto out_inode;
+       trans_num_items += create_subvol_num_items(inherit);
        btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
-       /*
-        * The same as the snapshot creation, please see the comment
-        * of create_snapshot().
-        */
-       ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
+       ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+                                              trans_num_items, false);
        if (ret)
-               goto fail_free;
+               goto out_new_inode_args;
  
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                btrfs_subvolume_release_metadata(root, &block_rsv);
-               goto fail_free;
+               goto out_new_inode_args;
        }
        trans->block_rsv = &block_rsv;
        trans->bytes_reserved = block_rsv.size;
  
        ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
        if (ret)
-               goto fail;
+               goto out;
  
        leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
                                      BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
-               goto fail;
+               goto out;
        }
  
        btrfs_mark_buffer_dirty(leaf);
                btrfs_tree_unlock(leaf);
                btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
                free_extent_buffer(leaf);
-               goto fail;
+               goto out;
        }
  
        free_extent_buffer(leaf);
        leaf = NULL;
  
-       key.offset = (u64)-1;
        new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
        if (IS_ERR(new_root)) {
-               free_anon_bdev(anon_dev);
                ret = PTR_ERR(new_root);
                btrfs_abort_transaction(trans, ret);
-               goto fail;
+               goto out;
        }
-       /* Freeing will be done in btrfs_put_root() of new_root */
+       /* anon_dev is owned by new_root now. */
        anon_dev = 0;
+       BTRFS_I(new_inode_args.inode)->root = new_root;
+       /* ... and new_root is owned by new_inode_args.inode now. */
  
        ret = btrfs_record_root_in_trans(trans, new_root);
        if (ret) {
-               btrfs_put_root(new_root);
-               btrfs_abort_transaction(trans, ret);
-               goto fail;
-       }
-       ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
-       btrfs_put_root(new_root);
-       if (ret) {
-               /* We potentially lose an unused inode item here */
                btrfs_abort_transaction(trans, ret);
-               goto fail;
-       }
-       /*
-        * insert the directory item
-        */
-       ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
-       if (ret) {
-               btrfs_abort_transaction(trans, ret);
-               goto fail;
-       }
-       ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
-                                   BTRFS_FT_DIR, index);
-       if (ret) {
-               btrfs_abort_transaction(trans, ret);
-               goto fail;
+               goto out;
        }
  
-       btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
-       ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
+       ret = btrfs_uuid_tree_add(trans, root_item->uuid,
+                                 BTRFS_UUID_KEY_SUBVOL, objectid);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
-               goto fail;
+               goto out;
        }
  
-       ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
-                                btrfs_ino(BTRFS_I(dir)), index, name, namelen);
+       ret = btrfs_create_new_inode(trans, &new_inode_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
-               goto fail;
+               goto out;
        }
  
-       ret = btrfs_uuid_tree_add(trans, root_item->uuid,
-                                 BTRFS_UUID_KEY_SUBVOL, objectid);
-       if (ret)
-               btrfs_abort_transaction(trans, ret);
+       d_instantiate_new(dentry, new_inode_args.inode);
+       new_inode_args.inode = NULL;
  
- fail:
-       kfree(root_item);
+ out:
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
        btrfs_subvolume_release_metadata(root, &block_rsv);
                btrfs_end_transaction(trans);
        else
                ret = btrfs_commit_transaction(trans);
-       if (!ret) {
-               inode = btrfs_lookup_dentry(dir, dentry);
-               if (IS_ERR(inode))
-                       return PTR_ERR(inode);
-               d_instantiate(dentry, inode);
-       }
-       return ret;
- fail_free:
+ out_new_inode_args:
+       btrfs_new_inode_args_destroy(&new_inode_args);
+ out_inode:
+       iput(new_inode_args.inode);
+ out_anon_dev:
        if (anon_dev)
                free_anon_bdev(anon_dev);
+ out_root_item:
        kfree(root_item);
        return ret;
  }
@@@ -763,6 -771,7 +767,7 @@@ static int create_snapshot(struct btrfs
        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
+       unsigned int trans_num_items;
        struct btrfs_trans_handle *trans;
        int ret;
  
        btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                             BTRFS_BLOCK_RSV_TEMP);
        /*
-        * 1 - parent dir inode
-        * 2 - dir entries
-        * 1 - root item
-        * 2 - root ref/backref
-        * 1 - root of snapshot
-        * 1 - UUID item
+        * 1 to add dir item
+        * 1 to add dir index
+        * 1 to update parent inode item
         */
+       trans_num_items = create_subvol_num_items(inherit) + 3;
        ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
-                                       &pending_snapshot->block_rsv, 8,
-                                       false);
+                                              &pending_snapshot->block_rsv,
+                                              trans_num_items, false);
        if (ret)
                goto free_pending;
  
@@@ -979,7 -986,7 +982,7 @@@ static noinline int btrfs_mksubvol(cons
        if (snap_src)
                error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
        else
-               error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
+               error = create_subvol(mnt_userns, dir, dentry, inherit);
  
        if (!error)
                fsnotify_mkdir(dir, dentry);
@@@ -1413,8 -1420,19 +1416,19 @@@ static int defrag_collect_targets(struc
                if (!em)
                        break;
  
-               /* Skip hole/inline/preallocated extents */
-               if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+               /*
+                * If the file extent is an inlined one, we may still want to
+                * defrag it (fallthrough) if it will cause a regular extent.
+                * This is for users who want to convert inline extents to
+                * regular ones through max_inline= mount option.
+                */
+               if (em->block_start == EXTENT_MAP_INLINE &&
+                   em->len <= inode->root->fs_info->max_inline)
+                       goto next;
+               /* Skip hole/delalloc/preallocated extents */
+               if (em->block_start == EXTENT_MAP_HOLE ||
+                   em->block_start == EXTENT_MAP_DELALLOC ||
                    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        goto next;
  
                if (em->len >= get_extent_max_capacity(em))
                        goto next;
  
+               /*
+                * Normally there are no more extents after an inline one, thus
+                * @next_mergeable will normally be false and not defragged.
+                * So if an inline extent passed all above checks, just add it
+                * for defrag, and be converted to regular extents.
+                */
+               if (em->block_start == EXTENT_MAP_INLINE)
+                       goto add;
                next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
                                                extent_thresh, newer_than, locked);
                if (!next_mergeable) {
@@@ -2561,12 -2588,7 +2584,12 @@@ static noinline int search_ioctl(struc
  
        while (1) {
                ret = -EFAULT;
 -              if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
 +              /*
 +               * Ensure that the whole user buffer is faulted in at sub-page
 +               * granularity, otherwise the loop may live-lock.
 +               */
 +              if (fault_in_subpage_writeable(ubuf + sk_offset,
 +                                             *buf_size - sk_offset))
                        break;
  
                ret = btrfs_search_forward(root, &key, path, sk->min_transid);
@@@ -2594,7 -2616,7 +2617,7 @@@ err
  static noinline int btrfs_ioctl_tree_search(struct inode *inode,
                                            void __user *argp)
  {
-       struct btrfs_ioctl_search_args __user *uargs;
+       struct btrfs_ioctl_search_args __user *uargs = argp;
        struct btrfs_ioctl_search_key sk;
        int ret;
        size_t buf_size;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
-       uargs = (struct btrfs_ioctl_search_args __user *)argp;
        if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
                return -EFAULT;
  
  static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
                                               void __user *argp)
  {
-       struct btrfs_ioctl_search_args_v2 __user *uarg;
+       struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
        struct btrfs_ioctl_search_args_v2 args;
        int ret;
        size_t buf_size;
                return -EPERM;
  
        /* copy search header and buffer size */
-       uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
        if (copy_from_user(&args, uarg, sizeof(args)))
                return -EFAULT;
  
@@@ -4344,10 -4363,6 +4364,6 @@@ static long btrfs_ioctl_balance(struct 
        bool need_unlock; /* for mut. excl. ops lock */
        int ret;
  
-       if (!arg)
-               btrfs_warn(fs_info,
-       "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
  
        if (ret)
                return ret;
  
+       bargs = memdup_user(arg, sizeof(*bargs));
+       if (IS_ERR(bargs)) {
+               ret = PTR_ERR(bargs);
+               bargs = NULL;
+               goto out;
+       }
  again:
        if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
                mutex_lock(&fs_info->balance_mutex);
        }
  
  locked:
-       if (arg) {
-               bargs = memdup_user(arg, sizeof(*bargs));
-               if (IS_ERR(bargs)) {
-                       ret = PTR_ERR(bargs);
+       if (bargs->flags & BTRFS_BALANCE_RESUME) {
+               if (!fs_info->balance_ctl) {
+                       ret = -ENOTCONN;
                        goto out_unlock;
                }
  
-               if (bargs->flags & BTRFS_BALANCE_RESUME) {
-                       if (!fs_info->balance_ctl) {
-                               ret = -ENOTCONN;
-                               goto out_bargs;
-                       }
+               bctl = fs_info->balance_ctl;
+               spin_lock(&fs_info->balance_lock);
+               bctl->flags |= BTRFS_BALANCE_RESUME;
+               spin_unlock(&fs_info->balance_lock);
+               btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
  
-                       bctl = fs_info->balance_ctl;
-                       spin_lock(&fs_info->balance_lock);
-                       bctl->flags |= BTRFS_BALANCE_RESUME;
-                       spin_unlock(&fs_info->balance_lock);
-                       btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
+               goto do_balance;
+       }
  
-                       goto do_balance;
-               }
-       } else {
-               bargs = NULL;
+       if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+               ret = -EINVAL;
+               goto out_unlock;
        }
  
        if (fs_info->balance_ctl) {
                ret = -EINPROGRESS;
-               goto out_bargs;
+               goto out_unlock;
        }
  
        bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
        if (!bctl) {
                ret = -ENOMEM;
-               goto out_bargs;
-       }
-       if (arg) {
-               memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
-               memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
-               memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
-               bctl->flags = bargs->flags;
-       } else {
-               /* balance everything - no filters */
-               bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+               goto out_unlock;
        }
  
-       if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
-               ret = -EINVAL;
-               goto out_bctl;
-       }
+       memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+       memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+       memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
  
+       bctl->flags = bargs->flags;
  do_balance:
        /*
         * Ownership of bctl and exclusive operation goes to btrfs_balance.
        ret = btrfs_balance(fs_info, bctl, bargs);
        bctl = NULL;
  
-       if ((ret == 0 || ret == -ECANCELED) && arg) {
+       if (ret == 0 || ret == -ECANCELED) {
                if (copy_to_user(arg, bargs, sizeof(*bargs)))
                        ret = -EFAULT;
        }
  
- out_bctl:
        kfree(bctl);
- out_bargs:
-       kfree(bargs);
  out_unlock:
        mutex_unlock(&fs_info->balance_mutex);
        if (need_unlock)
                btrfs_exclop_finish(fs_info);
  out:
        mnt_drop_write_file(file);
+       kfree(bargs);
        return ret;
  }
  
diff --combined fs/btrfs/volumes.c
index b6b00338037c49b56d27af50ed7668763b89bbcf,58f3eece8a48c4fb67ffd354ff63015157201839..9c20049d1fecf3927b5ffaa908c67f769ece77c1
@@@ -164,24 -164,12 +164,12 @@@ const struct btrfs_raid_attr btrfs_raid
   */
  enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
  {
-       if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               return BTRFS_RAID_RAID10;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               return BTRFS_RAID_RAID1;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
-               return BTRFS_RAID_RAID1C3;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
-               return BTRFS_RAID_RAID1C4;
-       else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               return BTRFS_RAID_DUP;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               return BTRFS_RAID_RAID0;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-               return BTRFS_RAID_RAID5;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-               return BTRFS_RAID_RAID6;
-       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+       const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
+       if (!profile)
+               return BTRFS_RAID_SINGLE;
+       return BTRFS_BG_FLAG_TO_INDEX(profile);
  }
  
  const char *btrfs_bg_type_to_raid_name(u64 flags)
@@@ -405,6 -393,7 +393,6 @@@ void btrfs_free_device(struct btrfs_dev
        WARN_ON(!list_empty(&device->post_commit_list));
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
 -      bio_put(device->flush_bio);
        btrfs_destroy_dev_zone_info(device);
        kfree(device);
  }
@@@ -642,7 -631,7 +630,7 @@@ static int btrfs_open_one_device(struc
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }
  
 -      if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 +      if (!bdev_nonrot(bdev))
                fs_devices->rotating = true;
  
        device->bdev = bdev;
@@@ -2705,7 -2694,7 +2693,7 @@@ int btrfs_init_new_device(struct btrfs_
  
        atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
  
 -      if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 +      if (!bdev_nonrot(bdev))
                fs_devices->rotating = true;
  
        orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@@ -4062,13 -4051,6 +4050,6 @@@ static inline int validate_convert_prof
        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
                return true;
  
-       if (fs_info->sectorsize < PAGE_SIZE &&
-               bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-               btrfs_err(fs_info,
-               "RAID56 is not yet supported for sectorsize %u with page size %lu",
-                         fs_info->sectorsize, PAGE_SIZE);
-               return false;
-       }
        /* Profile is valid and does not have bits outside of the allowed set */
        if (alloc_profile_is_valid(bargs->target, 1) &&
            (bargs->target & ~allowed) == 0)
@@@ -6312,7 -6294,7 +6293,7 @@@ int btrfs_get_io_geometry(struct btrfs_
        u64 offset;
        u64 stripe_offset;
        u64 stripe_nr;
-       u64 stripe_len;
+       u32 stripe_len;
        u64 raid56_full_stripe_start = (u64)-1;
        int data_stripes;
  
        offset = logical - em->start;
        /* Len of a stripe in a chunk */
        stripe_len = map->stripe_len;
-       /* Stripe where this block falls in */
-       stripe_nr = div64_u64(offset, stripe_len);
-       /* Offset of stripe in the chunk */
-       stripe_offset = stripe_nr * stripe_len;
-       if (offset < stripe_offset) {
-               btrfs_crit(fs_info,
- "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
-                       stripe_offset, offset, em->start, logical, stripe_len);
-               return -EINVAL;
-       }
+       /*
+        * Stripe_nr is where this block falls in
+        * stripe_offset is the offset of this block in its stripe.
+        */
+       stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
+       ASSERT(stripe_offset < U32_MAX);
  
-       /* stripe_offset is the offset of this block in its stripe */
-       stripe_offset = offset - stripe_offset;
        data_stripes = nr_data_stripes(map);
  
        /* Only stripe based profiles needs to check against stripe length. */
@@@ -6737,11 -6713,11 +6712,11 @@@ static void submit_stripe_bio(struct bt
                bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
                dev->devid, bio->bi_iter.bi_size);
-       bio_set_dev(bio, dev->bdev);
  
        btrfs_bio_counter_inc_noblocked(fs_info);
  
-       btrfsic_submit_bio(bio);
+       btrfsic_check_bio(bio);
+       submit_bio(bio);
  }
  
  static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
@@@ -6823,10 -6799,12 +6798,12 @@@ blk_status_t btrfs_map_bio(struct btrfs
                        continue;
                }
  
-               if (dev_nr < total_devs - 1)
-                       bio = btrfs_bio_clone(first_bio);
-               else
+               if (dev_nr < total_devs - 1) {
+                       bio = btrfs_bio_clone(dev->bdev, first_bio);
+               } else {
                        bio = first_bio;
+                       bio_set_dev(bio, dev->bdev);
+               }
  
                submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
        }
@@@ -6948,6 -6926,16 +6925,6 @@@ struct btrfs_device *btrfs_alloc_device
        if (!dev)
                return ERR_PTR(-ENOMEM);
  
 -      /*
 -       * Preallocate a bio that's always going to be used for flushing device
 -       * barriers and matches the device lifespan
 -       */
 -      dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
 -      if (!dev->flush_bio) {
 -              kfree(dev);
 -              return ERR_PTR(-ENOMEM);
 -      }
 -
        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->post_commit_list);
@@@ -7359,7 -7347,6 +7336,6 @@@ static int read_one_dev(struct extent_b
  
  int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_key key;
  
        ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
        /*
-        * This will create extent buffer of nodesize, superblock size is
-        * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
-        * overallocate but we can keep it as-is, only the first page is used.
+        * We allocated a dummy extent, just to use extent buffer accessors.
+        * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
+        * that's fine, we will not go beyond system chunk array anyway.
         */
-       sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
-                                         root->root_key.objectid, 0);
-       if (IS_ERR(sb))
-               return PTR_ERR(sb);
+       sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
+       if (!sb)
+               return -ENOMEM;
        set_extent_buffer_uptodate(sb);
-       /*
-        * The sb extent buffer is artificial and just used to read the system array.
-        * set_extent_buffer_uptodate() call does not properly mark all it's
-        * pages up-to-date when the page is larger: extent does not cover the
-        * whole page and consequently check_page_uptodate does not find all
-        * the page's extents up-to-date (the hole beyond sb),
-        * write_extent_buffer then triggers a WARN_ON.
-        *
-        * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
-        * but sb spans only this function. Add an explicit SetPageUptodate call
-        * to silence the warning eg. on PowerPC 64.
-        */
-       if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
-               SetPageUptodate(sb->pages[0]);
  
        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);
@@@ -7561,6 -7534,7 +7523,7 @@@ int btrfs_read_chunk_tree(struct btrfs_
        struct btrfs_key found_key;
        int ret;
        int slot;
+       int iter_ret = 0;
        u64 total_dev = 0;
        u64 last_ra_node = 0;
  
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = 0;
-       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-       if (ret < 0)
-               goto error;
-       while (1) {
-               struct extent_buffer *node;
+       btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
+               struct extent_buffer *node = path->nodes[1];
  
                leaf = path->nodes[0];
                slot = path->slots[0];
-               if (slot >= btrfs_header_nritems(leaf)) {
-                       ret = btrfs_next_leaf(root, path);
-                       if (ret == 0)
-                               continue;
-                       if (ret < 0)
-                               goto error;
-                       break;
-               }
-               node = path->nodes[1];
                if (node) {
                        if (last_ra_node != node->start) {
                                readahead_tree_node_children(node);
                                last_ra_node = node->start;
                        }
                }
-               btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
                        struct btrfs_dev_item *dev_item;
                        dev_item = btrfs_item_ptr(leaf, slot,
                        if (ret)
                                goto error;
                }
-               path->slots[0]++;
+       }
+       /* Catch error found during iteration */
+       if (iter_ret < 0) {
+               ret = iter_ret;
+               goto error;
        }
  
        /*
         * do another round of validation checks.
         */
        if (total_dev != fs_info->fs_devices->total_devices) {
-               btrfs_err(fs_info,
         "super_num_devices %llu mismatch with num_devices %llu found here",
+               btrfs_warn(fs_info,
"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
                          btrfs_super_num_devices(fs_info->super_copy),
                          total_dev);
-               ret = -EINVAL;
-               goto error;
+               fs_info->fs_devices->total_devices = total_dev;
+               btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
        }
        if (btrfs_super_total_bytes(fs_info->super_copy) <
            fs_info->fs_devices->total_rw_bytes) {
@@@ -8277,7 -8243,7 +8232,7 @@@ bool btrfs_pinned_by_swapfile(struct bt
  
  static int relocating_repair_kthread(void *data)
  {
-       struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
+       struct btrfs_block_group *cache = data;
        struct btrfs_fs_info *fs_info = cache->fs_info;
        u64 target;
        int ret = 0;
diff --combined fs/btrfs/volumes.h
index b11c563d2025e52d1e1ca2ddffde4df55af08f98,12b2af9260e92a5af40f5b7255107f7b795d41a2..6721002000ee0fc0cb69ec727c19189783cea40f
@@@ -17,17 -17,51 +17,51 @@@ extern struct mutex uuid_mutex
  
  #define BTRFS_STRIPE_LEN      SZ_64K
  
+ /* Used by sanity check for btrfs_raid_types. */
+ #define const_ffs(n) (__builtin_ctzll(n) + 1)
+ /*
+  * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
+  * RAID0 always to be the lowest profile bit.
+  * Although it's part of on-disk format and should never change, do extra
+  * compile-time sanity checks.
+  */
+ static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
+             const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
+ static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
+             ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
+ /* ilog2() can handle both constants and variables */
+ #define BTRFS_BG_FLAG_TO_INDEX(profile)                                       \
+       ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))
+ enum btrfs_raid_types {
+       /* SINGLE is the special one as it doesn't have on-disk bit. */
+       BTRFS_RAID_SINGLE  = 0,
+       BTRFS_RAID_RAID0   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
+       BTRFS_RAID_RAID1   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
+       BTRFS_RAID_DUP     = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
+       BTRFS_RAID_RAID10  = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
+       BTRFS_RAID_RAID5   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
+       BTRFS_RAID_RAID6   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
+       BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
+       BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),
+       BTRFS_NR_RAID_TYPES
+ };
  struct btrfs_io_geometry {
        /* remaining bytes before crossing a stripe */
        u64 len;
        /* offset of logical address in chunk */
        u64 offset;
        /* length of single IO stripe */
-       u64 stripe_len;
+       u32 stripe_len;
+       /* offset of address in stripe */
+       u32 stripe_offset;
        /* number of stripe where address falls */
        u64 stripe_nr;
-       /* offset of address in stripe */
-       u64 stripe_offset;
        /* offset of raid56 stripe into the chunk */
        u64 raid56_stripe_offset;
  };
@@@ -121,8 -155,8 +155,8 @@@ struct btrfs_device 
        /* bytes used on the current transaction */
        u64 commit_bytes_used;
  
 -      /* for sending down flush barriers */
 -      struct bio *flush_bio;
 +      /* Bio used for flushing device barriers */
 +      struct bio flush_bio;
        struct completion flush_wait;
  
        /* per-device scrub information */
@@@ -430,7 -464,7 +464,7 @@@ struct map_lookup 
        u64 type;
        int io_align;
        int io_width;
-       u64 stripe_len;
+       u32 stripe_len;
        int num_stripes;
        int sub_stripes;
        int verified_stripes; /* For mount time dev extent verification */
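
The new BTRFS_BG_FLAG_TO_INDEX() above converts a block group profile bit straight into an enum btrfs_raid_types value by shifting it down and taking its bit position, which is what lets btrfs_bg_flags_to_raid_index() in volumes.c drop its if/else chain. A rough userspace sketch of that mapping follows; the BTRFS_BLOCK_GROUP_* bit positions are reproduced here only for the example (RAID0 at bit 3 and the later profiles on the following bits, matching the on-disk format headers), and ilog2() is emulated with a compiler builtin:

#include <stdint.h>
#include <stdio.h>

/* Block group profile bits, quoted from the btrfs on-disk format for illustration. */
#define BG_RAID0	(1ULL << 3)
#define BG_RAID1	(1ULL << 4)
#define BG_DUP		(1ULL << 5)
#define BG_RAID10	(1ULL << 6)
#define BG_RAID5	(1ULL << 7)
#define BG_RAID6	(1ULL << 8)
#define BG_RAID1C3	(1ULL << 9)
#define BG_RAID1C4	(1ULL << 10)

/* ilog2() for a value with a single bit set: the position of that bit. */
static int ilog2_u64(uint64_t v)
{
	return 63 - __builtin_clzll(v);
}

/* Same shape as BTRFS_BG_FLAG_TO_INDEX(); SINGLE has no profile bit and maps to 0. */
static int bg_flag_to_index(uint64_t profile)
{
	if (!profile)
		return 0;	/* BTRFS_RAID_SINGLE */
	return ilog2_u64(profile >> (ilog2_u64(BG_RAID0) - 1));
}

int main(void)
{
	printf("RAID0   -> %d\n", bg_flag_to_index(BG_RAID0));		/* 1 */
	printf("RAID1   -> %d\n", bg_flag_to_index(BG_RAID1));		/* 2 */
	printf("RAID1C4 -> %d\n", bg_flag_to_index(BG_RAID1C4));	/* 8 */
	printf("SINGLE  -> %d\n", bg_flag_to_index(0));			/* 0 */
	return 0;
}
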
diff --combined fs/btrfs/zoned.c
index 29b54fd9c128dffdb1bff294d8e41e102df466c4,057babaa3e05c10812edf9b1a516cda239ffbdbb..11237a913beed1ee8b3fb0b877b587d1d13c4b0c
  #define BTRFS_MIN_ACTIVE_ZONES                (BTRFS_SUPER_MIRROR_MAX + 5)
  
  /*
-  * Maximum supported zone size. Currently, SMR disks have a zone size of
-  * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
-  * expect the zone size to become larger than 8GiB in the near future.
+  * Minimum / maximum supported zone size. Currently, SMR disks have a zone
+  * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
+  * We do not expect the zone size to become larger than 8GiB or smaller than
+  * 4MiB in the near future.
   */
  #define BTRFS_MAX_ZONE_SIZE           SZ_8G
+ #define BTRFS_MIN_ZONE_SIZE           SZ_4M
  
  #define SUPER_INFO_SECTORS    ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
  
@@@ -350,6 -352,7 +352,6 @@@ int btrfs_get_dev_zone_info(struct btrf
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_zoned_device_info *zone_info = NULL;
        struct block_device *bdev = device->bdev;
 -      struct request_queue *queue = bdev_get_queue(bdev);
        unsigned int max_active_zones;
        unsigned int nactive;
        sector_t nr_sectors;
                                 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
                ret = -EINVAL;
                goto out;
+       } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
+               btrfs_err_in_rcu(fs_info,
+               "zoned: %s: zone size %llu smaller than supported minimum %u",
+                                rcu_str_deref(device->name),
+                                zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
+               ret = -EINVAL;
+               goto out;
        }
  
        nr_sectors = bdev_nr_sectors(bdev);
        if (!IS_ALIGNED(nr_sectors, zone_sectors))
                zone_info->nr_zones++;
  
 -      max_active_zones = queue_max_active_zones(queue);
 +      max_active_zones = bdev_max_active_zones(bdev);
        if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
                btrfs_err_in_rcu(fs_info,
  "zoned: %s: max active zones %u is too small, need at least %u active zones",
@@@ -1835,7 -1845,7 +1844,7 @@@ bool btrfs_zone_activate(struct btrfs_b
        }
  
        /* No space left */
-       if (block_group->alloc_offset == block_group->zone_capacity) {
+       if (btrfs_zoned_bg_is_full(block_group)) {
                ret = false;
                goto out_unlock;
        }
@@@ -1872,20 -1882,14 +1881,14 @@@ out_unlock
        return ret;
  }
  
- int btrfs_zone_finish(struct btrfs_block_group *block_group)
+ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
  {
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct map_lookup *map;
-       struct btrfs_device *device;
-       u64 physical;
+       bool need_zone_finish;
        int ret = 0;
        int i;
  
-       if (!btrfs_is_zoned(fs_info))
-               return 0;
-       map = block_group->physical_map;
        spin_lock(&block_group->lock);
        if (!block_group->zone_is_active) {
                spin_unlock(&block_group->lock);
        /* Check if we have unwritten allocated space */
        if ((block_group->flags &
             (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
-           block_group->alloc_offset > block_group->meta_write_pointer) {
+           block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
                spin_unlock(&block_group->lock);
                return -EAGAIN;
        }
-       spin_unlock(&block_group->lock);
-       ret = btrfs_inc_block_group_ro(block_group, false);
-       if (ret)
-               return ret;
-       /* Ensure all writes in this block group finish */
-       btrfs_wait_block_group_reservations(block_group);
-       /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
-       btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
-                                block_group->length);
-       spin_lock(&block_group->lock);
  
        /*
-        * Bail out if someone already deactivated the block group, or
-        * allocated space is left in the block group.
+        * If we are sure that the block group is full (= no more room left for
+        * new allocation) and the IO for the last usable block is completed, we
+        * don't need to wait for the other IOs. This holds because we ensure
+        * the sequential IO submissions using the ZONE_APPEND command for data
+        * and block_group->meta_write_pointer for metadata.
         */
-       if (!block_group->zone_is_active) {
+       if (!fully_written) {
                spin_unlock(&block_group->lock);
-               btrfs_dec_block_group_ro(block_group);
-               return 0;
-       }
  
-       if (block_group->reserved) {
-               spin_unlock(&block_group->lock);
-               btrfs_dec_block_group_ro(block_group);
-               return -EAGAIN;
+               ret = btrfs_inc_block_group_ro(block_group, false);
+               if (ret)
+                       return ret;
+               /* Ensure all writes in this block group finish */
+               btrfs_wait_block_group_reservations(block_group);
+               /* No need to wait for NOCOW writers. Zoned mode does not allow that */
+               btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+                                        block_group->length);
+               spin_lock(&block_group->lock);
+               /*
+                * Bail out if someone already deactivated the block group, or
+                * allocated space is left in the block group.
+                */
+               if (!block_group->zone_is_active) {
+                       spin_unlock(&block_group->lock);
+                       btrfs_dec_block_group_ro(block_group);
+                       return 0;
+               }
+               if (block_group->reserved) {
+                       spin_unlock(&block_group->lock);
+                       btrfs_dec_block_group_ro(block_group);
+                       return -EAGAIN;
+               }
        }
  
+       /*
+        * The block group is not fully allocated, so not fully written yet. We
+        * need to send ZONE_FINISH command to free up an active zone.
+        */
+       need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
        block_group->zone_is_active = 0;
        block_group->alloc_offset = block_group->zone_capacity;
        block_group->free_space_ctl->free_space = 0;
        btrfs_clear_data_reloc_bg(block_group);
        spin_unlock(&block_group->lock);
  
+       map = block_group->physical_map;
        for (i = 0; i < map->num_stripes; i++) {
-               device = map->stripes[i].dev;
-               physical = map->stripes[i].physical;
+               struct btrfs_device *device = map->stripes[i].dev;
+               const u64 physical = map->stripes[i].physical;
  
                if (device->zone_info->max_active_zones == 0)
                        continue;
  
-               ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
-                                      physical >> SECTOR_SHIFT,
-                                      device->zone_info->zone_size >> SECTOR_SHIFT,
-                                      GFP_NOFS);
+               if (need_zone_finish) {
+                       ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+                                              physical >> SECTOR_SHIFT,
+                                              device->zone_info->zone_size >> SECTOR_SHIFT,
+                                              GFP_NOFS);
  
-               if (ret)
-                       return ret;
+                       if (ret)
+                               return ret;
+               }
  
                btrfs_dev_clear_active_zone(device, physical);
        }
-       btrfs_dec_block_group_ro(block_group);
+       if (!fully_written)
+               btrfs_dec_block_group_ro(block_group);
  
        spin_lock(&fs_info->zone_active_bgs_lock);
        ASSERT(!list_empty(&block_group->active_bg_list));
        return 0;
  }
  
+ int btrfs_zone_finish(struct btrfs_block_group *block_group)
+ {
+       if (!btrfs_is_zoned(block_group->fs_info))
+               return 0;
+       return do_zone_finish(block_group, false);
+ }
  bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
  {
        struct btrfs_fs_info *fs_info = fs_devices->fs_info;
  void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
  {
        struct btrfs_block_group *block_group;
-       struct map_lookup *map;
-       struct btrfs_device *device;
-       u64 physical;
+       u64 min_alloc_bytes;
  
        if (!btrfs_is_zoned(fs_info))
                return;
        block_group = btrfs_lookup_block_group(fs_info, logical);
        ASSERT(block_group);
  
-       if (logical + length < block_group->start + block_group->zone_capacity)
-               goto out;
-       spin_lock(&block_group->lock);
+       /* No MIXED_BG on zoned btrfs. */
+       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+               min_alloc_bytes = fs_info->sectorsize;
+       else
+               min_alloc_bytes = fs_info->nodesize;
  
-       if (!block_group->zone_is_active) {
-               spin_unlock(&block_group->lock);
+       /* Bail out if we can allocate more data from this block group. */
+       if (logical + length + min_alloc_bytes <=
+           block_group->start + block_group->zone_capacity)
                goto out;
-       }
  
-       block_group->zone_is_active = 0;
-       /* We should have consumed all the free space */
-       ASSERT(block_group->alloc_offset == block_group->zone_capacity);
-       ASSERT(block_group->free_space_ctl->free_space == 0);
-       btrfs_clear_treelog_bg(block_group);
-       btrfs_clear_data_reloc_bg(block_group);
-       spin_unlock(&block_group->lock);
+       do_zone_finish(block_group, true);
  
-       map = block_group->physical_map;
-       device = map->stripes[0].dev;
-       physical = map->stripes[0].physical;
+ out:
+       btrfs_put_block_group(block_group);
+ }
  
-       if (!device->zone_info->max_active_zones)
-               goto out;
+ static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
+ {
+       struct btrfs_block_group *bg =
+               container_of(work, struct btrfs_block_group, zone_finish_work);
  
-       btrfs_dev_clear_active_zone(device, physical);
+       wait_on_extent_buffer_writeback(bg->last_eb);
+       free_extent_buffer(bg->last_eb);
+       btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
+       btrfs_put_block_group(bg);
+ }
  
-       spin_lock(&fs_info->zone_active_bgs_lock);
-       ASSERT(!list_empty(&block_group->active_bg_list));
-       list_del_init(&block_group->active_bg_list);
-       spin_unlock(&fs_info->zone_active_bgs_lock);
+ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                  struct extent_buffer *eb)
+ {
+       if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+               return;
  
-       btrfs_put_block_group(block_group);
+       if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
+               btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
+                         bg->start);
+               return;
+       }
  
- out:
-       btrfs_put_block_group(block_group);
+       /* For the work */
+       btrfs_get_block_group(bg);
+       atomic_inc(&eb->refs);
+       bg->last_eb = eb;
+       INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
+       queue_work(system_unbound_wq, &bg->zone_finish_work);
  }
  
  void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
@@@ -2072,3 -2113,30 +2112,30 @@@ void btrfs_free_zone_cache(struct btrfs
        }
        mutex_unlock(&fs_devices->device_list_mutex);
  }
+ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
+ {
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
+       u64 used = 0;
+       u64 total = 0;
+       u64 factor;
+       ASSERT(btrfs_is_zoned(fs_info));
+       if (fs_info->bg_reclaim_threshold == 0)
+               return false;
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+               if (!device->bdev)
+                       continue;
+               total += device->disk_total_bytes;
+               used += device->bytes_used;
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
+       factor = div64_u64(used * 100, total);
+       return factor >= fs_info->bg_reclaim_threshold;
+ }
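
btrfs_zoned_should_reclaim() above sums disk_total_bytes and bytes_used across all devices and asks for block group reclaim once the used percentage reaches fs_info->bg_reclaim_threshold (a value of 0 disables it). A minimal userspace sketch of just that decision, with invented device sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_device {
	uint64_t disk_total_bytes;
	uint64_t bytes_used;
};

/* Mirror of the threshold check: reclaim when used% >= threshold, 0 disables it. */
static bool should_reclaim(const struct fake_device *devs, int nr_devs,
			   unsigned int bg_reclaim_threshold)
{
	uint64_t used = 0;
	uint64_t total = 0;

	if (bg_reclaim_threshold == 0)
		return false;

	for (int i = 0; i < nr_devs; i++) {
		total += devs[i].disk_total_bytes;
		used += devs[i].bytes_used;
	}

	return used * 100 / total >= bg_reclaim_threshold;
}

int main(void)
{
	struct fake_device devs[2] = {
		{ .disk_total_bytes = 100ULL << 30, .bytes_used = 80ULL << 30 },
		{ .disk_total_bytes = 100ULL << 30, .bytes_used = 75ULL << 30 },
	};

	/* 155 GiB of 200 GiB used is 77%, so a 75% threshold triggers reclaim. */
	printf("reclaim at 75%%: %d\n", should_reclaim(devs, 2, 75));	/* 1 */
	printf("reclaim at 90%%: %d\n", should_reclaim(devs, 2, 90));	/* 0 */
	return 0;
}
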
diff --combined fs/erofs/data.c
index bb9c1fd48c1936cc6ddabc26b33cd99bc11d7e8a,91c11d5bb9990d83d1b5f5022292feae8d7e7ce0..252f4ee977d56965ce463f917e9a36456bccc023
@@@ -6,7 -6,6 +6,7 @@@
   */
  #include "internal.h"
  #include <linux/prefetch.h>
 +#include <linux/sched/mm.h>
  #include <linux/dax.h>
  #include <trace/events/erofs.h>
  
@@@ -36,20 -35,14 +36,20 @@@ void *erofs_bread(struct erofs_buf *buf
        erofs_off_t offset = blknr_to_addr(blkaddr);
        pgoff_t index = offset >> PAGE_SHIFT;
        struct page *page = buf->page;
 +      struct folio *folio;
 +      unsigned int nofs_flag;
  
        if (!page || page->index != index) {
                erofs_put_metabuf(buf);
 -              page = read_cache_page_gfp(mapping, index,
 -                              mapping_gfp_constraint(mapping, ~__GFP_FS));
 -              if (IS_ERR(page))
 -                      return page;
 +
 +              nofs_flag = memalloc_nofs_save();
 +              folio = read_cache_folio(mapping, index, NULL, NULL);
 +              memalloc_nofs_restore(nofs_flag);
 +              if (IS_ERR(folio))
 +                      return folio;
 +
                /* should already be PageUptodate, no need to lock page */
 +              page = folio_file_page(folio, index);
                buf->page = page;
        }
        if (buf->kmap_type == EROFS_NO_KMAP) {
  void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
                         erofs_blk_t blkaddr, enum erofs_kmap_type type)
  {
 +      if (erofs_is_fscache_mode(sb))
 +              return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode,
 +                                 blkaddr, type);
 +
        return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
  }
  
@@@ -121,8 -110,8 +121,8 @@@ static int erofs_map_blocks_flatmode(st
        return 0;
  }
  
 -static int erofs_map_blocks(struct inode *inode,
 -                          struct erofs_map_blocks *map, int flags)
 +int erofs_map_blocks(struct inode *inode,
 +                   struct erofs_map_blocks *map, int flags)
  {
        struct super_block *sb = inode->i_sb;
        struct erofs_inode *vi = EROFS_I(inode);
@@@ -210,7 -199,6 +210,7 @@@ int erofs_map_dev(struct super_block *s
        map->m_bdev = sb->s_bdev;
        map->m_daxdev = EROFS_SB(sb)->dax_dev;
        map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
 +      map->m_fscache = EROFS_SB(sb)->s_fscache;
  
        if (map->m_deviceid) {
                down_read(&devs->rwsem);
                map->m_bdev = dif->bdev;
                map->m_daxdev = dif->dax_dev;
                map->m_dax_part_off = dif->dax_part_off;
 +              map->m_fscache = dif->fscache;
                up_read(&devs->rwsem);
        } else if (devs->extra_devices) {
                down_read(&devs->rwsem);
                                map->m_bdev = dif->bdev;
                                map->m_daxdev = dif->dax_dev;
                                map->m_dax_part_off = dif->dax_part_off;
 +                              map->m_fscache = dif->fscache;
                                break;
                        }
                }
@@@ -399,7 -385,7 +399,7 @@@ static ssize_t erofs_file_read_iter(str
  
                if (!err)
                        return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
-                                           NULL, 0, 0);
+                                           NULL, 0, NULL, 0);
                if (err < 0)
                        return err;
        }
diff --combined fs/f2fs/file.c
index 35b6c720c2bc155211ea024ee1c05a4175350e1d,04bc8709314bf22ecbdfc54f0529ac9a60873262..100637b1adb3646c8f69e1b8ba1214898e225c2d
@@@ -2285,6 -2285,7 +2285,6 @@@ static int f2fs_ioc_fitrim(struct file 
  {
        struct inode *inode = file_inode(filp);
        struct super_block *sb = inode->i_sb;
 -      struct request_queue *q = bdev_get_queue(sb->s_bdev);
        struct fstrim_range range;
        int ret;
  
                return ret;
  
        range.minlen = max((unsigned int)range.minlen,
 -                              q->limits.discard_granularity);
 +                         bdev_discard_granularity(sb->s_bdev));
        ret = f2fs_trim_fs(F2FS_SB(sb), &range);
        mnt_drop_write_file(filp);
        if (ret < 0)
  static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode,
                pgoff_t off, block_t block, block_t len, u32 flags)
  {
 -      struct request_queue *q = bdev_get_queue(bdev);
        sector_t sector = SECTOR_FROM_BLOCK(block);
        sector_t nr_sects = SECTOR_FROM_BLOCK(len);
        int ret = 0;
  
 -      if (!q)
 -              return -ENXIO;
 -
 -      if (flags & F2FS_TRIM_FILE_DISCARD)
 -              ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
 -                                              blk_queue_secure_erase(q) ?
 -                                              BLKDEV_DISCARD_SECURE : 0);
 +      if (flags & F2FS_TRIM_FILE_DISCARD) {
 +              if (bdev_max_secure_erase_sectors(bdev))
 +                      ret = blkdev_issue_secure_erase(bdev, sector, nr_sects,
 +                                      GFP_NOFS);
 +              else
 +                      ret = blkdev_issue_discard(bdev, sector, nr_sects,
 +                                      GFP_NOFS);
 +      }
  
        if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) {
                if (IS_ENCRYPTED(inode))
@@@ -4308,7 -4309,7 +4308,7 @@@ static ssize_t f2fs_dio_read_iter(struc
         */
        inc_page_count(sbi, F2FS_DIO_READ);
        dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops,
-                            &f2fs_iomap_dio_read_ops, 0, 0);
+                            &f2fs_iomap_dio_read_ops, 0, NULL, 0);
        if (IS_ERR_OR_NULL(dio)) {
                ret = PTR_ERR_OR_ZERO(dio);
                if (ret != -EIOCBQUEUED)
@@@ -4526,7 -4527,7 +4526,7 @@@ static ssize_t f2fs_dio_write_iter(stru
        if (pos + count > inode->i_size)
                dio_flags |= IOMAP_DIO_FORCE_WAIT;
        dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
-                            &f2fs_iomap_dio_write_ops, dio_flags, 0);
+                            &f2fs_iomap_dio_write_ops, dio_flags, NULL, 0);
        if (IS_ERR_OR_NULL(dio)) {
                ret = PTR_ERR_OR_ZERO(dio);
                if (ret == -ENOTBLK)
diff --combined fs/iomap/direct-io.c
index 80f9b047aa1b6298523daf76638ed67ae823bc24,cf224a8bb31150b9b5a1e37d51f4bcbc4f3859a0..370c3241618a091bb5800a75dc638ef024d54a17
@@@ -51,13 -51,21 +51,22 @@@ struct iomap_dio 
        };
  };
  
+ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
+               struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf)
+ {
+       if (dio->dops && dio->dops->bio_set)
+               return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
+                                       GFP_KERNEL, dio->dops->bio_set);
+       return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
+ }
  static void iomap_dio_submit_bio(const struct iomap_iter *iter,
                struct iomap_dio *dio, struct bio *bio, loff_t pos)
  {
        atomic_inc(&dio->ref);
  
 -      if (dio->iocb->ki_flags & IOCB_HIPRI) {
 +      /* Sync dio can't be polled reliably */
 +      if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
                bio_set_polled(bio, dio->iocb);
                dio->submit.poll_bio = bio;
        }
@@@ -145,7 -153,7 +154,7 @@@ static inline void iomap_dio_set_error(
        cmpxchg(&dio->error, 0, ret);
  }
  
- static void iomap_dio_bio_end_io(struct bio *bio)
+ void iomap_dio_bio_end_io(struct bio *bio)
  {
        struct iomap_dio *dio = bio->bi_private;
        bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
                bio_put(bio);
        }
  }
+ EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
  
  static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
                loff_t pos, unsigned len)
  {
        struct inode *inode = file_inode(dio->iocb->ki_filp);
        struct page *page = ZERO_PAGE(0);
-       int flags = REQ_SYNC | REQ_IDLE;
        struct bio *bio;
  
-       bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+       bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
        fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
                                  GFP_KERNEL);
        bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
@@@ -266,7 -274,8 +275,7 @@@ static loff_t iomap_dio_bio_iter(const 
                 * cache flushes on IO completion.
                 */
                if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
 -                  (dio->flags & IOMAP_DIO_WRITE_FUA) &&
 -                  blk_queue_fua(bdev_get_queue(iomap->bdev)))
 +                  (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
                        use_fua = true;
        }
  
                        goto out;
                }
  
-               bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+               bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
                fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
                                          GFP_KERNEL);
                bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
@@@ -474,7 -483,7 +483,7 @@@ static loff_t iomap_dio_iter(const stru
  struct iomap_dio *
  __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-               unsigned int dio_flags, size_t done_before)
+               unsigned int dio_flags, void *private, size_t done_before)
  {
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = file_inode(iocb->ki_filp);
                .pos            = iocb->ki_pos,
                .len            = iov_iter_count(iter),
                .flags          = IOMAP_DIRECT,
+               .private        = private,
        };
        loff_t end = iomi.pos + iomi.len - 1, ret = 0;
        bool wait_for_completion =
                        if (!READ_ONCE(dio->submit.waiter))
                                break;
  
 -                      if (!dio->submit.poll_bio ||
 -                          !bio_poll(dio->submit.poll_bio, NULL, 0))
 -                              blk_io_schedule();
 +                      blk_io_schedule();
                }
                __set_current_state(TASK_RUNNING);
        }
@@@ -672,11 -684,12 +682,12 @@@ EXPORT_SYMBOL_GPL(__iomap_dio_rw)
  ssize_t
  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
-               unsigned int dio_flags, size_t done_before)
+               unsigned int dio_flags, void *private, size_t done_before)
  {
        struct iomap_dio *dio;
  
-       dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
+       dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
+                            done_before);
        if (IS_ERR_OR_NULL(dio))
                return PTR_ERR_OR_ZERO(dio);
        return iomap_dio_complete(dio);
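
The iomap changes above let a filesystem pass its own bio_set through struct iomap_dio_ops; iomap_dio_alloc_bio() then allocates from that set and falls back to the generic bio_alloc() only when none was supplied. The toy userspace program below illustrates the same "prefer the caller's preallocated pool, otherwise use the default allocator" pattern; every name in it is invented for the example and none of it is kernel API:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for a bio and a bio_set-like preallocated pool. */
struct toy_bio { int from_pool; };

struct toy_pool {
	struct toy_bio slots[4];
	int next;
};

struct dio_ops {
	struct toy_pool *pool;	/* optional: when non-NULL, allocate from here */
};

static struct toy_bio *alloc_bio(const struct dio_ops *ops)
{
	if (ops && ops->pool && ops->pool->next < 4) {
		struct toy_bio *bio = &ops->pool->slots[ops->pool->next++];

		bio->from_pool = 1;
		return bio;
	}

	/* Fallback: plain heap allocation, like the generic allocation path. */
	return calloc(1, sizeof(struct toy_bio));
}

int main(void)
{
	struct toy_pool pool = { .next = 0 };
	struct dio_ops with_pool = { .pool = &pool };
	struct dio_ops without_pool = { .pool = NULL };

	struct toy_bio *a = alloc_bio(&with_pool);
	struct toy_bio *b = alloc_bio(&without_pool);

	printf("a from pool: %d\n", a->from_pool);	/* 1 */
	printf("b from pool: %d\n", b->from_pool);	/* 0 */
	free(b);
	return 0;
}
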
diff --combined fs/zonefs/super.c
index 652752df1a2f478ffbb733d042b67f82d96bb6aa,777fe626c2b38efe7d698bab01adc4c18e3b0fa7..8f306485c9538516ef20189812aba58b37b94b18
  #define CREATE_TRACE_POINTS
  #include "trace.h"
  
 +/*
 + * Manage the active zone count. Called with zi->i_truncate_mutex held.
 + */
 +static void zonefs_account_active(struct inode *inode)
 +{
 +      struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 +      struct zonefs_inode_info *zi = ZONEFS_I(inode);
 +
 +      lockdep_assert_held(&zi->i_truncate_mutex);
 +
 +      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
 +              return;
 +
 +      /*
 +       * If the zone is active, that is, if it is explicitly open or
 +       * partially written, check if it was already accounted as active.
 +       */
 +      if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
 +          (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
 +              if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
 +                      zi->i_flags |= ZONEFS_ZONE_ACTIVE;
 +                      atomic_inc(&sbi->s_active_seq_files);
 +              }
 +              return;
 +      }
 +
 +      /* The zone is not active. If it was, update the active count */
 +      if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
 +              zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
 +              atomic_dec(&sbi->s_active_seq_files);
 +      }
 +}
 +
  static inline int zonefs_zone_mgmt(struct inode *inode,
                                   enum req_opf op)
  {
@@@ -101,13 -68,8 +101,13 @@@ static inline void zonefs_i_size_write(
         * A full zone is no longer open/active and does not need
         * explicit closing.
         */
 -      if (isize >= zi->i_max_size)
 -              zi->i_flags &= ~ZONEFS_ZONE_OPEN;
 +      if (isize >= zi->i_max_size) {
 +              struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 +
 +              if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
 +                      atomic_dec(&sbi->s_active_seq_files);
 +              zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
 +      }
  }
  
  static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@@ -435,7 -397,6 +435,7 @@@ static int zonefs_io_error_cb(struct bl
        zonefs_update_stats(inode, data_size);
        zonefs_i_size_write(inode, data_size);
        zi->i_wpoffset = data_size;
 +      zonefs_account_active(inode);
  
        return 0;
  }
@@@ -547,7 -508,6 +547,7 @@@ static int zonefs_file_truncate(struct 
        zonefs_update_stats(inode, isize);
        truncate_setsize(inode, isize);
        zi->i_wpoffset = isize;
 +      zonefs_account_active(inode);
  
  unlock:
        mutex_unlock(&zi->i_truncate_mutex);
@@@ -729,12 -689,13 +729,12 @@@ static ssize_t zonefs_file_dio_append(s
        struct inode *inode = file_inode(iocb->ki_filp);
        struct zonefs_inode_info *zi = ZONEFS_I(inode);
        struct block_device *bdev = inode->i_sb->s_bdev;
 -      unsigned int max;
 +      unsigned int max = bdev_max_zone_append_sectors(bdev);
        struct bio *bio;
        ssize_t size;
        int nr_pages;
        ssize_t ret;
  
 -      max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
        max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
        iov_iter_truncate(from, max);
  
@@@ -900,20 -861,13 +900,20 @@@ static ssize_t zonefs_file_dio_write(st
                ret = zonefs_file_dio_append(iocb, from);
        else
                ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
-                                  &zonefs_write_dio_ops, 0, 0);
+                                  &zonefs_write_dio_ops, 0, NULL, 0);
        if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
            (ret > 0 || ret == -EIOCBQUEUED)) {
                if (ret > 0)
                        count = ret;
 +
 +              /*
 +               * Update the zone write pointer offset assuming the write
 +               * operation succeeded. If it did not, the error recovery path
 +               * will correct it. Also do active seq file accounting.
 +               */
                mutex_lock(&zi->i_truncate_mutex);
                zi->i_wpoffset += count;
 +              zonefs_account_active(inode);
                mutex_unlock(&zi->i_truncate_mutex);
        }
  
@@@ -1042,7 -996,7 +1042,7 @@@ static ssize_t zonefs_file_read_iter(st
                }
                file_accessed(iocb->ki_filp);
                ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
-                                  &zonefs_read_dio_ops, 0, 0);
+                                  &zonefs_read_dio_ops, 0, NULL, 0);
        } else {
                ret = generic_file_read_iter(iocb, to);
                if (ret == -EIO)
@@@ -1055,13 -1009,13 +1055,13 @@@ inode_unlock
        return ret;
  }
  
 -static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file)
 +/*
 + * Write open accounting is done only for sequential files.
 + */
 +static inline bool zonefs_seq_file_need_wro(struct inode *inode,
 +                                          struct file *file)
  {
        struct zonefs_inode_info *zi = ZONEFS_I(inode);
 -      struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 -
 -      if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN))
 -              return false;
  
        if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
                return false;
        return true;
  }
  
 -static int zonefs_open_zone(struct inode *inode)
 +static int zonefs_seq_file_write_open(struct inode *inode)
  {
        struct zonefs_inode_info *zi = ZONEFS_I(inode);
 -      struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
        int ret = 0;
  
        mutex_lock(&zi->i_truncate_mutex);
  
        if (!zi->i_wr_refcnt) {
 -              if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) {
 -                      atomic_dec(&sbi->s_open_zones);
 -                      ret = -EBUSY;
 -                      goto unlock;
 -              }
 +              struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 +              unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
  
 -              if (i_size_read(inode) < zi->i_max_size) {
 -                      ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 -                      if (ret) {
 -                              atomic_dec(&sbi->s_open_zones);
 +              if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
 +
 +                      if (wro > sbi->s_max_wro_seq_files) {
 +                              atomic_dec(&sbi->s_wro_seq_files);
 +                              ret = -EBUSY;
                                goto unlock;
                        }
 -                      zi->i_flags |= ZONEFS_ZONE_OPEN;
 +
 +                      if (i_size_read(inode) < zi->i_max_size) {
 +                              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 +                              if (ret) {
 +                                      atomic_dec(&sbi->s_wro_seq_files);
 +                                      goto unlock;
 +                              }
 +                              zi->i_flags |= ZONEFS_ZONE_OPEN;
 +                              zonefs_account_active(inode);
 +                      }
                }
        }
  
@@@ -1119,31 -1067,30 +1119,31 @@@ static int zonefs_file_open(struct inod
        if (ret)
                return ret;
  
 -      if (zonefs_file_use_exp_open(inode, file))
 -              return zonefs_open_zone(inode);
 +      if (zonefs_seq_file_need_wro(inode, file))
 +              return zonefs_seq_file_write_open(inode);
  
        return 0;
  }
  
 -static void zonefs_close_zone(struct inode *inode)
 +static void zonefs_seq_file_write_close(struct inode *inode)
  {
        struct zonefs_inode_info *zi = ZONEFS_I(inode);
 +      struct super_block *sb = inode->i_sb;
 +      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
        int ret = 0;
  
        mutex_lock(&zi->i_truncate_mutex);
 -      zi->i_wr_refcnt--;
 -      if (!zi->i_wr_refcnt) {
 -              struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 -              struct super_block *sb = inode->i_sb;
  
 -              /*
 -               * If the file zone is full, it is not open anymore and we only
 -               * need to decrement the open count.
 -               */
 -              if (!(zi->i_flags & ZONEFS_ZONE_OPEN))
 -                      goto dec;
 +      zi->i_wr_refcnt--;
 +      if (zi->i_wr_refcnt)
 +              goto unlock;
  
 +      /*
 +       * The file zone may not be open anymore (e.g. the file was truncated to
 +       * its maximum size or it was fully written). For this case, we only
 +       * need to decrement the write open count.
 +       */
 +      if (zi->i_flags & ZONEFS_ZONE_OPEN) {
                ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
                if (ret) {
                        __zonefs_io_error(inode, false);
                         */
                        if (zi->i_flags & ZONEFS_ZONE_OPEN &&
                            !(sb->s_flags & SB_RDONLY)) {
 -                              zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n");
 +                              zonefs_warn(sb,
 +                                      "closing zone at %llu failed %d\n",
 +                                      zi->i_zsector, ret);
 +                              zonefs_warn(sb,
 +                                      "remounting filesystem read-only\n");
                                sb->s_flags |= SB_RDONLY;
                        }
 +                      goto unlock;
                }
 +
                zi->i_flags &= ~ZONEFS_ZONE_OPEN;
 -dec:
 -              atomic_dec(&sbi->s_open_zones);
 +              zonefs_account_active(inode);
        }
 +
 +      atomic_dec(&sbi->s_wro_seq_files);
 +
 +unlock:
        mutex_unlock(&zi->i_truncate_mutex);
  }
  
@@@ -1183,8 -1121,8 +1183,8 @@@ static int zonefs_file_release(struct i
         * the zone has gone offline or read-only). Make sure we don't fail the
         * close(2) for user-space.
         */
 -      if (zonefs_file_use_exp_open(inode, file))
 -              zonefs_close_zone(inode);
 +      if (zonefs_seq_file_need_wro(inode, file))
 +              zonefs_seq_file_write_close(inode);
  
        return 0;
  }
@@@ -1399,8 -1337,6 +1399,8 @@@ static int zonefs_init_file_inode(struc
        sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
        sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
  
 +      mutex_lock(&zi->i_truncate_mutex);
 +
        /*
         * For sequential zones, make sure that any open zone is closed first
         * to ensure that the initial number of open zones is 0, in sync with
        if (type == ZONEFS_ZTYPE_SEQ &&
            (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
             zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
 -              mutex_lock(&zi->i_truncate_mutex);
                ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
 -              mutex_unlock(&zi->i_truncate_mutex);
 +              if (ret)
 +                      goto unlock;
        }
  
 +      zonefs_account_active(inode);
 +
 +unlock:
 +      mutex_unlock(&zi->i_truncate_mutex);
 +
        return ret;
  }
  
@@@ -1757,18 -1688,14 +1757,18 @@@ static int zonefs_fill_super(struct sup
        sbi->s_gid = GLOBAL_ROOT_GID;
        sbi->s_perm = 0640;
        sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
 -      sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
 -      atomic_set(&sbi->s_open_zones, 0);
 -      if (!sbi->s_max_open_zones &&
 +
 +      atomic_set(&sbi->s_wro_seq_files, 0);
 +      sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
 +      if (!sbi->s_max_wro_seq_files &&
            sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
                zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
                sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
        }
  
 +      atomic_set(&sbi->s_active_seq_files, 0);
 +      sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
 +
        ret = zonefs_read_super(sb);
        if (ret)
                return ret;
        if (ret)
                goto cleanup;
  
 +      ret = zonefs_sysfs_register(sb);
 +      if (ret)
 +              goto cleanup;
 +
        zonefs_info(sb, "Mounting %u zones",
                    blkdev_nr_zones(sb->s_bdev->bd_disk));
  
@@@ -1832,8 -1755,6 +1832,8 @@@ static void zonefs_kill_super(struct su
  
        if (sb->s_root)
                d_genocide(sb->s_root);
 +
 +      zonefs_sysfs_unregister(sb);
        kill_block_super(sb);
        kfree(sbi);
  }
@@@ -1881,26 -1802,16 +1881,26 @@@ static int __init zonefs_init(void
                return ret;
  
        ret = register_filesystem(&zonefs_type);
 -      if (ret) {
 -              zonefs_destroy_inodecache();
 -              return ret;
 -      }
 +      if (ret)
 +              goto destroy_inodecache;
 +
 +      ret = zonefs_sysfs_init();
 +      if (ret)
 +              goto unregister_fs;
  
        return 0;
 +
 +unregister_fs:
 +      unregister_filesystem(&zonefs_type);
 +destroy_inodecache:
 +      zonefs_destroy_inodecache();
 +
 +      return ret;
  }
  
  static void __exit zonefs_exit(void)
  {
 +      zonefs_sysfs_exit();
        zonefs_destroy_inodecache();
        unregister_filesystem(&zonefs_type);
  }
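
zonefs_account_active() above counts a sequential zone file as active when its zone is explicitly open or partially written (write pointer strictly between zero and the maximum size) and keeps s_active_seq_files in step with that state. A small userspace sketch of the predicate and counter, using invented field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ZONE_OPEN	(1u << 0)
#define ZONE_ACTIVE	(1u << 1)

struct toy_zone {
	unsigned int flags;
	uint64_t wpoffset;	/* bytes written so far */
	uint64_t max_size;	/* zone capacity in bytes */
};

/* Keep the per-zone "active" flag and a shared active counter consistent. */
static void account_active(struct toy_zone *z, int *nr_active)
{
	bool active = (z->flags & ZONE_OPEN) ||
		      (z->wpoffset > 0 && z->wpoffset < z->max_size);

	if (active && !(z->flags & ZONE_ACTIVE)) {
		z->flags |= ZONE_ACTIVE;
		(*nr_active)++;
	} else if (!active && (z->flags & ZONE_ACTIVE)) {
		z->flags &= ~ZONE_ACTIVE;
		(*nr_active)--;
	}
}

int main(void)
{
	int nr_active = 0;
	struct toy_zone z = { .flags = 0, .wpoffset = 0, .max_size = 256 << 20 };

	account_active(&z, &nr_active);	/* empty and closed: stays inactive */
	z.wpoffset = 4096;
	account_active(&z, &nr_active);	/* partially written: becomes active */
	z.wpoffset = z.max_size;
	account_active(&z, &nr_active);	/* full: no longer active */

	printf("nr_active=%d\n", nr_active);	/* 0 */
	return 0;
}
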
diff --combined include/linux/fs.h
index 87b5af1d9fbe037dbcbe404547fac061544c3abb,01d61984ce7ae975e120247789252dd4137595b0..02e7f60638b847a60abc9a6fa687dda8a246e29f
@@@ -1708,6 -1708,11 +1708,11 @@@ static inline bool __sb_start_write_try
  #define __sb_writers_release(sb, lev) \
        percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
  
+ static inline bool sb_write_started(const struct super_block *sb)
+ {
+       return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1);
+ }
  /**
   * sb_end_write - drop write access to a superblock
   * @sb: the super we wrote to
@@@ -1953,7 -1958,6 +1958,7 @@@ struct dir_context 
  #define REMAP_FILE_ADVISORY           (REMAP_FILE_CAN_SHORTEN)
  
  struct iov_iter;
 +struct io_uring_cmd;
  
  struct file_operations {
        struct module *owner;
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
 +      int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
  } __randomize_layout;
  
  struct inode_operations {