Merge tag 'bcachefs-2024-09-21' of git://evilpiepirate.org/bcachefs

author Linus Torvalds <[email protected]>
Mon, 23 Sep 2024 17:05:41 +0000 (10:05 -0700)
committer Linus Torvalds <[email protected]>
Mon, 23 Sep 2024 17:05:41 +0000 (10:05 -0700)
diff --combined fs/bcachefs/fs-io-buffered.c

index ff60c041abe56857b1a5b0a446436d39b4f8b599,99fef9342199660ddbc54272df7d2117d2a2d93c..48a1ab9a649bc11fea05f1aaed6b32cdfe772b36
--- 1/fs/bcachefs/fs-io-buffered.c
--- 2/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@@ -151,7 -151,6 +151,6 @@@ static void bchfs_read(struct btree_tra
         struct bkey_buf sk;
         int flags = BCH_READ_RETRY_IF_STALE|
                 BCH_READ_MAY_PROMOTE;
-       u32 snapshot;
         int ret = 0;
   
         rbio->c = c;
@@@ -159,29 -158,23 +158,23 @@@
         rbio->subvol = inum.subvol;
   
         bch2_bkey_buf_init(&sk);
- retry:
         bch2_trans_begin(trans);
-       iter = (struct btree_iter) { NULL };
- 
-       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               goto err;
- 
         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+                            POS(inum.inum, rbio->bio.bi_iter.bi_sector),
                              BTREE_ITER_slots);
         while (1) {
                 struct bkey_s_c k;
                 unsigned bytes, sectors, offset_into_extent;
                 enum btree_id data_btree = BTREE_ID_extents;
   
-               /*
-                * read_extent -> io_time_reset may cause a transaction restart
-                * without returning an error, we need to check for that here:
-                */
-               ret = bch2_trans_relock(trans);
+               bch2_trans_begin(trans);
+ 
+               u32 snapshot;
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
                 if (ret)
-                       break;
+                       goto err;
+ 
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
   
                 bch2_btree_iter_set_pos(&iter,
                                 POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@@@ -189,7 -182,7 +182,7 @@@
                 k = bch2_btree_iter_peek_slot(&iter);
                 ret = bkey_err(k);
                 if (ret)
-                       break;
+                       goto err;
   
                 offset_into_extent = iter.pos.offset -
                         bkey_start_offset(k.k);
@@@ -200,7 -193,7 +193,7 @@@
                 ret = bch2_read_indirect_extent(trans, &data_btree,
                                         &offset_into_extent, &sk);
                 if (ret)
-                       break;
+                       goto err;
   
                 k = bkey_i_to_s_c(sk.k);
   
@@@ -210,7 -203,7 +203,7 @@@
                         ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
                                                   extent_partial_reads_expensive(k));
                         if (ret)
-                               break;
+                               goto err;
                 }
   
                 bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
@@@ -229,17 -222,13 +222,13 @@@
   
                 swap(rbio->bio.bi_iter.bi_size, bytes);
                 bio_advance(&rbio->bio, bytes);
- 
-               ret = btree_trans_too_many_iters(trans);
-               if (ret)
+ err:
+               if (ret &&
+                   !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                         break;
         }
- err:
         bch2_trans_iter_exit(trans, &iter);
   
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
- 
         if (ret) {
                 bch_err_inum_offset_ratelimited(c,
                                 iter.pos.inode,
@@@ -486,7 -475,7 +475,7 @@@ static void bch2_writepage_io_alloc(str
         op->nr_replicas         = nr_replicas;
         op->res.nr_replicas     = nr_replicas;
         op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
-       op->subvol              = inode->ei_subvol;
+       op->subvol              = inode->ei_inum.subvol;
         op->pos                 = POS(inode->v.i_ino, sector);
         op->end_io              = bch2_writepage_io_done;
         op->devs_need_flush     = &inode->ei_devs_need_flush;
@@@ -659,7 -648,7 +648,7 @@@ int bch2_writepages(struct address_spac
   
   int bch2_write_begin(struct file *file, struct address_space *mapping,
                      loff_t pos, unsigned len,
- -                   struct page **pagep, void **fsdata)
+ +                   struct folio **foliop, void **fsdata)
   {
         struct bch_inode_info *inode = to_bch_ei(mapping->host);
         struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@@ -728,11 -717,12 +717,11 @@@ out
                 goto err;
         }
   
- -      *pagep = &folio->page;
+ +      *foliop = folio;
         return 0;
   err:
         folio_unlock(folio);
         folio_put(folio);
- -      *pagep = NULL;
   err_unlock:
         bch2_pagecache_add_put(inode);
         kfree(res);
@@@ -742,11 -732,12 +731,11 @@@
   
   int bch2_write_end(struct file *file, struct address_space *mapping,
                    loff_t pos, unsigned len, unsigned copied,
- -                 struct page *page, void *fsdata)
+ +                 struct folio *folio, void *fsdata)
   {
         struct bch_inode_info *inode = to_bch_ei(mapping->host);
         struct bch_fs *c = inode->v.i_sb->s_fs_info;
         struct bch2_folio_reservation *res = fsdata;
- -      struct folio *folio = page_folio(page);
         unsigned offset = pos - folio_pos(folio);
   
         lockdep_assert_held(&inode->v.i_rwsem);
diff --combined fs/bcachefs/fs.c

index 011817afc3adcf54e8288043d9bbca9b4fffedd7,1aee5bafaae54cb79a518ca205ada336feb05e8f..4a1bb07a2574481e9bbffcd4d32c7dbf7ffb3929
--- 1/fs/bcachefs/fs.c
--- 2/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@@ -108,7 -108,7 +108,7 @@@ retry
                 goto retry;
   
         bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
-                            "%s: inode %u:%llu not found when updating",
+                            "%s: inode %llu:%llu not found when updating",
                              bch2_err_str(ret),
                              inode_inum(inode).subvol,
                              inode_inum(inode).inum);
@@@ -152,50 -152,106 +152,106 @@@ int bch2_fs_quota_transfer(struct bch_f
         return ret;
   }
   
- static int bch2_iget5_test(struct inode *vinode, void *p)
+ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
   {
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       subvol_inum *inum = p;
- 
-       return inode->ei_subvol == inum->subvol &&
-               inode->ei_inode.bi_inum == inum->inum;
+       return a.subvol == b.subvol && a.inum == b.inum;
   }
   
- static int bch2_iget5_set(struct inode *vinode, void *p)
+ static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
+                                const void *obj)
   {
-       struct bch_inode_info *inode = to_bch_ei(vinode);
-       subvol_inum *inum = p;
+       const struct bch_inode_info *inode = obj;
+       const subvol_inum *v = arg->key;
   
-       inode->v.i_ino          = inum->inum;
-       inode->ei_subvol        = inum->subvol;
-       inode->ei_inode.bi_inum = inum->inum;
-       return 0;
+       return !subvol_inum_eq(inode->ei_inum, *v);
   }
   
- static unsigned bch2_inode_hash(subvol_inum inum)
+ static const struct rhashtable_params bch2_vfs_inodes_params = {
+       .head_offset            = offsetof(struct bch_inode_info, hash),
+       .key_offset             = offsetof(struct bch_inode_info, ei_inum),
+       .key_len                = sizeof(subvol_inum),
+       .obj_cmpfn              = bch2_vfs_inode_cmp_fn,
+       .automatic_shrinking    = true,
+ };
+ 
+ static void __wait_on_freeing_inode(struct inode *inode)
   {
-       return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+       wait_queue_head_t *wq;
+       DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+       wq = bit_waitqueue(&inode->i_state, __I_NEW);
+       prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+       spin_unlock(&inode->i_lock);
+       schedule();
+       finish_wait(wq, &wait.wq_entry);
   }
   
   struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
   {
-       return to_bch_ei(ilookup5_nowait(c->vfs_sb,
-                                        bch2_inode_hash(inum),
-                                        bch2_iget5_test,
-                                        &inum));
+       return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
+ }
+ 
+ static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
+                                                  subvol_inum inum)
+ {
+       struct bch_inode_info *inode;
+ repeat:
+       inode = __bch2_inode_hash_find(c, inum);
+       if (inode) {
+               spin_lock(&inode->v.i_lock);
+               if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
+                       spin_unlock(&inode->v.i_lock);
+                       return NULL;
+               }
+               if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
+                       if (!trans) {
+                               __wait_on_freeing_inode(&inode->v);
+                       } else {
+                               bch2_trans_unlock(trans);
+                               __wait_on_freeing_inode(&inode->v);
+                               int ret = bch2_trans_relock(trans);
+                               if (ret)
+                                       return ERR_PTR(ret);
+                       }
+                       goto repeat;
+               }
+               __iget(&inode->v);
+               spin_unlock(&inode->v.i_lock);
+       }
+ 
+       return inode;
+ }
+ 
+ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
+ {
+       spin_lock(&inode->v.i_lock);
+       bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
+       spin_unlock(&inode->v.i_lock);
+ 
+       if (remove) {
+               int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
+                                       &inode->hash, bch2_vfs_inodes_params);
+               BUG_ON(ret);
+               inode->v.i_hash.pprev = NULL;
+       }
   }
   
- static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
+ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
+                                                    struct btree_trans *trans,
+                                                    struct bch_inode_info *inode)
   {
-       subvol_inum inum = inode_inum(inode);
-       struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
-                                     bch2_inode_hash(inum),
-                                     bch2_iget5_test,
-                                     bch2_iget5_set,
-                                     &inum));
-       BUG_ON(!old);
+       struct bch_inode_info *old = inode;
+ 
+       set_bit(EI_INODE_HASHED, &inode->ei_flags);
+ retry:
+       if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+                                       &inode->hash,
+                                       bch2_vfs_inodes_params))) {
+               old = bch2_inode_hash_find(c, trans, inode->ei_inum);
+               if (!old)
+                       goto retry;
+ 
+               clear_bit(EI_INODE_HASHED, &inode->ei_flags);
   
-       if (unlikely(old != inode)) {
                 /*
                  * bcachefs doesn't use I_NEW; we have no use for it since we
                  * only insert fully created inodes in the inode hash table. But
@@@ -209,21 -265,17 +265,17 @@@
                  */
                 set_nlink(&inode->v, 1);
                 discard_new_inode(&inode->v);
-               inode = old;
+               return old;
         } else {
+               inode_fake_hash(&inode->v);
+ 
+               inode_sb_list_add(&inode->v);
+ 
                 mutex_lock(&c->vfs_inodes_lock);
                 list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
                 mutex_unlock(&c->vfs_inodes_lock);
-               /*
-                * Again, I_NEW makes no sense for bcachefs. This is only needed
-                * for clearing I_NEW, but since the inode was already fully
-                * created and initialized we didn't actually want
-                * inode_insert5() to set it for us.
-                */
-               unlock_new_inode(&inode->v);
+               return inode;
         }
- 
-       return inode;
   }
   
   #define memalloc_flags_do(_flags, _do)                                                \
@@@ -241,7 -293,8 +293,8 @@@ static struct inode *bch2_alloc_inode(s
   
   static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
   {
-       struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+       struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
+                                               bch2_inode_cache, GFP_NOFS);
         if (!inode)
                 return NULL;
   
@@@ -283,13 -336,24 +336,24 @@@ static struct bch_inode_info *bch2_new_
         return inode;
   }
   
+ static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
+                                                         subvol_inum inum,
+                                                         struct bch_inode_unpacked *bi,
+                                                         struct bch_subvolume *subvol)
+ {
+       struct bch_inode_info *inode = bch2_new_inode(trans);
+       if (IS_ERR(inode))
+               return inode;
+ 
+       bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
+ 
+       return bch2_inode_hash_insert(trans->c, trans, inode);
+ 
+ }
+ 
   struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
   {
-       struct bch_inode_info *inode =
-               to_bch_ei(ilookup5_nowait(c->vfs_sb,
-                                         bch2_inode_hash(inum),
-                                         bch2_iget5_test,
-                                         &inum));
+       struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
         if (inode)
                 return &inode->v;
   
@@@ -300,11 -364,7 +364,7 @@@
         int ret = lockrestart_do(trans,
                 bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
                 bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
-               PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
-       if (!ret) {
-               bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-               inode = bch2_inode_insert(c, inode);
-       }
+               PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
         bch2_trans_put(trans);
   
         return ret ? ERR_PTR(ret) : &inode->v;
@@@ -325,6 -385,8 +385,8 @@@ __bch2_create(struct mnt_idmap *idmap
         subvol_inum inum;
         struct bch_subvolume subvol;
         u64 journal_seq = 0;
+       kuid_t kuid;
+       kgid_t kgid;
         int ret;
   
         /*
@@@ -351,13 -413,15 +413,15 @@@
   retry:
         bch2_trans_begin(trans);
   
-       ret   = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
+       kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
+       kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
+       ret   = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
                 bch2_create_trans(trans,
                                   inode_inum(dir), &dir_u, &inode_u,
                                   !(flags & BCH_CREATE_TMPFILE)
                                   ? &dentry->d_name : NULL,
-                                 from_kuid(i_user_ns(&dir->v), current_fsuid()),
-                                 from_kgid(i_user_ns(&dir->v), current_fsgid()),
+                                 from_kuid(i_user_ns(&dir->v), kuid),
+                                 from_kgid(i_user_ns(&dir->v), kgid),
                                   mode, rdev,
                                   default_acl, acl, snapshot_src, flags) ?:
                 bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@@@ -365,7 -429,7 +429,7 @@@
         if (unlikely(ret))
                 goto err_before_quota;
   
-       inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+       inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
         inum.inum = inode_u.bi_inum;
   
         ret   = bch2_subvolume_get(trans, inum.subvol, true,
@@@ -395,8 -459,16 +459,16 @@@ err_before_quota
          * we must insert the new inode into the inode cache before calling
          * bch2_trans_exit() and dropping locks, else we could race with another
          * thread pulling the inode in and modifying it:
+        *
+        * also, calling bch2_inode_hash_insert() without passing in the
+        * transaction object is sketchy - if we could ever end up in
+        * __wait_on_freeing_inode(), we'd risk deadlock.
+        *
+        * But that shouldn't be possible, since we still have the inode locked
+        * that we just created, and we _really_ can't take a transaction
+        * restart here.
          */
-       inode = bch2_inode_insert(c, inode);
+       inode = bch2_inode_hash_insert(c, NULL, inode);
         bch2_trans_put(trans);
   err:
         posix_acl_release(default_acl);
@@@ -436,11 -508,7 +508,7 @@@ static struct bch_inode_info *bch2_look
         if (ret)
                 goto err;
   
-       struct bch_inode_info *inode =
-               to_bch_ei(ilookup5_nowait(c->vfs_sb,
-                                         bch2_inode_hash(inum),
-                                         bch2_iget5_test,
-                                         &inum));
+       struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
         if (inode)
                 goto out;
   
@@@ -448,7 -516,7 +516,7 @@@
         struct bch_inode_unpacked inode_u;
         ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
                 bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
-               PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+               PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
   
         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
                                 c, "dirent to missing inode:\n  %s",
@@@ -468,9 -536,6 +536,6 @@@
                 ret = -ENOENT;
                 goto err;
         }
- 
-       bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-       inode = bch2_inode_insert(c, inode);
   out:
         bch2_trans_iter_exit(trans, &dirent_iter);
         printbuf_exit(&buf);
@@@ -557,8 -622,8 +622,8 @@@ static int bch2_link(struct dentry *old
   
         lockdep_assert_held(&inode->v.i_rwsem);
   
-       ret   = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
-               bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+       ret   = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
+               bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
                 __bch2_link(c, inode, dir, dentry);
         if (unlikely(ret))
                 return bch2_err_class(ret);
@@@ -614,7 -679,7 +679,7 @@@ static int bch2_unlink(struct inode *vd
         struct bch_inode_info *dir= to_bch_ei(vdir);
         struct bch_fs *c = dir->v.i_sb->s_fs_info;
   
-       int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+       int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
                 __bch2_unlink(vdir, dentry, false);
         return bch2_err_class(ret);
   }
@@@ -671,15 -736,16 +736,16 @@@ static int bch2_rename2(struct mnt_idma
         struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
         struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
         struct bch_inode_unpacked dst_dir_u, src_dir_u;
-       struct bch_inode_unpacked src_inode_u, dst_inode_u;
+       struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
         struct btree_trans *trans;
         enum bch_rename_mode mode = flags & RENAME_EXCHANGE
                 ? BCH_RENAME_EXCHANGE
                 : dst_dentry->d_inode
                 ? BCH_RENAME_OVERWRITE : BCH_RENAME;
+       bool whiteout = !!(flags & RENAME_WHITEOUT);
         int ret;
   
-       if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
+       if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
                 return -EINVAL;
   
         if (mode == BCH_RENAME_OVERWRITE) {
@@@ -697,8 -763,8 +763,8 @@@
   
         trans = bch2_trans_get(c);
   
-       ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
-               bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
+       ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
+               bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
         if (ret)
                 goto err;
   
@@@ -720,18 -786,48 +786,48 @@@
                 if (ret)
                         goto err;
         }
+ retry:
+       bch2_trans_begin(trans);
   
-       ret = commit_do(trans, NULL, NULL, 0,
-                       bch2_rename_trans(trans,
-                                         inode_inum(src_dir), &src_dir_u,
-                                         inode_inum(dst_dir), &dst_dir_u,
-                                         &src_inode_u,
-                                         &dst_inode_u,
-                                         &src_dentry->d_name,
-                                         &dst_dentry->d_name,
-                                         mode));
+       ret = bch2_rename_trans(trans,
+                               inode_inum(src_dir), &src_dir_u,
+                               inode_inum(dst_dir), &dst_dir_u,
+                               &src_inode_u,
+                               &dst_inode_u,
+                               &src_dentry->d_name,
+                               &dst_dentry->d_name,
+                               mode);
         if (unlikely(ret))
+               goto err_tx_restart;
+ 
+       if (whiteout) {
+               whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
+               ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
+               if (unlikely(ret))
+                       goto err_tx_restart;
+               bch2_inode_init_early(c, whiteout_inode_u);
+ 
+               ret = bch2_create_trans(trans,
+                                       inode_inum(src_dir), &src_dir_u,
+                                       whiteout_inode_u,
+                                       &src_dentry->d_name,
+                                       from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
+                                       from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
+                                       S_IFCHR|WHITEOUT_MODE, 0,
+                                       NULL, NULL, (subvol_inum) { 0 }, 0) ?:
+                     bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
+                                     KEY_TYPE_QUOTA_PREALLOC);
+               if (unlikely(ret))
+                       goto err_tx_restart;
+       }
+ 
+       ret = bch2_trans_commit(trans, NULL, NULL, 0);
+       if (unlikely(ret)) {
+ err_tx_restart:
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       goto retry;
                 goto err;
+       }
   
         BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
         BUG_ON(dst_inode &&
@@@ -779,11 -875,17 +875,17 @@@ static void bch2_setattr_copy(struct mn
   {
         struct bch_fs *c = inode->v.i_sb->s_fs_info;
         unsigned int ia_valid = attr->ia_valid;
+       kuid_t kuid;
+       kgid_t kgid;
   
-       if (ia_valid & ATTR_UID)
-               bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
-       if (ia_valid & ATTR_GID)
-               bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+       if (ia_valid & ATTR_UID) {
+               kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+               bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
+       }
+       if (ia_valid & ATTR_GID) {
+               kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+               bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
+       }
   
         if (ia_valid & ATTR_SIZE)
                 bi->bi_size = attr->ia_size;
@@@ -798,11 -900,11 +900,11 @@@
         if (ia_valid & ATTR_MODE) {
                 umode_t mode = attr->ia_mode;
                 kgid_t gid = ia_valid & ATTR_GID
-                       ? attr->ia_gid
+                       ? kgid
                         : inode->v.i_gid;
   
-               if (!in_group_p(gid) &&
-                   !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
+               if (!in_group_or_capable(idmap, &inode->v,
+                       make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
                         mode &= ~S_ISGID;
                 bi->bi_mode = mode;
         }
@@@ -818,17 -920,23 +920,23 @@@ int bch2_setattr_nonsize(struct mnt_idm
         struct btree_iter inode_iter = { NULL };
         struct bch_inode_unpacked inode_u;
         struct posix_acl *acl = NULL;
+       kuid_t kuid;
+       kgid_t kgid;
         int ret;
   
         mutex_lock(&inode->ei_update_lock);
   
         qid = inode->ei_qid;
   
-       if (attr->ia_valid & ATTR_UID)
-               qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
+       if (attr->ia_valid & ATTR_UID) {
+               kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
+               qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
+       }
   
-       if (attr->ia_valid & ATTR_GID)
-               qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
+       if (attr->ia_valid & ATTR_GID) {
+               kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
+               qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
+       }
   
         ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
                                      KEY_TYPE_QUOTA_PREALLOC);
@@@ -884,13 -992,15 +992,15 @@@ static int bch2_getattr(struct mnt_idma
   {
         struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
         struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
+       vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
   
         stat->dev       = inode->v.i_sb->s_dev;
         stat->ino       = inode->v.i_ino;
         stat->mode      = inode->v.i_mode;
         stat->nlink     = inode->v.i_nlink;
-       stat->uid       = inode->v.i_uid;
-       stat->gid       = inode->v.i_gid;
+       stat->uid       = vfsuid_into_kuid(vfsuid);
+       stat->gid       = vfsgid_into_kgid(vfsgid);
         stat->rdev      = inode->v.i_rdev;
         stat->size      = i_size_read(&inode->v);
         stat->atime     = inode_get_atime(&inode->v);
@@@ -899,7 -1009,7 +1009,7 @@@
         stat->blksize   = block_bytes(c);
         stat->blocks    = inode->v.i_blocks;
   
-       stat->subvol    = inode->ei_subvol;
+       stat->subvol    = inode->ei_inum.subvol;
         stat->result_mask |= STATX_SUBVOL;
   
         if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
@@@ -941,7 -1051,7 +1051,7 @@@ static int bch2_setattr(struct mnt_idma
   
         lockdep_assert_held(&inode->v.i_rwsem);
   
-       ret   = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
+       ret   = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
                 setattr_prepare(idmap, dentry, iattr);
         if (ret)
                 return ret;
@@@ -1034,7 -1144,6 +1144,6 @@@ static int bch2_fiemap(struct inode *vi
         struct bkey_buf cur, prev;
         unsigned offset_into_extent, sectors;
         bool have_extent = false;
-       u32 snapshot;
         int ret = 0;
   
         ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@@@ -1050,21 -1159,30 +1159,30 @@@
         bch2_bkey_buf_init(&cur);
         bch2_bkey_buf_init(&prev);
         trans = bch2_trans_get(c);
- retry:
-       bch2_trans_begin(trans);
- 
-       ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
-       if (ret)
-               goto err;
   
         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            SPOS(ei->v.i_ino, start, snapshot), 0);
+                            POS(ei->v.i_ino, start), 0);
   
-       while (!(ret = btree_trans_too_many_iters(trans)) &&
-              (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
-              !(ret = bkey_err(k))) {
+       while (true) {
                 enum btree_id data_btree = BTREE_ID_extents;
   
+               bch2_trans_begin(trans);
+ 
+               u32 snapshot;
+               ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
+               if (ret)
+                       goto err;
+ 
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
+ 
+               k = bch2_btree_iter_peek_upto(&iter, end);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+ 
+               if (!k.k)
+                       break;
+ 
                 if (!bkey_extent_is_data(k.k) &&
                     k.k->type != KEY_TYPE_reservation) {
                         bch2_btree_iter_advance(&iter);
@@@ -1108,16 -1226,12 +1226,12 @@@
   
                 bch2_btree_iter_set_pos(&iter,
                         POS(iter.pos.inode, iter.pos.offset + sectors));
- 
-               ret = bch2_trans_relock(trans);
-               if (ret)
+ err:
+               if (ret &&
+                   !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                         break;
         }
-       start = iter.pos.offset;
         bch2_trans_iter_exit(trans, &iter);
- err:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
   
         if (!ret && have_extent) {
                 bch2_trans_unlock(trans);
@@@ -1173,7 -1287,7 +1287,7 @@@ static int bch2_open(struct inode *vino
                 struct bch_inode_info *inode = to_bch_ei(vinode);
                 struct bch_fs *c = inode->v.i_sb->s_fs_info;
   
-               int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
+               int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
                 if (ret)
                         return ret;
         }
@@@ -1305,8 -1419,8 +1419,8 @@@ static int bcachefs_fid_valid(int fh_le
   static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
   {
         return (struct bcachefs_fid) {
-               .inum   = inode->ei_inode.bi_inum,
-               .subvol = inode->ei_subvol,
+               .inum   = inode->ei_inum.inum,
+               .subvol = inode->ei_inum.subvol,
                 .gen    = inode->ei_inode.bi_generation,
         };
   }
@@@ -1391,7 -1505,7 +1505,7 @@@ static struct dentry *bch2_get_parent(s
         struct bch_fs *c = inode->v.i_sb->s_fs_info;
         subvol_inum parent_inum = {
                 .subvol = inode->ei_inode.bi_parent_subvol ?:
-                       inode->ei_subvol,
+                       inode->ei_inum.subvol,
                 .inum = inode->ei_inode.bi_dir,
         };
   
@@@ -1427,7 -1541,7 +1541,7 @@@ static int bch2_get_name(struct dentry 
   retry:
         bch2_trans_begin(trans);
   
-       ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
         if (ret)
                 goto err;
   
@@@ -1458,8 -1572,7 +1572,7 @@@
                 if (ret)
                         goto err;
   
-               if (target.subvol       == inode->ei_subvol &&
-                   target.inum         == inode->ei_inode.bi_inum)
+               if (subvol_inum_eq(target, inode->ei_inum))
                         goto found;
         } else {
                 /*
@@@ -1480,8 -1593,7 +1593,7 @@@
                         if (ret)
                                 continue;
   
-                       if (target.subvol       == inode->ei_subvol &&
-                           target.inum         == inode->ei_inode.bi_inum)
+                       if (subvol_inum_eq(target, inode->ei_inum))
                                 goto found;
                 }
         }
@@@ -1513,12 -1625,15 +1625,15 @@@ static const struct export_operations b
         .get_name       = bch2_get_name,
   };
   
- static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
+ static void bch2_vfs_inode_init(struct btree_trans *trans,
+                               subvol_inum inum,
                                 struct bch_inode_info *inode,
                                 struct bch_inode_unpacked *bi,
                                 struct bch_subvolume *subvol)
   {
-       bch2_iget5_set(&inode->v, &inum);
+       inode->v.i_ino          = inum.inum;
+       inode->ei_inum          = inum;
+       inode->ei_inode.bi_inum = inum.inum;
         bch2_inode_update_after_write(trans, inode, bi, ~0);
   
         inode->v.i_blocks       = bi->bi_sectors;
@@@ -1530,7 -1645,6 +1645,6 @@@
         inode->ei_flags         = 0;
         inode->ei_quota_reserved = 0;
         inode->ei_qid           = bch_qid(bi);
-       inode->ei_subvol        = inum.subvol;
   
         if (BCH_SUBVOLUME_SNAP(subvol))
                 set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
@@@ -1597,6 -1711,17 +1711,17 @@@ static void bch2_evict_inode(struct ino
   {
         struct bch_fs *c = vinode->i_sb->s_fs_info;
         struct bch_inode_info *inode = to_bch_ei(vinode);
+       bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
+ 
+       /*
+        * evict() has waited for outstanding writeback, we'll do no more IO
+        * through this inode: it's safe to remove from VFS inode hashtable here
+        *
+        * Do that now so that other threads aren't blocked from pulling it back
+        * in, there's no reason for them to be:
+        */
+       if (!delete)
+               bch2_inode_hash_remove(c, inode);
   
         truncate_inode_pages_final(&inode->v.i_data);
   
@@@ -1604,12 -1729,18 +1729,18 @@@
   
         BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
   
-       if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
+       if (delete) {
                 bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
                                 KEY_TYPE_QUOTA_WARN);
                 bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                 KEY_TYPE_QUOTA_WARN);
                 bch2_inode_rm(c, inode_inum(inode));
+ 
+               /*
+                * It had to stay in the vfs hash table until the delete was done,
+                * so that fsck can check if unlinked inodes are still open:
+                */
+               bch2_inode_hash_remove(c, inode);
         }
   
         mutex_lock(&c->vfs_inodes_lock);
@@@ -1639,7 -1770,7 +1770,7 @@@ again
   
         mutex_lock(&c->vfs_inodes_lock);
         list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
-               if (!snapshot_list_has_id(s, inode->ei_subvol))
+               if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
                         continue;
   
                 if (!(inode->v.i_state & I_DONTCACHE) &&
@@@ -1652,16 -1783,14 +1783,16 @@@
                                 break;
                         }
                 } else if (clean_pass && this_pass_clean) {
- -                      wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
- -                      DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+ +                      struct wait_bit_queue_entry wqe;
+ +                      struct wait_queue_head *wq_head;
   
- -                      prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ +                      wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
+ +                      prepare_to_wait_event(wq_head, &wqe.wq_entry,
+ +                                            TASK_UNINTERRUPTIBLE);
                         mutex_unlock(&c->vfs_inodes_lock);
   
                         schedule();
- -                      finish_wait(wq, &wait.wq_entry);
+ +                      finish_wait(wq_head, &wqe.wq_entry);
                         goto again;
                 }
         }
@@@ -1801,30 -1930,14 +1932,14 @@@ static int bch2_show_devname(struct seq
   static int bch2_show_options(struct seq_file *seq, struct dentry *root)
   {
         struct bch_fs *c = root->d_sb->s_fs_info;
-       enum bch_opt_id i;
         struct printbuf buf = PRINTBUF;
-       int ret = 0;
   
-       for (i = 0; i < bch2_opts_nr; i++) {
-               const struct bch_option *opt = &bch2_opt_table[i];
-               u64 v = bch2_opt_get_by_id(&c->opts, i);
+       bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
+                         OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
+       printbuf_nul_terminate(&buf);
+       seq_puts(seq, buf.buf);
   
-               if ((opt->flags & OPT_HIDDEN) ||
-                   !(opt->flags & OPT_MOUNT))
-                       continue;
- 
-               if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
-                       continue;
- 
-               printbuf_reset(&buf);
-               bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v,
-                                OPT_SHOW_MOUNT_STYLE);
-               seq_putc(seq, ',');
-               seq_puts(seq, buf.buf);
-       }
- 
-       if (buf.allocation_failure)
-               ret = -ENOMEM;
+       int ret = buf.allocation_failure ? -ENOMEM : 0;
         printbuf_exit(&buf);
         return ret;
   }
@@@ -2129,12 -2242,23 +2244,23 @@@ static int bch2_init_fs_context(struct 
         return 0;
   }
   
+ void bch2_fs_vfs_exit(struct bch_fs *c)
+ {
+       if (c->vfs_inodes_table.tbl)
+               rhashtable_destroy(&c->vfs_inodes_table);
+ }
+ 
+ int bch2_fs_vfs_init(struct bch_fs *c)
+ {
+       return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
+ }
+ 
   static struct file_system_type bcache_fs_type = {
         .owner                  = THIS_MODULE,
         .name                   = "bcachefs",
         .init_fs_context        = bch2_init_fs_context,
         .kill_sb                = bch2_kill_sb,
-       .fs_flags               = FS_REQUIRES_DEV,
+       .fs_flags               = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
   };
   
   MODULE_ALIAS_FS("bcachefs");
@@@ -2149,7 -2273,8 +2275,8 @@@ int __init bch2_vfs_init(void
   {
         int ret = -ENOMEM;
   
-       bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
+       bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
+                                     SLAB_ACCOUNT);
         if (!bch2_inode_cache)
                 goto err;
   
diff --combined fs/inode.c

index af78f515403f3d78245877c4ae51f1fbd35d8fcc,5e7dcdeedd4db5a053379d4b5ffd720bbf6973a6..471ae4a315498fc4aeb55c23bf47bd9513e34ad0
--- 1/fs/inode.c
--- 2/fs/inode.c
+++ b/fs/inode.c
@@@ -438,14 -438,6 +438,6 @@@ static void init_once(void *foo
         inode_init_once(inode);
   }
   
- /*
-  * inode->i_lock must be held
-  */
- void __iget(struct inode *inode)
- {
-       atomic_inc(&inode->i_count);
- }
- 
   /*
    * get additional reference to inode; caller must already hold one.
    */
@@@ -472,17 -464,6 +464,17 @@@ static void __inode_add_lru(struct inod
                 inode->i_state |= I_REFERENCED;
   }
   
+ +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+ +                                          struct inode *inode, u32 bit)
+ +{
+ +        void *bit_address;
+ +
+ +        bit_address = inode_state_wait_address(inode, bit);
+ +        init_wait_var_entry(wqe, bit_address, 0);
+ +        return __var_waitqueue(bit_address);
+ +}
+ +EXPORT_SYMBOL(inode_bit_waitqueue);
+ +
   /*
    * Add inode to LRU if needed (inode is unused and clean).
    *
@@@ -499,49 -480,6 +491,49 @@@ static void inode_lru_list_del(struct i
                 this_cpu_dec(nr_unused);
   }
   
+ +static void inode_pin_lru_isolating(struct inode *inode)
+ +{
+ +      lockdep_assert_held(&inode->i_lock);
+ +      WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
+ +      inode->i_state |= I_LRU_ISOLATING;
+ +}
+ +
+ +static void inode_unpin_lru_isolating(struct inode *inode)
+ +{
+ +      spin_lock(&inode->i_lock);
+ +      WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
+ +      inode->i_state &= ~I_LRU_ISOLATING;
+ +      /* Called with inode->i_lock which ensures memory ordering. */
+ +      inode_wake_up_bit(inode, __I_LRU_ISOLATING);
+ +      spin_unlock(&inode->i_lock);
+ +}
+ +
+ +static void inode_wait_for_lru_isolating(struct inode *inode)
+ +{
+ +      struct wait_bit_queue_entry wqe;
+ +      struct wait_queue_head *wq_head;
+ +
+ +      lockdep_assert_held(&inode->i_lock);
+ +      if (!(inode->i_state & I_LRU_ISOLATING))
+ +              return;
+ +
+ +      wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
+ +      for (;;) {
+ +              prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+ +              /*
+ +               * Checking I_LRU_ISOLATING with inode->i_lock guarantees
+ +               * memory ordering.
+ +               */
+ +              if (!(inode->i_state & I_LRU_ISOLATING))
+ +                      break;
+ +              spin_unlock(&inode->i_lock);
+ +              schedule();
+ +              spin_lock(&inode->i_lock);
+ +      }
+ +      finish_wait(wq_head, &wqe.wq_entry);
+ +      WARN_ON(inode->i_state & I_LRU_ISOLATING);
+ +}
+ +
   /**
    * inode_sb_list_add - add inode to the superblock list of inodes
    * @inode: inode to add
@@@ -616,7 -554,6 +608,7 @@@ void dump_mapping(const struct address_
         struct hlist_node *dentry_first;
         struct dentry *dentry_ptr;
         struct dentry dentry;
+ +      char fname[64] = {};
         unsigned long ino;
   
         /*
@@@ -653,14 -590,11 +645,14 @@@
                 return;
         }
   
+ +      if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0)
+ +              strscpy(fname, "<invalid>");
         /*
- -       * if dentry is corrupted, the %pd handler may still crash,
- -       * but it's unlikely that we reach here with a corrupt mapping
+ +       * Even if strncpy_from_kernel_nofault() succeeded,
+ +       * the fname could be unreliable
          */
- -      pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry);
+ +      pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n",
+ +              a_ops, ino, fname);
   }
   
   void clear_inode(struct inode *inode)
@@@ -715,9 -649,6 +707,9 @@@ static void evict(struct inode *inode
   
         inode_sb_list_del(inode);
   
+ +      spin_lock(&inode->i_lock);
+ +      inode_wait_for_lru_isolating(inode);
+ +
         /*
          * Wait for flusher thread to be done with the inode so that filesystem
          * does not start destroying it while writeback is still running. Since
@@@ -725,7 -656,6 +717,7 @@@
          * the inode.  We just have to wait for running writeback to finish.
          */
         inode_wait_for_writeback(inode);
+ +      spin_unlock(&inode->i_lock);
   
         if (op->evict_inode) {
                 op->evict_inode(inode);
@@@ -749,13 -679,7 +741,13 @@@
          * used as an indicator whether blocking on it is safe.
          */
         spin_lock(&inode->i_lock);
- -      wake_up_bit(&inode->i_state, __I_NEW);
+ +      /*
+ +       * Pairs with the barrier in prepare_to_wait_event() to make sure
+ +       * ___wait_var_event() either sees the bit cleared or
+ +       * waitqueue_active() check in wake_up_var() sees the waiter.
+ +       */
+ +      smp_mb();
+ +      inode_wake_up_bit(inode, __I_NEW);
         BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
         spin_unlock(&inode->i_lock);
   
@@@ -803,10 -727,6 +795,10 @@@ again
                         continue;
   
                 spin_lock(&inode->i_lock);
+ +              if (atomic_read(&inode->i_count)) {
+ +                      spin_unlock(&inode->i_lock);
+ +                      continue;
+ +              }
                 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                         spin_unlock(&inode->i_lock);
                         continue;
@@@ -927,7 -847,7 +919,7 @@@ static enum lru_status inode_lru_isolat
          * be under pressure before the cache inside the highmem zone.
          */
         if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
- -              __iget(inode);
+ +              inode_pin_lru_isolating(inode);
                 spin_unlock(&inode->i_lock);
                 spin_unlock(lru_lock);
                 if (remove_inode_buffers(inode)) {
@@@ -939,7 -859,7 +931,7 @@@
                                 __count_vm_events(PGINODESTEAL, reap);
                         mm_account_reclaimed_pages(reap);
                 }
- -              iput(inode);
+ +              inode_unpin_lru_isolating(inode);
                 spin_lock(lru_lock);
                 return LRU_RETRY;
         }
@@@ -1167,13 -1087,8 +1159,13 @@@ void unlock_new_inode(struct inode *ino
         spin_lock(&inode->i_lock);
         WARN_ON(!(inode->i_state & I_NEW));
         inode->i_state &= ~I_NEW & ~I_CREATING;
+ +      /*
+ +       * Pairs with the barrier in prepare_to_wait_event() to make sure
+ +       * ___wait_var_event() either sees the bit cleared or
+ +       * waitqueue_active() check in wake_up_var() sees the waiter.
+ +       */
         smp_mb();
- -      wake_up_bit(&inode->i_state, __I_NEW);
+ +      inode_wake_up_bit(inode, __I_NEW);
         spin_unlock(&inode->i_lock);
   }
   EXPORT_SYMBOL(unlock_new_inode);
@@@ -1184,13 -1099,8 +1176,13 @@@ void discard_new_inode(struct inode *in
         spin_lock(&inode->i_lock);
         WARN_ON(!(inode->i_state & I_NEW));
         inode->i_state &= ~I_NEW;
+ +      /*
+ +       * Pairs with the barrier in prepare_to_wait_event() to make sure
+ +       * ___wait_var_event() either sees the bit cleared or
+ +       * waitqueue_active() check in wake_up_var() sees the waiter.
+ +       */
         smp_mb();
- -      wake_up_bit(&inode->i_state, __I_NEW);
+ +      inode_wake_up_bit(inode, __I_NEW);
         spin_unlock(&inode->i_lock);
         iput(inode);
   }
@@@ -1617,7 -1527,9 +1609,7 @@@ struct inode *ilookup(struct super_bloc
         struct hlist_head *head = inode_hashtable + hash(sb, ino);
         struct inode *inode;
   again:
- -      spin_lock(&inode_hash_lock);
- -      inode = find_inode_fast(sb, head, ino, true);
- -      spin_unlock(&inode_hash_lock);
+ +      inode = find_inode_fast(sb, head, ino, false);
   
         if (inode) {
                 if (IS_ERR(inode))
@@@ -2379,8 -2291,8 +2371,8 @@@ EXPORT_SYMBOL(inode_needs_sync)
    */
   static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked)
   {
- -      wait_queue_head_t *wq;
- -      DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+ +      struct wait_bit_queue_entry wqe;
+ +      struct wait_queue_head *wq_head;
   
         /*
          * Handle racing against evict(), see that routine for more details.
@@@ -2391,14 -2303,14 +2383,14 @@@
                 return;
         }
   
- -      wq = bit_waitqueue(&inode->i_state, __I_NEW);
- -      prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ +      wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
+ +      prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
         spin_unlock(&inode->i_lock);
         rcu_read_unlock();
         if (is_inode_hash_locked)
                 spin_unlock(&inode_hash_lock);
         schedule();
- -      finish_wait(wq, &wait.wq_entry);
+ +      finish_wait(wq_head, &wqe.wq_entry);
         if (is_inode_hash_locked)
                 spin_lock(&inode_hash_lock);
         rcu_read_lock();
@@@ -2547,11 -2459,18 +2539,11 @@@ EXPORT_SYMBOL(inode_owner_or_capable)
   /*
    * Direct i/o helper functions
    */
- -static void __inode_dio_wait(struct inode *inode)
+ +bool inode_dio_finished(const struct inode *inode)
   {
- -      wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
- -      DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
- -
- -      do {
- -              prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
- -              if (atomic_read(&inode->i_dio_count))
- -                      schedule();
- -      } while (atomic_read(&inode->i_dio_count));
- -      finish_wait(wq, &q.wq_entry);
+ +      return atomic_read(&inode->i_dio_count) == 0;
   }
+ +EXPORT_SYMBOL(inode_dio_finished);
   
   /**
    * inode_dio_wait - wait for outstanding DIO requests to finish
@@@ -2565,17 -2484,11 +2557,17 @@@
    */
   void inode_dio_wait(struct inode *inode)
   {
- -      if (atomic_read(&inode->i_dio_count))
- -              __inode_dio_wait(inode);
+ +      wait_var_event(&inode->i_dio_count, inode_dio_finished(inode));
   }
   EXPORT_SYMBOL(inode_dio_wait);
   
+ +void inode_dio_wait_interruptible(struct inode *inode)
+ +{
+ +      wait_var_event_interruptible(&inode->i_dio_count,
+ +                                   inode_dio_finished(inode));
+ +}
+ +EXPORT_SYMBOL(inode_dio_wait_interruptible);
+ +
   /*
    * inode_set_flags - atomically set some inode flags
    *
diff --combined include/linux/fs.h

index 6b8df574729cb691c21623a63d11cf03a9a7af4c,8fc4bad3b6aed00fcb5223a2d7c623cbccbbf5f6..776298fbfcb4421f01e09a9f0378e8b2f41ef413
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -146,7 -146,8 +146,7 @@@ typedef int (dio_iodone_t)(struct kioc
   /* Expect random access pattern */
   #define FMODE_RANDOM          ((__force fmode_t)(1 << 12))
   
- -/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
- -#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)(1 << 13))
+ +/* FMODE_* bit 13 */
   
   /* File is opened with O_PATH; almost nothing can be done with it */
   #define FMODE_PATH            ((__force fmode_t)(1 << 14))
@@@ -209,7 -210,6 +209,7 @@@
   #define ATTR_OPEN     (1 << 15) /* Truncating from open(O_TRUNC) */
   #define ATTR_TIMES_SET        (1 << 16)
   #define ATTR_TOUCH    (1 << 17)
+ +#define ATTR_DELEG    (1 << 18) /* Delegated attrs. Don't break write delegations */
   
   /*
    * Whiteout is represented by a char device.  The following constants define the
@@@ -408,10 -408,10 +408,10 @@@ struct address_space_operations 
   
         int (*write_begin)(struct file *, struct address_space *mapping,
                                 loff_t pos, unsigned len,
- -                              struct page **pagep, void **fsdata);
+ +                              struct folio **foliop, void **fsdata);
         int (*write_end)(struct file *, struct address_space *mapping,
                                 loff_t pos, unsigned len, unsigned copied,
- -                              struct page *page, void *fsdata);
+ +                              struct folio *folio, void *fsdata);
   
         /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
         sector_t (*bmap)(struct address_space *, sector_t);
@@@ -682,8 -682,7 +682,8 @@@ struct inode 
   #endif
   
         /* Misc */
- -      unsigned long           i_state;
+ +      u32                     i_state;
+ +      /* 32-bit hole */
         struct rw_semaphore     i_rwsem;
   
         unsigned long           dirtied_when;   /* jiffies of first dirtying */
@@@ -746,21 -745,6 +746,21 @@@
         void                    *i_private; /* fs or device private pointer */
   } __randomize_layout;
   
+ +/*
+ + * Get bit address from inode->i_state to use with wait_var_event()
+ + * infrastructure.
+ + */
+ +#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit))
+ +
+ +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+ +                                          struct inode *inode, u32 bit);
+ +
+ +static inline void inode_wake_up_bit(struct inode *inode, u32 bit)
+ +{
+ +      /* Caller is responsible for correct memory barriers. */
+ +      wake_up_var(inode_state_wait_address(inode, bit));
+ +}
+ +
   struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);
   
   static inline unsigned int i_blocksize(const struct inode *node)
@@@ -963,7 -947,6 +963,7 @@@ static inline unsigned imajor(const str
   }
   
   struct fown_struct {
+ +      struct file *file;      /* backpointer for security modules */
         rwlock_t lock;          /* protects pid, uid, euid fields */
         struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
         enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */
@@@ -1003,69 -986,52 +1003,69 @@@ static inline int ra_has_index(struct f
                 index <  ra->start + ra->size);
   }
   
- -/*
- - * f_{lock,count,pos_lock} members can be highly contended and share
- - * the same cacheline. f_{lock,mode} are very frequently used together
- - * and so share the same cacheline as well. The read-mostly
- - * f_{path,inode,op} are kept on a separate cacheline.
+ +/**
+ + * struct file - Represents a file
+ + * @f_count: reference count
+ + * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
+ + * @f_mode: FMODE_* flags often used in hotpaths
+ + * @f_op: file operations
+ + * @f_mapping: Contents of a cacheable, mappable object.
+ + * @private_data: filesystem or driver specific data
+ + * @f_inode: cached inode
+ + * @f_flags: file flags
+ + * @f_iocb_flags: iocb flags
+ + * @f_cred: stashed credentials of creator/opener
+ + * @f_path: path of the file
+ + * @f_pos_lock: lock protecting file position
+ + * @f_pipe: specific to pipes
+ + * @f_pos: file position
+ + * @f_security: LSM security context of this file
+ + * @f_owner: file owner
+ + * @f_wb_err: writeback error
+ + * @f_sb_err: per sb writeback errors
+ + * @f_ep: link of all epoll hooks for this file
+ + * @f_task_work: task work entry point
+ + * @f_llist: work queue entrypoint
+ + * @f_ra: file's readahead state
+ + * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
    */
   struct file {
+ +      atomic_long_t                   f_count;
+ +      spinlock_t                      f_lock;
+ +      fmode_t                         f_mode;
+ +      const struct file_operations    *f_op;
+ +      struct address_space            *f_mapping;
+ +      void                            *private_data;
+ +      struct inode                    *f_inode;
+ +      unsigned int                    f_flags;
+ +      unsigned int                    f_iocb_flags;
+ +      const struct cred               *f_cred;
+ +      /* --- cacheline 1 boundary (64 bytes) --- */
+ +      struct path                     f_path;
         union {
- -              /* fput() uses task work when closing and freeing file (default). */
- -              struct callback_head    f_task_work;
- -              /* fput() must use workqueue (most kernel threads). */
- -              struct llist_node       f_llist;
- -              unsigned int            f_iocb_flags;
+ +              /* regular files (with FMODE_ATOMIC_POS) and directories */
+ +              struct mutex            f_pos_lock;
+ +              /* pipes */
+ +              u64                     f_pipe;
         };
- -
- -      /*
- -       * Protects f_ep, f_flags.
- -       * Must not be taken from IRQ context.
- -       */
- -      spinlock_t              f_lock;
- -      fmode_t                 f_mode;
- -      atomic_long_t           f_count;
- -      struct mutex            f_pos_lock;
- -      loff_t                  f_pos;
- -      unsigned int            f_flags;
- -      struct fown_struct      f_owner;
- -      const struct cred       *f_cred;
- -      struct file_ra_state    f_ra;
- -      struct path             f_path;
- -      struct inode            *f_inode;       /* cached value */
- -      const struct file_operations    *f_op;
- -
- -      u64                     f_version;
+ +      loff_t                          f_pos;
   #ifdef CONFIG_SECURITY
- -      void                    *f_security;
+ +      void                            *f_security;
   #endif
- -      /* needed for tty driver, and maybe others */
- -      void                    *private_data;
- -
+ +      /* --- cacheline 2 boundary (128 bytes) --- */
+ +      struct fown_struct              *f_owner;
+ +      errseq_t                        f_wb_err;
+ +      errseq_t                        f_sb_err;
   #ifdef CONFIG_EPOLL
- -      /* Used by fs/eventpoll.c to link all the hooks to this file */
- -      struct hlist_head       *f_ep;
- -#endif /* #ifdef CONFIG_EPOLL */
- -      struct address_space    *f_mapping;
- -      errseq_t                f_wb_err;
- -      errseq_t                f_sb_err; /* for syncfs */
+ +      struct hlist_head               *f_ep;
+ +#endif
+ +      union {
+ +              struct callback_head    f_task_work;
+ +              struct llist_node       f_llist;
+ +              struct file_ra_state    f_ra;
+ +              freeptr_t               f_freeptr;
+ +      };
+ +      /* --- cacheline 3 boundary (192 bytes) --- */
   } __randomize_layout
     __attribute__((aligned(4)));        /* lest something weird decides that 2 is OK */
   
@@@ -1110,12 -1076,6 +1110,12 @@@ struct file_lease
   #define OFFT_OFFSET_MAX       type_max(off_t)
   #endif
   
+ +int file_f_owner_allocate(struct file *file);
+ +static inline struct fown_struct *file_f_owner(const struct file *file)
+ +{
+ +      return READ_ONCE(file->f_owner);
+ +}
+ +
   extern void send_sigio(struct fown_struct *fown, int fd, int band);
   
   static inline struct inode *file_inode(const struct file *f)
@@@ -1164,7 -1124,7 +1164,7 @@@ extern void __f_setown(struct file *fil
   extern int f_setown(struct file *filp, int who, int force);
   extern void f_delown(struct file *filp);
   extern pid_t f_getown(struct file *filp);
- -extern int send_sigurg(struct fown_struct *fown);
+ +extern int send_sigurg(struct file *file);
   
   /*
    * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
@@@ -1307,7 -1267,7 +1307,7 @@@ struct super_block 
         time64_t                   s_time_min;
         time64_t                   s_time_max;
   #ifdef CONFIG_FSNOTIFY
- -      __u32                   s_fsnotify_mask;
+ +      u32                     s_fsnotify_mask;
         struct fsnotify_sb_info *s_fsnotify_info;
   #endif
   
@@@ -1723,7 -1683,7 +1723,7 @@@ static inline bool __sb_start_write_try
   #define __sb_writers_acquired(sb, lev)        \
         percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
   #define __sb_writers_release(sb, lev) \
- -      percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
+ +      percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)
   
   /**
    * __sb_write_started - check if sb freeze level is held
@@@ -2113,8 -2073,6 +2113,8 @@@ struct file_operations 
   #define FOP_DIO_PARALLEL_WRITE        ((__force fop_flags_t)(1 << 3))
   /* Contains huge pages */
   #define FOP_HUGE_PAGES                ((__force fop_flags_t)(1 << 4))
+ +/* Treat loff_t as unsigned (e.g., /dev/mem) */
+ +#define FOP_UNSIGNED_OFFSET   ((__force fop_flags_t)(1 << 5))
   
   /* Wrap a directory iterator that needs exclusive inode access */
   int wrap_directory_iterator(struct file *, struct dir_context *,
@@@ -2414,6 -2372,8 +2414,6 @@@ static inline void kiocb_clone(struct k
    *
    * I_REFERENCED               Marks the inode as recently references on the LRU list.
    *
- - * I_DIO_WAKEUP               Never set.  Only used as a key for wait_on_bit().
- - *
    * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
    *                    synchronize competing switching instances and to tell
    *                    wb stat updates to grab the i_pages lock.  See
@@@ -2432,36 -2392,29 +2432,36 @@@
    *
    * I_PINNING_FSCACHE_WB       Inode is pinning an fscache object for writeback.
    *
+ + * I_LRU_ISOLATING    Inode is pinned while being isolated from the LRU,
+ + *                    without holding i_count.
+ + *
    * Q: What is the difference between I_WILL_FREE and I_FREEING?
+ + *
+ + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
+ + * upon. There's one free address left.
    */
- -#define I_DIRTY_SYNC          (1 << 0)
- -#define I_DIRTY_DATASYNC      (1 << 1)
- -#define I_DIRTY_PAGES         (1 << 2)
- -#define __I_NEW                       3
+ +#define __I_NEW                       0
   #define I_NEW                 (1 << __I_NEW)
- -#define I_WILL_FREE           (1 << 4)
- -#define I_FREEING             (1 << 5)
- -#define I_CLEAR                       (1 << 6)
- -#define __I_SYNC              7
+ +#define __I_SYNC              1
   #define I_SYNC                        (1 << __I_SYNC)
- -#define I_REFERENCED          (1 << 8)
- -#define __I_DIO_WAKEUP                9
- -#define I_DIO_WAKEUP          (1 << __I_DIO_WAKEUP)
+ +#define __I_LRU_ISOLATING     2
+ +#define I_LRU_ISOLATING               (1 << __I_LRU_ISOLATING)
+ +
+ +#define I_DIRTY_SYNC          (1 << 3)
+ +#define I_DIRTY_DATASYNC      (1 << 4)
+ +#define I_DIRTY_PAGES         (1 << 5)
+ +#define I_WILL_FREE           (1 << 6)
+ +#define I_FREEING             (1 << 7)
+ +#define I_CLEAR                       (1 << 8)
+ +#define I_REFERENCED          (1 << 9)
   #define I_LINKABLE            (1 << 10)
   #define I_DIRTY_TIME          (1 << 11)
- -#define I_WB_SWITCH           (1 << 13)
- -#define I_OVL_INUSE           (1 << 14)
- -#define I_CREATING            (1 << 15)
- -#define I_DONTCACHE           (1 << 16)
- -#define I_SYNC_QUEUED         (1 << 17)
- -#define I_PINNING_NETFS_WB    (1 << 18)
+ +#define I_WB_SWITCH           (1 << 12)
+ +#define I_OVL_INUSE           (1 << 13)
+ +#define I_CREATING            (1 << 14)
+ +#define I_DONTCACHE           (1 << 15)
+ +#define I_SYNC_QUEUED         (1 << 16)
+ +#define I_PINNING_NETFS_WB    (1 << 17)
   
   #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
   #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
@@@ -2595,17 -2548,10 +2595,17 @@@ struct super_block *sget(struct file_sy
   struct super_block *sget_dev(struct fs_context *fc, dev_t dev);
   
   /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
- -#define fops_get(fops) \
- -      (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
- -#define fops_put(fops) \
- -      do { if (fops) module_put((fops)->owner); } while(0)
+ +#define fops_get(fops) ({                                             \
+ +      const struct file_operations *_fops = (fops);                   \
+ +      (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \
+ +})
+ +
+ +#define fops_put(fops) ({                                             \
+ +      const struct file_operations *_fops = (fops);                   \
+ +      if (_fops)                                                      \
+ +              module_put((_fops)->owner);                             \
+ +})
+ +
   /*
    * This one is to be used *ONLY* from ->open() instances.
    * fops must be non-NULL, pinned down *and* module dependencies
@@@ -3148,7 -3094,14 +3148,14 @@@ static inline bool is_zero_ino(ino_t in
         return (u32)ino == 0;
   }
   
- extern void __iget(struct inode * inode);
+ /*
+  * inode->i_lock must be held
+  */
+ static inline void __iget(struct inode *inode)
+ {
+       atomic_inc(&inode->i_count);
+ }
+ 
   extern void iget_failed(struct inode *);
   extern void clear_inode(struct inode *);
   extern void __destroy_inode(struct inode *);
@@@ -3231,8 -3184,6 +3238,8 @@@ extern loff_t vfs_setpos(struct file *f
   extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
   extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
                 int whence, loff_t maxsize, loff_t eof);
+ +loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
+ +                           u64 *cookie);
   extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
                 int whence, loff_t size);
   extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
@@@ -3270,9 -3221,7 +3277,9 @@@ static inline ssize_t blockdev_direct_I
   }
   #endif
   
+ +bool inode_dio_finished(const struct inode *inode);
   void inode_dio_wait(struct inode *inode);
+ +void inode_dio_wait_interruptible(struct inode *inode);
   
   /**
    * inode_dio_begin - signal start of a direct I/O requests
@@@ -3296,7 -3245,7 +3303,7 @@@ static inline void inode_dio_begin(stru
   static inline void inode_dio_end(struct inode *inode)
   {
         if (atomic_dec_and_test(&inode->i_dio_count))
- -              wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
+ +              wake_up_var(&inode->i_dio_count);
   }
   
   extern void inode_set_flags(struct inode *inode, unsigned int flags,
@@@ -3389,7 -3338,7 +3396,7 @@@ extern ssize_t noop_direct_IO(struct ki
   extern int simple_empty(struct dentry *);
   extern int simple_write_begin(struct file *file, struct address_space *mapping,
                         loff_t pos, unsigned len,
- -                      struct page **pagep, void **fsdata);
+ +                      struct folio **foliop, void **fsdata);
   extern const struct address_space_operations ram_aops;
   extern int always_delete_dentry(const struct dentry *);
   extern struct inode *alloc_anon_inode(struct super_block *);
@@@ -3513,6 -3462,7 +3520,6 @@@ static inline int kiocb_set_rw_flags(st
         if (flags & RWF_NOWAIT) {
                 if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
                         return -EOPNOTSUPP;
- -              kiocb_flags |= IOCB_NOIO;
         }
         if (flags & RWF_ATOMIC) {
                 if (rw_type != WRITE)
author	Linus Torvalds <[email protected]>
	Mon, 23 Sep 2024 17:05:41 +0000 (10:05 -0700)
committer	Linus Torvalds <[email protected]>
	Mon, 23 Sep 2024 17:05:41 +0000 (10:05 -0700)
		1	2
fs/bcachefs/fs-io-buffered.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/bcachefs/fs.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history