Merge tag 'vfs-6.9.pidfd' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

author Linus Torvalds <[email protected]>

Mon, 11 Mar 2024 17:21:06 +0000 (10:21 -0700)

committer Linus Torvalds <[email protected]>

Mon, 11 Mar 2024 17:21:06 +0000 (10:21 -0700)
author Linus Torvalds <[email protected]>
Mon, 11 Mar 2024 17:21:06 +0000 (10:21 -0700)
committer Linus Torvalds <[email protected]>
Mon, 11 Mar 2024 17:21:06 +0000 (10:21 -0700)
diff --combined fs/Kconfig

index ea2f77446080ef9f22ed4f4e860b77b2f4ce8167,f3dbd84a0e40a05fc37312ad092d74229d52be7b..4bc7dd420874aa979d325d5bc620d4425f0e04db
--- 1/fs/Kconfig
--- 2/fs/Kconfig
+++ b/fs/Kconfig
@@@ -162,6 -162,7 +162,6 @@@ menu "DOS/FAT/EXFAT/NT Filesystems
   
   source "fs/fat/Kconfig"
   source "fs/exfat/Kconfig"
- -source "fs/ntfs/Kconfig"
   source "fs/ntfs3/Kconfig"
   
   endmenu
@@@ -173,6 -174,13 +173,13 @@@ source "fs/proc/Kconfig
   source "fs/kernfs/Kconfig"
   source "fs/sysfs/Kconfig"
   
+ config FS_PID
+       bool "Pseudo filesystem for process file descriptors"
+       depends on 64BIT
+       default y
+       help
+         Pidfs implements advanced features for process file descriptors.
+ 
   config TMPFS
         bool "Tmpfs virtual memory file system support (former shm fs)"
         depends on SHMEM
diff --combined fs/Makefile

index c32b8c586800a9ad82c790eb762b9c81bd408743,1253016739858374f215abacb00ca33a547c8ef4..6ecc9b0a53f2b0478385fe131c325d377d794c64
--- 1/fs/Makefile
--- 2/fs/Makefile
+++ b/fs/Makefile
@@@ -15,7 -15,7 +15,7 @@@ obj-y :=      open.o read_write.o file_table
                 pnode.o splice.o sync.o utimes.o d_path.o \
                 stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
                 fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-               kernel_read_file.o mnt_idmapping.o remap_range.o
+               kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
   
   obj-$(CONFIG_BUFFER_HEAD)     += buffer.o mpage.o
   obj-$(CONFIG_PROC_FS)         += proc_namespace.o
@@@ -91,6 -91,7 +91,6 @@@ obj-y                         += unicode
   obj-$(CONFIG_SYSV_FS)         += sysv/
   obj-$(CONFIG_SMBFS)           += smb/
   obj-$(CONFIG_HPFS_FS)         += hpfs/
- -obj-$(CONFIG_NTFS_FS)         += ntfs/
   obj-$(CONFIG_NTFS3_FS)                += ntfs3/
   obj-$(CONFIG_UFS_FS)          += ufs/
   obj-$(CONFIG_EFS_FS)          += efs/
diff --combined fs/exec.c

index af4fbb61cd53e97c788387a0d8277d1ce5495d7d,ca0d53edac99a953918530d9fa01a818ccb6b35e..ece3ab0998e11ee3fb6f0e13d7766ae2c50b2968
--- 1/fs/exec.c
--- 2/fs/exec.c
+++ b/fs/exec.c
@@@ -128,7 -128,7 +128,7 @@@ SYSCALL_DEFINE1(uselib, const char __us
         struct filename *tmp = getname(library);
         int error = PTR_ERR(tmp);
         static const struct open_flags uselib_flags = {
- -              .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+ +              .open_flag = O_LARGEFILE | O_RDONLY,
                 .acc_mode = MAY_READ | MAY_EXEC,
                 .intent = LOOKUP_OPEN,
                 .lookup_flags = LOOKUP_FOLLOW,
@@@ -904,10 -904,6 +904,10 @@@ EXPORT_SYMBOL(transfer_args_to_stack)
   
   #endif /* CONFIG_MMU */
   
+ +/*
+ + * On success, caller must call do_close_execat() on the returned
+ + * struct file to close it.
+ + */
   static struct file *do_open_execat(int fd, struct filename *name, int flags)
   {
         struct file *file;
@@@ -952,17 -948,6 +952,17 @@@ exit
         return ERR_PTR(err);
   }
   
+ +/**
+ + * open_exec - Open a path name for execution
+ + *
+ + * @name: path name to open with the intent of executing it.
+ + *
+ + * Returns ERR_PTR on failure or allocated struct file on success.
+ + *
+ + * As this is a wrapper for the internal do_open_execat(), callers
+ + * must call allow_write_access() before fput() on release. Also see
+ + * do_close_execat().
+ + */
   struct file *open_exec(const char *name)
   {
         struct filename *filename = getname_kernel(name);
@@@ -1158,7 -1143,6 +1158,6 @@@ static int de_thread(struct task_struc
   
                 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                 leader->exit_state = EXIT_DEAD;
- 
                 /*
                  * We are going to release_task()->ptrace_unlink() silently,
                  * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
@@@ -1424,9 -1408,6 +1423,9 @@@ int begin_new_exec(struct linux_binprm 
   
   out_unlock:
         up_write(&me->signal->exec_update_lock);
+ +      if (!bprm->cred)
+ +              mutex_unlock(&me->signal->cred_guard_mutex);
+ +
   out:
         return retval;
   }
@@@ -1502,15 -1483,6 +1501,15 @@@ static int prepare_bprm_creds(struct li
         return -ENOMEM;
   }
   
+ +/* Matches do_open_execat() */
+ +static void do_close_execat(struct file *file)
+ +{
+ +      if (!file)
+ +              return;
+ +      allow_write_access(file);
+ +      fput(file);
+ +}
+ +
   static void free_bprm(struct linux_binprm *bprm)
   {
         if (bprm->mm) {
@@@ -1522,7 -1494,10 +1521,7 @@@
                 mutex_unlock(&current->signal->cred_guard_mutex);
                 abort_creds(bprm->cred);
         }
- -      if (bprm->file) {
- -              allow_write_access(bprm->file);
- -              fput(bprm->file);
- -      }
+ +      do_close_execat(bprm->file);
         if (bprm->executable)
                 fput(bprm->executable);
         /* If a binfmt changed the interp, free it. */
@@@ -1544,7 -1519,8 +1543,7 @@@ static struct linux_binprm *alloc_bprm(
   
         bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
         if (!bprm) {
- -              allow_write_access(file);
- -              fput(file);
+ +              do_close_execat(file);
                 return ERR_PTR(-ENOMEM);
         }
   
@@@ -1633,7 -1609,6 +1632,7 @@@ static void check_unsafe_exec(struct li
         }
         rcu_read_unlock();
   
+ +      /* "users" and "in_exec" locked for copy_fs() */
         if (p->fs->users > n_fs)
                 bprm->unsafe |= LSM_UNSAFE_SHARE;
         else
@@@ -1850,6 -1825,9 +1849,6 @@@ static int exec_binprm(struct linux_bin
         return 0;
   }
   
- -/*
- - * sys_execve() executes a new program.
- - */
   static int bprm_execve(struct linux_binprm *bprm)
   {
         int retval;
diff --combined fs/libfs.c

index 78c71a9e2e18b7aa4c365f825a459c6282f7ac63,65322e11bcdae27c7a52dd20b96072eb799e68fe..680c727d1bbc4ac8ff022638fa9098b788cf92ac
--- 1/fs/libfs.c
--- 2/fs/libfs.c
+++ b/fs/libfs.c
@@@ -23,6 -23,7 +23,7 @@@
   #include <linux/fsnotify.h>
   #include <linux/unicode.h>
   #include <linux/fscrypt.h>
+ #include <linux/pidfs.h>
   
   #include <linux/uaccess.h>
   
@@@ -240,22 -241,17 +241,22 @@@ const struct inode_operations simple_di
   };
   EXPORT_SYMBOL(simple_dir_inode_operations);
   
- -static void offset_set(struct dentry *dentry, u32 offset)
+ +/* 0 is '.', 1 is '..', so always start with offset 2 or more */
+ +enum {
+ +      DIR_OFFSET_MIN  = 2,
+ +};
+ +
+ +static void offset_set(struct dentry *dentry, long offset)
   {
- -      dentry->d_fsdata = (void *)((uintptr_t)(offset));
+ +      dentry->d_fsdata = (void *)offset;
   }
   
- -static u32 dentry2offset(struct dentry *dentry)
+ +static long dentry2offset(struct dentry *dentry)
   {
- -      return (u32)((uintptr_t)(dentry->d_fsdata));
+ +      return (long)dentry->d_fsdata;
   }
   
- -static struct lock_class_key simple_offset_xa_lock;
+ +static struct lock_class_key simple_offset_lock_class;
   
   /**
    * simple_offset_init - initialize an offset_ctx
@@@ -264,9 -260,11 +265,9 @@@
    */
   void simple_offset_init(struct offset_ctx *octx)
   {
- -      xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
- -      lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
- -
- -      /* 0 is '.', 1 is '..', so always start with offset 2 */
- -      octx->next_offset = 2;
+ +      mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
+ +      lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
+ +      octx->next_offset = DIR_OFFSET_MIN;
   }
   
   /**
@@@ -274,19 -272,20 +275,19 @@@
    * @octx: directory offset ctx to be updated
    * @dentry: new dentry being added
    *
- - * Returns zero on success. @so_ctx and the dentry offset are updated.
+ + * Returns zero on success. @octx and the dentry's offset are updated.
    * Otherwise, a negative errno value is returned.
    */
   int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
   {
- -      static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
- -      u32 offset;
+ +      unsigned long offset;
         int ret;
   
         if (dentry2offset(dentry) != 0)
                 return -EBUSY;
   
- -      ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
- -                            &octx->next_offset, GFP_KERNEL);
+ +      ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
+ +                               LONG_MAX, &octx->next_offset, GFP_KERNEL);
         if (ret < 0)
                 return ret;
   
@@@ -302,48 -301,16 +303,48 @@@
    */
   void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
   {
- -      u32 offset;
+ +      long offset;
   
         offset = dentry2offset(dentry);
         if (offset == 0)
                 return;
   
- -      xa_erase(&octx->xa, offset);
+ +      mtree_erase(&octx->mt, offset);
         offset_set(dentry, 0);
   }
   
+ +/**
+ + * simple_offset_empty - Check if a dentry can be unlinked
+ + * @dentry: dentry to be tested
+ + *
+ + * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
+ + */
+ +int simple_offset_empty(struct dentry *dentry)
+ +{
+ +      struct inode *inode = d_inode(dentry);
+ +      struct offset_ctx *octx;
+ +      struct dentry *child;
+ +      unsigned long index;
+ +      int ret = 1;
+ +
+ +      if (!inode || !S_ISDIR(inode->i_mode))
+ +              return ret;
+ +
+ +      index = DIR_OFFSET_MIN;
+ +      octx = inode->i_op->get_offset_ctx(inode);
+ +      mt_for_each(&octx->mt, child, index, LONG_MAX) {
+ +              spin_lock(&child->d_lock);
+ +              if (simple_positive(child)) {
+ +                      spin_unlock(&child->d_lock);
+ +                      ret = 0;
+ +                      break;
+ +              }
+ +              spin_unlock(&child->d_lock);
+ +      }
+ +
+ +      return ret;
+ +}
+ +
   /**
    * simple_offset_rename_exchange - exchange rename with directory offsets
    * @old_dir: parent of dentry being moved
@@@ -361,8 -328,8 +362,8 @@@ int simple_offset_rename_exchange(struc
   {
         struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
         struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
- -      u32 old_index = dentry2offset(old_dentry);
- -      u32 new_index = dentry2offset(new_dentry);
+ +      long old_index = dentry2offset(old_dentry);
+ +      long new_index = dentry2offset(new_dentry);
         int ret;
   
         simple_offset_remove(old_ctx, old_dentry);
@@@ -388,9 -355,9 +389,9 @@@
   
   out_restore:
         offset_set(old_dentry, old_index);
- -      xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+ +      mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL);
         offset_set(new_dentry, new_index);
- -      xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+ +      mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL);
         return ret;
   }
   
@@@ -403,7 -370,7 +404,7 @@@
    */
   void simple_offset_destroy(struct offset_ctx *octx)
   {
- -      xa_destroy(&octx->xa);
+ +      mtree_destroy(&octx->mt);
   }
   
   /**
@@@ -433,16 -400,15 +434,16 @@@ static loff_t offset_dir_llseek(struct 
   
         /* In this case, ->private_data is protected by f_pos_lock */
         file->private_data = NULL;
- -      return vfs_setpos(file, offset, U32_MAX);
+ +      return vfs_setpos(file, offset, LONG_MAX);
   }
   
- -static struct dentry *offset_find_next(struct xa_state *xas)
+ +static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
   {
+ +      MA_STATE(mas, &octx->mt, offset, offset);
         struct dentry *child, *found = NULL;
   
         rcu_read_lock();
- -      child = xas_next_entry(xas, U32_MAX);
+ +      child = mas_find(&mas, LONG_MAX);
         if (!child)
                 goto out;
         spin_lock(&child->d_lock);
@@@ -456,8 -422,8 +457,8 @@@ out
   
   static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
   {
- -      u32 offset = dentry2offset(dentry);
         struct inode *inode = d_inode(dentry);
+ +      long offset = dentry2offset(dentry);
   
         return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
                           inode->i_ino, fs_umode_to_dtype(inode->i_mode));
@@@ -465,11 -431,12 +466,11 @@@
   
   static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
   {
- -      struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
- -      XA_STATE(xas, &so_ctx->xa, ctx->pos);
+ +      struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
         struct dentry *dentry;
   
         while (true) {
- -              dentry = offset_find_next(&xas);
+ +              dentry = offset_find_next(octx, ctx->pos);
                 if (!dentry)
                         return ERR_PTR(-ENOENT);
   
@@@ -478,8 -445,8 +479,8 @@@
                         break;
                 }
   
+ +              ctx->pos = dentry2offset(dentry) + 1;
                 dput(dentry);
- -              ctx->pos = xas.xa_index + 1;
         }
         return NULL;
   }
@@@ -515,7 -482,7 +516,7 @@@ static int offset_readdir(struct file *
                 return 0;
   
         /* In this case, ->private_data is protected by f_pos_lock */
- -      if (ctx->pos == 2)
+ +      if (ctx->pos == DIR_OFFSET_MIN)
                 file->private_data = NULL;
         else if (file->private_data == ERR_PTR(-ENOENT))
                 return 0;
@@@ -1738,28 -1705,16 +1739,28 @@@ bool is_empty_dir_inode(struct inode *i
   static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                                 const char *str, const struct qstr *name)
   {
- -      const struct dentry *parent = READ_ONCE(dentry->d_parent);
- -      const struct inode *dir = READ_ONCE(parent->d_inode);
- -      const struct super_block *sb = dentry->d_sb;
- -      const struct unicode_map *um = sb->s_encoding;
- -      struct qstr qstr = QSTR_INIT(str, len);
+ +      const struct dentry *parent;
+ +      const struct inode *dir;
         char strbuf[DNAME_INLINE_LEN];
- -      int ret;
+ +      struct qstr qstr;
+ +
+ +      /*
+ +       * Attempt a case-sensitive match first. It is cheaper and
+ +       * should cover most lookups, including all the sane
+ +       * applications that expect a case-sensitive filesystem.
+ +       *
+ +       * This comparison is safe under RCU because the caller
+ +       * guarantees the consistency between str and len. See
+ +       * __d_lookup_rcu_op_compare() for details.
+ +       */
+ +      if (len == name->len && !memcmp(str, name->name, len))
+ +              return 0;
   
+ +      parent = READ_ONCE(dentry->d_parent);
+ +      dir = READ_ONCE(parent->d_inode);
         if (!dir || !IS_CASEFOLDED(dir))
- -              goto fallback;
+ +              return 1;
+ +
         /*
          * If the dentry name is stored in-line, then it may be concurrently
          * modified by a rename.  If this happens, the VFS will eventually retry
@@@ -1770,14 -1725,20 +1771,14 @@@
         if (len <= DNAME_INLINE_LEN - 1) {
                 memcpy(strbuf, str, len);
                 strbuf[len] = 0;
- -              qstr.name = strbuf;
+ +              str = strbuf;
                 /* prevent compiler from optimizing out the temporary buffer */
                 barrier();
         }
- -      ret = utf8_strncasecmp(um, name, &qstr);
- -      if (ret >= 0)
- -              return ret;
+ +      qstr.len = len;
+ +      qstr.name = str;
   
- -      if (sb_has_strict_encoding(sb))
- -              return -EINVAL;
- -fallback:
- -      if (len != name->len)
- -              return 1;
- -      return !!memcmp(str, name->name, len);
+ +      return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
   }
   
   /**
@@@ -1792,7 -1753,7 +1793,7 @@@ static int generic_ci_d_hash(const stru
         const struct inode *dir = READ_ONCE(dentry->d_inode);
         struct super_block *sb = dentry->d_sb;
         const struct unicode_map *um = sb->s_encoding;
- -      int ret = 0;
+ +      int ret;
   
         if (!dir || !IS_CASEFOLDED(dir))
                 return 0;
@@@ -1806,45 -1767,73 +1807,45 @@@
   static const struct dentry_operations generic_ci_dentry_ops = {
         .d_hash = generic_ci_d_hash,
         .d_compare = generic_ci_d_compare,
- -};
- -#endif
- -
   #ifdef CONFIG_FS_ENCRYPTION
- -static const struct dentry_operations generic_encrypted_dentry_ops = {
         .d_revalidate = fscrypt_d_revalidate,
+ +#endif
   };
   #endif
   
- -#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
- -static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
- -      .d_hash = generic_ci_d_hash,
- -      .d_compare = generic_ci_d_compare,
+ +#ifdef CONFIG_FS_ENCRYPTION
+ +static const struct dentry_operations generic_encrypted_dentry_ops = {
         .d_revalidate = fscrypt_d_revalidate,
   };
   #endif
   
   /**
- - * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
- - * @dentry:   dentry to set ops on
+ + * generic_set_sb_d_ops - helper for choosing the set of
+ + * filesystem-wide dentry operations for the enabled features
+ + * @sb: superblock to be configured
    *
- - * Casefolded directories need d_hash and d_compare set, so that the dentries
- - * contained in them are handled case-insensitively.  Note that these operations
- - * are needed on the parent directory rather than on the dentries in it, and
- - * while the casefolding flag can be toggled on and off on an empty directory,
- - * dentry_operations can't be changed later.  As a result, if the filesystem has
- - * casefolding support enabled at all, we have to give all dentries the
- - * casefolding operations even if their inode doesn't have the casefolding flag
- - * currently (and thus the casefolding ops would be no-ops for now).
- - *
- - * Encryption works differently in that the only dentry operation it needs is
- - * d_revalidate, which it only needs on dentries that have the no-key name flag.
- - * The no-key flag can't be set "later", so we don't have to worry about that.
- - *
- - * Finally, to maximize compatibility with overlayfs (which isn't compatible
- - * with certain dentry operations) and to avoid taking an unnecessary
- - * performance hit, we use custom dentry_operations for each possible
- - * combination rather than always installing all operations.
+ + * Filesystems supporting casefolding and/or fscrypt can call this
+ + * helper at mount-time to configure sb->s_d_op to the best set of dentry
+ + * operations required for the enabled features. The helper must be
+ + * called after these have been configured, but before the root dentry
+ + * is created.
    */
- -void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
+ +void generic_set_sb_d_ops(struct super_block *sb)
   {
- -#ifdef CONFIG_FS_ENCRYPTION
- -      bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
- -#endif
   #if IS_ENABLED(CONFIG_UNICODE)
- -      bool needs_ci_ops = dentry->d_sb->s_encoding;
- -#endif
- -#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
- -      if (needs_encrypt_ops && needs_ci_ops) {
- -              d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
+ +      if (sb->s_encoding) {
+ +              sb->s_d_op = &generic_ci_dentry_ops;
                 return;
         }
   #endif
   #ifdef CONFIG_FS_ENCRYPTION
- -      if (needs_encrypt_ops) {
- -              d_set_d_op(dentry, &generic_encrypted_dentry_ops);
- -              return;
- -      }
- -#endif
- -#if IS_ENABLED(CONFIG_UNICODE)
- -      if (needs_ci_ops) {
- -              d_set_d_op(dentry, &generic_ci_dentry_ops);
+ +      if (sb->s_cop) {
+ +              sb->s_d_op = &generic_encrypted_dentry_ops;
                 return;
         }
   #endif
   }
- -EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+ +EXPORT_SYMBOL(generic_set_sb_d_ops);
   
   /**
    * inode_maybe_inc_iversion - increments i_version
@@@ -1985,3 -1974,144 +1986,144 @@@ struct timespec64 simple_inode_init_ts(
         return ts;
   }
   EXPORT_SYMBOL(simple_inode_init_ts);
+ 
+ static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
+ {
+       struct dentry *dentry;
+ 
+       guard(rcu)();
+       dentry = READ_ONCE(stashed);
+       if (!dentry)
+               return NULL;
+       if (!lockref_get_not_dead(&dentry->d_lockref))
+               return NULL;
+       return dentry;
+ }
+ 
+ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
+                                         unsigned long ino,
+                                         struct super_block *sb,
+                                         void *data)
+ {
+       struct dentry *dentry;
+       struct inode *inode;
+       const struct stashed_operations *sops = sb->s_fs_info;
+ 
+       dentry = d_alloc_anon(sb);
+       if (!dentry)
+               return ERR_PTR(-ENOMEM);
+ 
+       inode = new_inode_pseudo(sb);
+       if (!inode) {
+               dput(dentry);
+               return ERR_PTR(-ENOMEM);
+       }
+ 
+       inode->i_ino = ino;
+       inode->i_flags |= S_IMMUTABLE;
+       inode->i_mode = S_IFREG;
+       simple_inode_init_ts(inode);
+       sops->init_inode(inode, data);
+ 
+       /* Notice when this is changed. */
+       WARN_ON_ONCE(!S_ISREG(inode->i_mode));
+       WARN_ON_ONCE(!IS_IMMUTABLE(inode));
+ 
+       /* Store address of location where dentry's supposed to be stashed. */
+       dentry->d_fsdata = stashed;
+ 
+       /* @data is now owned by the fs */
+       d_instantiate(dentry, inode);
+       return dentry;
+ }
+ 
+ static struct dentry *stash_dentry(struct dentry **stashed,
+                                  struct dentry *dentry)
+ {
+       guard(rcu)();
+       for (;;) {
+               struct dentry *old;
+ 
+               /* Assume any old dentry was cleared out. */
+               old = cmpxchg(stashed, NULL, dentry);
+               if (likely(!old))
+                       return dentry;
+ 
+               /* Check if somebody else installed a reusable dentry. */
+               if (lockref_get_not_dead(&old->d_lockref))
+                       return old;
+ 
+               /* There's an old dead dentry there, try to take it over. */
+               if (likely(try_cmpxchg(stashed, &old, dentry)))
+                       return dentry;
+       }
+ }
+ 
+ /**
+  * path_from_stashed - create path from stashed or new dentry
+  * @stashed:    where to retrieve or stash dentry
+  * @ino:        inode number to use
+  * @mnt:        mnt of the filesystem to use
+  * @data:       data to store in inode->i_private
+  * @path:       path to create
+  *
+  * The function tries to retrieve a stashed dentry from @stashed. If the dentry
+  * is still valid then it will be reused. If the dentry isn't reusable the function
+  * will allocate a new dentry and inode. It will then check again whether it
+  * can reuse an existing dentry in case one has been added in the meantime or
+  * update @stashed with the newly added dentry.
+  *
+  * Special-purpose helper for nsfs and pidfs.
+  *
+  * Return: On success zero and on failure a negative error is returned.
+  */
+ int path_from_stashed(struct dentry **stashed, unsigned long ino,
+                     struct vfsmount *mnt, void *data, struct path *path)
+ {
+       struct dentry *dentry;
+       const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
+ 
+       /* See if dentry can be reused. */
+       path->dentry = get_stashed_dentry(*stashed);
+       if (path->dentry) {
+               sops->put_data(data);
+               goto out_path;
+       }
+ 
+       /* Allocate a new dentry. */
+       dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data);
+       if (IS_ERR(dentry)) {
+               sops->put_data(data);
+               return PTR_ERR(dentry);
+       }
+ 
+       /* Added a new dentry. @data is now owned by the filesystem. */
+       path->dentry = stash_dentry(stashed, dentry);
+       if (path->dentry != dentry)
+               dput(dentry);
+ 
+ out_path:
+       WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+       WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+       path->mnt = mntget(mnt);
+       return 0;
+ }
+ 
+ void stashed_dentry_prune(struct dentry *dentry)
+ {
+       struct dentry **stashed = dentry->d_fsdata;
+       struct inode *inode = d_inode(dentry);
+ 
+       if (WARN_ON_ONCE(!stashed))
+               return;
+ 
+       if (!inode)
+               return;
+ 
+       /*
+        * Only replace our own @dentry as someone else might've
+        * already cleared out @dentry and stashed their own
+        * dentry in there.
+        */
+       cmpxchg(stashed, dentry, NULL);
+ }
diff --combined kernel/exit.c

index dfb963d2f862ada6a2f06259c6df4923272cb218,0e2f5dec91fb1f2caf3158e7302e16c8b97439c3..41a12630cbbc9cd80b6b5a154041c514b46ad3fe
--- 1/kernel/exit.c
--- 2/kernel/exit.c
+++ b/kernel/exit.c
@@@ -739,6 -739,13 +739,13 @@@ static void exit_notify(struct task_str
                 kill_orphaned_pgrp(tsk->group_leader, NULL);
   
         tsk->exit_state = EXIT_ZOMBIE;
+       /*
+        * sub-thread or delay_group_leader(), wake up the
+        * PIDFD_THREAD waiters.
+        */
+       if (!thread_group_empty(tsk))
+               do_notify_pidfd(tsk);
+ 
         if (unlikely(tsk->ptrace)) {
                 int sig = thread_group_leader(tsk) &&
                                 thread_group_empty(tsk) &&
@@@ -1127,14 -1134,17 +1134,14 @@@ static int wait_task_zombie(struct wait
                  * and nobody can change them.
                  *
                  * psig->stats_lock also protects us from our sub-threads
- -               * which can reap other children at the same time. Until
- -               * we change k_getrusage()-like users to rely on this lock
- -               * we have to take ->siglock as well.
+ +               * which can reap other children at the same time.
                  *
                  * We use thread_group_cputime_adjusted() to get times for
                  * the thread group, which consolidates times for all threads
                  * in the group including the group leader.
                  */
                 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- -              spin_lock_irq(&current->sighand->siglock);
- -              write_seqlock(&psig->stats_lock);
+ +              write_seqlock_irq(&psig->stats_lock);
                 psig->cutime += tgutime + sig->cutime;
                 psig->cstime += tgstime + sig->cstime;
                 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@@ -1157,7 -1167,8 +1164,7 @@@
                         psig->cmaxrss = maxrss;
                 task_io_accounting_add(&psig->ioac, &p->ioac);
                 task_io_accounting_add(&psig->ioac, &sig->ioac);
- -              write_sequnlock(&psig->stats_lock);
- -              spin_unlock_irq(&current->sighand->siglock);
+ +              write_sequnlock_irq(&psig->stats_lock);
         }
   
         if (wo->wo_rusage)
@@@ -1889,30 -1900,6 +1896,6 @@@ Efault
   }
   #endif
   
- /**
-  * thread_group_exited - check that a thread group has exited
-  * @pid: tgid of thread group to be checked.
-  *
-  * Test if the thread group represented by tgid has exited (all
-  * threads are zombies, dead or completely gone).
-  *
-  * Return: true if the thread group has exited. false otherwise.
-  */
- bool thread_group_exited(struct pid *pid)
- {
-       struct task_struct *task;
-       bool exited;
- 
-       rcu_read_lock();
-       task = pid_task(pid, PIDTYPE_PID);
-       exited = !task ||
-               (READ_ONCE(task->exit_state) && thread_group_empty(task));
-       rcu_read_unlock();
- 
-       return exited;
- }
- EXPORT_SYMBOL(thread_group_exited);
- 
   /*
    * This needs to be __function_aligned as GCC implicitly makes any
    * implementation of abort() cold and drops alignment specified by
diff --combined kernel/fork.c

index 0d944e92a43ffa13bdbcce6c6a28c44bab29ca19,2f839c290dcf2967ede43ed6d1fe039daa015bc2..1af8dfd149ee03fa60aad39981a1a6d97e8c2d78
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -101,6 -101,8 +101,8 @@@
   #include <linux/user_events.h>
   #include <linux/iommu.h>
   #include <linux/rseq.h>
+ #include <uapi/linux/pidfd.h>
+ #include <linux/pidfs.h>
   
   #include <asm/pgalloc.h>
   #include <linux/uaccess.h>
@@@ -1748,7 -1750,6 +1750,7 @@@ static int copy_fs(unsigned long clone_
         if (clone_flags & CLONE_FS) {
                 /* tsk->fs is already what we want */
                 spin_lock(&fs->lock);
+ +              /* "users" and "in_exec" locked for check_unsafe_exec() */
                 if (fs->in_exec) {
                         spin_unlock(&fs->lock);
                         return -EAGAIN;
@@@ -1985,119 -1986,6 +1987,6 @@@ static inline void rcu_copy_process(str
   #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
   }
   
- struct pid *pidfd_pid(const struct file *file)
- {
-       if (file->f_op == &pidfd_fops)
-               return file->private_data;
- 
-       return ERR_PTR(-EBADF);
- }
- 
- static int pidfd_release(struct inode *inode, struct file *file)
- {
-       struct pid *pid = file->private_data;
- 
-       file->private_data = NULL;
-       put_pid(pid);
-       return 0;
- }
- 
- #ifdef CONFIG_PROC_FS
- /**
-  * pidfd_show_fdinfo - print information about a pidfd
-  * @m: proc fdinfo file
-  * @f: file referencing a pidfd
-  *
-  * Pid:
-  * This function will print the pid that a given pidfd refers to in the
-  * pid namespace of the procfs instance.
-  * If the pid namespace of the process is not a descendant of the pid
-  * namespace of the procfs instance 0 will be shown as its pid. This is
-  * similar to calling getppid() on a process whose parent is outside of
-  * its pid namespace.
-  *
-  * NSpid:
-  * If pid namespaces are supported then this function will also print
-  * the pid of a given pidfd refers to for all descendant pid namespaces
-  * starting from the current pid namespace of the instance, i.e. the
-  * Pid field and the first entry in the NSpid field will be identical.
-  * If the pid namespace of the process is not a descendant of the pid
-  * namespace of the procfs instance 0 will be shown as its first NSpid
-  * entry and no others will be shown.
-  * Note that this differs from the Pid and NSpid fields in
-  * /proc/<pid>/status where Pid and NSpid are always shown relative to
-  * the  pid namespace of the procfs instance. The difference becomes
-  * obvious when sending around a pidfd between pid namespaces from a
-  * different branch of the tree, i.e. where no ancestral relation is
-  * present between the pid namespaces:
-  * - create two new pid namespaces ns1 and ns2 in the initial pid
-  *   namespace (also take care to create new mount namespaces in the
-  *   new pid namespace and mount procfs)
-  * - create a process with a pidfd in ns1
-  * - send pidfd from ns1 to ns2
-  * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
-  *   have exactly one entry, which is 0
-  */
- static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
- {
-       struct pid *pid = f->private_data;
-       struct pid_namespace *ns;
-       pid_t nr = -1;
- 
-       if (likely(pid_has_task(pid, PIDTYPE_PID))) {
-               ns = proc_pid_ns(file_inode(m->file)->i_sb);
-               nr = pid_nr_ns(pid, ns);
-       }
- 
-       seq_put_decimal_ll(m, "Pid:\t", nr);
- 
- #ifdef CONFIG_PID_NS
-       seq_put_decimal_ll(m, "\nNSpid:\t", nr);
-       if (nr > 0) {
-               int i;
- 
-               /* If nr is non-zero it means that 'pid' is valid and that
-                * ns, i.e. the pid namespace associated with the procfs
-                * instance, is in the pid namespace hierarchy of pid.
-                * Start at one below the already printed level.
-                */
-               for (i = ns->level + 1; i <= pid->level; i++)
-                       seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
-       }
- #endif
-       seq_putc(m, '\n');
- }
- #endif
- 
- /*
-  * Poll support for process exit notification.
-  */
- static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
- {
-       struct pid *pid = file->private_data;
-       __poll_t poll_flags = 0;
- 
-       poll_wait(file, &pid->wait_pidfd, pts);
- 
-       /*
-        * Inform pollers only when the whole thread group exits.
-        * If the thread group leader exits before all other threads in the
-        * group, then poll(2) should block, similar to the wait(2) family.
-        */
-       if (thread_group_exited(pid))
-               poll_flags = EPOLLIN | EPOLLRDNORM;
- 
-       return poll_flags;
- }
- 
- const struct file_operations pidfd_fops = {
-       .release = pidfd_release,
-       .poll = pidfd_poll,
- #ifdef CONFIG_PROC_FS
-       .show_fdinfo = pidfd_show_fdinfo,
- #endif
- };
- 
   /**
    * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
    * @pid:   the struct pid for which to create a pidfd
@@@ -2131,20 -2019,20 +2020,20 @@@ static int __pidfd_prepare(struct pid *
         int pidfd;
         struct file *pidfd_file;
   
-       if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
-               return -EINVAL;
- 
-       pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+       pidfd = get_unused_fd_flags(O_CLOEXEC);
         if (pidfd < 0)
                 return pidfd;
   
-       pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
-                                       flags | O_RDWR | O_CLOEXEC);
+       pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
         if (IS_ERR(pidfd_file)) {
                 put_unused_fd(pidfd);
                 return PTR_ERR(pidfd_file);
         }
-       get_pid(pid); /* held by pidfd_file now */
+       /*
+        * anon_inode_getfile() ignores everything outside of the
+        * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
+        */
+       pidfd_file->f_flags |= (flags & PIDFD_THREAD);
         *ret = pidfd_file;
         return pidfd;
   }
@@@ -2158,7 -2046,8 +2047,8 @@@
    * Allocate a new file that stashes @pid and reserve a new pidfd number in the
    * caller's file descriptor table. The pidfd is reserved but not installed yet.
    *
-  * The helper verifies that @pid is used as a thread group leader.
+  * The helper verifies that @pid is still in use; without PIDFD_THREAD the
+  * task identified by @pid must be a thread-group leader.
    *
    * If this function returns successfully the caller is responsible to either
    * call fd_install() passing the returned pidfd and pidfd file as arguments in
@@@ -2177,7 -2066,9 +2067,9 @@@
    */
   int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
   {
-       if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+       bool thread = flags & PIDFD_THREAD;
+ 
+       if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
                 return -EINVAL;
   
         return __pidfd_prepare(pid, flags, ret);
@@@ -2299,9 -2190,8 +2191,8 @@@ __latent_entropy struct task_struct *co
                 /*
                  * - CLONE_DETACHED is blocked so that we can potentially
                  *   reuse it later for CLONE_PIDFD.
-                * - CLONE_THREAD is blocked until someone really needs it.
                  */
-               if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+               if (clone_flags & CLONE_DETACHED)
                         return ERR_PTR(-EINVAL);
         }
   
@@@ -2524,8 -2414,10 +2415,10 @@@
          * if the fd table isn't shared).
          */
         if (clone_flags & CLONE_PIDFD) {
+               int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+ 
                 /* Note that no task has been attached to @pid yet. */
-               retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
+               retval = __pidfd_prepare(pid, flags, &pidfile);
                 if (retval < 0)
                         goto bad_fork_free_pid;
                 pidfd = retval;
@@@ -2876,8 -2768,8 +2769,8 @@@ pid_t kernel_clone(struct kernel_clone_
          * here has the advantage that we don't need to have a separate helper
          * to check for legacy clone().
          */
-       if ((args->flags & CLONE_PIDFD) &&
-           (args->flags & CLONE_PARENT_SETTID) &&
+       if ((clone_flags & CLONE_PIDFD) &&
+           (clone_flags & CLONE_PARENT_SETTID) &&
             (args->pidfd == args->parent_tid))
                 return -EINVAL;
author	Linus Torvalds <[email protected]>
	Mon, 11 Mar 2024 17:21:06 +0000 (10:21 -0700)
committer	Linus Torvalds <[email protected]>
	Mon, 11 Mar 2024 17:21:06 +0000 (10:21 -0700)
		1	2
fs/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
fs/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/libfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/exit.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history