Merge tag 'execve-v6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees...

author Linus Torvalds <[email protected]>

Tue, 31 Oct 2023 05:28:19 +0000 (19:28 -1000)

committer Linus Torvalds <[email protected]>

Tue, 31 Oct 2023 05:28:19 +0000 (19:28 -1000)
author Linus Torvalds <[email protected]>
Tue, 31 Oct 2023 05:28:19 +0000 (19:28 -1000)
committer Linus Torvalds <[email protected]>
Tue, 31 Oct 2023 05:28:19 +0000 (19:28 -1000)
diff --combined fs/binfmt_elf_fdpic.c

index 206812ce544aebbf131c5647808774ceb5afb015,97c3e8551aacc87db35ad516e6572f46cbcb4fd5..fefc642541cbe421af17506458af4d225c736374
--- 1/fs/binfmt_elf_fdpic.c
--- 2/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@@ -345,9 -345,10 +345,9 @@@ static int load_elf_fdpic_binary(struc
         /* there's now no turning back... the old userspace image is dead,
          * defunct, deceased, etc.
          */
+ +      SET_PERSONALITY(exec_params.hdr);
         if (elf_check_fdpic(&exec_params.hdr))
- -              set_personality(PER_LINUX_FDPIC);
- -      else
- -              set_personality(PER_LINUX);
+ +              current->personality |= PER_LINUX_FDPIC;
         if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
                 current->personality |= READ_IMPLIES_EXEC;
   
@@@ -899,10 -900,12 +899,12 @@@ static int elf_fdpic_map_file(struct el
         kdebug("- DYNAMIC[]: %lx", params->dynamic_addr);
         seg = loadmap->segs;
         for (loop = 0; loop < loadmap->nsegs; loop++, seg++)
-               kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]",
+               kdebug("- LOAD[%d] : %08llx-%08llx [va=%llx ms=%llx]",
                        loop,
-                      seg->addr, seg->addr + seg->p_memsz - 1,
-                      seg->p_vaddr, seg->p_memsz);
+                      (unsigned long long) seg->addr,
+                      (unsigned long long) seg->addr + seg->p_memsz - 1,
+                      (unsigned long long) seg->p_vaddr,
+                      (unsigned long long) seg->p_memsz);
   
         return 0;
   
@@@ -1081,9 -1084,10 +1083,10 @@@ static int elf_fdpic_map_file_by_direct
                 maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
                                 phdr->p_offset - disp);
   
-               kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
-                      loop, phdr->p_memsz + disp, prot, flags,
-                      phdr->p_offset - disp, maddr);
+               kdebug("mmap[%d] <file> sz=%llx pr=%x fl=%x of=%llx --> %08lx",
+                      loop, (unsigned long long) phdr->p_memsz + disp,
+                      prot, flags, (unsigned long long) phdr->p_offset - disp,
+                      maddr);
   
                 if (IS_ERR_VALUE(maddr))
                         return (int) maddr;
@@@ -1145,8 -1149,9 +1148,9 @@@
   
   #else
                 if (excess > 0) {
-                       kdebug("clear[%d] ad=%lx sz=%lx",
-                              loop, maddr + phdr->p_filesz, excess);
+                       kdebug("clear[%d] ad=%llx sz=%lx", loop,
+                              (unsigned long long) maddr + phdr->p_filesz,
+                              excess);
                         if (clear_user((void *) maddr + phdr->p_filesz, excess))
                                 return -EFAULT;
                 }
diff --combined fs/binfmt_misc.c

index 5d2be9b0a0a597c8ea027ba2d8d6fcb4ea41f1fa,deacc105119da42d8073075d5a54a71140be354f..68fa225f89e54d10af16e5a9fb94a109ea556507
--- 1/fs/binfmt_misc.c
--- 2/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@@ -40,9 -40,6 +40,6 @@@ enum 
         VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
   };
   
- static LIST_HEAD(entries);
- static int enabled = 1;
- 
   enum {Enabled, Magic};
   #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
   #define MISC_FMT_OPEN_BINARY (1UL << 30)
@@@ -60,12 -57,10 +57,10 @@@ typedef struct 
         char *name;
         struct dentry *dentry;
         struct file *interp_file;
+       refcount_t users;               /* sync removal with load_misc_binary() */
   } Node;
   
- static DEFINE_RWLOCK(entries_lock);
   static struct file_system_type bm_fs_type;
- static struct vfsmount *bm_mnt;
- static int entry_count;
   
   /*
    * Max length of the register string.  Determined by:
@@@ -82,19 -77,24 +77,24 @@@
    */
   #define MAX_REGISTER_LENGTH 1920
   
- /*
-  * Check if we support the binfmt
-  * if we do, return the node, else NULL
-  * locking is done in load_misc_binary
+ /**
+  * search_binfmt_handler - search for a binary handler for @bprm
+  * @misc: handle to binfmt_misc instance
+  * @bprm: binary for which we are looking for a handler
+  *
+  * Search for a binary type handler for @bprm in the list of registered binary
+  * type handlers.
+  *
+  * Return: binary type list entry on success, NULL on failure
    */
- static Node *check_file(struct linux_binprm *bprm)
+ static Node *search_binfmt_handler(struct binfmt_misc *misc,
+                                  struct linux_binprm *bprm)
   {
         char *p = strrchr(bprm->interp, '.');
-       struct list_head *l;
+       Node *e;
   
         /* Walk all the registered handlers. */
-       list_for_each(l, &entries) {
-               Node *e = list_entry(l, Node, list);
+       list_for_each_entry(e, &misc->entries, list) {
                 char *s;
                 int j;
   
@@@ -123,9 -123,79 +123,79 @@@
                 if (j == e->size)
                         return e;
         }
+ 
         return NULL;
   }
   
+ /**
+  * get_binfmt_handler - try to find a binary type handler
+  * @misc: handle to binfmt_misc instance
+  * @bprm: binary for which we are looking for a handler
+  *
+  * Try to find a binfmt handler for the binary type. If one is found take a
+  * reference to protect against removal via bm_{entry,status}_write().
+  *
+  * Return: binary type list entry on success, NULL on failure
+  */
+ static Node *get_binfmt_handler(struct binfmt_misc *misc,
+                               struct linux_binprm *bprm)
+ {
+       Node *e;
+ 
+       read_lock(&misc->entries_lock);
+       e = search_binfmt_handler(misc, bprm);
+       if (e)
+               refcount_inc(&e->users);
+       read_unlock(&misc->entries_lock);
+       return e;
+ }
+ 
+ /**
+  * put_binfmt_handler - put binary handler node
+  * @e: node to put
+  *
+  * Free node syncing with load_misc_binary() and defer final free to
+  * load_misc_binary() in case it is using the binary type handler we were
+  * requested to remove.
+  */
+ static void put_binfmt_handler(Node *e)
+ {
+       if (refcount_dec_and_test(&e->users)) {
+               if (e->flags & MISC_FMT_OPEN_FILE)
+                       filp_close(e->interp_file, NULL);
+               kfree(e);
+       }
+ }
+ 
+ /**
+  * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
+  *
+  * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
+  * If a user namespace doesn't have its own binfmt_misc mount it can make use
+  * of its ancestor's binfmt_misc handlers. This mimics the behavior of
+  * pre-namespaced binfmt_misc where all registered binfmt_misc handlers were
+  * available to all users and user namespaces on the system.
+  *
+  * Return: the binfmt_misc instance of the caller's user namespace
+  */
+ static struct binfmt_misc *load_binfmt_misc(void)
+ {
+       const struct user_namespace *user_ns;
+       struct binfmt_misc *misc;
+ 
+       user_ns = current_user_ns();
+       while (user_ns) {
+               /* Pairs with smp_store_release() in bm_fill_super(). */
+               misc = smp_load_acquire(&user_ns->binfmt_misc);
+               if (misc)
+                       return misc;
+ 
+               user_ns = user_ns->parent;
+       }
+ 
+       return &init_binfmt_misc;
+ }
+ 
   /*
    * the loader itself
    */
@@@ -133,18 -203,14 +203,14 @@@ static int load_misc_binary(struct linu
   {
         Node *fmt;
         struct file *interp_file = NULL;
-       int retval;
+       int retval = -ENOEXEC;
+       struct binfmt_misc *misc;
   
-       retval = -ENOEXEC;
-       if (!enabled)
+       misc = load_binfmt_misc();
+       if (!misc->enabled)
                 return retval;
   
-       /* to keep locking time low, we copy the interpreter string */
-       read_lock(&entries_lock);
-       fmt = check_file(bprm);
-       if (fmt)
-               dget(fmt->dentry);
-       read_unlock(&entries_lock);
+       fmt = get_binfmt_handler(misc, bprm);
         if (!fmt)
                 return retval;
   
@@@ -198,7 -264,16 +264,16 @@@
   
         retval = 0;
   ret:
-       dput(fmt->dentry);
+ 
+       /*
+        * If we actually put the node here all concurrent calls to
+        * load_misc_binary() will have finished. We also know
+        * that for the refcount to be zero someone must have concurrently
+        * removed the binary type handler from the list and it's our job to
+        * free it.
+        */
+       put_binfmt_handler(fmt);
+ 
         return retval;
   }
   
@@@ -287,7 -362,7 +362,7 @@@ static Node *create_entry(const char __
   
         err = -ENOMEM;
         memsize = sizeof(Node) + count + 8;
-       e = kmalloc(memsize, GFP_KERNEL);
+       e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
         if (!e)
                 goto out;
   
@@@ -399,7 -474,7 +474,7 @@@
   
                         if (e->mask) {
                                 int i;
-                               char *masked = kmalloc(e->size, GFP_KERNEL);
+                               char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);
   
                                 print_hex_dump_bytes(
                                         KBUILD_MODNAME ": register:  mask[decoded]: ",
@@@ -547,35 -622,114 +622,114 @@@ static struct inode *bm_get_inode(struc
         if (inode) {
                 inode->i_ino = get_next_ino();
                 inode->i_mode = mode;
- -              inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ +              simple_inode_init_ts(inode);
         }
         return inode;
   }
   
+ /**
+  * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
+  * @inode: inode of the relevant binfmt_misc instance
+  *
+  * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
+  * be done without any memory barriers because we are guaranteed that
+  * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
+  * binfmt_misc mount was first created.
+  *
+  * Return: struct binfmt_misc of the relevant binfmt_misc instance
+  */
+ static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
+ {
+       return inode->i_sb->s_user_ns->binfmt_misc;
+ }
+ 
+ /**
+  * bm_evict_inode - cleanup data associated with @inode
+  * @inode: inode to which the data is attached
+  *
+  * Cleanup the binary type handler data associated with @inode if a binary type
+  * entry is removed or the filesystem is unmounted and the super block is
+  * shutdown.
+  *
+  * If the ->evict call was not caused by a super block shutdown but by a write
+  * to remove the entry or all entries via bm_{entry,status}_write() the entry
+  * will have already been removed from the list. We keep the list_empty() check
+  * to make that explicit.
+ */
   static void bm_evict_inode(struct inode *inode)
   {
         Node *e = inode->i_private;
   
-       if (e && e->flags & MISC_FMT_OPEN_FILE)
-               filp_close(e->interp_file, NULL);
- 
         clear_inode(inode);
-       kfree(e);
+ 
+       if (e) {
+               struct binfmt_misc *misc;
+ 
+               misc = i_binfmt_misc(inode);
+               write_lock(&misc->entries_lock);
+               if (!list_empty(&e->list))
+                       list_del_init(&e->list);
+               write_unlock(&misc->entries_lock);
+               put_binfmt_handler(e);
+       }
   }
   
- static void kill_node(Node *e)
+ /**
+  * unlink_binfmt_dentry - remove the dentry for the binary type handler
+  * @dentry: dentry associated with the binary type handler
+  *
+  * Do the actual filesystem work to remove a dentry for a registered binary
+  * type handler. Since binfmt_misc only allows simple files to be created
+  * directly under the root dentry of the filesystem we ensure that we are
+  * indeed passed a dentry directly beneath the root dentry, that the inode
+  * associated with the root dentry is locked, and that it is a regular file we
+  * are asked to remove.
+  */
+ static void unlink_binfmt_dentry(struct dentry *dentry)
   {
-       struct dentry *dentry;
+       struct dentry *parent = dentry->d_parent;
+       struct inode *inode, *parent_inode;
   
-       write_lock(&entries_lock);
-       list_del_init(&e->list);
-       write_unlock(&entries_lock);
+       /* All entries are immediate descendants of the root dentry. */
+       if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
+               return;
   
-       dentry = e->dentry;
-       drop_nlink(d_inode(dentry));
-       d_drop(dentry);
-       dput(dentry);
-       simple_release_fs(&bm_mnt, &entry_count);
+       /* We only expect to be called on regular files. */
+       inode = d_inode(dentry);
+       if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
+               return;
+ 
+       /* The parent inode must be locked. */
+       parent_inode = d_inode(parent);
+       if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
+               return;
+ 
+       if (simple_positive(dentry)) {
+               dget(dentry);
+               simple_unlink(parent_inode, dentry);
+               d_delete(dentry);
+               dput(dentry);
+       }
+ }
+ 
+ /**
+  * remove_binfmt_handler - remove a binary type handler
+  * @misc: handle to binfmt_misc instance
+  * @e: binary type handler to remove
+  *
+  * Remove a binary type handler from the list of binary type handlers and
+  * remove its associated dentry. This is called from
+  * bm_{entry,status}_write(). In the future, we might want to think about
+  * adding a proper ->unlink() method to binfmt_misc instead of forcing callers
+  * to use writes to files in order to delete binary type handlers. But it has
+  * worked for so long that it's not a pressing issue.
+  */
+ static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
+ {
+       write_lock(&misc->entries_lock);
+       list_del_init(&e->list);
+       write_unlock(&misc->entries_lock);
+       unlink_binfmt_dentry(e->dentry);
   }
   
   /* /<entry> */
@@@ -602,8 -756,8 +756,8 @@@ bm_entry_read(struct file *file, char _
   static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
                                 size_t count, loff_t *ppos)
   {
-       struct dentry *root;
-       Node *e = file_inode(file)->i_private;
+       struct inode *inode = file_inode(file);
+       Node *e = inode->i_private;
         int res = parse_command(buffer, count);
   
         switch (res) {
@@@ -617,13 -771,22 +771,22 @@@
                 break;
         case 3:
                 /* Delete this handler. */
-               root = file_inode(file)->i_sb->s_root;
-               inode_lock(d_inode(root));
+               inode = d_inode(inode->i_sb->s_root);
+               inode_lock(inode);
   
+               /*
+                * In order to add new elements or remove elements from the list
+                * via bm_{entry,register,status}_write() inode_lock() on the
+                * root inode must be held.
+                * The lock is exclusive ensuring that the list can't be
+                * modified. Only load_misc_binary() can access but does so
+                * read-only. So we only need to take the write lock when we
+                * actually remove the entry from the list.
+                */
                 if (!list_empty(&e->list))
-                       kill_node(e);
+                       remove_binfmt_handler(i_binfmt_misc(inode), e);
   
-               inode_unlock(d_inode(root));
+               inode_unlock(inode);
                 break;
         default:
                 return res;
@@@ -647,6 -810,7 +810,7 @@@ static ssize_t bm_register_write(struc
         struct inode *inode;
         struct super_block *sb = file_inode(file)->i_sb;
         struct dentry *root = sb->s_root, *dentry;
+       struct binfmt_misc *misc;
         int err = 0;
         struct file *f = NULL;
   
@@@ -656,7 -820,18 +820,18 @@@
                 return PTR_ERR(e);
   
         if (e->flags & MISC_FMT_OPEN_FILE) {
+               const struct cred *old_cred;
+ 
+               /*
+                * Now that we support unprivileged binfmt_misc mounts make
+                * sure we use the credentials that the register @file was
+                * opened with to also open the interpreter. Before that this
+                * didn't matter much as only a privileged process could open
+                * the register file.
+                */
+               old_cred = override_creds(file->f_cred);
                 f = open_exec(e->interpreter);
+               revert_creds(old_cred);
                 if (IS_ERR(f)) {
                         pr_notice("register: failed to install interpreter file %s\n",
                                  e->interpreter);
@@@ -682,21 -857,16 +857,16 @@@
         if (!inode)
                 goto out2;
   
-       err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
-       if (err) {
-               iput(inode);
-               inode = NULL;
-               goto out2;
-       }
- 
+       refcount_set(&e->users, 1);
         e->dentry = dget(dentry);
         inode->i_private = e;
         inode->i_fop = &bm_entry_operations;
   
         d_instantiate(dentry, inode);
-       write_lock(&entries_lock);
-       list_add(&e->list, &entries);
-       write_unlock(&entries_lock);
+       misc = i_binfmt_misc(inode);
+       write_lock(&misc->entries_lock);
+       list_add(&e->list, &misc->entries);
+       write_unlock(&misc->entries_lock);
   
         err = 0;
   out2:
@@@ -723,35 -893,50 +893,50 @@@ static const struct file_operations bm_
   static ssize_t
   bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
   {
-       char *s = enabled ? "enabled\n" : "disabled\n";
+       struct binfmt_misc *misc;
+       char *s;
   
+       misc = i_binfmt_misc(file_inode(file));
+       s = misc->enabled ? "enabled\n" : "disabled\n";
         return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
   }
   
   static ssize_t bm_status_write(struct file *file, const char __user *buffer,
                 size_t count, loff_t *ppos)
   {
+       struct binfmt_misc *misc;
         int res = parse_command(buffer, count);
-       struct dentry *root;
+       Node *e, *next;
+       struct inode *inode;
   
+       misc = i_binfmt_misc(file_inode(file));
         switch (res) {
         case 1:
                 /* Disable all handlers. */
-               enabled = 0;
+               misc->enabled = false;
                 break;
         case 2:
                 /* Enable all handlers. */
-               enabled = 1;
+               misc->enabled = true;
                 break;
         case 3:
                 /* Delete all handlers. */
-               root = file_inode(file)->i_sb->s_root;
-               inode_lock(d_inode(root));
+               inode = d_inode(file_inode(file)->i_sb->s_root);
+               inode_lock(inode);
   
-               while (!list_empty(&entries))
-                       kill_node(list_first_entry(&entries, Node, list));
+               /*
+                * In order to add new elements or remove elements from the list
+                * via bm_{entry,register,status}_write() inode_lock() on the
+                * root inode must be held.
+                * The lock is exclusive ensuring that the list can't be
+                * modified. Only load_misc_binary() can access but does so
+                * read-only. So we only need to take the write lock when we
+                * actually remove the entry from the list.
+                */
+               list_for_each_entry_safe(e, next, &misc->entries, list)
+                       remove_binfmt_handler(misc, e);
   
-               inode_unlock(d_inode(root));
+               inode_unlock(inode);
                 break;
         default:
                 return res;
@@@ -768,32 -953,100 +953,100 @@@ static const struct file_operations bm_
   
   /* Superblock handling */
   
+ static void bm_put_super(struct super_block *sb)
+ {
+       struct user_namespace *user_ns = sb->s_fs_info;
+ 
+       sb->s_fs_info = NULL;
+       put_user_ns(user_ns);
+ }
+ 
   static const struct super_operations s_ops = {
         .statfs         = simple_statfs,
         .evict_inode    = bm_evict_inode,
+       .put_super      = bm_put_super,
   };
   
   static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
   {
         int err;
+       struct user_namespace *user_ns = sb->s_user_ns;
+       struct binfmt_misc *misc;
         static const struct tree_descr bm_files[] = {
                 [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
                 [3] = {"register", &bm_register_operations, S_IWUSR},
                 /* last one */ {""}
         };
   
+       if (WARN_ON(user_ns != current_user_ns()))
+               return -EINVAL;
+ 
+       /*
+        * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
+        * do it here during the first mount of binfmt_misc. We don't need to
+        * waste memory for every user namespace allocation. It's likely much
+        * more common to not mount a separate binfmt_misc instance than it is
+        * to mount one.
+        *
+        * While multiple superblocks can exist they are keyed by userns in
+        * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
+        * bm_fill_super() is called exactly once whenever a binfmt_misc
+        * superblock for a userns is created. This in turn lets us conclude
+        * that when a binfmt_misc superblock is created for the first time for
+        * a userns there's no one racing us. Therefore we don't need any
+        * barriers when we dereference binfmt_misc.
+        */
+       misc = user_ns->binfmt_misc;
+       if (!misc) {
+               /*
+                * If it turns out that most user namespaces actually want to
+                * register their own binary type handler and therefore all
+                * create their own separate binfmt_misc mounts we should
+                * consider turning this into a kmem cache.
+                */
+               misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+               if (!misc)
+                       return -ENOMEM;
+ 
+               INIT_LIST_HEAD(&misc->entries);
+               rwlock_init(&misc->entries_lock);
+ 
+               /* Pairs with smp_load_acquire() in load_binfmt_misc(). */
+               smp_store_release(&user_ns->binfmt_misc, misc);
+       }
+ 
+       /*
+        * When the binfmt_misc superblock for this userns is shutdown
+        * ->enabled might have been set to false and we don't reinitialize
+        * ->enabled again in put_super() as someone might already be mounting
+        * binfmt_misc again. It also would be pointless since by the time
+        * ->put_super() is called we know that the binary type list for this
+        * binfmt_misc mount is empty making load_misc_binary() return
+        * -ENOEXEC independent of whether ->enabled is true. Instead, if
+        * someone mounts binfmt_misc for the first time or again we simply
+        * reset ->enabled to true.
+        */
+       misc->enabled = true;
+ 
         err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
         if (!err)
                 sb->s_op = &s_ops;
         return err;
   }
   
+ static void bm_free(struct fs_context *fc)
+ {
+       if (fc->s_fs_info)
+               put_user_ns(fc->s_fs_info);
+ }
+ 
   static int bm_get_tree(struct fs_context *fc)
   {
-       return get_tree_single(fc, bm_fill_super);
+       return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
   }
   
   static const struct fs_context_operations bm_context_ops = {
+       .free           = bm_free,
         .get_tree       = bm_get_tree,
   };
   
@@@ -812,6 -1065,7 +1065,7 @@@ static struct file_system_type bm_fs_ty
         .owner          = THIS_MODULE,
         .name           = "binfmt_misc",
         .init_fs_context = bm_init_fs_context,
+       .fs_flags       = FS_USERNS_MOUNT,
         .kill_sb        = kill_litter_super,
   };
   MODULE_ALIAS_FS("binfmt_misc");
diff --combined include/linux/mm.h

index 19fc73b02c9f73b5b321bc8c73c1a3320d1aa970,216dd0c6dcf8519d7c9f6601e66f3c5090869c5d..116c28c514682f9ab306dc4d074f84fdd0be97aa
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -1726,8 -1726,8 +1726,8 @@@ static inline void vma_set_access_pid_b
         unsigned int pid_bit;
   
         pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
- -      if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
- -              __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+ +      if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
+ +              __set_bit(pid_bit, &vma->numab_state->pids_active[1]);
         }
   }
   #else /* !CONFIG_NUMA_BALANCING */
@@@ -3308,8 -3308,7 +3308,7 @@@ static inline void mm_populate(unsigne
   static inline void mm_populate(unsigned long addr, unsigned long len) {}
   #endif
   
- /* These take the mm semaphore themselves */
- extern int __must_check vm_brk(unsigned long, unsigned long);
+ /* This takes the mm semaphore itself */
   extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
   extern int vm_munmap(unsigned long, size_t);
   extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
diff --combined mm/mmap.c

index 9e018d8dd7d6930519760e8d96d02386294e4a68,34d2337ace59e9bc7195c13106150c7b3eaead2c..853489ca05ef9af79d70fa2fc3889b7273907c68
--- 1/mm/mmap.c
--- 2/mm/mmap.c
+++ b/mm/mmap.c
@@@ -583,12 -583,11 +583,12 @@@ again
    * dup_anon_vma() - Helper function to duplicate anon_vma
    * @dst: The destination VMA
    * @src: The source VMA
+ + * @dup: Pointer to the destination VMA when successful.
    *
    * Returns: 0 on success.
    */
   static inline int dup_anon_vma(struct vm_area_struct *dst,
- -                             struct vm_area_struct *src)
+ +              struct vm_area_struct *src, struct vm_area_struct **dup)
   {
         /*
          * Easily overlooked: when mprotect shifts the boundary, make sure the
@@@ -596,15 -595,9 +596,15 @@@
          * anon pages imported.
          */
         if (src->anon_vma && !dst->anon_vma) {
+ +              int ret;
+ +
                 vma_assert_write_locked(dst);
                 dst->anon_vma = src->anon_vma;
- -              return anon_vma_clone(dst, src);
+ +              ret = anon_vma_clone(dst, src);
+ +              if (ret)
+ +                      return ret;
+ +
+ +              *dup = dst;
         }
   
         return 0;
@@@ -631,7 -624,6 +631,7 @@@ int vma_expand(struct vma_iterator *vmi
                unsigned long start, unsigned long end, pgoff_t pgoff,
                struct vm_area_struct *next)
   {
+ +      struct vm_area_struct *anon_dup = NULL;
         bool remove_next = false;
         struct vma_prepare vp;
   
@@@ -641,7 -633,7 +641,7 @@@
   
                 remove_next = true;
                 vma_start_write(next);
- -              ret = dup_anon_vma(vma, next);
+ +              ret = dup_anon_vma(vma, next, &anon_dup);
                 if (ret)
                         return ret;
         }
@@@ -669,8 -661,6 +669,8 @@@
         return 0;
   
   nomem:
+ +      if (anon_dup)
+ +              unlink_anon_vmas(anon_dup);
         return -ENOMEM;
   }
   
@@@ -870,7 -860,6 +870,7 @@@ struct vm_area_struct *vma_merge(struc
   {
         struct vm_area_struct *curr, *next, *res;
         struct vm_area_struct *vma, *adjust, *remove, *remove2;
+ +      struct vm_area_struct *anon_dup = NULL;
         struct vma_prepare vp;
         pgoff_t vma_pgoff;
         int err = 0;
@@@ -938,18 -927,18 +938,18 @@@
                 vma_start_write(next);
                 remove = next;                          /* case 1 */
                 vma_end = next->vm_end;
- -              err = dup_anon_vma(prev, next);
+ +              err = dup_anon_vma(prev, next, &anon_dup);
                 if (curr) {                             /* case 6 */
                         vma_start_write(curr);
                         remove = curr;
                         remove2 = next;
                         if (!next->anon_vma)
- -                              err = dup_anon_vma(prev, curr);
+ +                              err = dup_anon_vma(prev, curr, &anon_dup);
                 }
         } else if (merge_prev) {                        /* case 2 */
                 if (curr) {
                         vma_start_write(curr);
- -                      err = dup_anon_vma(prev, curr);
+ +                      err = dup_anon_vma(prev, curr, &anon_dup);
                         if (end == curr->vm_end) {      /* case 7 */
                                 remove = curr;
                         } else {                        /* case 5 */
@@@ -965,7 -954,7 +965,7 @@@
                         vma_end = addr;
                         adjust = next;
                         adj_start = -(prev->vm_end - addr);
- -                      err = dup_anon_vma(next, prev);
+ +                      err = dup_anon_vma(next, prev, &anon_dup);
                 } else {
                         /*
                          * Note that cases 3 and 8 are the ONLY ones where prev
@@@ -979,14 -968,14 +979,14 @@@
                                 vma_pgoff = curr->vm_pgoff;
                                 vma_start_write(curr);
                                 remove = curr;
- -                              err = dup_anon_vma(next, curr);
+ +                              err = dup_anon_vma(next, curr, &anon_dup);
                         }
                 }
         }
   
         /* Error in anon_vma clone. */
         if (err)
- -              return NULL;
+ +              goto anon_vma_fail;
   
         if (vma_start < vma->vm_start || vma_end > vma->vm_end)
                 vma_expanded = true;
@@@ -999,7 -988,7 +999,7 @@@
         }
   
         if (vma_iter_prealloc(vmi, vma))
- -              return NULL;
+ +              goto prealloc_fail;
   
         init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
         VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
@@@ -1027,15 -1016,6 +1027,15 @@@
         vma_complete(&vp, vmi, mm);
         khugepaged_enter_vma(res, vm_flags);
         return res;
+ +
+ +prealloc_fail:
+ +      if (anon_dup)
+ +              unlink_anon_vmas(anon_dup);
+ +
+ +anon_vma_fail:
+ +      vma_iter_set(vmi, addr);
+ +      vma_iter_load(vmi);
+ +      return NULL;
   }
   
   /*
@@@ -3163,13 -3143,13 +3163,13 @@@ int vm_brk_flags(unsigned long addr, un
         if (!len)
                 return 0;
   
- -      if (mmap_write_lock_killable(mm))
- -              return -EINTR;
- -
         /* Until we need other flags, refuse anything except VM_EXEC. */
         if ((flags & (~VM_EXEC)) != 0)
                 return -EINVAL;
   
+ +      if (mmap_write_lock_killable(mm))
+ +              return -EINTR;
+ +
         ret = check_brk_limits(addr, len);
         if (ret)
                 goto limits_failed;
@@@ -3194,12 -3174,6 +3194,6 @@@ limits_failed
   }
   EXPORT_SYMBOL(vm_brk_flags);
   
- int vm_brk(unsigned long addr, unsigned long len)
- {
-       return vm_brk_flags(addr, len, 0);
- }
- EXPORT_SYMBOL(vm_brk);
- 
   /* Release all mmaps. */
   void exit_mmap(struct mm_struct *mm)
   {
author	Linus Torvalds <[email protected]>
	Tue, 31 Oct 2023 05:28:19 +0000 (19:28 -1000)
committer	Linus Torvalds <[email protected]>
	Tue, 31 Oct 2023 05:28:19 +0000 (19:28 -1000)
		1	2
fs/binfmt_elf_fdpic.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/binfmt_misc.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
mm/mmap.c	patch \|	diff1 \|	diff2 \|	blob \| history