.count = count
};
- if (!arg.file)
+ if (!fd_file(arg))
return -EBADF;
- error = iterate_dir(arg.file, &buf.ctx);
+ error = iterate_dir(fd_file(arg), &buf.ctx);
if (error >= 0)
error = buf.error;
if (count != buf.count)
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags, vm_flags_t vm_flags)
{
unsigned long limit;
{
struct sgx_epc_page *page;
int nid_of_current = numa_node_id();
- int nid = nid_of_current;
+ int nid_start, nid;
- if (node_isset(nid_of_current, sgx_numa_mask)) {
- page = __sgx_alloc_epc_page_from_node(nid_of_current);
- if (page)
- return page;
- }
-
- /* Fall back to the non-local NUMA nodes: */
- while (true) {
- nid = next_node_in(nid, sgx_numa_mask);
- if (nid == nid_of_current)
- break;
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+ nid = nid_start;
+ do {
page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
- }
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
return ERR_PTR(-ENOMEM);
}
return 0;
}
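The hunk above replaces the open-coded "try the local node, then loop over the others" logic with a single wrap-around walk: start at the local node (or its successor in sgx_numa_mask if the local node has no EPC section) and visit every node in the mask exactly once. A stand-alone sketch of that walk, using a plain bit mask and made-up helpers instead of the kernel's nodemask API (assumes the mask is non-empty):

	#include <stdio.h>

	/* Illustrative stand-in for next_node_in(): next set bit after @prev, wrapping. */
	static int next_set_bit_wrap(unsigned long mask, int nbits, int prev)
	{
		for (int i = 1; i <= nbits; i++) {
			int bit = (prev + i) % nbits;

			if (mask & (1UL << bit))
				return bit;
		}
		return prev;	/* only @prev set (or mask empty) */
	}

	int main(void)
	{
		unsigned long mask = 0xb;	/* nodes 0, 1 and 3 have EPC */
		int current_node = 2;		/* local node has no EPC section */
		int start, nid;

		start = (mask & (1UL << current_node)) ?
			current_node : next_set_bit_wrap(mask, 4, current_node);
		nid = start;
		do {
			printf("try node %d\n", nid);	/* allocator call would go here */
			nid = next_set_bit_wrap(mask, 4, nid);
		} while (nid != start);
		return 0;
	}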
-/**
+/*
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
* metric.
return false;
}
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
return true;
}
{
struct fd f = fdget(attribute_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- if (f.file->f_op != &sgx_provision_fops) {
+ if (fd_file(f)->f_op != &sgx_provision_fops) {
fdput(f);
return -EINVAL;
}
int ret;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- ret = sev_issue_cmd_external_user(f.file, id, data, error);
+ ret = sev_issue_cmd_external_user(fd_file(f), id, data, error);
fdput(f);
return ret;
bool charged = false;
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!file_is_kvm(f.file)) {
+ if (!file_is_kvm(fd_file(f))) {
ret = -EBADF;
goto out_fput;
}
- source_kvm = f.file->private_data;
+ source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
struct sev_data_snp_launch_update fw_args = {0};
- bool assigned;
+ bool assigned = false;
int level;
- if (!kvm_mem_is_private(kvm, gfn)) {
- pr_debug("%s: Failed to ensure GFN 0x%llx has private memory attribute set\n",
- __func__, gfn);
- ret = -EINVAL;
- goto err;
- }
-
ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level);
if (ret || assigned) {
pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n",
__func__, gfn, ret, assigned);
- ret = -EINVAL;
+ ret = ret ? -EINVAL : -EEXIST;
goto err;
}
if (src) {
void *vaddr = kmap_local_pfn(pfn + i);
- ret = copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE);
- if (ret)
+ if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
+ ret = -EFAULT;
goto err;
+ }
kunmap_local(vaddr);
}
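The change above also fixes a classic return-value mix-up: copy_from_user() returns the number of bytes it could not copy, never an errno, so returning it directly would hand a positive byte count back to the caller. The general shape of the corrected idiom (a sketch, not tied to the SEV code):

	#include <linux/uaccess.h>

	static int copy_in_buf(void *dst, const void __user *src, size_t len)
	{
		/* copy_from_user() returns bytes NOT copied; fold that into -EFAULT. */
		if (copy_from_user(dst, src, len))
			return -EFAULT;
		return 0;
	}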
data->gctx_paddr = __psp_pa(sev->snp_context);
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error);
+ /*
+ * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages
+ * can be given to the guest simply by marking the RMP entry as private.
+ * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
+ */
+ if (!ret)
+ kvm->arch.pre_fault_allowed = true;
+
kfree(id_auth);
e_free_id_block:
struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!file_is_kvm(f.file)) {
+ if (!file_is_kvm(fd_file(f))) {
ret = -EBADF;
goto e_source_fput;
}
- source_kvm = f.file->private_data;
+ source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto e_source_fput;
*/
-#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/pid.h>
uint32_t id;
int r;
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- r = amdgpu_file_to_fpriv(f.file, &fpriv);
+ r = amdgpu_file_to_fpriv(fd_file(f), &fpriv);
if (r) {
fdput(f);
return r;
struct amdgpu_ctx *ctx;
int r;
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- r = amdgpu_file_to_fpriv(f.file, &fpriv);
+ r = amdgpu_file_to_fpriv(fd_file(f), &fpriv);
if (r) {
fdput(f);
return r;
struct fd f = fdget(fd);
int ret;
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
- if (f.file->f_op != &drm_syncobj_file_fops) {
+ if (fd_file(f)->f_op != &drm_syncobj_file_fops) {
fdput(f);
return -EINVAL;
}
/* take a reference to put in the idr */
- syncobj = f.file->private_data;
+ syncobj = fd_file(f)->private_data;
drm_syncobj_get(syncobj);
idr_preload(GFP_KERNEL);
struct drm_syncobj *syncobj;
struct eventfd_ctx *ev_fd_ctx;
struct syncobj_eventfd_entry *entry;
+ int ret;
if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ_TIMELINE))
return -EOPNOTSUPP;
return -ENOENT;
ev_fd_ctx = eventfd_ctx_fdget(args->fd);
- if (IS_ERR(ev_fd_ctx))
- return PTR_ERR(ev_fd_ctx);
+ if (IS_ERR(ev_fd_ctx)) {
+ ret = PTR_ERR(ev_fd_ctx);
+ goto err_fdget;
+ }
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry) {
- eventfd_ctx_put(ev_fd_ctx);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto err_kzalloc;
}
entry->syncobj = syncobj;
entry->ev_fd_ctx = ev_fd_ctx;
drm_syncobj_put(syncobj);
return 0;
+
+err_kzalloc:
+ eventfd_ctx_put(ev_fd_ctx);
+err_fdget:
+ drm_syncobj_put(syncobj);
+ return ret;
}
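The rewrite above exists because the old early returns left the reference taken by the earlier syncobj lookup unreleased; the new err_fdget/err_kzalloc labels unwind each acquired resource in reverse order on every failure path. A minimal sketch of that unwind shape, with purely illustrative helper names:

	static int setup_a_then_b(struct res_a **pa, struct res_b **pb)
	{
		struct res_a *a;
		struct res_b *b;
		int ret;

		a = acquire_a();		/* illustrative helpers, not real API */
		if (!a)
			return -ENOENT;

		b = acquire_b();
		if (IS_ERR(b)) {
			ret = PTR_ERR(b);
			goto err_release_a;
		}

		*pa = a;
		*pb = b;
		return 0;

	err_release_a:
		release_a(a);		/* undo in reverse order of acquisition */
		return ret;
	}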
int
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
- if (ret < 0)
- return ret;
if (copy_to_user(arg, &range, sizeof(range)))
return -EFAULT;
- return 0;
+ return ret;
}
int __pure btrfs_is_empty_uuid(const u8 *uuid)
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
- if (!src.file) {
+ if (!fd_file(src)) {
ret = -EINVAL;
goto out_drop_write;
}
- src_inode = file_inode(src.file);
+ src_inode = file_inode(fd_file(src));
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
/*
- * The transaction thread may want to do more work,
- * namely it pokes the cleaner kthread that will start
- * processing uncleaned subvols.
+ * There may be work for the cleaner kthread to do (subvolume
+ * deletion, delayed iputs, defrag inodes, etc), so wake it up.
*/
- wake_up_process(fs_info->transaction_kthread);
+ wake_up_process(fs_info->cleaner_kthread);
return ret;
}
case BTRFS_IOC_START_SYNC:
static bool ep_busy_loop_on(struct eventpoll *ep)
{
- return !!ep->busy_poll_usecs || net_busy_loop_on();
+ return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on();
}
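The READ_ONCE() added above is the usual annotation for a field that one path updates while another reads it locklessly: it keeps the compiler from tearing or refetching the load. The same pattern in isolation ("cfg" is a made-up structure, not the eventpoll code):

	#include <linux/compiler.h>
	#include <linux/types.h>

	struct cfg {
		u32 busy_poll_usecs;
	};

	static void cfg_set_usecs(struct cfg *c, u32 usecs)
	{
		WRITE_ONCE(c->busy_poll_usecs, usecs);		/* writer side */
	}

	static bool cfg_busy_loop_on(struct cfg *c)
	{
		return READ_ONCE(c->busy_poll_usecs) != 0;	/* lockless reader */
	}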
static bool ep_busy_loop_end(void *p, unsigned long start_time)
error = PTR_ERR(file);
goto out_free_fd;
}
-#ifdef CONFIG_NET_RX_BUSY_POLL
- ep->busy_poll_usecs = 0;
- ep->busy_poll_budget = 0;
- ep->prefer_busy_poll = false;
-#endif
ep->file = file;
fd_install(fd, file);
return fd;
error = -EBADF;
f = fdget(epfd);
- if (!f.file)
+ if (!fd_file(f))
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd);
- if (!tf.file)
+ if (!fd_file(tf))
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
- if (!file_can_poll(tf.file))
+ if (!file_can_poll(fd_file(tf)))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
* adding an epoll file descriptor inside itself.
*/
error = -EINVAL;
- if (f.file == tf.file || !is_file_epoll(f.file))
+ if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
goto error_tgt_fput;
/*
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
- if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/*
* When we insert an epoll file descriptor inside another epoll file
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
- if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
- is_file_epoll(tf.file)) {
+ if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
+ is_file_epoll(fd_file(tf))) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
if (error)
goto error_tgt_fput;
loop_check_gen++;
full_check = 1;
- if (is_file_epoll(tf.file)) {
- tep = tf.file->private_data;
+ if (is_file_epoll(fd_file(tf))) {
+ tep = fd_file(tf)->private_data;
error = -ELOOP;
if (ep_loop_check(ep, tep) != 0)
goto error_tgt_fput;
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
- epi = ep_find(ep, tf.file, fd);
+ epi = ep_find(ep, fd_file(tf), fd);
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds->events |= EPOLLERR | EPOLLHUP;
- error = ep_insert(ep, epds, tf.file, fd, full_check);
+ error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
} else
error = -EEXIST;
break;
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
/*
* the user passed to us _is_ an eventpoll file.
*/
error = -EINVAL;
- if (!is_file_epoll(f.file))
+ if (!is_file_epoll(fd_file(f)))
goto error_fput;
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
- ep = f.file->private_data;
+ ep = fd_file(f)->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, to);
#include <asm/siginfo.h>
#include <linux/uaccess.h>
+#include "internal.h"
+
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
static int setfl(int fd, struct file * filp, unsigned int arg)
return error;
}
-static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
- int force)
+/*
+ * Allocate a file->f_owner struct if it doesn't exist, handling racing
+ * allocations correctly.
+ */
+int file_f_owner_allocate(struct file *file)
{
- write_lock_irq(&filp->f_owner.lock);
- if (force || !filp->f_owner.pid) {
- put_pid(filp->f_owner.pid);
- filp->f_owner.pid = get_pid(pid);
- filp->f_owner.pid_type = type;
+ struct fown_struct *f_owner;
- if (pid) {
- const struct cred *cred = current_cred();
- filp->f_owner.uid = cred->uid;
- filp->f_owner.euid = cred->euid;
- }
+ f_owner = file_f_owner(file);
+ if (f_owner)
+ return 0;
+
+ f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL);
+ if (!f_owner)
+ return -ENOMEM;
+
+ rwlock_init(&f_owner->lock);
+ f_owner->file = file;
+ /* If someone else raced us, drop our allocation. */
+ if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner)))
+ kfree(f_owner);
+ return 0;
+}
+EXPORT_SYMBOL(file_f_owner_allocate);
+
+void file_f_owner_release(struct file *file)
+{
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(file);
+ if (f_owner) {
+ put_pid(f_owner->pid);
+ kfree(f_owner);
}
- write_unlock_irq(&filp->f_owner.lock);
}
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
int force)
{
- security_file_set_fowner(filp);
- f_modown(filp, pid, type, force);
+ struct fown_struct *f_owner;
+
+ f_owner = file_f_owner(filp);
+ if (WARN_ON_ONCE(!f_owner))
+ return;
+
+ write_lock_irq(&f_owner->lock);
+ if (force || !f_owner->pid) {
+ put_pid(f_owner->pid);
+ f_owner->pid = get_pid(pid);
+ f_owner->pid_type = type;
+
+ if (pid) {
+ const struct cred *cred = current_cred();
+ security_file_set_fowner(filp);
+ f_owner->uid = cred->uid;
+ f_owner->euid = cred->euid;
+ }
+ }
+ write_unlock_irq(&f_owner->lock);
}
EXPORT_SYMBOL(__f_setown);
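file_f_owner_allocate() above is an instance of the lock-free lazy-allocation pattern: allocate outside any lock, publish the pointer with cmpxchg(), and throw the allocation away if another task published first. The same shape in isolation (names are illustrative, not the fcntl code):

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct obj { int dummy; };
	struct ctx { struct obj *obj; };

	static int ctx_lazy_alloc_obj(struct ctx *c)
	{
		struct obj *o;

		if (READ_ONCE(c->obj))		/* fast path: already published */
			return 0;

		o = kzalloc(sizeof(*o), GFP_KERNEL);
		if (!o)
			return -ENOMEM;

		/* Publish atomically; if someone beat us to it, drop our copy. */
		if (cmpxchg(&c->obj, NULL, o))
			kfree(o);
		return 0;
	}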
struct pid *pid = NULL;
int ret = 0;
+ might_sleep();
+
type = PIDTYPE_TGID;
if (who < 0) {
/* avoid overflow below */
who = -who;
}
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+
rcu_read_lock();
if (who) {
pid = find_vpid(who);
void f_delown(struct file *filp)
{
- f_modown(filp, NULL, PIDTYPE_TGID, 1);
+ __f_setown(filp, NULL, PIDTYPE_TGID, 1);
}
pid_t f_getown(struct file *filp)
{
pid_t pid = 0;
+ struct fown_struct *f_owner;
- read_lock_irq(&filp->f_owner.lock);
+ f_owner = file_f_owner(filp);
+ if (!f_owner)
+ return pid;
+
+ read_lock_irq(&f_owner->lock);
rcu_read_lock();
- if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
- pid = pid_vnr(filp->f_owner.pid);
- if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ if (pid_task(f_owner->pid, f_owner->pid_type)) {
+ pid = pid_vnr(f_owner->pid);
+ if (f_owner->pid_type == PIDTYPE_PGID)
pid = -pid;
}
rcu_read_unlock();
- read_unlock_irq(&filp->f_owner.lock);
+ read_unlock_irq(&f_owner->lock);
return pid;
}
return -EINVAL;
}
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+
rcu_read_lock();
pid = find_vpid(owner.pid);
if (owner.pid && !pid)
struct f_owner_ex __user *owner_p = (void __user *)arg;
struct f_owner_ex owner = {};
int ret = 0;
+ struct fown_struct *f_owner;
+ enum pid_type pid_type = PIDTYPE_PID;
- read_lock_irq(&filp->f_owner.lock);
- rcu_read_lock();
- if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
- owner.pid = pid_vnr(filp->f_owner.pid);
- rcu_read_unlock();
- switch (filp->f_owner.pid_type) {
+ f_owner = file_f_owner(filp);
+ if (f_owner) {
+ read_lock_irq(&f_owner->lock);
+ rcu_read_lock();
+ if (pid_task(f_owner->pid, f_owner->pid_type))
+ owner.pid = pid_vnr(f_owner->pid);
+ rcu_read_unlock();
+ pid_type = f_owner->pid_type;
+ }
+
+ switch (pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
break;
ret = -EINVAL;
break;
}
- read_unlock_irq(&filp->f_owner.lock);
+ if (f_owner)
+ read_unlock_irq(&f_owner->lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
struct user_namespace *user_ns = current_user_ns();
+ struct fown_struct *f_owner;
uid_t __user *dst = (void __user *)arg;
- uid_t src[2];
+ uid_t src[2] = {0, 0};
int err;
- read_lock_irq(&filp->f_owner.lock);
- src[0] = from_kuid(user_ns, filp->f_owner.uid);
- src[1] = from_kuid(user_ns, filp->f_owner.euid);
- read_unlock_irq(&filp->f_owner.lock);
+ f_owner = file_f_owner(filp);
+ if (f_owner) {
+ read_lock_irq(&f_owner->lock);
+ src[0] = from_kuid(user_ns, f_owner->uid);
+ src[1] = from_kuid(user_ns, f_owner->euid);
+ read_unlock_irq(&f_owner->lock);
+ }
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
* overkill, but given our lockless file pointer lookup, the
* alternatives are complicated.
*/
- return f.file == filp;
+ return fd_file(f) == filp;
}
+/* Let the caller figure out whether a given file was just created. */
+static long f_created_query(const struct file *filp)
+{
+ return !!(filp->f_mode & FMODE_CREATED);
+}
+
+static int f_owner_sig(struct file *filp, int signum, bool setsig)
+{
+ int ret = 0;
+ struct fown_struct *f_owner;
+
+ might_sleep();
+
+ if (setsig) {
+ if (!valid_signal(signum))
+ return -EINVAL;
+
+ ret = file_f_owner_allocate(filp);
+ if (ret)
+ return ret;
+ }
+
+ f_owner = file_f_owner(filp);
+ if (setsig)
+ f_owner->signum = signum;
+ else if (f_owner)
+ ret = f_owner->signum;
+ return ret;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
long err = -EINVAL;
switch (cmd) {
+ case F_CREATED_QUERY:
+ err = f_created_query(filp);
+ break;
case F_DUPFD:
err = f_dupfd(argi, filp, 0);
break;
err = f_getowner_uids(filp, arg);
break;
case F_GETSIG:
- err = filp->f_owner.signum;
+ err = f_owner_sig(filp, 0, false);
break;
case F_SETSIG:
- /* arg == 0 restores default behaviour. */
- if (!valid_signal(argi)) {
- break;
- }
- err = 0;
- filp->f_owner.signum = argi;
+ err = f_owner_sig(filp, argi, true);
break;
case F_GETLEASE:
err = fcntl_getlease(filp);
static int check_fcntl_cmd(unsigned cmd)
{
switch (cmd) {
+ case F_CREATED_QUERY:
case F_DUPFD:
case F_DUPFD_CLOEXEC:
case F_DUPFD_QUERY:
struct fd f = fdget_raw(fd);
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out1;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (!err)
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
out1:
fdput(f);
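From userspace, the F_CREATED_QUERY command added above lets a caller ask whether its own open() actually created the file: f_created_query() returns 1 when FMODE_CREATED is set and 0 otherwise, so no O_EXCL-based guesswork is needed. A hedged usage sketch, guarded because a given system's uapi headers may not define the constant yet:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/tmp/example-file", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return 1;
	#ifdef F_CREATED_QUERY
		/* 1 if this open() created the file, 0 if it already existed. */
		printf("created by this open: %d\n", fcntl(fd, F_CREATED_QUERY));
	#endif
		close(fd);
		return 0;
	}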
struct flock64 flock;
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out1;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
goto out1;
err = -EFAULT;
if (copy_from_user(&flock, argp, sizeof(flock)))
break;
- err = fcntl_getlk64(f.file, cmd, &flock);
+ err = fcntl_getlk64(fd_file(f), cmd, &flock);
if (!err && copy_to_user(argp, &flock, sizeof(flock)))
err = -EFAULT;
break;
err = -EFAULT;
if (copy_from_user(&flock, argp, sizeof(flock)))
break;
- err = fcntl_setlk64(fd, f.file, cmd, &flock);
+ err = fcntl_setlk64(fd, fd_file(f), cmd, &flock);
break;
default:
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
out1:
struct flock flock;
long err = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
return err;
- if (unlikely(f.file->f_mode & FMODE_PATH)) {
+ if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
goto out_put;
}
- err = security_file_fcntl(f.file, cmd, arg);
+ err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
goto out_put;
err = get_compat_flock(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock);
if (err)
break;
err = fixup_compat_flock(&flock);
err = get_compat_flock64(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock);
if (!err)
err = put_compat_flock64(&flock, compat_ptr(arg));
break;
err = get_compat_flock(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock);
break;
case F_SETLK64:
case F_SETLKW64:
err = get_compat_flock64(&flock, compat_ptr(arg));
if (err)
break;
- err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+ err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock);
break;
default:
- err = do_fcntl(fd, cmd, arg, f.file);
+ err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
out_put:
do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}
-int send_sigurg(struct fown_struct *fown)
+int send_sigurg(struct file *file)
{
+ struct fown_struct *fown;
struct task_struct *p;
enum pid_type type;
struct pid *pid;
unsigned long flags;
int ret = 0;
+ fown = file_f_owner(file);
+ if (!fown)
+ return 0;
+
read_lock_irqsave(&fown->lock, flags);
type = fown->pid_type;
}
read_lock_irqsave(&fa->fa_lock, flags);
if (fa->fa_file) {
- fown = &fa->fa_file->f_owner;
+ fown = file_f_owner(fa->fa_file);
+ if (!fown)
+ goto next;
/* Don't send SIGURG to processes which have not set a
queued signum: SIGURG has its own default signalling
mechanism. */
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
}
+next:
read_unlock_irqrestore(&fa->fa_lock, flags);
fa = rcu_dereference(fa->fa_next);
}
static long do_sys_name_to_handle(const struct path *path,
struct file_handle __user *ufh,
- int __user *mnt_id, int fh_flags)
+ void __user *mnt_id, bool unique_mntid,
+ int fh_flags)
{
long retval;
struct file_handle f_handle;
} else
retval = 0;
/* copy the mount id */
- if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
- copy_to_user(ufh, handle,
- struct_size(handle, f_handle, handle_bytes)))
+ if (unique_mntid) {
+ if (put_user(real_mount(path->mnt)->mnt_id_unique,
+ (u64 __user *) mnt_id))
+ retval = -EFAULT;
+ } else {
+ if (put_user(real_mount(path->mnt)->mnt_id,
+ (int __user *) mnt_id))
+ retval = -EFAULT;
+ }
+ /* copy the handle */
+ if (retval != -EFAULT &&
+ copy_to_user(ufh, handle,
+ struct_size(handle, f_handle, handle_bytes)))
retval = -EFAULT;
kfree(handle);
return retval;
* @name: name that should be converted to handle.
* @handle: resulting file handle
* @mnt_id: mount id of the file system containing the file
+ * (u64 if AT_HANDLE_MNT_ID_UNIQUE, otherwise int)
* @flag: flag value to indicate whether to follow symlink or not
* and whether a decodable file handle is required.
*
* value required.
*/
SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
- struct file_handle __user *, handle, int __user *, mnt_id,
+ struct file_handle __user *, handle, void __user *, mnt_id,
int, flag)
{
struct path path;
int fh_flags;
int err;
- if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID))
+ if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
+ AT_HANDLE_MNT_ID_UNIQUE))
return -EINVAL;
lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
lookup_flags |= LOOKUP_EMPTY;
err = user_path_at(dfd, name, lookup_flags, &path);
if (!err) {
- err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags);
+ err = do_sys_name_to_handle(&path, handle, mnt_id,
+ flag & AT_HANDLE_MNT_ID_UNIQUE,
+ fh_flags);
path_put(&path);
}
return err;
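With the hunk above, name_to_handle_at() gains AT_HANDLE_MNT_ID_UNIQUE: when the flag is set, the mount-id argument is written as the 64-bit unique mount id rather than the reusable int id. A hedged userspace sketch (the flag name is taken from this patch; older libc/uapi headers may not define it, and the libc int * prototype has to be cast):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		struct file_handle *fh;
		uint64_t mnt_id_unique = 0;
		int ret = 1;

		fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
		if (!fh)
			return 1;
		fh->handle_bytes = MAX_HANDLE_SZ;

	#ifdef AT_HANDLE_MNT_ID_UNIQUE
		if (name_to_handle_at(AT_FDCWD, "/tmp", fh,
				      (int *)&mnt_id_unique,
				      AT_HANDLE_MNT_ID_UNIQUE) == 0) {
			printf("unique mount id: %llu\n",
			       (unsigned long long)mnt_id_unique);
			ret = 0;
		}
	#endif
		free(fh);
		return ret;
	}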
spin_unlock(&fs->lock);
} else {
struct fd f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- *root = f.file->f_path;
+ *root = fd_file(f)->f_path;
path_get(root);
fdput(f);
}
#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
+#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
* Copy 'count' fd bits from the old table to the new table and clear the extra
* space if any. This does not copy the file pointers. Called with the files
* spinlock held for write.
*/
-static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
- unsigned int count)
+static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
+ unsigned int copy_words)
{
- unsigned int cpy, set;
-
- cpy = count / BITS_PER_BYTE;
- set = (nfdt->max_fds - count) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)nfdt->open_fds + cpy, 0, set);
- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
- memset((char *)nfdt->close_on_exec + cpy, 0, set);
-
- cpy = BITBIT_SIZE(count);
- set = BITBIT_SIZE(nfdt->max_fds) - cpy;
- memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
- memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+ unsigned int nwords = fdt_words(nfdt);
+
+ bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
+ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
+ bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
+ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG);
+ bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
+ copy_words, nwords);
}
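copy_fd_bitmaps() now takes a word count and leans on bitmap_copy_and_extend() instead of hand-rolled memcpy()/memset() byte arithmetic. For the word-aligned sizes used here, the helper behaves roughly like the sketch below (this is an illustration, not the lib/bitmap implementation):

	#include <linux/bitmap.h>

	/* Copy @count bits from @src, clear bits @count..@size-1 of @dst. */
	static void copy_and_extend_sketch(unsigned long *dst,
					   const unsigned long *src,
					   unsigned int count, unsigned int size)
	{
		bitmap_copy(dst, src, count);
		if (size > count)
			bitmap_clear(dst, count, size - count);
	}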
/*
memcpy(nfdt->fd, ofdt->fd, cpy);
memset((char *)nfdt->fd + cpy, 0, set);
- copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
+ copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}
/*
open_files = sane_fdtable_size(old_fdt, max_fds);
}
- copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+ copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
return filp_close(file, files);
}
-EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd);
/**
* last_fd - return last valid index into fd table
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
*/
- static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+ static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
struct files_struct *files = current->files;
struct file *file;
if (likely(atomic_read_acquire(&files->count) == 1)) {
file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
- return 0;
- return (unsigned long)file;
+ return EMPTY_FD;
+ return BORROWED_FD(file);
} else {
file = __fget_files(files, fd, mask);
if (!file)
- return 0;
- return FDPUT_FPUT | (unsigned long)file;
+ return EMPTY_FD;
+ return CLONED_FD(file);
}
}
- unsigned long __fdget(unsigned int fd)
+ struct fd fdget(unsigned int fd)
{
return __fget_light(fd, FMODE_PATH);
}
- EXPORT_SYMBOL(__fdget);
+ EXPORT_SYMBOL(fdget);
- unsigned long __fdget_raw(unsigned int fd)
+ struct fd fdget_raw(unsigned int fd)
{
return __fget_light(fd, 0);
}
(file_count(file) > 1 || file->f_op->iterate_shared);
}
- unsigned long __fdget_pos(unsigned int fd)
+ struct fd fdget_pos(unsigned int fd)
{
- unsigned long v = __fdget(fd);
- struct file *file = (struct file *)(v & ~3);
+ struct fd f = fdget(fd);
+ struct file *file = fd_file(f);
if (file && file_needs_f_pos_lock(file)) {
- v |= FDPUT_POS_UNLOCK;
+ f.word |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
- return v;
+ return f;
}
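The hunk above changes fdget()/fdget_raw()/fdget_pos() to hand back a struct fd directly instead of an encoded unsigned long, which is why every caller in this series moves from f.file to the fd_file() accessor. A typical caller now has this shape (example_op and the vfs_fsync() call are illustrative, not part of the patch):

	#include <linux/file.h>
	#include <linux/fs.h>

	static int example_op(unsigned int ufd)
	{
		struct fd f = fdget(ufd);
		int ret;

		if (!fd_file(f))	/* empty struct fd: bad descriptor */
			return -EBADF;

		ret = vfs_fsync(fd_file(f), 0);
		fdput(f);
		return ret;
	}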
void __f_unlock_pos(struct file *f)
* tables and this condition does not arise without those.
*/
fdt = files_fdtable(files);
+ fd = array_index_nospec(fd, fdt->max_fds);
tofree = fdt->fd[fd];
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
static struct kmem_cache *fuse_req_cachep;
+static void end_requests(struct list_head *head);
+
static struct fuse_dev *fuse_get_dev(struct file *file)
{
/*
(folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
- 1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
1 << PG_workingset |
newfolio = page_folio(buf->page);
- if (!folio_test_uptodate(newfolio))
- folio_mark_uptodate(newfolio);
-
+ folio_clear_uptodate(newfolio);
folio_clear_mappedtodisk(newfolio);
if (fuse_check_folio(newfolio) != 0)
this_num = min_t(unsigned, num, PAGE_SIZE - offset);
err = fuse_copy_page(cs, &page, offset, this_num, 0);
- if (!err && offset == 0 &&
- (this_num == PAGE_SIZE || file_size == end))
+ if (!PageUptodate(page) && !err && offset == 0 &&
+ (this_num == PAGE_SIZE || file_size == end)) {
+ zero_user_segment(page, this_num, PAGE_SIZE);
SetPageUptodate(page);
+ }
unlock_page(page);
put_page(page);
}
spin_lock(&fiq->lock);
+ if (!fiq->connected) {
+ spin_unlock(&fiq->lock);
+ list_for_each_entry(req, &to_queue, list)
+ clear_bit(FR_PENDING, &req->flags);
+ end_requests(&to_queue);
+ return;
+ }
/* iq and pq requests are both oldest to newest */
list_splice(&to_queue, &fiq->pending);
fiq->ops->wake_pending_and_unlock(fiq);
return -EFAULT;
f = fdget(oldfd);
- if (!f.file)
+ if (!fd_file(f))
return -EINVAL;
/*
* Check against file->f_op because CUSE
* uses the same ioctl handler.
*/
- if (f.file->f_op == file->f_op)
- fud = fuse_get_dev(f.file);
+ if (fd_file(f)->f_op == file->f_op)
+ fud = fuse_get_dev(fd_file(f));
res = -EINVAL;
if (fud) {
struct file *filp = fl->c.flc_file;
f_delown(filp);
- filp->f_owner.signum = 0;
+ file_f_owner(filp)->signum = 0;
fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
if (fl->fl_fasync != NULL) {
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
lease = *flp;
trace_generic_add_lease(inode, lease);
+ error = file_f_owner_allocate(filp);
+ if (error)
+ return error;
+
/* Note that arg is never F_UNLCK here */
ctx = locks_get_lock_context(inode, arg);
if (!ctx)
error = -EBADF;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return error;
- if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE)))
+ if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE)))
goto out_putf;
- flock_make_lock(f.file, &fl, type);
+ flock_make_lock(fd_file(f), &fl, type);
- error = security_file_lock(f.file, fl.c.flc_type);
+ error = security_file_lock(fd_file(f), fl.c.flc_type);
if (error)
goto out_putf;
if (can_sleep)
fl.c.flc_flags |= FL_SLEEP;
- if (f.file->f_op->flock)
- error = f.file->f_op->flock(f.file,
+ if (fd_file(f)->f_op->flock)
+ error = fd_file(f)->f_op->flock(fd_file(f),
(can_sleep) ? F_SETLKW : F_SETLK,
&fl);
else
- error = locks_lock_file_wait(f.file, &fl);
+ error = locks_lock_file_wait(fd_file(f), &fl);
locks_release_private(&fl);
out_putf:
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
- filelease_cache = kmem_cache_create("file_lock_cache",
+ filelease_cache = kmem_cache_create("file_lease_cache",
sizeof(struct file_lease), 0, SLAB_PANIC, NULL);
for_each_possible_cpu(i) {
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
+/**
+ * lookup_fast - do fast lockless (but racy) lookup of a dentry
+ * @nd: current nameidata
+ *
+ * Do a fast, but racy lookup in the dcache for the given dentry, and
+ * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
+ * found. On error, an ERR_PTR will be returned.
+ *
+ * If this function returns a valid dentry and the walk is no longer
+ * lazy, the dentry will carry a reference that must later be put. If
+ * RCU mode is still in force, then this is not the case and the dentry
+ * must be legitimized before use. If this returns NULL, then the walk
+ * will no longer be in RCU mode.
+ */
static struct dentry *lookup_fast(struct nameidata *nd)
{
struct dentry *dentry, *parent = nd->path.dentry;
struct fd f = fdget_raw(nd->dfd);
struct dentry *dentry;
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
if (flags & LOOKUP_LINKAT_EMPTY) {
- if (f.file->f_cred != current_cred() &&
- !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
+ if (fd_file(f)->f_cred != current_cred() &&
+ !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
fdput(f);
return ERR_PTR(-ENOENT);
}
}
- dentry = f.file->f_path.dentry;
+ dentry = fd_file(f)->f_path.dentry;
if (*s && unlikely(!d_can_lookup(dentry))) {
fdput(f);
return ERR_PTR(-ENOTDIR);
}
- nd->path = f.file->f_path;
+ nd->path = fd_file(f)->f_path;
if (flags & LOOKUP_RCU) {
nd->inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
return dentry;
}
+ if (open_flag & O_CREAT)
+ audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
+
/*
 * Checking write permission is tricky, because we don't know if we are
* going to actually need it: O_CREAT opens should work as long as the
return ERR_PTR(error);
}
+static inline bool trailing_slashes(struct nameidata *nd)
+{
+ return (bool)nd->last.name[nd->last.len];
+}
+
+static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
+{
+ struct dentry *dentry;
+
+ if (open_flag & O_CREAT) {
+ if (trailing_slashes(nd))
+ return ERR_PTR(-EISDIR);
+
+ /* Don't bother on an O_EXCL create */
+ if (open_flag & O_EXCL)
+ return NULL;
+ }
+
+ if (trailing_slashes(nd))
+ nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+ dentry = lookup_fast(nd);
+ if (IS_ERR_OR_NULL(dentry))
+ return dentry;
+
+ if (open_flag & O_CREAT) {
+ /* Discard negative dentries. Need inode_lock to do the create */
+ if (!dentry->d_inode) {
+ if (!(nd->flags & LOOKUP_RCU))
+ dput(dentry);
+ dentry = NULL;
+ }
+ }
+ return dentry;
+}
+
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
return handle_dots(nd, nd->last_type);
}
- if (!(open_flag & O_CREAT)) {
- if (nd->last.name[nd->last.len])
- nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
- /* we _can_ be in RCU mode here */
- dentry = lookup_fast(nd);
- if (IS_ERR(dentry))
- return ERR_CAST(dentry);
- if (likely(dentry))
- goto finish_lookup;
+ /* We _can_ be in RCU mode here */
+ dentry = lookup_fast_for_open(nd, open_flag);
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
+ if (likely(dentry))
+ goto finish_lookup;
+
+ if (!(open_flag & O_CREAT)) {
if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
return ERR_PTR(-ECHILD);
} else {
- /* create side of things */
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
}
- audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
- /* trailing slashes? */
- if (unlikely(nd->last.name[nd->last.len]))
- return ERR_PTR(-EISDIR);
}
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *aops = mapping->a_ops;
bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
- struct page *page;
+ struct folio *folio;
void *fsdata = NULL;
int err;
unsigned int flags;
retry:
if (nofs)
flags = memalloc_nofs_save();
- err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata);
+ err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata);
if (nofs)
memalloc_nofs_restore(flags);
if (err)
goto fail;
- memcpy(page_address(page), symname, len-1);
+ memcpy(folio_address(folio), symname, len - 1);
- err = aops->write_end(NULL, mapping, 0, len-1, len-1,
- page, fsdata);
+ err = aops->write_end(NULL, mapping, 0, len - 1, len - 1,
+ folio, fsdata);
if (err < 0)
goto fail;
if (err < len-1)
list_del_init(&p->mnt_child);
}
- /* Add propogated mounts to the tmp_list */
+ /* Add propagated mounts to the tmp_list */
if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
dentry->d_fsdata == &mntns_operations;
}
-static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
+struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
- return container_of(ns, struct mnt_namespace, ns);
+ return &mnt->ns;
}
-struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
+struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
- return &mnt->ns;
+ guard(read_lock)(&mnt_ns_tree_lock);
+ for (;;) {
+ struct rb_node *node;
+
+ if (previous)
+ node = rb_prev(&mntns->mnt_ns_tree_node);
+ else
+ node = rb_next(&mntns->mnt_ns_tree_node);
+ if (!node)
+ return ERR_PTR(-ENOENT);
+
+ mntns = node_to_mnt_ns(node);
+ node = &mntns->mnt_ns_tree_node;
+
+ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
+ continue;
+
+ /*
+ * Holding mnt_ns_tree_lock prevents the mount namespace from
+ * being freed but it may well be on its deathbed. We want an
+ * active reference, not just a passive one here as we're
+ * persisting the mount namespace.
+ */
+ if (!refcount_inc_not_zero(&mntns->ns.count))
+ continue;
+
+ return mntns;
+ }
}
static bool mnt_ns_loop(struct dentry *dentry)
if (!__mnt_is_readonly(mnt) &&
(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
- char *buf = (char *)__get_free_page(GFP_KERNEL);
- char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
+ char *buf, *mntpath;
+
+ buf = (char *)__get_free_page(GFP_KERNEL);
+ if (buf)
+ mntpath = d_path(mountpoint, buf, PAGE_SIZE);
+ else
+ mntpath = ERR_PTR(-ENOMEM);
+ if (IS_ERR(mntpath))
+ mntpath = "(unknown)";
pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
sb->s_type->name,
mntpath, &sb->s_time_max,
(unsigned long long)sb->s_time_max);
- free_page((unsigned long)buf);
sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
+ if (buf)
+ free_page((unsigned long)buf);
}
}
}
f = fdget(fs_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
ret = -EINVAL;
- if (f.file->f_op != &fscontext_fops)
+ if (fd_file(f)->f_op != &fscontext_fops)
goto err_fsfd;
- fc = f.file->private_data;
+ fc = fd_file(f)->private_data;
ret = mutex_lock_interruptible(&fc->uapi_mutex);
if (ret < 0)
return -EINVAL;
f = fdget(attr->userns_fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (!proc_ns_file(f.file)) {
+ if (!proc_ns_file(fd_file(f))) {
err = -EINVAL;
goto out_fput;
}
- ns = get_proc_ns(file_inode(f.file));
+ ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops->type != CLONE_NEWUSER) {
err = -EINVAL;
goto out_fput;
* that, or if not simply grab a passive reference on our mount namespace and
* return that.
*/
-static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
+static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
{
- if (mnt_ns_id)
- return lookup_mnt_ns(mnt_ns_id);
- refcount_inc(¤t->nsproxy->mnt_ns->passive);
- return current->nsproxy->mnt_ns;
+ struct mnt_namespace *mnt_ns;
+
+ if (kreq->mnt_ns_id && kreq->spare)
+ return ERR_PTR(-EINVAL);
+
+ if (kreq->mnt_ns_id)
+ return lookup_mnt_ns(kreq->mnt_ns_id);
+
+ if (kreq->spare) {
+ struct ns_common *ns;
+
+ CLASS(fd, f)(kreq->spare);
- if (!f.file)
++ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+
- if (!proc_ns_file(f.file))
++ if (!proc_ns_file(fd_file(f)))
+ return ERR_PTR(-EINVAL);
+
- ns = get_proc_ns(file_inode(f.file));
++ ns = get_proc_ns(file_inode(fd_file(f)));
+ if (ns->ops->type != CLONE_NEWNS)
+ return ERR_PTR(-EINVAL);
+
+ mnt_ns = to_mnt_ns(ns);
+ } else {
+ mnt_ns = current->nsproxy->mnt_ns;
+ }
+
+ refcount_inc(&mnt_ns->passive);
+ return mnt_ns;
}
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
- ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
if (!kmnt_ids)
return -ENOMEM;
- ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
+ ns = grab_requested_mnt_ns(&kreq);
if (!ns)
return -ENOENT;
/* Only worry about locked mounts */
if (!(child->mnt.mnt_flags & MNT_LOCKED))
continue;
- /* Is the directory permanetly empty? */
+ /* Is the directory permanently empty? */
if (!is_empty_dir_inode(inode))
goto next;
}
if (length < 0)
return -EINVAL;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- error = do_ftruncate(f.file, length, small);
+ error = do_ftruncate(fd_file(f), length, small);
fdput(f);
return error;
if (offset < 0 || len <= 0)
return -EINVAL;
- /* Return error if mode is not supported */
- if (mode & ~FALLOC_FL_SUPPORTED_MASK)
+ if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE))
return -EOPNOTSUPP;
- /* Punch hole and zero range are mutually exclusive */
- if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
- (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
- return -EOPNOTSUPP;
-
- /* Punch hole must have keep size set */
- if ((mode & FALLOC_FL_PUNCH_HOLE) &&
- !(mode & FALLOC_FL_KEEP_SIZE))
+ /*
+ * Modes are exclusive, even if that is not obvious from the encoding
+ * as bit masks and the mix with the flag in the same namespace.
+ *
+ * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is
+ * encoded as no bit set.
+ */
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_ALLOCATE_RANGE:
+ case FALLOC_FL_UNSHARE_RANGE:
+ case FALLOC_FL_ZERO_RANGE:
+ break;
+ case FALLOC_FL_PUNCH_HOLE:
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ return -EOPNOTSUPP;
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ case FALLOC_FL_INSERT_RANGE:
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+ break;
+ default:
return -EOPNOTSUPP;
-
- /* Collapse range should only be used exclusively. */
- if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
- (mode & ~FALLOC_FL_COLLAPSE_RANGE))
- return -EINVAL;
-
- /* Insert range should only be used exclusively. */
- if ((mode & FALLOC_FL_INSERT_RANGE) &&
- (mode & ~FALLOC_FL_INSERT_RANGE))
- return -EINVAL;
-
- /* Unshare range should only be used with allocate mode. */
- if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
- (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
- return -EINVAL;
+ }
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
/*
- * We can only allow pure fallocate on append only files
+ * On append-only files only space preallocation is supported.
*/
if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
return -EPERM;
struct fd f = fdget(fd);
int error = -EBADF;
- if (f.file) {
- error = vfs_fallocate(f.file, mode, offset, len);
+ if (fd_file(f)) {
+ error = vfs_fallocate(fd_file(f), mode, offset, len);
fdput(f);
}
return error;
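The switch added above encodes the rule set the old flag checks expressed piecemeal: exactly one mode per call, FALLOC_FL_PUNCH_HOLE must be paired with FALLOC_FL_KEEP_SIZE, collapse/insert must not be, and plain preallocation is mode 0. From userspace a valid hole punch looks like this sketch:

	#define _GNU_SOURCE
	#include <sys/types.h>
	#include <fcntl.h>
	#include <linux/falloc.h>

	/* Punch a hole without changing the file size. */
	static int punch_hole(int fd, off_t offset, off_t len)
	{
		return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				 offset, len);
	}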
int error;
error = -EBADF;
- if (!f.file)
+ if (!fd_file(f))
goto out;
error = -ENOTDIR;
- if (!d_can_lookup(f.file->f_path.dentry))
+ if (!d_can_lookup(fd_file(f)->f_path.dentry))
goto out_putf;
- error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
+ error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
if (!error)
- set_fs_pwd(current->fs, &f.file->f_path);
+ set_fs_pwd(current->fs, &fd_file(f)->f_path);
out_putf:
fdput(f);
out:
struct fd f = fdget(fd);
int err = -EBADF;
- if (f.file) {
- err = vfs_fchmod(f.file, mode);
+ if (fd_file(f)) {
+ err = vfs_fchmod(fd_file(f), mode);
fdput(f);
}
return err;
struct fd f = fdget(fd);
int error = -EBADF;
- if (f.file) {
- error = vfs_fchown(f.file, user, group);
+ if (fd_file(f)) {
+ error = vfs_fchown(fd_file(f), user, group);
fdput(f);
}
return error;
static inline bool unsigned_offsets(struct file *file)
{
- return file->f_mode & FMODE_UNSIGNED_OFFSET;
+ return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
}
/**
- * vfs_setpos - update the file offset for lseek
+ * vfs_setpos_cookie - update the file offset for lseek and reset cookie
* @file: file structure in question
* @offset: file offset to seek to
* @maxsize: maximum file size
+ * @cookie: cookie to reset
*
- * This is a low-level filesystem helper for updating the file offset to
- * the value specified by @offset if the given offset is valid and it is
- * not equal to the current file offset.
+ * Update the file offset to the value specified by @offset if the given
+ * offset is valid and it is not equal to the current file offset and
+ * reset the specified cookie to indicate that a seek happened.
*
* Return the specified offset on success and -EINVAL on invalid offset.
*/
-loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
+static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
+ loff_t maxsize, u64 *cookie)
{
if (offset < 0 && !unsigned_offsets(file))
return -EINVAL;
if (offset != file->f_pos) {
file->f_pos = offset;
- file->f_version = 0;
+ if (cookie)
+ *cookie = 0;
}
return offset;
}
-EXPORT_SYMBOL(vfs_setpos);
/**
- * generic_file_llseek_size - generic llseek implementation for regular files
- * @file: file structure to seek on
+ * vfs_setpos - update the file offset for lseek
+ * @file: file structure in question
* @offset: file offset to seek to
- * @whence: type of seek
- * @maxsize: max size of this file in file system
- * @eof: offset used for SEEK_END position
+ * @maxsize: maximum file size
*
- * This is a variant of generic_file_llseek that allows passing in a custom
- * maximum file size and a custom EOF position, for e.g. hashed directories
+ * This is a low-level filesystem helper for updating the file offset to
+ * the value specified by @offset if the given offset is valid and it is
+ * not equal to the current file offset.
*
- * Synchronization:
- * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
- * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
- * read/writes behave like SEEK_SET against seeks.
+ * Return the specified offset on success and -EINVAL on invalid offset.
*/
-loff_t
-generic_file_llseek_size(struct file *file, loff_t offset, int whence,
- loff_t maxsize, loff_t eof)
+loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
+{
+ return vfs_setpos_cookie(file, offset, maxsize, NULL);
+}
+EXPORT_SYMBOL(vfs_setpos);
+
+/**
+ * must_set_pos - check whether f_pos has to be updated
+ * @file: file to seek on
+ * @offset: offset to use
+ * @whence: type of seek operation
+ * @eof: end of file
+ *
+ * Check whether f_pos needs to be updated and update @offset according
+ * to @whence.
+ *
+ * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
+ * updated, and negative error code on failure.
+ */
+static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
{
switch (whence) {
case SEEK_END:
- offset += eof;
+ *offset += eof;
break;
case SEEK_CUR:
/*
* f_pos value back to the file because a concurrent read(),
* write() or lseek() might have altered it
*/
- if (offset == 0)
- return file->f_pos;
- /*
- * f_lock protects against read/modify/write race with other
- * SEEK_CURs. Note that parallel writes and reads behave
- * like SEEK_SET.
- */
- spin_lock(&file->f_lock);
- offset = vfs_setpos(file, file->f_pos + offset, maxsize);
- spin_unlock(&file->f_lock);
- return offset;
+ if (*offset == 0) {
+ *offset = file->f_pos;
+ return 0;
+ }
+ break;
case SEEK_DATA:
/*
* In the generic case the entire file is data, so as long as
* offset isn't at the end of the file then the offset is data.
*/
- if ((unsigned long long)offset >= eof)
+ if ((unsigned long long)*offset >= eof)
return -ENXIO;
break;
case SEEK_HOLE:
* There is a virtual hole at the end of the file, so as long as
* offset isn't i_size or larger, return i_size.
*/
- if ((unsigned long long)offset >= eof)
+ if ((unsigned long long)*offset >= eof)
return -ENXIO;
- offset = eof;
+ *offset = eof;
break;
}
+ return 1;
+}
+
+/**
+ * generic_file_llseek_size - generic llseek implementation for regular files
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @maxsize: max size of this file in file system
+ * @eof: offset used for SEEK_END position
+ *
+ * This is a variant of generic_file_llseek that allows passing in a custom
+ * maximum file size and a custom EOF position, for e.g. hashed directories
+ *
+ * Synchronization:
+ * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
+ * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
+ * read/writes behave like SEEK_SET against seeks.
+ */
+loff_t
+generic_file_llseek_size(struct file *file, loff_t offset, int whence,
+ loff_t maxsize, loff_t eof)
+{
+ int ret;
+
+ ret = must_set_pos(file, &offset, whence, eof);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return offset;
+
+ if (whence == SEEK_CUR) {
+ /*
+ * f_lock protects against read/modify/write race with
+ * other SEEK_CURs. Note that parallel writes and reads
+ * behave like SEEK_SET.
+ */
+ guard(spinlock)(&file->f_lock);
+ return vfs_setpos(file, file->f_pos + offset, maxsize);
+ }
+
return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
+/**
+ * generic_llseek_cookie - versioned llseek implementation
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @cookie: cookie to update
+ *
+ * See generic_file_llseek for a general description and locking assumptions.
+ *
+ * In contrast to generic_file_llseek, this function also resets a
+ * specified cookie to indicate a seek took place.
+ */
+loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
+ u64 *cookie)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t maxsize = inode->i_sb->s_maxbytes;
+ loff_t eof = i_size_read(inode);
+ int ret;
+
+ if (WARN_ON_ONCE(!cookie))
+ return -EINVAL;
+
+ /*
+ * Require that this is only used for directories that guarantee
+ * synchronization between readdir and seek so that an update to
+ * @cookie is correctly synchronized with concurrent readdir.
+ */
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
+ return -EINVAL;
+
+ ret = must_set_pos(file, &offset, whence, eof);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return offset;
+
+ /* No need to hold f_lock because we know that f_pos_lock is held. */
+ if (whence == SEEK_CUR)
+ return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie);
+
+ return vfs_setpos_cookie(file, offset, maxsize, cookie);
+}
+EXPORT_SYMBOL(generic_llseek_cookie);
+
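generic_llseek_cookie() lets a filesystem keep its "did a seek happen?" state in a per-file cookie instead of the file->f_version assignments removed above; the cookie is zeroed whenever the position actually changes. A hedged sketch of a directory llseek wired to it (the examplefs names and private struct are hypothetical):

	#include <linux/fs.h>
	#include <linux/types.h>

	struct examplefs_dir_ctx {
		u64 seek_cookie;	/* checked by readdir, reset on seek */
	};

	static loff_t examplefs_dir_llseek(struct file *file, loff_t offset,
					   int whence)
	{
		struct examplefs_dir_ctx *ctx = file->private_data;

		return generic_llseek_cookie(file, offset, whence,
					     &ctx->seek_cookie);
	}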
/**
* generic_file_llseek - generic llseek implementation for regular files
* @file: file structure to seek on
}
retval = -EINVAL;
if (offset >= 0 || unsigned_offsets(file)) {
- if (offset != file->f_pos) {
+ if (offset != file->f_pos)
file->f_pos = offset;
- file->f_version = 0;
- }
retval = offset;
}
out:
{
off_t retval;
struct fd f = fdget_pos(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
retval = -EINVAL;
if (whence <= SEEK_MAX) {
- loff_t res = vfs_llseek(f.file, offset, whence);
+ loff_t res = vfs_llseek(fd_file(f), offset, whence);
retval = res;
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
struct fd f = fdget_pos(fd);
loff_t offset;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
retval = -EINVAL;
if (whence > SEEK_MAX)
goto out_putf;
- offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
+ offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
whence);
retval = (int)offset;
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_read(f.file, buf, count, ppos);
+ ret = vfs_read(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
return ret;
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_write(f.file, buf, count, ppos);
+ ret = vfs_write(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_read(f.file, buf, count, &pos);
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ ret = vfs_read(fd_file(f), buf, count, &pos);
fdput(f);
}
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_write(f.file, buf, count, &pos);
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ ret = vfs_write(fd_file(f), buf, count, &pos);
fdput(f);
}
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_readv(f.file, vec, vlen, ppos, flags);
+ ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
- if (f.file) {
- loff_t pos, *ppos = file_ppos(f.file);
+ if (fd_file(f)) {
+ loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
ppos = &pos;
}
- ret = vfs_writev(f.file, vec, vlen, ppos, flags);
+ ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
- f.file->f_pos = pos;
+ fd_file(f)->f_pos = pos;
fdput_pos(f);
}
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_readv(f.file, vec, vlen, &pos, flags);
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
fdput(f);
}
return -EINVAL;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_writev(f.file, vec, vlen, &pos, flags);
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
fdput(f);
}
*/
retval = -EBADF;
in = fdget(in_fd);
- if (!in.file)
+ if (!fd_file(in))
goto out;
- if (!(in.file->f_mode & FMODE_READ))
+ if (!(fd_file(in)->f_mode & FMODE_READ))
goto fput_in;
retval = -ESPIPE;
if (!ppos) {
- pos = in.file->f_pos;
+ pos = fd_file(in)->f_pos;
} else {
pos = *ppos;
- if (!(in.file->f_mode & FMODE_PREAD))
+ if (!(fd_file(in)->f_mode & FMODE_PREAD))
goto fput_in;
}
- retval = rw_verify_area(READ, in.file, &pos, count);
+ retval = rw_verify_area(READ, fd_file(in), &pos, count);
if (retval < 0)
goto fput_in;
if (count > MAX_RW_COUNT)
*/
retval = -EBADF;
out = fdget(out_fd);
- if (!out.file)
+ if (!fd_file(out))
goto fput_in;
- if (!(out.file->f_mode & FMODE_WRITE))
+ if (!(fd_file(out)->f_mode & FMODE_WRITE))
goto fput_out;
- in_inode = file_inode(in.file);
- out_inode = file_inode(out.file);
- out_pos = out.file->f_pos;
+ in_inode = file_inode(fd_file(in));
+ out_inode = file_inode(fd_file(out));
+ out_pos = fd_file(out)->f_pos;
if (!max)
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
* and the application is arguably buggy if it doesn't expect
* EAGAIN on a non-blocking file descriptor.
*/
- if (in.file->f_flags & O_NONBLOCK)
+ if (fd_file(in)->f_flags & O_NONBLOCK)
fl = SPLICE_F_NONBLOCK;
#endif
- opipe = get_pipe_info(out.file, true);
+ opipe = get_pipe_info(fd_file(out), true);
if (!opipe) {
- retval = rw_verify_area(WRITE, out.file, &out_pos, count);
+ retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
if (retval < 0)
goto fput_out;
- retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
+ retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
count, fl);
} else {
- if (out.file->f_flags & O_NONBLOCK)
+ if (fd_file(out)->f_flags & O_NONBLOCK)
fl |= SPLICE_F_NONBLOCK;
- retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
+ retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl);
}
if (retval > 0) {
add_rchar(current, retval);
add_wchar(current, retval);
- fsnotify_access(in.file);
- fsnotify_modify(out.file);
- out.file->f_pos = out_pos;
+ fsnotify_access(fd_file(in));
+ fsnotify_modify(fd_file(out));
+ fd_file(out)->f_pos = out_pos;
if (ppos)
*ppos = pos;
else
- in.file->f_pos = pos;
+ fd_file(in)->f_pos = pos;
}
inc_syscr(current);
ssize_t ret = -EBADF;
f_in = fdget(fd_in);
- if (!f_in.file)
+ if (!fd_file(f_in))
goto out2;
f_out = fdget(fd_out);
- if (!f_out.file)
+ if (!fd_file(f_out))
goto out1;
ret = -EFAULT;
if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
goto out;
} else {
- pos_in = f_in.file->f_pos;
+ pos_in = fd_file(f_in)->f_pos;
}
if (off_out) {
if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
goto out;
} else {
- pos_out = f_out.file->f_pos;
+ pos_out = fd_file(f_out)->f_pos;
}
ret = -EINVAL;
if (flags != 0)
goto out;
- ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
flags);
if (ret > 0) {
pos_in += ret;
if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
ret = -EFAULT;
} else {
- f_in.file->f_pos = pos_in;
+ fd_file(f_in)->f_pos = pos_in;
}
if (off_out) {
if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
ret = -EFAULT;
} else {
- f_out.file->f_pos = pos_out;
+ fd_file(f_out)->f_pos = pos_out;
}
}
{
u64 ret;
struct timespec64 now;
+ u64 slack = current->timer_slack_ns;
- /*
- * Realtime tasks get a slack of 0 for obvious reasons.
- */
-
- if (rt_task(current))
+ if (slack == 0)
return 0;
ktime_get_ts64(&now);
now = timespec64_sub(*tv, now);
ret = __estimate_accuracy(&now);
- if (ret < current->timer_slack_ns)
- return current->timer_slack_ns;
+ if (ret < slack)
+ return slack;
return ret;
}
continue;
mask = EPOLLNVAL;
f = fdget(i);
- if (f.file) {
+ if (fd_file(f)) {
wait_key_set(wait, in, out, bit,
busy_flag);
- mask = vfs_poll(f.file, wait);
+ mask = vfs_poll(fd_file(f), wait);
fdput(f);
}
{
// the path is hot enough for overhead of copy_from_user() to matter
if (from) {
- if (!user_read_access_begin(from, sizeof(*from)))
+ if (can_do_masked_user_access())
+ from = masked_user_access_begin(from);
+ else if (!user_read_access_begin(from, sizeof(*from)))
return -EFAULT;
unsafe_get_user(to->p, &from->p, Efault);
unsafe_get_user(to->size, &from->size, Efault);
struct poll_list {
struct poll_list *next;
unsigned int len;
- struct pollfd entries[];
+ struct pollfd entries[] __counted_by(len);
};
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
goto out;
mask = EPOLLNVAL;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
goto out;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
pwait->_key = filter | busy_flag;
- mask = vfs_poll(f.file, pwait);
+ mask = vfs_poll(fd_file(f), pwait);
if (mask & busy_flag)
*can_busy_poll = true;
mask &= filter; /* Mask out unneeded events. */
DECLARE_WAITQUEUE(wait, current);
spin_lock_irq(¤t->sighand->siglock);
- ret = dequeue_signal(current, &ctx->sigmask, info, &type);
+ ret = dequeue_signal(&ctx->sigmask, info, &type);
switch (ret) {
case 0:
if (!nonblock)
add_wait_queue(¤t->sighand->signalfd_wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- ret = dequeue_signal(current, &ctx->sigmask, info, &type);
+ ret = dequeue_signal(&ctx->sigmask, info, &type);
if (ret != 0)
break;
if (signal_pending(current)) {
fd_install(ufd, file);
} else {
struct fd f = fdget(ufd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- ctx = f.file->private_data;
- if (f.file->f_op != &signalfd_fops) {
+ ctx = fd_file(f)->private_data;
+ if (fd_file(f)->f_op != &signalfd_fops) {
fdput(f);
return -EINVAL;
}
}
src_file = fdget(srcfd);
- if (!src_file.file) {
+ if (!fd_file(src_file)) {
rc = -EBADF;
goto out_drop_write;
}
- if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+ if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) {
rc = -EBADF;
cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
goto out_fput;
}
- src_inode = file_inode(src_file.file);
+ src_inode = file_inode(fd_file(src_file));
rc = -EINVAL;
if (S_ISDIR(src_inode->i_mode))
goto out_fput;
- rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
+ rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0,
src_inode->i_size, 0);
if (rc > 0)
rc = 0;
static int cifs_shutdown(struct super_block *sb, unsigned long arg)
{
struct cifs_sb_info *sbi = CIFS_SB(sb);
+ struct tcon_link *tlink;
+ struct cifs_tcon *tcon;
__u32 flags;
+ int rc;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (get_user(flags, (__u32 __user *)arg))
return -EFAULT;
- if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH)
- return -EINVAL;
+ tlink = cifs_sb_tlink(sbi);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+
+ trace_smb3_shutdown_enter(flags, tcon->tid);
+ if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH) {
+ rc = -EINVAL;
+ goto shutdown_out_err;
+ }
if (cifs_forced_shutdown(sbi))
- return 0;
+ goto shutdown_good;
cifs_dbg(VFS, "shut down requested (%d)", flags);
-/* trace_cifs_shutdown(sb, flags);*/
/*
* see:
*/
case CIFS_GOING_FLAGS_DEFAULT:
cifs_dbg(FYI, "shutdown with default flag not supported\n");
- return -EINVAL;
+ rc = -EINVAL;
+ goto shutdown_out_err;
/*
* FLAGS_LOGFLUSH is easy since it asks to write out metadata (not
* data) but metadata writes are not cached on the client, so can treat
case CIFS_GOING_FLAGS_LOGFLUSH:
case CIFS_GOING_FLAGS_NOLOGFLUSH:
sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN;
- return 0;
+ goto shutdown_good;
default:
- return -EINVAL;
+ rc = -EINVAL;
+ goto shutdown_out_err;
}
+
+shutdown_good:
+ trace_smb3_shutdown_done(flags, tcon->tid);
+ cifs_put_tlink(tlink);
return 0;
+shutdown_out_err:
+ trace_smb3_shutdown_err(rc, flags, tcon->tid);
+ cifs_put_tlink(tlink);
+ return rc;
}
static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in)
return error;
}
+/*
+ * Check that file2's metadata agree with the snapshot that we took for the
+ * range commit request.
+ *
+ * This should be called after the filesystem has locked /all/ inode metadata
+ * against modification.
+ */
+STATIC int
+xfs_exchrange_check_freshness(
+ const struct xfs_exchrange *fxr,
+ struct xfs_inode *ip2)
+{
+ struct inode *inode2 = VFS_I(ip2);
+ struct timespec64 ctime = inode_get_ctime(inode2);
+ struct timespec64 mtime = inode_get_mtime(inode2);
+
+ trace_xfs_exchrange_freshness(fxr, ip2);
+
+ /* Check that file2 hasn't otherwise been modified. */
+ if (fxr->file2_ino != ip2->i_ino ||
+ fxr->file2_gen != inode2->i_generation ||
+ !timespec64_equal(&fxr->file2_ctime, &ctime) ||
+ !timespec64_equal(&fxr->file2_mtime, &mtime))
+ return -EBUSY;
+
+ return 0;
+}
+
#define QRETRY_IP1 (0x1)
#define QRETRY_IP2 (0x2)
if (error || fxr->length == 0)
return error;
+ if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
+ error = xfs_exchrange_check_freshness(fxr, ip2);
+ if (error)
+ return error;
+ }
+
/* Attach dquots to both inodes before changing block maps. */
error = xfs_qm_dqattach(ip2);
if (error)
if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
return -EXDEV;
- if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+ __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
return -EINVAL;
/* Userspace requests only honored for regular files. */
fxr.flags = args.flags;
file1 = fdget(args.file1_fd);
- if (!file1.file)
+ if (!fd_file(file1))
return -EBADF;
- fxr.file1 = file1.file;
+ fxr.file1 = fd_file(file1);
error = xfs_exchange_range(&fxr);
fdput(file1);
return error;
}
+
+/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
+struct xfs_commit_range_fresh {
+ xfs_fsid_t fsid; /* m_fixedfsid */
+ __u64 file2_ino; /* inode number */
+ __s64 file2_mtime; /* modification time */
+ __s64 file2_ctime; /* change time */
+ __s32 file2_mtime_nsec; /* mod time, nsec */
+ __s32 file2_ctime_nsec; /* change time, nsec */
+ __u32 file2_gen; /* inode generation */
+ __u32 magic; /* zero */
+};
+#define XCR_FRESH_MAGIC 0x444F524B /* DORK */
+
+/* Set up a commitrange operation by sampling file2's write-related attrs */
+long
+xfs_ioc_start_commit(
+ struct file *file,
+ struct xfs_commit_range __user *argp)
+{
+ struct xfs_commit_range args = { };
+ struct timespec64 ts;
+ struct xfs_commit_range_fresh *kern_f;
+ struct xfs_commit_range_fresh __user *user_f;
+ struct inode *inode2 = file_inode(file);
+ struct xfs_inode *ip2 = XFS_I(inode2);
+ const unsigned int lockflags = XFS_IOLOCK_SHARED |
+ XFS_MMAPLOCK_SHARED |
+ XFS_ILOCK_SHARED;
+
+ BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
+ sizeof(args.file2_freshness));
+
+ kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+ memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+ xfs_ilock(ip2, lockflags);
+ ts = inode_get_ctime(inode2);
+ kern_f->file2_ctime = ts.tv_sec;
+ kern_f->file2_ctime_nsec = ts.tv_nsec;
+ ts = inode_get_mtime(inode2);
+ kern_f->file2_mtime = ts.tv_sec;
+ kern_f->file2_mtime_nsec = ts.tv_nsec;
+ kern_f->file2_ino = ip2->i_ino;
+ kern_f->file2_gen = inode2->i_generation;
+ kern_f->magic = XCR_FRESH_MAGIC;
+ xfs_iunlock(ip2, lockflags);
+
+ user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
+ if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Exchange file1 and file2 contents if file2 has not been written since the
+ * start commit operation.
+ */
+long
+xfs_ioc_commit_range(
+ struct file *file,
+ struct xfs_commit_range __user *argp)
+{
+ struct xfs_exchrange fxr = {
+ .file2 = file,
+ };
+ struct xfs_commit_range args;
+ struct xfs_commit_range_fresh *kern_f;
+ struct xfs_inode *ip2 = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip2->i_mount;
+ struct fd file1;
+ int error;
+
+ kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+ if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+ return -EINVAL;
+ if (kern_f->magic != XCR_FRESH_MAGIC)
+ return -EBUSY;
+ if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
+ return -EBUSY;
+
+ fxr.file1_offset = args.file1_offset;
+ fxr.file2_offset = args.file2_offset;
+ fxr.length = args.length;
+ fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
+ fxr.file2_ino = kern_f->file2_ino;
+ fxr.file2_gen = kern_f->file2_gen;
+ fxr.file2_mtime.tv_sec = kern_f->file2_mtime;
+ fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
+ fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
+ fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
+
+ file1 = fdget(args.file1_fd);
+ if (fd_empty(file1))
+ return -EBADF;
+ fxr.file1 = fd_file(file1);
+
+ error = xfs_exchange_range(&fxr);
+ fdput(file1);
+ return error;
+}
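A hedged sketch of how userspace is expected to drive the two new ioctls above. Only the fields visible in this hunk are used; the descriptor names, queue of includes and the helper name are illustrative, and struct xfs_commit_range plus the XFS_IOC_* numbers are assumed to come from the XFS uapi headers:

	#include <sys/ioctl.h>

	static int commit_staged_range(int file2_fd, int staging_fd,
				       unsigned long long length)
	{
		struct xfs_commit_range cr = { 0 };	/* cr.flags left at zero */

		cr.file1_fd     = staging_fd;	/* donor whose contents replace file2's range */
		cr.file1_offset = 0;
		cr.file2_offset = 0;
		cr.length       = length;

		/* 1. Snapshot file2's identity and timestamps into cr.file2_freshness. */
		if (ioctl(file2_fd, XFS_IOC_START_COMMIT, &cr) < 0)
			return -1;

		/* ... write the staged data into staging_fd, leaving cr.file2_freshness untouched ... */

		/* 2. Exchange the ranges only if file2 is unchanged since step 1; EBUSY otherwise. */
		return ioctl(file2_fd, XFS_IOC_COMMIT_RANGE, &cr);
	}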
/* Can't change realtime flag if any extents are allocated. */
if (ip->i_df.if_nextents || ip->i_delayed_blks)
return -EINVAL;
+
+ /*
+ * If S_DAX is enabled on this file, we can only switch the
+ * device if both support fsdax. We can't update S_DAX because
+ * there might be other threads walking down the access paths.
+ */
+ if (IS_DAX(VFS_I(ip)) &&
+ (mp->m_ddev_targp->bt_daxdev == NULL ||
+ (mp->m_rtdev_targp &&
+ mp->m_rtdev_targp->bt_daxdev == NULL)))
+ return -EINVAL;
}
if (rtflag) {
return error;
}
-STATIC int
-xfs_ioc_getfsmap(
- struct xfs_inode *ip,
- struct fsmap_head __user *arg)
-{
- struct xfs_fsmap_head xhead = {0};
- struct fsmap_head head;
- struct fsmap *recs;
- unsigned int count;
- __u32 last_flags = 0;
- bool done = false;
- int error;
-
- if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
- return -EFAULT;
- if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
- memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
- sizeof(head.fmh_keys[0].fmr_reserved)) ||
- memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
- sizeof(head.fmh_keys[1].fmr_reserved)))
- return -EINVAL;
-
- /*
- * Use an internal memory buffer so that we don't have to copy fsmap
- * data to userspace while holding locks. Start by trying to allocate
- * up to 128k for the buffer, but fall back to a single page if needed.
- */
- count = min_t(unsigned int, head.fmh_count,
- 131072 / sizeof(struct fsmap));
- recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
- if (!recs) {
- count = min_t(unsigned int, head.fmh_count,
- PAGE_SIZE / sizeof(struct fsmap));
- recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL);
- if (!recs)
- return -ENOMEM;
- }
-
- xhead.fmh_iflags = head.fmh_iflags;
- xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
- xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
-
- trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
- trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
-
- head.fmh_entries = 0;
- do {
- struct fsmap __user *user_recs;
- struct fsmap *last_rec;
-
- user_recs = &arg->fmh_recs[head.fmh_entries];
- xhead.fmh_entries = 0;
- xhead.fmh_count = min_t(unsigned int, count,
- head.fmh_count - head.fmh_entries);
-
- /* Run query, record how many entries we got. */
- error = xfs_getfsmap(ip->i_mount, &xhead, recs);
- switch (error) {
- case 0:
- /*
- * There are no more records in the result set. Copy
- * whatever we got to userspace and break out.
- */
- done = true;
- break;
- case -ECANCELED:
- /*
- * The internal memory buffer is full. Copy whatever
- * records we got to userspace and go again if we have
- * not yet filled the userspace buffer.
- */
- error = 0;
- break;
- default:
- goto out_free;
- }
- head.fmh_entries += xhead.fmh_entries;
- head.fmh_oflags = xhead.fmh_oflags;
-
- /*
- * If the caller wanted a record count or there aren't any
- * new records to return, we're done.
- */
- if (head.fmh_count == 0 || xhead.fmh_entries == 0)
- break;
-
- /* Copy all the records we got out to userspace. */
- if (copy_to_user(user_recs, recs,
- xhead.fmh_entries * sizeof(struct fsmap))) {
- error = -EFAULT;
- goto out_free;
- }
-
- /* Remember the last record flags we copied to userspace. */
- last_rec = &recs[xhead.fmh_entries - 1];
- last_flags = last_rec->fmr_flags;
-
- /* Set up the low key for the next iteration. */
- xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec);
- trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
- } while (!done && head.fmh_entries < head.fmh_count);
-
- /*
- * If there are no more records in the query result set and we're not
- * in counting mode, mark the last record returned with the LAST flag.
- */
- if (done && head.fmh_count > 0 && head.fmh_entries > 0) {
- struct fsmap __user *user_rec;
-
- last_flags |= FMR_OF_LAST;
- user_rec = &arg->fmh_recs[head.fmh_entries - 1];
-
- if (copy_to_user(&user_rec->fmr_flags, &last_flags,
- sizeof(last_flags))) {
- error = -EFAULT;
- goto out_free;
- }
- }
-
- /* copy back header */
- if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) {
- error = -EFAULT;
- goto out_free;
- }
-
-out_free:
- kvfree(recs);
- return error;
-}
-
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
/* Pull information for the target fd */
f = fdget((int)sxp->sx_fdtarget);
- if (!f.file) {
+ if (!fd_file(f)) {
error = -EINVAL;
goto out;
}
- if (!(f.file->f_mode & FMODE_WRITE) ||
- !(f.file->f_mode & FMODE_READ) ||
- (f.file->f_flags & O_APPEND)) {
+ if (!(fd_file(f)->f_mode & FMODE_WRITE) ||
+ !(fd_file(f)->f_mode & FMODE_READ) ||
+ (fd_file(f)->f_flags & O_APPEND)) {
error = -EBADF;
goto out_put_file;
}
tmp = fdget((int)sxp->sx_fdtmp);
- if (!tmp.file) {
+ if (!fd_file(tmp)) {
error = -EINVAL;
goto out_put_file;
}
- if (!(tmp.file->f_mode & FMODE_WRITE) ||
- !(tmp.file->f_mode & FMODE_READ) ||
- (tmp.file->f_flags & O_APPEND)) {
+ if (!(fd_file(tmp)->f_mode & FMODE_WRITE) ||
+ !(fd_file(tmp)->f_mode & FMODE_READ) ||
+ (fd_file(tmp)->f_flags & O_APPEND)) {
error = -EBADF;
goto out_put_tmp_file;
}
- if (IS_SWAPFILE(file_inode(f.file)) ||
- IS_SWAPFILE(file_inode(tmp.file))) {
+ if (IS_SWAPFILE(file_inode(fd_file(f))) ||
+ IS_SWAPFILE(file_inode(fd_file(tmp)))) {
error = -EINVAL;
goto out_put_tmp_file;
}
* before we cast and access them as XFS structures as we have no
* control over what the user passes us here.
*/
- if (f.file->f_op != &xfs_file_operations ||
- tmp.file->f_op != &xfs_file_operations) {
+ if (fd_file(f)->f_op != &xfs_file_operations ||
+ fd_file(tmp)->f_op != &xfs_file_operations) {
error = -EINVAL;
goto out_put_tmp_file;
}
- ip = XFS_I(file_inode(f.file));
- tip = XFS_I(file_inode(tmp.file));
+ ip = XFS_I(file_inode(fd_file(f)));
+ tip = XFS_I(file_inode(fd_file(tmp)));
if (ip->i_mount != tip->i_mount) {
error = -EINVAL;
case XFS_IOC_EXCHANGE_RANGE:
return xfs_ioc_exchange_range(filp, arg);
+ case XFS_IOC_START_COMMIT:
+ return xfs_ioc_start_commit(filp, arg);
+ case XFS_IOC_COMMIT_RANGE:
+ return xfs_ioc_commit_range(filp, arg);
default:
return -ENOTTY;
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>
+#include <linux/err.h>
struct file;
fput(file);
}
+ /* either a reference to struct file + flags
+ * (cloned vs. borrowed, pos locked), with
+ * flags stored in lower bits of value,
+ * or empty (represented by 0).
+ */
struct fd {
- struct file *file;
- unsigned int flags;
+ unsigned long word;
};
#define FDPUT_FPUT 1
#define FDPUT_POS_UNLOCK 2
- static inline void fdput(struct fd fd)
+ #define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK)))
+ static inline bool fd_empty(struct fd f)
{
- if (fd.flags & FDPUT_FPUT)
- fput(fd.file);
+ return unlikely(!f.word);
}
- extern struct file *fget(unsigned int fd);
- extern struct file *fget_raw(unsigned int fd);
- extern struct file *fget_task(struct task_struct *task, unsigned int fd);
- extern unsigned long __fdget(unsigned int fd);
- extern unsigned long __fdget_raw(unsigned int fd);
- extern unsigned long __fdget_pos(unsigned int fd);
- extern void __f_unlock_pos(struct file *);
-
- static inline struct fd __to_fd(unsigned long v)
+ #define EMPTY_FD (struct fd){0}
+ static inline struct fd BORROWED_FD(struct file *f)
{
- return (struct fd){(struct file *)(v & ~3),v & 3};
+ return (struct fd){(unsigned long)f};
}
-
- static inline struct fd fdget(unsigned int fd)
+ static inline struct fd CLONED_FD(struct file *f)
{
- return __to_fd(__fdget(fd));
+ return (struct fd){(unsigned long)f | FDPUT_FPUT};
}
- static inline struct fd fdget_raw(unsigned int fd)
+ static inline void fdput(struct fd fd)
{
- return __to_fd(__fdget_raw(fd));
+ if (fd.word & FDPUT_FPUT)
+ fput(fd_file(fd));
}
- static inline struct fd fdget_pos(int fd)
- {
- return __to_fd(__fdget_pos(fd));
- }
+ extern struct file *fget(unsigned int fd);
+ extern struct file *fget_raw(unsigned int fd);
+ extern struct file *fget_task(struct task_struct *task, unsigned int fd);
+ extern void __f_unlock_pos(struct file *);
+
+ struct fd fdget(unsigned int fd);
+ struct fd fdget_raw(unsigned int fd);
+ struct fd fdget_pos(unsigned int fd);
static inline void fdput_pos(struct fd f)
{
- if (f.flags & FDPUT_POS_UNLOCK)
- __f_unlock_pos(f.file);
+ if (f.word & FDPUT_POS_UNLOCK)
+ __f_unlock_pos(fd_file(f));
fdput(f);
}
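With the packed representation above, callers stop poking at f.file / f.flags and go through the accessors instead. A minimal caller-side sketch; the fd number and file_operations pointer are illustrative:

	static int use_fd_example(unsigned int ufd, const struct file_operations *expected_fops)
	{
		struct fd f = fdget(ufd);

		if (fd_empty(f))				/* was: if (!f.file) */
			return -EBADF;
		if (fd_file(f)->f_op != expected_fops) {	/* was: f.file->f_op */
			fdput(f);
			return -EINVAL;
		}
		/* ... use fd_file(f) here ... */
		fdput(f);	/* calls fput() only if FDPUT_FPUT is set in f.word */
		return 0;
	}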
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
get_unused_fd_flags(flags), unsigned flags)
+DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))
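The fput cleanup definition above enables scope-based file references. A hedged usage sketch; filp_open() is real, but the path and function name here are placeholders:

	static int read_config_example(void)
	{
		struct file *f __free(fput) = filp_open("/tmp/example", O_RDONLY, 0);

		if (IS_ERR(f))
			return PTR_ERR(f);	/* cleanup skips IS_ERR_OR_NULL, so the early return is safe */
		/* ... read from f ... */
		return 0;			/* f is fput() automatically when it leaves scope */
	}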
/*
* take_fd() will take care to set @fd to -EBADF ensuring that
*
* f = dentry_open(&path, O_RDONLY, current_cred());
* if (IS_ERR(f))
- * return PTR_ERR(fd);
+ * return PTR_ERR(f);
*
* fd_install(fd, f);
* return take_fd(fd);
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/security.h>
+#include <linux/cpuset.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
void io_sq_thread_park(struct io_sq_data *sqd)
__acquires(&sqd->lock)
{
- WARN_ON_ONCE(sqd->thread == current);
+ WARN_ON_ONCE(data_race(sqd->thread) == current);
atomic_inc(&sqd->park_pending);
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
struct fd f;
f = fdget(p->wq_fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-ENXIO);
- if (!io_is_uring_fops(f.file)) {
+ if (!io_is_uring_fops(fd_file(f))) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- ctx_attach = f.file->private_data;
+ ctx_attach = fd_file(f)->private_data;
sqd = ctx_attach->sq_data;
if (!sqd) {
fdput(f);
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
- if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
+ if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
struct fd f;
f = fdget(p->wq_fd);
- if (!f.file)
+ if (!fd_file(f))
return -ENXIO;
- if (!io_is_uring_fops(f.file)) {
+ if (!io_is_uring_fops(fd_file(f))) {
fdput(f);
return -EINVAL;
}
return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
+ struct cpumask allowed_mask;
int cpu = p->sq_thread_cpu;
ret = -EINVAL;
- if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ cpuset_cpus_allowed(current, &allowed_mask);
+ if (!cpumask_test_cpu(cpu, &allowed_mask))
goto err_sqpoll;
sqd->sq_cpu = cpu;
} else {
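From the application side, the stricter check above affects rings created with SQPOLL CPU pinning: the requested CPU must now be inside the caller's cpuset, not merely online. A hedged userspace sketch (queue depth and CPU number are illustrative; the raw syscall is used for brevity):

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int setup_pinned_sqpoll_ring(unsigned int cpu)
	{
		struct io_uring_params p = { 0 };

		p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
		p.sq_thread_cpu = cpu;	/* must be allowed by the caller's cpuset */
		return syscall(__NR_io_uring_setup, 64, &p);	/* fails with EINVAL otherwise */
	}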
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_FMODRET,
- BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_CGROUP,
BTF_KFUNC_HOOK_SCHED_ACT,
BTF_KFUNC_HOOK_SK_SKB,
BTF_KFUNC_HOOK_SOCKET_FILTER,
return NULL;
}
-static bool __btf_name_valid(const struct btf *btf, u32 offset)
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
{
/* offset must be valid */
const char *src = btf_str_by_offset(btf, offset);
return !*src;
}
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
-{
- return __btf_name_valid(btf, offset);
-}
-
/* Allow any printable character in DATASEC names */
static bool btf_name_valid_section(const struct btf *btf, u32 offset)
{
const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
+ if (!*src)
+ return false;
+
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
- src++;
while (*src && src < src_limit) {
if (!isprint(*src))
return false;
return -EINVAL;
}
+/* Callers have to ensure the life cycle of btf if it is program BTF */
static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
struct btf_field_info *info)
{
field->kptr.dtor = NULL;
id = info->kptr.type_id;
kptr_btf = (struct btf *)btf;
- btf_get(kptr_btf);
goto found_dtor;
}
if (id < 0)
}
if (!t->name_off ||
- !__btf_name_valid(env->btf, t->name_off)) {
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
static struct btf_struct_metas *
btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
{
- union {
- struct btf_id_set set;
- struct {
- u32 _cnt;
- u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
- } _arr;
- } aof;
struct btf_struct_metas *tab = NULL;
+ struct btf_id_set *aof;
int i, n, id, ret;
BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
- memset(&aof, 0, sizeof(aof));
+ aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN);
+ if (!aof)
+ return ERR_PTR(-ENOMEM);
+ aof->cnt = 0;
+
for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
/* Try to find whether this special type exists in user BTF, and
* if so remember its ID so we can easily find it among members
* of structs that we iterate in the next loop.
*/
+ struct btf_id_set *new_aof;
+
id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
if (id < 0)
continue;
- aof.set.ids[aof.set.cnt++] = id;
+
+ new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = id;
}
- if (!aof.set.cnt)
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ /* Try to find if there are kptrs in user BTF and remember their ID */
+ struct btf_id_set *new_aof;
+ struct btf_field_info tmp;
+ const struct btf_type *t;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free_aof;
+ }
+
+ ret = btf_find_kptr(btf, t, 0, 0, &tmp);
+ if (ret != BTF_FIELD_FOUND)
+ continue;
+
+ new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_aof) {
+ ret = -ENOMEM;
+ goto free_aof;
+ }
+ aof = new_aof;
+ aof->ids[aof->cnt++] = i;
+ }
+
+ if (!aof->cnt) {
+ kfree(aof);
return NULL;
- sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+ }
+ sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL);
- n = btf_nr_types(btf);
for (i = 1; i < n; i++) {
struct btf_struct_metas *new_tab;
const struct btf_member *member;
int j, tab_cnt;
t = btf_type_by_id(btf, i);
- if (!t) {
- ret = -EINVAL;
- goto free;
- }
if (!__btf_type_is_struct(t))
continue;
cond_resched();
for_each_member(j, t, member) {
- if (btf_id_set_contains(&aof.set, member->type))
+ if (btf_id_set_contains(aof, member->type))
goto parse;
}
continue;
type = &tab->types[tab->cnt];
type->btf_id = i;
record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
- BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
+ BPF_KPTR, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
type->record = record;
tab->cnt++;
}
+ kfree(aof);
return tab;
free:
btf_struct_metas_free(tab);
+free_aof:
+ kfree(aof);
return ERR_PTR(ret);
}
btf->kernel_btf = true;
snprintf(btf->name, sizeof(btf->name), "%s", module_name);
- btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
+ btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN);
if (!btf->data) {
err = -ENOMEM;
goto errout;
}
- memcpy(btf->data, data, data_size);
btf->data_size = data_size;
err = btf_parse_hdr(env);
errout:
btf_verifier_env_free(env);
- if (base_btf != vmlinux_btf)
+ if (!IS_ERR(base_btf) && base_btf != vmlinux_btf)
btf_free(base_btf);
if (btf) {
kvfree(btf->data);
if (arg == nr_args) {
switch (prog->expected_attach_type) {
- case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
+ /* mark we are accessing the return value */
+ info->is_retval = true;
+ fallthrough;
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
* they use FEXIT trampolines and when attached to
if (prog_args_trusted(prog))
info->reg_type |= PTR_TRUSTED;
+ if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
+ info->reg_type |= PTR_MAYBE_NULL;
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &btf_fops) {
+ if (fd_file(f)->f_op != &btf_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- btf = f.file->private_data;
+ btf = fd_file(f)->private_data;
refcount_inc(&btf->refcnt);
fdput(f);
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
+/* Validate well-formedness of iter argument type.
+ * On success, return positive BTF ID of iter state's STRUCT type.
+ * On error, negative error is returned.
+ */
+int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx)
+{
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ const char *name;
+ int btf_id;
+
+ if (btf_type_vlen(func) <= arg_idx)
+ return -EINVAL;
+
+ arg = &btf_params(func)[arg_idx];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, &btf_id);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ return btf_id;
+}
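For context, the shape of the open-coded iterator kfuncs whose first argument this helper validates; bpf_iter_num is the canonical in-tree example, reproduced here from memory as a hedged sketch rather than copied from this series:

	struct bpf_iter_num {
		__u64 __opaque[1];	/* opaque state: 8 bytes, 8-byte aligned, matching the size checks below */
	} __attribute__((aligned(8)));

	int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end);
	int *bpf_iter_num_next(struct bpf_iter_num *it);
	void bpf_iter_num_destroy(struct bpf_iter_num *it);

The struct name must begin with ITER_PREFIX ("bpf_iter_"), which is exactly what the strncmp() above enforces.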
+
static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
const struct btf_type *func, u32 func_flags)
{
u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
- const char *name, *sfx, *iter_name;
- const struct btf_param *arg;
+ const char *sfx, *iter_name;
const struct btf_type *t;
char exp_name[128];
u32 nr_args;
+ int btf_id;
/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
if (!flags || (flags & (flags - 1)))
if (nr_args < 1)
return -EINVAL;
- arg = &btf_params(func)[0];
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!t || !btf_type_is_ptr(t))
- return -EINVAL;
- t = btf_type_skip_modifiers(btf, t->type, NULL);
- if (!t || !__btf_type_is_struct(t))
- return -EINVAL;
-
- name = btf_name_by_offset(btf, t->name_off);
- if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
- return -EINVAL;
+ btf_id = btf_check_iter_arg(btf, func, 0);
+ if (btf_id < 0)
+ return btf_id;
/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
* fit nicely in stack slots
*/
+ t = btf_type_by_id(btf, btf_id);
if (t->size == 0 || (t->size % 8))
return -EINVAL;
/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
* naming pattern
*/
- iter_name = name + sizeof(ITER_PREFIX) - 1;
+ iter_name = btf_name_by_offset(btf, t->name_off) + sizeof(ITER_PREFIX) - 1;
if (flags & KF_ITER_NEW)
sfx = "new";
else if (flags & KF_ITER_NEXT)
case BPF_PROG_TYPE_STRUCT_OPS:
return BTF_KFUNC_HOOK_STRUCT_OPS;
case BPF_PROG_TYPE_TRACING:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_LSM:
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
case BPF_PROG_TYPE_SK_SKB:
struct bpf_core_cand_list cands = {};
struct bpf_core_relo_res targ_res;
struct bpf_core_spec *specs;
+ const struct btf_type *type;
int err;
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
if (!specs)
return -ENOMEM;
+ type = btf_type_by_id(ctx->btf, relo->type_id);
+ if (!type) {
+ bpf_log(ctx->log, "relo #%u: bad type id %u\n",
+ relo_idx, relo->type_id);
+ return -EINVAL;
+ }
+
if (need_cands) {
struct bpf_cand_cache *cc;
int i;
case BPF_KPTR_PERCPU:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
- btf_put(rec->fields[i].kptr.btf);
+ if (btf_is_kernel(rec->fields[i].kptr.btf))
+ btf_put(rec->fields[i].kptr.btf);
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
- btf_get(fields[i].kptr.btf);
+ if (btf_is_kernel(fields[i].kptr.btf))
+ btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
ret = -ENXIO;
goto free;
}
}
-/* called from workqueue */
-static void bpf_map_free_deferred(struct work_struct *work)
+static void bpf_map_free(struct bpf_map *map)
{
- struct bpf_map *map = container_of(work, struct bpf_map, work);
struct btf_record *rec = map->record;
struct btf *btf = map->btf;
- security_bpf_map_free(map);
- bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
/* Delay freeing of btf_record for maps, as map_free
btf_put(btf);
}
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ security_bpf_map_free(map);
+ bpf_map_release_memcg(map);
+ bpf_map_free(map);
+}
+
static void bpf_map_put_uref(struct bpf_map *map)
{
if (atomic64_dec_and_test(&map->usercnt)) {
static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
{
- fmode_t mode = f.file->f_mode;
+ fmode_t mode = fd_file(f)->f_mode;
/* Our file permissions may have been overridden by global
* map permissions facing syscall side.
free_map_sec:
security_bpf_map_free(map);
free_map:
- btf_put(map->btf);
- map->ops->map_free(map);
+ bpf_map_free(map);
put_token:
bpf_token_put(token);
return err;
*/
struct bpf_map *__bpf_map_get(struct fd f)
{
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_map_fops) {
+ if (fd_file(f)->f_op != &bpf_map_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- return f.file->private_data;
+ return fd_file(f)->private_data;
}
void bpf_map_inc(struct bpf_map *map)
goto free_key;
}
- err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+ err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
if (!err)
maybe_wait_bpf_programs(map);
static struct bpf_prog *____bpf_prog_get(struct fd f)
{
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_prog_fops) {
+ if (fd_file(f)->f_op != &bpf_prog_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- return f.file->private_data;
+ return fd_file(f)->private_data;
}
void bpf_prog_add(struct bpf_prog *prog, int i)
struct fd f = fdget(ufd);
struct bpf_link *link;
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) {
+ if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- link = f.file->private_data;
+ link = fd_file(f)->private_data;
bpf_link_inc(link);
fdput(f);
return -EINVAL;
f = fdget(ufd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADFD;
- if (f.file->f_op == &bpf_prog_fops)
- err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
+ if (fd_file(f)->f_op == &bpf_prog_fops)
+ err = bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &bpf_map_fops)
- err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
+ else if (fd_file(f)->f_op == &bpf_map_fops)
+ err = bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
uattr);
- else if (f.file->f_op == &btf_fops)
- err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
- else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll)
- err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
+ else if (fd_file(f)->f_op == &btf_fops)
+ err = bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
+ err = bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
attr, uattr);
else
err = -EINVAL;
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr);
else
BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
return bpf_token_create(attr);
}
-static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
+static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
int err;
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
+ *res = 0;
if (flags)
return -EINVAL;
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_LONG,
+ .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED,
+ .arg4_size = sizeof(u64),
};
static const struct bpf_func_proto *
return -EINVAL;
}
+struct cgroup_of_peak *of_peak(struct kernfs_open_file *of)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return &ctx->peak;
+}
+
static void apply_cgroup_root_flags(unsigned int root_flags)
{
if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
.fs_flags = FS_USERNS_MOUNT,
};
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct cgroup_subsys_state *css;
+ int dying_cnt[CGROUP_SUBSYS_COUNT];
+ int ssid;
seq_printf(seq, "nr_descendants %d\n",
cgroup->nr_descendants);
+
+ /*
+ * Show the number of live and dying csses associated with each of
+ * non-inhibited cgroup subsystems that is bound to cgroup v2.
+ *
+ * Without proper lock protection, racing is possible. So the
+ * numbers may not be consistent when that happens.
+ */
+ rcu_read_lock();
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ dying_cnt[ssid] = -1;
+ if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+ (cgroup_subsys[ssid]->root != &cgrp_dfl_root))
+ continue;
+ css = rcu_dereference_raw(cgroup->subsys[ssid]);
+ dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+ seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+ css ? (css->nr_descendants + 1) : 0);
+ }
+
seq_printf(seq, "nr_dying_descendants %d\n",
cgroup->nr_dying_descendants);
-
+ for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+ if (dying_cnt[ssid] >= 0)
+ seq_printf(seq, "nr_dying_subsys_%s %d\n",
+ cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+ }
+ rcu_read_unlock();
return 0;
}
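Given the seq_printf() formats above, a cgroup.stat read on a v2 hierarchy now gains per-subsystem live and dying css counts; roughly like the following, where the counts are placeholders and only controllers bound to the default hierarchy appear:

	nr_descendants 4
	nr_subsys_memory 5
	nr_subsys_pids 5
	nr_dying_descendants 1
	nr_dying_subsys_memory 1
	nr_dying_subsys_pids 0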
* If namespaces are delegation boundaries, disallow writes to
* files in a non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
- * cgroup.procs and cgroup.subtree_control.
+ * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @root are accessible and @pos is a descendant of @root.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct rightmost descendant as
- * long as @pos is accessible.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct rightmost descendant as long as @pos
+ * is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
- * section. This function will return the correct next descendant as long
- * as both @pos and @cgroup are accessible and @pos is a descendant of
- * @cgroup.
+ * section. Additionally, it isn't necessary to hold onto a reference to @pos.
+ * This function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
list_del_rcu(&css->sibling);
if (ss) {
+ struct cgroup *parent_cgrp;
+
/* css release path */
if (!list_empty(&css->rstat_css_node)) {
cgroup_rstat_flush(cgrp);
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
+
+ cgrp->nr_dying_subsys[ss->id]--;
+ /*
+ * When a css is released and ready to be freed, its
+ * nr_descendants must be zero. However, the corresponding
+ * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem
+ * is activated and deactivated multiple times with one or
+ * more of its previous activations leaving behind dying csses.
+ */
+ WARN_ON_ONCE(css->nr_descendants);
+ parent_cgrp = cgroup_parent(cgrp);
+ while (parent_cgrp) {
+ parent_cgrp->nr_dying_subsys[ss->id]--;
+ parent_cgrp = cgroup_parent(parent_cgrp);
+ }
} else {
struct cgroup *tcgrp;
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
atomic_inc(&css->online_cnt);
- if (css->parent)
+ if (css->parent) {
atomic_inc(&css->parent->online_cnt);
+ while ((css = css->parent))
+ css->nr_descendants++;
+ }
}
return ret;
}
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
wake_up_all(&css->cgroup->offline_waitq);
+
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ /*
+ * Parent css and cgroup cannot be freed until after the freeing
+ * of child css, see css_free_rwork_fn().
+ */
+ while ((css = css->parent)) {
+ css->nr_descendants--;
+ css->cgroup->nr_dying_subsys[ss->id]++;
+ }
}
/**
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
{
struct cgroup *cgrp;
struct fd f = fdget_raw(fd);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(f.file);
+ cgrp = cgroup_v1v2_get_from_file(fd_file(f));
fdput(f);
return cgrp;
}
return data.ret;
}
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x01,
+ EVENT_PINNED = 0x02,
+ EVENT_TIME = 0x04,
+ EVENT_FROZEN = 0x08,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x10,
+ EVENT_CGROUP = 0x20,
+
+ /* compound helpers */
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+ EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+ raw_spin_lock(&ctx->lock);
+ WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- raw_spin_lock(&cpuctx->ctx.lock);
+ __perf_ctx_lock(&cpuctx->ctx);
if (ctx)
- raw_spin_lock(&ctx->lock);
+ __perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx);
}
#define TASK_TOMBSTONE ((void *)-1L)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct perf_cpu_context *cpuctx;
struct event_function_struct efs = {
.event = event,
.func = func,
if (!task_function_call(task, event_function, &efs))
return;
- raw_spin_lock_irq(&ctx->lock);
+ local_irq_disable();
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
- if (task == TASK_TOMBSTONE) {
- raw_spin_unlock_irq(&ctx->lock);
- return;
- }
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
goto again;
}
func(event, NULL, ctx, data);
- raw_spin_unlock_irq(&ctx->lock);
+unlock:
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
}
/*
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_TIME = 0x4,
- /* see ctx_resched() for details */
- EVENT_CPU = 0x8,
- EVENT_CGROUP = 0x10,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
/*
* perf_sched_events : >0 events exist
*/
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
___p; \
})
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
- }
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
- }
}
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
struct fd f = fdget(fd);
int ret = 0;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_path.dentry,
+ css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
* perf_event_context::mutex
* perf_event::child_mutex;
* perf_event_context::lock
- * perf_event::mmap_mutex
* mmap_lock
+ * perf_event::mmap_mutex
+ * perf_buffer::aux_mutex
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
event = rb_entry_safe(rb_next(&event->group_node), \
typeof(*event), group_node))
+/*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
event_sched_out(event, ctx);
}
+static inline void
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, final);
+ }
+}
+
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ __ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ ctx_time_update(cpuctx, ctx);
+ if (ctx->is_active & EVENT_TIME)
+ ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+}
+
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
- }
+ ctx_time_update(cpuctx, ctx);
/*
* Ensure event_sched_out() switches to OFF, at the very least
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
-
perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_time_update_event(ctx, event);
if (event == event->group_leader)
group_sched_out(event, ctx);
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
- enum event_type_t event_type)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, event_type);
+ ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
if (task_ctx) {
- perf_ctx_disable(task_ctx, false);
- task_ctx_sched_out(task_ctx, event_type);
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
* - otherwise, do nothing more.
*/
if (cpu_event)
- ctx_sched_out(&cpuctx->ctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx);
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
- perf_ctx_enable(&cpuctx->ctx, false);
- if (task_ctx)
- perf_ctx_enable(task_ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+ }
}
void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
#endif
if (reprogram) {
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
event->state <= PERF_EVENT_STATE_ERROR)
return;
- if (ctx->is_active)
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
if (!ctx->is_active)
return;
- if (!event_filter_match(event)) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (!event_filter_match(event))
return;
- }
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
- }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
struct perf_event *event, *tmp;
struct pmu *pmu = pmu_ctx->pmu;
- if (ctx->task && !ctx->is_active) {
+ if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
struct perf_cpu_pmu_context *cpc;
cpc = this_cpu_ptr(pmu->cpu_pmu_context);
cpc->task_epc = NULL;
}
- if (!event_type)
+ if (!(event_type & EVENT_ALL))
return;
perf_pmu_disable(pmu);
perf_pmu_enable(pmu);
}
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
*
* would only update time for the pinned events.
*/
- if (is_active & EVENT_TIME) {
- /* update (and stop) ctx time */
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ ctx->is_active &= ~event_type;
+
+ if (!(ctx->is_active & EVENT_ALL)) {
/*
- * CPU-release for the below ->is_active store,
- * see __load_acquire() in perf_event_time_now()
+ * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+ * does not observe a hole. perf_ctx_unlock() will clean up.
*/
- barrier();
+ if (ctx->is_active & EVENT_FROZEN)
+ ctx->is_active &= EVENT_TIME_FROZEN;
+ else
+ ctx->is_active = 0;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
+ if (!(ctx->is_active & EVENT_ALL))
cpuctx->task_ctx = NULL;
}
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
- }
}
/*
perf_ctx_disable(ctx, false);
- /* PMIs are disabled; ctx->nr_pending is stable. */
- if (local_read(&ctx->nr_pending) ||
- local_read(&next_ctx->nr_pending)) {
+ /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
+ if (local_read(&ctx->nr_no_switch_fast) ||
+ local_read(&next_ctx->nr_no_switch_fast)) {
/*
* Must not swap out ctx when there's pending
* events that rely on the ctx->task relation.
+ *
+ * Likewise, when a context contains inherit +
+ * SAMPLE_READ events they should be switched
+ * out using the slow path so that they are
+ * treated as if they were distinct contexts.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock();
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
- task_ctx_sched_out(ctx, EVENT_ALL);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
merge_sched_in, &can_add_hw);
}
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
- struct perf_event_groups *groups,
- bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- struct perf_event_pmu_context *pmu_ctx;
-
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
- pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
- }
-}
+ struct perf_event_context *ctx = pmu_ctx->ctx;
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
- struct pmu *pmu)
-{
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
- if (!is_active)
+ if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
static void perf_event_context_sched_in(struct task_struct *task)
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx);
+ perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period);
- delta = (delta + 7) / 8; /* low pass filter */
+ if (delta >= 0)
+ delta += 7;
+ else
+ delta -= 7;
+ delta /= 8; /* low pass filter */
sample_period = hwc->sample_period + delta;
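A quick sanity check of the sign handling above: with delta = -3 the old expression computed (-3 + 7) / 8 = 0, so a small negative adjustment was silently dropped and the sample period never shrank, whereas the new form computes (-3 - 7) / 8 = -1 and the adjustment takes effect; positive deltas round exactly as before, e.g. (3 + 7) / 8 = 1 in both versions.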
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
- __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
- __pmu_ctx_sched_in(task_epc->ctx, pmu);
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
- } else {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
int ret;
};
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
+
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
+ int local_cpu = smp_processor_id();
u16 local_pkg, event_pkg;
if ((unsigned)event_cpu >= nr_cpu_ids)
return event_cpu;
- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
- int local_cpu = smp_processor_id();
+ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
+
+ if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
+ return local_cpu;
+ }
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
event_pkg = topology_physical_package_id(event_cpu);
local_pkg = topology_physical_package_id(local_cpu);
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (data->group)
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
{
+ if (self)
+ return local64_read(&event->count);
+
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (group)
*/
if (task_work_cancel(current, head)) {
event->pending_work = 0;
- local_dec(&event->ctx->nr_pending);
+ local_dec(&event->ctx->nr_no_switch_fast);
return;
}
mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false);
- total += perf_event_count(event);
+ total += perf_event_count(event, false);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
list_for_each_entry(child, &event->child_list, child_list) {
(void)perf_event_read(child, false);
- total += perf_event_count(child);
+ total += perf_event_count(child, false);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
/*
* Write {count,id} tuples for every sibling.
*/
- values[n++] += perf_event_count(leader);
+ values[n++] += perf_event_count(leader, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
- values[n++] += perf_event_count(sub);
+ values[n++] += perf_event_count(sub, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
static inline int perf_fget_light(int fd, struct fd *p)
{
struct fd f = fdget(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
- if (f.file->f_op != &perf_fops) {
+ if (fd_file(f)->f_op != &perf_fops) {
fdput(f);
return -EBADF;
}
ret = perf_fget_light(arg, &output);
if (ret)
return ret;
- output_event = output.file->private_data;
+ output_event = fd_file(output)->private_data;
ret = perf_event_set_output(event, output_event);
fdput(output);
} else {
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = perf_event_count(event);
+ userpg->offset = perf_event_count(event, false);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
event->pmu->event_unmapped(event, vma->vm_mm);
/*
- * rb->aux_mmap_count will always drop before rb->mmap_count and
- * event->mmap_count, so it is ok to use event->mmap_mutex to
- * serialize with perf_mmap here.
+ * The AUX buffer is strictly a sub-buffer; serialize using aux_mutex
+ * to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
- atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
rb_free_aux(rb);
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
- mutex_unlock(&event->mmap_mutex);
+ mutex_unlock(&rb->aux_mutex);
}
if (atomic_dec_and_test(&rb->mmap_count))
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
+ struct mutex *aux_mutex = NULL;
struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
unsigned long vma_size;
if (!rb)
goto aux_unlock;
+ aux_mutex = &rb->aux_mutex;
+ mutex_lock(aux_mutex);
+
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
atomic_dec(&rb->mmap_count);
}
aux_unlock:
+ if (aux_mutex)
+ mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
/*
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
- local_dec(&event->ctx->nr_pending);
+ local_dec(&event->ctx->nr_no_switch_fast);
rcuwait_wake_up(&event->pending_work_wait);
}
rcu_read_unlock();
u64 values[5];
int n = 0;
- values[n++] = perf_event_count(event);
+ values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
}
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event,
- u64 enabled, u64 running)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
unsigned long flags;
u64 values[6];
int n = 0;
+ bool self = has_inherit_and_sample_read(&event->attr);
/*
* Disabling interrupts avoids all counter scheduling
(leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
- values[n++] = perf_event_count(leader);
+ values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
- values[n++] = perf_event_count(sub);
+ values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
* The problem is that it's both hard and excessively expensive to iterate the
* child list, not to mention that it's impossible to IPI the children running
* on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
if (atomic_read(&nr_build_id_events))
- build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+ build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
perf_iterate_sb(perf_event_mmap_output,
mmap_event,
ret = __perf_event_account_interrupt(event, throttle);
- if (event->prog && !bpf_overflow_handler(event, data, regs))
+ if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ !bpf_overflow_handler(event, data, regs))
return ret;
/*
if (!event->pending_work &&
!task_work_add(current, &event->pending_task, notify_mode)) {
event->pending_work = pending_id;
- local_inc(&event->ctx->nr_pending);
+ local_inc(&event->ctx->nr_no_switch_fast);
event->pending_addr = 0;
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
NULL,
};
if (n == 2 && !pmu->nr_addr_filters)
return 0;
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
return a->mode;
}
goto free_pdc;
}
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
if (type >= 0)
event_has_any_exclude_flag(event))
ret = -EINVAL;
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+ int cpu;
+
+ if (pmu_cpumask && cpumask) {
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ ret = -ENODEV;
+ else
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
+ } else {
+ ret = -ENODEV;
+ }
+ }
+
if (ret && event->destroy)
event->destroy(event);
}
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * We do not support PERF_SAMPLE_READ on inherited events unless
+ * PERF_SAMPLE_TID is also selected, which allows inherited events to
+ * collect per-thread samples.
* See perf_output_read().
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
goto err_ns;
if (!has_branch_stack(event))
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct fd group = {NULL, 0};
+ struct fd group = EMPTY_FD;
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
err = perf_fget_light(group_fd, &group);
if (err)
goto err_fd;
- group_leader = group.file->private_data;
+ group_leader = fd_file(group)->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
perf_event_read_event(child_event, task);
}
- child_val = perf_event_count(child_event);
+ child_val = perf_event_count(child_event, false);
/*
* Add back the child's count to the parent's count:
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
return &event->attr;
}
+int perf_allow_kernel(struct perf_event_attr *attr)
+{
+ if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
+ return -EACCES;
+
+ return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
+}
+EXPORT_SYMBOL_GPL(perf_allow_kernel);
+
/*
* Inherit an event from parent task to child task.
*
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
}
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
+ struct pmu *pmu;
+
+ cpumask_clear_cpu(cpu, perf_online_mask);
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before __perf_event_exit_context().
+ */
+ perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
- cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
#else
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ /*
+ * Early boot stage: the cpumasks haven't been set up yet.
+ * The perf_online_<domain>_masks include the first CPU of each domain,
+ * so unconditionally add the boot CPU to each of them.
+ */
+ if (cpumask_empty(perf_online_mask)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ goto end;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+end:
+ cpumask_set_cpu(cpu, perf_online_mask);
+}
+
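/*
 * Standalone illustrative sketch, not from the patch: the setup/clear helpers
 * above keep one representative online CPU per topology domain in each
 * perf_online_<scope> mask, migrating the representative on hot-unplug.
 * Hypothetical fixed topology (4 CPUs per package) and plain 64-bit bitmasks
 * stand in for the kernel's cpumask API.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t online;		/* all online CPUs */
static uint64_t scope_pkg;	/* one representative CPU per package */

static uint64_t pkg_mask(int cpu)
{
	return 0xfULL << (cpu & ~3);	/* CPUs 0-3: pkg 0, 4-7: pkg 1, ... */
}

static void cpu_online_event(int cpu)
{
	if (!(scope_pkg & pkg_mask(cpu)))	/* first CPU of its package */
		scope_pkg |= 1ULL << cpu;
	online |= 1ULL << cpu;
}

static void cpu_offline_event(int cpu)
{
	uint64_t siblings;

	online &= ~(1ULL << cpu);
	if (!(scope_pkg & (1ULL << cpu)))
		return;				/* not the representative */
	scope_pkg &= ~(1ULL << cpu);
	siblings = pkg_mask(cpu) & online;
	if (siblings)				/* migrate to any remaining sibling */
		scope_pkg |= siblings & -siblings;
}

int main(void)
{
	for (int cpu = 0; cpu < 8; cpu++)
		cpu_online_event(cpu);
	printf("scope_pkg=%#llx\n", (unsigned long long)scope_pkg);	/* 0x11 */
	cpu_offline_event(0);
	printf("scope_pkg=%#llx\n", (unsigned long long)scope_pkg);	/* 0x12 */
	return 0;
}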
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
- cpumask_set_cpu(cpu, perf_online_mask);
+ perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
struct idempotent *existing;
bool first;
- u->ret = 0;
+ u->ret = -EINTR;
u->cookie = cookie;
init_completion(&u->complete);
hlist_for_each_entry_safe(pos, next, head, entry) {
if (pos->cookie != cookie)
continue;
- hlist_del(&pos->entry);
+ hlist_del_init(&pos->entry);
pos->ret = ret;
complete(&pos->complete);
}
return ret;
}
+/*
+ * Wait for the idempotent worker.
+ *
+ * If we get interrupted, we need to remove ourselves from the
+ * idempotent list, and the completion may still come in.
+ *
+ * The 'idem_lock' protects against the race, and 'idem.ret' was
+ * initialized to -EINTR and is thus always the right return
+ * value even if the idempotent work then completes between
+ * the wait_for_completion and the cleanup.
+ */
+static int idempotent_wait_for_completion(struct idempotent *u)
+{
+ if (wait_for_completion_interruptible(&u->complete)) {
+ spin_lock(&idem_lock);
+ if (!hlist_unhashed(&u->entry))
+ hlist_del(&u->entry);
+ spin_unlock(&idem_lock);
+ }
+ return u->ret;
+}
+
static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
{
struct load_info info = { };
if (!f || !(f->f_mode & FMODE_READ))
return -EBADF;
- /* See if somebody else is doing the operation? */
- if (idempotent(&idem, file_inode(f))) {
- wait_for_completion(&idem.complete);
- return idem.ret;
+ /* Are we the winners of the race and get to do this? */
+ if (!idempotent(&idem, file_inode(f))) {
+ int ret = init_module_from_file(f, uargs, flags);
+ return idempotent_complete(&idem, ret);
}
- /* Otherwise, we'll do it and complete others */
- return idempotent_complete(&idem,
- init_module_from_file(f, uargs, flags));
+ /*
+ * Somebody else won the race and is loading the module.
+ */
+ return idempotent_wait_for_completion(&idem);
}
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
return -EINVAL;
f = fdget(fd);
- err = idempotent_init_module(f.file, uargs, flags);
+ err = idempotent_init_module(fd_file(f), uargs, flags);
fdput(f);
return err;
}
}
/*
- * Dequeue a signal and return the element to the caller, which is
- * expected to free it.
- *
- * All callers have to hold the siglock.
+ * Try to dequeue a signal. If a deliverable signal is found, fill in the
+ * caller-provided siginfo and return the signal number. Otherwise return
+ * 0.
*/
-int dequeue_signal(struct task_struct *tsk, sigset_t *mask,
- kernel_siginfo_t *info, enum pid_type *type)
+int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
{
+ struct task_struct *tsk = current;
bool resched_timer = false;
int signr;
- /* We only dequeue private signals from ourselves, we don't let
- * signalfd steal them
- */
+ lockdep_assert_held(&tsk->sighand->siglock);
+
*type = PIDTYPE_PID;
signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
void sigqueue_free(struct sigqueue *q)
{
- unsigned long flags;
spinlock_t *lock = ¤t->sighand->siglock;
+ unsigned long flags;
- BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+ if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
+ return;
/*
* We must hold ->siglock while testing q->list
* to serialize with collect_signal() or with
unsigned long flags;
int ret, result;
- BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+ if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC)))
+ return 0;
+ if (WARN_ON_ONCE(q->info.si_code != SI_TIMER))
+ return 0;
ret = -1;
rcu_read_lock();
* If an SI_TIMER entry is already queued, just increment
* the overrun count.
*/
- BUG_ON(q->info.si_code != SI_TIMER);
q->info.si_overrun++;
result = TRACE_SIGNAL_ALREADY_PENDING;
goto out;
type = PIDTYPE_PID;
signr = dequeue_synchronous_signal(&ksig->info);
if (!signr)
- signr = dequeue_signal(current, ¤t->blocked,
- &ksig->info, &type);
+ signr = dequeue_signal(¤t->blocked, &ksig->info, &type);
if (!signr)
break; /* will return 0 */
current->flags |= PF_SIGNALED;
if (sig_kernel_coredump(signr)) {
+ int ret;
+
if (print_fatal_signals)
print_fatal_signal(signr);
proc_coredump_connector(current);
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(&ksig->info);
+ ret = do_coredump(&ksig->info);
+ if (ret)
+ coredump_report_failure("coredump has not been created, error %d",
+ ret);
+ else if (!IS_ENABLED(CONFIG_COREDUMP)) {
+ /*
+ * Coredump support is compiled out, so collecting a coredump
+ * cannot fail here.
+ *
+ * Still leave a note that the coredump will not be created. This
+ * is neither an error nor a warning: disabling coredump support
+ * is a deliberate configuration choice, so simply let the user
+ * know that everything works as configured.
+ */
+ coredump_report("no coredump collected as "
+ "that is disabled in the kernel configuration");
+ }
}
/*
signotset(&mask);
spin_lock_irq(&tsk->sighand->siglock);
- sig = dequeue_signal(tsk, &mask, info, &type);
+ sig = dequeue_signal(&mask, info, &type);
if (!sig && timeout) {
/*
* None ready, temporarily unblock those we're interested
spin_lock_irq(&tsk->sighand->siglock);
__set_task_blocked(tsk, &tsk->real_blocked);
sigemptyset(&tsk->real_blocked);
- sig = dequeue_signal(tsk, &mask, info, &type);
+ sig = dequeue_signal(&mask, info, &type);
}
spin_unlock_irq(&tsk->sighand->siglock);
return -EINVAL;
f = fdget(pidfd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
/* Is this a pidfd? */
- pid = pidfd_to_pid(f.file);
+ pid = pidfd_to_pid(fd_file(f));
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto err;
switch (flags) {
case 0:
/* Infer scope from the type of pidfd. */
- if (f.file->f_flags & PIDFD_THREAD)
+ if (fd_file(f)->f_flags & PIDFD_THREAD)
type = PIDTYPE_PID;
else
type = PIDTYPE_TGID;
int err;
exe = fdget(fd);
- if (!exe.file)
+ if (!fd_file(exe))
return -EBADF;
- inode = file_inode(exe.file);
+ inode = file_inode(fd_file(exe));
/*
* Because the original mm->exe_file points to executable file, make
* overall picture.
*/
err = -EACCES;
- if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
+ if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path))
goto exit;
- err = file_permission(exe.file, MAY_EXEC);
+ err = file_permission(fd_file(exe), MAY_EXEC);
if (err)
goto exit;
- err = replace_mm_exe_file(mm, exe.file);
+ err = replace_mm_exe_file(mm, fd_file(exe));
exit:
fdput(exe);
return err;
error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
+ if (rt_or_dl_task_policy(current))
+ break;
if (arg2 <= 0)
current->timer_slack_ns =
current->default_timer_slack_ns;
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
+#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
- * ->lruvec->lru_lock (follow_page->mark_page_accessed)
- * ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
+ * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed)
+ * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru)
* ->private_lock (folio_remove_rmap_pte->set_page_dirty)
* ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
* bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
struct folio *folio = fbatch.folios[i];
folio_wait_writeback(folio);
- folio_clear_error(folio);
}
folio_batch_release(&fbatch);
cond_resched();
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
+ folio);
mapping_set_update(&xas, mapping);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
folio_wait_stable(folio);
no_page:
if (!folio && (fgp_flags & FGP_CREAT)) {
- unsigned order = FGF_GET_ORDER(fgp_flags);
+ unsigned int min_order = mapping_min_folio_order(mapping);
+ unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
int err;
+ index = mapping_align_index(mapping, index);
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp |= __GFP_WRITE;
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
- if (!mapping_large_folio_support(mapping))
- order = 0;
- if (order > MAX_PAGECACHE_ORDER)
- order = MAX_PAGECACHE_ORDER;
+ if (order > mapping_max_folio_order(mapping))
+ order = mapping_max_folio_order(mapping);
/* If we're not aligned, allocate a smaller folio */
if (index & ((1UL << order) - 1))
order = __ffs(index);
gfp_t alloc_gfp = gfp;
err = -ENOMEM;
- if (order > 0)
+ if (order > min_order)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
folio = filemap_alloc_folio(alloc_gfp, order);
if (!folio)
break;
folio_put(folio);
folio = NULL;
- } while (order-- > 0);
+ } while (order-- > min_order);
if (err == -EEXIST)
goto repeat;
if (!folio_batch_add(fbatch, folio))
break;
}
- rcu_read_unlock();
if (folio_batch_count(fbatch)) {
- unsigned long nr = 1;
+ unsigned long nr;
int idx = folio_batch_count(fbatch) - 1;
folio = fbatch->folios[idx];
if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
- *start = indices[idx] + nr;
+ else
+ nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);
+ *start = round_down(indices[idx] + nr, nr);
}
+ rcu_read_unlock();
+
return folio_batch_count(fbatch);
}
rcu_read_lock();
while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
+ unsigned long base;
+ unsigned long nr;
+
if (!xa_is_value(folio)) {
- if (folio->index < *start)
+ nr = folio_nr_pages(folio);
+ base = folio->index;
+ /* Omit large folio which begins before the start */
+ if (base < *start)
goto put;
- if (folio_next_index(folio) - 1 > end)
+ /* Omit large folio which extends beyond the end */
+ if (base + nr - 1 > end)
goto put;
if (!folio_trylock(folio))
goto put;
goto unlock;
VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
folio);
+ } else {
+ nr = 1 << xas_get_order(&xas);
+ base = xas.xa_index & ~(nr - 1);
+ /* Omit order>0 value which begins before the start */
+ if (base < *start)
+ continue;
+ /* Omit order>0 value which extends beyond the end */
+ if (base + nr - 1 > end)
+ break;
}
+
+ /* Update start now so that last update is correct on return */
+ *start = base + nr;
indices[fbatch->nr] = xas.xa_index;
if (!folio_batch_add(fbatch, folio))
break;
}
rcu_read_unlock();
- if (folio_batch_count(fbatch)) {
- unsigned long nr = 1;
- int idx = folio_batch_count(fbatch) - 1;
-
- folio = fbatch->folios[idx];
- if (!xa_is_value(folio))
- nr = folio_nr_pages(folio);
- *start = indices[idx] + nr;
- }
return folio_batch_count(fbatch);
}
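/*
 * Standalone illustrative sketch, not from the patch: how the batched lookups
 * above derive the range covered by a multi-order entry from an arbitrary
 * index inside it.  Plain C, with "order" standing in for xas_get_order().
 */
#include <stdio.h>

int main(void)
{
	unsigned long index = 7;		/* index the XArray walk stopped at */
	unsigned int order = 2;			/* entry spans 1 << order slots */
	unsigned long nr = 1UL << order;
	unsigned long base = index & ~(nr - 1);	/* first slot of the entry */

	/* The entry covers [base, base + nr - 1]; resume the scan after it. */
	printf("entry covers [%lu, %lu], next *start = %lu\n",
	       base, base + nr - 1, base + nr);
	return 0;
}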
unsigned long pflags;
int error;
- /*
- * A previous I/O error may have been due to temporary failures,
- * eg. multipath errors. PG_error will be set again if read_folio
- * fails.
- */
- folio_clear_error(folio);
-
/* Start the actual read. The read will unlock the page. */
if (unlikely(workingset))
psi_memstall_enter(&pflags);
}
static int filemap_create_folio(struct file *file,
- struct address_space *mapping, pgoff_t index,
+ struct address_space *mapping, loff_t pos,
struct folio_batch *fbatch)
{
struct folio *folio;
int error;
+ unsigned int min_order = mapping_min_folio_order(mapping);
+ pgoff_t index;
- folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
+ folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
if (!folio)
return -ENOMEM;
* well to keep locking rules simple.
*/
filemap_invalidate_lock_shared(mapping);
+ index = (pos >> (PAGE_SHIFT + min_order)) << min_order;
error = filemap_add_folio(mapping, folio, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct folio *folio;
+ unsigned int flags;
int err = 0;
/* "last_index" is the index of the page beyond the end of the read */
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags = memalloc_noio_save();
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ memalloc_noio_restore(flags);
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
- err = filemap_create_folio(filp, mapping,
- iocb->ki_pos >> PAGE_SHIFT, fbatch);
+ err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
goto err;
}
+ trace_mm_filemap_get_pages(mapping, index, last_index - 1);
return 0;
err:
if (err < 0)
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
-int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+int filemap_invalidate_pages(struct address_space *mapping,
+ loff_t pos, loff_t end, bool nowait)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- loff_t pos = iocb->ki_pos;
- loff_t end = pos + count - 1;
int ret;
- if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (nowait) {
/* we could block if there are any pages in the range */
if (filemap_range_has_page(mapping, pos, end))
return -EAGAIN;
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
+
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ return filemap_invalidate_pages(mapping, iocb->ki_pos,
+ iocb->ki_pos + count - 1,
+ iocb->ki_flags & IOCB_NOWAIT);
+}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
/**
static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
{
if (xa_is_value(folio))
- return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
+ return PAGE_SIZE << xas_get_order(xas);
return folio_size(folio);
}
if (unlikely(index >= max_idx))
return VM_FAULT_SIGBUS;
+ trace_mm_filemap_fault(mapping, index);
+
/*
* Do we have something in the page cache already?
*/
struct vm_area_struct *vma = vmf->vma;
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
- pgoff_t last_pgoff = start_pgoff;
+ pgoff_t file_end, last_pgoff = start_pgoff;
unsigned long addr;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct folio *folio;
goto out;
}
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
+ if (end_pgoff > file_end)
+ end_pgoff = file_end;
+
folio_type = mm_counter_file(folio);
do {
unsigned long end;
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
add_mm_counter(vma->vm_mm, folio_type, rss);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
out:
rcu_read_unlock();
repeat:
folio = filemap_get_folio(mapping, index);
if (IS_ERR(folio)) {
- folio = filemap_alloc_folio(gfp, 0);
+ folio = filemap_alloc_folio(gfp,
+ mapping_min_folio_order(mapping));
if (!folio)
return ERR_PTR(-ENOMEM);
+ index = mapping_align_index(mapping, index);
err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
folio_put(folio);
ssize_t written = 0;
do {
- struct page *page;
struct folio *folio;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
}
status = a_ops->write_begin(file, mapping, pos, bytes,
- &page, &fsdata);
+ &folio, &fsdata);
if (unlikely(status < 0))
break;
- folio = page_folio(page);
offset = offset_in_folio(folio, pos);
if (bytes > folio_size(folio) - offset)
bytes = folio_size(folio) - offset;
flush_dcache_folio(folio);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
- page, fsdata);
+ folio, fsdata);
if (unlikely(status != copied)) {
iov_iter_revert(i, copied - max(status, 0L));
if (unlikely(status < 0))
}
/* Wait for writeback to complete on all folios and discard. */
- truncate_inode_pages_range(mapping, start, end);
+ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
unlock:
filemap_invalidate_unlock(mapping);
if (xas_retry(&xas, folio))
continue;
- order = xa_get_order(xas.xa, xas.xa_index);
+ order = xas_get_order(&xas);
nr_pages = 1 << order;
folio_first_index = round_down(xas.xa_index, 1 << order);
folio_last_index = folio_first_index + nr_pages - 1;
struct cachestat cs;
pgoff_t first_index, last_index;
- if (!f.file)
+ if (!fd_file(f))
return -EBADF;
if (copy_from_user(&csr, cstat_range,
}
/* hugetlbfs is not supported */
- if (is_file_hugepages(f.file)) {
+ if (is_file_hugepages(fd_file(f))) {
fdput(f);
return -EOPNOTSUPP;
}
last_index =
csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
memset(&cs, 0, sizeof(struct cachestat));
- mapping = f.file->f_mapping;
+ mapping = fd_file(f)->f_mapping;
filemap_cachestat(mapping, first_index, last_index, &cs);
fdput(f);
return folio_file_page(folio, index);
}
+static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
+static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
+
/**
* mem_cgroup_move_account - move account of the folio
* @folio: The folio.
nid = folio_nid(folio);
local_irq_disable();
- mem_cgroup_charge_statistics(to, nr_pages);
+ memcg1_charge_statistics(to, nr_pages);
memcg1_check_events(to, nid);
- mem_cgroup_charge_statistics(from, -nr_pages);
+ memcg1_charge_statistics(from, -nr_pages);
memcg1_check_events(from, nid);
local_irq_enable();
out:
}
}
+/* Cgroup1: threshold notifications & softlimit tree updates */
+struct memcg1_events_percpu {
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
+{
+ /* pagein of a big page is an event. So, ignore page size */
+ if (nr_pages > 0)
+ __count_memcg_events(memcg, PGPGIN, 1);
+ else {
+ __count_memcg_events(memcg, PGPGOUT, 1);
+ nr_pages = -nr_pages; /* for event */
+ }
+
+ __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
+}
+
+#define THRESHOLDS_EVENTS_TARGET 128
+#define SOFTLIMIT_EVENTS_TARGET 1024
+
+static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_target target)
+{
+ unsigned long val, next;
+
+ val = __this_cpu_read(memcg->events_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->events_percpu->targets[target]);
+ /* from time_after() in jiffies.h */
+ if ((long)(next - val) < 0) {
+ switch (target) {
+ case MEM_CGROUP_TARGET_THRESH:
+ next = val + THRESHOLDS_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
+ default:
+ break;
+ }
+ __this_cpu_write(memcg->events_percpu->targets[target], next);
+ return true;
+ }
+ return false;
+}
+
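/*
 * Standalone illustrative sketch, not from the patch: the "(long)(next - val) < 0"
 * test above is the classic time_after()-style comparison; it keeps working when
 * the per-CPU event counter wraps around, unlike a plain "val > next".
 */
#include <stdio.h>

static int target_passed(unsigned long val, unsigned long next)
{
	return (long)(next - val) < 0;
}

int main(void)
{
	unsigned long next = (unsigned long)-10;	/* target set just before wrap */
	unsigned long val = 5;				/* counter has since wrapped */

	printf("plain compare: %d, wrap-safe compare: %d\n",
	       val > next, target_passed(val, next));	/* prints 0 and 1 */
	return 0;
}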
/*
* Check events in order.
*
*/
-void memcg1_check_events(struct mem_cgroup *memcg, int nid)
+static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
return;
/* threshold event is triggered in finer grain than soft limit */
- if (unlikely(mem_cgroup_event_ratelimit(memcg,
+ if (unlikely(memcg1_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
- do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ do_softlimit = memcg1_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
}
}
+void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ memcg1_charge_statistics(memcg, folio_nr_pages(folio));
+ memcg1_check_events(memcg, folio_nid(folio));
+ local_irq_restore(flags);
+}
+
+void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
+{
+ /*
+ * Interrupts should be disabled here because the caller holds the
+ * i_pages lock which is taken with interrupts-off. It is
+ * important here to have the interrupts disabled because it is the
+ * only synchronisation we have for updating the per-CPU variables.
+ */
+ preempt_disable_nested();
+ VM_WARN_ON_IRQS_ENABLED();
+ memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
+ preempt_enable_nested();
+ memcg1_check_events(memcg, folio_nid(folio));
+}
+
+void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+ unsigned long nr_memory, int nid)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __count_memcg_events(memcg, PGPGOUT, pgpgout);
+ __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
+ memcg1_check_events(memcg, nid);
+ local_irq_restore(flags);
+}
+
static int compare_thresholds(const void *a, const void *b)
{
const struct mem_cgroup_threshold *_a = a;
buf = endp + 1;
cfd = simple_strtoul(buf, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
+ if (*endp == '\0')
+ buf = endp;
+ else if (*endp == ' ')
+ buf = endp + 1;
+ else
return -EINVAL;
- buf = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
INIT_WORK(&event->remove, memcg_event_remove);
efile = fdget(efd);
- if (!efile.file) {
+ if (!fd_file(efile)) {
ret = -EBADF;
goto out_kfree;
}
- event->eventfd = eventfd_ctx_fileget(efile.file);
+ event->eventfd = eventfd_ctx_fileget(fd_file(efile));
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
cfile = fdget(cfd);
- if (!cfile.file) {
+ if (!fd_file(cfile)) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = file_permission(cfile.file, MAY_READ);
+ ret = file_permission(fd_file(cfile), MAY_READ);
if (ret < 0)
goto out_put_cfile;
* The control file must be a regular cgroup1 file. As a regular cgroup
* file can't be renamed, it's safe to access its name afterwards.
*/
- cdentry = cfile.file->f_path.dentry;
+ cdentry = fd_file(cfile)->f_path.dentry;
if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
ret = -EINVAL;
goto out_put_cfile;
event->register_event = mem_cgroup_usage_register_event;
event->unregister_event = mem_cgroup_usage_unregister_event;
} else if (!strcmp(name, "memory.oom_control")) {
+ pr_warn_once("oom_control is deprecated and will be removed. "
+ " if you depend on this functionality. \n");
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} else if (!strcmp(name, "memory.pressure_level")) {
+ pr_warn_once("pressure_level is deprecated and will be removed. "
+ "if you depend on this functionality. \n");
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
if (ret)
goto out_put_css;
- vfs_poll(efile.file, &event->pt);
+ vfs_poll(fd_file(efile), &event->pt);
spin_lock_irq(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
ret = 0;
break;
case _TCP:
+ pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
+ "depend on this functionality.\n");
ret = memcg_update_tcp_max(memcg, nr_pages);
break;
}
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
ret = -EOPNOTSUPP;
} else {
+ pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
+ "depend on this functionality.\n");
WRITE_ONCE(memcg->soft_limit, nr_pages);
ret = 0;
}
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("oom_control is deprecated and will be removed. "
+ "depend on this functionality. \n");
+
/* cannot set to root cgroup and only 0 and 1 are allowed */
if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
return -EINVAL;
return false;
}
+bool memcg1_alloc_events(struct mem_cgroup *memcg)
+{
+ memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
+ GFP_KERNEL_ACCOUNT);
+ return !!memcg->events_percpu;
+}
+
+void memcg1_free_events(struct mem_cgroup *memcg)
+{
+ if (memcg->events_percpu)
+ free_percpu(memcg->events_percpu);
+}
+
static int __init memcg1_init(void)
{
int node;
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct address_space *mapping = ractl->mapping;
- unsigned long index = readahead_index(ractl);
+ unsigned long ra_folio_index, index = readahead_index(ractl);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- unsigned long i;
+ unsigned long mark, i = 0;
+ unsigned int min_nrpages = mapping_min_folio_nrpages(mapping);
/*
* Partway through the readahead operation, we will have added
unsigned int nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
+ index = mapping_align_index(mapping, index);
+
+ /*
+ * As iterator `i` is aligned to min_nrpages, round_up the
+ * difference between nr_to_read and lookahead_size to mark the
+ * index that only has lookahead or "async_region" to set the
+ * readahead flag.
+ */
+ ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size,
+ min_nrpages);
+ mark = ra_folio_index - index;
+ nr_to_read += readahead_index(ractl) - index;
+ ractl->_index = index;
+
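/*
 * Standalone illustrative sketch, not from the patch: the index/mark arithmetic
 * above, with hypothetical example numbers (min_nrpages = 4, i.e. minimum folio
 * order 2).  The round_up/round_down macros below behave like the kernel
 * helpers for power-of-two steps.
 */
#include <stdio.h>

#define round_down(x, y)	((x) & ~((y) - 1))
#define round_up(x, y)		round_down((x) + (y) - 1, (y))

int main(void)
{
	unsigned long min_nrpages = 4;
	unsigned long orig_index = 6, nr_to_read = 16, lookahead_size = 4;

	/* align the start of the read to a min-order folio boundary */
	unsigned long index = round_down(orig_index, min_nrpages);
	/* first index of the lookahead ("async") region, similarly aligned */
	unsigned long ra_folio_index =
		round_up(orig_index + nr_to_read - lookahead_size, min_nrpages);
	unsigned long mark = ra_folio_index - index;

	nr_to_read += orig_index - index;	/* grow to cover the alignment */

	printf("index=%lu nr_to_read=%lu mark=%lu (readahead flag on folio @%lu)\n",
	       index, nr_to_read, mark, index + mark);
	return 0;	/* prints index=4 nr_to_read=18 mark=16, flag on folio @20 */
}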
/*
* Preallocate as many pages as we will need.
*/
- for (i = 0; i < nr_to_read; i++) {
+ while (i < nr_to_read) {
struct folio *folio = xa_load(&mapping->i_pages, index + i);
int ret;
* not worth getting one just for that.
*/
read_pages(ractl);
- ractl->_index++;
- i = ractl->_index + ractl->_nr_pages - index - 1;
+ ractl->_index += min_nrpages;
+ i = ractl->_index + ractl->_nr_pages - index;
continue;
}
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask,
+ mapping_min_folio_order(mapping));
if (!folio)
break;
if (ret == -ENOMEM)
break;
read_pages(ractl);
- ractl->_index++;
- i = ractl->_index + ractl->_nr_pages - index - 1;
+ ractl->_index += min_nrpages;
+ i = ractl->_index + ractl->_nr_pages - index;
continue;
}
- if (i == nr_to_read - lookahead_size)
+ if (i == mark)
folio_set_readahead(folio);
ractl->_workingset |= folio_test_workingset(folio);
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
+ i += min_nrpages;
}
/*
struct address_space *mapping = ractl->mapping;
pgoff_t start = readahead_index(ractl);
pgoff_t index = start;
+ unsigned int min_order = mapping_min_folio_order(mapping);
pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
pgoff_t mark = index + ra->size - ra->async_size;
unsigned int nofs;
int err = 0;
gfp_t gfp = readahead_gfp_mask(mapping);
+ unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping));
- if (!mapping_large_folio_support(mapping) || ra->size < 4)
+ /*
+ * Fall back when ra->size is below the minimum, since each folio
+ * must cover at least min_nrpages pages anyway.
+ */
+ if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size)
goto fallback;
limit = min(limit, index + ra->size - 1);
- if (new_order < MAX_PAGECACHE_ORDER)
+ if (new_order < mapping_max_folio_order(mapping))
new_order += 2;
- new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+ new_order = min(mapping_max_folio_order(mapping), new_order);
new_order = min_t(unsigned int, new_order, ilog2(ra->size));
+ new_order = max(new_order, min_order);
/* See comment in page_cache_ra_unbounded() */
nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
+ /*
+ * If new_order is greater than min_order and index is already
+ * aligned to new_order, this is a no-op: an index aligned to
+ * new_order is also aligned to min_order.
+ */
+ ractl->_index = mapping_align_index(mapping, index);
+ index = readahead_index(ractl);
+
while (index <= limit) {
unsigned int order = new_order;
if (index & ((1UL << order) - 1))
order = __ffs(index);
/* Don't allocate pages past EOF */
- while (index + (1UL << order) - 1 > limit)
+ while (order > min_order && index + (1UL << order) - 1 > limit)
order--;
err = ra_alloc_folio(ractl, index, mark, order, gfp);
if (err)
ret = -EBADF;
f = fdget(fd);
- if (!f.file || !(f.file->f_mode & FMODE_READ))
+ if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
goto out;
/*
* on this file, then we must return -EINVAL.
*/
ret = -EINVAL;
- if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
- (!S_ISREG(file_inode(f.file)->i_mode) &&
- !S_ISBLK(file_inode(f.file)->i_mode)))
+ if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
+ (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
+ !S_ISBLK(file_inode(fd_file(f))->i_mode)))
goto out;
- ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
+ ret = vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
out:
fdput(f);
return ret;
struct file_ra_state *ra = ractl->ra;
pgoff_t new_index, new_nr_pages;
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ unsigned long min_nrpages = mapping_min_folio_nrpages(mapping);
+ unsigned int min_order = mapping_min_folio_order(mapping);
new_index = new_start / PAGE_SIZE;
+ /*
+ * Readahead code should have aligned the ractl->_index to
+ * min_nrpages before calling readahead aops.
+ */
+ VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages));
/* Expand the leading edge downwards */
while (ractl->_index > new_index) {
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask, min_order);
if (!folio)
return;
+
+ index = mapping_align_index(mapping, index);
if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
folio_put(folio);
return;
ractl->_workingset = true;
psi_memstall_enter(&ractl->_pflags);
}
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
ractl->_index = folio->index;
}
if (folio && !xa_is_value(folio))
return; /* Folio apparently present */
- folio = filemap_alloc_folio(gfp_mask, 0);
+ folio = filemap_alloc_folio(gfp_mask, min_order);
if (!folio)
return;
+
+ index = mapping_align_index(mapping, index);
if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
folio_put(folio);
return;
ractl->_workingset = true;
psi_memstall_enter(&ractl->_pflags);
}
- ractl->_nr_pages++;
+ ractl->_nr_pages += min_nrpages;
if (ra) {
- ra->size++;
- ra->async_size++;
+ ra->size += min_nrpages;
+ ra->async_size += min_nrpages;
}
}
}
int err = -ENOMEM;
void *data = NULL;
- if (ops->id && ops->size) {
+ if (ops->id) {
data = kzalloc(ops->size, GFP_KERNEL);
if (!data)
goto out;
if (!err)
return 0;
- if (ops->id && ops->size) {
+ if (ops->id) {
ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&pernet_ops_rwsem));
ng->ptr[*ops->id] = NULL;
struct list_head *net_exit_list)
{
struct net *net;
- if (ops->size && ops->id) {
+
+ if (ops->id) {
list_for_each_entry(net, net_exit_list, exit_list)
kfree(net_generic(net, *ops->id));
}
}
EXPORT_SYMBOL_GPL(get_net_ns_by_id);
+static __net_init void preinit_net_sysctl(struct net *net)
+{
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ /* Limits per socket sk_omem_alloc usage.
+ * TCP zerocopy regular usage needs 128 KB.
+ */
+ net->core.sysctl_optmem_max = 128 * 1024;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+}
+
/* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net)
+static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
{
+ refcount_set(&net->passive, 1);
+ refcount_set(&net->ns.count, 1);
+ ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
+
+ get_random_bytes(&net->hash_mix, sizeof(u32));
+ net->dev_base_seq = 1;
+ net->user_ns = user_ns;
+
+ idr_init(&net->netns_ids);
+ spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
+ preinit_net_sysctl(net);
}
/*
* setup_net runs the initializers for the network namespace object.
*/
-static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int setup_net(struct net *net)
{
/* Must be called with pernet_ops_rwsem held */
const struct pernet_operations *ops, *saved_ops;
LIST_HEAD(dev_kill_list);
int error = 0;
- refcount_set(&net->ns.count, 1);
- ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
-
- refcount_set(&net->passive, 1);
- get_random_bytes(&net->hash_mix, sizeof(u32));
preempt_disable();
net->net_cookie = gen_cookie_next(&net_cookie);
preempt_enable();
- net->dev_base_seq = 1;
- net->user_ns = user_ns;
- idr_init(&net->netns_ids);
- spin_lock_init(&net->nsid_lock);
- mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
goto out;
}
-static int __net_init net_defaults_init_net(struct net *net)
-{
- net->core.sysctl_somaxconn = SOMAXCONN;
- /* Limits per socket sk_omem_alloc usage.
- * TCP zerocopy regular usage needs 128 KB.
- */
- net->core.sysctl_optmem_max = 128 * 1024;
- net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
-
- return 0;
-}
-
-static struct pernet_operations net_defaults_ops = {
- .init = net_defaults_init_net,
-};
-
-static __init int net_defaults_init(void)
-{
- if (register_pernet_subsys(&net_defaults_ops))
- panic("Cannot initialize net default settings");
-
- return 0;
-}
-
-core_initcall(net_defaults_init);
-
#ifdef CONFIG_NET_NS
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
{
goto dec_ucounts;
}
- preinit_net(net);
- refcount_set(&net->passive, 1);
+ preinit_net(net, user_ns);
net->ucounts = ucounts;
get_user_ns(user_ns);
if (rv < 0)
goto put_userns;
- rv = setup_net(net, user_ns);
+ rv = setup_net(net);
up_read(&pernet_ops_rwsem);
struct fd f = fdget(fd);
struct net *net = ERR_PTR(-EINVAL);
- if (!f.file)
+ if (!fd_file(f))
return ERR_PTR(-EBADF);
- if (proc_ns_file(f.file)) {
- struct ns_common *ns = get_proc_ns(file_inode(f.file));
+ if (proc_ns_file(fd_file(f))) {
+ struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops == &netns_operations)
net = get_net(container_of(ns, struct net, ns));
}
#ifdef CONFIG_KEYS
init_net.key_domain = &init_net_key_domain;
#endif
+ preinit_net(&init_net, &init_user_ns);
+
down_write(&pernet_ops_rwsem);
- preinit_net(&init_net);
- if (setup_net(&init_net, &init_user_ns))
+ if (setup_net(&init_net))
panic("Could not setup the initial network namespace");
init_net_initialized = true;
LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
- if (ops->init || (ops->id && ops->size)) {
+ if (ops->init || ops->id) {
/* We held write locked pernet_ops_rwsem, and parallel
* setup_net() and cleanup_net() are not possible.
*/
{
int error;
+ if (WARN_ON(!!ops->id ^ !!ops->size))
+ return -EINVAL;
+
if (ops->id) {
error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
GFP_KERNEL);
struct socket *sock;
*err = -EBADF;
- if (f.file) {
- sock = sock_from_file(f.file);
+ if (fd_file(f)) {
+ sock = sock_from_file(fd_file(f));
if (likely(sock)) {
- *fput_needed = f.flags & FDPUT_FPUT;
+ *fput_needed = f.word & FDPUT_FPUT;
return sock;
}
*err = -ENOTSOCK;
memset(&tss, 0, sizeof(tss));
tsflags = READ_ONCE(sk->sk_tsflags);
- if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
+ if ((tsflags & SOF_TIMESTAMPING_SOFTWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
+ skb_is_err_queue(skb) ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
empty = 0;
if (shhwtstamps &&
- (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
+ skb_is_err_queue(skb) ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
!skb_is_swtx_tstamp(skb, false_tstamp)) {
if_index = 0;
if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
struct fd f;
f = fdget(fd);
- if (f.file) {
- ret = __sys_accept4_file(f.file, upeer_sockaddr,
+ if (fd_file(f)) {
+ ret = __sys_accept4_file(fd_file(f), upeer_sockaddr,
upeer_addrlen, flags);
fdput(f);
}
struct fd f;
f = fdget(fd);
- if (f.file) {
+ if (fd_file(f)) {
struct sockaddr_storage address;
ret = move_addr_to_kernel(uservaddr, addrlen, &address);
if (!ret)
- ret = __sys_connect_file(f.file, &address, addrlen, 0);
+ ret = __sys_connect_file(fd_file(f), &address, addrlen, 0);
fdput(f);
}
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
int optname, sockptr_t optval, sockptr_t optlen)
{
- int max_optlen __maybe_unused;
+ int max_optlen __maybe_unused = 0;
const struct proto_ops *ops;
int err;
return err;
if (!compat)
- max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+ copy_from_sockptr(&max_optlen, optlen, sizeof(int));
ops = READ_ONCE(sock->ops);
if (level == SOL_SOCKET) {
return;
f = fdget(kernel_fd);
- if (!f.file)
+ if (!fd_file(f))
return;
- process_buffer_measurement(file_mnt_idmap(f.file), file_inode(f.file),
+ process_buffer_measurement(file_mnt_idmap(fd_file(f)), file_inode(fd_file(f)),
buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
NULL, false, NULL, 0);
fdput(f);
#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
LSM_HOOK_INIT(kernel_module_request, ima_kernel_module_request),
#endif
- LSM_HOOK_INIT(inode_free_security, ima_inode_free),
+ LSM_HOOK_INIT(inode_free_security_rcu, ima_inode_free_rcu),
};
static const struct lsm_id ima_lsmid = {
params = memdup_user(_params, sizeof(*params));
if (IS_ERR(params))
- return PTR_ERR(no_free_ptr(params));
+ return PTR_ERR(params);
err = snd_pcm_hw_refine(substream, params);
if (err < 0)
params = memdup_user(_params, sizeof(*params));
if (IS_ERR(params))
- return PTR_ERR(no_free_ptr(params));
+ return PTR_ERR(params);
err = snd_pcm_hw_params(substream, params);
if (err < 0)
bool nonatomic = substream->pcm->nonatomic;
CLASS(fd, f)(fd);
- if (!f.file)
+ if (!fd_file(f))
return -EBADFD;
- if (!is_pcm_file(f.file))
+ if (!is_pcm_file(fd_file(f)))
return -EBADFD;
- pcm_file = f.file->private_data;
+ pcm_file = fd_file(f)->private_data;
substream1 = pcm_file->substream;
if (substream == substream1)
return snd_interval_refine(hw_param_interval(params, rule->var), &t);
}
-#if SNDRV_PCM_RATE_5512 != 1 << 0 || SNDRV_PCM_RATE_192000 != 1 << 12
+#if SNDRV_PCM_RATE_5512 != 1 << 0 || SNDRV_PCM_RATE_192000 != 1 << 12 ||\
+ SNDRV_PCM_RATE_128000 != 1 << 19
#error "Change this table"
#endif
+/* NOTE: the list is unsorted! */
static const unsigned int rates[] = {
5512, 8000, 11025, 16000, 22050, 32000, 44100,
- 48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000
+ 48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000,
+ /* extended */
+ 12000, 24000, 128000
};
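/*
 * Standalone illustrative sketch, not from the patch: decoding an
 * SNDRV_PCM_RATE_* bitmask with the table above, assuming (as the #if guard
 * suggests) that bit i of the mask corresponds to rates[i], which is why the
 * newly added entries are appended in bit order rather than sorted by value.
 * The rates[] copy and the example mask bits are taken from this hunk.
 */
#include <stdio.h>

static const unsigned int rates[] = {
	5512, 8000, 11025, 16000, 22050, 32000, 44100,
	48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000,
	12000, 24000, 128000
};

int main(void)
{
	unsigned int mask = (1u << 7) | (1u << 19);	/* e.g. 48000 and 128000 */

	for (unsigned int i = 0; i < sizeof(rates) / sizeof(rates[0]); i++)
		if (mask & (1u << i))
			printf("%u Hz\n", rates[i]);
	return 0;
}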
const struct snd_pcm_hw_constraint_list snd_pcm_known_rates = {
bufs = memdup_user(xfern.bufs, sizeof(void *) * runtime->channels);
if (IS_ERR(bufs))
- return PTR_ERR(no_free_ptr(bufs));
+ return PTR_ERR(bufs);
if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
result = snd_pcm_lib_writev(substream, bufs, xfern.frames);
else
oparams = memdup_user(_oparams, sizeof(*oparams));
if (IS_ERR(oparams))
- return PTR_ERR(no_free_ptr(oparams));
+ return PTR_ERR(oparams);
snd_pcm_hw_convert_from_old_params(params, oparams);
err = snd_pcm_hw_refine(substream, params);
if (err < 0)
oparams = memdup_user(_oparams, sizeof(*oparams));
if (IS_ERR(oparams))
- return PTR_ERR(no_free_ptr(oparams));
+ return PTR_ERR(oparams);
snd_pcm_hw_convert_from_old_params(params, oparams);
err = snd_pcm_hw_params(substream, params);
mutex_lock(&kvm->irqfds.resampler_lock);
list_del_rcu(&irqfd->resampler_link);
- synchronize_srcu(&kvm->irq_srcu);
if (list_empty(&resampler->list)) {
list_del_rcu(&resampler->link);
kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
/*
- * synchronize_srcu(&kvm->irq_srcu) already called
+ * synchronize_srcu_expedited(&kvm->irq_srcu) already called
* in kvm_unregister_irq_ack_notifier().
*/
kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
resampler->notifier.gsi, 0, false);
kfree(resampler);
+ } else {
+ synchronize_srcu_expedited(&kvm->irq_srcu);
}
mutex_unlock(&kvm->irqfds.resampler_lock);
u64 cnt;
/* Make sure irqfd has been initialized in assign path. */
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
/*
* Synchronize with the wait-queue and unhook ourselves to prevent
seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);
f = fdget(args->fd);
- if (!f.file) {
+ if (!fd_file(f)) {
ret = -EBADF;
goto out;
}
- eventfd = eventfd_ctx_fileget(f.file);
+ eventfd = eventfd_ctx_fileget(fd_file(f));
if (IS_ERR(eventfd)) {
ret = PTR_ERR(eventfd);
goto fail;
}
list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
mutex_unlock(&kvm->irqfds.resampler_lock);
}
* Check if there was an event already pending on the eventfd
* before we registered, and trigger it as if we didn't miss it.
*/
- events = vfs_poll(f.file, &irqfd->pt);
+ events = vfs_poll(fd_file(f), &irqfd->pt);
if (events & EPOLLIN)
schedule_work(&irqfd->inject);
mutex_lock(&kvm->irq_lock);
hlist_del_init_rcu(&kian->link);
mutex_unlock(&kvm->irq_lock);
- synchronize_srcu(&kvm->irq_srcu);
+ synchronize_srcu_expedited(&kvm->irq_srcu);
kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
/*
* Take note of a change in irq routing.
- * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
+ * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
*/
void kvm_irq_routing_update(struct kvm *kvm)
{