Merge tag 'locking-urgent-2020-08-10' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <[email protected]>
Tue, 11 Aug 2020 02:07:44 +0000 (19:07 -0700)
committer Linus Torvalds <[email protected]>
Tue, 11 Aug 2020 02:07:44 +0000 (19:07 -0700)
Pull locking updates from Thomas Gleixner:
 "A set of locking fixes and updates:

   - Untangle the header spaghetti which causes build failures in
     various situations, caused by the lockdep additions to seqcount to
     validate that the write side critical sections are non-preemptible.

   - The seqcount associated lock debug add-ons which were blocked by
     the above fallout.

     seqcount writers, contrary to seqlock writers, must be externally
     serialized, which usually happens via locking - except for strict
     per-CPU seqcounts. As the lock is not part of the seqcount, lockdep
     cannot validate that the lock is held.

     This new debug mechanism adds the concept of associated locks.
     seqcount now has lock type variants and corresponding
     initializers which take a pointer to the associated lock used for
     writer serialization. If lockdep is enabled, the pointer is stored
     and write_seqcount_begin() has a lockdep assertion to validate that
     the lock is held (a usage sketch follows the quoted message).

     Aside from the type and the initializer, no other code changes are
     required at the seqcount usage sites. The rest of the seqcount API
     is unchanged and determines the type at compile time with the help
     of _Generic, which is possible now that the minimal GCC version has
     been raised.

     Adding this lockdep coverage unearthed a handful of seqcount bugs
     which have already been addressed independently of this.

     While generally useful, this comes with a Trojan Horse twist: On RT
     kernels the write side critical section can become preemptible if
     the writers are serialized by an associated lock, which leads to
     the well-known reader-preempts-writer livelock. RT prevents this by
     storing the associated lock pointer, independent of lockdep, in the
     seqcount and changing the reader side to block on the lock when a
     reader detects that a writer is in the write side critical section.

   - Conversion of seqcount usage sites to associated types and
     initializers"

* tag 'locking-urgent-2020-08-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (25 commits)
  locking/seqlock, headers: Untangle the spaghetti monster
  locking, arch/ia64: Reduce <asm/smp.h> header dependencies by moving XTP bits into the new <asm/xtp.h> header
  x86/headers: Remove APIC headers from <asm/smp.h>
  seqcount: More consistent seqprop names
  seqcount: Compress SEQCNT_LOCKNAME_ZERO()
  seqlock: Fold seqcount_LOCKNAME_init() definition
  seqlock: Fold seqcount_LOCKNAME_t definition
  seqlock: s/__SEQ_LOCKDEP/__SEQ_LOCK/g
  hrtimer: Use sequence counter with associated raw spinlock
  kvm/eventfd: Use sequence counter with associated spinlock
  userfaultfd: Use sequence counter with associated spinlock
  NFSv4: Use sequence counter with associated spinlock
  iocost: Use sequence counter with associated spinlock
  raid5: Use sequence counter with associated spinlock
  vfs: Use sequence counter with associated spinlock
  timekeeping: Use sequence counter with associated raw spinlock
  xfrm: policy: Use sequence counters with associated lock
  netfilter: nft_set_rbtree: Use sequence counter with associated rwlock
  netfilter: conntrack: Use sequence counter with associated spinlock
  sched: tasks: Use sequence counter with associated spinlock
  ...

24 files changed:
arch/ia64/kernel/process.c
arch/ia64/kernel/smp.c
arch/x86/include/asm/tsc.h
arch/x86/kernel/apic/apic.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/setup.c
arch/x86/mm/init_32.c
arch/x86/xen/smp_pv.c
block/blk-iocost.c
drivers/dma-buf/dma-resv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
drivers/iommu/intel/irq_remapping.c
drivers/md/raid5.c
drivers/md/raid5.h
fs/userfaultfd.c
include/linux/sched.h
include/net/netfilter/nf_conntrack.h
init/init_task.c
kernel/fork.c
kernel/time/timekeeping.c
net/netfilter/nf_conntrack_core.c
net/xfrm/xfrm_policy.c

diff --combined arch/ia64/kernel/process.c
index 36eba4ba6543986a91fa8f47da9163ceda4292d1,4562a1aed454a220b338d7de1758e9b2b2b29d95..f19cb97c009878672b2706dd0dd243eaad6a7d94
@@@ -40,6 -40,7 +40,6 @@@
  #include <asm/elf.h>
  #include <asm/irq.h>
  #include <asm/kexec.h>
 -#include <asm/pgalloc.h>
  #include <asm/processor.h>
  #include <asm/sal.h>
  #include <asm/switch_to.h>
@@@ -47,6 -48,7 +47,7 @@@
  #include <linux/uaccess.h>
  #include <asm/unwind.h>
  #include <asm/user.h>
+ #include <asm/xtp.h>
  
  #include "entry.h"
  
@@@ -295,7 -297,7 +296,7 @@@ ia64_load_extra (struct task_struct *ta
                pfm_load_regs(task);
  
        info = __this_cpu_read(pfm_syst_info);
 -      if (info & PFM_CPUINFO_SYST_WIDE) 
 +      if (info & PFM_CPUINFO_SYST_WIDE)
                pfm_syst_wide_update_task(task, info, 1);
  #endif
  }
   *
   *    <clone syscall>         <some kernel call frames>
   *    sys_clone                  :
 - *    do_fork                 do_fork
 + *    _do_fork                _do_fork
   *    copy_thread             copy_thread
   *
   * This means that the stack layout is as follows:
   * so there is nothing to worry about.
   */
  int
 -copy_thread(unsigned long clone_flags,
 -           unsigned long user_stack_base, unsigned long user_stack_size,
 -           struct task_struct *p)
 +copy_thread(unsigned long clone_flags, unsigned long user_stack_base,
 +          unsigned long user_stack_size, struct task_struct *p, unsigned long tls)
  {
        extern char ia64_ret_from_clone;
        struct switch_stack *child_stack, *stack;
        rbs_size = stack->ar_bspstore - rbs;
        memcpy((void *) child_rbs, (void *) rbs, rbs_size);
        if (clone_flags & CLONE_SETTLS)
 -              child_ptregs->r13 = regs->r16;  /* see sys_clone2() in entry.S */
 +              child_ptregs->r13 = tls;
        if (user_stack_base) {
                child_ptregs->r12 = user_stack_base + user_stack_size - 16;
                child_ptregs->ar_bspstore = user_stack_base;
        return retval;
  }
  
 +asmlinkage long ia64_clone(unsigned long clone_flags, unsigned long stack_start,
 +                         unsigned long stack_size, unsigned long parent_tidptr,
 +                         unsigned long child_tidptr, unsigned long tls)
 +{
 +      struct kernel_clone_args args = {
 +              .flags          = (lower_32_bits(clone_flags) & ~CSIGNAL),
 +              .pidfd          = (int __user *)parent_tidptr,
 +              .child_tid      = (int __user *)child_tidptr,
 +              .parent_tid     = (int __user *)parent_tidptr,
 +              .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
 +              .stack          = stack_start,
 +              .stack_size     = stack_size,
 +              .tls            = tls,
 +      };
 +
 +      return _do_fork(&args);
 +}
 +
  static void
  do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg)
  {
        unsigned long mask, sp, nat_bits = 0, ar_rnat, urbs_end, cfm;
 -      unsigned long uninitialized_var(ip);    /* GCC be quiet */
 +      unsigned long ip;
        elf_greg_t *dst = arg;
        struct pt_regs *pt;
        char nat;
        unw_get_ar(info, UNW_AR_SSD, &dst[56]);
  }
  
 -void
 -do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg)
 -{
 -      elf_fpreg_t *dst = arg;
 -      int i;
 -
 -      memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */
 -
 -      if (unw_unwind_to_user(info) < 0)
 -              return;
 -
 -      /* f0 is 0.0, f1 is 1.0 */
 -
 -      for (i = 2; i < 32; ++i)
 -              unw_get_fr(info, i, dst + i);
 -
 -      ia64_flush_fph(task);
 -      if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0)
 -              memcpy(dst + 32, task->thread.fph, 96*16);
 -}
 -
  void
  do_copy_regs (struct unw_frame_info *info, void *arg)
  {
        do_copy_task_regs(current, info, arg);
  }
  
 -void
 -do_dump_fpu (struct unw_frame_info *info, void *arg)
 -{
 -      do_dump_task_fpu(current, info, arg);
 -}
 -
  void
  ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst)
  {
        unw_init_running(do_copy_regs, dst);
  }
  
 -int
 -dump_fpu (struct pt_regs *pt, elf_fpregset_t dst)
 -{
 -      unw_init_running(do_dump_fpu, dst);
 -      return 1;       /* f0-f31 are always valid so we always return 1 */
 -}
 -
  /*
   * Flush thread state.  This is called when a thread does an execve().
   */
diff --combined arch/ia64/kernel/smp.c
index 0e27420031219afe141b20ae0714050c7870ca1e,1cf7b9b3c1e21544643da969c28e186d2376f14c..7b7b64eb312975c6e677683f68a5abbbd3206f87
  #include <asm/io.h>
  #include <asm/irq.h>
  #include <asm/page.h>
 -#include <asm/pgalloc.h>
  #include <asm/processor.h>
  #include <asm/ptrace.h>
  #include <asm/sal.h>
  #include <asm/tlbflush.h>
  #include <asm/unistd.h>
  #include <asm/mca.h>
+ #include <asm/xtp.h>
  
  /*
   * Note: alignment of 4 entries/cacheline was empirically determined
diff --combined arch/x86/include/asm/tsc.h
index b7b2624fba8666a70adc55f358d7d6edf18bb840,db5977174ce7442007d2c70bcafd8519075e49e4..01a300a9700b9bb4ac352fcd9c65593a8b762567
@@@ -6,7 -6,11 +6,8 @@@
  #define _ASM_X86_TSC_H
  
  #include <asm/processor.h>
+ #include <asm/cpufeature.h>
  
 -#define NS_SCALE      10 /* 2^10, carefully chosen */
 -#define US_SCALE      32 /* 2^32, arbitralrily chosen */
 -
  /*
   * Standard way to access the cycle counter.
   */
diff --combined arch/x86/kernel/apic/apic.c
index ccf726cc87b77fc5701a6cee166611aff3eab94b,0c89003e7f50f8fe58e30926006d4cb537efa037..5f943b93816759fdc2588d4f5bb5af978851384a
  #include <asm/irq_remapping.h>
  #include <asm/perf_event.h>
  #include <asm/x86_init.h>
 -#include <asm/pgalloc.h>
  #include <linux/atomic.h>
  #include <asm/mpspec.h>
  #include <asm/i8259.h>
  #include <asm/proto.h>
  #include <asm/traps.h>
  #include <asm/apic.h>
+ #include <asm/acpi.h>
  #include <asm/io_apic.h>
  #include <asm/desc.h>
  #include <asm/hpet.h>
diff --combined arch/x86/kernel/cpu/common.c
index 965474d78cef46ff98fe82d6f0cab4ec2370ebe7,52b565016eb1f3abe897597b730ebaf94b9888ad..c5d6f17d9b9d38d213524cd2842ee737c94881a7
@@@ -45,6 -45,7 +45,7 @@@
  #include <asm/mtrr.h>
  #include <asm/hwcap2.h>
  #include <linux/numa.h>
+ #include <asm/numa.h>
  #include <asm/asm.h>
  #include <asm/bugs.h>
  #include <asm/cpu.h>
@@@ -441,22 -442,6 +442,22 @@@ static void __init setup_cr_pinning(voi
        static_key_enable(&cr_pinning.key);
  }
  
 +static __init int x86_nofsgsbase_setup(char *arg)
 +{
 +      /* Require an exact match without trailing characters. */
 +      if (strlen(arg))
 +              return 0;
 +
 +      /* Do not emit a message if the feature is not present. */
 +      if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
 +              return 1;
 +
 +      setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
 +      pr_info("FSGSBASE disabled via kernel command line\n");
 +      return 1;
 +}
 +__setup("nofsgsbase", x86_nofsgsbase_setup);
 +
  /*
   * Protection Keys are not available in 32-bit mode.
   */
@@@ -1511,12 -1496,6 +1512,12 @@@ static void identify_cpu(struct cpuinfo
        setup_smap(c);
        setup_umip(c);
  
 +      /* Enable FSGSBASE instructions if available. */
 +      if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
 +              cr4_set_bits(X86_CR4_FSGSBASE);
 +              elf_hwcap2 |= HWCAP2_FSGSBASE;
 +      }
 +
        /*
         * The vendor-specific functions might have changed features.
         * Now we do "generic changes."
diff --combined arch/x86/kernel/cpu/intel.c
index b6b7b38dff5fd73bc7a9e9266f56c7d7ca457698,6eb42d7a3dfdfc6118fa4baf497a2ae9332f8031..59a1e3ce3f145cc7b7115431dd7c025cf6d55498
@@@ -23,6 -23,7 +23,7 @@@
  #include <asm/cmdline.h>
  #include <asm/traps.h>
  #include <asm/resctrl.h>
+ #include <asm/numa.h>
  
  #ifdef CONFIG_X86_64
  #include <linux/topology.h>
@@@ -1156,8 -1157,6 +1157,8 @@@ static const struct x86_cpu_id split_lo
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L,      1),
        X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,         1),
        X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,           1),
 +      X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,    1),
 +      X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,           1),
        {}
  };
  
diff --combined arch/x86/kernel/mpparse.c
index c27b82b62c8bda9734f1ad54752f8ac687ad0d69,db509e1134ceaaa7e28c561a0eed998a761f5ebf..411af4aa7b51f3de393c9efa04be95697838734b
  #include <linux/smp.h>
  #include <linux/pci.h>
  
+ #include <asm/io_apic.h>
+ #include <asm/acpi.h>
  #include <asm/irqdomain.h>
  #include <asm/mtrr.h>
  #include <asm/mpspec.h>
 -#include <asm/pgalloc.h>
  #include <asm/io_apic.h>
  #include <asm/proto.h>
  #include <asm/bios_ebda.h>
diff --combined arch/x86/kernel/setup.c
index b9a68d8e06d8d1964cc54db97933eaa2b22ed130,f7671980b5251a8c44458bd18cfb0488210f8150..3511736fbc747e3b606daf8bee7ae896ea91980b
@@@ -25,6 -25,7 +25,7 @@@
  #include <xen/xen.h>
  
  #include <asm/apic.h>
+ #include <asm/numa.h>
  #include <asm/bios_ebda.h>
  #include <asm/bugs.h>
  #include <asm/cpu.h>
@@@ -870,6 -871,8 +871,6 @@@ void __init setup_arch(char **cmdline_p
  
  #ifdef CONFIG_BLK_DEV_RAM
        rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
 -      rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
 -      rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
  #endif
  #ifdef CONFIG_EFI
        if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
diff --combined arch/x86/mm/init_32.c
index 4cb958419fb0f2ce11055520449961556ebe8b00,d46a5cf6ccb0ff13b1e5de3522a603952dc719c0..7c055259de3a66d4685395942c06348f45b14241
@@@ -52,6 -52,7 +52,7 @@@
  #include <asm/cpu_entry_area.h>
  #include <asm/init.h>
  #include <asm/pgtable_areas.h>
+ #include <asm/numa.h>
  
  #include "mm_internal.h"
  
@@@ -678,6 -679,7 +679,6 @@@ void __init initmem_init(void
  #endif
  
        memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
 -      sparse_memory_present_with_active_regions(0);
  
  #ifdef CONFIG_FLATMEM
        max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
@@@ -717,6 -719,7 +718,6 @@@ void __init paging_init(void
         * NOTE: at this point the bootmem allocator is fully available.
         */
        olpc_dt_build_devicetree();
 -      sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
        zone_sizes_init();
  }
diff --combined arch/x86/xen/smp_pv.c
index 9ea598dcc132fc4c7ef02cd23f918892082d65b6,8b04c0ddee37429f0f71d7abf785e4021798d635..47c8f4b444c9a8ffc5cdb6fd0163784442d2709c
@@@ -29,6 -29,7 +29,7 @@@
  #include <asm/idtentry.h>
  #include <asm/desc.h>
  #include <asm/cpu.h>
+ #include <asm/io_apic.h>
  
  #include <xen/interface/xen.h>
  #include <xen/interface/vcpu.h>
@@@ -92,7 -93,9 +93,7 @@@ static void cpu_bringup(void
  asmlinkage __visible void cpu_bringup_and_idle(void)
  {
        cpu_bringup();
 -      boot_init_stack_canary();
        cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 -      prevent_tail_call_optimization();
  }
  
  void xen_smp_intr_free_pv(unsigned int cpu)
diff --combined block/blk-iocost.c
index 521c29b8ae297f9477c6a95a4b47695f41898fa7,8e940c27c27cfd7938f7cd8a26fed53c53e52393..413e0b5c8e6b08cb9e76af181b267bbed84e8ead
@@@ -406,7 -406,7 +406,7 @@@ struct ioc 
        enum ioc_running                running;
        atomic64_t                      vtime_rate;
  
-       seqcount_t                      period_seqcount;
+       seqcount_spinlock_t             period_seqcount;
        u32                             period_at;      /* wallclock starttime */
        u64                             period_at_vtime; /* vtime starttime */
  
@@@ -873,7 -873,6 +873,6 @@@ static void ioc_now(struct ioc *ioc, st
  
  static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
  {
-       lockdep_assert_held(&ioc->lock);
        WARN_ON_ONCE(ioc->running != IOC_RUNNING);
  
        write_seqcount_begin(&ioc->period_seqcount);
@@@ -1370,7 -1369,7 +1369,7 @@@ static void ioc_timer_fn(struct timer_l
         * should have woken up in the last period and expire idle iocgs.
         */
        list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
 -              if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt &&
 +              if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
                    !iocg_is_idle(iocg))
                        continue;
  
@@@ -2001,7 -2000,7 +2000,7 @@@ static int blk_iocost_init(struct reque
  
        ioc->running = IOC_IDLE;
        atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
-       seqcount_init(&ioc->period_seqcount);
+       seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
        ioc->period_at = ktime_to_us(ktime_get());
        atomic64_set(&ioc->cur_period, 0);
        atomic_set(&ioc->hweight_gen, 0);
@@@ -2045,7 -2044,8 +2044,7 @@@ static struct blkg_policy_data *ioc_pd_
        int levels = blkcg->css.cgroup->level + 1;
        struct ioc_gq *iocg;
  
 -      iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
 -                          gfp, q->node);
 +      iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
        if (!iocg)
                return NULL;
  
diff --combined drivers/dma-buf/dma-resv.c
index 07f5273207e7006d0c69c0cb0cb5d52b24998bb2,a7631352a486749ee7ecbeb6ed397e479c3402c2..434a3314fb0ed3913a8f816743ef01ac6bfae6ae
@@@ -36,7 -36,6 +36,7 @@@
  #include <linux/export.h>
  #include <linux/mm.h>
  #include <linux/sched/mm.h>
 +#include <linux/mmu_notifier.h>
  
  /**
   * DOC: Reservation Object Overview
  DEFINE_WD_CLASS(reservation_ww_class);
  EXPORT_SYMBOL(reservation_ww_class);
  
- struct lock_class_key reservation_seqcount_class;
- EXPORT_SYMBOL(reservation_seqcount_class);
- const char reservation_seqcount_string[] = "reservation_seqcount";
- EXPORT_SYMBOL(reservation_seqcount_string);
  /**
   * dma_resv_list_alloc - allocate fence list
   * @shared_max: number of fences we need space for
@@@ -117,13 -110,6 +111,13 @@@ static int __init dma_resv_lockdep(void
        if (ret == -EDEADLK)
                dma_resv_lock_slow(&obj, &ctx);
        fs_reclaim_acquire(GFP_KERNEL);
 +#ifdef CONFIG_MMU_NOTIFIER
 +      lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
 +      __dma_fence_might_wait();
 +      lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 +#else
 +      __dma_fence_might_wait();
 +#endif
        fs_reclaim_release(GFP_KERNEL);
        ww_mutex_unlock(&obj.lock);
        ww_acquire_fini(&ctx);
@@@ -143,9 -129,8 +137,8 @@@ subsys_initcall(dma_resv_lockdep)
  void dma_resv_init(struct dma_resv *obj)
  {
        ww_mutex_init(&obj->lock, &reservation_ww_class);
+       seqcount_ww_mutex_init(&obj->seq, &obj->lock);
  
-       __seqcount_init(&obj->seq, reservation_seqcount_string,
-                       &reservation_seqcount_class);
        RCU_INIT_POINTER(obj->fence, NULL);
        RCU_INIT_POINTER(obj->fence_excl, NULL);
  }
@@@ -275,7 -260,6 +268,6 @@@ void dma_resv_add_shared_fence(struct d
        fobj = dma_resv_get_list(obj);
        count = fobj->shared_count;
  
-       preempt_disable();
        write_seqcount_begin(&obj->seq);
  
        for (i = 0; i < count; ++i) {
@@@ -297,7 -281,6 +289,6 @@@ replace
        smp_store_mb(fobj->shared_count, count);
  
        write_seqcount_end(&obj->seq);
-       preempt_enable();
        dma_fence_put(old);
  }
  EXPORT_SYMBOL(dma_resv_add_shared_fence);
@@@ -324,14 -307,12 +315,12 @@@ void dma_resv_add_excl_fence(struct dma
        if (fence)
                dma_fence_get(fence);
  
-       preempt_disable();
        write_seqcount_begin(&obj->seq);
        /* write_seqcount_begin provides the necessary memory barrier */
        RCU_INIT_POINTER(obj->fence_excl, fence);
        if (old)
                old->shared_count = 0;
        write_seqcount_end(&obj->seq);
-       preempt_enable();
  
        /* inplace update, no shared fences */
        while (i--)
@@@ -409,13 -390,11 +398,11 @@@ retry
        src_list = dma_resv_get_list(dst);
        old = dma_resv_get_excl(dst);
  
-       preempt_disable();
        write_seqcount_begin(&dst->seq);
        /* write_seqcount_begin provides the necessary memory barrier */
        RCU_INIT_POINTER(dst->fence_excl, new);
        RCU_INIT_POINTER(dst->fence, dst_list);
        write_seqcount_end(&dst->seq);
-       preempt_enable();
  
        dma_resv_list_free(src_list);
        dma_fence_put(old);
diff --combined drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index e5a5ba869eb4a25bb0b9ced934222da73b866e08,ff4b583cb96acda8ab31d5c63679223ffec42fad..a58af513c952628e545f306790e5aa7f936f5537
@@@ -258,11 -258,9 +258,9 @@@ static int amdgpu_amdkfd_remove_evictio
        new->shared_count = k;
  
        /* Install the new fence list, seqcount provides the barriers */
-       preempt_disable();
        write_seqcount_begin(&resv->seq);
        RCU_INIT_POINTER(resv->fence, new);
        write_seqcount_end(&resv->seq);
-       preempt_enable();
  
        /* Drop the references to the removed fences or move them to ef_list */
        for (i = j, k = 0; i < old->shared_count; ++i) {
@@@ -395,7 -393,7 +393,7 @@@ static int vm_update_pds(struct amdgpu_
        if (ret)
                return ret;
  
 -      return amdgpu_sync_fence(sync, vm->last_update, false);
 +      return amdgpu_sync_fence(sync, vm->last_update);
  }
  
  static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
@@@ -785,7 -783,7 +783,7 @@@ static int unmap_bo_from_gpuvm(struct a
  
        amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
  
 -      amdgpu_sync_fence(sync, bo_va->last_pt_update, false);
 +      amdgpu_sync_fence(sync, bo_va->last_pt_update);
  
        return 0;
  }
@@@ -804,7 -802,7 +802,7 @@@ static int update_gpuvm_pte(struct amdg
                return ret;
        }
  
 -      return amdgpu_sync_fence(sync, bo_va->last_pt_update, false);
 +      return amdgpu_sync_fence(sync, bo_va->last_pt_update);
  }
  
  static int map_bo_to_gpuvm(struct amdgpu_device *adev,
@@@ -1354,7 -1352,7 +1352,7 @@@ int amdgpu_amdkfd_gpuvm_free_memory_of_
        }
  
        /* Free the BO*/
 -      drm_gem_object_put_unlocked(&mem->bo->tbo.base);
 +      drm_gem_object_put(&mem->bo->tbo.base);
        mutex_destroy(&mem->lock);
        kfree(mem);
  
@@@ -2102,7 -2100,7 +2100,7 @@@ int amdgpu_amdkfd_gpuvm_restore_process
                        pr_debug("Memory eviction: Validate BOs failed. Try again\n");
                        goto validate_map_fail;
                }
 -              ret = amdgpu_sync_fence(&sync_obj, bo->tbo.moving, false);
 +              ret = amdgpu_sync_fence(&sync_obj, bo->tbo.moving);
                if (ret) {
                        pr_debug("Memory eviction: Sync BO fence failed. Try again\n");
                        goto validate_map_fail;
diff --combined drivers/iommu/intel/irq_remapping.c
index aa096b333a9914300eb3c6fe04c38f1bd6b41261,3cf9d570607617f7f8ba06470af705a8bd3cb49e..23583b0e66a5e12dc18cbb79dbc1b8ffa6dc7552
@@@ -15,6 -15,7 +15,7 @@@
  #include <linux/irqdomain.h>
  #include <linux/crash_dump.h>
  #include <asm/io_apic.h>
+ #include <asm/apic.h>
  #include <asm/smp.h>
  #include <asm/cpu.h>
  #include <asm/irq_remapping.h>
@@@ -628,21 -629,13 +629,21 @@@ out_free_table
  
  static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
  {
 +      struct fwnode_handle *fn;
 +
        if (iommu && iommu->ir_table) {
                if (iommu->ir_msi_domain) {
 +                      fn = iommu->ir_msi_domain->fwnode;
 +
                        irq_domain_remove(iommu->ir_msi_domain);
 +                      irq_domain_free_fwnode(fn);
                        iommu->ir_msi_domain = NULL;
                }
                if (iommu->ir_domain) {
 +                      fn = iommu->ir_domain->fwnode;
 +
                        irq_domain_remove(iommu->ir_domain);
 +                      irq_domain_free_fwnode(fn);
                        iommu->ir_domain = NULL;
                }
                free_pages((unsigned long)iommu->ir_table->base,
diff --combined drivers/md/raid5.c
index fb8d1fb1408876675e1d0094bdae8acf1bdf7eb5,892aefe88fa7ca55ca52f3a3143d45bacfdd49e4..ef0fd4830803f00c1142ee7feebb55bf91840b26
@@@ -69,13 -69,13 +69,13 @@@ static struct workqueue_struct *raid5_w
  
  static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  {
 -      int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
 +      int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
        return &conf->stripe_hashtbl[hash];
  }
  
 -static inline int stripe_hash_locks_hash(sector_t sect)
 +static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
  {
 -      return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
 +      return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
  }
  
  static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
@@@ -627,7 -627,7 +627,7 @@@ raid5_get_active_stripe(struct r5conf *
                        int previous, int noblock, int noquiesce)
  {
        struct stripe_head *sh;
 -      int hash = stripe_hash_locks_hash(sector);
 +      int hash = stripe_hash_locks_hash(conf, sector);
        int inc_empty_inactive_list_flag;
  
        pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
@@@ -748,9 -748,9 +748,9 @@@ static void stripe_add_to_batch_list(st
        tmp_sec = sh->sector;
        if (!sector_div(tmp_sec, conf->chunk_sectors))
                return;
 -      head_sector = sh->sector - STRIPE_SECTORS;
 +      head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
  
 -      hash = stripe_hash_locks_hash(head_sector);
 +      hash = stripe_hash_locks_hash(conf, head_sector);
        spin_lock_irq(conf->hash_locks + hash);
        head = __find_stripe(conf, head_sector, conf->generation);
        if (head && !atomic_inc_not_zero(&head->count)) {
@@@ -873,7 -873,7 +873,7 @@@ static void dispatch_bio_list(struct bi
        struct bio *bio;
  
        while ((bio = bio_list_pop(tmp)))
 -              generic_make_request(bio);
 +              submit_bio_noacct(bio);
  }
  
  static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
@@@ -1057,7 -1057,7 +1057,7 @@@ again
                       test_bit(WriteErrorSeen, &rdev->flags)) {
                        sector_t first_bad;
                        int bad_sectors;
 -                      int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 +                      int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
                                              &first_bad, &bad_sectors);
                        if (!bad)
                                break;
                if (rdev) {
                        if (s->syncing || s->expanding || s->expanded
                            || s->replacing)
 -                              md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 +                              md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
  
                        set_bit(STRIPE_IO_STARTED, &sh->state);
  
                        else
                                sh->dev[i].vec.bv_page = sh->dev[i].page;
                        bi->bi_vcnt = 1;
 -                      bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 +                      bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
                        bi->bi_io_vec[0].bv_offset = 0;
 -                      bi->bi_iter.bi_size = STRIPE_SIZE;
 +                      bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
                        bi->bi_write_hint = sh->dev[i].write_hint;
                        if (!rrdev)
                                sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
                        if (should_defer && op_is_write(op))
                                bio_list_add(&pending_bios, bi);
                        else
 -                              generic_make_request(bi);
 +                              submit_bio_noacct(bi);
                }
                if (rrdev) {
                        if (s->syncing || s->expanding || s->expanded
                            || s->replacing)
 -                              md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
 +                              md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
  
                        set_bit(STRIPE_IO_STARTED, &sh->state);
  
                                WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
                        sh->dev[i].rvec.bv_page = sh->dev[i].page;
                        rbi->bi_vcnt = 1;
 -                      rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 +                      rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
                        rbi->bi_io_vec[0].bv_offset = 0;
 -                      rbi->bi_iter.bi_size = STRIPE_SIZE;
 +                      rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
                        rbi->bi_write_hint = sh->dev[i].write_hint;
                        sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
                        /*
                        if (should_defer && op_is_write(op))
                                bio_list_add(&pending_bios, rbi);
                        else
 -                              generic_make_request(rbi);
 +                              submit_bio_noacct(rbi);
                }
                if (!rdev && !rrdev) {
                        if (op_is_write(op))
@@@ -1235,7 -1235,6 +1235,7 @@@ async_copy_data(int frombio, struct bi
        int page_offset;
        struct async_submit_ctl submit;
        enum async_tx_flags flags = 0;
 +      struct r5conf *conf = sh->raid_conf;
  
        if (bio->bi_iter.bi_sector >= sector)
                page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
                        len -= b_offset;
                }
  
 -              if (len > 0 && page_offset + len > STRIPE_SIZE)
 -                      clen = STRIPE_SIZE - page_offset;
 +              if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
 +                      clen = RAID5_STRIPE_SIZE(conf) - page_offset;
                else
                        clen = len;
  
                        b_offset += bvl.bv_offset;
                        bio_page = bvl.bv_page;
                        if (frombio) {
 -                              if (sh->raid_conf->skip_copy &&
 +                              if (conf->skip_copy &&
                                    b_offset == 0 && page_offset == 0 &&
 -                                  clen == STRIPE_SIZE &&
 +                                  clen == RAID5_STRIPE_SIZE(conf) &&
                                    !no_skipcopy)
                                        *page = bio_page;
                                else
@@@ -1293,7 -1292,6 +1293,7 @@@ static void ops_complete_biofill(void *
  {
        struct stripe_head *sh = stripe_head_ref;
        int i;
 +      struct r5conf *conf = sh->raid_conf;
  
        pr_debug("%s: stripe %llu\n", __func__,
                (unsigned long long)sh->sector);
                        rbi = dev->read;
                        dev->read = NULL;
                        while (rbi && rbi->bi_iter.bi_sector <
 -                              dev->sector + STRIPE_SECTORS) {
 -                              rbi2 = r5_next_bio(rbi, dev->sector);
 +                              dev->sector + RAID5_STRIPE_SECTORS(conf)) {
 +                              rbi2 = r5_next_bio(conf, rbi, dev->sector);
                                bio_endio(rbi);
                                rbi = rbi2;
                        }
@@@ -1332,7 -1330,6 +1332,7 @@@ static void ops_run_biofill(struct stri
        struct dma_async_tx_descriptor *tx = NULL;
        struct async_submit_ctl submit;
        int i;
 +      struct r5conf *conf = sh->raid_conf;
  
        BUG_ON(sh->batch_head);
        pr_debug("%s: stripe %llu\n", __func__,
                        dev->toread = NULL;
                        spin_unlock_irq(&sh->stripe_lock);
                        while (rbi && rbi->bi_iter.bi_sector <
 -                              dev->sector + STRIPE_SECTORS) {
 +                              dev->sector + RAID5_STRIPE_SECTORS(conf)) {
                                tx = async_copy_data(0, rbi, &dev->page,
                                                     dev->sector, tx, sh, 0);
 -                              rbi = r5_next_bio(rbi, dev->sector);
 +                              rbi = r5_next_bio(conf, rbi, dev->sector);
                        }
                }
        }
@@@ -1432,11 -1429,9 +1432,11 @@@ ops_run_compute5(struct stripe_head *sh
        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
                          ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
        if (unlikely(count == 1))
 -              tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 +              tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
 +                              RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
        else
 -              tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 +              tx = async_xor(xor_dest, xor_srcs, 0, count,
 +                              RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
  
        return tx;
  }
@@@ -1527,8 -1522,7 +1527,8 @@@ ops_run_compute6_1(struct stripe_head *
                init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
                                  ops_complete_compute, sh,
                                  to_addr_conv(sh, percpu, 0));
 -              tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
 +              tx = async_gen_syndrome(blocks, 0, count+2,
 +                              RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
        } else {
                /* Compute any data- or p-drive using XOR */
                count = 0;
                init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
                                  NULL, ops_complete_compute, sh,
                                  to_addr_conv(sh, percpu, 0));
 -              tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
 +              tx = async_xor(dest, blocks, 0, count,
 +                              RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
        }
  
        return tx;
@@@ -1605,8 -1598,7 +1605,8 @@@ ops_run_compute6_2(struct stripe_head *
                                          ops_complete_compute, sh,
                                          to_addr_conv(sh, percpu, 0));
                        return async_gen_syndrome(blocks, 0, syndrome_disks+2,
 -                                                STRIPE_SIZE, &submit);
 +                                                RAID5_STRIPE_SIZE(sh->raid_conf),
 +                                                &submit);
                } else {
                        struct page *dest;
                        int data_target;
                                          ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
                                          NULL, NULL, NULL,
                                          to_addr_conv(sh, percpu, 0));
 -                      tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
 +                      tx = async_xor(dest, blocks, 0, count,
 +                                     RAID5_STRIPE_SIZE(sh->raid_conf),
                                       &submit);
  
                        count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
                                          ops_complete_compute, sh,
                                          to_addr_conv(sh, percpu, 0));
                        return async_gen_syndrome(blocks, 0, count+2,
 -                                                STRIPE_SIZE, &submit);
 +                                                RAID5_STRIPE_SIZE(sh->raid_conf),
 +                                                &submit);
                }
        } else {
                init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
                if (failb == syndrome_disks) {
                        /* We're missing D+P. */
                        return async_raid6_datap_recov(syndrome_disks+2,
 -                                                     STRIPE_SIZE, faila,
 -                                                     blocks, &submit);
 +                                              RAID5_STRIPE_SIZE(sh->raid_conf),
 +                                              faila,
 +                                              blocks, &submit);
                } else {
                        /* We're missing D+D. */
                        return async_raid6_2data_recov(syndrome_disks+2,
 -                                                     STRIPE_SIZE, faila, failb,
 -                                                     blocks, &submit);
 +                                              RAID5_STRIPE_SIZE(sh->raid_conf),
 +                                              faila, failb,
 +                                              blocks, &submit);
                }
        }
  }
@@@ -1703,8 -1691,7 +1703,8 @@@ ops_run_prexor5(struct stripe_head *sh
  
        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
                          ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
 -      tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 +      tx = async_xor(xor_dest, xor_srcs, 0, count,
 +                      RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
  
        return tx;
  }
@@@ -1724,8 -1711,7 +1724,8 @@@ ops_run_prexor6(struct stripe_head *sh
  
        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
                          ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
 -      tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
 +      tx = async_gen_syndrome(blocks, 0, count+2,
 +                      RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
  
        return tx;
  }
@@@ -1766,7 -1752,7 +1766,7 @@@ again
                        WARN_ON(dev->page != dev->orig_page);
  
                        while (wbi && wbi->bi_iter.bi_sector <
 -                              dev->sector + STRIPE_SECTORS) {
 +                              dev->sector + RAID5_STRIPE_SECTORS(conf)) {
                                if (wbi->bi_opf & REQ_FUA)
                                        set_bit(R5_WantFUA, &dev->flags);
                                if (wbi->bi_opf & REQ_SYNC)
                                                clear_bit(R5_OVERWRITE, &dev->flags);
                                        }
                                }
 -                              wbi = r5_next_bio(wbi, dev->sector);
 +                              wbi = r5_next_bio(conf, wbi, dev->sector);
                        }
  
                        if (head_sh->batch_head) {
@@@ -1924,11 -1910,9 +1924,11 @@@ again
        }
  
        if (unlikely(count == 1))
 -              tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 +              tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
 +                              RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
        else
 -              tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 +              tx = async_xor(xor_dest, xor_srcs, 0, count,
 +                              RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
        if (!last_stripe) {
                j++;
                sh = list_first_entry(&sh->batch_list, struct stripe_head,
@@@ -1988,8 -1972,7 +1988,8 @@@ again
        } else
                init_async_submit(&submit, 0, tx, NULL, NULL,
                                  to_addr_conv(sh, percpu, j));
 -      tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
 +      tx = async_gen_syndrome(blocks, 0, count+2,
 +                      RAID5_STRIPE_SIZE(sh->raid_conf),  &submit);
        if (!last_stripe) {
                j++;
                sh = list_first_entry(&sh->batch_list, struct stripe_head,
@@@ -2037,8 -2020,7 +2037,8 @@@ static void ops_run_check_p(struct stri
  
        init_async_submit(&submit, 0, NULL, NULL, NULL,
                          to_addr_conv(sh, percpu, 0));
 -      tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 +      tx = async_xor_val(xor_dest, xor_srcs, 0, count,
 +                         RAID5_STRIPE_SIZE(sh->raid_conf),
                           &sh->ops.zero_sum_result, &submit);
  
        atomic_inc(&sh->count);
@@@ -2063,8 -2045,7 +2063,8 @@@ static void ops_run_check_pq(struct str
        atomic_inc(&sh->count);
        init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
                          sh, to_addr_conv(sh, percpu, 0));
 -      async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
 +      async_syndrome_val(srcs, 0, count+2,
 +                         RAID5_STRIPE_SIZE(sh->raid_conf),
                           &sh->ops.zero_sum_result, percpu->spare_page, &submit);
  }
  
@@@ -2236,9 -2217,9 +2236,9 @@@ static int grow_stripes(struct r5conf *
  /**
   * scribble_alloc - allocate percpu scribble buffer for required size
   *                of the scribble region
 - * @percpu - from for_each_present_cpu() of the caller
 - * @num - total number of disks in the array
 - * @cnt - scribble objs count for required size of the scribble region
 + * @percpu: from for_each_present_cpu() of the caller
 + * @num: total number of disks in the array
 + * @cnt: scribble objs count for required size of the scribble region
   *
   * The scribble buffer size must be enough to contain:
   * 1/ a struct page pointer for each device in the array +2
@@@ -2294,7 -2275,7 +2294,7 @@@ static int resize_chunks(struct r5conf 
  
                percpu = per_cpu_ptr(conf->percpu, cpu);
                err = scribble_alloc(percpu, new_disks,
 -                                   new_sectors / STRIPE_SECTORS);
 +                                   new_sectors / RAID5_STRIPE_SECTORS(conf));
                if (err)
                        break;
        }
@@@ -2528,10 -2509,10 +2528,10 @@@ static void raid5_end_read_request(stru
                         */
                        pr_info_ratelimited(
                                "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
 -                              mdname(conf->mddev), STRIPE_SECTORS,
 +                              mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
                                (unsigned long long)s,
                                bdevname(rdev->bdev, b));
 -                      atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 +                      atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
                        clear_bit(R5_ReadError, &sh->dev[i].flags);
                        clear_bit(R5_ReWrite, &sh->dev[i].flags);
                } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                        if (!(set_bad
                              && test_bit(In_sync, &rdev->flags)
                              && rdev_set_badblocks(
 -                                    rdev, sh->sector, STRIPE_SECTORS, 0)))
 +                                    rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
                                md_error(conf->mddev, rdev);
                }
        }
@@@ -2620,7 -2601,7 +2620,7 @@@ static void raid5_end_write_request(str
        struct stripe_head *sh = bi->bi_private;
        struct r5conf *conf = sh->raid_conf;
        int disks = sh->disks, i;
 -      struct md_rdev *uninitialized_var(rdev);
 +      struct md_rdev *rdev;
        sector_t first_bad;
        int bad_sectors;
        int replacement = 0;
                if (bi->bi_status)
                        md_error(conf->mddev, rdev);
                else if (is_badblock(rdev, sh->sector,
 -                                   STRIPE_SECTORS,
 +                                   RAID5_STRIPE_SECTORS(conf),
                                     &first_bad, &bad_sectors))
                        set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
        } else {
                                set_bit(MD_RECOVERY_NEEDED,
                                        &rdev->mddev->recovery);
                } else if (is_badblock(rdev, sh->sector,
 -                                     STRIPE_SECTORS,
 +                                     RAID5_STRIPE_SECTORS(conf),
                                       &first_bad, &bad_sectors)) {
                        set_bit(R5_MadeGood, &sh->dev[i].flags);
                        if (test_bit(R5_ReadError, &sh->dev[i].flags))
@@@ -3302,13 -3283,13 +3302,13 @@@ static int add_stripe_bio(struct stripe
                /* check if page is covered */
                sector_t sector = sh->dev[dd_idx].sector;
                for (bi=sh->dev[dd_idx].towrite;
 -                   sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
 +                   sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
                             bi && bi->bi_iter.bi_sector <= sector;
 -                   bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
 +                   bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
                        if (bio_end_sector(bi) >= sector)
                                sector = bio_end_sector(bi);
                }
 -              if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 +              if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
                        if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
                                sh->overwrite_disks++;
        }
                set_bit(STRIPE_BITMAP_PENDING, &sh->state);
                spin_unlock_irq(&sh->stripe_lock);
                md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
 -                                   STRIPE_SECTORS, 0);
 +                                   RAID5_STRIPE_SECTORS(conf), 0);
                spin_lock_irq(&sh->stripe_lock);
                clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
                if (!sh->batch_head) {
@@@ -3395,7 -3376,7 +3395,7 @@@ handle_failed_stripe(struct r5conf *con
                                if (!rdev_set_badblocks(
                                            rdev,
                                            sh->sector,
 -                                          STRIPE_SECTORS, 0))
 +                                          RAID5_STRIPE_SECTORS(conf), 0))
                                        md_error(conf->mddev, rdev);
                                rdev_dec_pending(rdev, conf->mddev);
                        }
                        wake_up(&conf->wait_for_overlap);
  
                while (bi && bi->bi_iter.bi_sector <
 -                      sh->dev[i].sector + STRIPE_SECTORS) {
 -                      struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 +                      sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
 +                      struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
  
                        md_write_end(conf->mddev);
                        bio_io_error(bi);
                }
                if (bitmap_end)
                        md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 -                                         STRIPE_SECTORS, 0, 0);
 +                                         RAID5_STRIPE_SECTORS(conf), 0, 0);
                bitmap_end = 0;
                /* and fail all 'written' */
                bi = sh->dev[i].written;
  
                if (bi) bitmap_end = 1;
                while (bi && bi->bi_iter.bi_sector <
 -                     sh->dev[i].sector + STRIPE_SECTORS) {
 -                      struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 +                     sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
 +                      struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
  
                        md_write_end(conf->mddev);
                        bio_io_error(bi);
                        if (bi)
                                s->to_read--;
                        while (bi && bi->bi_iter.bi_sector <
 -                             sh->dev[i].sector + STRIPE_SECTORS) {
 +                             sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
                                struct bio *nextbi =
 -                                      r5_next_bio(bi, sh->dev[i].sector);
 +                                      r5_next_bio(conf, bi, sh->dev[i].sector);
  
                                bio_io_error(bi);
                                bi = nextbi;
                }
                if (bitmap_end)
                        md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 -                                         STRIPE_SECTORS, 0, 0);
 +                                         RAID5_STRIPE_SECTORS(conf), 0, 0);
                /* If we were in the middle of a write the parity block might
                 * still be locked - so just clear all R5_LOCKED flags
                 */
@@@ -3515,14 -3496,14 +3515,14 @@@ handle_failed_sync(struct r5conf *conf
                            && !test_bit(Faulty, &rdev->flags)
                            && !test_bit(In_sync, &rdev->flags)
                            && !rdev_set_badblocks(rdev, sh->sector,
 -                                                 STRIPE_SECTORS, 0))
 +                                                 RAID5_STRIPE_SECTORS(conf), 0))
                                abort = 1;
                        rdev = rcu_dereference(conf->disks[i].replacement);
                        if (rdev
                            && !test_bit(Faulty, &rdev->flags)
                            && !test_bit(In_sync, &rdev->flags)
                            && !rdev_set_badblocks(rdev, sh->sector,
 -                                                 STRIPE_SECTORS, 0))
 +                                                 RAID5_STRIPE_SECTORS(conf), 0))
                                abort = 1;
                }
                rcu_read_unlock();
                        conf->recovery_disabled =
                                conf->mddev->recovery_disabled;
        }
 -      md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
 +      md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
  }
  
  static int want_replace(struct stripe_head *sh, int disk_idx)
@@@ -3557,7 -3538,6 +3557,7 @@@ static int need_this_block(struct strip
        struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
                                  &sh->dev[s->failed_num[1]] };
        int i;
 +      bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
  
  
        if (test_bit(R5_LOCKED, &dev->flags) ||
                         * devices must be read.
                         */
                        return 1;
 +
 +              if (s->failed >= 2 &&
 +                  (fdev[i]->towrite ||
 +                   s->failed_num[i] == sh->pd_idx ||
 +                   s->failed_num[i] == sh->qd_idx) &&
 +                  !test_bit(R5_UPTODATE, &fdev[i]->flags))
 +                      /* In max degraded raid6, If the failed disk is P, Q,
 +                       * or we want to read the failed disk, we need to do
 +                       * reconstruct-write.
 +                       */
 +                      force_rcw = true;
        }
  
 -      /* If we are forced to do a reconstruct-write, either because
 -       * the current RAID6 implementation only supports that, or
 -       * because parity cannot be trusted and we are currently
 -       * recovering it, there is extra need to be careful.
 +      /* If we are forced to do a reconstruct-write, because parity
 +       * cannot be trusted and we are currently recovering it, there
 +       * is extra need to be careful.
         * If one of the devices that we would need to read, because
         * it is not being overwritten (and maybe not written at all)
         * is missing/faulty, then we need to read everything we can.
         */
 -      if (sh->raid_conf->level != 6 &&
 +      if (!force_rcw &&
            sh->sector < sh->raid_conf->mddev->recovery_cp)
                /* reconstruct-write isn't being forced */
                return 0;
@@@ -3740,7 -3710,7 +3740,7 @@@ static int fetch_block(struct stripe_he
        return 0;
  }
  
 -/**
 +/*
   * handle_stripe_fill - read or compute data to satisfy pending requests.
   */
  static void handle_stripe_fill(struct stripe_head *sh,
@@@ -3815,14 -3785,14 +3815,14 @@@ returnbi
                                wbi = dev->written;
                                dev->written = NULL;
                                while (wbi && wbi->bi_iter.bi_sector <
 -                                      dev->sector + STRIPE_SECTORS) {
 -                                      wbi2 = r5_next_bio(wbi, dev->sector);
 +                                      dev->sector + RAID5_STRIPE_SECTORS(conf)) {
 +                                      wbi2 = r5_next_bio(conf, wbi, dev->sector);
                                        md_write_end(conf->mddev);
                                        bio_endio(wbi);
                                        wbi = wbi2;
                                }
                                md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 -                                                 STRIPE_SECTORS,
 +                                                 RAID5_STRIPE_SECTORS(conf),
                                                   !test_bit(STRIPE_DEGRADED, &sh->state),
                                                   0);
                                if (head_sh->batch_head) {
@@@ -4006,8 -3976,10 +4006,8 @@@ static int handle_stripe_dirtying(struc
                                        set_bit(R5_LOCKED, &dev->flags);
                                        set_bit(R5_Wantread, &dev->flags);
                                        s->locked++;
 -                              } else {
 +                              } else
                                        set_bit(STRIPE_DELAYED, &sh->state);
 -                                      set_bit(STRIPE_HANDLE, &sh->state);
 -                              }
                        }
                }
        }
                                        set_bit(R5_Wantread, &dev->flags);
                                        s->locked++;
                                        qread++;
 -                              } else {
 +                              } else
                                        set_bit(STRIPE_DELAYED, &sh->state);
 -                                      set_bit(STRIPE_HANDLE, &sh->state);
 -                              }
                        }
                }
                if (rcw && conf->mddev->queue)
@@@ -4125,7 -4099,7 +4125,7 @@@ static void handle_parity_checks5(struc
                         */
                        set_bit(STRIPE_INSYNC, &sh->state);
                else {
 -                      atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
 +                      atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
                        if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
                                /* don't try to repair!! */
                                set_bit(STRIPE_INSYNC, &sh->state);
                                                    "%llu-%llu\n", mdname(conf->mddev),
                                                    (unsigned long long) sh->sector,
                                                    (unsigned long long) sh->sector +
 -                                                  STRIPE_SECTORS);
 +                                                  RAID5_STRIPE_SECTORS(conf));
                        } else {
                                sh->check_state = check_state_compute_run;
                                set_bit(STRIPE_COMPUTE_RUN, &sh->state);
@@@ -4290,7 -4264,7 +4290,7 @@@ static void handle_parity_checks6(struc
                                 */
                        }
                } else {
 -                      atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
 +                      atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
                        if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
                                /* don't try to repair!! */
                                set_bit(STRIPE_INSYNC, &sh->state);
                                                    "%llu-%llu\n", mdname(conf->mddev),
                                                    (unsigned long long) sh->sector,
                                                    (unsigned long long) sh->sector +
 -                                                  STRIPE_SECTORS);
 +                                                  RAID5_STRIPE_SECTORS(conf));
                        } else {
                                int *target = &sh->ops.target;
  
@@@ -4369,7 -4343,7 +4369,7 @@@ static void handle_stripe_expansion(str
                        /* place all the copies on one channel */
                        init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
                        tx = async_memcpy(sh2->dev[dd_idx].page,
 -                                        sh->dev[i].page, 0, 0, STRIPE_SIZE,
 +                                        sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf),
                                          &submit);
  
                        set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
@@@ -4468,8 -4442,8 +4468,8 @@@ static void analyse_stripe(struct strip
                 */
                rdev = rcu_dereference(conf->disks[i].replacement);
                if (rdev && !test_bit(Faulty, &rdev->flags) &&
 -                  rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
 -                  !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 +                  rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
 +                  !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
                                 &first_bad, &bad_sectors))
                        set_bit(R5_ReadRepl, &dev->flags);
                else {
                if (rdev && test_bit(Faulty, &rdev->flags))
                        rdev = NULL;
                if (rdev) {
 -                      is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 +                      is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
                                             &first_bad, &bad_sectors);
                        if (s->blocked_rdev == NULL
                            && (test_bit(Blocked, &rdev->flags)
                        }
                } else if (test_bit(In_sync, &rdev->flags))
                        set_bit(R5_Insync, &dev->flags);
 -              else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
 +              else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
                        /* in sync if before recovery_offset */
                        set_bit(R5_Insync, &dev->flags);
                else if (test_bit(R5_UPTODATE, &dev->flags) &&
        rcu_read_unlock();
  }
  
 +/*
 + * Return '1' if this is a member of batch, or '0' if it is a lone stripe or
 + * a head which can now be handled.
 + */
  static int clear_batch_ready(struct stripe_head *sh)
  {
 -      /* Return '1' if this is a member of batch, or
 -       * '0' if it is a lone stripe or a head which can now be
 -       * handled.
 -       */
        struct stripe_head *tmp;
        if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
                return (sh->batch_head && sh->batch_head != sh);
@@@ -4708,16 -4682,6 +4708,16 @@@ static void handle_stripe(struct stripe
        struct r5dev *pdev, *qdev;
  
        clear_bit(STRIPE_HANDLE, &sh->state);
 +
 +      /*
 +       * handle_stripe should not continue to handle a batched stripe; only
 +       * the head of a batch list or a lone stripe can continue. Otherwise
 +       * break_stripe_batch_list() could warn that STRIPE_ACTIVE is set for
 +       * the batched stripe.
 +       */
 +      if (clear_batch_ready(sh))
 +              return;
 +
        if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
                /* already being handled, ensure it gets handled
                 * again when current action finishes */
                return;
        }
  
 -      if (clear_batch_ready(sh) ) {
 -              clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 -              return;
 -      }
 -
        if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
                break_stripe_batch_list(sh, 0);
  
         * or to load a block that is being partially written.
         */
        if (s.to_read || s.non_overwrite
 -          || (conf->level == 6 && s.to_write && s.failed)
 +          || (s.to_write && s.failed)
            || (s.syncing && (s.uptodate + s.compute < disks))
            || s.replacing
            || s.expanding)
        if ((s.syncing || s.replacing) && s.locked == 0 &&
            !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
            test_bit(STRIPE_INSYNC, &sh->state)) {
 -              md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 +              md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
                clear_bit(STRIPE_SYNCING, &sh->state);
                if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
                        wake_up(&conf->wait_for_overlap);
                                if (!test_bit(R5_ReWrite, &dev->flags)) {
                                        set_bit(R5_Wantwrite, &dev->flags);
                                        set_bit(R5_ReWrite, &dev->flags);
 -                                      set_bit(R5_LOCKED, &dev->flags);
 -                                      s.locked++;
 -                              } else {
 +                              } else
                                        /* let's read it back */
                                        set_bit(R5_Wantread, &dev->flags);
 -                                      set_bit(R5_LOCKED, &dev->flags);
 -                                      s.locked++;
 -                              }
 +                              set_bit(R5_LOCKED, &dev->flags);
 +                              s.locked++;
                        }
                }
  
                clear_bit(STRIPE_EXPAND_READY, &sh->state);
                atomic_dec(&conf->reshape_stripes);
                wake_up(&conf->wait_for_overlap);
 -              md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
 +              md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
        }
  
        if (s.expanding && s.locked == 0 &&
@@@ -5053,14 -5025,14 +5053,14 @@@ finish
                                /* We own a safe reference to the rdev */
                                rdev = conf->disks[i].rdev;
                                if (!rdev_set_badblocks(rdev, sh->sector,
 -                                                      STRIPE_SECTORS, 0))
 +                                                      RAID5_STRIPE_SECTORS(conf), 0))
                                        md_error(conf->mddev, rdev);
                                rdev_dec_pending(rdev, conf->mddev);
                        }
                        if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
                                rdev = conf->disks[i].rdev;
                                rdev_clear_badblocks(rdev, sh->sector,
 -                                                   STRIPE_SECTORS, 0);
 +                                                   RAID5_STRIPE_SECTORS(conf), 0);
                                rdev_dec_pending(rdev, conf->mddev);
                        }
                        if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
                                        /* rdev have been moved down */
                                        rdev = conf->disks[i].rdev;
                                rdev_clear_badblocks(rdev, sh->sector,
 -                                                   STRIPE_SECTORS, 0);
 +                                                   RAID5_STRIPE_SECTORS(conf), 0);
                                rdev_dec_pending(rdev, conf->mddev);
                        }
                }
@@@ -5127,6 -5099,28 +5127,6 @@@ static void activate_bit_delay(struct r
        }
  }
  
 -static int raid5_congested(struct mddev *mddev, int bits)
 -{
 -      struct r5conf *conf = mddev->private;
 -
 -      /* No difference between reads and writes.  Just check
 -       * how busy the stripe_cache is
 -       */
 -
 -      if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 -              return 1;
 -
 -      /* Also checks whether there is pressure on r5cache log space */
 -      if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
 -              return 1;
 -      if (conf->quiesce)
 -              return 1;
 -      if (atomic_read(&conf->empty_inactive_list_nr))
 -              return 1;
 -
 -      return 0;
 -}
 -
  static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
  {
        struct r5conf *conf = mddev->private;
@@@ -5295,7 -5289,7 +5295,7 @@@ static int raid5_read_one_chunk(struct 
                        trace_block_bio_remap(align_bi->bi_disk->queue,
                                              align_bi, disk_devt(mddev->gendisk),
                                              raid_bio->bi_iter.bi_sector);
 -              generic_make_request(align_bi);
 +              submit_bio_noacct(align_bi);
                return 1;
        } else {
                rcu_read_unlock();
@@@ -5315,7 -5309,7 +5315,7 @@@ static struct bio *chunk_aligned_read(s
                struct r5conf *conf = mddev->private;
                split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split);
                bio_chain(split, raid_bio);
 -              generic_make_request(raid_bio);
 +              submit_bio_noacct(raid_bio);
                raid_bio = split;
        }
  
@@@ -5511,7 -5505,7 +5511,7 @@@ static void make_discard_request(struc
                /* Skip discard while reshape is happening */
                return;
  
 -      logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 +      logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
        last_sector = bio_end_sector(bi);
  
        bi->bi_next = NULL;
        last_sector *= conf->chunk_sectors;
  
        for (; logical_sector < last_sector;
 -           logical_sector += STRIPE_SECTORS) {
 +           logical_sector += RAID5_STRIPE_SECTORS(conf)) {
                DEFINE_WAIT(w);
                int d;
        again:
                             d++)
                                md_bitmap_startwrite(mddev->bitmap,
                                                     sh->sector,
 -                                                   STRIPE_SECTORS,
 +                                                   RAID5_STRIPE_SECTORS(conf),
                                                     0);
                        sh->bm_seq = conf->seq_flush + 1;
                        set_bit(STRIPE_BIT_DELAY, &sh->state);
@@@ -5636,12 -5630,12 +5636,12 @@@ static bool raid5_make_request(struct m
                return true;
        }
  
 -      logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 +      logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
        last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
  
        prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 -      for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 +      for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
                int previous;
                int seq;
  
                                do_flush = false;
                        }
  
 -                      if (!sh->batch_head || sh == sh->batch_head)
 -                              set_bit(STRIPE_HANDLE, &sh->state);
 +                      set_bit(STRIPE_HANDLE, &sh->state);
                        clear_bit(STRIPE_DELAYED, &sh->state);
                        if ((!sh->batch_head || sh == sh->batch_head) &&
                            (bi->bi_opf & REQ_SYNC) &&
@@@ -5804,7 -5799,7 +5804,7 @@@ static sector_t reshape_request(struct 
                sector_div(sector_nr, new_data_disks);
                if (sector_nr) {
                        mddev->curr_resync_completed = sector_nr;
 -                      sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 +                      sysfs_notify_dirent_safe(mddev->sysfs_completed);
                        *skipped = 1;
                        retn = sector_nr;
                        goto finish;
                conf->reshape_safe = mddev->reshape_position;
                spin_unlock_irq(&conf->device_lock);
                wake_up(&conf->wait_for_overlap);
 -              sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 +              sysfs_notify_dirent_safe(mddev->sysfs_completed);
        }
  
        INIT_LIST_HEAD(&stripes);
 -      for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 +      for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
                int j;
                int skipped_disk = 0;
                sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
                                skipped_disk = 1;
                                continue;
                        }
 -                      memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
 +                      memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
                        set_bit(R5_Expanded, &sh->dev[j].flags);
                        set_bit(R5_UPTODATE, &sh->dev[j].flags);
                }
                set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
                set_bit(STRIPE_HANDLE, &sh->state);
                raid5_release_stripe(sh);
 -              first_sector += STRIPE_SECTORS;
 +              first_sector += RAID5_STRIPE_SECTORS(conf);
        }
        /* Now that the sources are clearly marked, we can release
         * the destination stripes
@@@ -6025,7 -6020,7 +6025,7 @@@ finish
                conf->reshape_safe = mddev->reshape_position;
                spin_unlock_irq(&conf->device_lock);
                wake_up(&conf->wait_for_overlap);
 -              sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 +              sysfs_notify_dirent_safe(mddev->sysfs_completed);
        }
  ret:
        return retn;
@@@ -6084,12 -6079,11 +6084,12 @@@ static inline sector_t raid5_sync_reque
        if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
            !conf->fullsync &&
            !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
 -          sync_blocks >= STRIPE_SECTORS) {
 +          sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
                /* we can skip this block, and probably more */
 -              sync_blocks /= STRIPE_SECTORS;
 +              do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
                *skipped = 1;
 -              return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
 +              /* keep things rounded to whole stripes */
 +              return sync_blocks * RAID5_STRIPE_SECTORS(conf);
        }
  
        md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
  
        raid5_release_stripe(sh);
  
 -      return STRIPE_SECTORS;
 +      return RAID5_STRIPE_SECTORS(conf);
  }
  
  static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
        int handled = 0;
  
        logical_sector = raid_bio->bi_iter.bi_sector &
 -              ~((sector_t)STRIPE_SECTORS-1);
 +              ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
        sector = raid5_compute_sector(conf, logical_sector,
                                      0, &dd_idx, NULL);
        last_sector = bio_end_sector(raid_bio);
  
        for (; logical_sector < last_sector;
 -           logical_sector += STRIPE_SECTORS,
 -                   sector += STRIPE_SECTORS,
 +           logical_sector += RAID5_STRIPE_SECTORS(conf),
 +                   sector += RAID5_STRIPE_SECTORS(conf),
                     scnt++) {
  
                if (scnt < offset)
@@@ -6485,77 -6479,6 +6485,77 @@@ raid5_rmw_level = __ATTR(rmw_level, S_I
                         raid5_show_rmw_level,
                         raid5_store_rmw_level);
  
 +static ssize_t
 +raid5_show_stripe_size(struct mddev  *mddev, char *page)
 +{
 +      struct r5conf *conf;
 +      int ret = 0;
 +
 +      spin_lock(&mddev->lock);
 +      conf = mddev->private;
 +      if (conf)
 +              ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
 +      spin_unlock(&mddev->lock);
 +      return ret;
 +}
 +
 +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
 +static ssize_t
 +raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
 +{
 +      struct r5conf *conf;
 +      unsigned long new;
 +      int err;
 +
 +      if (len >= PAGE_SIZE)
 +              return -EINVAL;
 +      if (kstrtoul(page, 10, &new))
 +              return -EINVAL;
 +
 +      /*
 +       * The value must not be bigger than PAGE_SIZE and must be a
 +       * multiple of DEFAULT_STRIPE_SIZE.
 +       */
 +      if (new % DEFAULT_STRIPE_SIZE != 0 || new > PAGE_SIZE || new == 0)
 +              return -EINVAL;
 +
 +      err = mddev_lock(mddev);
 +      if (err)
 +              return err;
 +
 +      conf = mddev->private;
 +      if (!conf) {
 +              err = -ENODEV;
 +              goto out_unlock;
 +      }
 +
 +      if (new == conf->stripe_size)
 +              goto out_unlock;
 +
 +      pr_debug("md/raid: change stripe_size from %lu to %lu\n",
 +                      conf->stripe_size, new);
 +
 +      mddev_suspend(mddev);
 +      conf->stripe_size = new;
 +      conf->stripe_shift = ilog2(new) - 9;
 +      conf->stripe_sectors = new >> 9;
 +      mddev_resume(mddev);
 +
 +out_unlock:
 +      mddev_unlock(mddev);
 +      return err ?: len;
 +}
 +
 +static struct md_sysfs_entry
 +raid5_stripe_size = __ATTR(stripe_size, 0644,
 +                       raid5_show_stripe_size,
 +                       raid5_store_stripe_size);
 +#else
 +static struct md_sysfs_entry
 +raid5_stripe_size = __ATTR(stripe_size, 0444,
 +                       raid5_show_stripe_size,
 +                       NULL);
 +#endif
  
  static ssize_t
  raid5_show_preread_threshold(struct mddev *mddev, char *page)
@@@ -6744,7 -6667,6 +6744,7 @@@ static struct attribute *raid5_attrs[] 
        &raid5_group_thread_cnt.attr,
        &raid5_skip_copy.attr,
        &raid5_rmw_level.attr,
 +      &raid5_stripe_size.attr,
        &r5c_journal_mode.attr,
        &ppl_write_hint.attr,
        NULL,
@@@ -6844,7 -6766,7 +6844,7 @@@ static int alloc_scratch_buffer(struct 
                               conf->previous_raid_disks),
                           max(conf->chunk_sectors,
                               conf->prev_chunk_sectors)
 -                         / STRIPE_SECTORS)) {
 +                         / RAID5_STRIPE_SECTORS(conf))) {
                free_scratch_buffer(conf, percpu);
                return -ENOMEM;
        }
@@@ -6996,12 -6918,6 +6996,12 @@@ static struct r5conf *setup_conf(struc
        conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
        if (conf == NULL)
                goto abort;
 +
 +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
 +      conf->stripe_size = DEFAULT_STRIPE_SIZE;
 +      conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
 +      conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
 +#endif
        INIT_LIST_HEAD(&conf->free_list);
        INIT_LIST_HEAD(&conf->pending_list);
        conf->pending_data = kcalloc(PENDING_IO_MAX,
        } else
                goto abort;
        spin_lock_init(&conf->device_lock);
-       seqcount_init(&conf->gen_lock);
+       seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
        mutex_init(&conf->cache_size_mutex);
        init_waitqueue_head(&conf->wait_for_quiescent);
        init_waitqueue_head(&conf->wait_for_stripe);
        conf->min_nr_stripes = NR_STRIPES;
        if (mddev->reshape_position != MaxSector) {
                int stripes = max_t(int,
 -                      ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
 -                      ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
 +                      ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
 +                      ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
                conf->min_nr_stripes = max(NR_STRIPES, stripes);
                if (conf->min_nr_stripes != NR_STRIPES)
                        pr_info("md/raid:%s: force stripe size %d for reshape\n",
@@@ -7885,14 -7801,14 +7885,14 @@@ static int check_stripe_cache(struct md
         * stripe_heads first.
         */
        struct r5conf *conf = mddev->private;
 -      if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
 +      if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
            > conf->min_nr_stripes ||
 -          ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
 +          ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
            > conf->min_nr_stripes) {
                pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
                        mdname(mddev),
                        ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
 -                       / STRIPE_SIZE)*4);
 +                       / RAID5_STRIPE_SIZE(conf))*4);
                return 0;
        }
        return 1;
@@@ -8028,8 -7944,8 +8028,8 @@@ static int raid5_start_reshape(struct m
                                        else
                                                rdev->recovery_offset = 0;
  
 -                                      if (sysfs_link_rdev(mddev, rdev))
 -                                              /* Failure here is OK */;
 +                                      /* Failure here is OK */
 +                                      sysfs_link_rdev(mddev, rdev);
                                }
                        } else if (rdev->raid_disk >= conf->previous_raid_disks
                                   && !test_bit(Faulty, &rdev->flags)) {
@@@ -8224,7 -8140,7 +8224,7 @@@ static void *raid5_takeover_raid1(struc
        while (chunksect && (mddev->array_sectors & (chunksect-1)))
                chunksect >>= 1;
  
 -      if ((chunksect<<9) < STRIPE_SIZE)
 +      if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
                /* array size does not allow a suitable chunk size */
                return ERR_PTR(-EINVAL);
  
@@@ -8511,6 -8427,7 +8511,6 @@@ static struct md_personality raid6_pers
        .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
        .takeover       = raid6_takeover,
 -      .congested      = raid5_congested,
        .change_consistency_policy = raid5_change_consistency_policy,
  };
  static struct md_personality raid5_personality =
        .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
        .takeover       = raid5_takeover,
 -      .congested      = raid5_congested,
        .change_consistency_policy = raid5_change_consistency_policy,
  };
  
@@@ -8560,6 -8478,7 +8560,6 @@@ static struct md_personality raid4_pers
        .finish_reshape = raid5_finish_reshape,
        .quiesce        = raid5_quiesce,
        .takeover       = raid4_takeover,
 -      .congested      = raid5_congested,
        .change_consistency_policy = raid5_change_consistency_policy,
  };
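
The stripe_size store path shown above derives the other two geometry fields from the new byte size: stripe_shift becomes ilog2(size) - 9 and stripe_sectors becomes size >> 9. A small stand-alone user-space sketch of that derivation, with hypothetical example values (a 64K page build and a 16K stripe), open-coding ilog2() since the kernel helper is not available outside the tree:

/* Stand-alone model of the stripe_size -> shift/sectors derivation. */
#include <stdio.h>

static unsigned int ilog2_ul(unsigned long v)	/* user-space stand-in for ilog2() */
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	/* Hypothetical values: a 64K page architecture, stripe_size of 16K. */
	unsigned long page_size = 65536;
	unsigned long stripe_size = 16384;

	if (stripe_size == 0 || stripe_size > page_size ||
	    stripe_size % 4096 /* DEFAULT_STRIPE_SIZE */)
		return 1;

	printf("stripe_shift   = %u\n", ilog2_ul(stripe_size) - 9);	/* 5  */
	printf("stripe_sectors = %lu\n", stripe_size >> 9);		/* 32 */
	return 0;
}

On 4K-page builds the store side is compiled out entirely and the macros in raid5.h below resolve to the old compile-time constants.
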
  
diff --combined drivers/md/raid5.h
index 7fb3b26a181a9f9442fcbfdfa79bd050cc04192e,a2c9e9e9f5ac8346c07a7d72ddf3960fa82e0238..16fc29472f5c0a6ef7b25005cee2dd566ed258d9
@@@ -472,20 -472,32 +472,20 @@@ struct disk_info 
   */
  
  #define NR_STRIPES            256
 +#define DEFAULT_STRIPE_SIZE   4096
 +
 +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
  #define STRIPE_SIZE           PAGE_SIZE
  #define STRIPE_SHIFT          (PAGE_SHIFT - 9)
  #define STRIPE_SECTORS                (STRIPE_SIZE>>9)
 +#endif
 +
  #define       IO_THRESHOLD            1
  #define BYPASS_THRESHOLD      1
  #define NR_HASH                       (PAGE_SIZE / sizeof(struct hlist_head))
  #define HASH_MASK             (NR_HASH - 1)
  #define MAX_STRIPE_BATCH      8
  
 -/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 - * order without overlap.  There may be several bio's per stripe+device, and
 - * a bio could span several devices.
 - * When walking this list for a particular stripe+device, we must never proceed
 - * beyond a bio that extends past this device, as the next bio might no longer
 - * be valid.
 - * This function is used to determine the 'next' bio in the list, given the
 - * sector of the current stripe+device
 - */
 -static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 -{
 -      if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
 -              return bio->bi_next;
 -      else
 -              return NULL;
 -}
 -
  /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
   * This is because we sometimes take all the spinlocks
   * and creating that much locking depth can cause
@@@ -562,11 -574,6 +562,11 @@@ struct r5conf 
        int                     raid_disks;
        int                     max_nr_stripes;
        int                     min_nr_stripes;
 +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
 +      unsigned long   stripe_size;
 +      unsigned int    stripe_shift;
 +      unsigned long   stripe_sectors;
 +#endif
  
        /* reshape_progress is the leading edge of a 'reshape'
         * It has value MaxSector when no reshape is happening
        int                     prev_chunk_sectors;
        int                     prev_algo;
        short                   generation; /* increments with every reshape */
-       seqcount_t              gen_lock;       /* lock against generation changes */
+       seqcount_spinlock_t     gen_lock;       /* lock against generation changes */
        unsigned long           reshape_checkpoint; /* Time we last updated
                                                     * metadata */
        long long               min_offset_diff; /* minimum difference between
        struct r5pending_data   *next_pending_data;
  };
  
 +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
 +#define RAID5_STRIPE_SIZE(conf)       STRIPE_SIZE
 +#define RAID5_STRIPE_SHIFT(conf)      STRIPE_SHIFT
 +#define RAID5_STRIPE_SECTORS(conf)    STRIPE_SECTORS
 +#else
 +#define RAID5_STRIPE_SIZE(conf)       ((conf)->stripe_size)
 +#define RAID5_STRIPE_SHIFT(conf)      ((conf)->stripe_shift)
 +#define RAID5_STRIPE_SECTORS(conf)    ((conf)->stripe_sectors)
 +#endif
 +
 +/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 + * order without overlap.  There may be several bio's per stripe+device, and
 + * a bio could span several devices.
 + * When walking this list for a particular stripe+device, we must never proceed
 + * beyond a bio that extends past this device, as the next bio might no longer
 + * be valid.
 + * This function is used to determine the 'next' bio in the list, given the
 + * sector of the current stripe+device
 + */
 +static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector)
 +{
 +      if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf))
 +              return bio->bi_next;
 +      else
 +              return NULL;
 +}
  
  /*
   * Our supported algorithms
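
Of the three conf-relative macros, RAID5_STRIPE_SHIFT(conf) is the one not exercised in the hunks above. A hypothetical helper, not part of this patch, illustrates the kind of sector arithmetic it is meant for (assumes len > 0):

/* Hypothetical example: how many stripes does a [start, start + len) range span? */
static inline sector_t nr_stripes_spanned(struct r5conf *conf,
					  sector_t start, sector_t len)
{
	sector_t first = start >> RAID5_STRIPE_SHIFT(conf);
	sector_t last  = (start + len - 1) >> RAID5_STRIPE_SHIFT(conf);

	return last - first + 1;
}

On PAGE_SIZE == DEFAULT_STRIPE_SIZE builds this compiles to the same constant shift as before; otherwise it reads conf->stripe_shift.
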
diff --combined fs/userfaultfd.c
index 6e264dded46e4f51928a06d3c3e96d9358def764,26e8b23594fb3d14a078e8f38b47a645229a7804..0e4a3837da52e64134f8de280690485b8dbcb4a1
@@@ -61,7 -61,7 +61,7 @@@ struct userfaultfd_ctx 
        /* waitqueue head for events */
        wait_queue_head_t event_wqh;
        /* a refile sequence protected by fault_pending_wqh lock */
-       struct seqcount refile_seq;
+       seqcount_spinlock_t refile_seq;
        /* pseudo fd refcounting */
        refcount_t refcount;
        /* userfaultfd syscall flags */
@@@ -339,6 -339,7 +339,6 @@@ out
        return ret;
  }
  
 -/* Should pair with userfaultfd_signal_pending() */
  static inline long userfaultfd_get_blocking_state(unsigned int flags)
  {
        if (flags & FAULT_FLAG_INTERRUPTIBLE)
        return TASK_UNINTERRUPTIBLE;
  }
  
 -/* Should pair with userfaultfd_get_blocking_state() */
 -static inline bool userfaultfd_signal_pending(unsigned int flags)
 -{
 -      if (flags & FAULT_FLAG_INTERRUPTIBLE)
 -              return signal_pending(current);
 -
 -      if (flags & FAULT_FLAG_KILLABLE)
 -              return fatal_signal_pending(current);
 -
 -      return false;
 -}
 -
  /*
   * The locking rules involved in returning VM_FAULT_RETRY depending on
   * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
@@@ -503,9 -516,33 +503,9 @@@ vm_fault_t handle_userfault(struct vm_f
                                                       vmf->flags, reason);
        mmap_read_unlock(mm);
  
 -      if (likely(must_wait && !READ_ONCE(ctx->released) &&
 -                 !userfaultfd_signal_pending(vmf->flags))) {
 +      if (likely(must_wait && !READ_ONCE(ctx->released))) {
                wake_up_poll(&ctx->fd_wqh, EPOLLIN);
                schedule();
 -              ret |= VM_FAULT_MAJOR;
 -
 -              /*
 -               * False wakeups can orginate even from rwsem before
 -               * up_read() however userfaults will wait either for a
 -               * targeted wakeup on the specific uwq waitqueue from
 -               * wake_userfault() or for signals or for uffd
 -               * release.
 -               */
 -              while (!READ_ONCE(uwq.waken)) {
 -                      /*
 -                       * This needs the full smp_store_mb()
 -                       * guarantee as the state write must be
 -                       * visible to other CPUs before reading
 -                       * uwq.waken from other CPUs.
 -                       */
 -                      set_current_state(blocking_state);
 -                      if (READ_ONCE(uwq.waken) ||
 -                          READ_ONCE(ctx->released) ||
 -                          userfaultfd_signal_pending(vmf->flags))
 -                              break;
 -                      schedule();
 -              }
        }
  
        __set_current_state(TASK_RUNNING);
@@@ -1961,7 -1998,7 +1961,7 @@@ static void init_once_userfaultfd_ctx(v
        init_waitqueue_head(&ctx->fault_wqh);
        init_waitqueue_head(&ctx->event_wqh);
        init_waitqueue_head(&ctx->fd_wqh);
-       seqcount_init(&ctx->refile_seq);
+       seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
  }
  
  SYSCALL_DEFINE1(userfaultfd, int, flags)
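
The refile_seq change is type and initializer only; the existing read and write sides keep using the regular seqcount API, with fault_pending_wqh.lock as the writer-side serialization. A minimal sketch of that pattern with illustrative names (the demo_* identifiers are not from this file):

/* Illustrative sketch of a seqcount with an associated spinlock. */
#include <linux/seqlock.h>
#include <linux/spinlock.h>

struct demo_ctx {
	spinlock_t		lock;
	seqcount_spinlock_t	seq;	/* writers serialized by ->lock */
	unsigned long		value;
};

static void demo_init(struct demo_ctx *d)
{
	spin_lock_init(&d->lock);
	seqcount_spinlock_init(&d->seq, &d->lock);
}

static void demo_update(struct demo_ctx *d, unsigned long v)
{
	spin_lock(&d->lock);		/* writer side: hold the associated lock */
	write_seqcount_begin(&d->seq);
	d->value = v;
	write_seqcount_end(&d->seq);
	spin_unlock(&d->lock);
}

static unsigned long demo_read(struct demo_ctx *d)
{
	unsigned long v;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&d->seq);
		v = d->value;
	} while (read_seqcount_retry(&d->seq, seq));

	return v;
}
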
diff --combined include/linux/sched.h
index 52bcc9f48e176e46c15d1a840d2feaf3ea13d78a,7c7a9499d7bc7151f581bfe9983790da23039bfb..53ddc02e2e7922a7c505fed554437c0b5e98beb3
@@@ -18,7 -18,6 +18,7 @@@
  #include <linux/mutex.h>
  #include <linux/plist.h>
  #include <linux/hrtimer.h>
 +#include <linux/irqflags.h>
  #include <linux/seccomp.h>
  #include <linux/nodemask.h>
  #include <linux/rcupdate.h>
@@@ -32,6 -31,7 +32,7 @@@
  #include <linux/task_io_accounting.h>
  #include <linux/posix-timers.h>
  #include <linux/rseq.h>
+ #include <linux/seqlock.h>
  #include <linux/kcsan.h>
  
  /* task_struct member predeclarations (sorted alphabetically): */
@@@ -155,24 -155,24 +156,24 @@@ struct task_group
   *
   *   for (;;) {
   *    set_current_state(TASK_UNINTERRUPTIBLE);
 - *    if (!need_sleep)
 - *            break;
 + *    if (CONDITION)
 + *       break;
   *
   *    schedule();
   *   }
   *   __set_current_state(TASK_RUNNING);
   *
   * If the caller does not need such serialisation (because, for instance, the
 - * condition test and condition change and wakeup are under the same lock) then
 + * CONDITION test and condition change and wakeup are under the same lock) then
   * use __set_current_state().
   *
   * The above is typically ordered against the wakeup, which does:
   *
 - *   need_sleep = false;
 + *   CONDITION = 1;
   *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
   *
 - * where wake_up_state() executes a full memory barrier before accessing the
 - * task state.
 + * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
 + * accessing p->state.
   *
   * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
   * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
@@@ -375,7 -375,7 +376,7 @@@ struct util_est 
   * For cfs_rq, they are the aggregated values of all runnable and blocked
   * sched_entities.
   *
 - * The load/runnable/util_avg doesn't direcly factor frequency scaling and CPU
 + * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
   * capacity scaling. The scaling is done through the rq_clock_pelt that is used
   * for computing those signals (see update_rq_clock_pelt())
   *
@@@ -687,15 -687,9 +688,15 @@@ struct task_struct 
        struct sched_dl_entity          dl;
  
  #ifdef CONFIG_UCLAMP_TASK
 -      /* Clamp values requested for a scheduling entity */
 +      /*
 +       * Clamp values requested for a scheduling entity.
 +       * Must be updated with task_rq_lock() held.
 +       */
        struct uclamp_se                uclamp_req[UCLAMP_CNT];
 -      /* Effective clamp values used for a scheduling entity */
 +      /*
 +       * Effective clamp values used for a scheduling entity.
 +       * Must be updated with task_rq_lock() held.
 +       */
        struct uclamp_se                uclamp[UCLAMP_CNT];
  #endif
  
  #endif
  
  #ifdef CONFIG_TRACE_IRQFLAGS
 -      unsigned int                    irq_events;
 +      struct irqtrace_events          irqtrace;
        unsigned int                    hardirq_threaded;
 -      unsigned long                   hardirq_enable_ip;
 -      unsigned long                   hardirq_disable_ip;
 -      unsigned int                    hardirq_enable_event;
 -      unsigned int                    hardirq_disable_event;
        u64                             hardirq_chain_key;
 -      unsigned long                   softirq_disable_ip;
 -      unsigned long                   softirq_enable_ip;
 -      unsigned int                    softirq_disable_event;
 -      unsigned int                    softirq_enable_event;
        int                             softirqs_enabled;
        int                             softirq_context;
        int                             irq_config;
        /* Protected by ->alloc_lock: */
        nodemask_t                      mems_allowed;
        /* Sequence number to catch updates: */
-       seqcount_t                      mems_allowed_seq;
+       seqcount_spinlock_t             mems_allowed_seq;
        int                             cpuset_mem_spread_rotor;
        int                             cpuset_slab_spread_rotor;
  #endif
  #ifdef CONFIG_KASAN
        unsigned int                    kasan_depth;
  #endif
 +
  #ifdef CONFIG_KCSAN
        struct kcsan_ctx                kcsan_ctx;
 +#ifdef CONFIG_TRACE_IRQFLAGS
 +      struct irqtrace_events          kcsan_save_irqtrace;
 +#endif
  #endif
  
  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@@ -1507,6 -1505,7 +1508,6 @@@ extern struct pid *cad_pid
  #define PF_KTHREAD            0x00200000      /* I am a kernel thread */
  #define PF_RANDOMIZE          0x00400000      /* Randomize virtual address space */
  #define PF_SWAPWRITE          0x00800000      /* Allowed to write to swap */
 -#define PF_UMH                        0x02000000      /* I'm an Usermodehelper process */
  #define PF_NO_SETAFFINITY     0x04000000      /* Userland is not allowed to meddle with cpus_mask */
  #define PF_MCE_EARLY          0x08000000      /* Early kill for mce process policy */
  #define PF_MEMALLOC_NOCMA     0x10000000      /* All allocation request will have _GFP_MOVABLE cleared */
@@@ -1648,9 -1647,6 +1649,9 @@@ extern int idle_cpu(int cpu)
  extern int available_idle_cpu(int cpu);
  extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
  extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
 +extern void sched_set_fifo(struct task_struct *p);
 +extern void sched_set_fifo_low(struct task_struct *p);
 +extern void sched_set_normal(struct task_struct *p, int nice);
  extern int sched_setattr(struct task_struct *, const struct sched_attr *);
  extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
  extern struct task_struct *idle_task(int cpu);
@@@ -2018,6 -2014,14 +2019,6 @@@ static inline void rseq_execve(struct t
  
  #endif
  
 -void __exit_umh(struct task_struct *tsk);
 -
 -static inline void exit_umh(struct task_struct *tsk)
 -{
 -      if (unlikely(tsk->flags & PF_UMH))
 -              __exit_umh(tsk);
 -}
 -
  #ifdef CONFIG_DEBUG_RSEQ
  
  void rseq_syscall(struct pt_regs *regs);
@@@ -2039,7 -2043,6 +2040,7 @@@ const struct sched_avg *sched_trace_rq_
  const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
  
  int sched_trace_rq_cpu(struct rq *rq);
 +int sched_trace_rq_nr_running(struct rq *rq);
  
  const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
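
The sched_set_fifo(), sched_set_fifo_low() and sched_set_normal() declarations added earlier in this hunk give kernel threads a sanctioned way to pick an RT policy without choosing a numeric priority themselves. A minimal sketch of a call site, assuming a driver-owned kthread (names are illustrative):

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_thread_fn(void *data)
{
	sched_set_fifo(current);	/* instead of open-coding sched_setscheduler() */

	while (!kthread_should_stop())
		msleep(1000);		/* placeholder for real work */

	return 0;
}
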
  
diff --combined include/net/netfilter/nf_conntrack.h
index c7bfddfc65b02e4eaf22aac0409e56404455db69,ea4e2010b2465bd58511904dfdf63717df64de64..439379ca9fface9171d47e5bc6361e34e9cf4360
@@@ -279,18 -279,6 +279,18 @@@ static inline bool nf_ct_should_gc(cons
               !nf_ct_is_dying(ct);
  }
  
 +#define       NF_CT_DAY       (86400 * HZ)
 +
 +/* Set an arbitrary timeout large enough not to ever expire; this saves
 + * us a check for the IPS_OFFLOAD_BIT from the packet path via
 + * nf_ct_is_expired().
 + */
 +static inline void nf_ct_offload_timeout(struct nf_conn *ct)
 +{
 +      if (nf_ct_expires(ct) < NF_CT_DAY / 2)
 +              ct->timeout = nfct_time_stamp + NF_CT_DAY;
 +}
 +
  struct kernel_param;
  
  int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp);
@@@ -298,7 -286,7 +298,7 @@@ int nf_conntrack_hash_resize(unsigned i
  
  extern struct hlist_nulls_head *nf_conntrack_hash;
  extern unsigned int nf_conntrack_htable_size;
- extern seqcount_t nf_conntrack_generation;
+ extern seqcount_spinlock_t nf_conntrack_generation;
  extern unsigned int nf_conntrack_max;
  
  /* must be called with rcu read lock held */
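
To put NF_CT_DAY in concrete terms: with HZ=250, for example, it is 86400 * 250 = 21,600,000 jiffies, so nf_ct_offload_timeout() tops an offloaded flow back up to a full day whenever its remaining timeout falls below 10,800,000 jiffies (half a day).
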
diff --combined init/init_task.c
index a3eb3847e1f48d5c8c9df2f6e50520e35af4b9c8,94fe3ba1bb600cf40d07ea7af04f327fb24a034b..89024e8c4e95e56eb6abf6526c8bf7c52db68b56
@@@ -154,7 -154,8 +154,8 @@@ struct task_struct init_tas
        .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
  #endif
  #ifdef CONFIG_CPUSETS
-       .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq),
+       .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
+                                                &init_task.alloc_lock),
  #endif
  #ifdef CONFIG_RT_MUTEXES
        .pi_waiters     = RB_ROOT_CACHED,
  #ifdef CONFIG_SECURITY
        .security       = NULL,
  #endif
 +#ifdef CONFIG_SECCOMP
 +      .seccomp        = { .filter_count = ATOMIC_INIT(0) },
 +#endif
  };
  EXPORT_SYMBOL(init_task);
  
diff --combined kernel/fork.c
index 35e9894d394c2941a2ebf873494e6e1546834421,fc72f09a61b2b7d5c2bda8f46f01a77c93cebde3..4d32190861bdc6500730bf296b2f79f7a51ceacf
@@@ -261,7 -261,7 +261,7 @@@ static unsigned long *alloc_thread_stac
                                             THREAD_SIZE_ORDER);
  
        if (likely(page)) {
 -              tsk->stack = page_address(page);
 +              tsk->stack = kasan_reset_tag(page_address(page));
                return tsk->stack;
        }
        return NULL;
@@@ -276,8 -276,13 +276,8 @@@ static inline void free_thread_stack(st
        if (vm) {
                int i;
  
 -              for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
 -                      mod_memcg_page_state(vm->pages[i],
 -                                           MEMCG_KERNEL_STACK_KB,
 -                                           -(int)(PAGE_SIZE / 1024));
 -
 +              for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
                        memcg_kmem_uncharge_page(vm->pages[i], 0);
 -              }
  
                for (i = 0; i < NR_CACHED_STACKS; i++) {
                        if (this_cpu_cmpxchg(cached_stacks[i],
@@@ -302,7 -307,6 +302,7 @@@ static unsigned long *alloc_thread_stac
  {
        unsigned long *stack;
        stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 +      stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return stack;
  }
@@@ -355,13 -359,7 +355,13 @@@ struct vm_area_struct *vm_area_dup(stru
        struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
  
        if (new) {
 -              *new = *orig;
 +              ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
 +              ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
 +              /*
 +               * orig->shared.rb may be modified concurrently, but the clone
 +               * will be reinitialized.
 +               */
 +              *new = data_race(*orig);
                INIT_LIST_HEAD(&new->anon_vma_chain);
                new->vm_next = new->vm_prev = NULL;
        }
@@@ -378,14 -376,31 +378,14 @@@ static void account_kernel_stack(struc
        void *stack = task_stack_page(tsk);
        struct vm_struct *vm = task_stack_vm_area(tsk);
  
 -      BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
 -
 -      if (vm) {
 -              int i;
 -
 -              BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 -
 -              for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
 -                      mod_zone_page_state(page_zone(vm->pages[i]),
 -                                          NR_KERNEL_STACK_KB,
 -                                          PAGE_SIZE / 1024 * account);
 -              }
 -      } else {
 -              /*
 -               * All stack pages are in the same zone and belong to the
 -               * same memcg.
 -               */
 -              struct page *first_page = virt_to_page(stack);
 -
 -              mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 -                                  THREAD_SIZE / 1024 * account);
  
 -              mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
 -                                  account * (THREAD_SIZE / 1024));
 -      }
 +      /* All stack pages are in the same node. */
 +      if (vm)
 +              mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
 +                                    account * (THREAD_SIZE / 1024));
 +      else
 +              mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
 +                                    account * (THREAD_SIZE / 1024));
  }
  
  static int memcg_charge_kernel_stack(struct task_struct *tsk)
        struct vm_struct *vm = task_stack_vm_area(tsk);
        int ret;
  
 +      BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
 +
        if (vm) {
                int i;
  
 +              BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 +
                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
                        /*
                         * If memcg_kmem_charge_page() fails, page->mem_cgroup
 -                       * pointer is NULL, and both memcg_kmem_uncharge_page()
 -                       * and mod_memcg_page_state() in free_thread_stack()
 -                       * will ignore this page. So it's safe.
 +                       * pointer is NULL, and memcg_kmem_uncharge_page() in
 +                       * free_thread_stack() will ignore this page.
                         */
                        ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
                                                     0);
                        if (ret)
                                return ret;
 -
 -                      mod_memcg_page_state(vm->pages[i],
 -                                           MEMCG_KERNEL_STACK_KB,
 -                                           PAGE_SIZE / 1024);
                }
        }
  #endif
@@@ -457,6 -473,7 +457,6 @@@ void free_task(struct task_struct *tsk
  #endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
 -      put_seccomp_filter(tsk);
        arch_release_task_struct(tsk);
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
@@@ -1457,7 -1474,7 +1457,7 @@@ static int copy_files(unsigned long clo
                goto out;
        }
  
 -      newf = dup_fd(oldf, &error);
 +      newf = dup_fd(oldf, NR_OPEN_MAX, &error);
        if (!newf)
                goto out;
  
@@@ -1770,18 -1787,22 +1770,18 @@@ static void pidfd_show_fdinfo(struct se
   */
  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
  {
 -      struct task_struct *task;
        struct pid *pid = file->private_data;
        __poll_t poll_flags = 0;
  
        poll_wait(file, &pid->wait_pidfd, pts);
  
 -      rcu_read_lock();
 -      task = pid_task(pid, PIDTYPE_PID);
        /*
         * Inform pollers only when the whole thread group exits.
         * If the thread group leader exits before all other threads in the
         * group, then poll(2) should block, similar to the wait(2) family.
         */
 -      if (!task || (task->exit_state && thread_group_empty(task)))
 +      if (thread_group_exited(pid))
                poll_flags = EPOLLIN | EPOLLRDNORM;
 -      rcu_read_unlock();
  
        return poll_flags;
  }
@@@ -2011,14 -2032,20 +2011,14 @@@ static __latent_entropy struct task_str
  #ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
-       seqcount_init(&p->mems_allowed_seq);
+       seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
  #endif
  #ifdef CONFIG_TRACE_IRQFLAGS
 -      p->irq_events = 0;
 -      p->hardirq_enable_ip = 0;
 -      p->hardirq_enable_event = 0;
 -      p->hardirq_disable_ip = _THIS_IP_;
 -      p->hardirq_disable_event = 0;
 -      p->softirqs_enabled = 1;
 -      p->softirq_enable_ip = _THIS_IP_;
 -      p->softirq_enable_event = 0;
 -      p->softirq_disable_ip = 0;
 -      p->softirq_disable_event = 0;
 -      p->softirq_context = 0;
 +      memset(&p->irqtrace, 0, sizeof(p->irqtrace));
 +      p->irqtrace.hardirq_disable_ip  = _THIS_IP_;
 +      p->irqtrace.softirq_enable_ip   = _THIS_IP_;
 +      p->softirqs_enabled             = 1;
 +      p->softirq_context              = 0;
  #endif
  
        p->pagefault_disabled = 0;
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
 -      retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
 -                               args->tls);
 +      retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
        if (retval)
                goto bad_fork_cleanup_io;
  
        write_unlock_irq(&tasklist_lock);
  
        proc_fork_connector(p);
 +      sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);
  
@@@ -2393,20 -2420,6 +2393,20 @@@ long _do_fork(struct kernel_clone_args 
        int trace = 0;
        long nr;
  
 +      /*
 +       * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
 +       * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
 +       * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
 +       * field in struct clone_args and it still doesn't make sense to have
 +       * them both point at the same memory location. Performing this check
 +       * here has the advantage that we don't need to have a separate helper
 +       * to check for legacy clone().
 +       */
 +      if ((args->flags & CLONE_PIDFD) &&
 +          (args->flags & CLONE_PARENT_SETTID) &&
 +          (args->pidfd == args->parent_tid))
 +              return -EINVAL;
 +
        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
        return nr;
  }
  
 -bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
 -{
 -      /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
 -      if ((kargs->flags & CLONE_PIDFD) &&
 -          (kargs->flags & CLONE_PARENT_SETTID))
 -              return false;
 -
 -      return true;
 -}
 -
 -#ifndef CONFIG_HAVE_COPY_THREAD_TLS
 -/* For compatibility with architectures that call do_fork directly rather than
 - * using the syscall entry points below. */
 -long do_fork(unsigned long clone_flags,
 -            unsigned long stack_start,
 -            unsigned long stack_size,
 -            int __user *parent_tidptr,
 -            int __user *child_tidptr)
 -{
 -      struct kernel_clone_args args = {
 -              .flags          = (lower_32_bits(clone_flags) & ~CSIGNAL),
 -              .pidfd          = parent_tidptr,
 -              .child_tid      = child_tidptr,
 -              .parent_tid     = parent_tidptr,
 -              .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
 -              .stack          = stack_start,
 -              .stack_size     = stack_size,
 -      };
 -
 -      if (!legacy_clone_args_valid(&args))
 -              return -EINVAL;
 -
 -      return _do_fork(&args);
 -}
 -#endif
 -
  /*
   * Create a kernel thread.
   */
@@@ -2542,12 -2591,24 +2542,12 @@@ SYSCALL_DEFINE5(clone, unsigned long, c
                .tls            = tls,
        };
  
 -      if (!legacy_clone_args_valid(&args))
 -              return -EINVAL;
 -
        return _do_fork(&args);
  }
  #endif
  
  #ifdef __ARCH_WANT_SYS_CLONE3
  
 -/*
 - * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
 - * the registers containing the syscall arguments for clone. This doesn't work
 - * with clone3 since the TLS value is passed in clone_args instead.
 - */
 -#ifndef CONFIG_HAVE_COPY_THREAD_TLS
 -#error clone3 requires copy_thread_tls support in arch
 -#endif
 -
  noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
@@@ -2844,15 -2905,14 +2844,15 @@@ static int unshare_fs(unsigned long uns
  /*
   * Unshare file descriptor table if it is being shared
   */
 -static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
 +int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
 +             struct files_struct **new_fdp)
  {
        struct files_struct *fd = current->files;
        int error = 0;
  
        if ((unshare_flags & CLONE_FILES) &&
            (fd && atomic_read(&fd->count) > 1)) {
 -              *new_fdp = dup_fd(fd, &error);
 +              *new_fdp = dup_fd(fd, max_fds, &error);
                if (!*new_fdp)
                        return error;
        }
  /*
   * unshare allows a process to 'unshare' part of the process
   * context which was originally shared using clone.  copy_*
 - * functions used by do_fork() cannot be used here directly
 + * functions used by _do_fork() cannot be used here directly
   * because they modify an inactive task_struct that is being
   * constructed. Here we are modifying the current, active,
   * task_struct.
@@@ -2912,7 -2972,7 +2912,7 @@@ int ksys_unshare(unsigned long unshare_
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
 -      err = unshare_fd(unshare_flags, &new_fd);
 +      err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
@@@ -3001,7 -3061,7 +3001,7 @@@ int unshare_files(struct files_struct *
        struct files_struct *copy = NULL;
        int error;
  
 -      error = unshare_fd(CLONE_FILES, &copy);
 +      error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
        if (error || !copy) {
                *displaced = NULL;
                return error;
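
The legacy-clone() check added above exists because old clone() has no separate pidfd argument, so CLONE_PIDFD reuses parent_tid. With clone3() the two are distinct struct clone_args members, which is what makes the combination meaningful there. A user-space sketch (illustrative only; needs Linux 5.3+ headers for clone3 and struct clone_args):

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */
#include <sys/syscall.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int pidfd = -1;
	struct clone_args args;
	long pid;

	memset(&args, 0, sizeof(args));
	args.flags       = CLONE_PIDFD;
	args.pidfd       = (uint64_t)(uintptr_t)&pidfd;	/* separate from parent_tid */
	args.exit_signal = SIGCHLD;

	pid = syscall(SYS_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0)
		_exit(0);			/* child */

	printf("child pid %ld, pidfd %d\n", pid, pidfd);
	return 0;
}
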
diff --combined kernel/time/timekeeping.c
index 63a632f9896c38133e4877dc828390af052105f1,05ecfd8a3314037e2f84fbdfb7b557efaebb72c9..406306b3345232298f4b7dbb3b0a58e77536a568
@@@ -39,18 -39,19 +39,19 @@@ enum timekeeping_adv_mode 
        TK_ADV_FREQ
  };
  
+ static DEFINE_RAW_SPINLOCK(timekeeper_lock);
  /*
   * The most important data for readout fits into a single 64 byte
   * cache line.
   */
  static struct {
-       seqcount_t              seq;
+       seqcount_raw_spinlock_t seq;
        struct timekeeper       timekeeper;
  } tk_core ____cacheline_aligned = {
-       .seq = SEQCNT_ZERO(tk_core.seq),
+       .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
  };
  
- static DEFINE_RAW_SPINLOCK(timekeeper_lock);
  static struct timekeeper shadow_timekeeper;
  
  /**
@@@ -63,7 -64,7 +64,7 @@@
   * See @update_fast_timekeeper() below.
   */
  struct tk_fast {
-       seqcount_t              seq;
+       seqcount_raw_spinlock_t seq;
        struct tk_read_base     base[2];
  };
  
@@@ -80,11 -81,13 +81,13 @@@ static struct clocksource dummy_clock 
  };
  
  static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+       .seq     = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock),
        .base[0] = { .clock = &dummy_clock, },
        .base[1] = { .clock = &dummy_clock, },
  };
  
  static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
+       .seq     = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock),
        .base[0] = { .clock = &dummy_clock, },
        .base[1] = { .clock = &dummy_clock, },
  };
@@@ -157,7 -160,7 +160,7 @@@ static inline void tk_update_sleep_time
   * tk_clock_read - atomic clocksource read() helper
   *
   * This helper is necessary to use in the read paths because, while the
-  * seqlock ensures we don't return a bad value while structures are updated,
+  * seqcount ensures we don't return a bad value while structures are updated,
   * it doesn't protect from potential crashes. There is the possibility that
   * the tkr's clocksource may change between the read reference, and the
   * clock reference passed to the read function.  This can cause crashes if
@@@ -222,10 -225,10 +225,10 @@@ static inline u64 timekeeping_get_delta
        unsigned int seq;
  
        /*
-        * Since we're called holding a seqlock, the data may shift
+        * Since we're called holding a seqcount, the data may shift
         * under us while we're doing the calculation. This can cause
         * false positives, since we'd note a problem but throw the
-        * results away. So nest another seqlock here to atomically
+        * results away. So nest another seqcount here to atomically
         * grab the points we are checking with.
         */
        do {
@@@ -486,7 -489,7 +489,7 @@@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns
   *
   * To keep it NMI safe since we're accessing from tracing, we're not using a
   * separate timekeeper with updates to monotonic clock and boot offset
-  * protected with seqlocks. This has the following minor side effects:
+  * protected with seqcounts. This has the following minor side effects:
   *
   * (1) Its possible that a timestamp be taken after the boot offset is updated
   * but before the timekeeper is updated. If this happens, the new boot offset
@@@ -2193,7 -2196,7 +2196,7 @@@ EXPORT_SYMBOL(ktime_get_coarse_ts64)
  void do_timer(unsigned long ticks)
  {
        jiffies_64 += ticks;
 -      calc_global_load(ticks);
 +      calc_global_load();
  }
  
  /**
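
Because the associated lock has to be named in the static initializer, DEFINE_RAW_SPINLOCK(timekeeper_lock) moves above tk_core in this hunk, and both tk_fast structures gain an explicit .seq initializer. The same pattern in isolation, with illustrative names:

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RAW_SPINLOCK(demo_lock);

static struct {
	seqcount_raw_spinlock_t	seq;
	u64			payload;
} demo = {
	.seq = SEQCNT_RAW_SPINLOCK_ZERO(demo.seq, &demo_lock),
};

/* Writer side: demo_lock serializes all updates. */
static void demo_update(u64 v)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&demo_lock, flags);
	write_seqcount_begin(&demo.seq);
	demo.payload = v;
	write_seqcount_end(&demo.seq);
	raw_spin_unlock_irqrestore(&demo_lock, flags);
}
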
diff --combined net/netfilter/nf_conntrack_core.c
index e38b60fc183e364003e148eec5478670fdc99326,b597b5b16ba1919785468ddf84b96c316fa00d58..5b97d233f89ba5d8477311a2c2d82f9faf22dc5e
@@@ -180,7 -180,7 +180,7 @@@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_s
  
  unsigned int nf_conntrack_max __read_mostly;
  EXPORT_SYMBOL_GPL(nf_conntrack_max);
- seqcount_t nf_conntrack_generation __read_mostly;
+ seqcount_spinlock_t nf_conntrack_generation __read_mostly;
  static unsigned int nf_conntrack_hash_rnd __read_mostly;
  
  static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
@@@ -1006,7 -1006,7 +1006,7 @@@ static int nf_ct_resolve_clash_harder(s
   *
   * @skb: skb that causes the clash
   * @h: tuplehash of the clashing entry already in table
 - * @hash_reply: hash slot for reply direction
 + * @reply_hash: hash slot for reply direction
   *
   * A conntrack entry can be inserted to the connection tracking table
   * if there is no existing entry with an identical tuple.
@@@ -1344,6 -1344,18 +1344,6 @@@ static bool gc_worker_can_early_drop(co
        return false;
  }
  
 -#define       DAY     (86400 * HZ)
 -
 -/* Set an arbitrary timeout large enough not to ever expire, this save
 - * us a check for the IPS_OFFLOAD_BIT from the packet path via
 - * nf_ct_is_expired().
 - */
 -static void nf_ct_offload_timeout(struct nf_conn *ct)
 -{
 -      if (nf_ct_expires(ct) < DAY / 2)
 -              ct->timeout = nfct_time_stamp + DAY;
 -}
 -
  static void gc_worker(struct work_struct *work)
  {
        unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
@@@ -2588,7 -2600,8 +2588,8 @@@ int nf_conntrack_init_start(void
        /* struct nf_ct_ext uses u8 to store offsets/size */
        BUILD_BUG_ON(total_extension_size() > 255u);
  
-       seqcount_init(&nf_conntrack_generation);
+       seqcount_spinlock_init(&nf_conntrack_generation,
+                              &nf_conntrack_locks_all_lock);
  
        for (i = 0; i < CONNTRACK_LOCKS; i++)
                spin_lock_init(&nf_conntrack_locks[i]);
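
  The conntrack conversion above only changes the counter's type and its initializer;
  the rest of the usage stays as before. A self-contained sketch of the same pattern
  with purely hypothetical names (demo_lock, demo_seq, demo_gen), showing the writer
  holding the associated spinlock, which is what the new lockdep assertion in
  write_seqcount_begin() is able to check:

	#include <linux/spinlock.h>
	#include <linux/seqlock.h>

	static DEFINE_SPINLOCK(demo_lock);
	static seqcount_spinlock_t demo_seq;
	static unsigned int demo_gen;

	static void demo_init(void)
	{
		/* Same pairing as nf_conntrack_init_start() above. */
		seqcount_spinlock_init(&demo_seq, &demo_lock);
	}

	static void demo_bump_generation(void)
	{
		spin_lock(&demo_lock);			/* writer serialization ...   */
		write_seqcount_begin(&demo_seq);	/* ... validated by lockdep   */
		demo_gen++;
		write_seqcount_end(&demo_seq);
		spin_unlock(&demo_lock);
	}

	static unsigned int demo_read_generation(void)
	{
		unsigned int seq, gen;

		do {
			seq = read_seqcount_begin(&demo_seq);
			gen = demo_gen;
		} while (read_seqcount_retry(&demo_seq, seq));

		return gen;
	}
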
diff --combined net/xfrm/xfrm_policy.c
index 042ea9b40c7b4183b6a0a4f9322c8361d4f59f94,732a940468b075fabf44df3e891a875cee593986..d5280fd6f9c127b6099f07108a74f76005923e98
@@@ -39,7 -39,7 +39,7 @@@
  #ifdef CONFIG_XFRM_STATISTICS
  #include <net/snmp.h>
  #endif
 -#ifdef CONFIG_INET_ESPINTCP
 +#ifdef CONFIG_XFRM_ESPINTCP
  #include <net/espintcp.h>
  #endif
  
@@@ -122,7 -122,7 +122,7 @@@ struct xfrm_pol_inexact_bin 
        /* list containing '*:*' policies */
        struct hlist_head hhead;
  
-       seqcount_t count;
+       seqcount_spinlock_t count;
        /* tree sorted by daddr/prefix */
        struct rb_root root_d;
  
@@@ -155,7 -155,7 +155,7 @@@ static struct xfrm_policy_afinfo const 
                                                __read_mostly;
  
  static struct kmem_cache *xfrm_dst_cache __ro_after_init;
- static __read_mostly seqcount_t xfrm_policy_hash_generation;
+ static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation;
  
  static struct rhashtable xfrm_policy_inexact_table;
  static const struct rhashtable_params xfrm_pol_inexact_params;
@@@ -719,7 -719,7 +719,7 @@@ xfrm_policy_inexact_alloc_bin(const str
        INIT_HLIST_HEAD(&bin->hhead);
        bin->root_d = RB_ROOT;
        bin->root_s = RB_ROOT;
-       seqcount_init(&bin->count);
+       seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock);
  
        prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table,
                                                &bin->k, &bin->head,
@@@ -1433,10 -1433,14 +1433,10 @@@ static void xfrm_policy_requeue(struct 
        spin_unlock_bh(&pq->hold_queue.lock);
  }
  
 -static bool xfrm_policy_mark_match(struct xfrm_policy *policy,
 -                                 struct xfrm_policy *pol)
 +static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark,
 +                                        struct xfrm_policy *pol)
  {
 -      if (policy->mark.v == pol->mark.v &&
 -          policy->priority == pol->priority)
 -              return true;
 -
 -      return false;
 +      return mark->v == pol->mark.v && mark->m == pol->mark.m;
  }
  
  static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed)
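
  For clarity, the semantic change carried by the reworked helper, restated side by
  side (the demo_* function names are illustrative, not part of the patch): the old
  callers did a masked compare of a bare u32 against the policy, while
  xfrm_policy_mark_match() now requires both the value and the mask of a
  struct xfrm_mark to match the policy exactly.

	#include <net/xfrm.h>

	/* Old-style check, as removed from __xfrm_policy_bysel_ctx() and friends. */
	static bool demo_old_match(u32 mark, const struct xfrm_policy *pol)
	{
		return (mark & pol->mark.m) == pol->mark.v;
	}

	/* New check, equivalent to xfrm_policy_mark_match() after this patch. */
	static bool demo_new_match(const struct xfrm_mark *mark,
				   const struct xfrm_policy *pol)
	{
		return mark->v == pol->mark.v && mark->m == pol->mark.m;
	}
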
@@@ -1499,7 -1503,7 +1499,7 @@@ static void xfrm_policy_insert_inexact_
                if (pol->type == policy->type &&
                    pol->if_id == policy->if_id &&
                    !selector_cmp(&pol->selector, &policy->selector) &&
 -                  xfrm_policy_mark_match(policy, pol) &&
 +                  xfrm_policy_mark_match(&policy->mark, pol) &&
                    xfrm_sec_ctx_match(pol->security, policy->security) &&
                    !WARN_ON(delpol)) {
                        delpol = pol;
@@@ -1534,7 -1538,7 +1534,7 @@@ static struct xfrm_policy *xfrm_policy_
                if (pol->type == policy->type &&
                    pol->if_id == policy->if_id &&
                    !selector_cmp(&pol->selector, &policy->selector) &&
 -                  xfrm_policy_mark_match(policy, pol) &&
 +                  xfrm_policy_mark_match(&policy->mark, pol) &&
                    xfrm_sec_ctx_match(pol->security, policy->security) &&
                    !WARN_ON(delpol)) {
                        if (excl)
@@@ -1606,8 -1610,9 +1606,8 @@@ int xfrm_policy_insert(int dir, struct 
  EXPORT_SYMBOL(xfrm_policy_insert);
  
  static struct xfrm_policy *
 -__xfrm_policy_bysel_ctx(struct hlist_head *chain, u32 mark, u32 if_id,
 -                      u8 type, int dir,
 -                      struct xfrm_selector *sel,
 +__xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark,
 +                      u32 if_id, u8 type, int dir, struct xfrm_selector *sel,
                        struct xfrm_sec_ctx *ctx)
  {
        struct xfrm_policy *pol;
        hlist_for_each_entry(pol, chain, bydst) {
                if (pol->type == type &&
                    pol->if_id == if_id &&
 -                  (mark & pol->mark.m) == pol->mark.v &&
 +                  xfrm_policy_mark_match(mark, pol) &&
                    !selector_cmp(sel, &pol->selector) &&
                    xfrm_sec_ctx_match(ctx, pol->security))
                        return pol;
        return NULL;
  }
  
 -struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
 -                                        u8 type, int dir,
 -                                        struct xfrm_selector *sel,
 -                                        struct xfrm_sec_ctx *ctx, int delete,
 -                                        int *err)
 +struct xfrm_policy *
 +xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id,
 +                    u8 type, int dir, struct xfrm_selector *sel,
 +                    struct xfrm_sec_ctx *ctx, int delete, int *err)
  {
        struct xfrm_pol_inexact_bin *bin = NULL;
        struct xfrm_policy *pol, *ret = NULL;
  }
  EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
  
 -struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id,
 -                                   u8 type, int dir, u32 id, int delete,
 -                                   int *err)
 +struct xfrm_policy *
 +xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id,
 +               u8 type, int dir, u32 id, int delete, int *err)
  {
        struct xfrm_policy *pol, *ret;
        struct hlist_head *chain;
        ret = NULL;
        hlist_for_each_entry(pol, chain, byidx) {
                if (pol->type == type && pol->index == id &&
 -                  pol->if_id == if_id &&
 -                  (mark & pol->mark.m) == pol->mark.v) {
 +                  pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) {
                        xfrm_pol_hold(pol);
                        if (delete) {
                                *err = security_xfrm_policy_delete(
@@@ -1899,7 -1906,7 +1899,7 @@@ static int xfrm_policy_match(const stru
  
  static struct xfrm_pol_inexact_node *
  xfrm_policy_lookup_inexact_addr(const struct rb_root *r,
-                               seqcount_t *count,
+                               seqcount_spinlock_t *count,
                                const xfrm_address_t *addr, u16 family)
  {
        const struct rb_node *parent;
@@@ -2751,7 -2758,6 +2751,7 @@@ static void xfrm_policy_queue_process(s
        struct xfrm_policy_queue *pq = &pol->polq;
        struct flowi fl;
        struct sk_buff_head list;
 +      __u32 skb_mark;
  
        spin_lock(&pq->hold_queue.lock);
        skb = skb_peek(&pq->hold_queue);
        }
        dst = skb_dst(skb);
        sk = skb->sk;
 +
 +      /* Fixup the mark to support VTI. */
 +      skb_mark = skb->mark;
 +      skb->mark = pol->mark.v;
        xfrm_decode_session(skb, &fl, dst->ops->family);
 +      skb->mark = skb_mark;
        spin_unlock(&pq->hold_queue.lock);
  
        dst_hold(xfrm_dst_path(dst));
        while (!skb_queue_empty(&list)) {
                skb = __skb_dequeue(&list);
  
 +              /* Fixup the mark to support VTI. */
 +              skb_mark = skb->mark;
 +              skb->mark = pol->mark.v;
                xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
 +              skb->mark = skb_mark;
 +
                dst_hold(xfrm_dst_path(skb_dst(skb)));
                dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0);
                if (IS_ERR(dst)) {
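
  The two hunks above repeat the same save/override/restore dance around
  xfrm_decode_session(). As an illustration only, the pattern could be factored into a
  hypothetical helper like the one below; demo_decode_with_policy_mark() is not a
  function in this tree:

	/* Hypothetical helper, not part of this patch: decode the session with the
	 * policy mark temporarily applied so VTI mark-based lookups are honoured.
	 */
	static void demo_decode_with_policy_mark(struct sk_buff *skb, struct flowi *fl,
						 const struct xfrm_policy *pol,
						 unsigned short family)
	{
		__u32 saved_mark = skb->mark;

		skb->mark = pol->mark.v;	/* let decode see the policy mark */
		xfrm_decode_session(skb, fl, family);
		skb->mark = saved_mark;		/* restore the original skb mark  */
	}
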
@@@ -4157,10 -4153,10 +4157,10 @@@ void __init xfrm_init(void
  {
        register_pernet_subsys(&xfrm_net_ops);
        xfrm_dev_init();
-       seqcount_init(&xfrm_policy_hash_generation);
+       seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex);
        xfrm_input_init();
  
 -#ifdef CONFIG_INET_ESPINTCP
 +#ifdef CONFIG_XFRM_ESPINTCP
        espintcp_init();
  #endif
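
  xfrm_policy_hash_generation is the mutex-associated variant of the same scheme. A
  hypothetical, self-contained sketch (all demo_* names are made up), relying on the
  seqcount_mutex_init() call shown in the hunk above and on the writer holding the
  associated mutex across the write section:

	#include <linux/mutex.h>
	#include <linux/seqlock.h>

	static DEFINE_MUTEX(demo_resize_mutex);
	static seqcount_mutex_t demo_hash_generation;
	static unsigned int demo_hash_mask;

	static void demo_hash_init(void)
	{
		seqcount_mutex_init(&demo_hash_generation, &demo_resize_mutex);
	}

	static void demo_hash_resize(unsigned int new_mask)
	{
		mutex_lock(&demo_resize_mutex);			/* associated lock held ... */
		write_seqcount_begin(&demo_hash_generation);	/* ... so lockdep is happy  */
		demo_hash_mask = new_mask;			/* swap in the new tables   */
		write_seqcount_end(&demo_hash_generation);
		mutex_unlock(&demo_resize_mutex);
	}
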
  