Merge tag 'sched-core-2021-02-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author Linus Torvalds <[email protected]>
Sun, 21 Feb 2021 20:35:04 +0000 (12:35 -0800)
committer Linus Torvalds <[email protected]>
Sun, 21 Feb 2021 20:35:04 +0000 (12:35 -0800)
Pull scheduler updates from Ingo Molnar:
 "Core scheduler updates:

   - Add CONFIG_PREEMPT_DYNAMIC: in its current form this adds the
     preempt=none/voluntary/full boot option (default: full), allowing
     distros to build a PREEMPT kernel but fall back, via a boot-time
     selection, to scheduling behavior close to PREEMPT_VOLUNTARY (or
     PREEMPT_NONE).

     There's also the /debug/sched_preempt switch to do this at runtime.

     This feature is implemented via runtime patching (a new variant of
     static calls).

     The scope of the runtime patching can be best reviewed by looking
     at the sched_dynamic_update() function in kernel/sched/core.c.

     ( Note that the dynamic none/voluntary modes aren't 100% identical
       to the statically built ones: for example, preemptible RCU is
       available in all cases, and the preempt count is maintained in
       all models, which has some runtime overhead even with the code
       patching. )

     The PREEMPT_VOLUNTARY/PREEMPT_NONE models, used by the vast
     majority of distributions, are supposed to be unaffected.
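
     A minimal sketch of the static-call pattern this is built on (the
     names below are hypothetical illustrations, not the kernel's; the
     real retargeting logic lives in sched_dynamic_update() in
     kernel/sched/core.c):

       #include <linux/static_call.h>
       #include <linux/init.h>
       #include <linux/string.h>

       static void preempt_hook_full(void) { /* full preemption path */ }
       static void preempt_hook_none(void) { /* no voluntary preemption */ }

       /* Default target corresponds to "preempt=full". */
       DEFINE_STATIC_CALL(my_preempt_hook, preempt_hook_full);

       /* Callers go through the static call: with inline static calls
        * this becomes a patched direct call, not an indirect branch. */
       static void maybe_preempt(void)
       {
               static_call(my_preempt_hook)();
       }

       /* The preempt= boot parameter retargets every call site once, at
        * boot, instead of branching on a mode variable at runtime. */
       static int __init my_preempt_setup(char *str)
       {
               if (str && (!strcmp(str, "none") || !strcmp(str, "voluntary")))
                       static_call_update(my_preempt_hook, preempt_hook_none);
               return 0;
       }
       early_param("preempt", my_preempt_setup);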

   - Fix ignored rescheduling after rcu_eqs_enter(). This is a bug that
     was found via rcutorture triggering a hang. The bug is that
     rcu_idle_enter() may wake up a NOCB kthread, but this happens after
     the last generic need_resched() check. Some cpuidle drivers fix it
     by chance but many others don't.

     In true 2020 fashion the original bug fix has grown into a 5-patch
     scheduler/RCU fix series plus another 16 RCU patches to address the
     underlying issue of missed preemption events. These are the initial
     fixes that should fix current incarnations of the bug.
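
     The fallback pattern used by the fix, condensed and with hypothetical
     names (the real rcu_irq_work_resched() is visible in the
     kernel/rcu/tree.c hunk further down): if a deferred rcuog wakeup made
     need_resched() true after the last rescheduling point, queue a
     self-targeted irq_work so the IRQ tail reschedules the CPU once
     interrupts are re-enabled on user/guest resume.

       #include <linux/irq_work.h>
       #include <linux/percpu.h>
       #include <linux/sched.h>

       /* The handler can stay empty: the reschedule happens on IRQ exit. */
       static void late_wakeup_func(struct irq_work *work)
       {
       }

       static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
               IRQ_WORK_INIT(late_wakeup_func);

       static void force_late_resched(bool did_deferred_wakeup)
       {
               if (did_deferred_wakeup && need_resched())
                       irq_work_queue(this_cpu_ptr(&late_wakeup_work));
       }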

   - Clean up rbtree usage in the scheduler, by providing & using the
     following consistent set of rbtree APIs:

       partial-order; less() based:
         - rb_add(): add a new entry to the rbtree
         - rb_add_cached(): like rb_add(), but for a rb_root_cached

       total-order; cmp() based:
         - rb_find(): find an entry in an rbtree
         - rb_find_add(): find an entry, and add if not found

         - rb_find_first(): find the first (leftmost) matching entry
         - rb_next_match(): continue from rb_find_first()
         - rb_for_each(): iterate a sub-tree using the previous two
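
     These helpers replace open-coded rb_link_node()/rb_insert_color()
     walks with a caller-supplied less()/cmp() callback; the rtmutex and
     perf conversions in the diffs below show real uses. A condensed,
     hypothetical sketch (struct and callbacks invented for illustration):

       #include <linux/types.h>
       #include <linux/rbtree.h>

       struct item {
               u64 key;
               struct rb_node node;
       };

       #define __node_2_item(n) rb_entry((n), struct item, node)

       /* rb_add() only needs a partial order: less(). */
       static inline bool item_less(struct rb_node *a, const struct rb_node *b)
       {
               return __node_2_item(a)->key < __node_2_item(b)->key;
       }

       /* rb_find() needs a total order against a search key: cmp(). */
       static inline int item_cmp(const void *key, const struct rb_node *n)
       {
               const u64 *k = key;

               if (*k < __node_2_item(n)->key)
                       return -1;
               if (*k > __node_2_item(n)->key)
                       return 1;
               return 0;
       }

       static void item_insert(struct rb_root *root, struct item *it)
       {
               rb_add(&it->node, root, item_less);
       }

       static struct item *item_lookup(struct rb_root *root, u64 key)
       {
               struct rb_node *n = rb_find(&key, root, item_cmp);

               return n ? __node_2_item(n) : NULL;
       }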

   - Improve the SMP/NUMA load-balancer: scan for an idle sibling in a
     single pass. This is a 4-commit series where each commit improves
     one aspect of the idle sibling scan logic.

   - Improve the cpufreq cooling driver by getting the effective CPU
     utilization metrics from the scheduler

   - Improve the fair scheduler's active load-balancing logic by
     reducing the number of active LB attempts & lengthening the
     load-balancing interval. This improves stress-ng mmapfork
     performance.

   - Fix a CFS estimated-utilization (util_est) calculation bug that can
     result in overly high utilization values

  Misc updates & fixes:

   - Fix the HRTICK reprogramming & optimization feature

   - Fix SCHED_SOFTIRQ raising race & warning in the CPU offlining code

   - Reduce dl_add_task_root_domain() overhead

   - Fix uprobes refcount bug

   - Process pending softirqs in flush_smp_call_function_from_idle()

   - Clean up task priority related defines, remove *USER_*PRIO and
     USER_PRIO()

   - Simplify the sched_init_numa() deduplication sort

   - Documentation updates

   - Fix an EAS bug in update_misfit_status() that degraded the quality
     of energy balancing

   - Smaller cleanups"

* tag 'sched-core-2021-02-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  sched,x86: Allow !PREEMPT_DYNAMIC
  entry/kvm: Explicitly flush pending rcuog wakeup before last rescheduling point
  entry: Explicitly flush pending rcuog wakeup before last rescheduling point
  rcu/nocb: Trigger self-IPI on late deferred wake up before user resume
  rcu/nocb: Perform deferred wake up before last idle's need_resched() check
  rcu: Pull deferred rcuog wake up to rcu_eqs_enter() callers
  sched/features: Distinguish between NORMAL and DEADLINE hrtick
  sched/features: Fix hrtick reprogramming
  sched/deadline: Reduce rq lock contention in dl_add_task_root_domain()
  uprobes: (Re)add missing get_uprobe() in __find_uprobe()
  smp: Process pending softirqs in flush_smp_call_function_from_idle()
  sched: Harden PREEMPT_DYNAMIC
  static_call: Allow module use without exposing static_call_key
  sched: Add /debug/sched_preempt
  preempt/dynamic: Support dynamic preempt with preempt= boot option
  preempt/dynamic: Provide irqentry_exit_cond_resched() static call
  preempt/dynamic: Provide preempt_schedule[_notrace]() static calls
  preempt/dynamic: Provide cond_resched() and might_resched() static calls
  preempt: Introduce CONFIG_PREEMPT_DYNAMIC
  static_call: Provide DEFINE_STATIC_CALL_RET0()
  ...

13 files changed:
Documentation/admin-guide/kernel-parameters.txt
arch/Kconfig
arch/powerpc/platforms/cell/spufs/sched.c
arch/x86/Kconfig
include/asm-generic/vmlinux.lds.h
include/linux/rcupdate.h
init/Kconfig
kernel/events/core.c
kernel/locking/rtmutex.c
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_plugin.h
kernel/sched/core.c

diff --combined Documentation/admin-guide/kernel-parameters.txt
index 36d6ce7cc88688bd33b8f857cc3ace757016054b,78ab29400dd3603e0b0d361ffe9fb195ab380740..b93aaa374266ffa2d7aff66538e1eb3cca86e847
                        insecure, please do not use on production kernels.
  
        debug_locks_verbose=
 -                      [KNL] verbose self-tests
 -                      Format=<0|1>
 +                      [KNL] verbose locking self-tests
 +                      Format: <int>
                        Print debugging info while doing the locking API
                        self-tests.
 -                      We default to 0 (no extra messages), setting it to
 -                      1 will print _a lot_ more information - normally
 -                      only useful to kernel developers.
 +                      Bitmask for the various LOCKTYPE_ tests. Defaults to 0
 +                      (no extra messages), setting it to -1 (all bits set)
 +                      will print _a_lot_ more information - normally only
 +                      useful to lockdep developers.
  
        debug_objects   [KNL] Enable object debugging
  
                        For example, to override I2C bus2:
                        omap_mux=i2c2_scl.i2c2_scl=0x100,i2c2_sda.i2c2_sda=0x100
  
 -      oprofile.timer= [HW]
 -                      Use timer interrupt instead of performance counters
 -
 -      oprofile.cpu_type=      Force an oprofile cpu type
 -                      This might be useful if you have an older oprofile
 -                      userland or if you want common events.
 -                      Format: { arch_perfmon }
 -                      arch_perfmon: [X86] Force use of architectural
 -                              perfmon on Intel CPUs instead of the
 -                              CPU specific event set.
 -                      timer: [X86] Force use of architectural NMI
 -                              timer mode (see also oprofile.timer
 -                              for generic hr timer mode)
 -
        oops=panic      Always panic on oopses. Default is to just kill the
                        process, but there is a small probability of
                        deadlocking the machine.
                        Format: {"off"}
                        Disable Hardware Transactional Memory
  
+       preempt=        [KNL]
+                       Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+                       none - Limited to cond_resched() calls
+                       voluntary - Limited to cond_resched() and might_sleep() calls
+                       full - Any section that isn't explicitly preempt disabled
+                              can be preempted anytime.
        print-fatal-signals=
                        [KNL] debug: print fatal signals
  
                        value, meaning that RCU_SOFTIRQ is used by default.
                        Specify rcutree.use_softirq=0 to use rcuc kthreads.
  
 +                      But note that CONFIG_PREEMPT_RT=y kernels disable
 +                      this kernel boot parameter, forcibly setting it
 +                      to zero.
 +
        rcutree.rcu_fanout_exact= [KNL]
                        Disable autobalancing of the rcu_node combining
                        tree.  This is used by rcutorture, and might
                        Set wakeup interval for idle CPUs that have
                        RCU callbacks (RCU_FAST_NO_HZ=y).
  
 -      rcutree.rcu_idle_lazy_gp_delay= [KNL]
 -                      Set wakeup interval for idle CPUs that have
 -                      only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
 -                      Lazy RCU callbacks are those which RCU can
 -                      prove do nothing more than free memory.
 -
        rcutree.rcu_kick_kthreads= [KNL]
                        Cause the grace-period kthread to get an extra
                        wake_up() if it sleeps three times longer than
                        stress RCU, they don't participate in the actual
                        test, hence the "fake".
  
 +      rcutorture.nocbs_nthreads= [KNL]
 +                      Set number of RCU callback-offload togglers.
 +                      Zero (the default) disables toggling.
 +
 +      rcutorture.nocbs_toggle= [KNL]
 +                      Set the delay in milliseconds between successive
 +                      callback-offload toggling attempts.
 +
        rcutorture.nreaders= [KNL]
                        Set number of RCU readers.  The value -1 selects
                        N-1, where N is the number of CPUs.  A value
                        only normal grace-period primitives.  No effect
                        on CONFIG_TINY_RCU kernels.
  
 +                      But note that CONFIG_PREEMPT_RT=y kernels enables
 +                      this kernel boot parameter, forcibly setting
 +                      it to the value one, that is, converting any
 +                      post-boot attempt at an expedited RCU grace
 +                      period to instead use normal non-expedited
 +                      grace-period processing.
 +
        rcupdate.rcu_task_ipi_delay= [KNL]
                        Set time in jiffies during which RCU tasks will
                        avoid sending IPIs, starting with the beginning
        refscale.verbose= [KNL]
                        Enable additional printk() statements.
  
 +      refscale.verbose_batched= [KNL]
 +                      Batch the additional printk() statements.  If zero
 +                      (the default) or negative, print everything.  Otherwise,
 +                      print every Nth verbose statement, where N is the value
 +                      specified.
 +
        relax_domain_level=
                        [KNL, SMP] Set scheduler's default relax_domain_level.
                        See Documentation/admin-guide/cgroup-v1/cpusets.rst.
                        are running concurrently, especially on systems
                        with rotating-rust storage.
  
 +      torture.verbose_sleep_frequency= [KNL]
 +                      Specifies how many verbose printk()s should be
 +                      emitted between each sleep.  The default of zero
 +                      disables verbose-printk() sleeping.
 +
 +      torture.verbose_sleep_duration= [KNL]
 +                      Duration of each verbose-printk() sleep in jiffies.
 +
        tp720=          [HW,PS2]
  
        tpm_suspend_pcr=[HW,TPM]
diff --combined arch/Kconfig
index 87608c2fa02771014e73021c34b1e9d79221f4a0,12450790782494de476f18e806118085ccac9289..4790a5f23d9fd15caf1fff15fba45df539234697
@@@ -33,6 -33,38 +33,6 @@@ config HOTPLUG_SM
  config GENERIC_ENTRY
         bool
  
 -config OPROFILE
 -      tristate "OProfile system profiling"
 -      depends on PROFILING
 -      depends on HAVE_OPROFILE
 -      select RING_BUFFER
 -      select RING_BUFFER_ALLOW_SWAP
 -      help
 -        OProfile is a profiling system capable of profiling the
 -        whole system, include the kernel, kernel modules, libraries,
 -        and applications.
 -
 -        If unsure, say N.
 -
 -config OPROFILE_EVENT_MULTIPLEX
 -      bool "OProfile multiplexing support (EXPERIMENTAL)"
 -      default n
 -      depends on OPROFILE && X86
 -      help
 -        The number of hardware counters is limited. The multiplexing
 -        feature enables OProfile to gather more events than counters
 -        are provided by the hardware. This is realized by switching
 -        between events at a user specified time interval.
 -
 -        If unsure, say N.
 -
 -config HAVE_OPROFILE
 -      bool
 -
 -config OPROFILE_NMI_TIMER
 -      def_bool y
 -      depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !PPC64
 -
  config KPROBES
        bool "Kprobes"
        depends on MODULES
@@@ -1058,6 -1090,15 +1058,15 @@@ config HAVE_STATIC_CALL_INLIN
        bool
        depends on HAVE_STATIC_CALL
  
+ config HAVE_PREEMPT_DYNAMIC
+       bool
+       depends on HAVE_STATIC_CALL
+       depends on GENERIC_ENTRY
+       help
+          Select this if the architecture support boot time preempt setting
+          on top of static calls. It is strongly advised to support inline
+          static call to avoid any overhead.
  config ARCH_WANT_LD_ORPHAN_WARN
        bool
        help
@@@ -1079,9 -1120,6 +1088,9 @@@ config ARCH_SPLIT_ARG6
           If a 32-bit architecture requires 64-bit arguments to be split into
           pairs of 32-bit arguments, select this option.
  
 +config ARCH_HAS_ELFCORE_COMPAT
 +      bool
 +
  source "kernel/gcov/Kconfig"
  
  source "scripts/gcc-plugins/Kconfig"
diff --combined arch/powerpc/platforms/cell/spufs/sched.c
index 9d06fffb1526c9696b61056d8c596693b5caf37b,aeb7f3922106a033e1063731bd3f8e0f1e0c262b..369206489895a28cb15ab329aa7f11d1eb0610ee
@@@ -72,7 -72,7 +72,7 @@@ static struct timer_list spuloadavg_tim
  #define DEF_SPU_TIMESLICE     (100 * HZ / (1000 * SPUSCHED_TICK))
  
  #define SCALE_PRIO(x, prio) \
-       max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
+       max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE)
  
  /*
   * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
@@@ -181,6 -181,9 +181,6 @@@ void do_notify_spus_active(void
  
        /*
         * Wake up the active spu_contexts.
 -       *
 -       * When the awakened processes see their "notify_active" flag is set,
 -       * they will call spu_switch_notify().
         */
        for_each_online_node(node) {
                struct spu *spu;
@@@ -236,6 -239,7 +236,6 @@@ static void spu_bind_context(struct sp
        spu_switch_log_notify(spu, ctx, SWITCH_LOG_START, 0);
        spu_restore(&ctx->csa, spu);
        spu->timestamp = jiffies;
 -      spu_switch_notify(spu, ctx);
        ctx->state = SPU_STATE_RUNNABLE;
  
        spuctx_switch_state(ctx, SPU_UTIL_USER);
@@@ -436,6 -440,7 +436,6 @@@ static void spu_unbind_context(struct s
                 */
                atomic_dec_if_positive(&ctx->gang->aff_sched_count);
  
 -      spu_switch_notify(spu, NULL);
        spu_unmap_mappings(ctx);
        spu_save(&ctx->csa, spu);
        spu_switch_log_notify(spu, ctx, SWITCH_LOG_STOP, 0);
diff --combined arch/x86/Kconfig
index 7b934a591df2bfc91e9723bbe2566e32627bdd65,d3338a87761f107a4880d968b741b6f3ae140678..595193bc2d31e32f39d19dd767310ef51087f1dc
@@@ -32,7 -32,6 +32,7 @@@ config X86_6
        select MODULES_USE_ELF_RELA
        select NEED_DMA_MAP_STATE
        select SWIOTLB
 +      select ARCH_HAS_ELFCORE_COMPAT
  
  config FORCE_DYNAMIC_FTRACE
        def_bool y
@@@ -207,6 -206,7 +207,6 @@@ config X8
        select HAVE_MOVE_PMD
        select HAVE_MOVE_PUD
        select HAVE_NMI
 -      select HAVE_OPROFILE
        select HAVE_OPTPROBES
        select HAVE_PCSPKR_PLATFORM
        select HAVE_PERF_EVENTS
        select HAVE_STACK_VALIDATION            if X86_64
        select HAVE_STATIC_CALL
        select HAVE_STATIC_CALL_INLINE          if HAVE_STACK_VALIDATION
+       select HAVE_PREEMPT_DYNAMIC
        select HAVE_RSEQ
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_UNSTABLE_SCHED_CLOCK
@@@ -890,7 -891,7 +891,7 @@@ config HPET_TIME
  
  config HPET_EMULATE_RTC
        def_bool y
 -      depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
 +      depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
  
  config APB_TIMER
        def_bool y if X86_INTEL_MID
@@@ -1158,6 -1159,10 +1159,6 @@@ config X86_MCE_INJEC
          If you don't know what a machine check is and you don't do kernel
          QA it is safe to say n.
  
 -config X86_THERMAL_VECTOR
 -      def_bool y
 -      depends on X86_MCE_INTEL
 -
  source "arch/x86/events/Kconfig"
  
  config X86_LEGACY_VM86
@@@ -2860,6 -2865,7 +2861,6 @@@ config IA32_EMULATIO
        depends on X86_64
        select ARCH_WANT_OLD_COMPAT_IPC
        select BINFMT_ELF
 -      select COMPAT_BINFMT_ELF
        select COMPAT_OLD_SIGACTION
        help
          Include code to run legacy 32-bit programs under a
diff --combined include/asm-generic/vmlinux.lds.h
index 52dbd58f6810a0d5fca88c74c6d18e8e25c59c90,3f747de1934d77b3cf45091fae87ba0d78128315..a54e08d77789a2535a7002f61de6ea802fde4452
  #define THERMAL_TABLE(name)
  #endif
  
 +#ifdef CONFIG_DTPM
 +#define DTPM_TABLE()                                                  \
 +      . = ALIGN(8);                                                   \
 +      __dtpm_table = .;                                               \
 +      KEEP(*(__dtpm_table))                                           \
 +      __dtpm_table_end = .;
 +#else
 +#define DTPM_TABLE()
 +#endif
 +
  #define KERNEL_DTB()                                                  \
        STRUCT_ALIGN();                                                 \
        __dtb_start = .;                                                \
        . = ALIGN(8);                                                   \
        __start_static_call_sites = .;                                  \
        KEEP(*(.static_call_sites))                                     \
-       __stop_static_call_sites = .;
+       __stop_static_call_sites = .;                                   \
+       __start_static_call_tramp_key = .;                              \
+       KEEP(*(.static_call_tramp_key))                                 \
+       __stop_static_call_tramp_key = .;
  
  /*
   * Allow architectures to handle ro_after_init data on their
        ACPI_PROBE_TABLE(irqchip)                                       \
        ACPI_PROBE_TABLE(timer)                                         \
        THERMAL_TABLE(governor)                                         \
 +      DTPM_TABLE()                                                    \
        EARLYCON_TABLE()                                                \
        LSM_TABLE()                                                     \
        EARLY_LSM_TABLE()                                               \
diff --combined include/linux/rcupdate.h
index ebd8dcca4997d2134ceae47ab9d2b8e1764d0453,36c2119de7022f0cd63f231e54a65bd9a119e79c..bd04f722714f65dea4791076d3f63feb5e16e3c9
@@@ -33,8 -33,6 +33,8 @@@
  #define ULONG_CMP_GE(a, b)    (ULONG_MAX / 2 >= (a) - (b))
  #define ULONG_CMP_LT(a, b)    (ULONG_MAX / 2 < (a) - (b))
  #define ulong2long(a)         (*(long *)(&(a)))
 +#define USHORT_CMP_GE(a, b)   (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
 +#define USHORT_CMP_LT(a, b)   (USHRT_MAX / 2 < (unsigned short)((a) - (b)))
  
  /* Exported common interfaces */
  void call_rcu(struct rcu_head *head, rcu_callback_t func);
@@@ -112,12 -110,10 +112,14 @@@ static inline void rcu_user_exit(void) 
  
  #ifdef CONFIG_RCU_NOCB_CPU
  void rcu_init_nohz(void);
 +int rcu_nocb_cpu_offload(int cpu);
 +int rcu_nocb_cpu_deoffload(int cpu);
+ void rcu_nocb_flush_deferred_wakeup(void);
  #else /* #ifdef CONFIG_RCU_NOCB_CPU */
  static inline void rcu_init_nohz(void) { }
 +static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
 +static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
+ static inline void rcu_nocb_flush_deferred_wakeup(void) { }
  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
  
  /**
@@@ -852,11 -848,19 +854,11 @@@ static inline notrace void rcu_read_unl
   */
  #define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
  
 -/*
 - * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
 - */
 -#define __kvfree_rcu(head, offset) \
 -      do { \
 -              BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
 -              kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
 -      } while (0)
 -
  /**
   * kfree_rcu() - kfree an object after a grace period.
 - * @ptr:      pointer to kfree
 - * @rhf:      the name of the struct rcu_head within the type of @ptr.
 + * @ptr: pointer to kfree for both single- and double-argument invocations.
 + * @rhf: the name of the struct rcu_head within the type of @ptr,
 + *       but only for double-argument invocations.
   *
   * Many rcu callbacks functions just call kfree() on the base structure.
   * These functions are trivial, but their size adds up, and furthermore
   * Because the functions are not allowed in the low-order 4096 bytes of
   * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
   * If the offset is larger than 4095 bytes, a compile-time error will
 - * be generated in __kvfree_rcu(). If this error is triggered, you can
 + * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
   * either fall back to use of call_rcu() or rearrange the structure to
   * position the rcu_head structure into the first 4096 bytes.
   *
   * The BUILD_BUG_ON check must not involve any function calls, hence the
   * checks are done in macros here.
   */
 -#define kfree_rcu(ptr, rhf)                                           \
 -do {                                                                  \
 -      typeof (ptr) ___p = (ptr);                                      \
 -                                                                      \
 -      if (___p)                                                       \
 -              __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
 -} while (0)
 +#define kfree_rcu kvfree_rcu
  
  /**
   * kvfree_rcu() - kvfree an object after a grace period.
        kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
  
  #define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
 -#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
 +#define kvfree_rcu_arg_2(ptr, rhf)                                    \
 +do {                                                                  \
 +      typeof (ptr) ___p = (ptr);                                      \
 +                                                                      \
 +      if (___p) {                                                                     \
 +              BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf)));   \
 +              kvfree_call_rcu(&((___p)->rhf), (rcu_callback_t)(unsigned long)         \
 +                      (offsetof(typeof(*(ptr)), rhf)));                               \
 +      }                                                                               \
 +} while (0)
 +
  #define kvfree_rcu_arg_1(ptr)                                 \
  do {                                                          \
        typeof(ptr) ___p = (ptr);                               \
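
A brief usage sketch of the reworked kfree_rcu()/kvfree_rcu() interface
described in the kernel-doc above (struct and function names here are
hypothetical; the rcu_head must still sit within the first 4096 bytes of
the object so the offset-encoding check can pass at build time):

  #include <linux/slab.h>
  #include <linux/rcupdate.h>

  struct foo {
          int a;
          struct rcu_head rcu;    /* offset checked via BUILD_BUG_ON() */
  };

  static void foo_release(struct foo *p)
  {
          /* Double-argument form: frees p after a grace period. */
          kfree_rcu(p, rcu);
  }
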
diff --combined init/Kconfig
index 17e955fdec97fd1dd6c5c52808d65b91b2f453c7,a1046963de2746a1e8d19e6e1ca34ddcce6bd837..096e1af5c5865eafe438b1c0673f7f7ad46f1711
@@@ -524,7 -524,7 +524,7 @@@ config SCHED_THERMAL_PRESSUR
          i.e. put less load on throttled CPUs than on non/less throttled ones.
  
          This requires the architecture to implement
-         arch_set_thermal_pressure() and arch_get_thermal_pressure().
+         arch_set_thermal_pressure() and arch_scale_thermal_pressure().
  
  config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
@@@ -2023,7 -2023,7 +2023,7 @@@ config PROFILIN
        bool "Profiling support"
        help
          Say Y here to enable the extended profiling support mechanisms used
 -        by profilers such as OProfile.
 +        by profilers.
  
  #
  # Place an empty function call at each tracepoint site. Can be
diff --combined kernel/events/core.c
index c37401e3e5f7326b2dbbe1762f5150d0bc28d6e2,3d890961f6e5451caf7fc99c16b818fa75f8c147..5fe7d63467629e9b5b57e033257fb8ee81a3ca35
@@@ -53,7 -53,6 +53,7 @@@
  #include <linux/min_heap.h>
  #include <linux/highmem.h>
  #include <linux/pgtable.h>
 +#include <linux/buildid.h>
  
  #include "internal.h"
  
@@@ -398,7 -397,6 +398,7 @@@ static atomic_t nr_ksymbol_events __rea
  static atomic_t nr_bpf_events __read_mostly;
  static atomic_t nr_cgroup_events __read_mostly;
  static atomic_t nr_text_poke_events __read_mostly;
 +static atomic_t nr_build_id_events __read_mostly;
  
  static LIST_HEAD(pmus);
  static DEFINE_MUTEX(pmus_lock);
@@@ -1597,50 -1595,91 +1597,91 @@@ static void perf_event_groups_init(stru
        groups->index = 0;
  }
  
+ static inline struct cgroup *event_cgroup(const struct perf_event *event)
+ {
+       struct cgroup *cgroup = NULL;
+ #ifdef CONFIG_CGROUP_PERF
+       if (event->cgrp)
+               cgroup = event->cgrp->css.cgroup;
+ #endif
+       return cgroup;
+ }
  /*
   * Compare function for event groups;
   *
   * Implements complex key that first sorts by CPU and then by virtual index
   * which provides ordering when rotating groups for the same CPU.
   */
- static bool
- perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+ static __always_inline int
+ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
+                     const u64 left_group_index, const struct perf_event *right)
  {
-       if (left->cpu < right->cpu)
-               return true;
-       if (left->cpu > right->cpu)
-               return false;
+       if (left_cpu < right->cpu)
+               return -1;
+       if (left_cpu > right->cpu)
+               return 1;
  
  #ifdef CONFIG_CGROUP_PERF
-       if (left->cgrp != right->cgrp) {
-               if (!left->cgrp || !left->cgrp->css.cgroup) {
-                       /*
-                        * Left has no cgroup but right does, no cgroups come
-                        * first.
-                        */
-                       return true;
+       {
+               const struct cgroup *right_cgroup = event_cgroup(right);
+               if (left_cgroup != right_cgroup) {
+                       if (!left_cgroup) {
+                               /*
+                                * Left has no cgroup but right does, no
+                                * cgroups come first.
+                                */
+                               return -1;
+                       }
+                       if (!right_cgroup) {
+                               /*
+                                * Right has no cgroup but left does, no
+                                * cgroups come first.
+                                */
+                               return 1;
+                       }
+                       /* Two dissimilar cgroups, order by id. */
+                       if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
+                               return -1;
+                       return 1;
                }
-               if (!right->cgrp || !right->cgrp->css.cgroup) {
-                       /*
-                        * Right has no cgroup but left does, no cgroups come
-                        * first.
-                        */
-                       return false;
-               }
-               /* Two dissimilar cgroups, order by id. */
-               if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
-                       return true;
-               return false;
        }
  #endif
  
-       if (left->group_index < right->group_index)
-               return true;
-       if (left->group_index > right->group_index)
-               return false;
+       if (left_group_index < right->group_index)
+               return -1;
+       if (left_group_index > right->group_index)
+               return 1;
+       return 0;
+ }
  
-       return false;
+ #define __node_2_pe(node) \
+       rb_entry((node), struct perf_event, group_node)
+ static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
+ {
+       struct perf_event *e = __node_2_pe(a);
+       return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
+                                    __node_2_pe(b)) < 0;
+ }
+ struct __group_key {
+       int cpu;
+       struct cgroup *cgroup;
+ };
+ static inline int __group_cmp(const void *key, const struct rb_node *node)
+ {
+       const struct __group_key *a = key;
+       const struct perf_event *b = __node_2_pe(node);
+       /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
+       return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
  }
  
  /*
@@@ -1652,27 -1691,9 +1693,9 @@@ static voi
  perf_event_groups_insert(struct perf_event_groups *groups,
                         struct perf_event *event)
  {
-       struct perf_event *node_event;
-       struct rb_node *parent;
-       struct rb_node **node;
        event->group_index = ++groups->index;
  
-       node = &groups->tree.rb_node;
-       parent = *node;
-       while (*node) {
-               parent = *node;
-               node_event = container_of(*node, struct perf_event, group_node);
-               if (perf_event_groups_less(event, node_event))
-                       node = &parent->rb_left;
-               else
-                       node = &parent->rb_right;
-       }
-       rb_link_node(&event->group_node, parent, node);
-       rb_insert_color(&event->group_node, &groups->tree);
+       rb_add(&event->group_node, &groups->tree, __group_less);
  }
  
  /*
@@@ -1720,45 -1741,17 +1743,17 @@@ static struct perf_event 
  perf_event_groups_first(struct perf_event_groups *groups, int cpu,
                        struct cgroup *cgrp)
  {
-       struct perf_event *node_event = NULL, *match = NULL;
-       struct rb_node *node = groups->tree.rb_node;
- #ifdef CONFIG_CGROUP_PERF
-       u64 node_cgrp_id, cgrp_id = 0;
-       if (cgrp)
-               cgrp_id = cgrp->kn->id;
- #endif
-       while (node) {
-               node_event = container_of(node, struct perf_event, group_node);
-               if (cpu < node_event->cpu) {
-                       node = node->rb_left;
-                       continue;
-               }
-               if (cpu > node_event->cpu) {
-                       node = node->rb_right;
-                       continue;
-               }
- #ifdef CONFIG_CGROUP_PERF
-               node_cgrp_id = 0;
-               if (node_event->cgrp && node_event->cgrp->css.cgroup)
-                       node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+       struct __group_key key = {
+               .cpu = cpu,
+               .cgroup = cgrp,
+       };
+       struct rb_node *node;
  
-               if (cgrp_id < node_cgrp_id) {
-                       node = node->rb_left;
-                       continue;
-               }
-               if (cgrp_id > node_cgrp_id) {
-                       node = node->rb_right;
-                       continue;
-               }
- #endif
-               match = node_event;
-               node = node->rb_left;
-       }
+       node = rb_find_first(&key, &groups->tree, __group_cmp);
+       if (node)
+               return __node_2_pe(node);
  
-       return match;
+       return NULL;
  }
  
  /*
  static struct perf_event *
  perf_event_groups_next(struct perf_event *event)
  {
-       struct perf_event *next;
- #ifdef CONFIG_CGROUP_PERF
-       u64 curr_cgrp_id = 0;
-       u64 next_cgrp_id = 0;
- #endif
-       next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
-       if (next == NULL || next->cpu != event->cpu)
-               return NULL;
- #ifdef CONFIG_CGROUP_PERF
-       if (event->cgrp && event->cgrp->css.cgroup)
-               curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+       struct __group_key key = {
+               .cpu = event->cpu,
+               .cgroup = event_cgroup(event),
+       };
+       struct rb_node *next;
  
-       if (next->cgrp && next->cgrp->css.cgroup)
-               next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+       next = rb_next_match(&key, &event->group_node, __group_cmp);
+       if (next)
+               return __node_2_pe(next);
  
-       if (curr_cgrp_id != next_cgrp_id)
-               return NULL;
- #endif
-       return next;
+       return NULL;
  }
  
  /*
@@@ -4675,8 -4658,6 +4660,8 @@@ static void unaccount_event(struct perf
                dec = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);
 +      if (event->attr.build_id)
 +              atomic_dec(&nr_build_id_events);
        if (event->attr.comm)
                atomic_dec(&nr_comm_events);
        if (event->attr.namespaces)
@@@ -8050,8 -8031,6 +8035,8 @@@ struct perf_mmap_event 
        u64                     ino;
        u64                     ino_generation;
        u32                     prot, flags;
 +      u8                      build_id[BUILD_ID_SIZE_MAX];
 +      u32                     build_id_size;
  
        struct {
                struct perf_event_header        header;
@@@ -8083,7 -8062,6 +8068,7 @@@ static void perf_event_mmap_output(stru
        struct perf_sample_data sample;
        int size = mmap_event->event_id.header.size;
        u32 type = mmap_event->event_id.header.type;
 +      bool use_build_id;
        int ret;
  
        if (!perf_event_mmap_match(event, data))
        mmap_event->event_id.pid = perf_event_pid(event, current);
        mmap_event->event_id.tid = perf_event_tid(event, current);
  
 +      use_build_id = event->attr.build_id && mmap_event->build_id_size;
 +
 +      if (event->attr.mmap2 && use_build_id)
 +              mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
 +
        perf_output_put(&handle, mmap_event->event_id);
  
        if (event->attr.mmap2) {
 -              perf_output_put(&handle, mmap_event->maj);
 -              perf_output_put(&handle, mmap_event->min);
 -              perf_output_put(&handle, mmap_event->ino);
 -              perf_output_put(&handle, mmap_event->ino_generation);
 +              if (use_build_id) {
 +                      u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
 +
 +                      __output_copy(&handle, size, 4);
 +                      __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
 +              } else {
 +                      perf_output_put(&handle, mmap_event->maj);
 +                      perf_output_put(&handle, mmap_event->min);
 +                      perf_output_put(&handle, mmap_event->ino);
 +                      perf_output_put(&handle, mmap_event->ino_generation);
 +              }
                perf_output_put(&handle, mmap_event->prot);
                perf_output_put(&handle, mmap_event->flags);
        }
@@@ -8255,9 -8221,6 +8240,9 @@@ got_name
  
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
  
 +      if (atomic_read(&nr_build_id_events))
 +              build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
 +
        perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);
@@@ -11194,8 -11157,6 +11179,8 @@@ static void account_event(struct perf_e
                inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
 +      if (event->attr.build_id)
 +              atomic_inc(&nr_build_id_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
        if (event->attr.namespaces)
diff --combined kernel/locking/rtmutex.c
index 47a6e0b8073d1d47e544230313a7da428d62ec67,57e380453bf963141d4f68e4756392a0466b7183..03b21135313cbefd19d3751aa878db61dfc1e5a6
@@@ -267,27 -267,18 +267,18 @@@ rt_mutex_waiter_equal(struct rt_mutex_w
        return 1;
  }
  
+ #define __node_2_waiter(node) \
+       rb_entry((node), struct rt_mutex_waiter, tree_entry)
+ static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+ {
+       return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b));
+ }
  static void
  rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
  {
-       struct rb_node **link = &lock->waiters.rb_root.rb_node;
-       struct rb_node *parent = NULL;
-       struct rt_mutex_waiter *entry;
-       bool leftmost = true;
-       while (*link) {
-               parent = *link;
-               entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
-               if (rt_mutex_waiter_less(waiter, entry)) {
-                       link = &parent->rb_left;
-               } else {
-                       link = &parent->rb_right;
-                       leftmost = false;
-               }
-       }
-       rb_link_node(&waiter->tree_entry, parent, link);
-       rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
+       rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
  }
  
  static void
@@@ -300,27 -291,18 +291,18 @@@ rt_mutex_dequeue(struct rt_mutex *lock
        RB_CLEAR_NODE(&waiter->tree_entry);
  }
  
+ #define __node_2_pi_waiter(node) \
+       rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+ static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+ {
+       return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+ }
  static void
  rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
  {
-       struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
-       struct rb_node *parent = NULL;
-       struct rt_mutex_waiter *entry;
-       bool leftmost = true;
-       while (*link) {
-               parent = *link;
-               entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
-               if (rt_mutex_waiter_less(waiter, entry)) {
-                       link = &parent->rb_left;
-               } else {
-                       link = &parent->rb_right;
-                       leftmost = false;
-               }
-       }
-       rb_link_node(&waiter->pi_tree_entry, parent, link);
-       rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
+       rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
  }
  
  static void
@@@ -1604,11 -1586,8 +1586,11 @@@ void __sched rt_mutex_unlock(struct rt_
  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
  
  /**
 - * Futex variant, that since futex variants do not use the fast-path, can be
 - * simple and will not need to retry.
 + * __rt_mutex_futex_unlock - Futex variant, that since futex variants
 + * do not use the fast-path, can be simple and will not need to retry.
 + *
 + * @lock:     The rt_mutex to be unlocked
 + * @wake_q:   The wake queue head from which to get the next lock waiter
   */
  bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
                                    struct wake_q_head *wake_q)
@@@ -1665,15 -1644,13 +1647,15 @@@ void rt_mutex_destroy(struct rt_mutex *
  EXPORT_SYMBOL_GPL(rt_mutex_destroy);
  
  /**
 - * __rt_mutex_init - initialize the rt lock
 + * __rt_mutex_init - initialize the rt_mutex
   *
 - * @lock: the rt lock to be initialized
 + * @lock:     The rt_mutex to be initialized
 + * @name:     The lock name used for debugging
 + * @key:      The lock class key used for debugging
   *
 - * Initialize the rt lock to unlocked state.
 + * Initialize the rt_mutex to unlocked state.
   *
 - * Initializing of a locked rt lock is not allowed
 + * Initializing of a locked rt_mutex is not allowed
   */
  void __rt_mutex_init(struct rt_mutex *lock, const char *name,
                     struct lock_class_key *key)
diff --combined kernel/rcu/tree.c
index 0f4a6a3c057b0120be8ff35f3f40be6bde7fa3aa,ce17b8477442fcbfbb6866a853b5726f1c5020aa..da6f5213fb74cb119f2acb6da8a3a6dcb6ac84c5
@@@ -83,9 -83,6 +83,9 @@@ static DEFINE_PER_CPU_SHARED_ALIGNED(st
        .dynticks_nesting = 1,
        .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
        .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
 +#ifdef CONFIG_RCU_NOCB_CPU
 +      .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
 +#endif
  };
  static struct rcu_state rcu_state = {
        .level = { &rcu_state.node[0] },
  static bool dump_tree;
  module_param(dump_tree, bool, 0444);
  /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
 -static bool use_softirq = true;
 +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
 +#ifndef CONFIG_PREEMPT_RT
  module_param(use_softirq, bool, 0444);
 +#endif
  /* Control rcu_node-tree auto-balancing at boot time. */
  static bool rcu_fanout_exact;
  module_param(rcu_fanout_exact, bool, 0444);
@@@ -649,7 -644,6 +649,6 @@@ static noinstr void rcu_eqs_enter(bool 
        trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
        rdp = this_cpu_ptr(&rcu_data);
-       do_nocb_deferred_wakeup(rdp);
        rcu_prepare_for_idle();
        rcu_preempt_deferred_qs(current);
  
@@@ -683,6 -677,50 +682,50 @@@ void rcu_idle_enter(void
  EXPORT_SYMBOL_GPL(rcu_idle_enter);
  
  #ifdef CONFIG_NO_HZ_FULL
+ #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
+ /*
+  * An empty function that will trigger a reschedule on
+  * IRQ tail once IRQs get re-enabled on userspace/guest resume.
+  */
+ static void late_wakeup_func(struct irq_work *work)
+ {
+ }
+ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+       IRQ_WORK_INIT(late_wakeup_func);
+ /*
+  * If either:
+  *
+  * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
+  * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
+  *
+  * In these cases the late RCU wake ups aren't supported in the resched loops and our
+  * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+  * get re-enabled again.
+  */
+ noinstr static void rcu_irq_work_resched(void)
+ {
+       struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+       if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
+               return;
+       if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+               return;
+       instrumentation_begin();
+       if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
+               irq_work_queue(this_cpu_ptr(&late_wakeup_work));
+       }
+       instrumentation_end();
+ }
+ #else
+ static inline void rcu_irq_work_resched(void) { }
+ #endif
  /**
   * rcu_user_enter - inform RCU that we are resuming userspace.
   *
  noinstr void rcu_user_enter(void)
  {
        lockdep_assert_irqs_disabled();
+       /*
+        * Other than generic entry implementation, we may be past the last
+        * rescheduling opportunity in the entry code. Trigger a self IPI
+        * that will fire and reschedule once we resume in user/guest mode.
+        */
+       rcu_irq_work_resched();
        rcu_eqs_enter(true);
  }
  #endif /* CONFIG_NO_HZ_FULL */
  
  /**
@@@ -1500,8 -1546,6 +1551,8 @@@ static bool rcu_accelerate_cbs(struct r
        if (!rcu_segcblist_pend_cbs(&rdp->cblist))
                return false;
  
 +      trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
 +
        /*
         * Callbacks are often registered with incomplete grace-period
         * information.  Something about the fact that getting exact
        else
                trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
  
 +      trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
 +
        return ret;
  }
  
@@@ -1774,7 -1816,7 +1825,7 @@@ static bool rcu_gp_init(void
         * go offline later.  Please also refer to "Hotplug CPU" section
         * of RCU's Requirements documentation.
         */
 -      rcu_state.gp_state = RCU_GP_ONOFF;
 +      WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
        rcu_for_each_leaf_node(rnp) {
                smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
                firstseq = READ_ONCE(rnp->ofl_seq);
         * The grace period cannot complete until the initialization
         * process finishes, because this kthread handles both.
         */
 -      rcu_state.gp_state = RCU_GP_INIT;
 +      WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
        rcu_for_each_node_breadth_first(rnp) {
                rcu_gp_slow(gp_init_delay);
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@@ -1939,22 -1981,17 +1990,22 @@@ static void rcu_gp_fqs_loop(void
        ret = 0;
        for (;;) {
                if (!ret) {
 -                      rcu_state.jiffies_force_qs = jiffies + j;
 +                      WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
 +                      /*
 +                       * jiffies_force_qs before RCU_GP_WAIT_FQS state
 +                       * update; required for stall checks.
 +                       */
 +                      smp_wmb();
                        WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
                                   jiffies + (j ? 3 * j : 2));
                }
                trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
                                       TPS("fqswait"));
 -              rcu_state.gp_state = RCU_GP_WAIT_FQS;
 +              WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
                ret = swait_event_idle_timeout_exclusive(
                                rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
                rcu_gp_torture_wait();
 -              rcu_state.gp_state = RCU_GP_DOING_FQS;
 +              WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
                /* Locking provides needed memory barriers. */
                /* If grace period done, leave loop. */
                if (!READ_ONCE(rnp->qsmask) &&
@@@ -2068,7 -2105,7 +2119,7 @@@ static void rcu_gp_cleanup(void
        trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
        rcu_seq_end(&rcu_state.gp_seq);
        ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
 -      rcu_state.gp_state = RCU_GP_IDLE;
 +      WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
        /* Check for GP requests since above loop. */
        rdp = this_cpu_ptr(&rcu_data);
        if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
@@@ -2107,12 -2144,12 +2158,12 @@@ static int __noreturn rcu_gp_kthread(vo
                for (;;) {
                        trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
                                               TPS("reqwait"));
 -                      rcu_state.gp_state = RCU_GP_WAIT_GPS;
 +                      WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
                        swait_event_idle_exclusive(rcu_state.gp_wq,
                                         READ_ONCE(rcu_state.gp_flags) &
                                         RCU_GP_FLAG_INIT);
                        rcu_gp_torture_wait();
 -                      rcu_state.gp_state = RCU_GP_DONE_GPS;
 +                      WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
                        /* Locking provides needed memory barrier. */
                        if (rcu_gp_init())
                                break;
                rcu_gp_fqs_loop();
  
                /* Handle grace-period end. */
 -              rcu_state.gp_state = RCU_GP_CLEANUP;
 +              WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
                rcu_gp_cleanup();
 -              rcu_state.gp_state = RCU_GP_CLEANED;
 +              WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
        }
  }
  
@@@ -2444,12 -2481,11 +2495,12 @@@ int rcutree_dead_cpu(unsigned int cpu
  static void rcu_do_batch(struct rcu_data *rdp)
  {
        int div;
 +      bool __maybe_unused empty;
        unsigned long flags;
        const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
        struct rcu_head *rhp;
        struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
 -      long bl, count;
 +      long bl, count = 0;
        long pending, tlimit = 0;
  
        /* If no callbacks are ready, just return. */
        rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
        if (offloaded)
                rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
 +
 +      trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
        rcu_nocb_unlock_irqrestore(rdp, flags);
  
        /* Invoke callbacks. */
        tick_dep_set_task(current, TICK_DEP_BIT_RCU);
        rhp = rcu_cblist_dequeue(&rcl);
 +
        for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
                rcu_callback_t f;
  
 +              count++;
                debug_rcu_head_unqueue(rhp);
  
                rcu_lock_acquire(&rcu_callback_map);
  
                /*
                 * Stop only if limit reached and CPU has something to do.
 -               * Note: The rcl structure counts down from zero.
                 */
 -              if (-rcl.len >= bl && !offloaded &&
 +              if (count >= bl && !offloaded &&
                    (need_resched() ||
                     (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
                        break;
                if (unlikely(tlimit)) {
                        /* only call local_clock() every 32 callbacks */
 -                      if (likely((-rcl.len & 31) || local_clock() < tlimit))
 +                      if (likely((count & 31) || local_clock() < tlimit))
                                continue;
                        /* Exceeded the time limit, so leave. */
                        break;
                }
 -              if (offloaded) {
 -                      WARN_ON_ONCE(in_serving_softirq());
 +              if (!in_serving_softirq()) {
                        local_bh_enable();
                        lockdep_assert_irqs_enabled();
                        cond_resched_tasks_rcu_qs();
  
        local_irq_save(flags);
        rcu_nocb_lock(rdp);
 -      count = -rcl.len;
        rdp->n_cbs_invoked += count;
        trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
                            is_idle_task(current), rcu_is_callbacks_kthread());
  
        /* Update counts and requeue any remaining callbacks. */
        rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
 -      smp_mb(); /* List handling before counting for rcu_barrier(). */
 -      rcu_segcblist_insert_count(&rdp->cblist, &rcl);
 +      rcu_segcblist_add_len(&rdp->cblist, -count);
  
        /* Reinstate batch limit if we have worked down the excess. */
        count = rcu_segcblist_n_cbs(&rdp->cblist);
         * The following usually indicates a double call_rcu().  To track
         * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
         */
 -      WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
 +      empty = rcu_segcblist_empty(&rdp->cblist);
 +      WARN_ON_ONCE(count == 0 && !empty);
        WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 -                   count != 0 && rcu_segcblist_empty(&rdp->cblist));
 +                   count != 0 && empty);
 +      WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
 +      WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
  
        rcu_nocb_unlock_irqrestore(rdp, flags);
  
  void rcu_sched_clock_irq(int user)
  {
        trace_rcu_utilization(TPS("Start scheduler-tick"));
 +      lockdep_assert_irqs_disabled();
        raw_cpu_inc(rcu_data.ticks_this_gp);
        /* The load-acquire pairs with the store-release setting to true. */
        if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
        rcu_flavor_sched_clock_irq(user);
        if (rcu_pending(user))
                invoke_rcu_core();
 +      lockdep_assert_irqs_disabled();
  
        trace_rcu_utilization(TPS("End scheduler-tick"));
  }
@@@ -2708,7 -2739,7 +2759,7 @@@ static __latent_entropy void rcu_core(v
        unsigned long flags;
        struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
        struct rcu_node *rnp = rdp->mynode;
 -      const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
 +      const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
  
        if (cpu_is_offline(smp_processor_id()))
                return;
  
        /* No grace period and unregistered callbacks? */
        if (!rcu_gp_in_progress() &&
 -          rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
 -              local_irq_save(flags);
 +          rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
 +              rcu_nocb_lock_irqsave(rdp, flags);
                if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
                        rcu_accelerate_cbs_unlocked(rnp, rdp);
 -              local_irq_restore(flags);
 +              rcu_nocb_unlock_irqrestore(rdp, flags);
        }
  
        rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
  
        /* If there are callbacks ready, invoke them. */
 -      if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
 +      if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
            likely(READ_ONCE(rcu_scheduler_fully_active)))
                rcu_do_batch(rdp);
  
@@@ -2961,7 -2992,6 +3012,7 @@@ static void check_cb_ovld(struct rcu_da
  static void
  __call_rcu(struct rcu_head *head, rcu_callback_t func)
  {
 +      static atomic_t doublefrees;
        unsigned long flags;
        struct rcu_data *rdp;
        bool was_alldone;
                 * Use rcu:rcu_callback trace event to find the previous
                 * time callback was passed to __call_rcu().
                 */
 -              WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
 -                        head, head->func);
 +              if (atomic_inc_return(&doublefrees) < 4) {
 +                      pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
 +                      mem_dump_obj(head);
 +              }
                WRITE_ONCE(head->func, rcu_leak_callback);
                return;
        }
                trace_rcu_callback(rcu_state.name, head,
                                   rcu_segcblist_n_cbs(&rdp->cblist));
  
 +      trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
 +
        /* Go handle any RCU core processing required. */
        if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
                __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
@@@ -3523,7 -3549,6 +3574,7 @@@ void kvfree_call_rcu(struct rcu_head *h
                goto unlock_return;
        }
  
 +      kasan_record_aux_stack(ptr);
        success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
        if (!success) {
                run_page_cache_worker(krcp);
@@@ -3773,8 -3798,6 +3824,8 @@@ static int rcu_pending(int user
        struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
        struct rcu_node *rnp = rdp->mynode;
  
 +      lockdep_assert_irqs_disabled();
 +
        /* Check for CPU stalls, if enabled. */
        check_cpu_stall(rdp);
  
@@@ -4029,18 -4052,12 +4080,18 @@@ int rcutree_prepare_cpu(unsigned int cp
        rdp->qlen_last_fqs_check = 0;
        rdp->n_force_qs_snap = rcu_state.n_force_qs;
        rdp->blimit = blimit;
 -      if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
 -          !rcu_segcblist_is_offloaded(&rdp->cblist))
 -              rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
        rdp->dynticks_nesting = 1;      /* CPU not up, no tearing. */
        rcu_dynticks_eqs_online();
        raw_spin_unlock_rcu_node(rnp);          /* irqs remain disabled. */
 +      /*
 +       * Lock in case the CB/GP kthreads are still around handling
 +       * old callbacks (longer term we should flush all callbacks
 +       * before completing CPU offline)
 +       */
 +      rcu_nocb_lock(rdp);
 +      if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */
 +              rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
 +      rcu_nocb_unlock(rdp);
  
        /*
         * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
@@@ -4193,9 -4210,6 +4244,9 @@@ void rcu_report_dead(unsigned int cpu
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
        struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
  
 +      // Do any dangling deferred wakeups.
 +      do_nocb_deferred_wakeup(rdp);
 +
        /* QS for any half-done expedited grace period. */
        preempt_disable();
        rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
diff --combined kernel/rcu/tree.h
index 5d359b9f9fec404dceb945fda8abff54127b73f2,9226f4021a36dd341aa85597c21f6cf110a6197d..71821d59d95c58beed17f41cb4d62d315fe03a4e
@@@ -201,7 -201,6 +201,7 @@@ struct rcu_data 
        /* 5) Callback offloading. */
  #ifdef CONFIG_RCU_NOCB_CPU
        struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
 +      struct swait_queue_head nocb_state_wq; /* For offloading state changes */
        struct task_struct *nocb_gp_kthread;
        raw_spinlock_t nocb_lock;       /* Guard following pair of fields. */
        atomic_t nocb_lock_contended;   /* Contention experienced. */
  };
  
  /* Values for nocb_defer_wakeup field in struct rcu_data. */
 +#define RCU_NOCB_WAKE_OFF     -1
  #define RCU_NOCB_WAKE_NOT     0
  #define RCU_NOCB_WAKE         1
  #define RCU_NOCB_WAKE_FORCE   2
@@@ -435,7 -433,7 +435,7 @@@ static bool rcu_nocb_try_bypass(struct 
  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
                                 unsigned long flags);
  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
  static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
  static void rcu_spawn_cpu_nocb_kthread(int cpu);
  static void __init rcu_spawn_nocb_kthreads(void);
diff --combined kernel/rcu/tree_plugin.h
index 231a0c6cf03c179580cd1c9a44097c369d20307d,cdc1b7651c0399ac5533651d684aa94a8e7f32c3..2d603771c7dce8164c56adb3a3f0564c0e1d95f8
@@@ -682,7 -682,6 +682,7 @@@ static void rcu_flavor_sched_clock_irq(
  {
        struct task_struct *t = current;
  
 +      lockdep_assert_irqs_disabled();
        if (user || rcu_is_cpu_rrupt_from_idle()) {
                rcu_note_voluntary_context_switch(current);
        }
@@@ -1632,8 -1631,8 +1632,8 @@@ bool rcu_is_nocb_cpu(int cpu
   * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
   * and this function releases it.
   */
- static void wake_nocb_gp(struct rcu_data *rdp, bool force,
-                          unsigned long flags)
+ static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
+                        unsigned long flags)
        __releases(rdp->nocb_lock)
  {
        bool needwake = false;
                trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
                                    TPS("AlreadyAwake"));
                rcu_nocb_unlock_irqrestore(rdp, flags);
-               return;
+               return false;
        }
        del_timer(&rdp->nocb_timer);
        rcu_nocb_unlock_irqrestore(rdp, flags);
        raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
        if (needwake)
                wake_up_process(rdp_gp->nocb_gp_kthread);
+       return needwake;
  }
  
  /*
  static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
                               const char *reason)
  {
 +      if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
 +              return;
        if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
                mod_timer(&rdp->nocb_timer, jiffies + 1);
        if (rdp->nocb_defer_wakeup < waketype)
@@@ -1931,52 -1930,6 +1933,52 @@@ static void do_nocb_bypass_wakeup_timer
        __call_rcu_nocb_wake(rdp, true, flags);
  }
  
 +/*
 + * Check if we ignore this rdp.
 + *
 + * We check this without holding the nocb lock, but
 + * the ordering below makes sure we don't miss a
 + * freshly offloaded rdp:
 + *
 + *  rdp_offload_toggle()        nocb_gp_enabled_cb()
 + * -------------------------   ----------------------------
 + *    WRITE flags                 LOCK nocb_gp_lock
 + *    LOCK nocb_gp_lock           READ/WRITE nocb_gp_sleep
 + *    READ/WRITE nocb_gp_sleep    UNLOCK nocb_gp_lock
 + *    UNLOCK nocb_gp_lock         READ flags
 + */
 +static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
 +{
 +      u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
 +
 +      return rcu_segcblist_test_flags(&rdp->cblist, flags);
 +}
 +
 +static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state)
 +{
 +      struct rcu_segcblist *cblist = &rdp->cblist;
 +
 +      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
 +              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
 +                      rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
 +                      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
 +                              *needwake_state = true;
 +              }
 +              return true;
 +      }
 +
 +      /*
 +       * De-offloading. Clear our flag and notify the de-offload worker.
 + * We will ignore this rdp until it gets re-offloaded.
 +       */
 +      WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
 +      rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
 +      if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
 +              *needwake_state = true;
 +      return false;
 +}
 +
 +
  /*
   * No-CBs GP kthreads come here to wait for additional callbacks to show up
   * or for grace periods to end.
@@@ -2005,18 -1958,8 +2007,18 @@@ static void nocb_gp_wait(struct rcu_dat
         */
        WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
        for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
 +              bool needwake_state = false;
 +
 +              if (!nocb_gp_enabled_cb(rdp))
 +                      continue;
                trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
                rcu_nocb_lock_irqsave(rdp, flags);
 +              if (!nocb_gp_update_state(rdp, &needwake_state)) {
 +                      rcu_nocb_unlock_irqrestore(rdp, flags);
 +                      if (needwake_state)
 +                              swake_up_one(&rdp->nocb_state_wq);
 +                      continue;
 +              }
                bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
                if (bypass_ncbs &&
                    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
                        bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
                } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
                        rcu_nocb_unlock_irqrestore(rdp, flags);
 +                      if (needwake_state)
 +                              swake_up_one(&rdp->nocb_state_wq);
                        continue; /* No callbacks here, try next. */
                }
                if (bypass_ncbs) {
                }
                if (needwake_gp)
                        rcu_gp_kthread_wake();
 +              if (needwake_state)
 +                      swake_up_one(&rdp->nocb_state_wq);
        }
  
        my_rdp->nocb_gp_bypass = bypass;
@@@ -2144,27 -2083,14 +2146,27 @@@ static int rcu_nocb_gp_kthread(void *ar
        return 0;
  }
  
 +static inline bool nocb_cb_can_run(struct rcu_data *rdp)
 +{
 +      u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
 +      return rcu_segcblist_test_flags(&rdp->cblist, flags);
 +}
 +
 +static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
 +{
 +      return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
 +}
 +
  /*
   * Invoke any ready callbacks from the corresponding no-CBs CPU,
   * then, if there are no more, wait for more to appear.
   */
  static void nocb_cb_wait(struct rcu_data *rdp)
  {
 +      struct rcu_segcblist *cblist = &rdp->cblist;
        unsigned long cur_gp_seq;
        unsigned long flags;
 +      bool needwake_state = false;
        bool needwake_gp = false;
        struct rcu_node *rnp = rdp->mynode;
  
        local_bh_enable();
        lockdep_assert_irqs_enabled();
        rcu_nocb_lock_irqsave(rdp, flags);
 -      if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
 +      if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
            rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
            raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
                needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
                raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
        }
 -      if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
 -              rcu_nocb_unlock_irqrestore(rdp, flags);
 -              if (needwake_gp)
 -                      rcu_gp_kthread_wake();
 -              return;
 -      }
  
 -      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
        WRITE_ONCE(rdp->nocb_cb_sleep, true);
 +
 +      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
 +              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
 +                      rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
 +                      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
 +                              needwake_state = true;
 +              }
 +              if (rcu_segcblist_ready_cbs(cblist))
 +                      WRITE_ONCE(rdp->nocb_cb_sleep, false);
 +      } else {
 +              /*
 +               * De-offloading. Clear our flag and notify the de-offload worker.
 +               * We won't touch the callbacks and will keep sleeping
 +               * until we get re-offloaded.
 +               */
 +              WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
 +              rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
 +              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
 +                      needwake_state = true;
 +      }
 +
 +      if (rdp->nocb_cb_sleep)
 +              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
 +
        rcu_nocb_unlock_irqrestore(rdp, flags);
        if (needwake_gp)
                rcu_gp_kthread_wake();
 -      swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
 -                               !READ_ONCE(rdp->nocb_cb_sleep));
 -      if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
 -              /* ^^^ Ensure CB invocation follows _sleep test. */
 -              return;
 -      }
 -      WARN_ON(signal_pending(current));
 -      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
 +
 +      if (needwake_state)
 +              swake_up_one(&rdp->nocb_state_wq);
 +
 +      do {
 +              swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
 +                                                  nocb_cb_wait_cond(rdp));
 +
 +              // VVV Ensure CB invocation follows _sleep test.
 +              if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
 +                      WARN_ON(signal_pending(current));
 +                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
 +              }
 +      } while (!nocb_cb_can_run(rdp));
  }
  
  /*
@@@ -2247,24 -2150,27 +2249,27 @@@ static int rcu_nocb_cb_kthread(void *ar
  /* Is a deferred wakeup of rcu_nocb_kthread() required? */
  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
  {
 -      return READ_ONCE(rdp->nocb_defer_wakeup);
 +      return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
  }
  
  /* Do a deferred wakeup of rcu_nocb_kthread(). */
- static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
  {
        unsigned long flags;
        int ndw;
+       int ret;
  
        rcu_nocb_lock_irqsave(rdp, flags);
        if (!rcu_nocb_need_deferred_wakeup(rdp)) {
                rcu_nocb_unlock_irqrestore(rdp, flags);
-               return;
+               return false;
        }
        ndw = READ_ONCE(rdp->nocb_defer_wakeup);
        WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
-       wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+       ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
        trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+       return ret;
  }
  
  /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
@@@ -2280,201 -2186,19 +2285,208 @@@ static void do_nocb_deferred_wakeup_tim
   * This means we do an inexact common-case check.  Note that if
   * we miss, ->nocb_timer will eventually clean things up.
   */
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
  {
        if (rcu_nocb_need_deferred_wakeup(rdp))
-               do_nocb_deferred_wakeup_common(rdp);
+               return do_nocb_deferred_wakeup_common(rdp);
+       return false;
+ }
+ void rcu_nocb_flush_deferred_wakeup(void)
+ {
+       do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
  }
+ EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
  
 +static int rdp_offload_toggle(struct rcu_data *rdp,
 +                             bool offload, unsigned long flags)
 +      __releases(rdp->nocb_lock)
 +{
 +      struct rcu_segcblist *cblist = &rdp->cblist;
 +      struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 +      bool wake_gp = false;
 +
 +      rcu_segcblist_offload(cblist, offload);
 +
 +      if (rdp->nocb_cb_sleep)
 +              rdp->nocb_cb_sleep = false;
 +      rcu_nocb_unlock_irqrestore(rdp, flags);
 +
 +      /*
 +       * Ignore former value of nocb_cb_sleep and force wake up as it could
 +       * have been spuriously set to false already.
 +       */
 +      swake_up_one(&rdp->nocb_cb_wq);
 +
 +      raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 +      if (rdp_gp->nocb_gp_sleep) {
 +              rdp_gp->nocb_gp_sleep = false;
 +              wake_gp = true;
 +      }
 +      raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 +
 +      if (wake_gp)
 +              wake_up_process(rdp_gp->nocb_gp_kthread);
 +
 +      return 0;
 +}
 +
 +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
 +{
 +      struct rcu_segcblist *cblist = &rdp->cblist;
 +      unsigned long flags;
 +      int ret;
 +
 +      pr_info("De-offloading %d\n", rdp->cpu);
 +
 +      rcu_nocb_lock_irqsave(rdp, flags);
 +      /*
 +       * If there is still offloaded work pending, the offline
 +       * CPU won't help much with handling it.
 +       */
 +      if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) {
 +              rcu_nocb_unlock_irqrestore(rdp, flags);
 +              return -EBUSY;
 +      }
 +
 +      ret = rdp_offload_toggle(rdp, false, flags);
 +      swait_event_exclusive(rdp->nocb_state_wq,
 +                            !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
 +                                                      SEGCBLIST_KTHREAD_GP));
 +      rcu_nocb_lock_irqsave(rdp, flags);
 +      /* Make sure nocb timer won't stay around */
 +      WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
 +      rcu_nocb_unlock_irqrestore(rdp, flags);
 +      del_timer_sync(&rdp->nocb_timer);
 +
 +      /*
 +       * Flush bypass. While IRQs are disabled and once we set
 +       * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be
 +       * enqueued on bypass.
 +       */
 +      rcu_nocb_lock_irqsave(rdp, flags);
 +      rcu_nocb_flush_bypass(rdp, NULL, jiffies);
 +      rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
 +      /*
 +       * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
 +       * rcu_nocb_unlock_irqrestore() anymore. Theoretically we
 +       * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs
 +       * disabled now, but let's be paranoid.
 +       */
 +      raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 +
 +      return ret;
 +}
 +
 +static long rcu_nocb_rdp_deoffload(void *arg)
 +{
 +      struct rcu_data *rdp = arg;
 +
 +      WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
 +      return __rcu_nocb_rdp_deoffload(rdp);
 +}
 +
 +int rcu_nocb_cpu_deoffload(int cpu)
 +{
 +      struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 +      int ret = 0;
 +
 +      if (rdp == rdp->nocb_gp_rdp) {
 +              pr_info("Can't deoffload an rdp GP leader (yet)\n");
 +              return -EINVAL;
 +      }
 +      mutex_lock(&rcu_state.barrier_mutex);
 +      cpus_read_lock();
 +      if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
 +              if (cpu_online(cpu))
 +                      ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
 +              else
 +                      ret = __rcu_nocb_rdp_deoffload(rdp);
 +              if (!ret)
 +                      cpumask_clear_cpu(cpu, rcu_nocb_mask);
 +      }
 +      cpus_read_unlock();
 +      mutex_unlock(&rcu_state.barrier_mutex);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
 +
 +static int __rcu_nocb_rdp_offload(struct rcu_data *rdp)
 +{
 +      struct rcu_segcblist *cblist = &rdp->cblist;
 +      unsigned long flags;
 +      int ret;
 +
 +      /*
 +       * For now we only support re-offload, i.e. the rdp must have been
 +       * offloaded on boot first.
 +       */
 +      if (!rdp->nocb_gp_rdp)
 +              return -EINVAL;
 +
 +      pr_info("Offloading %d\n", rdp->cpu);
 +      /*
 +       * Can't use rcu_nocb_lock_irqsave() while we are in
 +       * SEGCBLIST_SOFTIRQ_ONLY mode.
 +       */
 +      raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
 +      /* Re-enable nocb timer */
 +      WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
 +      /*
 +       * We didn't take the nocb lock while working on the
 +       * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
 +       * All modifications previously done on rdp->cblist must be
 +       * visible remotely to the nocb kthreads upon wake-up, after
 +       * they read the cblist flags.
 +       *
 +       * The layout against nocb_lock enforces that ordering:
 +       *
 +       *  __rcu_nocb_rdp_offload()   nocb_cb_wait()/nocb_gp_wait()
 +       * -------------------------   ----------------------------
 +       *      WRITE callbacks           rcu_nocb_lock()
 +       *      rcu_nocb_lock()           READ flags
 +       *      WRITE flags               READ callbacks
 +       *      rcu_nocb_unlock()         rcu_nocb_unlock()
 +       */
 +      ret = rdp_offload_toggle(rdp, true, flags);
 +      swait_event_exclusive(rdp->nocb_state_wq,
 +                            rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
 +                            rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
 +
 +      return ret;
 +}
 +
 +static long rcu_nocb_rdp_offload(void *arg)
 +{
 +      struct rcu_data *rdp = arg;
 +
 +      WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
 +      return __rcu_nocb_rdp_offload(rdp);
 +}
 +
 +int rcu_nocb_cpu_offload(int cpu)
 +{
 +      struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 +      int ret = 0;
 +
 +      mutex_lock(&rcu_state.barrier_mutex);
 +      cpus_read_lock();
 +      if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
 +              if (cpu_online(cpu))
 +                      ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
 +              else
 +                      ret = __rcu_nocb_rdp_offload(rdp);
 +              if (!ret)
 +                      cpumask_set_cpu(cpu, rcu_nocb_mask);
 +      }
 +      cpus_read_unlock();
 +      mutex_unlock(&rcu_state.barrier_mutex);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
 +
  void __init rcu_init_nohz(void)
  {
        int cpu;
                rdp = per_cpu_ptr(&rcu_data, cpu);
                if (rcu_segcblist_empty(&rdp->cblist))
                        rcu_segcblist_init(&rdp->cblist);
 -              rcu_segcblist_offload(&rdp->cblist);
 +              rcu_segcblist_offload(&rdp->cblist, true);
 +              rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
 +              rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
        }
        rcu_organize_nocb_kthreads();
  }
@@@ -2529,7 -2251,6 +2541,7 @@@ static void __init rcu_boot_init_nocb_p
  {
        init_swait_queue_head(&rdp->nocb_cb_wq);
        init_swait_queue_head(&rdp->nocb_gp_wq);
 +      init_swait_queue_head(&rdp->nocb_state_wq);
        raw_spin_lock_init(&rdp->nocb_lock);
        raw_spin_lock_init(&rdp->nocb_bypass_lock);
        raw_spin_lock_init(&rdp->nocb_gp_lock);
@@@ -2672,19 -2393,6 +2684,19 @@@ void rcu_bind_current_to_nocb(void
  }
  EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
  
 +// The ->on_cpu field is available only in CONFIG_SMP=y, so...
 +#ifdef CONFIG_SMP
 +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
 +{
 +      return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
 +}
 +#else // #ifdef CONFIG_SMP
 +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
 +{
 +      return "";
 +}
 +#endif // #else #ifdef CONFIG_SMP
 +
  /*
   * Dump out nocb grace-period kthread state for the specified rcu_data
   * structure.
@@@ -2693,7 -2401,7 +2705,7 @@@ static void show_rcu_nocb_gp_state(stru
  {
        struct rcu_node *rnp = rdp->mynode;
  
 -      pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
 +      pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
                rdp->cpu,
                "kK"[!!rdp->nocb_gp_kthread],
                "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
                ".B"[!!rdp->nocb_gp_bypass],
                ".G"[!!rdp->nocb_gp_gp],
                (long)rdp->nocb_gp_seq,
 -              rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
 +              rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
 +              rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
 +              rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
 +              show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
  }
  
  /* Dump out nocb kthread state for the specified rcu_data structure. */
  static void show_rcu_nocb_state(struct rcu_data *rdp)
  {
 +      char bufw[20];
 +      char bufr[20];
        struct rcu_segcblist *rsclp = &rdp->cblist;
        bool waslocked;
        bool wastimer;
        if (rdp->nocb_gp_rdp == rdp)
                show_rcu_nocb_gp_state(rdp);
  
 -      pr_info("   CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
 +      sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
 +      sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
 +      pr_info("   CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
                rdp->cpu, rdp->nocb_gp_rdp->cpu,
 +              rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
                "kK"[!!rdp->nocb_cb_kthread],
                "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
                "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
                jiffies - rdp->nocb_nobypass_last,
                rdp->nocb_nobypass_count,
                ".D"[rcu_segcblist_ready_cbs(rsclp)],
 -              ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
 -              ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
 -              ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
 +              ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
 +              rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
 +              ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
 +              rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
 +              ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
                ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
 -              rcu_segcblist_n_cbs(&rdp->cblist));
 +              rcu_segcblist_n_cbs(&rdp->cblist),
 +              rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
 +              rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
 +              show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
  
        /* It is OK for GP kthreads to have GP state. */
        if (rdp->nocb_gp_rdp == rdp)
@@@ -2835,8 -2530,9 +2847,9 @@@ static int rcu_nocb_need_deferred_wakeu
        return false;
  }
  
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
  {
+       return false;
  }
  
  static void rcu_spawn_cpu_nocb_kthread(int cpu)
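
The tree_plugin.h changes above add a runtime interface for toggling per-CPU
callback offloading: rcu_nocb_cpu_offload() and rcu_nocb_cpu_deoffload(), both
exported GPL. A minimal usage sketch, assuming the declarations are reachable
via <linux/rcupdate.h> under CONFIG_RCU_NOCB_CPU; the toggle_nocb() wrapper is
a made-up name:

#include <linux/printk.h>
#include <linux/rcupdate.h>

static int toggle_nocb(int cpu, bool offload)
{
	/*
	 * Both helpers return 0 on success or a negative errno, e.g.
	 * -EINVAL when trying to de-offload an rdp GP leader.
	 */
	int ret = offload ? rcu_nocb_cpu_offload(cpu)
			  : rcu_nocb_cpu_deoffload(cpu);

	if (ret)
		pr_warn("nocb %s failed for CPU %d: %d\n",
			offload ? "offload" : "deoffload", cpu, ret);
	return ret;
}
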
diff --combined kernel/sched/core.c
index 22f6748c16f68111cef8a8da74a39d26fd860de4,88a2e2bdbabeb8a11c631907d2c7f5da0b4c0a1b..7f5ffc8784110736f4e3705f2f5ad49374192ba9
@@@ -355,8 -355,9 +355,9 @@@ static enum hrtimer_restart hrtick(stru
  static void __hrtick_restart(struct rq *rq)
  {
        struct hrtimer *timer = &rq->hrtick_timer;
+       ktime_t time = rq->hrtick_time;
  
-       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+       hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
  }
  
  /*
@@@ -380,7 -381,6 +381,6 @@@ static void __hrtick_start(void *arg
  void hrtick_start(struct rq *rq, u64 delay)
  {
        struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time;
        s64 delta;
  
        /*
         * doesn't make sense and can cause timer DoS.
         */
        delta = max_t(s64, delay, 10000LL);
-       time = ktime_add_ns(timer->base->get_time(), delta);
-       hrtimer_set_expires(timer, time);
+       rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
  
        if (rq == this_rq())
                __hrtick_restart(rq);
@@@ -3478,7 -3476,7 +3476,7 @@@ out
  
  /**
   * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
 - * @p: Process for which the function is to be invoked.
 + * @p: Process for which the function is to be invoked, can be @current.
   * @func: Function to invoke.
   * @arg: Argument to function.
   *
   */
  bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
  {
 -      bool ret = false;
        struct rq_flags rf;
 +      bool ret = false;
        struct rq *rq;
  
 -      lockdep_assert_irqs_enabled();
 -      raw_spin_lock_irq(&p->pi_lock);
 +      raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
        if (p->on_rq) {
                rq = __task_rq_lock(p, &rf);
                if (task_rq(p) == rq)
                                ret = func(p, arg);
                }
        }
 -      raw_spin_unlock_irq(&p->pi_lock);
 +      raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
        return ret;
  }
  
@@@ -4970,7 -4969,7 +4968,7 @@@ static void __sched notrace __schedule(
  
        schedule_debug(prev, preempt);
  
-       if (sched_feat(HRTICK))
+       if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
                hrtick_clear(rq);
  
        local_irq_disable();
@@@ -5264,6 -5263,12 +5262,12 @@@ asmlinkage __visible void __sched notra
  NOKPROBE_SYMBOL(preempt_schedule);
  EXPORT_SYMBOL(preempt_schedule);
  
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+ EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+ #endif
  /**
   * preempt_schedule_notrace - preempt_schedule called by tracing
   *
@@@ -5316,8 -5321,197 +5320,197 @@@ asmlinkage __visible void __sched notra
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+ #endif
  #endif /* CONFIG_PREEMPTION */
  
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ #include <linux/entry-common.h>
+ /*
+  * SC:cond_resched
+  * SC:might_resched
+  * SC:preempt_schedule
+  * SC:preempt_schedule_notrace
+  * SC:irqentry_exit_cond_resched
+  *
+  *
+  * NONE:
+  *   cond_resched               <- __cond_resched
+  *   might_resched              <- RET0
+  *   preempt_schedule           <- NOP
+  *   preempt_schedule_notrace   <- NOP
+  *   irqentry_exit_cond_resched <- NOP
+  *
+  * VOLUNTARY:
+  *   cond_resched               <- __cond_resched
+  *   might_resched              <- __cond_resched
+  *   preempt_schedule           <- NOP
+  *   preempt_schedule_notrace   <- NOP
+  *   irqentry_exit_cond_resched <- NOP
+  *
+  * FULL:
+  *   cond_resched               <- RET0
+  *   might_resched              <- RET0
+  *   preempt_schedule           <- preempt_schedule
+  *   preempt_schedule_notrace   <- preempt_schedule_notrace
+  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+  */
+ enum {
+       preempt_dynamic_none = 0,
+       preempt_dynamic_voluntary,
+       preempt_dynamic_full,
+ };
+ static int preempt_dynamic_mode = preempt_dynamic_full;
+ static int sched_dynamic_mode(const char *str)
+ {
+       if (!strcmp(str, "none"))
+               return 0;
+       if (!strcmp(str, "voluntary"))
+               return 1;
+       if (!strcmp(str, "full"))
+               return 2;
+       return -1;
+ }
+ static void sched_dynamic_update(int mode)
+ {
+       /*
+        * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+        * the ZERO state, which is invalid.
+        */
+       static_call_update(cond_resched, __cond_resched);
+       static_call_update(might_resched, __cond_resched);
+       static_call_update(preempt_schedule, __preempt_schedule_func);
+       static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+       static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+       switch (mode) {
+       case preempt_dynamic_none:
+               static_call_update(cond_resched, __cond_resched);
+               static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+               static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+               static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+               static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+               pr_info("Dynamic Preempt: none\n");
+               break;
+       case preempt_dynamic_voluntary:
+               static_call_update(cond_resched, __cond_resched);
+               static_call_update(might_resched, __cond_resched);
+               static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+               static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+               static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+               pr_info("Dynamic Preempt: voluntary\n");
+               break;
+       case preempt_dynamic_full:
+               static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
+               static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+               static_call_update(preempt_schedule, __preempt_schedule_func);
+               static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+               static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+               pr_info("Dynamic Preempt: full\n");
+               break;
+       }
+       preempt_dynamic_mode = mode;
+ }
+ static int __init setup_preempt_mode(char *str)
+ {
+       int mode = sched_dynamic_mode(str);
+       if (mode < 0) {
+               pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+               return 1;
+       }
+       sched_dynamic_update(mode);
+       return 0;
+ }
+ __setup("preempt=", setup_preempt_mode);
+ #ifdef CONFIG_SCHED_DEBUG
+ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+                                  size_t cnt, loff_t *ppos)
+ {
+       char buf[16];
+       int mode;
+       if (cnt > 15)
+               cnt = 15;
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+       buf[cnt] = 0;
+       mode = sched_dynamic_mode(strstrip(buf));
+       if (mode < 0)
+               return mode;
+       sched_dynamic_update(mode);
+       *ppos += cnt;
+       return cnt;
+ }
+ static int sched_dynamic_show(struct seq_file *m, void *v)
+ {
+       static const char * preempt_modes[] = {
+               "none", "voluntary", "full"
+       };
+       int i;
+       for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+               if (preempt_dynamic_mode == i)
+                       seq_puts(m, "(");
+               seq_puts(m, preempt_modes[i]);
+               if (preempt_dynamic_mode == i)
+                       seq_puts(m, ")");
+               seq_puts(m, " ");
+       }
+       seq_puts(m, "\n");
+       return 0;
+ }
+ static int sched_dynamic_open(struct inode *inode, struct file *filp)
+ {
+       return single_open(filp, sched_dynamic_show, NULL);
+ }
+ static const struct file_operations sched_dynamic_fops = {
+       .open           = sched_dynamic_open,
+       .write          = sched_dynamic_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+ };
+ static __init int sched_init_debug_dynamic(void)
+ {
+       debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+       return 0;
+ }
+ late_initcall(sched_init_debug_dynamic);
+ #endif /* CONFIG_SCHED_DEBUG */
+ #endif /* CONFIG_PREEMPT_DYNAMIC */
  /*
   * This is the entry point to schedule() from kernel preemption
   * off of irq context.
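
The CONFIG_PREEMPT_DYNAMIC code above selects the preemption model by
repointing static calls at boot (preempt=) or at runtime (the debugfs file).
A stripped-down sketch of that static-call pattern, detached from the
scheduler; the demo_* names are invented and not kernel symbols:

#include <linux/static_call.h>

static int demo_full(void) { return 1; }	/* "full"-like behaviour */
static int demo_none(void) { return 0; }	/* "none"-like behaviour */

DEFINE_STATIC_CALL(demo_resched, demo_full);

/*
 * Repoint the call site at runtime, the way sched_dynamic_update()
 * does for cond_resched/might_resched/preempt_schedule*.
 */
static void demo_set_mode(bool full)
{
	if (full)
		static_call_update(demo_resched, demo_full);
	else
		static_call_update(demo_resched, demo_none);
}

static int demo_call_site(void)
{
	return static_call(demo_resched)();	/* patched direct call */
}
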
@@@ -5615,8 -5809,12 +5808,12 @@@ SYSCALL_DEFINE1(nice, int, increment
   * @p: the task in question.
   *
   * Return: The priority value as seen by users in /proc.
-  * RT tasks are offset by -200. Normal tasks are centered
-  * around 0, value goes from -16 to +15.
+  *
+  * sched policy         return value   kernel prio    user prio/nice
+  *
+  * normal, batch, idle     [0 ... 39]  [100 ... 139]          0/[-20 ... 19]
+  * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
+  * deadline                     -101             -1           0
   */
  int task_prio(const struct task_struct *p)
  {
@@@ -5675,6 -5873,120 +5872,120 @@@ struct task_struct *idle_task(int cpu
        return cpu_rq(cpu)->idle;
  }
  
+ #ifdef CONFIG_SMP
+ /*
+  * This function computes an effective utilization for the given CPU, to be
+  * used for frequency selection given the linear relation: f = u * f_max.
+  *
+  * The scheduler tracks the following metrics:
+  *
+  *   cpu_util_{cfs,rt,dl,irq}()
+  *   cpu_bw_dl()
+  *
+  * Where the cfs,rt and dl util numbers are tracked with the same metric and
+  * synchronized windows and are thus directly comparable.
+  *
+  * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+  * which excludes things like IRQ and steal-time. These latter are then accrued
+  * in the irq utilization.
+  *
+  * The DL bandwidth number, on the other hand, is not a measured metric but a
+  * value computed from the task model parameters; it gives the minimal
+  * utilization required to meet deadlines.
+  */
+ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+                                unsigned long max, enum cpu_util_type type,
+                                struct task_struct *p)
+ {
+       unsigned long dl_util, util, irq;
+       struct rq *rq = cpu_rq(cpu);
+       if (!uclamp_is_used() &&
+           type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
+               return max;
+       }
+       /*
+        * Early check to see if IRQ/steal time saturates the CPU; this can
+        * happen because of inaccuracies in how we track these -- see
+        * update_irq_load_avg().
+        */
+       irq = cpu_util_irq(rq);
+       if (unlikely(irq >= max))
+               return max;
+       /*
+        * Because the time spent on RT/DL tasks is visible as 'lost' time to
+        * CFS tasks and we use the same metric to track the effective
+        * utilization (PELT windows are synchronized) we can directly add them
+        * to obtain the CPU's actual utilization.
+        *
+        * CFS and RT utilization can be boosted or capped, depending on
+        * utilization clamp constraints requested by currently RUNNABLE
+        * tasks.
+        * When there are no CFS RUNNABLE tasks, clamps are released and
+        * frequency will be gracefully reduced with the utilization decay.
+        */
+       util = util_cfs + cpu_util_rt(rq);
+       if (type == FREQUENCY_UTIL)
+               util = uclamp_rq_util_with(rq, util, p);
+       dl_util = cpu_util_dl(rq);
+       /*
+        * For frequency selection we do not make cpu_util_dl() a permanent part
+        * of this sum because we want to use cpu_bw_dl() later on, but we need
+        * to check if the CFS+RT+DL sum is saturated (i.e. no idle time) such
+        * that we select f_max when there is no idle time.
+        *
+        * NOTE: numerical errors or stop class might cause us to not quite hit
+        * saturation when we should -- something for later.
+        */
+       if (util + dl_util >= max)
+               return max;
+       /*
+        * OTOH, for energy computation we need the estimated running time, so
+        * include util_dl and ignore dl_bw.
+        */
+       if (type == ENERGY_UTIL)
+               util += dl_util;
+       /*
+        * There is still idle time; further improve the number by using the
+        * irq metric. Because IRQ/steal time is hidden from the task clock we
+        * need to scale the task numbers:
+        *
+        *              max - irq
+        *   U' = irq + --------- * U
+        *                 max
+        */
+       util = scale_irq_capacity(util, irq, max);
+       util += irq;
+       /*
+        * Bandwidth required by DEADLINE must always be granted while, for
+        * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+        * to gracefully reduce the frequency when no tasks show up for longer
+        * periods of time.
+        *
+        * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+        * bw_dl as requested freq. However, cpufreq is not yet ready for such
+        * an interface. So, we only do the latter for now.
+        */
+       if (type == FREQUENCY_UTIL)
+               util += cpu_bw_dl(rq);
+       return min(max, util);
+ }
+ unsigned long sched_cpu_util(int cpu, unsigned long max)
+ {
+       return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+                                 ENERGY_UTIL, NULL);
+ }
+ #endif /* CONFIG_SMP */
  /**
   * find_process_by_pid - find a process with a matching PID value.
   * @pid: the pid in question.
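
To make the IRQ scaling step in effective_cpu_util() above concrete, here is a
made-up numeric example (arbitrary capacity units, max = 1024): with irq = 128
and a CFS+RT utilization sum U = 512, the task utilization is first compressed
into the non-IRQ fraction of the CPU, 512 * (1024 - 128) / 1024 = 448, and the
IRQ time is then added back on top, giving U' = 128 + 448 = 576 out of 1024.
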
@@@ -5796,11 -6108,10 +6107,10 @@@ recheck
  
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
-        * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+        * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
         * SCHED_BATCH and SCHED_IDLE is 0.
         */
-       if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
-           (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+       if (attr->sched_priority > MAX_RT_PRIO-1)
                return -EINVAL;
        if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
            (rt_policy(policy) != (attr->sched_priority != 0)))
@@@ -6667,17 -6978,27 +6977,27 @@@ SYSCALL_DEFINE0(sched_yield
        return 0;
  }
  
- #ifndef CONFIG_PREEMPTION
- int __sched _cond_resched(void)
+ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+ int __sched __cond_resched(void)
  {
        if (should_resched(0)) {
                preempt_schedule_common();
                return 1;
        }
+ #ifndef CONFIG_PREEMPT_RCU
        rcu_all_qs();
+ #endif
        return 0;
  }
- EXPORT_SYMBOL(_cond_resched);
+ EXPORT_SYMBOL(__cond_resched);
+ #endif
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+ EXPORT_STATIC_CALL_TRAMP(cond_resched);
+ DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+ EXPORT_STATIC_CALL_TRAMP(might_resched);
  #endif
  
  /*
@@@ -6868,7 -7189,7 +7188,7 @@@ SYSCALL_DEFINE1(sched_get_priority_max
        switch (policy) {
        case SCHED_FIFO:
        case SCHED_RR:
-               ret = MAX_USER_RT_PRIO-1;
+               ret = MAX_RT_PRIO-1;
                break;
        case SCHED_DEADLINE:
        case SCHED_NORMAL:
@@@ -7508,6 -7829,12 +7828,12 @@@ int sched_cpu_deactivate(unsigned int c
        struct rq_flags rf;
        int ret;
  
+       /*
+        * Remove CPU from nohz.idle_cpus_mask to prevent it from participating
+        * in load balancing while it is not active.
+        */
+       nohz_balance_exit_idle(rq);
        set_cpu_active(cpu, false);
  
        /*
@@@ -7652,7 -7979,6 +7978,6 @@@ int sched_cpu_dying(unsigned int cpu
  
        calc_load_migrate(rq);
        update_max_interval();
-       nohz_balance_exit_idle(rq);
        hrtick_clear(rq);
        return 0;
  }