Merge tag 'sched-core-2021-02-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author Linus Torvalds <[email protected]>
Sun, 21 Feb 2021 20:35:04 +0000 (12:35 -0800)
committer Linus Torvalds <[email protected]>
Sun, 21 Feb 2021 20:35:04 +0000 (12:35 -0800)
Pull scheduler updates from Ingo Molnar:
 "Core scheduler updates:

   - Add CONFIG_PREEMPT_DYNAMIC: in its current form this adds the
     preempt=none/voluntary/full boot option (default: full), allowing
     distros to build a PREEMPT kernel but fall back, via a boot-time
     selection, to scheduling behavior close to PREEMPT_VOLUNTARY (or
     PREEMPT_NONE).

     There's also the /debug/sched_preempt switch to do this at runtime.

     This feature is implemented via runtime patching (a new variant of
     static calls).

     The scope of the runtime patching can be best reviewed by looking
     at the sched_dynamic_update() function in kernel/sched/core.c.

     ( Note that the dynamic none/voluntary modes aren't 100% identical
       to the statically built ones: for example, preemptible RCU is
       available in all cases, and the preempt count is maintained in
       all models, which has some runtime overhead even with the code
       patching. )

     The PREEMPT_VOLUNTARY/PREEMPT_NONE models, used by the vast
     majority of distributions, are supposed to be unaffected.
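
     A minimal sketch of the static-call pattern this is built on (the
     names below are hypothetical illustrations, not the kernel's; the
     real retargeting logic lives in sched_dynamic_update() in
     kernel/sched/core.c):

       #include <linux/static_call.h>
       #include <linux/init.h>
       #include <linux/string.h>

       static void preempt_hook_full(void) { /* full preemption path */ }
       static void preempt_hook_none(void) { /* no voluntary preemption */ }

       /* Default target corresponds to "preempt=full". */
       DEFINE_STATIC_CALL(my_preempt_hook, preempt_hook_full);

       /* Callers go through the static call: with inline static calls
        * this becomes a patched direct call, not an indirect branch. */
       static void maybe_preempt(void)
       {
               static_call(my_preempt_hook)();
       }

       /* The preempt= boot parameter retargets every call site once, at
        * boot, instead of branching on a mode variable at runtime. */
       static int __init my_preempt_setup(char *str)
       {
               if (str && (!strcmp(str, "none") || !strcmp(str, "voluntary")))
                       static_call_update(my_preempt_hook, preempt_hook_none);
               return 0;
       }
       early_param("preempt", my_preempt_setup);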

   - Fix ignored rescheduling after rcu_eqs_enter(). This is a bug that
     was found via rcutorture triggering a hang. The bug is that
     rcu_idle_enter() may wake up a NOCB kthread, but this happens after
     the last generic need_resched() check. Some cpuidle drivers fix it
     by chance but many others don't.

     In true 2020 fashion the original bug fix has grown into a 5-patch
     scheduler/RCU fix series plus another 16 RCU patches to address the
     underlying issue of missed preemption events. These are the initial
     fixes that should fix current incarnations of the bug.
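
     The fallback pattern used by the fix, condensed and with hypothetical
     names (the real rcu_irq_work_resched() is visible in the
     kernel/rcu/tree.c hunk further down): if a deferred rcuog wakeup made
     need_resched() true after the last rescheduling point, queue a
     self-targeted irq_work so the IRQ tail reschedules the CPU once
     interrupts are re-enabled on user/guest resume.

       #include <linux/irq_work.h>
       #include <linux/percpu.h>
       #include <linux/sched.h>

       /* The handler can stay empty: the reschedule happens on IRQ exit. */
       static void late_wakeup_func(struct irq_work *work)
       {
       }

       static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
               IRQ_WORK_INIT(late_wakeup_func);

       static void force_late_resched(bool did_deferred_wakeup)
       {
               if (did_deferred_wakeup && need_resched())
                       irq_work_queue(this_cpu_ptr(&late_wakeup_work));
       }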

   - Clean up rbtree usage in the scheduler, by providing & using the
     following consistent set of rbtree APIs:

       partial-order; less() based:
         - rb_add(): add a new entry to the rbtree
         - rb_add_cached(): like rb_add(), but for a rb_root_cached

       total-order; cmp() based:
         - rb_find(): find an entry in an rbtree
         - rb_find_add(): find an entry, and add if not found

         - rb_find_first(): find the first (leftmost) matching entry
         - rb_next_match(): continue from rb_find_first()
         - rb_for_each(): iterate a sub-tree using the previous two
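
     These helpers replace open-coded rb_link_node()/rb_insert_color()
     walks with a caller-supplied less()/cmp() callback; the rtmutex and
     perf conversions in the diffs below show real uses. A condensed,
     hypothetical sketch (struct and callbacks invented for illustration):

       #include <linux/types.h>
       #include <linux/rbtree.h>

       struct item {
               u64 key;
               struct rb_node node;
       };

       #define __node_2_item(n) rb_entry((n), struct item, node)

       /* rb_add() only needs a partial order: less(). */
       static inline bool item_less(struct rb_node *a, const struct rb_node *b)
       {
               return __node_2_item(a)->key < __node_2_item(b)->key;
       }

       /* rb_find() needs a total order against a search key: cmp(). */
       static inline int item_cmp(const void *key, const struct rb_node *n)
       {
               const u64 *k = key;

               if (*k < __node_2_item(n)->key)
                       return -1;
               if (*k > __node_2_item(n)->key)
                       return 1;
               return 0;
       }

       static void item_insert(struct rb_root *root, struct item *it)
       {
               rb_add(&it->node, root, item_less);
       }

       static struct item *item_lookup(struct rb_root *root, u64 key)
       {
               struct rb_node *n = rb_find(&key, root, item_cmp);

               return n ? __node_2_item(n) : NULL;
       }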

   - Improve the SMP/NUMA load-balancer: scan for an idle sibling in a
     single pass. This is a 4-commit series where each commit improves
     one aspect of the idle sibling scan logic.

   - Improve the cpufreq cooling driver by getting the effective CPU
     utilization metrics from the scheduler

   - Improve the fair scheduler's active load-balancing logic by
     reducing the number of active LB attempts & lengthening the
     load-balancing interval. This improves stress-ng mmapfork
     performance.

   - Fix a CFS estimated-utilization (util_est) calculation bug that can
     result in overly high utilization values

  Misc updates & fixes:

   - Fix the HRTICK reprogramming & optimization feature

   - Fix SCHED_SOFTIRQ raising race & warning in the CPU offlining code

   - Reduce dl_add_task_root_domain() overhead

   - Fix uprobes refcount bug

   - Process pending softirqs in flush_smp_call_function_from_idle()

   - Clean up task priority related defines, remove *USER_*PRIO and
     USER_PRIO()

   - Simplify the sched_init_numa() deduplication sort

   - Documentation updates

   - Fix an EAS bug in update_misfit_status() that degraded the quality
     of energy balancing

   - Smaller cleanups"

* tag 'sched-core-2021-02-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  sched,x86: Allow !PREEMPT_DYNAMIC
  entry/kvm: Explicitly flush pending rcuog wakeup before last rescheduling point
  entry: Explicitly flush pending rcuog wakeup before last rescheduling point
  rcu/nocb: Trigger self-IPI on late deferred wake up before user resume
  rcu/nocb: Perform deferred wake up before last idle's need_resched() check
  rcu: Pull deferred rcuog wake up to rcu_eqs_enter() callers
  sched/features: Distinguish between NORMAL and DEADLINE hrtick
  sched/features: Fix hrtick reprogramming
  sched/deadline: Reduce rq lock contention in dl_add_task_root_domain()
  uprobes: (Re)add missing get_uprobe() in __find_uprobe()
  smp: Process pending softirqs in flush_smp_call_function_from_idle()
  sched: Harden PREEMPT_DYNAMIC
  static_call: Allow module use without exposing static_call_key
  sched: Add /debug/sched_preempt
  preempt/dynamic: Support dynamic preempt with preempt= boot option
  preempt/dynamic: Provide irqentry_exit_cond_resched() static call
  preempt/dynamic: Provide preempt_schedule[_notrace]() static calls
  preempt/dynamic: Provide cond_resched() and might_resched() static calls
  preempt: Introduce CONFIG_PREEMPT_DYNAMIC
  static_call: Provide DEFINE_STATIC_CALL_RET0()
  ...

13 files changed:
Documentation/admin-guide/kernel-parameters.txt
arch/Kconfig
arch/powerpc/platforms/cell/spufs/sched.c
arch/x86/Kconfig
include/asm-generic/vmlinux.lds.h
include/linux/rcupdate.h
init/Kconfig
kernel/events/core.c
kernel/locking/rtmutex.c
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_plugin.h
kernel/sched/core.c

diff --combined Documentation/admin-guide/kernel-parameters.txt
index 36d6ce7cc88688bd33b8f857cc3ace757016054b,78ab29400dd3603e0b0d361ffe9fb195ab380740..b93aaa374266ffa2d7aff66538e1eb3cca86e847
                        insecure, please do not use on production kernels.
  
        debug_locks_verbose=
 -                      [KNL] verbose self-tests
 -                      Format=<0|1>
 +                      [KNL] verbose locking self-tests
 +                      Format: <int>
                        Print debugging info while doing the locking API
                        self-tests.
 -                      We default to 0 (no extra messages), setting it to
 -                      1 will print _a lot_ more information - normally
 -                      only useful to kernel developers.
 +                      Bitmask for the various LOCKTYPE_ tests. Defaults to 0
 +                      (no extra messages), setting it to -1 (all bits set)
 +                      will print _a_lot_ more information - normally only
 +                      useful to lockdep developers.
  
        debug_objects   [KNL] Enable object debugging
  
                        For example, to override I2C bus2:
                        omap_mux=i2c2_scl.i2c2_scl=0x100,i2c2_sda.i2c2_sda=0x100
  
 -      oprofile.timer= [HW]
 -                      Use timer interrupt instead of performance counters
 -
 -      oprofile.cpu_type=      Force an oprofile cpu type
 -                      This might be useful if you have an older oprofile
 -                      userland or if you want common events.
 -                      Format: { arch_perfmon }
 -                      arch_perfmon: [X86] Force use of architectural
 -                              perfmon on Intel CPUs instead of the
 -                              CPU specific event set.
 -                      timer: [X86] Force use of architectural NMI
 -                              timer mode (see also oprofile.timer
 -                              for generic hr timer mode)
 -
        oops=panic      Always panic on oopses. Default is to just kill the
                        process, but there is a small probability of
                        deadlocking the machine.
                        Format: {"off"}
                        Disable Hardware Transactional Memory
  
+       preempt=        [KNL]
+                       Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+                       none - Limited to cond_resched() calls
+                       voluntary - Limited to cond_resched() and might_sleep() calls
+                       full - Any section that isn't explicitly preempt disabled
+                              can be preempted anytime.
        print-fatal-signals=
                        [KNL] debug: print fatal signals
  
                        value, meaning that RCU_SOFTIRQ is used by default.
                        Specify rcutree.use_softirq=0 to use rcuc kthreads.
  
 +                      But note that CONFIG_PREEMPT_RT=y kernels disable
 +                      this kernel boot parameter, forcibly setting it
 +                      to zero.
 +
        rcutree.rcu_fanout_exact= [KNL]
                        Disable autobalancing of the rcu_node combining
                        tree.  This is used by rcutorture, and might
                        Set wakeup interval for idle CPUs that have
                        RCU callbacks (RCU_FAST_NO_HZ=y).
  
 -      rcutree.rcu_idle_lazy_gp_delay= [KNL]
 -                      Set wakeup interval for idle CPUs that have
 -                      only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
 -                      Lazy RCU callbacks are those which RCU can
 -                      prove do nothing more than free memory.
 -
        rcutree.rcu_kick_kthreads= [KNL]
                        Cause the grace-period kthread to get an extra
                        wake_up() if it sleeps three times longer than
                        stress RCU, they don't participate in the actual
                        test, hence the "fake".
  
 +      rcutorture.nocbs_nthreads= [KNL]
 +                      Set number of RCU callback-offload togglers.
 +                      Zero (the default) disables toggling.
 +
 +      rcutorture.nocbs_toggle= [KNL]
 +                      Set the delay in milliseconds between successive
 +                      callback-offload toggling attempts.
 +
        rcutorture.nreaders= [KNL]
                        Set number of RCU readers.  The value -1 selects
                        N-1, where N is the number of CPUs.  A value
                        only normal grace-period primitives.  No effect
                        on CONFIG_TINY_RCU kernels.
  
 +                      But note that CONFIG_PREEMPT_RT=y kernels enables
 +                      this kernel boot parameter, forcibly setting
 +                      it to the value one, that is, converting any
 +                      post-boot attempt at an expedited RCU grace
 +                      period to instead use normal non-expedited
 +                      grace-period processing.
 +
        rcupdate.rcu_task_ipi_delay= [KNL]
                        Set time in jiffies during which RCU tasks will
                        avoid sending IPIs, starting with the beginning
        refscale.verbose= [KNL]
                        Enable additional printk() statements.
  
 +      refscale.verbose_batched= [KNL]
 +                      Batch the additional printk() statements.  If zero
 +                      (the default) or negative, print everything.  Otherwise,
 +                      print every Nth verbose statement, where N is the value
 +                      specified.
 +
        relax_domain_level=
                        [KNL, SMP] Set scheduler's default relax_domain_level.
                        See Documentation/admin-guide/cgroup-v1/cpusets.rst.
                        are running concurrently, especially on systems
                        with rotating-rust storage.
  
 +      torture.verbose_sleep_frequency= [KNL]
 +                      Specifies how many verbose printk()s should be
 +                      emitted between each sleep.  The default of zero
 +                      disables verbose-printk() sleeping.
 +
 +      torture.verbose_sleep_duration= [KNL]
 +                      Duration of each verbose-printk() sleep in jiffies.
 +
        tp720=          [HW,PS2]
  
        tpm_suspend_pcr=[HW,TPM]
diff --combined arch/Kconfig
index 87608c2fa02771014e73021c34b1e9d79221f4a0,12450790782494de476f18e806118085ccac9289..4790a5f23d9fd15caf1fff15fba45df539234697
@@@ -33,6 -33,38 +33,6 @@@ config HOTPLUG_SM
  config GENERIC_ENTRY
         bool
  
 -config OPROFILE
 -      tristate "OProfile system profiling"
 -      depends on PROFILING
 -      depends on HAVE_OPROFILE
 -      select RING_BUFFER
 -      select RING_BUFFER_ALLOW_SWAP
 -      help
 -        OProfile is a profiling system capable of profiling the
 -        whole system, include the kernel, kernel modules, libraries,
 -        and applications.
 -
 -        If unsure, say N.
 -
 -config OPROFILE_EVENT_MULTIPLEX
 -      bool "OProfile multiplexing support (EXPERIMENTAL)"
 -      default n
 -      depends on OPROFILE && X86
 -      help
 -        The number of hardware counters is limited. The multiplexing
 -        feature enables OProfile to gather more events than counters
 -        are provided by the hardware. This is realized by switching
 -        between events at a user specified time interval.
 -
 -        If unsure, say N.
 -
 -config HAVE_OPROFILE
 -      bool
 -
 -config OPROFILE_NMI_TIMER
 -      def_bool y
 -      depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !PPC64
 -
  config KPROBES
        bool "Kprobes"
        depends on MODULES
@@@ -1058,6 -1090,15 +1058,15 @@@ config HAVE_STATIC_CALL_INLIN
        bool
        depends on HAVE_STATIC_CALL
  
+ config HAVE_PREEMPT_DYNAMIC
+       bool
+       depends on HAVE_STATIC_CALL
+       depends on GENERIC_ENTRY
+       help
+          Select this if the architecture support boot time preempt setting
+          on top of static calls. It is strongly advised to support inline
+          static call to avoid any overhead.
  config ARCH_WANT_LD_ORPHAN_WARN
        bool
        help
@@@ -1079,9 -1120,6 +1088,9 @@@ config ARCH_SPLIT_ARG6
           If a 32-bit architecture requires 64-bit arguments to be split into
           pairs of 32-bit arguments, select this option.
  
 +config ARCH_HAS_ELFCORE_COMPAT
 +      bool
 +
  source "kernel/gcov/Kconfig"
  
  source "scripts/gcc-plugins/Kconfig"
diff --combined arch/powerpc/platforms/cell/spufs/sched.c
index 9d06fffb1526c9696b61056d8c596693b5caf37b,aeb7f3922106a033e1063731bd3f8e0f1e0c262b..369206489895a28cb15ab329aa7f11d1eb0610ee
@@@ -72,7 -72,7 +72,7 @@@ static struct timer_list spuloadavg_tim
  #define DEF_SPU_TIMESLICE     (100 * HZ / (1000 * SPUSCHED_TICK))
  
  #define SCALE_PRIO(x, prio) \
-       max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
+       max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE)
  
  /*
   * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
@@@ -181,6 -181,9 +181,6 @@@ void do_notify_spus_active(void
  
        /*
         * Wake up the active spu_contexts.
 -       *
 -       * When the awakened processes see their "notify_active" flag is set,
 -       * they will call spu_switch_notify().
         */
        for_each_online_node(node) {
                struct spu *spu;
@@@ -236,6 -239,7 +236,6 @@@ static void spu_bind_context(struct sp
        spu_switch_log_notify(spu, ctx, SWITCH_LOG_START, 0);
        spu_restore(&ctx->csa, spu);
        spu->timestamp = jiffies;
 -      spu_switch_notify(spu, ctx);
        ctx->state = SPU_STATE_RUNNABLE;
  
        spuctx_switch_state(ctx, SPU_UTIL_USER);
@@@ -436,6 -440,7 +436,6 @@@ static void spu_unbind_context(struct s
                 */
                atomic_dec_if_positive(&ctx->gang->aff_sched_count);
  
 -      spu_switch_notify(spu, NULL);
        spu_unmap_mappings(ctx);
        spu_save(&ctx->csa, spu);
        spu_switch_log_notify(spu, ctx, SWITCH_LOG_STOP, 0);
diff --combined arch/x86/Kconfig
index 7b934a591df2bfc91e9723bbe2566e32627bdd65,d3338a87761f107a4880d968b741b6f3ae140678..595193bc2d31e32f39d19dd767310ef51087f1dc
@@@ -32,7 -32,6 +32,7 @@@ config X86_6
        select MODULES_USE_ELF_RELA
        select NEED_DMA_MAP_STATE
        select SWIOTLB
 +      select ARCH_HAS_ELFCORE_COMPAT
  
  config FORCE_DYNAMIC_FTRACE
        def_bool y
@@@ -207,6 -206,7 +207,6 @@@ config X8
        select HAVE_MOVE_PMD
        select HAVE_MOVE_PUD
        select HAVE_NMI
 -      select HAVE_OPROFILE
        select HAVE_OPTPROBES
        select HAVE_PCSPKR_PLATFORM
        select HAVE_PERF_EVENTS
        select HAVE_STACK_VALIDATION            if X86_64
        select HAVE_STATIC_CALL
        select HAVE_STATIC_CALL_INLINE          if HAVE_STACK_VALIDATION
+       select HAVE_PREEMPT_DYNAMIC
        select HAVE_RSEQ
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_UNSTABLE_SCHED_CLOCK
@@@ -890,7 -891,7 +891,7 @@@ config HPET_TIME
  
  config HPET_EMULATE_RTC
        def_bool y
 -      depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
 +      depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
  
  config APB_TIMER
        def_bool y if X86_INTEL_MID
@@@ -1158,6 -1159,10 +1159,6 @@@ config X86_MCE_INJEC
          If you don't know what a machine check is and you don't do kernel
          QA it is safe to say n.
  
 -config X86_THERMAL_VECTOR
 -      def_bool y
 -      depends on X86_MCE_INTEL
 -
  source "arch/x86/events/Kconfig"
  
  config X86_LEGACY_VM86
@@@ -2860,6 -2865,7 +2861,6 @@@ config IA32_EMULATIO
        depends on X86_64
        select ARCH_WANT_OLD_COMPAT_IPC
        select BINFMT_ELF
 -      select COMPAT_BINFMT_ELF
        select COMPAT_OLD_SIGACTION
        help
          Include code to run legacy 32-bit programs under a
diff --combined include/asm-generic/vmlinux.lds.h
index 52dbd58f6810a0d5fca88c74c6d18e8e25c59c90,3f747de1934d77b3cf45091fae87ba0d78128315..a54e08d77789a2535a7002f61de6ea802fde4452
  #define THERMAL_TABLE(name)
  #endif
  
 +#ifdef CONFIG_DTPM
 +#define DTPM_TABLE()                                                  \
 +      . = ALIGN(8);                                                   \
 +      __dtpm_table = .;                                               \
 +      KEEP(*(__dtpm_table))                                           \
 +      __dtpm_table_end = .;
 +#else
 +#define DTPM_TABLE()
 +#endif
 +
  #define KERNEL_DTB()                                                  \
        STRUCT_ALIGN();                                                 \
        __dtb_start = .;                                                \
        . = ALIGN(8);                                                   \
        __start_static_call_sites = .;                                  \
        KEEP(*(.static_call_sites))                                     \
-       __stop_static_call_sites = .;
+       __stop_static_call_sites = .;                                   \
+       __start_static_call_tramp_key = .;                              \
+       KEEP(*(.static_call_tramp_key))                                 \
+       __stop_static_call_tramp_key = .;
  
  /*
   * Allow architectures to handle ro_after_init data on their
        ACPI_PROBE_TABLE(irqchip)                                       \
        ACPI_PROBE_TABLE(timer)                                         \
        THERMAL_TABLE(governor)                                         \
 +      DTPM_TABLE()                                                    \
        EARLYCON_TABLE()                                                \
        LSM_TABLE()                                                     \
        EARLY_LSM_TABLE()                                               \
diff --combined include/linux/rcupdate.h
index ebd8dcca4997d2134ceae47ab9d2b8e1764d0453,36c2119de7022f0cd63f231e54a65bd9a119e79c..bd04f722714f65dea4791076d3f63feb5e16e3c9
@@@ -33,8 -33,6 +33,8 @@@
  #define ULONG_CMP_GE(a, b)    (ULONG_MAX / 2 >= (a) - (b))
  #define ULONG_CMP_LT(a, b)    (ULONG_MAX / 2 < (a) - (b))
  #define ulong2long(a)         (*(long *)(&(a)))
 +#define USHORT_CMP_GE(a, b)   (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
 +#define USHORT_CMP_LT(a, b)   (USHRT_MAX / 2 < (unsigned short)((a) - (b)))
  
  /* Exported common interfaces */
  void call_rcu(struct rcu_head *head, rcu_callback_t func);
@@@ -112,12 -110,10 +112,14 @@@ static inline void rcu_user_exit(void) 
  
  #ifdef CONFIG_RCU_NOCB_CPU
  void rcu_init_nohz(void);
 +int rcu_nocb_cpu_offload(int cpu);
 +int rcu_nocb_cpu_deoffload(int cpu);
+ void rcu_nocb_flush_deferred_wakeup(void);
  #else /* #ifdef CONFIG_RCU_NOCB_CPU */
  static inline void rcu_init_nohz(void) { }
 +static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
 +static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
+ static inline void rcu_nocb_flush_deferred_wakeup(void) { }
  #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
  
  /**
@@@ -852,11 -848,19 +854,11 @@@ static inline notrace void rcu_read_unl
   */
  #define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
  
 -/*
 - * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
 - */
 -#define __kvfree_rcu(head, offset) \
 -      do { \
 -              BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
 -              kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
 -      } while (0)
 -
  /**
   * kfree_rcu() - kfree an object after a grace period.
 - * @ptr:      pointer to kfree
 - * @rhf:      the name of the struct rcu_head within the type of @ptr.
 + * @ptr: pointer to kfree for both single- and double-argument invocations.
 + * @rhf: the name of the struct rcu_head within the type of @ptr,
 + *       but only for double-argument invocations.
   *
   * Many rcu callbacks functions just call kfree() on the base structure.
   * These functions are trivial, but their size adds up, and furthermore
   * Because the functions are not allowed in the low-order 4096 bytes of
   * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
   * If the offset is larger than 4095 bytes, a compile-time error will
 - * be generated in __kvfree_rcu(). If this error is triggered, you can
 + * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
   * either fall back to use of call_rcu() or rearrange the structure to
   * position the rcu_head structure into the first 4096 bytes.
   *
   * The BUILD_BUG_ON check must not involve any function calls, hence the
   * checks are done in macros here.
   */
 -#define kfree_rcu(ptr, rhf)                                           \
 -do {                                                                  \
 -      typeof (ptr) ___p = (ptr);                                      \
 -                                                                      \
 -      if (___p)                                                       \
 -              __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
 -} while (0)
 +#define kfree_rcu kvfree_rcu
  
  /**
   * kvfree_rcu() - kvfree an object after a grace period.
        kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
  
  #define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
 -#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
 +#define kvfree_rcu_arg_2(ptr, rhf)                                    \
 +do {                                                                  \
 +      typeof (ptr) ___p = (ptr);                                      \
 +                                                                      \
 +      if (___p) {                                                                     \
 +              BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf)));   \
 +              kvfree_call_rcu(&((___p)->rhf), (rcu_callback_t)(unsigned long)         \
 +                      (offsetof(typeof(*(ptr)), rhf)));                               \
 +      }                                                                               \
 +} while (0)
 +
  #define kvfree_rcu_arg_1(ptr)                                 \
  do {                                                          \
        typeof(ptr) ___p = (ptr);                               \
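
A brief usage sketch of the reworked kfree_rcu()/kvfree_rcu() interface
described in the kernel-doc above (struct and function names here are
hypothetical; the rcu_head must still sit within the first 4096 bytes of
the object so the offset-encoding check can pass at build time):

  #include <linux/slab.h>
  #include <linux/rcupdate.h>

  struct foo {
          int a;
          struct rcu_head rcu;    /* offset checked via BUILD_BUG_ON() */
  };

  static void foo_release(struct foo *p)
  {
          /* Double-argument form: frees p after a grace period. */
          kfree_rcu(p, rcu);
  }
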
diff --combined init/Kconfig
index 17e955fdec97fd1dd6c5c52808d65b91b2f453c7,a1046963de2746a1e8d19e6e1ca34ddcce6bd837..096e1af5c5865eafe438b1c0673f7f7ad46f1711
@@@ -524,7 -524,7 +524,7 @@@ config SCHED_THERMAL_PRESSUR
          i.e. put less load on throttled CPUs than on non/less throttled ones.
  
          This requires the architecture to implement
-         arch_set_thermal_pressure() and arch_get_thermal_pressure().
+         arch_set_thermal_pressure() and arch_scale_thermal_pressure().
  
  config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
@@@ -2023,7 -2023,7 +2023,7 @@@ config PROFILIN
        bool "Profiling support"
        help
          Say Y here to enable the extended profiling support mechanisms used
 -        by profilers such as OProfile.
 +        by profilers.
  
  #
  # Place an empty function call at each tracepoint site. Can be
diff --combined kernel/events/core.c
index c37401e3e5f7326b2dbbe1762f5150d0bc28d6e2,3d890961f6e5451caf7fc99c16b818fa75f8c147..5fe7d63467629e9b5b57e033257fb8ee81a3ca35
@@@ -53,7 -53,6 +53,7 @@@
  #include <linux/min_heap.h>
  #include <linux/highmem.h>
  #include <linux/pgtable.h>
 +#include <linux/buildid.h>
  
  #include "internal.h"
  
@@@ -398,7 -397,6 +398,7 @@@ static atomic_t nr_ksymbol_events __rea
  static atomic_t nr_bpf_events __read_mostly;
  static atomic_t nr_cgroup_events __read_mostly;
  static atomic_t nr_text_poke_events __read_mostly;
 +static atomic_t nr_build_id_events __read_mostly;
  
  static LIST_HEAD(pmus);
  static DEFINE_MUTEX(pmus_lock);
@@@ -1597,50 -1595,91 +1597,91 @@@ static void perf_event_groups_init(stru
        groups->index = 0;
  }
  
+ static inline struct cgroup *event_cgroup(const struct perf_event *event)
+ {
+       struct cgroup *cgroup = NULL;
+ #ifdef CONFIG_CGROUP_PERF
+       if (event->cgrp)
+               cgroup = event->cgrp->css.cgroup;
+ #endif
+       return cgroup;
+ }
  /*
   * Compare function for event groups;
   *
   * Implements complex key that first sorts by CPU and then by virtual index
   * which provides ordering when rotating groups for the same CPU.
   */
- static bool
- perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+ static __always_inline int
+ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
+                     const u64 left_group_index, const struct perf_event *right)
  {
-       if (left->cpu < right->cpu)
-               return true;
-       if (left->cpu > right->cpu)
-               return false;
+       if (left_cpu < right->cpu)
+               return -1;
+       if (left_cpu > right->cpu)
+               return 1;
  
  #ifdef CONFIG_CGROUP_PERF
-       if (left->cgrp != right->cgrp) {
-               if (!left->cgrp || !left->cgrp->css.cgroup) {
-                       /*
-                        * Left has no cgroup but right does, no cgroups come
-                        * first.
-                        */
-                       return true;
+       {
+               const struct cgroup *right_cgroup = event_cgroup(right);
+               if (left_cgroup != right_cgroup) {
+                       if (!left_cgroup) {
+                               /*
+                                * Left has no cgroup but right does, no
+                                * cgroups come first.
+                                */
+                               return -1;
+                       }
+                       if (!right_cgroup) {
+                               /*
+                                * Right has no cgroup but left does, no
+                                * cgroups come first.
+                                */
+                               return 1;
+                       }
+                       /* Two dissimilar cgroups, order by id. */
+                       if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
+                               return -1;
+                       return 1;
                }
-               if (!right->cgrp || !right->cgrp->css.cgroup) {
-                       /*
-                        * Right has no cgroup but left does, no cgroups come
-                        * first.
-                        */
-                       return false;
-               }
-               /* Two dissimilar cgroups, order by id. */
-               if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
-                       return true;
-               return false;
        }
  #endif
  
-       if (left->group_index < right->group_index)
-               return true;
-       if (left->group_index > right->group_index)
-               return false;
+       if (left_group_index < right->group_index)
+               return -1;
+       if (left_group_index > right->group_index)
+               return 1;
+       return 0;
+ }
  
-       return false;
+ #define __node_2_pe(node) \
+       rb_entry((node), struct perf_event, group_node)
+ static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
+ {
+       struct perf_event *e = __node_2_pe(a);
+       return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
+                                    __node_2_pe(b)) < 0;
+ }
+ struct __group_key {
+       int cpu;
+       struct cgroup *cgroup;
+ };
+ static inline int __group_cmp(const void *key, const struct rb_node *node)
+ {
+       const struct __group_key *a = key;
+       const struct perf_event *b = __node_2_pe(node);
+       /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
+       return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
  }
  
  /*
@@@ -1652,27 -1691,9 +1693,9 @@@ static voi
  perf_event_groups_insert(struct perf_event_groups *groups,
                         struct perf_event *event)
  {
-       struct perf_event *node_event;
-       struct rb_node *parent;
-       struct rb_node **node;
        event->group_index = ++groups->index;
  
-       node = &groups->tree.rb_node;
-       parent = *node;
-       while (*node) {
-               parent = *node;
-               node_event = container_of(*node, struct perf_event, group_node);
-               if (perf_event_groups_less(event, node_event))
-                       node = &parent->rb_left;
-               else
-                       node = &parent->rb_right;
-       }
-       rb_link_node(&event->group_node, parent, node);
-       rb_insert_color(&event->group_node, &groups->tree);
+       rb_add(&event->group_node, &groups->tree, __group_less);
  }
  
  /*
@@@ -1720,45 -1741,17 +1743,17 @@@ static struct perf_event 
  perf_event_groups_first(struct perf_event_groups *groups, int cpu,
                        struct cgroup *cgrp)
  {
-       struct perf_event *node_event = NULL, *match = NULL;
-       struct rb_node *node = groups->tree.rb_node;
- #ifdef CONFIG_CGROUP_PERF
-       u64 node_cgrp_id, cgrp_id = 0;
-       if (cgrp)
-               cgrp_id = cgrp->kn->id;
- #endif
-       while (node) {
-               node_event = container_of(node, struct perf_event, group_node);
-               if (cpu < node_event->cpu) {
-                       node = node->rb_left;
-                       continue;
-               }
-               if (cpu > node_event->cpu) {
-                       node = node->rb_right;
-                       continue;
-               }
- #ifdef CONFIG_CGROUP_PERF
-               node_cgrp_id = 0;
-               if (node_event->cgrp && node_event->cgrp->css.cgroup)
-                       node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+       struct __group_key key = {
+               .cpu = cpu,
+               .cgroup = cgrp,
+       };
+       struct rb_node *node;
  
-               if (cgrp_id < node_cgrp_id) {
-                       node = node->rb_left;
-                       continue;
-               }
-               if (cgrp_id > node_cgrp_id) {
-                       node = node->rb_right;
-                       continue;
-               }
- #endif
-               match = node_event;
-               node = node->rb_left;
-       }
+       node = rb_find_first(&key, &groups->tree, __group_cmp);
+       if (node)
+               return __node_2_pe(node);
  
-       return match;
+       return NULL;
  }
  
  /*
  static struct perf_event *
  perf_event_groups_next(struct perf_event *event)
  {
-       struct perf_event *next;
- #ifdef CONFIG_CGROUP_PERF
-       u64 curr_cgrp_id = 0;
-       u64 next_cgrp_id = 0;
- #endif
-       next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
-       if (next == NULL || next->cpu != event->cpu)
-               return NULL;
- #ifdef CONFIG_CGROUP_PERF
-       if (event->cgrp && event->cgrp->css.cgroup)
-               curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+       struct __group_key key = {
+               .cpu = event->cpu,
+               .cgroup = event_cgroup(event),
+       };
+       struct rb_node *next;
  
-       if (next->cgrp && next->cgrp->css.cgroup)
-               next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+       next = rb_next_match(&key, &event->group_node, __group_cmp);
+       if (next)
+               return __node_2_pe(next);
  
-       if (curr_cgrp_id != next_cgrp_id)
-               return NULL;
- #endif
-       return next;
+       return NULL;
  }
  
  /*
@@@ -4675,8 -4658,6 +4660,8 @@@ static void unaccount_event(struct perf
                dec = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);
 +      if (event->attr.build_id)
 +              atomic_dec(&nr_build_id_events);
        if (event->attr.comm)
                atomic_dec(&nr_comm_events);
        if (event->attr.namespaces)
@@@ -8050,8 -8031,6 +8035,8 @@@ struct perf_mmap_event 
        u64                     ino;
        u64                     ino_generation;
        u32                     prot, flags;
 +      u8                      build_id[BUILD_ID_SIZE_MAX];
 +      u32                     build_id_size;
  
        struct {
                struct perf_event_header        header;
@@@ -8083,7 -8062,6 +8068,7 @@@ static void perf_event_mmap_output(stru
        struct perf_sample_data sample;
        int size = mmap_event->event_id.header.size;
        u32 type = mmap_event->event_id.header.type;
 +      bool use_build_id;
        int ret;
  
        if (!perf_event_mmap_match(event, data))
        mmap_event->event_id.pid = perf_event_pid(event, current);
        mmap_event->event_id.tid = perf_event_tid(event, current);
  
 +      use_build_id = event->attr.build_id && mmap_event->build_id_size;
 +
 +      if (event->attr.mmap2 && use_build_id)
 +              mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
 +
        perf_output_put(&handle, mmap_event->event_id);
  
        if (event->attr.mmap2) {
 -              perf_output_put(&handle, mmap_event->maj);
 -              perf_output_put(&handle, mmap_event->min);
 -              perf_output_put(&handle, mmap_event->ino);
 -              perf_output_put(&handle, mmap_event->ino_generation);
 +              if (use_build_id) {
 +                      u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
 +
 +                      __output_copy(&handle, size, 4);
 +                      __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
 +              } else {
 +                      perf_output_put(&handle, mmap_event->maj);
 +                      perf_output_put(&handle, mmap_event->min);
 +                      perf_output_put(&handle, mmap_event->ino);
 +                      perf_output_put(&handle, mmap_event->ino_generation);
 +              }
                perf_output_put(&handle, mmap_event->prot);
                perf_output_put(&handle, mmap_event->flags);
        }
@@@ -8255,9 -8221,6 +8240,9 @@@ got_name
  
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
  
 +      if (atomic_read(&nr_build_id_events))
 +              build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
 +
        perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);
@@@ -11194,8 -11157,6 +11179,8 @@@ static void account_event(struct perf_e
                inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
 +      if (event->attr.build_id)
 +              atomic_inc(&nr_build_id_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
        if (event->attr.namespaces)
diff --combined kernel/locking/rtmutex.c
index 47a6e0b8073d1d47e544230313a7da428d62ec67,57e380453bf963141d4f68e4756392a0466b7183..03b21135313cbefd19d3751aa878db61dfc1e5a6
@@@ -267,27 -267,18 +267,18 @@@ rt_mutex_waiter_equal(struct rt_mutex_w
        return 1;
  }
  
+ #define __node_2_waiter(node) \
+       rb_entry((node), struct rt_mutex_waiter, tree_entry)
+ static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+ {
+       return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b));
+ }
  static void
  rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
  {
-       struct rb_node **link = &lock->waiters.rb_root.rb_node;
-       struct rb_node *parent = NULL;
-       struct rt_mutex_waiter *entry;
-       bool leftmost = true;
-       while (*link) {
-               parent = *link;
-               entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
-               if (rt_mutex_waiter_less(waiter, entry)) {
-                       link = &parent->rb_left;
-               } else {
-                       link = &parent->rb_right;
-                       leftmost = false;
-               }
-       }
-       rb_link_node(&waiter->tree_entry, parent, link);
-       rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
+       rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
  }
  
  static void
@@@ -300,27 -291,18 +291,18 @@@ rt_mutex_dequeue(struct rt_mutex *lock
        RB_CLEAR_NODE(&waiter->tree_entry);
  }
  
+ #define __node_2_pi_waiter(node) \
+       rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+ static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+ {
+       return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+ }
  static void
  rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
  {
-       struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
-       struct rb_node *parent = NULL;
-       struct rt_mutex_waiter *entry;
-       bool leftmost = true;
-       while (*link) {
-               parent = *link;
-               entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
-               if (rt_mutex_waiter_less(waiter, entry)) {
-                       link = &parent->rb_left;
-               } else {
-                       link = &parent->rb_right;
-                       leftmost = false;
-               }
-       }
-       rb_link_node(&waiter->pi_tree_entry, parent, link);
-       rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
+       rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
  }
  
  static void
@@@ -1604,11 -1586,8 +1586,11 @@@ void __sched rt_mutex_unlock(struct rt_
  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
  
  /**
 - * Futex variant, that since futex variants do not use the fast-path, can be
 - * simple and will not need to retry.
 + * __rt_mutex_futex_unlock - Futex variant, that since futex variants
 + * do not use the fast-path, can be simple and will not need to retry.
 + *
 + * @lock:     The rt_mutex to be unlocked
 + * @wake_q:   The wake queue head from which to get the next lock waiter
   */
  bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
                                    struct wake_q_head *wake_q)
@@@ -1665,15 -1644,13 +1647,15 @@@ void rt_mutex_destroy(struct rt_mutex *
  EXPORT_SYMBOL_GPL(rt_mutex_destroy);
  
  /**
 - * __rt_mutex_init - initialize the rt lock
 + * __rt_mutex_init - initialize the rt_mutex
   *
 - * @lock: the rt lock to be initialized
 + * @lock:     The rt_mutex to be initialized
 + * @name:     The lock name used for debugging
 + * @key:      The lock class key used for debugging
   *
 - * Initialize the rt lock to unlocked state.
 + * Initialize the rt_mutex to unlocked state.
   *
 - * Initializing of a locked rt lock is not allowed
 + * Initializing of a locked rt_mutex is not allowed
   */
  void __rt_mutex_init(struct rt_mutex *lock, const char *name,
                     struct lock_class_key *key)
diff --combined kernel/rcu/tree.c
index 0f4a6a3c057b0120be8ff35f3f40be6bde7fa3aa,ce17b8477442fcbfbb6866a853b5726f1c5020aa..da6f5213fb74cb119f2acb6da8a3a6dcb6ac84c5
@@@ -83,9 -83,6 +83,9 @@@ static DEFINE_PER_CPU_SHARED_ALIGNED(st
        .dynticks_nesting = 1,
        .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
        .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
 +#ifdef CONFIG_RCU_NOCB_CPU
 +      .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
 +#endif
  };
  static struct rcu_state rcu_state = {
        .level = { &rcu_state.node[0] },
  static bool dump_tree;
  module_param(dump_tree, bool, 0444);
  /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
 -static bool use_softirq = true;
 +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
 +#ifndef CONFIG_PREEMPT_RT
  module_param(use_softirq, bool, 0444);
 +#endif
  /* Control rcu_node-tree auto-balancing at boot time. */
  static bool rcu_fanout_exact;
  module_param(rcu_fanout_exact, bool, 0444);
@@@ -649,7 -644,6 +649,6 @@@ static noinstr void rcu_eqs_enter(bool 
        trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
        WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
        rdp = this_cpu_ptr(&rcu_data);
-       do_nocb_deferred_wakeup(rdp);
        rcu_prepare_for_idle();
        rcu_preempt_deferred_qs(current);
  
@@@ -683,6 -677,50 +682,50 @@@ void rcu_idle_enter(void
  EXPORT_SYMBOL_GPL(rcu_idle_enter);
  
  #ifdef CONFIG_NO_HZ_FULL
+ #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
+ /*
+  * An empty function that will trigger a reschedule on
+  * IRQ tail once IRQs get re-enabled on userspace/guest resume.
+  */
+ static void late_wakeup_func(struct irq_work *work)
+ {
+ }
+ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+       IRQ_WORK_INIT(late_wakeup_func);
+ /*
+  * If either:
+  *
+  * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
+  * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
+  *
+  * In these cases the late RCU wake ups aren't supported in the resched loops and our
+  * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+  * get re-enabled again.
+  */
+ noinstr static void rcu_irq_work_resched(void)
+ {
+       struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+       if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
+               return;
+       if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+               return;
+       instrumentation_begin();
+       if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
+               irq_work_queue(this_cpu_ptr(&late_wakeup_work));
+       }
+       instrumentation_end();
+ }
+ #else
+ static inline void rcu_irq_work_resched(void) { }
+ #endif
  /**
   * rcu_user_enter - inform RCU that we are resuming userspace.
   *
  noinstr void rcu_user_enter(void)
  {
        lockdep_assert_irqs_disabled();
+       /*
+        * Other than generic entry implementation, we may be past the last
+        * rescheduling opportunity in the entry code. Trigger a self IPI
+        * that will fire and reschedule once we resume in user/guest mode.
+        */
+       rcu_irq_work_resched();
        rcu_eqs_enter(true);
  }
  #endif /* CONFIG_NO_HZ_FULL */
  
  /**
@@@ -1500,8 -1546,6 +1551,8 @@@ static bool rcu_accelerate_cbs(struct r
        if (!rcu_segcblist_pend_cbs(&rdp->cblist))
                return false;
  
 +      trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
 +
        /*
         * Callbacks are often registered with incomplete grace-period
         * information.  Something about the fact that getting exact
        else
                trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
  
 +      trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
 +
        return ret;
  }
  
@@@ -1774,7 -1816,7 +1825,7 @@@ static bool rcu_gp_init(void
         * go offline later.  Please also refer to "Hotplug CPU" section
         * of RCU's Requirements documentation.
         */
 -      rcu_state.gp_state = RCU_GP_ONOFF;
 +      WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
        rcu_for_each_leaf_node(rnp) {
                smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
                firstseq = READ_ONCE(rnp->ofl_seq);
         * The grace period cannot complete until the initialization
         * process finishes, because this kthread handles both.
         */
 -      rcu_state.gp_state = RCU_GP_INIT;
 +      WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
        rcu_for_each_node_breadth_first(rnp) {
                rcu_gp_slow(gp_init_delay);
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@@ -1939,22 -1981,17 +1990,22 @@@ static void rcu_gp_fqs_loop(void
        ret = 0;
        for (;;) {
                if (!ret) {
 -                      rcu_state.jiffies_force_qs = jiffies + j;
 +                      WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
 +                      /*
 +                       * jiffies_force_qs before RCU_GP_WAIT_FQS state
 +                       * update; required for stall checks.
 +                       */
 +                      smp_wmb();
                        WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
                                   jiffies + (j ? 3 * j : 2));
                }
                trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
                                       TPS("fqswait"));
 -              rcu_state.gp_state = RCU_GP_WAIT_FQS;
 +              WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
                ret = swait_event_idle_timeout_exclusive(
                                rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
                rcu_gp_torture_wait();
 -              rcu_state.gp_state = RCU_GP_DOING_FQS;
 +              WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
                /* Locking provides needed memory barriers. */
                /* If grace period done, leave loop. */
                if (!READ_ONCE(rnp->qsmask) &&
@@@ -2068,7 -2105,7 +2119,7 @@@ static void rcu_gp_cleanup(void
        trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
        rcu_seq_end(&rcu_state.gp_seq);
        ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
 -      rcu_state.gp_state = RCU_GP_IDLE;
 +      WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
        /* Check for GP requests since above loop. */
        rdp = this_cpu_ptr(&rcu_data);
        if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
@@@ -2107,12 -2144,12 +2158,12 @@@ static int __noreturn rcu_gp_kthread(vo
                for (;;) {
                        trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
                                               TPS("reqwait"));
 -                      rcu_state.gp_state = RCU_GP_WAIT_GPS;
 +                      WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
                        swait_event_idle_exclusive(rcu_state.gp_wq,
                                         READ_ONCE(rcu_state.gp_flags) &
                                         RCU_GP_FLAG_INIT);
                        rcu_gp_torture_wait();
 -                      rcu_state.gp_state = RCU_GP_DONE_GPS;
 +                      WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
                        /* Locking provides needed memory barrier. */
                        if (rcu_gp_init())
                                break;
                rcu_gp_fqs_loop();
  
                /* Handle grace-period end. */
 -              rcu_state.gp_state = RCU_GP_CLEANUP;
 +              WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
                rcu_gp_cleanup();
 -              rcu_state.gp_state = RCU_GP_CLEANED;
 +              WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
        }
  }
  
@@@ -2444,12 -2481,11 +2495,12 @@@ int rcutree_dead_cpu(unsigned int cpu
  static void rcu_do_batch(struct rcu_data *rdp)
  {
        int div;
 +      bool __maybe_unused empty;
        unsigned long flags;
        const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
        struct rcu_head *rhp;
        struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
 -      long bl, count;
 +      long bl, count = 0;
        long pending, tlimit = 0;
  
        /* If no callbacks are ready, just return. */
        rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
        if (offloaded)
                rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
 +
 +      trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
        rcu_nocb_unlock_irqrestore(rdp, flags);
  
        /* Invoke callbacks. */
        tick_dep_set_task(current, TICK_DEP_BIT_RCU);
        rhp = rcu_cblist_dequeue(&rcl);
 +
        for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
                rcu_callback_t f;
  
 +              count++;
                debug_rcu_head_unqueue(rhp);
  
                rcu_lock_acquire(&rcu_callback_map);
  
                /*
                 * Stop only if limit reached and CPU has something to do.
 -               * Note: The rcl structure counts down from zero.
                 */
 -              if (-rcl.len >= bl && !offloaded &&
 +              if (count >= bl && !offloaded &&
                    (need_resched() ||
                     (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
                        break;
                if (unlikely(tlimit)) {
                        /* only call local_clock() every 32 callbacks */
 -                      if (likely((-rcl.len & 31) || local_clock() < tlimit))
 +                      if (likely((count & 31) || local_clock() < tlimit))
                                continue;
                        /* Exceeded the time limit, so leave. */
                        break;
                }
 -              if (offloaded) {
 -                      WARN_ON_ONCE(in_serving_softirq());
 +              if (!in_serving_softirq()) {
                        local_bh_enable();
                        lockdep_assert_irqs_enabled();
                        cond_resched_tasks_rcu_qs();
  
        local_irq_save(flags);
        rcu_nocb_lock(rdp);
 -      count = -rcl.len;
        rdp->n_cbs_invoked += count;
        trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
                            is_idle_task(current), rcu_is_callbacks_kthread());
  
        /* Update counts and requeue any remaining callbacks. */
        rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
 -      smp_mb(); /* List handling before counting for rcu_barrier(). */
 -      rcu_segcblist_insert_count(&rdp->cblist, &rcl);
 +      rcu_segcblist_add_len(&rdp->cblist, -count);
  
        /* Reinstate batch limit if we have worked down the excess. */
        count = rcu_segcblist_n_cbs(&rdp->cblist);
         * The following usually indicates a double call_rcu().  To track
         * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
         */
 -      WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
 +      empty = rcu_segcblist_empty(&rdp->cblist);
 +      WARN_ON_ONCE(count == 0 && !empty);
        WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
 -                   count != 0 && rcu_segcblist_empty(&rdp->cblist));
 +                   count != 0 && empty);
 +      WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
 +      WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
  
        rcu_nocb_unlock_irqrestore(rdp, flags);
  
  void rcu_sched_clock_irq(int user)
  {
        trace_rcu_utilization(TPS("Start scheduler-tick"));
 +      lockdep_assert_irqs_disabled();
        raw_cpu_inc(rcu_data.ticks_this_gp);
        /* The load-acquire pairs with the store-release setting to true. */
        if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
        rcu_flavor_sched_clock_irq(user);
        if (rcu_pending(user))
                invoke_rcu_core();
 +      lockdep_assert_irqs_disabled();
  
        trace_rcu_utilization(TPS("End scheduler-tick"));
  }
@@@ -2708,7 -2739,7 +2759,7 @@@ static __latent_entropy void rcu_core(v
        unsigned long flags;
        struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
        struct rcu_node *rnp = rdp->mynode;
 -      const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
 +      const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
  
        if (cpu_is_offline(smp_processor_id()))
                return;
  
        /* No grace period and unregistered callbacks? */
        if (!rcu_gp_in_progress() &&
 -          rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
 -              local_irq_save(flags);
 +          rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
 +              rcu_nocb_lock_irqsave(rdp, flags);
                if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
                        rcu_accelerate_cbs_unlocked(rnp, rdp);
 -              local_irq_restore(flags);
 +              rcu_nocb_unlock_irqrestore(rdp, flags);
        }
  
        rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
  
        /* If there are callbacks ready, invoke them. */
 -      if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
 +      if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
            likely(READ_ONCE(rcu_scheduler_fully_active)))
                rcu_do_batch(rdp);
  
@@@ -2961,7 -2992,6 +3012,7 @@@ static void check_cb_ovld(struct rcu_da
  static void
  __call_rcu(struct rcu_head *head, rcu_callback_t func)
  {
 +      static atomic_t doublefrees;
        unsigned long flags;
        struct rcu_data *rdp;
        bool was_alldone;
                 * Use rcu:rcu_callback trace event to find the previous
                 * time callback was passed to __call_rcu().
                 */
 -              WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
 -                        head, head->func);
 +              if (atomic_inc_return(&doublefrees) < 4) {
 +                      pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
 +                      mem_dump_obj(head);
 +              }
                WRITE_ONCE(head->func, rcu_leak_callback);
                return;
        }
                trace_rcu_callback(rcu_state.name, head,
                                   rcu_segcblist_n_cbs(&rdp->cblist));
  
 +      trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
 +
        /* Go handle any RCU core processing required. */
        if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
                __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
@@@ -3523,7 -3549,6 +3574,7 @@@ void kvfree_call_rcu(struct rcu_head *h
                goto unlock_return;
        }
  
 +      kasan_record_aux_stack(ptr);
        success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
        if (!success) {
                run_page_cache_worker(krcp);
@@@ -3773,8 -3798,6 +3824,8 @@@ static int rcu_pending(int user
        struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
        struct rcu_node *rnp = rdp->mynode;
  
 +      lockdep_assert_irqs_disabled();
 +
        /* Check for CPU stalls, if enabled. */
        check_cpu_stall(rdp);
  
@@@ -4029,18 -4052,12 +4080,18 @@@ int rcutree_prepare_cpu(unsigned int cp
        rdp->qlen_last_fqs_check = 0;
        rdp->n_force_qs_snap = rcu_state.n_force_qs;
        rdp->blimit = blimit;
 -      if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
 -          !rcu_segcblist_is_offloaded(&rdp->cblist))
 -              rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
        rdp->dynticks_nesting = 1;      /* CPU not up, no tearing. */
        rcu_dynticks_eqs_online();
        raw_spin_unlock_rcu_node(rnp);          /* irqs remain disabled. */
 +      /*
 +       * Lock in case the CB/GP kthreads are still around handling
 +       * old callbacks (longer term we should flush all callbacks
 +       * before completing CPU offline)
 +       */
 +      rcu_nocb_lock(rdp);
 +      if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */
 +              rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
 +      rcu_nocb_unlock(rdp);
  
        /*
         * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
@@@ -4193,9 -4210,6 +4244,9 @@@ void rcu_report_dead(unsigned int cpu
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
        struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
  
 +      // Do any dangling deferred wakeups.
 +      do_nocb_deferred_wakeup(rdp);
 +
        /* QS for any half-done expedited grace period. */
        preempt_disable();
        rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
diff --combined kernel/rcu/tree.h
index 5d359b9f9fec404dceb945fda8abff54127b73f2,9226f4021a36dd341aa85597c21f6cf110a6197d..71821d59d95c58beed17f41cb4d62d315fe03a4e
@@@ -201,7 -201,6 +201,7 @@@ struct rcu_data 
        /* 5) Callback offloading. */
  #ifdef CONFIG_RCU_NOCB_CPU
        struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
 +      struct swait_queue_head nocb_state_wq; /* For offloading state changes */
        struct task_struct *nocb_gp_kthread;
        raw_spinlock_t nocb_lock;       /* Guard following pair of fields. */
        atomic_t nocb_lock_contended;   /* Contention experienced. */
  };
  
  /* Values for nocb_defer_wakeup field in struct rcu_data. */
 +#define RCU_NOCB_WAKE_OFF     -1
  #define RCU_NOCB_WAKE_NOT     0
  #define RCU_NOCB_WAKE         1
  #define RCU_NOCB_WAKE_FORCE   2
@@@ -435,7 -433,7 +435,7 @@@ static bool rcu_nocb_try_bypass(struct 
  static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
                                 unsigned long flags);
  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
  static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
  static void rcu_spawn_cpu_nocb_kthread(int cpu);
  static void __init rcu_spawn_nocb_kthreads(void);
diff --combined kernel/rcu/tree_plugin.h
index 231a0c6cf03c179580cd1c9a44097c369d20307d,cdc1b7651c0399ac5533651d684aa94a8e7f32c3..2d603771c7dce8164c56adb3a3f0564c0e1d95f8
@@@ -682,7 -682,6 +682,7 @@@ static void rcu_flavor_sched_clock_irq(
  {
        struct task_struct *t = current;
  
 +      lockdep_assert_irqs_disabled();
        if (user || rcu_is_cpu_rrupt_from_idle()) {
                rcu_note_voluntary_context_switch(current);
        }
@@@ -1632,8 -1631,8 +1632,8 @@@ bool rcu_is_nocb_cpu(int cpu
   * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
   * and this function releases it.
   */
- static void wake_nocb_gp(struct rcu_data *rdp, bool force,
-                          unsigned long flags)
+ static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
+                        unsigned long flags)
        __releases(rdp->nocb_lock)
  {
        bool needwake = false;
                trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
                                    TPS("AlreadyAwake"));
                rcu_nocb_unlock_irqrestore(rdp, flags);
-               return;
+               return false;
        }
        del_timer(&rdp->nocb_timer);
        rcu_nocb_unlock_irqrestore(rdp, flags);
        raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
        if (needwake)
                wake_up_process(rdp_gp->nocb_gp_kthread);
+       return needwake;
  }
  
  /*
  static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
                               const char *reason)
  {
 +      if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
 +              return;
        if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
                mod_timer(&rdp->nocb_timer, jiffies + 1);
        if (rdp->nocb_defer_wakeup < waketype)
@@@ -1931,52 -1930,6 +1933,52 @@@ static void do_nocb_bypass_wakeup_timer
        __call_rcu_nocb_wake(rdp, true, flags);
  }
  
 +/*
 + * Check if we ignore this rdp.
 + *
 + * We check this without holding the nocb lock, but
 + * the ordering below makes sure we don't miss a
 + * freshly offloaded rdp:
 + *
 + *  rdp_offload_toggle()        nocb_gp_enabled_cb()
 + * -------------------------   ----------------------------
 + *    WRITE flags                 LOCK nocb_gp_lock
 + *    LOCK nocb_gp_lock           READ/WRITE nocb_gp_sleep
 + *    READ/WRITE nocb_gp_sleep    UNLOCK nocb_gp_lock
 + *    UNLOCK nocb_gp_lock         READ flags
 + */
 +static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
 +{
 +      u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
 +
 +      return rcu_segcblist_test_flags(&rdp->cblist, flags);
 +}
 +
 +static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state)
 +{
 +      struct rcu_segcblist *cblist = &rdp->cblist;
 +
 +      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
 +              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
 +                      rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
 +                      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
 +                              *needwake_state = true;
 +              }
 +              return true;
 +      }
 +
 +      /*
 +       * De-offloading. Clear our flag and notify the de-offload worker.
 + * We will ignore this rdp until it gets re-offloaded.
 +       */
 +      WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
 +      rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
 +      if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
 +              *needwake_state = true;
 +      return false;
 +}
 +
 +
  /*
   * No-CBs GP kthreads come here to wait for additional callbacks to show up
   * or for grace periods to end.
@@@ -2005,18 -1958,8 +2007,18 @@@ static void nocb_gp_wait(struct rcu_dat
         */
        WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
        for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
 +              bool needwake_state = false;
 +
 +              if (!nocb_gp_enabled_cb(rdp))
 +                      continue;
                trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
                rcu_nocb_lock_irqsave(rdp, flags);
 +              if (!nocb_gp_update_state(rdp, &needwake_state)) {
 +                      rcu_nocb_unlock_irqrestore(rdp, flags);
 +                      if (needwake_state)
 +                              swake_up_one(&rdp->nocb_state_wq);
 +                      continue;
 +              }
                bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
                if (bypass_ncbs &&
                    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
                        bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
                } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
                        rcu_nocb_unlock_irqrestore(rdp, flags);
 +                      if (needwake_state)
 +                              swake_up_one(&rdp->nocb_state_wq);
                        continue; /* No callbacks here, try next. */
                }
                if (bypass_ncbs) {
                }
                if (needwake_gp)
                        rcu_gp_kthread_wake();
 +              if (needwake_state)
 +                      swake_up_one(&rdp->nocb_state_wq);
        }
  
        my_rdp->nocb_gp_bypass = bypass;
@@@ -2144,27 -2083,14 +2146,27 @@@ static int rcu_nocb_gp_kthread(void *ar
        return 0;
  }
  
 +static inline bool nocb_cb_can_run(struct rcu_data *rdp)
 +{
 +      u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
 +      return rcu_segcblist_test_flags(&rdp->cblist, flags);
 +}
 +
 +static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
 +{
 +      return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
 +}
 +
  /*
   * Invoke any ready callbacks from the corresponding no-CBs CPU,
   * then, if there are no more, wait for more to appear.
   */
  static void nocb_cb_wait(struct rcu_data *rdp)
  {
 +      struct rcu_segcblist *cblist = &rdp->cblist;
        unsigned long cur_gp_seq;
        unsigned long flags;
 +      bool needwake_state = false;
        bool needwake_gp = false;
        struct rcu_node *rnp = rdp->mynode;
  
        local_bh_enable();
        lockdep_assert_irqs_enabled();
        rcu_nocb_lock_irqsave(rdp, flags);
 -      if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
 +      if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
            rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
            raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
                needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
                raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
        }
 -      if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
 -              rcu_nocb_unlock_irqrestore(rdp, flags);
 -              if (needwake_gp)
 -                      rcu_gp_kthread_wake();
 -              return;
 -      }
  
 -      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
        WRITE_ONCE(rdp->nocb_cb_sleep, true);
 +
 +      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
 +              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
 +                      rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
 +                      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
 +                              needwake_state = true;
 +              }
 +              if (rcu_segcblist_ready_cbs(cblist))
 +                      WRITE_ONCE(rdp->nocb_cb_sleep, false);
 +      } else {
 +              /*
 +               * De-offloading. Clear our flag and notify the de-offload worker.
 +               * We won't touch the callbacks and will keep sleeping
 +               * until we get re-offloaded.
 +               */
 +              WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
 +              rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
 +              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
 +                      needwake_state = true;
 +      }
 +
 +      if (rdp->nocb_cb_sleep)
 +              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
 +
        rcu_nocb_unlock_irqrestore(rdp, flags);
        if (needwake_gp)
                rcu_gp_kthread_wake();
 -      swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
 -                               !READ_ONCE(rdp->nocb_cb_sleep));
 -      if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
 -              /* ^^^ Ensure CB invocation follows _sleep test. */
 -              return;
 -      }
 -      WARN_ON(signal_pending(current));
 -      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
 +
 +      if (needwake_state)
 +              swake_up_one(&rdp->nocb_state_wq);
 +
 +      do {
 +              swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
 +                                                  nocb_cb_wait_cond(rdp));
 +
 +              // VVV Ensure CB invocation follows _sleep test.
 +              if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
 +                      WARN_ON(signal_pending(current));
 +                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
 +              }
 +      } while (!nocb_cb_can_run(rdp));
  }
  
  /*
@@@ -2247,24 -2150,27 +2249,27 @@@ static int rcu_nocb_cb_kthread(void *ar
  /* Is a deferred wakeup of rcu_nocb_kthread() required? */
  static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
  {
 -      return READ_ONCE(rdp->nocb_defer_wakeup);
 +      return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
  }
  
  /* Do a deferred wakeup of rcu_nocb_kthread(). */
- static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
  {
        unsigned long flags;
        int ndw;
+       int ret;
  
        rcu_nocb_lock_irqsave(rdp, flags);
        if (!rcu_nocb_need_deferred_wakeup(rdp)) {
                rcu_nocb_unlock_irqrestore(rdp, flags);
-               return;
+               return false;
        }
        ndw = READ_ONCE(rdp->nocb_defer_wakeup);
        WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
-       wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+       ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
        trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+       return ret;
  }
  
  /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
@@@ -2280,201 -2186,19 +2285,208 @@@ static void do_nocb_deferred_wakeup_tim
   * This means we do an inexact common-case check.  Note that if
   * we miss, ->nocb_timer will eventually clean things up.
   */
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
  {
        if (rcu_nocb_need_deferred_wakeup(rdp))
-               do_nocb_deferred_wakeup_common(rdp);
+               return do_nocb_deferred_wakeup_common(rdp);
+       return false;
+ }
+ void rcu_nocb_flush_deferred_wakeup(void)
+ {
+       do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
  }
+ EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
  
 +static int rdp_offload_toggle(struct rcu_data *rdp,
 +                             bool offload, unsigned long flags)
 +      __releases(rdp->nocb_lock)
 +{
 +      struct rcu_segcblist *cblist = &rdp->cblist;
 +      struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 +      bool wake_gp = false;
 +
 +      rcu_segcblist_offload(cblist, offload);
 +
 +      if (rdp->nocb_cb_sleep)
 +              rdp->nocb_cb_sleep = false;
 +      rcu_nocb_unlock_irqrestore(rdp, flags);
 +
 +      /*
 +       * Ignore former value of nocb_cb_sleep and force wake up as it could
 +       * have been spuriously set to false already.
 +       */
 +      swake_up_one(&rdp->nocb_cb_wq);
 +
 +      raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 +      if (rdp_gp->nocb_gp_sleep) {
 +              rdp_gp->nocb_gp_sleep = false;
 +              wake_gp = true;
 +      }
 +      raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 +
 +      if (wake_gp)
 +              wake_up_process(rdp_gp->nocb_gp_kthread);
 +
 +      return 0;
 +}
 +
 +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
 +{
 +      struct rcu_segcblist *cblist = &rdp->cblist;
 +      unsigned long flags;
 +      int ret;
 +
 +      pr_info("De-offloading %d\n", rdp->cpu);
 +
 +      rcu_nocb_lock_irqsave(rdp, flags);
 +      /*
 +       * If there is still offloaded work pending, the offline
 +       * CPU won't help much with handling it.
 +       */
 +      if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) {
 +              rcu_nocb_unlock_irqrestore(rdp, flags);
 +              return -EBUSY;
 +      }
 +
 +      ret = rdp_offload_toggle(rdp, false, flags);
 +      swait_event_exclusive(rdp->nocb_state_wq,
 +                            !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
 +                                                      SEGCBLIST_KTHREAD_GP));
 +      rcu_nocb_lock_irqsave(rdp, flags);
 +      /* Make sure nocb timer won't stay around */
 +      WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
 +      rcu_nocb_unlock_irqrestore(rdp, flags);
 +      del_timer_sync(&rdp->nocb_timer);
 +
 +      /*
 +       * Flush bypass. While IRQs are disabled and once we set
 +       * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be
 +       * enqueued on bypass.
 +       */
 +      rcu_nocb_lock_irqsave(rdp, flags);
 +      rcu_nocb_flush_bypass(rdp, NULL, jiffies);
 +      rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
 +      /*
 +       * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
 +       * rcu_nocb_unlock_irqrestore() anymore. Theoretically we
 +       * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs
 +       * disabled now, but let's be paranoid.
 +       */
 +      raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 +
 +      return ret;
 +}
 +
 +static long rcu_nocb_rdp_deoffload(void *arg)
 +{
 +      struct rcu_data *rdp = arg;
 +
 +      WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
 +      return __rcu_nocb_rdp_deoffload(rdp);
 +}
 +
 +int rcu_nocb_cpu_deoffload(int cpu)
 +{
 +      struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 +      int ret = 0;
 +
 +      if (rdp == rdp->nocb_gp_rdp) {
 +              pr_info("Can't deoffload an rdp GP leader (yet)\n");
 +              return -EINVAL;
 +      }
 +      mutex_lock(&rcu_state.barrier_mutex);
 +      cpus_read_lock();
 +      if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
 +              if (cpu_online(cpu))
 +                      ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
 +              else
 +                      ret = __rcu_nocb_rdp_deoffload(rdp);
 +              if (!ret)
 +                      cpumask_clear_cpu(cpu, rcu_nocb_mask);
 +      }
 +      cpus_read_unlock();
 +      mutex_unlock(&rcu_state.barrier_mutex);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
 +
 +static int __rcu_nocb_rdp_offload(struct rcu_data *rdp)
 +{
 +      struct rcu_segcblist *cblist = &rdp->cblist;
 +      unsigned long flags;
 +      int ret;
 +
 +      /*
 +       * For now we only support re-offload, i.e. the rdp must have been
 +       * offloaded on boot first.
 +       */
 +      if (!rdp->nocb_gp_rdp)
 +              return -EINVAL;
 +
 +      pr_info("Offloading %d\n", rdp->cpu);
 +      /*
 +       * Can't use rcu_nocb_lock_irqsave() while we are in
 +       * SEGCBLIST_SOFTIRQ_ONLY mode.
 +       */
 +      raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
 +      /* Re-enable nocb timer */
 +      WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
 +      /*
 +       * We didn't take the nocb lock while working on the
 +       * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
 +       * All modifications previously done on rdp->cblist must be
 +       * visible remotely to the nocb kthreads upon wake-up, after
 +       * they read the cblist flags.
 +       *
 +       * The layout against nocb_lock enforces that ordering:
 +       *
 +       *  __rcu_nocb_rdp_offload()   nocb_cb_wait()/nocb_gp_wait()
 +       * -------------------------   ----------------------------
 +       *      WRITE callbacks           rcu_nocb_lock()
 +       *      rcu_nocb_lock()           READ flags
 +       *      WRITE flags               READ callbacks
 +       *      rcu_nocb_unlock()         rcu_nocb_unlock()
 +       */
 +      ret = rdp_offload_toggle(rdp, true, flags);
 +      swait_event_exclusive(rdp->nocb_state_wq,
 +                            rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
 +                            rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
 +
 +      return ret;
 +}
 +
 +static long rcu_nocb_rdp_offload(void *arg)
 +{
 +      struct rcu_data *rdp = arg;
 +
 +      WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
 +      return __rcu_nocb_rdp_offload(rdp);
 +}
 +
 +int rcu_nocb_cpu_offload(int cpu)
 +{
 +      struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 +      int ret = 0;
 +
 +      mutex_lock(&rcu_state.barrier_mutex);
 +      cpus_read_lock();
 +      if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
 +              if (cpu_online(cpu))
 +                      ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
 +              else
 +                      ret = __rcu_nocb_rdp_offload(rdp);
 +              if (!ret)
 +                      cpumask_set_cpu(cpu, rcu_nocb_mask);
 +      }
 +      cpus_read_unlock();
 +      mutex_unlock(&rcu_state.barrier_mutex);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
 +
  void __init rcu_init_nohz(void)
  {
        int cpu;
                rdp = per_cpu_ptr(&rcu_data, cpu);
                if (rcu_segcblist_empty(&rdp->cblist))
                        rcu_segcblist_init(&rdp->cblist);
 -              rcu_segcblist_offload(&rdp->cblist);
 +              rcu_segcblist_offload(&rdp->cblist, true);
 +              rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
 +              rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
        }
        rcu_organize_nocb_kthreads();
  }
@@@ -2529,7 -2251,6 +2541,7 @@@ static void __init rcu_boot_init_nocb_p
  {
        init_swait_queue_head(&rdp->nocb_cb_wq);
        init_swait_queue_head(&rdp->nocb_gp_wq);
 +      init_swait_queue_head(&rdp->nocb_state_wq);
        raw_spin_lock_init(&rdp->nocb_lock);
        raw_spin_lock_init(&rdp->nocb_bypass_lock);
        raw_spin_lock_init(&rdp->nocb_gp_lock);
@@@ -2672,19 -2393,6 +2684,19 @@@ void rcu_bind_current_to_nocb(void
  }
  EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
  
 +// The ->on_cpu field is available only in CONFIG_SMP=y, so...
 +#ifdef CONFIG_SMP
 +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
 +{
 +      return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
 +}
 +#else // #ifdef CONFIG_SMP
 +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
 +{
 +      return "";
 +}
 +#endif // #else #ifdef CONFIG_SMP
 +
  /*
   * Dump out nocb grace-period kthread state for the specified rcu_data
   * structure.
@@@ -2693,7 -2401,7 +2705,7 @@@ static void show_rcu_nocb_gp_state(stru
  {
        struct rcu_node *rnp = rdp->mynode;
  
 -      pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
 +      pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
                rdp->cpu,
                "kK"[!!rdp->nocb_gp_kthread],
                "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
                ".B"[!!rdp->nocb_gp_bypass],
                ".G"[!!rdp->nocb_gp_gp],
                (long)rdp->nocb_gp_seq,
 -              rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
 +              rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
 +              rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
 +              rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
 +              show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
  }
  
  /* Dump out nocb kthread state for the specified rcu_data structure. */
  static void show_rcu_nocb_state(struct rcu_data *rdp)
  {
 +      char bufw[20];
 +      char bufr[20];
        struct rcu_segcblist *rsclp = &rdp->cblist;
        bool waslocked;
        bool wastimer;
        if (rdp->nocb_gp_rdp == rdp)
                show_rcu_nocb_gp_state(rdp);
  
 -      pr_info("   CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
 +      sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
 +      sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
 +      pr_info("   CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
                rdp->cpu, rdp->nocb_gp_rdp->cpu,
 +              rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
                "kK"[!!rdp->nocb_cb_kthread],
                "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
                "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
                jiffies - rdp->nocb_nobypass_last,
                rdp->nocb_nobypass_count,
                ".D"[rcu_segcblist_ready_cbs(rsclp)],
 -              ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
 -              ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
 -              ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
 +              ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
 +              rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
 +              ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
 +              rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
 +              ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
                ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
 -              rcu_segcblist_n_cbs(&rdp->cblist));
 +              rcu_segcblist_n_cbs(&rdp->cblist),
 +              rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
 +              rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
 +              show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
  
        /* It is OK for GP kthreads to have GP state. */
        if (rdp->nocb_gp_rdp == rdp)
@@@ -2835,8 -2530,9 +2847,9 @@@ static int rcu_nocb_need_deferred_wakeu
        return false;
  }
  
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
  {
+       return false;
  }
  
  static void rcu_spawn_cpu_nocb_kthread(int cpu)
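
The tree_plugin.h changes above add a runtime interface for toggling per-CPU
callback offloading: rcu_nocb_cpu_offload() and rcu_nocb_cpu_deoffload(), both
exported GPL. A minimal usage sketch, assuming the declarations are reachable
via <linux/rcupdate.h> under CONFIG_RCU_NOCB_CPU; the toggle_nocb() wrapper is
a made-up name:

#include <linux/printk.h>
#include <linux/rcupdate.h>

static int toggle_nocb(int cpu, bool offload)
{
	/*
	 * Both helpers return 0 on success or a negative errno, e.g.
	 * -EINVAL when trying to de-offload an rdp GP leader.
	 */
	int ret = offload ? rcu_nocb_cpu_offload(cpu)
			  : rcu_nocb_cpu_deoffload(cpu);

	if (ret)
		pr_warn("nocb %s failed for CPU %d: %d\n",
			offload ? "offload" : "deoffload", cpu, ret);
	return ret;
}
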
diff --combined kernel/sched/core.c
index 22f6748c16f68111cef8a8da74a39d26fd860de4,88a2e2bdbabeb8a11c631907d2c7f5da0b4c0a1b..7f5ffc8784110736f4e3705f2f5ad49374192ba9
@@@ -355,8 -355,9 +355,9 @@@ static enum hrtimer_restart hrtick(stru
  static void __hrtick_restart(struct rq *rq)
  {
        struct hrtimer *timer = &rq->hrtick_timer;
+       ktime_t time = rq->hrtick_time;
  
-       hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+       hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
  }
  
  /*
@@@ -380,7 -381,6 +381,6 @@@ static void __hrtick_start(void *arg
  void hrtick_start(struct rq *rq, u64 delay)
  {
        struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time;
        s64 delta;
  
        /*
         * doesn't make sense and can cause timer DoS.
         */
        delta = max_t(s64, delay, 10000LL);
-       time = ktime_add_ns(timer->base->get_time(), delta);
-       hrtimer_set_expires(timer, time);
+       rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
  
        if (rq == this_rq())
                __hrtick_restart(rq);
@@@ -3478,7 -3476,7 +3476,7 @@@ out
  
  /**
   * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
 - * @p: Process for which the function is to be invoked.
 + * @p: Process for which the function is to be invoked, can be @current.
   * @func: Function to invoke.
   * @arg: Argument to function.
   *
   */
  bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
  {
 -      bool ret = false;
        struct rq_flags rf;
 +      bool ret = false;
        struct rq *rq;
  
 -      lockdep_assert_irqs_enabled();
 -      raw_spin_lock_irq(&p->pi_lock);
 +      raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
        if (p->on_rq) {
                rq = __task_rq_lock(p, &rf);
                if (task_rq(p) == rq)
                                ret = func(p, arg);
                }
        }
 -      raw_spin_unlock_irq(&p->pi_lock);
 +      raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
        return ret;
  }
  
@@@ -4970,7 -4969,7 +4968,7 @@@ static void __sched notrace __schedule(
  
        schedule_debug(prev, preempt);
  
-       if (sched_feat(HRTICK))
+       if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
                hrtick_clear(rq);
  
        local_irq_disable();
@@@ -5264,6 -5263,12 +5262,12 @@@ asmlinkage __visible void __sched notra
  NOKPROBE_SYMBOL(preempt_schedule);
  EXPORT_SYMBOL(preempt_schedule);
  
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+ EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+ #endif
  /**
   * preempt_schedule_notrace - preempt_schedule called by tracing
   *
@@@ -5316,8 -5321,197 +5320,197 @@@ asmlinkage __visible void __sched notra
  }
  EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
  
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+ #endif
  #endif /* CONFIG_PREEMPTION */
  
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ #include <linux/entry-common.h>
+ /*
+  * SC:cond_resched
+  * SC:might_resched
+  * SC:preempt_schedule
+  * SC:preempt_schedule_notrace
+  * SC:irqentry_exit_cond_resched
+  *
+  *
+  * NONE:
+  *   cond_resched               <- __cond_resched
+  *   might_resched              <- RET0
+  *   preempt_schedule           <- NOP
+  *   preempt_schedule_notrace   <- NOP
+  *   irqentry_exit_cond_resched <- NOP
+  *
+  * VOLUNTARY:
+  *   cond_resched               <- __cond_resched
+  *   might_resched              <- __cond_resched
+  *   preempt_schedule           <- NOP
+  *   preempt_schedule_notrace   <- NOP
+  *   irqentry_exit_cond_resched <- NOP
+  *
+  * FULL:
+  *   cond_resched               <- RET0
+  *   might_resched              <- RET0
+  *   preempt_schedule           <- preempt_schedule
+  *   preempt_schedule_notrace   <- preempt_schedule_notrace
+  *   irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+  */
+ enum {
+       preempt_dynamic_none = 0,
+       preempt_dynamic_voluntary,
+       preempt_dynamic_full,
+ };
+ static int preempt_dynamic_mode = preempt_dynamic_full;
+ static int sched_dynamic_mode(const char *str)
+ {
+       if (!strcmp(str, "none"))
+               return 0;
+       if (!strcmp(str, "voluntary"))
+               return 1;
+       if (!strcmp(str, "full"))
+               return 2;
+       return -1;
+ }
+ static void sched_dynamic_update(int mode)
+ {
+       /*
+        * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+        * the ZERO state, which is invalid.
+        */
+       static_call_update(cond_resched, __cond_resched);
+       static_call_update(might_resched, __cond_resched);
+       static_call_update(preempt_schedule, __preempt_schedule_func);
+       static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+       static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+       switch (mode) {
+       case preempt_dynamic_none:
+               static_call_update(cond_resched, __cond_resched);
+               static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+               static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+               static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+               static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+               pr_info("Dynamic Preempt: none\n");
+               break;
+       case preempt_dynamic_voluntary:
+               static_call_update(cond_resched, __cond_resched);
+               static_call_update(might_resched, __cond_resched);
+               static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+               static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+               static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+               pr_info("Dynamic Preempt: voluntary\n");
+               break;
+       case preempt_dynamic_full:
+               static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
+               static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+               static_call_update(preempt_schedule, __preempt_schedule_func);
+               static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+               static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+               pr_info("Dynamic Preempt: full\n");
+               break;
+       }
+       preempt_dynamic_mode = mode;
+ }
+ static int __init setup_preempt_mode(char *str)
+ {
+       int mode = sched_dynamic_mode(str);
+       if (mode < 0) {
+               pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+               return 1;
+       }
+       sched_dynamic_update(mode);
+       return 0;
+ }
+ __setup("preempt=", setup_preempt_mode);
+ #ifdef CONFIG_SCHED_DEBUG
+ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+                                  size_t cnt, loff_t *ppos)
+ {
+       char buf[16];
+       int mode;
+       if (cnt > 15)
+               cnt = 15;
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+       buf[cnt] = 0;
+       mode = sched_dynamic_mode(strstrip(buf));
+       if (mode < 0)
+               return mode;
+       sched_dynamic_update(mode);
+       *ppos += cnt;
+       return cnt;
+ }
+ static int sched_dynamic_show(struct seq_file *m, void *v)
+ {
+       static const char * preempt_modes[] = {
+               "none", "voluntary", "full"
+       };
+       int i;
+       for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+               if (preempt_dynamic_mode == i)
+                       seq_puts(m, "(");
+               seq_puts(m, preempt_modes[i]);
+               if (preempt_dynamic_mode == i)
+                       seq_puts(m, ")");
+               seq_puts(m, " ");
+       }
+       seq_puts(m, "\n");
+       return 0;
+ }
+ static int sched_dynamic_open(struct inode *inode, struct file *filp)
+ {
+       return single_open(filp, sched_dynamic_show, NULL);
+ }
+ static const struct file_operations sched_dynamic_fops = {
+       .open           = sched_dynamic_open,
+       .write          = sched_dynamic_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+ };
+ static __init int sched_init_debug_dynamic(void)
+ {
+       debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+       return 0;
+ }
+ late_initcall(sched_init_debug_dynamic);
+ #endif /* CONFIG_SCHED_DEBUG */
+ #endif /* CONFIG_PREEMPT_DYNAMIC */
  /*
   * This is the entry point to schedule() from kernel preemption
   * off of irq context.
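
The CONFIG_PREEMPT_DYNAMIC code above selects the preemption model by
repointing static calls at boot (preempt=) or at runtime (the debugfs file).
A stripped-down sketch of that static-call pattern, detached from the
scheduler; the demo_* names are invented and not kernel symbols:

#include <linux/static_call.h>

static int demo_full(void) { return 1; }	/* "full"-like behaviour */
static int demo_none(void) { return 0; }	/* "none"-like behaviour */

DEFINE_STATIC_CALL(demo_resched, demo_full);

/*
 * Repoint the call site at runtime, the way sched_dynamic_update()
 * does for cond_resched/might_resched/preempt_schedule*.
 */
static void demo_set_mode(bool full)
{
	if (full)
		static_call_update(demo_resched, demo_full);
	else
		static_call_update(demo_resched, demo_none);
}

static int demo_call_site(void)
{
	return static_call(demo_resched)();	/* patched direct call */
}
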
@@@ -5615,8 -5809,12 +5808,12 @@@ SYSCALL_DEFINE1(nice, int, increment
   * @p: the task in question.
   *
   * Return: The priority value as seen by users in /proc.
-  * RT tasks are offset by -200. Normal tasks are centered
-  * around 0, value goes from -16 to +15.
+  *
+  * sched policy         return value   kernel prio    user prio/nice
+  *
+  * normal, batch, idle     [0 ... 39]  [100 ... 139]          0/[-20 ... 19]
+  * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
+  * deadline                     -101             -1           0
   */
  int task_prio(const struct task_struct *p)
  {
@@@ -5675,6 -5873,120 +5872,120 @@@ struct task_struct *idle_task(int cpu
        return cpu_rq(cpu)->idle;
  }
  
+ #ifdef CONFIG_SMP
+ /*
+  * This function computes an effective utilization for the given CPU, to be
+  * used for frequency selection given the linear relation: f = u * f_max.
+  *
+  * The scheduler tracks the following metrics:
+  *
+  *   cpu_util_{cfs,rt,dl,irq}()
+  *   cpu_bw_dl()
+  *
+  * Where the cfs,rt and dl util numbers are tracked with the same metric and
+  * synchronized windows and are thus directly comparable.
+  *
+  * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+  * which excludes things like IRQ and steal-time. These latter are then accrued
+  * in the irq utilization.
+  *
+  * The DL bandwidth number, on the other hand, is not a measured metric but a
+  * value computed from the task model parameters; it gives the minimal
+  * utilization required to meet deadlines.
+  */
+ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+                                unsigned long max, enum cpu_util_type type,
+                                struct task_struct *p)
+ {
+       unsigned long dl_util, util, irq;
+       struct rq *rq = cpu_rq(cpu);
+       if (!uclamp_is_used() &&
+           type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
+               return max;
+       }
+       /*
+        * Early check to see if IRQ/steal time saturates the CPU; this can
+        * happen because of inaccuracies in how we track these -- see
+        * update_irq_load_avg().
+        */
+       irq = cpu_util_irq(rq);
+       if (unlikely(irq >= max))
+               return max;
+       /*
+        * Because the time spent on RT/DL tasks is visible as 'lost' time to
+        * CFS tasks and we use the same metric to track the effective
+        * utilization (PELT windows are synchronized) we can directly add them
+        * to obtain the CPU's actual utilization.
+        *
+        * CFS and RT utilization can be boosted or capped, depending on
+        * utilization clamp constraints requested by currently RUNNABLE
+        * tasks.
+        * When there are no CFS RUNNABLE tasks, clamps are released and
+        * frequency will be gracefully reduced with the utilization decay.
+        */
+       util = util_cfs + cpu_util_rt(rq);
+       if (type == FREQUENCY_UTIL)
+               util = uclamp_rq_util_with(rq, util, p);
+       dl_util = cpu_util_dl(rq);
+       /*
+        * For frequency selection we do not make cpu_util_dl() a permanent part
+        * of this sum because we want to use cpu_bw_dl() later on, but we need
+        * to check if the CFS+RT+DL sum is saturated (i.e. no idle time) such
+        * that we select f_max when there is no idle time.
+        *
+        * NOTE: numerical errors or stop class might cause us to not quite hit
+        * saturation when we should -- something for later.
+        */
+       if (util + dl_util >= max)
+               return max;
+       /*
+        * OTOH, for energy computation we need the estimated running time, so
+        * include util_dl and ignore dl_bw.
+        */
+       if (type == ENERGY_UTIL)
+               util += dl_util;
+       /*
+        * There is still idle time; further improve the number by using the
+        * irq metric. Because IRQ/steal time is hidden from the task clock we
+        * need to scale the task numbers:
+        *
+        *              max - irq
+        *   U' = irq + --------- * U
+        *                 max
+        */
+       util = scale_irq_capacity(util, irq, max);
+       util += irq;
+       /*
+        * Bandwidth required by DEADLINE must always be granted while, for
+        * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+        * to gracefully reduce the frequency when no tasks show up for longer
+        * periods of time.
+        *
+        * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+        * bw_dl as requested freq. However, cpufreq is not yet ready for such
+        * an interface. So, we only do the latter for now.
+        */
+       if (type == FREQUENCY_UTIL)
+               util += cpu_bw_dl(rq);
+       return min(max, util);
+ }
+ unsigned long sched_cpu_util(int cpu, unsigned long max)
+ {
+       return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+                                 ENERGY_UTIL, NULL);
+ }
+ #endif /* CONFIG_SMP */
  /**
   * find_process_by_pid - find a process with a matching PID value.
   * @pid: the pid in question.
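
To make the IRQ scaling step in effective_cpu_util() above concrete, here is a
made-up numeric example (arbitrary capacity units, max = 1024): with irq = 128
and a CFS+RT utilization sum U = 512, the task utilization is first compressed
into the non-IRQ fraction of the CPU, 512 * (1024 - 128) / 1024 = 448, and the
IRQ time is then added back on top, giving U' = 128 + 448 = 576 out of 1024.
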
@@@ -5796,11 -6108,10 +6107,10 @@@ recheck
  
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
-        * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+        * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
         * SCHED_BATCH and SCHED_IDLE is 0.
         */
-       if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
-           (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+       if (attr->sched_priority > MAX_RT_PRIO-1)
                return -EINVAL;
        if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
            (rt_policy(policy) != (attr->sched_priority != 0)))
@@@ -6667,17 -6978,27 +6977,27 @@@ SYSCALL_DEFINE0(sched_yield
        return 0;
  }
  
- #ifndef CONFIG_PREEMPTION
- int __sched _cond_resched(void)
+ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+ int __sched __cond_resched(void)
  {
        if (should_resched(0)) {
                preempt_schedule_common();
                return 1;
        }
+ #ifndef CONFIG_PREEMPT_RCU
        rcu_all_qs();
+ #endif
        return 0;
  }
- EXPORT_SYMBOL(_cond_resched);
+ EXPORT_SYMBOL(__cond_resched);
+ #endif
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+ EXPORT_STATIC_CALL_TRAMP(cond_resched);
+ DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+ EXPORT_STATIC_CALL_TRAMP(might_resched);
  #endif
  
  /*
@@@ -6868,7 -7189,7 +7188,7 @@@ SYSCALL_DEFINE1(sched_get_priority_max
        switch (policy) {
        case SCHED_FIFO:
        case SCHED_RR:
-               ret = MAX_USER_RT_PRIO-1;
+               ret = MAX_RT_PRIO-1;
                break;
        case SCHED_DEADLINE:
        case SCHED_NORMAL:
@@@ -7508,6 -7829,12 +7828,12 @@@ int sched_cpu_deactivate(unsigned int c
        struct rq_flags rf;
        int ret;
  
+       /*
+        * Remove CPU from nohz.idle_cpus_mask to prevent it from participating
+        * in load balancing while it is not active.
+        */
+       nohz_balance_exit_idle(rq);
        set_cpu_active(cpu, false);
  
        /*
@@@ -7652,7 -7979,6 +7978,6 @@@ int sched_cpu_dying(unsigned int cpu
  
        calc_load_migrate(rq);
        update_max_interval();
-       nohz_balance_exit_idle(rq);
        hrtick_clear(rq);
        return 0;
  }