Merge tag 'sysctl-6.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/sysctl...

author Linus Torvalds <[email protected]>
Sat, 18 May 2024 00:31:24 +0000 (17:31 -0700)
committer Linus Torvalds <[email protected]>
Sat, 18 May 2024 00:31:24 +0000 (17:31 -0700)
diff --combined include/linux/sysctl.h

index 9413241df962c71e4dcdaa61d5dad23aa13ca0a6,47bd28ffa88f47648873f115a63f66188acbf6d6..09db2f2e6488ee9a1051c76ac1a7ec211cb2ad61
--- 1/include/linux/sysctl.h
--- 2/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@@ -137,17 -137,6 +137,6 @@@ struct ctl_table 
         void *data;
         int maxlen;
         umode_t mode;
-       /**
-        * enum type - Enumeration to differentiate between ctl target types
-        * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations
-        * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently
-        *                                       empty directory target to serve
-        *                                       as mount point.
-        */
-       enum {
-               SYSCTL_TABLE_TYPE_DEFAULT,
-               SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY
-       } type;
         proc_handler *proc_handler;     /* Callback for text formatting */
         struct ctl_table_poll *poll;
         void *extra1;
@@@ -182,12 -171,23 +171,23 @@@ struct ctl_table_header 
                 struct rcu_head rcu;
         };
         struct completion *unregistering;
- -      struct ctl_table *ctl_table_arg;
+ +      const struct ctl_table *ctl_table_arg;
         struct ctl_table_root *root;
         struct ctl_table_set *set;
         struct ctl_dir *parent;
         struct ctl_node *node;
         struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */
+       /**
+        * enum type - Enumeration to differentiate between ctl target types
+        * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations
+        * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently
+        *                                       empty directory target to serve
+        *                                       as mount point.
+        */
+       enum {
+               SYSCTL_TABLE_TYPE_DEFAULT,
+               SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY,
+       } type;
   };
   
   struct ctl_dir {
@@@ -205,9 -205,8 +205,8 @@@ struct ctl_table_root 
         struct ctl_table_set default_set;
         struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
         void (*set_ownership)(struct ctl_table_header *head,
-                             struct ctl_table *table,
                               kuid_t *uid, kgid_t *gid);
-       int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
+       int (*permissions)(struct ctl_table_header *head, const struct ctl_table *table);
   };
   
   #define register_sysctl(path, table)  \
diff --combined ipc/ipc_sysctl.c

index 3c3755918d3470c351a719a3e818dba1c4ab1994,19b2a67aef406875359045b45dcf7d508037c9d7..113452038303b412281a6fd76231d6a37b0b4431
--- 1/ipc/ipc_sysctl.c
--- 2/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@@ -192,7 -192,6 +192,6 @@@ static int set_is_seen(struct ctl_table
   }
   
   static void ipc_set_ownership(struct ctl_table_header *head,
-                             struct ctl_table *table,
                               kuid_t *uid, kgid_t *gid)
   {
         struct ipc_namespace *ns =
@@@ -205,7 -204,7 +204,7 @@@
         *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
   }
   
- static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
+ static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table)
   {
         int mode = table->mode;
   
@@@ -224,7 -223,7 +223,7 @@@
                 kuid_t ns_root_uid;
                 kgid_t ns_root_gid;
   
-               ipc_set_ownership(head, table, &ns_root_uid, &ns_root_gid);
+               ipc_set_ownership(head, &ns_root_uid, &ns_root_gid);
   
                 if (uid_eq(current_euid(), ns_root_uid))
                         mode >>= 6;
@@@ -306,7 -305,7 +305,7 @@@ bool setup_ipc_sysctls(struct ipc_names
   
   void retire_ipc_sysctls(struct ipc_namespace *ns)
   {
- -      struct ctl_table *tbl;
+ +      const struct ctl_table *tbl;
   
         tbl = ns->ipc_sysctls->ctl_table_arg;
         unregister_sysctl_table(ns->ipc_sysctls);
diff --combined ipc/mq_sysctl.c

index 69c709262f5adf97eef052cbc81656dd7ce2f641,43c0825da9e8b771ca53993f4bfcec1a258427e5..068e7d5aa42b8bd3cc506a677d13f215fdb89088
--- 1/ipc/mq_sysctl.c
--- 2/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@@ -78,7 -78,6 +78,6 @@@ static int set_is_seen(struct ctl_table
   }
   
   static void mq_set_ownership(struct ctl_table_header *head,
-                            struct ctl_table *table,
                              kuid_t *uid, kgid_t *gid)
   {
         struct ipc_namespace *ns =
@@@ -91,13 -90,13 +90,13 @@@
         *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
   }
   
- static int mq_permissions(struct ctl_table_header *head, struct ctl_table *table)
+ static int mq_permissions(struct ctl_table_header *head, const struct ctl_table *table)
   {
         int mode = table->mode;
         kuid_t ns_root_uid;
         kgid_t ns_root_gid;
   
-       mq_set_ownership(head, table, &ns_root_uid, &ns_root_gid);
+       mq_set_ownership(head, &ns_root_uid, &ns_root_gid);
   
         if (uid_eq(current_euid(), ns_root_uid))
                 mode >>= 6;
@@@ -160,7 -159,7 +159,7 @@@ bool setup_mq_sysctls(struct ipc_namesp
   
   void retire_mq_sysctls(struct ipc_namespace *ns)
   {
- -      struct ctl_table *tbl;
+ +      const struct ctl_table *tbl;
   
         tbl = ns->mq_sysctls->ctl_table_arg;
         unregister_sysctl_table(ns->mq_sysctls);
diff --combined kernel/bpf/syscall.c

index cf6285760aea51d3114e804fe156521caf142582,c7e805087b062bbef7bae93a7549bfe2a852fb0e..2b7d3c96c7ea7ece27a4fc0a917bd658c4ce1a6f
--- 1/kernel/bpf/syscall.c
--- 2/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@@ -559,7 -559,6 +559,7 @@@ void btf_record_free(struct btf_record 
                 case BPF_SPIN_LOCK:
                 case BPF_TIMER:
                 case BPF_REFCOUNT:
+ +              case BPF_WORKQUEUE:
                         /* Nothing to release */
                         break;
                 default:
@@@ -609,7 -608,6 +609,7 @@@ struct btf_record *btf_record_dup(cons
                 case BPF_SPIN_LOCK:
                 case BPF_TIMER:
                 case BPF_REFCOUNT:
+ +              case BPF_WORKQUEUE:
                         /* Nothing to acquire */
                         break;
                 default:
@@@ -661,13 -659,6 +661,13 @@@ void bpf_obj_free_timer(const struct bt
         bpf_timer_cancel_and_free(obj + rec->timer_off);
   }
   
+ +void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
+ +{
+ +      if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
+ +              return;
+ +      bpf_wq_cancel_and_free(obj + rec->wq_off);
+ +}
+ +
   void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
   {
         const struct btf_field *fields;
@@@ -688,9 -679,6 +688,9 @@@
                 case BPF_TIMER:
                         bpf_timer_cancel_and_free(field_ptr);
                         break;
+ +              case BPF_WORKQUEUE:
+ +                      bpf_wq_cancel_and_free(field_ptr);
+ +                      break;
                 case BPF_KPTR_UNREF:
                         WRITE_ONCE(*(u64 *)field_ptr, 0);
                         break;
@@@ -1097,7 -1085,7 +1097,7 @@@ static int map_check_btf(struct bpf_ma
   
         map->record = btf_parse_fields(btf, value_type,
                                        BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- -                                     BPF_RB_ROOT | BPF_REFCOUNT,
+ +                                     BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
                                        map->value_size);
         if (!IS_ERR_OR_NULL(map->record)) {
                 int i;
@@@ -1127,7 -1115,6 +1127,7 @@@
                                 }
                                 break;
                         case BPF_TIMER:
+ +                      case BPF_WORKQUEUE:
                                 if (map->map_type != BPF_MAP_TYPE_HASH &&
                                     map->map_type != BPF_MAP_TYPE_LRU_HASH &&
                                     map->map_type != BPF_MAP_TYPE_ARRAY) {
@@@ -3037,46 -3024,17 +3037,46 @@@ void bpf_link_inc(struct bpf_link *link
         atomic64_inc(&link->refcnt);
   }
   
+ +static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
+ +{
+ +      struct bpf_link *link = container_of(rcu, struct bpf_link, rcu);
+ +
+ +      /* free bpf_link and its containing memory */
+ +      link->ops->dealloc_deferred(link);
+ +}
+ +
+ +static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+ +{
+ +      if (rcu_trace_implies_rcu_gp())
+ +              bpf_link_defer_dealloc_rcu_gp(rcu);
+ +      else
+ +              call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+ +}
+ +
   /* bpf_link_free is guaranteed to be called from process context */
   static void bpf_link_free(struct bpf_link *link)
   {
+ +      bool sleepable = false;
+ +
         bpf_link_free_id(link->id);
         if (link->prog) {
+ +              sleepable = link->prog->sleepable;
                 /* detach BPF program, clean up used resources */
                 link->ops->release(link);
                 bpf_prog_put(link->prog);
         }
- -      /* free bpf_link and its containing memory */
- -      link->ops->dealloc(link);
+ +      if (link->ops->dealloc_deferred) {
+ +              /* schedule BPF link deallocation; if underlying BPF program
+ +               * is sleepable, we need to first wait for RCU tasks trace
+ +               * sync, then go through "classic" RCU grace period
+ +               */
+ +              if (sleepable)
+ +                      call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ +              else
+ +                      call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ +      }
+ +      if (link->ops->dealloc)
+ +              link->ops->dealloc(link);
   }
   
   static void bpf_link_put_deferred(struct work_struct *work)
@@@ -3511,12 -3469,17 +3511,12 @@@ out_put_prog
         return err;
   }
   
- -struct bpf_raw_tp_link {
- -      struct bpf_link link;
- -      struct bpf_raw_event_map *btp;
- -};
- -
   static void bpf_raw_tp_link_release(struct bpf_link *link)
   {
         struct bpf_raw_tp_link *raw_tp =
                 container_of(link, struct bpf_raw_tp_link, link);
   
- -      bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
+ +      bpf_probe_unregister(raw_tp->btp, raw_tp);
         bpf_put_raw_tracepoint(raw_tp->btp);
   }
   
@@@ -3581,7 -3544,7 +3581,7 @@@ static int bpf_raw_tp_link_fill_link_in
   
   static const struct bpf_link_ops bpf_raw_tp_link_lops = {
         .release = bpf_raw_tp_link_release,
- -      .dealloc = bpf_raw_tp_link_dealloc,
+ +      .dealloc_deferred = bpf_raw_tp_link_dealloc,
         .show_fdinfo = bpf_raw_tp_link_show_fdinfo,
         .fill_link_info = bpf_raw_tp_link_fill_link_info,
   };
@@@ -3816,7 -3779,7 +3816,7 @@@ static int bpf_perf_link_attach(const u
   #endif /* CONFIG_PERF_EVENTS */
   
   static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
- -                                const char __user *user_tp_name)
+ +                                const char __user *user_tp_name, u64 cookie)
   {
         struct bpf_link_primer link_primer;
         struct bpf_raw_tp_link *link;
@@@ -3863,7 -3826,6 +3863,7 @@@
         bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
                       &bpf_raw_tp_link_lops, prog);
         link->btp = btp;
+ +      link->cookie = cookie;
   
         err = bpf_link_prime(&link->link, &link_primer);
         if (err) {
@@@ -3871,7 -3833,7 +3871,7 @@@
                 goto out_put_btp;
         }
   
- -      err = bpf_probe_register(link->btp, prog);
+ +      err = bpf_probe_register(link->btp, link);
         if (err) {
                 bpf_link_cleanup(&link_primer);
                 goto out_put_btp;
@@@ -3884,13 -3846,11 +3884,13 @@@ out_put_btp
         return err;
   }
   
- -#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+ +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
   
   static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
   {
         struct bpf_prog *prog;
+ +      void __user *tp_name;
+ +      __u64 cookie;
         int fd;
   
         if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
@@@ -3900,9 -3860,7 +3900,9 @@@
         if (IS_ERR(prog))
                 return PTR_ERR(prog);
   
- -      fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+ +      tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
+ +      cookie = attr->raw_tracepoint.cookie;
+ +      fd = bpf_raw_tp_link_attach(prog, tp_name, cookie);
         if (fd < 0)
                 bpf_prog_put(prog);
         return fd;
@@@ -3998,11 -3956,6 +3998,11 @@@ static int bpf_prog_attach_check_attach
                          * check permissions at attach time.
                          */
                         return -EPERM;
+ +
+ +              ptype = attach_type_to_prog_type(attach_type);
+ +              if (prog->type != ptype)
+ +                      return -EINVAL;
+ +
                 return prog->enforce_expected_attach_type &&
                         prog->expected_attach_type != attach_type ?
                         -EINVAL : 0;
@@@ -4021,15 -3974,11 +4021,15 @@@
                 if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
                     attach_type != BPF_TRACE_KPROBE_MULTI)
                         return -EINVAL;
+ +              if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
+ +                  attach_type != BPF_TRACE_KPROBE_SESSION)
+ +                      return -EINVAL;
                 if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
                     attach_type != BPF_TRACE_UPROBE_MULTI)
                         return -EINVAL;
                 if (attach_type != BPF_PERF_EVENT &&
                     attach_type != BPF_TRACE_KPROBE_MULTI &&
+ +                  attach_type != BPF_TRACE_KPROBE_SESSION &&
                     attach_type != BPF_TRACE_UPROBE_MULTI)
                         return -EINVAL;
                 return 0;
@@@ -5249,7 -5198,7 +5249,7 @@@ static int link_create(union bpf_attr *
                         goto out;
                 }
                 if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
- -                      ret = bpf_raw_tp_link_attach(prog, NULL);
+ +                      ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
                 else if (prog->expected_attach_type == BPF_TRACE_ITER)
                         ret = bpf_iter_link_attach(attr, uattr, prog);
                 else if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@@ -5264,10 -5213,6 +5264,10 @@@
         case BPF_PROG_TYPE_SK_LOOKUP:
                 ret = netns_bpf_link_create(attr, prog);
                 break;
+ +      case BPF_PROG_TYPE_SK_MSG:
+ +      case BPF_PROG_TYPE_SK_SKB:
+ +              ret = sock_map_link_create(attr, prog);
+ +              break;
   #ifdef CONFIG_NET
         case BPF_PROG_TYPE_XDP:
                 ret = bpf_xdp_link_attach(attr, prog);
@@@ -5290,8 -5235,7 +5290,8 @@@
         case BPF_PROG_TYPE_KPROBE:
                 if (attr->link_create.attach_type == BPF_PERF_EVENT)
                         ret = bpf_perf_link_attach(attr, prog);
- -              else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
+ +              else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
+ +                       attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
                         ret = bpf_kprobe_multi_link_attach(attr, prog);
                 else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
                         ret = bpf_uprobe_multi_link_attach(attr, prog);
@@@ -6035,7 -5979,6 +6035,6 @@@ static struct ctl_table bpf_syscall_tab
                 .mode           = 0644,
                 .proc_handler   = bpf_stats_handler,
         },
-       { }
   };
   
   static int __init bpf_syscall_sysctl_init(void)
diff --combined kernel/kprobes.c

index ca2c6cbd42d219dd2d2b7d5e40b7bed12c32707c,85af0e05a38f08a9776f97b50b30382d6b552a99..48b49925f7ffa7236b142597b72176f3fe8e0eb5
--- 1/kernel/kprobes.c
--- 2/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@@ -26,6 -26,7 +26,6 @@@
   #include <linux/slab.h>
   #include <linux/stddef.h>
   #include <linux/export.h>
- -#include <linux/moduleloader.h>
   #include <linux/kallsyms.h>
   #include <linux/freezer.h>
   #include <linux/seq_file.h>
@@@ -38,7 -39,6 +38,7 @@@
   #include <linux/jump_label.h>
   #include <linux/static_call.h>
   #include <linux/perf_event.h>
+ +#include <linux/execmem.h>
   
   #include <asm/sections.h>
   #include <asm/cacheflush.h>
@@@ -113,17 -113,17 +113,17 @@@ enum kprobe_slot_state 
   void __weak *alloc_insn_page(void)
   {
         /*
- -       * Use module_alloc() so this page is within +/- 2GB of where the
+ +       * Use execmem_alloc() so this page is within +/- 2GB of where the
          * kernel image and loaded module images reside. This is required
          * for most of the architectures.
          * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
          */
- -      return module_alloc(PAGE_SIZE);
+ +      return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
   }
   
   static void free_insn_page(void *page)
   {
- -      module_memfree(page);
+ +      execmem_free(page);
   }
   
   struct kprobe_insn_cache kprobe_insn_slots = {
@@@ -968,7 -968,6 +968,6 @@@ static struct ctl_table kprobe_sysctls[
                 .extra1         = SYSCTL_ZERO,
                 .extra2         = SYSCTL_ONE,
         },
-       {}
   };
   
   static void __init kprobe_sysctls_init(void)
@@@ -1567,17 -1566,10 +1566,17 @@@ static int check_kprobe_address_safe(st
         jump_label_lock();
         preempt_disable();
   
- -      /* Ensure it is not in reserved area nor out of text */
- -      if (!(core_kernel_text((unsigned long) p->addr) ||
- -          is_module_text_address((unsigned long) p->addr)) ||
- -          in_gate_area_no_mm((unsigned long) p->addr) ||
+ +      /* Ensure the address is in a text area, and find a module if exists. */
+ +      *probed_mod = NULL;
+ +      if (!core_kernel_text((unsigned long) p->addr)) {
+ +              *probed_mod = __module_text_address((unsigned long) p->addr);
+ +              if (!(*probed_mod)) {
+ +                      ret = -EINVAL;
+ +                      goto out;
+ +              }
+ +      }
+ +      /* Ensure it is not in reserved area. */
+ +      if (in_gate_area_no_mm((unsigned long) p->addr) ||
             within_kprobe_blacklist((unsigned long) p->addr) ||
             jump_label_text_reserved(p->addr, p->addr) ||
             static_call_text_reserved(p->addr, p->addr) ||
@@@ -1587,8 -1579,9 +1586,8 @@@
                 goto out;
         }
   
- -      /* Check if 'p' is probing a module. */
- -      *probed_mod = __module_text_address((unsigned long) p->addr);
- -      if (*probed_mod) {
+ +      /* Get module refcount and reject __init functions for loaded modules. */
+ +      if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) {
                 /*
                  * We must hold a refcount of the probed module while updating
                  * its code to prohibit unexpected unloading.
@@@ -1603,13 -1596,12 +1602,13 @@@
                  * kprobes in there.
                  */
                 if (within_module_init((unsigned long)p->addr, *probed_mod) &&
- -                  (*probed_mod)->state != MODULE_STATE_COMING) {
+ +                  !module_is_coming(*probed_mod)) {
                         module_put(*probed_mod);
                         *probed_mod = NULL;
                         ret = -ENOENT;
                 }
         }
+ +
   out:
         preempt_enable();
         jump_label_unlock();
@@@ -2489,6 -2481,24 +2488,6 @@@ int kprobe_add_area_blacklist(unsigned 
         return 0;
   }
   
- -/* Remove all symbols in given area from kprobe blacklist */
- -static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
- -{
- -      struct kprobe_blacklist_entry *ent, *n;
- -
- -      list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
- -              if (ent->start_addr < start || ent->start_addr >= end)
- -                      continue;
- -              list_del(&ent->list);
- -              kfree(ent);
- -      }
- -}
- -
- -static void kprobe_remove_ksym_blacklist(unsigned long entry)
- -{
- -      kprobe_remove_area_blacklist(entry, entry + 1);
- -}
- -
   int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
                                    char *type, char *sym)
   {
@@@ -2553,25 -2563,6 +2552,25 @@@ static int __init populate_kprobe_black
         return ret ? : arch_populate_kprobe_blacklist();
   }
   
+ +#ifdef CONFIG_MODULES
+ +/* Remove all symbols in given area from kprobe blacklist */
+ +static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
+ +{
+ +      struct kprobe_blacklist_entry *ent, *n;
+ +
+ +      list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
+ +              if (ent->start_addr < start || ent->start_addr >= end)
+ +                      continue;
+ +              list_del(&ent->list);
+ +              kfree(ent);
+ +      }
+ +}
+ +
+ +static void kprobe_remove_ksym_blacklist(unsigned long entry)
+ +{
+ +      kprobe_remove_area_blacklist(entry, entry + 1);
+ +}
+ +
   static void add_module_kprobe_blacklist(struct module *mod)
   {
         unsigned long start, end;
@@@ -2674,17 -2665,6 +2673,17 @@@ static struct notifier_block kprobe_mod
         .priority = 0
   };
   
+ +static int kprobe_register_module_notifier(void)
+ +{
+ +      return register_module_notifier(&kprobe_module_nb);
+ +}
+ +#else
+ +static int kprobe_register_module_notifier(void)
+ +{
+ +      return 0;
+ +}
+ +#endif /* CONFIG_MODULES */
+ +
   void kprobe_free_init_mem(void)
   {
         void *start = (void *)(&__init_begin);
@@@ -2744,7 -2724,7 +2743,7 @@@ static int __init init_kprobes(void
         if (!err)
                 err = register_die_notifier(&kprobe_exceptions_nb);
         if (!err)
- -              err = register_module_notifier(&kprobe_module_nb);
+ +              err = kprobe_register_module_notifier();
   
         kprobes_initialized = (err == 0);
         kprobe_sysctls_init();
diff --combined kernel/sched/core.c

index 1a914388144aebbb6ed0e2a41835f73ae4510475,7ce76620a308a75ad41ea96e459106dd50b00f85..373eaeaf63b8bdf9835916f93db34c78c1bfd09f
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -108,7 -108,7 +108,7 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp
   EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
   EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
   EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
- -EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+ +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
   EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
   EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
   EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@@ -4741,7 -4741,6 +4741,6 @@@ static struct ctl_table sched_core_sysc
                 .extra2         = SYSCTL_FOUR,
         },
   #endif /* CONFIG_NUMA_BALANCING */
-       {}
   };
   static int __init sched_core_sysctl_init(void)
   {
@@@ -5662,13 -5661,13 +5661,13 @@@ static inline u64 cpu_resched_latency(s
    * This function gets called by the timer code, with HZ frequency.
    * We call it with interrupts disabled.
    */
- -void scheduler_tick(void)
+ +void sched_tick(void)
   {
         int cpu = smp_processor_id();
         struct rq *rq = cpu_rq(cpu);
         struct task_struct *curr = rq->curr;
         struct rq_flags rf;
- -      unsigned long thermal_pressure;
+ +      unsigned long hw_pressure;
         u64 resched_latency;
   
         if (housekeeping_cpu(cpu, HK_TYPE_TICK))
@@@ -5679,8 -5678,8 +5678,8 @@@
         rq_lock(rq, &rf);
   
         update_rq_clock(rq);
- -      thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
- -      update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+ +      hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ +      update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
         curr->sched_class->task_tick(rq, curr, 0);
         if (sched_feat(LATENCY_WARN))
                 resched_latency = cpu_resched_latency(rq);
@@@ -5700,7 -5699,7 +5699,7 @@@
   
   #ifdef CONFIG_SMP
         rq->idle_balance = idle_cpu(cpu);
- -      trigger_load_balance(rq);
+ +      sched_balance_trigger(rq);
   #endif
   }
   
@@@ -6585,7 -6584,7 +6584,7 @@@ pick_next_task(struct rq *rq, struct ta
    *      paths. For example, see arch/x86/entry_64.S.
    *
    *      To drive preemption between tasks, the scheduler sets the flag in timer
- - *      interrupt handler scheduler_tick().
+ + *      interrupt handler sched_tick().
    *
    *   3. Wakeups don't really cause entry into schedule(). They add a
    *      task to the run-queue and that's it.
diff --combined kernel/sched/fair.c

index 146ecf9cc3afe709b23f11c27466ae5608f8dc7a,4ac2cf7a918ebbf0b9cc4ca0d70e8df00796b36c..4214df32ba45342f9e90e3250194b9c5dc26a4c9
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -78,9 -78,15 +78,9 @@@ static unsigned int normalized_sysctl_s
   
   const_debug unsigned int sysctl_sched_migration_cost  = 500000UL;
   
- -int sched_thermal_decay_shift;
   static int __init setup_sched_thermal_decay_shift(char *str)
   {
- -      int _shift = 0;
- -
- -      if (kstrtoint(str, 0, &_shift))
- -              pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
- -
- -      sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ +      pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
         return 1;
   }
   __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@@ -151,7 -157,6 +151,6 @@@ static struct ctl_table sched_fair_sysc
                 .extra1         = SYSCTL_ZERO,
         },
   #endif /* CONFIG_NUMA_BALANCING */
-       {}
   };
   
   static int __init sched_fair_sysctl_init(void)
@@@ -382,8 -387,8 +381,8 @@@ static inline void list_del_leaf_cfs_rq
   
                 /*
                  * With cfs_rq being unthrottled/throttled during an enqueue,
- -               * it can happen the tmp_alone_branch points the a leaf that
- -               * we finally want to del. In this case, tmp_alone_branch moves
+ +               * it can happen the tmp_alone_branch points to the leaf that
+ +               * we finally want to delete. In this case, tmp_alone_branch moves
                  * to the prev element but it will point to rq->leaf_cfs_rq_list
                  * at the end of the enqueue.
                  */
@@@ -400,7 -405,7 +399,7 @@@ static inline void assert_list_leaf_cfs
         SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
   }
   
- -/* Iterate thr' all leaf cfs_rq's on a runqueue */
+ +/* Iterate through all leaf cfs_rq's on a runqueue */
   #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                    \
         list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
                                  leaf_cfs_rq_list)
@@@ -589,13 -594,13 +588,13 @@@ static inline s64 entity_key(struct cfs
    *
    * [[ NOTE: this is only equal to the ideal scheduler under the condition
    *          that join/leave operations happen at lag_i = 0, otherwise the
- - *          virtual time has non-continguous motion equivalent to:
+ + *          virtual time has non-contiguous motion equivalent to:
    *
    *          V +-= lag_i / W
    *
    *        Also see the comment in place_entity() that deals with this. ]]
    *
- - * However, since v_i is u64, and the multiplcation could easily overflow
+ + * However, since v_i is u64, and the multiplication could easily overflow
    * transform it into a relative form that uses smaller quantities:
    *
    * Substitute: v_i == (v_i - v0) + v0
@@@ -665,7 -670,7 +664,7 @@@ u64 avg_vruntime(struct cfs_rq *cfs_rq
         }
   
         if (load) {
- -              /* sign flips effective floor / ceil */
+ +              /* sign flips effective floor / ceiling */
                 if (avg < 0)
                         avg -= (load - 1);
                 avg = div_s64(avg, load);
@@@ -690,21 -695,15 +689,21 @@@
    *
    * XXX could add max_slice to the augmented data to track this.
    */
- -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +static s64 entity_lag(u64 avruntime, struct sched_entity *se)
   {
- -      s64 lag, limit;
+ +      s64 vlag, limit;
   
+ +      vlag = avruntime - se->vruntime;
+ +      limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ +
+ +      return clamp(vlag, -limit, limit);
+ +}
+ +
+ +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ +{
         SCHED_WARN_ON(!se->on_rq);
- -      lag = avg_vruntime(cfs_rq) - se->vruntime;
   
- -      limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
- -      se->vlag = clamp(lag, -limit, limit);
+ +      se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
   }
   
   /*
@@@ -721,7 -720,7 +720,7 @@@
    *
    * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
    *
- - * Note: using 'avg_vruntime() > se->vruntime' is inacurate due
+ + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
    *       to the loss in precision caused by the division.
    */
   static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
@@@ -1024,7 -1023,7 +1023,7 @@@ void init_entity_runnable_average(struc
         if (entity_is_task(se))
                 sa->load_avg = scale_load_down(se->load.weight);
   
- -      /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+ +      /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
   }
   
   /*
@@@ -1616,7 -1615,7 +1615,7 @@@ static unsigned long score_nearby_nodes
         max_dist = READ_ONCE(sched_max_numa_distance);
         /*
          * This code is called for each node, introducing N^2 complexity,
- -       * which should be ok given the number of nodes rarely exceeds 8.
+ +       * which should be OK given the number of nodes rarely exceeds 8.
          */
         for_each_online_node(node) {
                 unsigned long faults;
@@@ -3290,7 -3289,7 +3289,7 @@@ retry_pids
                 /*
                  * Shared library pages mapped by multiple processes are not
                  * migrated as it is expected they are cache replicated. Avoid
- -               * hinting faults in read-only file-backed mappings or the vdso
+ +               * hinting faults in read-only file-backed mappings or the vDSO
                  * as migrating the pages will be of marginal benefit.
                  */
                 if (!vma->vm_mm ||
@@@ -3301,7 -3300,7 +3300,7 @@@
   
                 /*
                  * Skip inaccessible VMAs to avoid any confusion between
- -               * PROT_NONE and NUMA hinting ptes
+ +               * PROT_NONE and NUMA hinting PTEs
                  */
                 if (!vma_is_accessible(vma)) {
                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
@@@ -3333,7 -3332,7 +3332,7 @@@
                 }
   
                 /*
- -               * Scanning the VMA's of short lived tasks add more overhead. So
+ +               * Scanning the VMAs of short lived tasks add more overhead. So
                  * delay the scan for new VMAs.
                  */
                 if (mm->numa_scan_seq && time_before(jiffies,
@@@ -3377,7 -3376,7 +3376,7 @@@
                         /*
                          * Try to scan sysctl_numa_balancing_size worth of
                          * hpages that have at least one present PTE that
- -                       * is not already pte-numa. If the VMA contains
+ +                       * is not already PTE-numa. If the VMA contains
                          * areas that are unused or already full of prot_numa
                          * PTEs, scan up to virtpages, to skip through those
                          * areas faster.
@@@ -3676,15 -3675,16 +3675,15 @@@ static inline voi
   dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
   #endif
   
- -static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ +static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
                            unsigned long weight)
   {
         unsigned long old_weight = se->load.weight;
- -      u64 avruntime = avg_vruntime(cfs_rq);
         s64 vlag, vslice;
   
         /*
          * VRUNTIME
- -       * ========
+ +       * --------
          *
          * COROLLARY #1: The virtual runtime of the entity needs to be
          * adjusted if re-weight at !0-lag point.
@@@ -3760,14 -3760,14 +3759,14 @@@
          *         = V  - vl'
          */
         if (avruntime != se->vruntime) {
- -              vlag = (s64)(avruntime - se->vruntime);
+ +              vlag = entity_lag(avruntime, se);
                 vlag = div_s64(vlag * old_weight, weight);
                 se->vruntime = avruntime - vlag;
         }
   
         /*
          * DEADLINE
- -       * ========
+ +       * --------
          *
          * When the weight changes, the virtual time slope changes and
          * we should adjust the relative virtual deadline accordingly.
@@@ -3786,26 -3786,25 +3785,26 @@@ static void reweight_entity(struct cfs_
                             unsigned long weight)
   {
         bool curr = cfs_rq->curr == se;
+ +      u64 avruntime;
   
         if (se->on_rq) {
                 /* commit outstanding execution time */
- -              if (curr)
- -                      update_curr(cfs_rq);
- -              else
+ +              update_curr(cfs_rq);
+ +              avruntime = avg_vruntime(cfs_rq);
+ +              if (!curr)
                         __dequeue_entity(cfs_rq, se);
                 update_load_sub(&cfs_rq->load, se->load.weight);
         }
         dequeue_load_avg(cfs_rq, se);
   
- -      if (!se->on_rq) {
+ +      if (se->on_rq) {
+ +              reweight_eevdf(se, avruntime, weight);
+ +      } else {
                 /*
                  * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
                  * we need to scale se->vlag when w_i changes.
                  */
                 se->vlag = div_s64(se->vlag * se->load.weight, weight);
- -      } else {
- -              reweight_eevdf(cfs_rq, se, weight);
         }
   
         update_load_set(&se->load, weight);
@@@ -4739,7 -4738,7 +4738,7 @@@ static inline void update_load_avg(stru
   
         /*
          * Track task load average for carrying it to new CPU after migrated, and
- -       * track group sched_entity load average for task_h_load calc in migration
+ +       * track group sched_entity load average for task_h_load calculation in migration
          */
         if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
                 __update_load_avg_se(now, cfs_rq, se);
@@@ -4822,7 -4821,7 +4821,7 @@@ static inline unsigned long cfs_rq_load
         return cfs_rq->avg.load_avg;
   }
   
- -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+ +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
   
   static inline unsigned long task_util(struct task_struct *p)
   {
@@@ -4965,22 -4964,13 +4964,22 @@@ done
         trace_sched_util_est_se_tp(&p->se);
   }
   
+ +static inline unsigned long get_actual_cpu_capacity(int cpu)
+ +{
+ +      unsigned long capacity = arch_scale_cpu_capacity(cpu);
+ +
+ +      capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+ +
+ +      return capacity;
+ +}
+ +
   static inline int util_fits_cpu(unsigned long util,
                                 unsigned long uclamp_min,
                                 unsigned long uclamp_max,
                                 int cpu)
   {
- -      unsigned long capacity_orig, capacity_orig_thermal;
         unsigned long capacity = capacity_of(cpu);
+ +      unsigned long capacity_orig;
         bool fits, uclamp_max_fits;
   
         /*
@@@ -5002,7 -4992,7 +5001,7 @@@
          * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
          * should fit a little cpu even if there's some pressure.
          *
- -       * Only exception is for thermal pressure since it has a direct impact
+ +       * Only exception is for HW or cpufreq pressure since it has a direct impact
          * on available OPP of the system.
          *
          * We honour it for uclamp_min only as a drop in performance level
@@@ -5012,6 -5002,7 +5011,6 @@@
          * goal is to cap the task. So it's okay if it's getting less.
          */
         capacity_orig = arch_scale_cpu_capacity(cpu);
- -      capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
   
         /*
          * We want to force a task to fit a cpu as implied by uclamp_max.
@@@ -5028,14 -5019,14 +5027,14 @@@
          *   |     |   |       |   |      |   |
          *   |     |   |       |   |      |   |
          *   +----------------------------------------
- -       *         cpu0        cpu1       cpu2
+ +       *         CPU0        CPU1       CPU2
          *
          *   In the above example if a task is capped to a specific performance
          *   point, y, then when:
          *
- -       *   * util = 80% of x then it does not fit on cpu0 and should migrate
- -       *     to cpu1
- -       *   * util = 80% of y then it is forced to fit on cpu1 to honour
+ +       *   * util = 80% of x then it does not fit on CPU0 and should migrate
+ +       *     to CPU1
+ +       *   * util = 80% of y then it is forced to fit on CPU1 to honour
          *     uclamp_max request.
          *
          *   which is what we're enforcing here. A task always fits if
@@@ -5066,7 -5057,7 +5065,7 @@@
          *   |     |   |       |   |      |   |
          *   |     |   |       |   |      |   |      (region c, boosted, util < uclamp_min)
          *   +----------------------------------------
- -       *         cpu0        cpu1       cpu2
+ +       *         CPU0        CPU1       CPU2
          *
          * a) If util > uclamp_max, then we're capped, we don't care about
          *    actual fitness value here. We only care if uclamp_max fits
@@@ -5086,8 -5077,7 +5085,8 @@@
          * handle the case uclamp_min > uclamp_max.
          */
         uclamp_min = min(uclamp_min, uclamp_max);
- -      if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ +      if (fits && (util < uclamp_min) &&
+ +          (uclamp_min > get_actual_cpu_capacity(cpu)))
                 return -1;
   
         return fits;
@@@ -5107,19 -5097,15 +5106,19 @@@ static inline int task_fits_cpu(struct 
   
   static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
   {
+ +      int cpu = cpu_of(rq);
+ +
         if (!sched_asym_cpucap_active())
                 return;
   
- -      if (!p || p->nr_cpus_allowed == 1) {
- -              rq->misfit_task_load = 0;
- -              return;
- -      }
+ +      /*
+ +       * Affinity allows us to go somewhere higher?  Or are we on biggest
+ +       * available CPU already? Or do we fit into this CPU ?
+ +       */
+ +      if (!p || (p->nr_cpus_allowed == 1) ||
+ +          (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ +          task_fits_cpu(p, cpu)) {
   
- -      if (task_fits_cpu(p, cpu_of(rq))) {
                 rq->misfit_task_load = 0;
                 return;
         }
@@@ -5155,7 -5141,7 +5154,7 @@@ attach_entity_load_avg(struct cfs_rq *c
   static inline void
   detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
   
- -static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
+ +static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
   {
         return 0;
   }
@@@ -5261,7 -5247,7 +5260,7 @@@ place_entity(struct cfs_rq *cfs_rq, str
         se->vruntime = vruntime - lag;
   
         /*
- -       * When joining the competition; the exisiting tasks will be,
+ +       * When joining the competition; the existing tasks will be,
          * on average, halfway through their slice, as such start tasks
          * off with half a slice to ease into the competition.
          */
@@@ -5410,7 -5396,7 +5409,7 @@@ dequeue_entity(struct cfs_rq *cfs_rq, s
          * Now advance min_vruntime if @se was the entity holding it back,
          * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
          * put back on, and if we advance min_vruntime, we'll be placed back
- -       * further than we started -- ie. we'll be penalized.
+ +       * further than we started -- i.e. we'll be penalized.
          */
         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
                 update_min_vruntime(cfs_rq);
@@@ -5446,7 -5432,7 +5445,7 @@@ set_next_entity(struct cfs_rq *cfs_rq, 
   
         /*
          * Track our maximum slice length, if the CPU's load is at
- -       * least twice that of our own weight (i.e. dont track it
+ +       * least twice that of our own weight (i.e. don't track it
          * when there are only lesser-weight tasks around):
          */
         if (schedstat_enabled() &&
@@@ -6682,47 -6668,22 +6681,47 @@@ static inline void hrtick_update(struc
   #ifdef CONFIG_SMP
   static inline bool cpu_overutilized(int cpu)
   {
- -      unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
- -      unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ +      unsigned long  rq_util_min, rq_util_max;
+ +
+ +      if (!sched_energy_enabled())
+ +              return false;
+ +
+ +      rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ +      rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
   
         /* Return true only if the utilization doesn't fit CPU's capacity */
         return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
   }
   
- -static inline void update_overutilized_status(struct rq *rq)
+ +/*
+ + * overutilized value make sense only if EAS is enabled
+ + */
+ +static inline bool is_rd_overutilized(struct root_domain *rd)
   {
- -      if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
- -              WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
- -              trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
- -      }
+ +      return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
+ +}
+ +
+ +static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
+ +{
+ +      if (!sched_energy_enabled())
+ +              return;
+ +
+ +      WRITE_ONCE(rd->overutilized, flag);
+ +      trace_sched_overutilized_tp(rd, flag);
+ +}
+ +
+ +static inline void check_update_overutilized_status(struct rq *rq)
+ +{
+ +      /*
+ +       * overutilized field is used for load balancing decisions only
+ +       * if energy aware scheduler is being used
+ +       */
+ +
+ +      if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
+ +              set_rd_overutilized(rq->rd, 1);
   }
   #else
- -static inline void update_overutilized_status(struct rq *rq) { }
+ +static inline void check_update_overutilized_status(struct rq *rq) { }
   #endif
   
   /* Runqueue only has SCHED_IDLE tasks enqueued */
@@@ -6823,7 -6784,7 +6822,7 @@@ enqueue_task_fair(struct rq *rq, struc
          * and the following generally works well enough in practice.
          */
         if (!task_new)
- -              update_overutilized_status(rq);
+ +              check_update_overutilized_status(rq);
   
   enqueue_throttle:
         assert_list_leaf_cfs_rq(rq);
@@@ -6910,7 -6871,7 +6909,7 @@@ dequeue_throttle
   
   #ifdef CONFIG_SMP
   
- -/* Working cpumask for: load_balance, load_balance_newidle. */
+ +/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
   static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
   static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
   static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
@@@ -7142,13 -7103,13 +7141,13 @@@ static int wake_affine(struct sched_dom
   }
   
   static struct sched_group *
- -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
+ +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
   
   /*
- - * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
+ + * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
    */
   static int
- -find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+ +sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
   {
         unsigned long load, min_load = ULONG_MAX;
         unsigned int min_exit_latency = UINT_MAX;
@@@ -7204,7 -7165,7 +7203,7 @@@
         return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
   }
   
- -static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ +static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
                                   int cpu, int prev_cpu, int sd_flag)
   {
         int new_cpu = cpu;
@@@ -7229,13 -7190,13 +7228,13 @@@
                         continue;
                 }
   
- -              group = find_idlest_group(sd, p, cpu);
+ +              group = sched_balance_find_dst_group(sd, p, cpu);
                 if (!group) {
                         sd = sd->child;
                         continue;
                 }
   
- -              new_cpu = find_idlest_group_cpu(group, p, cpu);
+ +              new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
                 if (new_cpu == cpu) {
                         /* Now try balancing at a lower domain level of 'cpu': */
                         sd = sd->child;
@@@ -7503,7 -7464,7 +7502,7 @@@ select_idle_capacity(struct task_struc
                  * Look for the CPU with best capacity.
                  */
                 else if (fits < 0)
- -                      cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
+ +                      cpu_cap = get_actual_cpu_capacity(cpu);
   
                 /*
                  * First, select CPU which fits better (-1 being better than 0).
@@@ -7547,7 -7508,7 +7546,7 @@@ static int select_idle_sibling(struct t
   
         /*
          * On asymmetric system, update task utilization because we will check
- -       * that the task fits with cpu's capacity.
+ +       * that the task fits with CPU's capacity.
          */
         if (sched_asym_cpucap_active()) {
                 sync_entity_load_avg(&p->se);
@@@ -7980,7 -7941,7 +7979,7 @@@ compute_energy(struct energy_env *eenv
    * NOTE: Forkees are not accepted in the energy-aware wake-up path because
    * they don't have any useful utilization data yet and it's not possible to
    * forecast their impact on energy consumption. Consequently, they will be
- - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ + * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
    * to be energy-inefficient in some use-cases. The alternative would be to
    * bias new tasks towards specific types of CPUs first, or to try to infer
    * their util_avg from the parent task, but those heuristics could hurt
@@@ -7996,15 -7957,15 +7995,15 @@@ static int find_energy_efficient_cpu(st
         struct root_domain *rd = this_rq()->rd;
         int cpu, best_energy_cpu, target = -1;
         int prev_fits = -1, best_fits = -1;
- -      unsigned long best_thermal_cap = 0;
- -      unsigned long prev_thermal_cap = 0;
+ +      unsigned long best_actual_cap = 0;
+ +      unsigned long prev_actual_cap = 0;
         struct sched_domain *sd;
         struct perf_domain *pd;
         struct energy_env eenv;
   
         rcu_read_lock();
         pd = rcu_dereference(rd->pd);
- -      if (!pd || READ_ONCE(rd->overutilized))
+ +      if (!pd)
                 goto unlock;
   
         /*
@@@ -8027,7 -7988,7 +8026,7 @@@
   
         for (; pd; pd = pd->next) {
                 unsigned long util_min = p_util_min, util_max = p_util_max;
- -              unsigned long cpu_cap, cpu_thermal_cap, util;
+ +              unsigned long cpu_cap, cpu_actual_cap, util;
                 long prev_spare_cap = -1, max_spare_cap = -1;
                 unsigned long rq_util_min, rq_util_max;
                 unsigned long cur_delta, base_energy;
@@@ -8039,17 -8000,18 +8038,17 @@@
                 if (cpumask_empty(cpus))
                         continue;
   
- -              /* Account thermal pressure for the energy estimation */
+ +              /* Account external pressure for the energy estimation */
                 cpu = cpumask_first(cpus);
- -              cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
- -              cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+ +              cpu_actual_cap = get_actual_cpu_capacity(cpu);
   
- -              eenv.cpu_cap = cpu_thermal_cap;
+ +              eenv.cpu_cap = cpu_actual_cap;
                 eenv.pd_cap = 0;
   
                 for_each_cpu(cpu, cpus) {
                         struct rq *rq = cpu_rq(cpu);
   
- -                      eenv.pd_cap += cpu_thermal_cap;
+ +                      eenv.pd_cap += cpu_actual_cap;
   
                         if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
                                 continue;
@@@ -8070,7 -8032,7 +8069,7 @@@
                         if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
                                 /*
                                  * Open code uclamp_rq_util_with() except for
- -                               * the clamp() part. Ie: apply max aggregation
+ +                               * the clamp() part. I.e.: apply max aggregation
                                  * only. util_fits_cpu() logic requires to
                                  * operate on non clamped util but must use the
                                  * max-aggregated uclamp_{min, max}.
@@@ -8120,7 -8082,7 +8119,7 @@@
                         if (prev_delta < base_energy)
                                 goto unlock;
                         prev_delta -= base_energy;
- -                      prev_thermal_cap = cpu_thermal_cap;
+ +                      prev_actual_cap = cpu_actual_cap;
                         best_delta = min(best_delta, prev_delta);
                 }
   
@@@ -8135,7 -8097,7 +8134,7 @@@
                          * but best energy cpu has better capacity.
                          */
                         if ((max_fits < 0) &&
- -                          (cpu_thermal_cap <= best_thermal_cap))
+ +                          (cpu_actual_cap <= best_actual_cap))
                                 continue;
   
                         cur_delta = compute_energy(&eenv, pd, cpus, p,
@@@ -8156,14 -8118,14 +8155,14 @@@
                         best_delta = cur_delta;
                         best_energy_cpu = max_spare_cap_cpu;
                         best_fits = max_fits;
- -                      best_thermal_cap = cpu_thermal_cap;
+ +                      best_actual_cap = cpu_actual_cap;
                 }
         }
         rcu_read_unlock();
   
         if ((best_fits > prev_fits) ||
             ((best_fits > 0) && (best_delta < prev_delta)) ||
- -          ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ +          ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
                 target = best_energy_cpu;
   
         return target;
@@@ -8206,7 -8168,7 +8205,7 @@@ select_task_rq_fair(struct task_struct 
                     cpumask_test_cpu(cpu, p->cpus_ptr))
                         return cpu;
   
- -              if (sched_energy_enabled()) {
+ +              if (!is_rd_overutilized(this_rq()->rd)) {
                         new_cpu = find_energy_efficient_cpu(p, prev_cpu);
                         if (new_cpu >= 0)
                                 return new_cpu;
@@@ -8244,7 -8206,7 +8243,7 @@@
   
         if (unlikely(sd)) {
                 /* Slow path */
- -              new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+ +              new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
         } else if (wake_flags & WF_TTWU) { /* XXX always ? */
                 /* Fast path */
                 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
@@@ -8290,46 -8252,14 +8289,46 @@@ static void task_dead_fair(struct task_
         remove_entity_load_avg(&p->se);
   }
   
+ +/*
+ + * Set the max capacity the task is allowed to run at for misfit detection.
+ + */
+ +static void set_task_max_allowed_capacity(struct task_struct *p)
+ +{
+ +      struct asym_cap_data *entry;
+ +
+ +      if (!sched_asym_cpucap_active())
+ +              return;
+ +
+ +      rcu_read_lock();
+ +      list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ +              cpumask_t *cpumask;
+ +
+ +              cpumask = cpu_capacity_span(entry);
+ +              if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ +                      continue;
+ +
+ +              p->max_allowed_capacity = entry->capacity;
+ +              break;
+ +      }
+ +      rcu_read_unlock();
+ +}
+ +
+ +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+ +{
+ +      set_cpus_allowed_common(p, ctx);
+ +      set_task_max_allowed_capacity(p);
+ +}
+ +
   static int
   balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   {
         if (rq->nr_running)
                 return 1;
   
- -      return newidle_balance(rq, rf) != 0;
+ +      return sched_balance_newidle(rq, rf) != 0;
   }
+ +#else
+ +static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
   #endif /* CONFIG_SMP */
   
   static void set_next_buddy(struct sched_entity *se)
@@@ -8580,10 -8510,10 +8579,10 @@@ idle
         if (!rf)
                 return NULL;
   
- -      new_tasks = newidle_balance(rq, rf);
+ +      new_tasks = sched_balance_newidle(rq, rf);
   
         /*
- -       * Because newidle_balance() releases (and re-acquires) rq->lock, it is
+ +       * Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
          * possible for any higher priority task to appear. In that case we
          * must re-start the pick_next_entity() loop.
          */
@@@ -8661,7 -8591,7 +8660,7 @@@ static bool yield_to_task_fair(struct r
         if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
                 return false;
   
- -      /* Tell the scheduler that we'd really like pse to run next. */
+ +      /* Tell the scheduler that we'd really like se to run next. */
         set_next_buddy(se);
   
         yield_task_fair(rq);
@@@ -8999,7 -8929,7 +8998,7 @@@ int can_migrate_task(struct task_struc
         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
                 return 0;
   
- -      /* Disregard pcpu kthreads; they are where they need to be. */
+ +      /* Disregard percpu kthreads; they are where they need to be. */
         if (kthread_is_per_cpu(p))
                 return 0;
   
@@@ -9145,7 -9075,7 +9144,7 @@@ static int detach_tasks(struct lb_env *
                  * We don't want to steal all, otherwise we may be treated likewise,
                  * which could at worst lead to a livelock crash.
                  */
- -              if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ +              if (env->idle && env->src_rq->nr_running <= 1)
                         break;
   
                 env->loop++;
@@@ -9324,7 -9254,7 +9323,7 @@@ static inline bool others_have_blocked(
         if (cpu_util_dl(rq))
                 return true;
   
- -      if (thermal_load_avg(rq))
+ +      if (hw_load_avg(rq))
                 return true;
   
         if (cpu_util_irq(rq))
@@@ -9354,7 -9284,7 +9353,7 @@@ static bool __update_blocked_others(str
   {
         const struct sched_class *curr_class;
         u64 now = rq_clock_pelt(rq);
- -      unsigned long thermal_pressure;
+ +      unsigned long hw_pressure;
         bool decayed;
   
         /*
@@@ -9363,11 -9293,11 +9362,11 @@@
          */
         curr_class = rq->curr->sched_class;
   
- -      thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ +      hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
   
         decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
                   update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- -                update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
+ +                update_hw_load_avg(now, rq, hw_pressure) |
                   update_irq_load_avg(rq, 0);
   
         if (others_have_blocked(rq))
@@@ -9486,7 -9416,7 +9485,7 @@@ static unsigned long task_h_load(struc
   }
   #endif
   
- -static void update_blocked_averages(int cpu)
+ +static void sched_balance_update_blocked_averages(int cpu)
   {
         bool decayed = false, done = true;
         struct rq *rq = cpu_rq(cpu);
@@@ -9505,25 -9435,25 +9504,25 @@@
         rq_unlock_irqrestore(rq, &rf);
   }
   
- -/********** Helpers for find_busiest_group ************************/
+ +/********** Helpers for sched_balance_find_src_group ************************/
   
   /*
- - * sg_lb_stats - stats of a sched_group required for load_balancing
+ + * sg_lb_stats - stats of a sched_group required for load-balancing:
    */
   struct sg_lb_stats {
- -      unsigned long avg_load; /*Avg load across the CPUs of the group */
- -      unsigned long group_load; /* Total load over the CPUs of the group */
- -      unsigned long group_capacity;
- -      unsigned long group_util; /* Total utilization over the CPUs of the group */
- -      unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
- -      unsigned int sum_nr_running; /* Nr of tasks running in the group */
- -      unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
- -      unsigned int idle_cpus;
+ +      unsigned long avg_load;                 /* Avg load            over the CPUs of the group */
+ +      unsigned long group_load;               /* Total load          over the CPUs of the group */
+ +      unsigned long group_capacity;           /* Capacity            over the CPUs of the group */
+ +      unsigned long group_util;               /* Total utilization   over the CPUs of the group */
+ +      unsigned long group_runnable;           /* Total runnable time over the CPUs of the group */
+ +      unsigned int sum_nr_running;            /* Nr of all tasks running in the group */
+ +      unsigned int sum_h_nr_running;          /* Nr of CFS tasks running in the group */
+ +      unsigned int idle_cpus;                 /* Nr of idle CPUs         in the group */
         unsigned int group_weight;
         enum group_type group_type;
- -      unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
- -      unsigned int group_smt_balance;  /* Task on busy SMT be moved */
- -      unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
+ +      unsigned int group_asym_packing;        /* Tasks should be moved to preferred CPU */
+ +      unsigned int group_smt_balance;         /* Task on busy SMT be moved */
+ +      unsigned long group_misfit_task_load;   /* A CPU has a task too big for its capacity */
   #ifdef CONFIG_NUMA_BALANCING
         unsigned int nr_numa_running;
         unsigned int nr_preferred_running;
@@@ -9531,18 -9461,19 +9530,18 @@@
   };
   
   /*
- - * sd_lb_stats - Structure to store the statistics of a sched_domain
- - *             during load balancing.
+ + * sd_lb_stats - stats of a sched_domain required for load-balancing:
    */
   struct sd_lb_stats {
- -      struct sched_group *busiest;    /* Busiest group in this sd */
- -      struct sched_group *local;      /* Local group in this sd */
- -      unsigned long total_load;       /* Total load of all groups in sd */
- -      unsigned long total_capacity;   /* Total capacity of all groups in sd */
- -      unsigned long avg_load; /* Average load across all groups in sd */
- -      unsigned int prefer_sibling; /* tasks should go to sibling first */
- -
- -      struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
- -      struct sg_lb_stats local_stat;  /* Statistics of the local group */
+ +      struct sched_group *busiest;            /* Busiest group in this sd */
+ +      struct sched_group *local;              /* Local group in this sd */
+ +      unsigned long total_load;               /* Total load of all groups in sd */
+ +      unsigned long total_capacity;           /* Total capacity of all groups in sd */
+ +      unsigned long avg_load;                 /* Average load across all groups in sd */
+ +      unsigned int prefer_sibling;            /* Tasks should go to sibling first */
+ +
+ +      struct sg_lb_stats busiest_stat;        /* Statistics of the busiest group */
+ +      struct sg_lb_stats local_stat;          /* Statistics of the local group */
   };
   
   static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
@@@ -9568,8 -9499,8 +9567,8 @@@
   
   static unsigned long scale_rt_capacity(int cpu)
   {
+ +      unsigned long max = get_actual_cpu_capacity(cpu);
         struct rq *rq = cpu_rq(cpu);
- -      unsigned long max = arch_scale_cpu_capacity(cpu);
         unsigned long used, free;
         unsigned long irq;
   
@@@ -9581,9 -9512,12 +9580,9 @@@
         /*
          * avg_rt.util_avg and avg_dl.util_avg track binary signals
          * (running and not running) with weights 0 and 1024 respectively.
- -       * avg_thermal.load_avg tracks thermal pressure and the weighted
- -       * average uses the actual delta max capacity(load).
          */
         used = cpu_util_rt(rq);
         used += cpu_util_dl(rq);
- -      used += thermal_load_avg(rq);
   
         if (unlikely(used >= max))
                 return 1;
@@@ -9676,10 -9610,16 +9675,10 @@@ check_cpu_capacity(struct rq *rq, struc
                                 (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
   }
   
- -/*
- - * Check whether a rq has a misfit task and if it looks like we can actually
- - * help that task: we can migrate the task to a CPU of higher capacity, or
- - * the task's current CPU is heavily pressured.
- - */
- -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+ +/* Check if the rq has a misfit task */
+ +static inline bool check_misfit_status(struct rq *rq)
   {
- -      return rq->misfit_task_load &&
- -              (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- -               check_cpu_capacity(rq, sd));
+ +      return rq->misfit_task_load;
   }
   
   /*
@@@ -9703,7 -9643,7 +9702,7 @@@
    *
    * When this is so detected; this group becomes a candidate for busiest; see
    * update_sd_pick_busiest(). And calculate_imbalance() and
- - * find_busiest_group() avoid some of the usual balance conditions to allow it
+ + * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
    * to create an effective group imbalance.
    *
    * This is a somewhat tricky proposition since the next run might not find the
@@@ -9868,7 -9808,7 +9867,7 @@@ static inline bool smt_vs_nonsmt_groups
   static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
                                struct sched_group *group)
   {
- -      if (env->idle == CPU_NOT_IDLE)
+ +      if (!env->idle)
                 return false;
   
         /*
@@@ -9892,7 -9832,7 +9891,7 @@@ static inline long sibling_imbalance(st
         int ncores_busiest, ncores_local;
         long imbalance;
   
- -      if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ +      if (!env->idle || !busiest->sum_nr_running)
                 return 0;
   
         ncores_busiest = sds->busiest->cores;
@@@ -9938,15 -9878,13 +9937,15 @@@ sched_reduced_capacity(struct rq *rq, s
    * @sds: Load-balancing data with statistics of the local group.
    * @group: sched_group whose statistics are to be updated.
    * @sgs: variable to hold the statistics for this group.
- - * @sg_status: Holds flag indicating the status of the sched_group
+ + * @sg_overloaded: sched_group is overloaded
+ + * @sg_overutilized: sched_group is overutilized
    */
   static inline void update_sg_lb_stats(struct lb_env *env,
                                       struct sd_lb_stats *sds,
                                       struct sched_group *group,
                                       struct sg_lb_stats *sgs,
- -                                    int *sg_status)
+ +                                    bool *sg_overloaded,
+ +                                    bool *sg_overutilized)
   {
         int i, nr_running, local_group;
   
@@@ -9967,10 -9905,10 +9966,10 @@@
                 sgs->sum_nr_running += nr_running;
   
                 if (nr_running > 1)
- -                      *sg_status |= SG_OVERLOAD;
+ +                      *sg_overloaded = 1;
   
                 if (cpu_overutilized(i))
- -                      *sg_status |= SG_OVERUTILIZED;
+ +                      *sg_overutilized = 1;
   
   #ifdef CONFIG_NUMA_BALANCING
                 sgs->nr_numa_running += rq->nr_numa_running;
@@@ -9992,9 -9930,10 +9991,9 @@@
                         /* Check for a misfit task on the cpu */
                         if (sgs->group_misfit_task_load < rq->misfit_task_load) {
                                 sgs->group_misfit_task_load = rq->misfit_task_load;
- -                              *sg_status |= SG_OVERLOAD;
+ +                              *sg_overloaded = 1;
                         }
- -              } else if ((env->idle != CPU_NOT_IDLE) &&
- -                         sched_reduced_capacity(rq, env->sd)) {
+ +              } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
                         /* Check for a task running on a CPU with reduced capacity */
                         if (sgs->group_misfit_task_load < load)
                                 sgs->group_misfit_task_load = load;
@@@ -10006,7 -9945,7 +10005,7 @@@
         sgs->group_weight = group->group_weight;
   
         /* Check if dst CPU is idle and preferred to this group */
- -      if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+ +      if (!local_group && env->idle && sgs->sum_h_nr_running &&
             sched_group_asym(env, sgs, group))
                 sgs->group_asym_packing = 1;
   
@@@ -10144,7 -10083,7 +10143,7 @@@ static bool update_sd_pick_busiest(stru
   has_spare:
   
                 /*
- -               * Select not overloaded group with lowest number of idle cpus
+ +               * Select not overloaded group with lowest number of idle CPUs
                  * and highest number of running tasks. We could also compare
                  * the spare capacity which is more stable but it can end up
                  * that the group has less spare capacity but finally more idle
@@@ -10364,13 -10303,13 +10363,13 @@@ static bool update_pick_idlest(struct s
   }
   
   /*
- - * find_idlest_group() finds and returns the least busy CPU group within the
+ + * sched_balance_find_dst_group() finds and returns the least busy CPU group within the
    * domain.
    *
    * Assumes p is allowed on at least one CPU in sd.
    */
   static struct sched_group *
- -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+ +sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
   {
         struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
         struct sg_lb_stats local_sgs, tmp_sgs;
@@@ -10618,7 -10557,7 +10617,7 @@@ static inline void update_sd_lb_stats(s
         struct sg_lb_stats *local = &sds->local_stat;
         struct sg_lb_stats tmp_sgs;
         unsigned long sum_util = 0;
- -      int sg_status = 0;
+ +      bool sg_overloaded = 0, sg_overutilized = 0;
   
         do {
                 struct sg_lb_stats *sgs = &tmp_sgs;
@@@ -10634,7 -10573,7 +10633,7 @@@
                                 update_group_capacity(env->sd, env->dst_cpu);
                 }
   
- -              update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
+ +              update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
   
                 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
                         sds->busiest = sg;
@@@ -10662,13 -10601,19 +10661,13 @@@
                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
   
         if (!env->sd->parent) {
- -              struct root_domain *rd = env->dst_rq->rd;
- -
                 /* update overload indicator if we are at root domain */
- -              WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+ +              set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
   
                 /* Update over-utilization (tipping point, U >= 0) indicator */
- -              WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
- -              trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
- -      } else if (sg_status & SG_OVERUTILIZED) {
- -              struct root_domain *rd = env->dst_rq->rd;
- -
- -              WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
- -              trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
+ +              set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
+ +      } else if (sg_overutilized) {
+ +              set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
         }
   
         update_idle_cpu_scan(env, sum_util);
@@@ -10758,7 -10703,7 +10757,7 @@@ static inline void calculate_imbalance(
                          * waiting task in this overloaded busiest group. Let's
                          * try to pull it.
                          */
- -                      if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+ +                      if (env->idle && env->imbalance == 0) {
                                 env->migration_type = migrate_task;
                                 env->imbalance = 1;
                         }
@@@ -10777,7 -10722,7 +10776,7 @@@
   
                         /*
                          * If there is no overload, we just want to even the number of
- -                       * idle cpus.
+ +                       * idle CPUs.
                          */
                         env->migration_type = migrate_task;
                         env->imbalance = max_t(long, 0,
@@@ -10850,7 -10795,7 +10849,7 @@@
         ) / SCHED_CAPACITY_SCALE;
   }
   
- -/******* find_busiest_group() helpers end here *********************/
+ +/******* sched_balance_find_src_group() helpers end here *********************/
   
   /*
    * Decision matrix according to the local and busiest group type:
@@@ -10873,7 -10818,7 +10872,7 @@@
    */
   
   /**
- - * find_busiest_group - Returns the busiest group within the sched_domain
+ + * sched_balance_find_src_group - Returns the busiest group within the sched_domain
    * if there is an imbalance.
    * @env: The load balancing environment.
    *
@@@ -10882,7 -10827,7 +10881,7 @@@
    *
    * Return:    - The busiest group if imbalance exists.
    */
- -static struct sched_group *find_busiest_group(struct lb_env *env)
+ +static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
   {
         struct sg_lb_stats *local, *busiest;
         struct sd_lb_stats sds;
@@@ -10905,9 -10850,12 +10904,9 @@@
         if (busiest->group_type == group_misfit_task)
                 goto force_balance;
   
- -      if (sched_energy_enabled()) {
- -              struct root_domain *rd = env->dst_rq->rd;
- -
- -              if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- -                      goto out_balanced;
- -      }
+ +      if (!is_rd_overutilized(env->dst_rq->rd) &&
+ +          rcu_dereference(env->dst_rq->rd->pd))
+ +              goto out_balanced;
   
         /* ASYM feature bypasses nice load balance check */
         if (busiest->group_type == group_asym_packing)
@@@ -10970,7 -10918,7 +10969,7 @@@
                 goto force_balance;
   
         if (busiest->group_type != group_overloaded) {
- -              if (env->idle == CPU_NOT_IDLE) {
+ +              if (!env->idle) {
                         /*
                          * If the busiest group is not overloaded (and as a
                          * result the local one too) but this CPU is already
@@@ -11018,9 -10966,9 +11017,9 @@@ out_balanced
   }
   
   /*
- - * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
+ + * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
    */
- -static struct rq *find_busiest_queue(struct lb_env *env,
+ +static struct rq *sched_balance_find_src_rq(struct lb_env *env,
                                      struct sched_group *group)
   {
         struct rq *busiest = NULL, *rq;
@@@ -11178,7 -11126,7 +11177,7 @@@ asym_active_balance(struct lb_env *env
          * the lower priority @env::dst_cpu help it. Do not follow
          * CPU priority.
          */
- -      return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ +      return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
                (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
                 !sched_use_asym_prio(env->sd, env->src_cpu));
   }
@@@ -11216,7 -11164,7 +11215,7 @@@ static int need_active_balance(struct l
          * because of other sched_class or IRQs if more capacity stays
          * available on dst_cpu.
          */
- -      if ((env->idle != CPU_NOT_IDLE) &&
+ +      if (env->idle &&
             (env->src_rq->cfs.h_nr_running == 1)) {
                 if ((check_cpu_capacity(env->src_rq, sd)) &&
                     (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
@@@ -11301,7 -11249,7 +11300,7 @@@ static int should_we_balance(struct lb_
    * Check this_cpu to ensure it is balanced within domain. Attempt to move
    * tasks if there is an imbalance.
    */
- -static int load_balance(int this_cpu, struct rq *this_rq,
+ +static int sched_balance_rq(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *continue_balancing)
   {
@@@ -11333,13 -11281,13 +11332,13 @@@ redo
                 goto out_balanced;
         }
   
- -      group = find_busiest_group(&env);
+ +      group = sched_balance_find_src_group(&env);
         if (!group) {
                 schedstat_inc(sd->lb_nobusyg[idle]);
                 goto out_balanced;
         }
   
- -      busiest = find_busiest_queue(&env, group);
+ +      busiest = sched_balance_find_src_rq(&env, group);
         if (!busiest) {
                 schedstat_inc(sd->lb_nobusyq[idle]);
                 goto out_balanced;
@@@ -11357,7 -11305,7 +11356,7 @@@
         env.flags |= LBF_ALL_PINNED;
         if (busiest->nr_running > 1) {
                 /*
- -               * Attempt to move tasks. If find_busiest_group has found
+ +               * Attempt to move tasks. If sched_balance_find_src_group has found
                  * an imbalance but busiest->nr_running <= 1, the group is
                  * still unbalanced. ld_moved simply stays zero, so it is
                  * correctly treated as an imbalance.
@@@ -11472,12 -11420,8 +11471,12 @@@ more_balance
                  * We do not want newidle balance, which can be very
                  * frequent, pollute the failure counter causing
                  * excessive cache_hot migrations and active balances.
+ +               *
+ +               * Similarly for migrate_misfit which is not related to
+ +               * load/util migration, don't pollute nr_balance_failed.
                  */
- -              if (idle != CPU_NEWLY_IDLE)
+ +              if (idle != CPU_NEWLY_IDLE &&
+ +                  env.migration_type != migrate_misfit)
                         sd->nr_balance_failed++;
   
                 if (need_active_balance(&env)) {
@@@ -11556,17 -11500,12 +11555,17 @@@ out_one_pinned
         ld_moved = 0;
   
         /*
- -       * newidle_balance() disregards balance intervals, so we could
+ +       * sched_balance_newidle() disregards balance intervals, so we could
          * repeatedly reach this code, which would lead to balance_interval
          * skyrocketing in a short amount of time. Skip the balance_interval
          * increase logic to avoid that.
+ +       *
+ +       * Similarly, a misfit migration is not necessarily an indication of
+ +       * the system being busy and requiring lb to back off to let it settle
+ +       * down.
          */
- -      if (env.idle == CPU_NEWLY_IDLE)
+ +      if (env.idle == CPU_NEWLY_IDLE ||
+ +          env.migration_type == migrate_misfit)
                 goto out;
   
         /* tune up the balancing interval */
@@@ -11699,23 -11638,10 +11698,23 @@@ out_unlock
         return 0;
   }
   
- -static DEFINE_SPINLOCK(balancing);
+ +/*
+ + * This flag serializes load-balancing passes over large domains
+ + * (above the NODE topology level) - only one load-balancing instance
+ + * may run at a time, to reduce overhead on very large systems with
+ + * lots of CPUs and large NUMA distances.
+ + *
+ + * - Note that load-balancing passes triggered while another one
+ + *   is executing are skipped and not re-tried.
+ + *
+ + * - Also note that this does not serialize sched_balance_domains()
+ + *   execution, as non-SD_SERIALIZE domains will still be
+ + *   load-balanced in parallel.
+ + */
+ +static atomic_t sched_balance_running = ATOMIC_INIT(0);
   
   /*
- - * Scale the max load_balance interval with the number of CPUs in the system.
+ + * Scale the max sched_balance_rq interval with the number of CPUs in the system.
    * This trades load-balance latency on larger machines for less cross talk.
    */
   void update_max_interval(void)
@@@ -11753,7 -11679,7 +11752,7 @@@ static inline bool update_newidle_cost(
    *
    * Balancing parameters are set up in init_sched_domains.
    */
- -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
+ +static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
   {
         int continue_balancing = 1;
         int cpu = rq->cpu;
@@@ -11790,25 -11716,25 +11789,25 @@@
   
                 need_serialize = sd->flags & SD_SERIALIZE;
                 if (need_serialize) {
- -                      if (!spin_trylock(&balancing))
+ +                      if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
                                 goto out;
                 }
   
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
- -                      if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+ +                      if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
                                 /*
                                  * The LBF_DST_PINNED logic could have changed
                                  * env->dst_cpu, so we can't know our idle
                                  * state even if we migrated tasks. Update it.
                                  */
- -                              idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
- -                              busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+ +                              idle = idle_cpu(cpu);
+ +                              busy = !idle && !sched_idle_cpu(cpu);
                         }
                         sd->last_balance = jiffies;
                         interval = get_sd_balance_interval(sd, busy);
                 }
                 if (need_serialize)
- -                      spin_unlock(&balancing);
+ +                      atomic_set_release(&sched_balance_running, 0);
   out:
                 if (time_after(next_balance, sd->last_balance + interval)) {
                         next_balance = sd->last_balance + interval;
@@@ -11968,7 -11894,7 +11967,7 @@@ static void nohz_balancer_kick(struct r
                  * currently idle; in which case, kick the ILB to move tasks
                  * around.
                  *
- -               * When balancing betwen cores, all the SMT siblings of the
+ +               * When balancing between cores, all the SMT siblings of the
                  * preferred CPU must be idle.
                  */
                 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
@@@ -11985,7 -11911,7 +11984,7 @@@
                  * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
                  * to run the misfit task on.
                  */
- -              if (check_misfit_status(rq, sd)) {
+ +              if (check_misfit_status(rq)) {
                         flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
                         goto unlock;
                 }
@@@ -12129,7 -12055,7 +12128,7 @@@ void nohz_balance_enter_idle(int cpu
   out:
         /*
          * Each time a cpu enter idle, we assume that it has blocked load and
- -       * enable the periodic update of the load of idle cpus
+ +       * enable the periodic update of the load of idle CPUs
          */
         WRITE_ONCE(nohz.has_blocked, 1);
   }
@@@ -12147,13 -12073,13 +12146,13 @@@ static bool update_nohz_stats(struct r
         if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
                 return true;
   
- -      update_blocked_averages(cpu);
+ +      sched_balance_update_blocked_averages(cpu);
   
         return rq->has_blocked_load;
   }
   
   /*
- - * Internal function that runs load balance for all idle cpus. The load balance
+ + * Internal function that runs load balance for all idle CPUs. The load balance
    * can be a simple update of blocked load or a complete load balance with
    * tasks movement depending of flags.
    */
@@@ -12229,7 -12155,7 +12228,7 @@@ static void _nohz_idle_balance(struct r
                         rq_unlock_irqrestore(rq, &rf);
   
                         if (flags & NOHZ_BALANCE_KICK)
- -                              rebalance_domains(rq, CPU_IDLE);
+ +                              sched_balance_domains(rq, CPU_IDLE);
                 }
   
                 if (time_after(next_balance, rq->next_balance)) {
@@@ -12258,7 -12184,7 +12257,7 @@@ abort
   
   /*
    * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- - * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ + * rebalancing for all the CPUs for whom scheduler ticks are stopped.
    */
   static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
   {
@@@ -12289,7 -12215,7 +12288,7 @@@
    * called from this function on (this) CPU that's not yet in the mask. That's
    * OK because the goal of nohz_run_idle_balance() is to run ILB only for
    * updating the blocked load of already idle CPUs without waking up one of
- - * those idle CPUs and outside the preempt disable / irq off phase of the local
+ + * those idle CPUs and outside the preempt disable / IRQ off phase of the local
    * cpu about to enter idle, because it can take a long time.
    */
   void nohz_run_idle_balance(int cpu)
@@@ -12300,7 -12226,7 +12299,7 @@@
   
         /*
          * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
- -       * (ie NOHZ_STATS_KICK set) and will do the same.
+ +       * (i.e. NOHZ_STATS_KICK set) and will do the same.
          */
         if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
                 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
@@@ -12345,7 -12271,7 +12344,7 @@@ static inline void nohz_newidle_balance
   #endif /* CONFIG_NO_HZ_COMMON */
   
   /*
- - * newidle_balance is called by schedule() if this_cpu is about to become
+ + * sched_balance_newidle is called by schedule() if this_cpu is about to become
    * idle. Attempts to pull tasks from other CPUs.
    *
    * Returns:
@@@ -12353,11 -12279,10 +12352,11 @@@
    *     0 - failed, no new tasks
    *   > 0 - success, new (fair) tasks present
    */
- -static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
+ +static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
   {
         unsigned long next_balance = jiffies + HZ;
         int this_cpu = this_rq->cpu;
+ +      int continue_balancing = 1;
         u64 t0, t1, curr_cost = 0;
         struct sched_domain *sd;
         int pulled_task = 0;
@@@ -12372,9 -12297,8 +12371,9 @@@
                 return 0;
   
         /*
- -       * We must set idle_stamp _before_ calling idle_balance(), such that we
- -       * measure the duration of idle_balance() as idle time.
+ +       * We must set idle_stamp _before_ calling sched_balance_rq()
+ +       * for CPU_NEWLY_IDLE, such that we measure this duration
+ +       * as idle time.
          */
         this_rq->idle_stamp = rq_clock(this_rq);
   
@@@ -12395,7 -12319,7 +12394,7 @@@
         rcu_read_lock();
         sd = rcu_dereference_check_sched_domain(this_rq->sd);
   
- -      if (!READ_ONCE(this_rq->rd->overload) ||
+ +      if (!get_rd_overloaded(this_rq->rd) ||
             (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
   
                 if (sd)
@@@ -12409,10 -12333,11 +12408,10 @@@
         raw_spin_rq_unlock(this_rq);
   
         t0 = sched_clock_cpu(this_cpu);
- -      update_blocked_averages(this_cpu);
+ +      sched_balance_update_blocked_averages(this_cpu);
   
         rcu_read_lock();
         for_each_domain(this_cpu, sd) {
- -              int continue_balancing = 1;
                 u64 domain_cost;
   
                 update_next_balance(sd, &next_balance);
@@@ -12422,7 -12347,7 +12421,7 @@@
   
                 if (sd->flags & SD_BALANCE_NEWIDLE) {
   
- -                      pulled_task = load_balance(this_cpu, this_rq,
+ +                      pulled_task = sched_balance_rq(this_cpu, this_rq,
                                                    sd, CPU_NEWLY_IDLE,
                                                    &continue_balancing);
   
@@@ -12438,7 -12363,8 +12437,7 @@@
                  * Stop searching for tasks to pull if there are
                  * now runnable tasks on this rq.
                  */
- -              if (pulled_task || this_rq->nr_running > 0 ||
- -                  this_rq->ttwu_pending)
+ +              if (pulled_task || !continue_balancing)
                         break;
         }
         rcu_read_unlock();
@@@ -12476,21 -12402,19 +12475,21 @@@ out
   }
   
   /*
- - * run_rebalance_domains is triggered when needed from the scheduler tick.
- - * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ + * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
+ + *
+ + * - directly from the local scheduler_tick() for periodic load balancing
+ + *
+ + * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
+ + *   through the SMP cross-call nohz_csd_func()
    */
- -static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
+ +static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
   {
         struct rq *this_rq = this_rq();
- -      enum cpu_idle_type idle = this_rq->idle_balance ?
- -                                              CPU_IDLE : CPU_NOT_IDLE;
- -
+ +      enum cpu_idle_type idle = this_rq->idle_balance;
         /*
- -       * If this CPU has a pending nohz_balance_kick, then do the
+ +       * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
          * balancing on behalf of the other idle CPUs whose ticks are
- -       * stopped. Do nohz_idle_balance *before* rebalance_domains to
+ +       * stopped. Do nohz_idle_balance *before* sched_balance_domains to
          * give the idle CPUs a chance to load balance. Else we may
          * load balance only within the local sched_domain hierarchy
          * and abort nohz_idle_balance altogether if we pull some load.
@@@ -12499,14 -12423,14 +12498,14 @@@
                 return;
   
         /* normal load balance */
- -      update_blocked_averages(this_rq->cpu);
- -      rebalance_domains(this_rq, idle);
+ +      sched_balance_update_blocked_averages(this_rq->cpu);
+ +      sched_balance_domains(this_rq, idle);
   }
   
   /*
    * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
    */
- -void trigger_load_balance(struct rq *rq)
+ +void sched_balance_trigger(struct rq *rq)
   {
         /*
          * Don't need to rebalance while attached to NULL domain or
@@@ -12690,7 -12614,7 +12689,7 @@@ static void task_tick_fair(struct rq *r
                 task_tick_numa(rq, curr);
   
         update_misfit_status(curr, rq);
- -      update_overutilized_status(task_rq(curr));
+ +      check_update_overutilized_status(task_rq(curr));
   
         task_tick_core(rq, curr);
   }
@@@ -12710,8 -12634,6 +12709,8 @@@ static void task_fork_fair(struct task_
         rq_lock(rq, &rf);
         update_rq_clock(rq);
   
+ +      set_task_max_allowed_capacity(p);
+ +
         cfs_rq = task_cfs_rq(current);
         curr = cfs_rq->curr;
         if (curr)
@@@ -12835,8 -12757,6 +12834,8 @@@ static void switched_to_fair(struct rq 
   {
         attach_task_cfs_rq(p);
   
+ +      set_task_max_allowed_capacity(p);
+ +
         if (task_on_rq_queued(p)) {
                 /*
                  * We were most likely switched from sched_rt, so
@@@ -13208,7 -13128,7 +13207,7 @@@ DEFINE_SCHED_CLASS(fair) = 
         .rq_offline             = rq_offline_fair,
   
         .task_dead              = task_dead_fair,
- -      .set_cpus_allowed       = set_cpus_allowed_common,
+ +      .set_cpus_allowed       = set_cpus_allowed_fair,
   #endif
   
         .task_tick              = task_tick_fair,
@@@ -13288,7 -13208,7 +13287,7 @@@ __init void init_sched_fair_class(void
   #endif
         }
   
- -      open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+ +      open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
   
   #ifdef CONFIG_NO_HZ_COMMON
         nohz.next_balance = jiffies;
diff --combined kernel/sched/topology.c

index 63aecd2a7a9f351e557e74ba101c04da4250605d,42c22648d124ff96c9a1f5a390e795dd836af52a..6835598316561cbde8e3154e942bc8c4067c3746
--- 1/kernel/sched/topology.c
--- 2/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@@ -322,7 -322,6 +322,6 @@@ static struct ctl_table sched_energy_aw
                 .extra1         = SYSCTL_ZERO,
                 .extra2         = SYSCTL_ONE,
         },
-       {}
   };
   
   static int __init sched_energy_aware_sysctl_init(void)
@@@ -1329,13 -1328,24 +1328,13 @@@ next
         update_group_capacity(sd, cpu);
   }
   
- -/*
- - * Asymmetric CPU capacity bits
- - */
- -struct asym_cap_data {
- -      struct list_head link;
- -      unsigned long capacity;
- -      unsigned long cpus[];
- -};
- -
   /*
    * Set of available CPUs grouped by their corresponding capacities
    * Each list entry contains a CPU mask reflecting CPUs that share the same
    * capacity.
    * The lifespan of data is unlimited.
    */
- -static LIST_HEAD(asym_cap_list);
- -
- -#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+ +LIST_HEAD(asym_cap_list);
   
   /*
    * Verify whether there is any CPU capacity asymmetry in a given sched domain.
@@@ -1375,39 -1385,21 +1374,39 @@@ asym_cpu_capacity_classify(const struc
   
   }
   
+ +static void free_asym_cap_entry(struct rcu_head *head)
+ +{
+ +      struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
+ +      kfree(entry);
+ +}
+ +
   static inline void asym_cpu_capacity_update_data(int cpu)
   {
         unsigned long capacity = arch_scale_cpu_capacity(cpu);
- -      struct asym_cap_data *entry = NULL;
+ +      struct asym_cap_data *insert_entry = NULL;
+ +      struct asym_cap_data *entry;
   
+ +      /*
+ +       * Search if capacity already exists. If not, track the entry
+ +       * after which we should insert to keep the list ordered descendingly.
+ +       */
         list_for_each_entry(entry, &asym_cap_list, link) {
                 if (capacity == entry->capacity)
                         goto done;
+ +              else if (!insert_entry && capacity > entry->capacity)
+ +                      insert_entry = list_prev_entry(entry, link);
         }
   
         entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
         if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
                 return;
         entry->capacity = capacity;
- -      list_add(&entry->link, &asym_cap_list);
+ +
+ +      /* If NULL then the new capacity is the smallest, add last. */
+ +      if (!insert_entry)
+ +              list_add_tail_rcu(&entry->link, &asym_cap_list);
+ +      else
+ +              list_add_rcu(&entry->link, &insert_entry->link);
   done:
         __cpumask_set_cpu(cpu, cpu_capacity_span(entry));
   }
@@@ -1430,8 -1422,8 +1429,8 @@@ static void asym_cpu_capacity_scan(void
   
         list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
                 if (cpumask_empty(cpu_capacity_span(entry))) {
- -                      list_del(&entry->link);
- -                      kfree(entry);
+ +                      list_del_rcu(&entry->link);
+ +                      call_rcu(&entry->rcu, free_asym_cap_entry);
                 }
         }
   
@@@ -1441,8 -1433,8 +1440,8 @@@
          */
         if (list_is_singular(&asym_cap_list)) {
                 entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
- -              list_del(&entry->link);
- -              kfree(entry);
+ +              list_del_rcu(&entry->link);
+ +              call_rcu(&entry->rcu, free_asym_cap_entry);
         }
   }
   
@@@ -2514,9 -2506,16 +2513,9 @@@ build_sched_domains(const struct cpumas
         /* Attach the domains */
         rcu_read_lock();
         for_each_cpu(i, cpu_map) {
- -              unsigned long capacity;
- -
                 rq = cpu_rq(i);
                 sd = *per_cpu_ptr(d.sd, i);
   
- -              capacity = arch_scale_cpu_capacity(i);
- -              /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- -              if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
- -                      WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
- -
                 cpu_attach_domain(sd, d.rd, i);
   
                 if (lowest_flag_domain(i, SD_CLUSTER))
@@@ -2530,8 -2529,10 +2529,8 @@@
         if (has_cluster)
                 static_branch_inc_cpuslocked(&sched_cluster_active);
   
- -      if (rq && sched_debug_verbose) {
- -              pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
- -                      cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- -      }
+ +      if (rq && sched_debug_verbose)
+ +              pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
   
         ret = 0;
   error:
diff --combined kernel/seccomp.c

index f70e031e06a80faed23b70d71cb53ea45dede8cb,7ed72723fb8a17e180abc7ab9276984e5397a26d..e30b60b57614ed9f3a07461154180d51941d5186
--- 1/kernel/seccomp.c
--- 2/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@@ -2334,7 -2334,7 +2334,7 @@@ static bool seccomp_actions_logged_from
         return true;
   }
   
- -static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
+ +static int read_actions_logged(const struct ctl_table *ro_table, void *buffer,
                                size_t *lenp, loff_t *ppos)
   {
         char names[sizeof(seccomp_actions_avail)];
@@@ -2352,7 -2352,7 +2352,7 @@@
         return proc_dostring(&table, 0, buffer, lenp, ppos);
   }
   
- -static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
+ +static int write_actions_logged(const struct ctl_table *ro_table, void *buffer,
                                 size_t *lenp, loff_t *ppos, u32 *actions_logged)
   {
         char names[sizeof(seccomp_actions_avail)];
@@@ -2445,7 -2445,6 +2445,6 @@@ static struct ctl_table seccomp_sysctl_
                 .mode           = 0644,
                 .proc_handler   = seccomp_actions_logged_handler,
         },
-       { }
   };
   
   static int __init seccomp_sysctl_init(void)
diff --combined kernel/stackleak.c

index 59cdfaf5118e8c22a207f87b00bec20a881b2391,d099f3affcf1a69b185278fca4767527fe1464e3..0f971258491309025be1b79cd71597eb0a9ca396
--- 1/kernel/stackleak.c
--- 2/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@@ -27,10 -27,10 +27,10 @@@ static int stack_erasing_sysctl(struct 
         int ret = 0;
         int state = !static_branch_unlikely(&stack_erasing_bypass);
         int prev_state = state;
+ +      struct ctl_table table_copy = *table;
   
- -      table->data = &state;
- -      table->maxlen = sizeof(int);
- -      ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ +      table_copy.data = &state;
+ +      ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos);
         state = !!state;
         if (ret || !write || state == prev_state)
                 return ret;
@@@ -54,7 -54,6 +54,6 @@@ static struct ctl_table stackleak_sysct
                 .extra1         = SYSCTL_ZERO,
                 .extra2         = SYSCTL_ONE,
         },
-       {}
   };
   
   static int __init stackleak_sysctls_init(void)
diff --combined kernel/time/timer.c

index e394d6d5b9b54290571a75a48e01bcfb2266b7ef,9d107f4b506c6f822b70d785649e836bebfe0b4e..48288dd4a102f745f083172cb3add2a82c1b8ea3
--- 1/kernel/time/timer.c
--- 2/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@@ -64,15 -64,15 +64,15 @@@ EXPORT_SYMBOL(jiffies_64)
   
   /*
    * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
- - * LVL_SIZE buckets. Each level is driven by its own clock and therefor each
+ + * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
    * level has a different granularity.
    *
- - * The level granularity is:          LVL_CLK_DIV ^ lvl
+ + * The level granularity is:          LVL_CLK_DIV ^ level
    * The level clock frequency is:      HZ / (LVL_CLK_DIV ^ level)
    *
    * The array level of a newly armed timer depends on the relative expiry
    * time. The farther the expiry time is away the higher the array level and
- - * therefor the granularity becomes.
+ + * therefore the granularity becomes.
    *
    * Contrary to the original timer wheel implementation, which aims for 'exact'
    * expiry of the timers, this implementation removes the need for recascading
@@@ -207,7 -207,7 +207,7 @@@
    * struct timer_base - Per CPU timer base (number of base depends on config)
    * @lock:             Lock protecting the timer_base
    * @running_timer:    When expiring timers, the lock is dropped. To make
- - *                    sure not to race agains deleting/modifying a
+ + *                    sure not to race against deleting/modifying a
    *                    currently running timer, the pointer is set to the
    *                    timer, which expires at the moment. If no timer is
    *                    running, the pointer is NULL.
@@@ -312,7 -312,6 +312,6 @@@ static struct ctl_table timer_sysctl[] 
                 .extra1         = SYSCTL_ZERO,
                 .extra2         = SYSCTL_ONE,
         },
-       {}
   };
   
   static int __init timer_sysctl_init(void)
@@@ -737,7 -736,7 +736,7 @@@ static bool timer_is_static_object(voi
   }
   
   /*
- - * fixup_init is called when:
+ + * timer_fixup_init is called when:
    * - an active object is initialized
    */
   static bool timer_fixup_init(void *addr, enum debug_obj_state state)
@@@ -761,7 -760,7 +760,7 @@@ static void stub_timer(struct timer_lis
   }
   
   /*
- - * fixup_activate is called when:
+ + * timer_fixup_activate is called when:
    * - an active object is activated
    * - an unknown non-static object is activated
    */
@@@ -783,7 -782,7 +782,7 @@@ static bool timer_fixup_activate(void *
   }
   
   /*
- - * fixup_free is called when:
+ + * timer_fixup_free is called when:
    * - an active object is freed
    */
   static bool timer_fixup_free(void *addr, enum debug_obj_state state)
@@@ -801,7 -800,7 +800,7 @@@
   }
   
   /*
- - * fixup_assert_init is called when:
+ + * timer_fixup_assert_init is called when:
    * - an untracked/uninit-ed object is found
    */
   static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
@@@ -914,7 -913,7 +913,7 @@@ static void do_init_timer(struct timer_
    * @key: lockdep class key of the fake lock used for tracking timer
    *       sync lock dependencies
    *
- - * init_timer_key() must be done to a timer prior calling *any* of the
+ + * init_timer_key() must be done to a timer prior to calling *any* of the
    * other timer functions.
    */
   void init_timer_key(struct timer_list *timer,
@@@ -1417,7 -1416,7 +1416,7 @@@ static int __timer_delete(struct timer_
          * If @shutdown is set then the lock has to be taken whether the
          * timer is pending or not to protect against a concurrent rearm
          * which might hit between the lockless pending check and the lock
- -       * aquisition. By taking the lock it is ensured that such a newly
+ +       * acquisition. By taking the lock it is ensured that such a newly
          * enqueued timer is dequeued and cannot end up with
          * timer->function == NULL in the expiry code.
          *
@@@ -2306,7 -2305,7 +2305,7 @@@ static inline u64 __get_next_timer_inte
   
                 /*
                  * When timer base is not set idle, undo the effect of
- -               * tmigr_cpu_deactivate() to prevent inconsitent states - active
+ +               * tmigr_cpu_deactivate() to prevent inconsistent states - active
                  * timer base but inactive timer migration hierarchy.
                  *
                  * When timer base was already marked idle, nothing will be
@@@ -2488,7 -2487,7 +2487,7 @@@ void update_process_times(int user_tick
         if (in_irq())
                 irq_work_tick();
   #endif
- -      scheduler_tick();
+ +      sched_tick();
         if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                 run_posix_cpu_timers();
   }
diff --combined kernel/trace/ftrace.c

index 6c96b30f3d63b0a681de16deb50a6a96f6d0c61d,6cec53aa45a6d2f10f8b00011963b884cc7c81c6..50ca4d4f88409860986d3427e5167eff8f5bc97d
--- 1/kernel/trace/ftrace.c
--- 2/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@@ -3157,7 -3157,8 +3157,7 @@@ out
                  * synchronize_rcu_tasks() will wait for those tasks to
                  * execute and either schedule voluntarily or enter user space.
                  */
- -              if (IS_ENABLED(CONFIG_PREEMPTION))
- -                      synchronize_rcu_tasks();
+ +              synchronize_rcu_tasks();
   
                 ftrace_trampoline_free(ops);
         }
@@@ -8269,7 -8270,6 +8269,6 @@@ static struct ctl_table ftrace_sysctls[
                 .mode           = 0644,
                 .proc_handler   = ftrace_enable_sysctl,
         },
-       {}
   };
   
   static int __init ftrace_sysctl_init(void)
diff --combined kernel/ucount.c

index d9e283600f5c7c3a1e86ee11d94a7c6e145b85a4,4d5b9c12c014226028e317ad649b8a69294b8834..8c07714ff27d42e16cf24400c6678cd691887dbd
--- 1/kernel/ucount.c
--- 2/kernel/ucount.c
+++ b/kernel/ucount.c
@@@ -38,7 -38,7 +38,7 @@@ static int set_is_seen(struct ctl_table
   }
   
   static int set_permissions(struct ctl_table_header *head,
-                                 struct ctl_table *table)
+                          const struct ctl_table *table)
   {
         struct user_namespace *user_ns =
                 container_of(head->set, struct user_namespace, set);
@@@ -87,7 -87,6 +87,6 @@@ static struct ctl_table user_table[] = 
         UCOUNT_ENTRY("max_fanotify_groups"),
         UCOUNT_ENTRY("max_fanotify_marks"),
   #endif
-       { }
   };
   #endif /* CONFIG_SYSCTL */
   
@@@ -96,7 -95,7 +95,7 @@@ bool setup_userns_sysctls(struct user_n
   #ifdef CONFIG_SYSCTL
         struct ctl_table *tbl;
   
-       BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1);
+       BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS);
         setup_sysctl_set(&ns->set, &set_root, set_is_seen);
         tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
         if (tbl) {
@@@ -119,7 -118,7 +118,7 @@@
   void retire_userns_sysctls(struct user_namespace *ns)
   {
   #ifdef CONFIG_SYSCTL
- -      struct ctl_table *tbl;
+ +      const struct ctl_table *tbl;
   
         tbl = ns->sysctls->ctl_table_arg;
         unregister_sysctl_table(ns->sysctls);
diff --combined kernel/watchdog.c

index d12ff74889ed54a0247506eb5f3cb24faf7dee6a,4e472d416525b7107d0b271158b4c15d7f598d56..941236828de80f16922e37c902d5e2d0ddd25218
--- 1/kernel/watchdog.c
--- 2/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@@ -12,25 -12,20 +12,25 @@@
   
   #define pr_fmt(fmt) "watchdog: " fmt
   
- -#include <linux/mm.h>
   #include <linux/cpu.h>
- -#include <linux/nmi.h>
   #include <linux/init.h>
+ +#include <linux/irq.h>
+ +#include <linux/irqdesc.h>
+ +#include <linux/kernel_stat.h>
+ +#include <linux/kvm_para.h>
+ +#include <linux/math64.h>
+ +#include <linux/mm.h>
   #include <linux/module.h>
+ +#include <linux/nmi.h>
+ +#include <linux/stop_machine.h>
   #include <linux/sysctl.h>
   #include <linux/tick.h>
+ +
   #include <linux/sched/clock.h>
   #include <linux/sched/debug.h>
   #include <linux/sched/isolation.h>
- -#include <linux/stop_machine.h>
   
   #include <asm/irq_regs.h>
- -#include <linux/kvm_para.h>
   
   static DEFINE_MUTEX(watchdog_mutex);
   
@@@ -40,8 -35,6 +40,8 @@@
   # define WATCHDOG_HARDLOCKUP_DEFAULT  0
   #endif
   
+ +#define NUM_SAMPLE_PERIODS    5
+ +
   unsigned long __read_mostly watchdog_enabled;
   int __read_mostly watchdog_user_enabled = 1;
   static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
@@@ -340,188 -333,6 +340,188 @@@ __setup("watchdog_thresh=", watchdog_th
   
   static void __lockup_detector_cleanup(void);
   
+ +#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
+ +enum stats_per_group {
+ +      STATS_SYSTEM,
+ +      STATS_SOFTIRQ,
+ +      STATS_HARDIRQ,
+ +      STATS_IDLE,
+ +      NUM_STATS_PER_GROUP,
+ +};
+ +
+ +static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
+ +      CPUTIME_SYSTEM,
+ +      CPUTIME_SOFTIRQ,
+ +      CPUTIME_IRQ,
+ +      CPUTIME_IDLE,
+ +};
+ +
+ +static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
+ +static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
+ +static DEFINE_PER_CPU(u8, cpustat_tail);
+ +
+ +/*
+ + * We don't need nanosecond resolution. A granularity of 16ms is
+ + * sufficient for our precision, allowing us to use u16 to store
+ + * cpustats, which will roll over roughly every ~1000 seconds.
+ + * 2^24 ~= 16 * 10^6
+ + */
+ +static u16 get_16bit_precision(u64 data_ns)
+ +{
+ +      return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
+ +}
+ +
+ +static void update_cpustat(void)
+ +{
+ +      int i;
+ +      u8 util;
+ +      u16 old_stat, new_stat;
+ +      struct kernel_cpustat kcpustat;
+ +      u64 *cpustat = kcpustat.cpustat;
+ +      u8 tail = __this_cpu_read(cpustat_tail);
+ +      u16 sample_period_16 = get_16bit_precision(sample_period);
+ +
+ +      kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
+ +
+ +      for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
+ +              old_stat = __this_cpu_read(cpustat_old[i]);
+ +              new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
+ +              util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
+ +              __this_cpu_write(cpustat_util[tail][i], util);
+ +              __this_cpu_write(cpustat_old[i], new_stat);
+ +      }
+ +
+ +      __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
+ +}
+ +
+ +static void print_cpustat(void)
+ +{
+ +      int i, group;
+ +      u8 tail = __this_cpu_read(cpustat_tail);
+ +      u64 sample_period_second = sample_period;
+ +
+ +      do_div(sample_period_second, NSEC_PER_SEC);
+ +
+ +      /*
+ +       * Outputting the "watchdog" prefix on every line is redundant and not
+ +       * concise, and the original alarm information is sufficient for
+ +       * positioning in logs, hence here printk() is used instead of pr_crit().
+ +       */
+ +      printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
+ +             smp_processor_id(), sample_period_second);
+ +
+ +      for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
+ +              group = (tail + i) % NUM_SAMPLE_PERIODS;
+ +              printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
+ +                      "%3u%% hardirq,\t%3u%% idle\n", i + 1,
+ +                      __this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
+ +                      __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
+ +                      __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
+ +                      __this_cpu_read(cpustat_util[group][STATS_IDLE]));
+ +      }
+ +}
+ +
+ +#define HARDIRQ_PERCENT_THRESH          50
+ +#define NUM_HARDIRQ_REPORT              5
+ +struct irq_counts {
+ +      int irq;
+ +      u32 counts;
+ +};
+ +
+ +static DEFINE_PER_CPU(bool, snapshot_taken);
+ +
+ +/* Tabulate the most frequent interrupts. */
+ +static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
+ +{
+ +      int i;
+ +      struct irq_counts new_count = {irq, counts};
+ +
+ +      for (i = 0; i < rank; i++) {
+ +              if (counts > irq_counts[i].counts)
+ +                      swap(new_count, irq_counts[i]);
+ +      }
+ +}
+ +
+ +/*
+ + * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
+ + * then the cause of softlockup might be interrupt storm. In this case, it
+ + * would be useful to start interrupt counting.
+ + */
+ +static bool need_counting_irqs(void)
+ +{
+ +      u8 util;
+ +      int tail = __this_cpu_read(cpustat_tail);
+ +
+ +      tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT;
+ +      util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
+ +      return util > HARDIRQ_PERCENT_THRESH;
+ +}
+ +
+ +static void start_counting_irqs(void)
+ +{
+ +      if (!__this_cpu_read(snapshot_taken)) {
+ +              kstat_snapshot_irqs();
+ +              __this_cpu_write(snapshot_taken, true);
+ +      }
+ +}
+ +
+ +static void stop_counting_irqs(void)
+ +{
+ +      __this_cpu_write(snapshot_taken, false);
+ +}
+ +
+ +static void print_irq_counts(void)
+ +{
+ +      unsigned int i, count;
+ +      struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = {
+ +              {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}
+ +      };
+ +
+ +      if (__this_cpu_read(snapshot_taken)) {
+ +              for_each_active_irq(i) {
+ +                      count = kstat_get_irq_since_snapshot(i);
+ +                      tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
+ +              }
+ +
+ +              /*
+ +               * Outputting the "watchdog" prefix on every line is redundant and not
+ +               * concise, and the original alarm information is sufficient for
+ +               * positioning in logs, hence here printk() is used instead of pr_crit().
+ +               */
+ +              printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
+ +                     smp_processor_id(), HARDIRQ_PERCENT_THRESH);
+ +
+ +              for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
+ +                      if (irq_counts_sorted[i].irq == -1)
+ +                              break;
+ +
+ +                      printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
+ +                             i + 1, irq_counts_sorted[i].counts,
+ +                             irq_counts_sorted[i].irq);
+ +              }
+ +
+ +              /*
+ +               * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
+ +               * sample_period, then we suspect the interrupt storm might be subsiding.
+ +               */
+ +              if (!need_counting_irqs())
+ +                      stop_counting_irqs();
+ +      }
+ +}
+ +
+ +static void report_cpu_status(void)
+ +{
+ +      print_cpustat();
+ +      print_irq_counts();
+ +}
+ +#else
+ +static inline void update_cpustat(void) { }
+ +static inline void report_cpu_status(void) { }
+ +static inline bool need_counting_irqs(void) { return false; }
+ +static inline void start_counting_irqs(void) { }
+ +static inline void stop_counting_irqs(void) { }
+ +#endif
+ +
   /*
    * Hard-lockup warnings should be triggered after just a few seconds. Soft-
    * lockups can have false positives under extreme conditions. So we generally
@@@ -553,7 -364,7 +553,7 @@@ static void set_sample_period(void
          * and hard thresholds) to increment before the
          * hardlockup detector generates a warning
          */
- -      sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ +      sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
         watchdog_update_hrtimer_threshold(sample_period);
   }
   
@@@ -623,18 -434,6 +623,18 @@@ static int is_softlockup(unsigned long 
                          unsigned long now)
   {
         if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
+ +              /*
+ +               * If period_ts has not been updated during a sample_period, then
+ +               * in the subsequent few sample_periods, period_ts might also not
+ +               * be updated, which could indicate a potential softlockup. In
+ +               * this case, if we suspect the cause of the potential softlockup
+ +               * might be interrupt storm, then we need to count the interrupts
+ +               * to find which interrupt is storming.
+ +               */
+ +              if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
+ +                  need_counting_irqs())
+ +                      start_counting_irqs();
+ +
                 /* Warn about unreasonable delays. */
                 if (time_after(now, period_ts + get_softlockup_thresh()))
                         return now - touch_ts;
@@@ -657,7 -456,6 +657,7 @@@ static DEFINE_PER_CPU(struct cpu_stop_w
   static int softlockup_fn(void *data)
   {
         update_touch_ts();
+ +      stop_counting_irqs();
         complete(this_cpu_ptr(&softlockup_completion));
   
         return 0;
@@@ -706,8 -504,6 +706,8 @@@ static enum hrtimer_restart watchdog_ti
          */
         period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
   
+ +      update_cpustat();
+ +
         /* Reset the interval when touched by known problematic code. */
         if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
                 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@@ -743,7 -539,6 +743,7 @@@
                 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                         smp_processor_id(), duration,
                         current->comm, task_pid_nr(current));
+ +              report_cpu_status();
                 print_modules();
                 print_irqtrace_events(current);
                 if (regs)
@@@ -1155,7 -950,6 +1155,6 @@@ static struct ctl_table watchdog_sysctl
         },
   #endif /* CONFIG_SMP */
   #endif
-       {}
   };
   
   static struct ctl_table watchdog_hardlockup_sysctl[] = {
@@@ -1168,7 -962,6 +1167,6 @@@
                 .extra1         = SYSCTL_ZERO,
                 .extra2         = SYSCTL_ONE,
         },
-       {}
   };
   
   static void __init watchdog_sysctl_init(void)
author	Linus Torvalds <[email protected]>
	Sat, 18 May 2024 00:31:24 +0000 (17:31 -0700)
committer	Linus Torvalds <[email protected]>
	Sat, 18 May 2024 00:31:24 +0000 (17:31 -0700)
		1	2
include/linux/sysctl.h	patch \|	diff1 \|	diff2 \|	blob \| history
ipc/ipc_sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
ipc/mq_sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/bpf/syscall.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/kprobes.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/topology.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/seccomp.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/stackleak.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/time/timer.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/trace/ftrace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/ucount.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/watchdog.c	patch \|	diff1 \|	diff2 \|	blob \| history