Git Repo - linux.git/commitdiff
Merge branch 'sched/core'
author Peter Zijlstra <[email protected]>
Fri, 20 Aug 2021 10:33:05 +0000 (12:33 +0200)
committer Peter Zijlstra <[email protected]>
Fri, 20 Aug 2021 10:33:05 +0000 (12:33 +0200)
include/linux/sched.h
include/linux/wait.h
kernel/fork.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/sched.h

diff --combined include/linux/sched.h
index ec8d07d88641cd9d0d974377b6b74d920a12c53b,3bb9fecfdaa12ce43b7306bdc7fbbd19d051830b..6ecd02e2ca1e8cdea693e39149d0388cd1004db4
@@@ -748,6 -748,7 +748,7 @@@ struct task_struct 
        unsigned int                    policy;
        int                             nr_cpus_allowed;
        const cpumask_t                 *cpus_ptr;
+       cpumask_t                       *user_cpus_ptr;
        cpumask_t                       cpus_mask;
        void                            *migration_pending;
  #ifdef CONFIG_SMP
        /* Signal handlers: */
        struct signal_struct            *signal;
        struct sighand_struct __rcu             *sighand;
 -      struct sigqueue                 *sigqueue_cache;
        sigset_t                        blocked;
        sigset_t                        real_blocked;
        /* Restored if set_restore_sigmask() was used: */
@@@ -1705,6 -1707,11 +1706,11 @@@ extern int task_can_attach(struct task_
  #ifdef CONFIG_SMP
  extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
  extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
+ extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
+ extern void release_user_cpus_ptr(struct task_struct *p);
+ extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
+ extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
+ extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
  #else
  static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  {
@@@ -1715,6 -1722,21 +1721,21 @@@ static inline int set_cpus_allowed_ptr(
                return -EINVAL;
        return 0;
  }
+ static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
+ {
+       if (src->user_cpus_ptr)
+               return -EINVAL;
+       return 0;
+ }
+ static inline void release_user_cpus_ptr(struct task_struct *p)
+ {
+       WARN_ON(p->user_cpus_ptr);
+ }
+ static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+ {
+       return 0;
+ }
  #endif
  
  extern int yield_to(struct task_struct *p, bool preempt);
@@@ -2028,8 -2050,6 +2049,8 @@@ static inline void set_task_cpu(struct 
  
  #endif /* CONFIG_SMP */
  
 +extern bool sched_task_on_rq(struct task_struct *p);
 +
  /*
   * In order to reduce various lock holder preemption latencies provide an
   * interface to see if a vCPU is currently running or not.
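
The sched.h hunks above give task_struct a second affinity mask: cpus_mask is what the scheduler enforces, while the new, lazily allocated user_cpus_ptr remembers the affinity a task had before the kernel forcibly restricted it, so that it can be restored later. A minimal sketch of how a caller might pair the two new helpers; the hook names are hypothetical, and the semantics are taken from the comments on these functions in kernel/sched/core.c below.

    /* Hypothetical sketch, not part of this merge. */
    #include <linux/sched.h>

    static void example_enter_restricted_mode(struct task_struct *p)
    {
            /*
             * Clamp the affinity to task_cpu_possible_mask(p); the mask the
             * task had before is stashed in p->user_cpus_ptr.
             */
            force_compatible_cpus_allowed_ptr(p);
    }

    static void example_leave_restricted_mode(struct task_struct *p)
    {
            /* Try to restore the saved mask and free the copy. */
            relax_compatible_cpus_allowed_ptr(p);
    }
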
diff --combined include/linux/wait.h
index 6598ae35e1b5ab18ab10210df6ea7f2adb2aff97,99c5f05718cdff6c71a8b518345e1e944dc79253..93dab0e9580f8de1841c94a28277fb9c393a6f6d
@@@ -56,7 -56,7 +56,7 @@@ struct task_struct
  
  #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                 \
        .lock           = __SPIN_LOCK_UNLOCKED(name.lock),                      \
-       .head           = { &(name).head, &(name).head } }
+       .head           = LIST_HEAD_INIT(name.head) }
  
  #define DECLARE_WAIT_QUEUE_HEAD(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@@ -1136,7 -1136,7 +1136,7 @@@ do {                                                                            
   * Waitqueues which are removed from the waitqueue_head at wakeup time
   */
  void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 -void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
  long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
  void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
  long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
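
Two things change in this header: prepare_to_wait_exclusive() now returns a bool (its meaning comes from the updated definition in kernel/sched/wait.c, which is not part of this diff), and the wait-queue-head initializer is rewritten in terms of the standard list helper. A short expansion showing the old and new initializers are identical; LIST_HEAD_INIT() is quoted from <linux/list.h>:

    /* From <linux/list.h>: */
    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    /* So for a head declared with the unchanged macro */
    DECLARE_WAIT_QUEUE_HEAD(example_wq);

    /*
     * .head is initialized to { &(example_wq.head), &(example_wq.head) },
     * the same empty circular list the old { &(name).head, &(name).head }
     * spelled out by hand.
     */
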
diff --combined kernel/fork.c
index bc94b2cc59956e923cb7bbe11d32a919ef7a053c,5d7addf0c41a2a824d39d8b5cd412e182cfae19a..bd0e165b83975da9cdcc889aa4838d07567eee99
@@@ -446,6 -446,7 +446,7 @@@ void put_task_stack(struct task_struct 
  
  void free_task(struct task_struct *tsk)
  {
+       release_user_cpus_ptr(tsk);
        scs_release(tsk);
  
  #ifndef CONFIG_THREAD_INFO_IN_TASK
@@@ -825,14 -826,9 +826,14 @@@ void __init fork_init(void
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];
  
 -      for (i = 0; i < UCOUNT_COUNTS; i++)
 +      for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;
  
 +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
 +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
 +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
 +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
 +
  #ifdef CONFIG_VMAP_STACK
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
@@@ -924,6 -920,7 +925,7 @@@ static struct task_struct *dup_task_str
  #endif
        if (orig->cpus_ptr == &orig->cpus_mask)
                tsk->cpus_ptr = &tsk->cpus_mask;
+       dup_user_cpus_ptr(tsk, orig, node);
  
        /*
         * One for the user space visible state that goes away when reaped.
@@@ -1035,6 -1032,7 +1037,6 @@@ static struct mm_struct *mm_init(struc
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
 -      atomic_set(&mm->has_pinned, 0);
        atomic64_set(&mm->pinned_vm, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
@@@ -1982,7 -1980,8 +1984,7 @@@ static __latent_entropy struct task_str
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
  #endif
        retval = -EAGAIN;
 -      if (atomic_read(&p->real_cred->user->processes) >=
 -                      task_rlimit(p, RLIMIT_NPROC)) {
 +      if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_free;
        spin_lock_init(&p->alloc_lock);
  
        init_sigpending(&p->pending);
 -      p->sigqueue_cache = NULL;
  
        p->utime = p->stime = p->gtime = 0;
  #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
@@@ -2391,7 -2391,7 +2393,7 @@@ bad_fork_cleanup_threadgroup_lock
  #endif
        delayacct_tsk_free(p);
  bad_fork_cleanup_count:
 -      atomic_dec(&p->cred->user->processes);
 +      dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        exit_creds(p);
  bad_fork_free:
        WRITE_ONCE(p->__state, TASK_DEAD);
@@@ -3004,12 -3004,6 +3006,12 @@@ int ksys_unshare(unsigned long unshare_
        if (err)
                goto bad_unshare_cleanup_cred;
  
 +      if (new_cred) {
 +              err = set_cred_ucounts(new_cred);
 +              if (err)
 +                      goto bad_unshare_cleanup_cred;
 +      }
 +
        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
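
The fork.c hunks above switch per-user process accounting over to ucounts: the RLIMIT_NPROC check now asks is_ucounts_overlimit() on task_ucounts(p) instead of reading p->real_cred->user->processes, the error path drops the charge with dec_rlimit_ucounts(), and fork_init() seeds the per-namespace ceilings with set_rlimit_ucount_max(). A sketch of the check/undo pattern using only calls that appear in this diff; the matching increment is taken elsewhere, outside these hunks, and the includes are indicative.

    #include <linux/sched.h>
    #include <linux/sched/signal.h>
    #include <linux/cred.h>
    #include <linux/user_namespace.h>

    static int example_check_nproc(struct task_struct *p)
    {
            /* Too many processes for this user in its namespace hierarchy? */
            if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC,
                                     rlimit(RLIMIT_NPROC)))
                    return -EAGAIN;
            return 0;
    }

    static void example_undo_nproc(struct task_struct *p)
    {
            /* Error path: give back the count charged for this task. */
            dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
    }
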
diff --combined kernel/sched/core.c
index 20ffcc04413449d6eedc709e1dc8096f50a1cc65,a22cc3c156cea1d62a2b112ec33636c6920113c4..8dc67166aa6c52cae320bbb518a058e23dcf45d4
@@@ -993,6 -993,7 +993,7 @@@ int get_nohz_timer_target(void
  {
        int i, cpu = smp_processor_id(), default_cpu = -1;
        struct sched_domain *sd;
+       const struct cpumask *hk_mask;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
                if (!idle_cpu(cpu))
                default_cpu = cpu;
        }
  
+       hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
        rcu_read_lock();
        for_each_domain(cpu, sd) {
-               for_each_cpu_and(i, sched_domain_span(sd),
-                       housekeeping_cpumask(HK_FLAG_TIMER)) {
+               for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
                        if (cpu == i)
                                continue;
  
@@@ -1619,6 -1621,23 +1621,23 @@@ static inline void uclamp_rq_dec(struc
                uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+ static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+                                     enum uclamp_id clamp_id)
+ {
+       if (!p->uclamp[clamp_id].active)
+               return;
+       uclamp_rq_dec_id(rq, p, clamp_id);
+       uclamp_rq_inc_id(rq, p, clamp_id);
+       /*
+        * Make sure to clear the idle flag if we've transiently reached 0
+        * active tasks on rq.
+        */
+       if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+               rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+ }
  static inline void
  uclamp_update_active(struct task_struct *p)
  {
         * affecting a valid clamp bucket, the next time it's enqueued,
         * it will already see the updated clamp bucket value.
         */
-       for_each_clamp_id(clamp_id) {
-               if (p->uclamp[clamp_id].active) {
-                       uclamp_rq_dec_id(rq, p, clamp_id);
-                       uclamp_rq_inc_id(rq, p, clamp_id);
-               }
-       }
+       for_each_clamp_id(clamp_id)
+               uclamp_rq_reinc_id(rq, p, clamp_id);
  
        task_rq_unlock(rq, p, &rf);
  }
@@@ -1928,11 -1943,6 +1943,11 @@@ static inline void uclamp_post_fork(str
  static inline void init_uclamp(void) { }
  #endif /* CONFIG_UCLAMP_TASK */
  
 +bool sched_task_on_rq(struct task_struct *p)
 +{
 +      return task_on_rq_queued(p);
 +}
 +
  static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        if (!(flags & ENQUEUE_NOCLOCK))
@@@ -1981,18 -1991,12 +1996,18 @@@ void deactivate_task(struct rq *rq, str
        dequeue_task(rq, p, flags);
  }
  
 -/*
 - * __normal_prio - return the priority that is based on the static prio
 - */
 -static inline int __normal_prio(struct task_struct *p)
 +static inline int __normal_prio(int policy, int rt_prio, int nice)
  {
 -      return p->static_prio;
 +      int prio;
 +
 +      if (dl_policy(policy))
 +              prio = MAX_DL_PRIO - 1;
 +      else if (rt_policy(policy))
 +              prio = MAX_RT_PRIO - 1 - rt_prio;
 +      else
 +              prio = NICE_TO_PRIO(nice);
 +
 +      return prio;
  }
  
  /*
   */
  static inline int normal_prio(struct task_struct *p)
  {
 -      int prio;
 -
 -      if (task_has_dl_policy(p))
 -              prio = MAX_DL_PRIO-1;
 -      else if (task_has_rt_policy(p))
 -              prio = MAX_RT_PRIO-1 - p->rt_priority;
 -      else
 -              prio = __normal_prio(p);
 -      return prio;
 +      return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
  }
  
  /*
@@@ -2161,7 -2173,7 +2176,7 @@@ static inline bool is_cpu_allowed(struc
  
        /* Non kernel threads are not allowed during either online or offline. */
        if (!(p->flags & PF_KTHREAD))
-               return cpu_active(cpu);
+               return cpu_active(cpu) && task_cpu_possible(cpu, p);
  
        /* KTHREAD_IS_PER_CPU is always allowed. */
        if (kthread_is_per_cpu(p))
@@@ -2468,6 -2480,34 +2483,34 @@@ void do_set_cpus_allowed(struct task_st
        __do_set_cpus_allowed(p, new_mask, 0);
  }
  
+ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+                     int node)
+ {
+       if (!src->user_cpus_ptr)
+               return 0;
+       dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+       if (!dst->user_cpus_ptr)
+               return -ENOMEM;
+       cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+       return 0;
+ }
+ static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+ {
+       struct cpumask *user_mask = NULL;
+       swap(p->user_cpus_ptr, user_mask);
+       return user_mask;
+ }
+ void release_user_cpus_ptr(struct task_struct *p)
+ {
+       kfree(clear_user_cpus_ptr(p));
+ }
  /*
   * This function is wildly self concurrent; here be dragons.
   *
@@@ -2685,28 -2725,26 +2728,26 @@@ static int affine_move_task(struct rq *
  }
  
  /*
-  * Change a given task's CPU affinity. Migrate the thread to a
-  * proper CPU and schedule it away if the CPU it's executing on
-  * is removed from the allowed bitmask.
-  *
-  * NOTE: the caller must have a valid reference to the task, the
-  * task must not exit() & deallocate itself prematurely. The
-  * call is not atomic; no spinlocks may be held.
+  * Called with both p->pi_lock and rq->lock held; drops both before returning.
   */
- static int __set_cpus_allowed_ptr(struct task_struct *p,
-                                 const struct cpumask *new_mask,
-                                 u32 flags)
+ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+                                        const struct cpumask *new_mask,
+                                        u32 flags,
+                                        struct rq *rq,
+                                        struct rq_flags *rf)
+       __releases(rq->lock)
+       __releases(p->pi_lock)
  {
+       const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
+       bool kthread = p->flags & PF_KTHREAD;
+       struct cpumask *user_mask = NULL;
        unsigned int dest_cpu;
-       struct rq_flags rf;
-       struct rq *rq;
        int ret = 0;
  
-       rq = task_rq_lock(p, &rf);
        update_rq_clock(rq);
  
-       if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+       if (kthread || is_migration_disabled(p)) {
                /*
                 * Kernel threads are allowed on online && !active CPUs,
                 * however, during cpu-hot-unplug, even these might get pushed
                cpu_valid_mask = cpu_online_mask;
        }
  
+       if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+               ret = -EINVAL;
+               goto out;
+       }
        /*
         * Must re-check here, to close a race against __kthread_bind(),
         * sched_setaffinity() is not guaranteed to observe the flag.
  
        __do_set_cpus_allowed(p, new_mask, flags);
  
-       return affine_move_task(rq, p, &rf, dest_cpu, flags);
+       if (flags & SCA_USER)
+               user_mask = clear_user_cpus_ptr(p);
+       ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+       kfree(user_mask);
+       return ret;
  
  out:
-       task_rq_unlock(rq, p, &rf);
+       task_rq_unlock(rq, p, rf);
  
        return ret;
  }
  
+ /*
+  * Change a given task's CPU affinity. Migrate the thread to a
+  * proper CPU and schedule it away if the CPU it's executing on
+  * is removed from the allowed bitmask.
+  *
+  * NOTE: the caller must have a valid reference to the task, the
+  * task must not exit() & deallocate itself prematurely. The
+  * call is not atomic; no spinlocks may be held.
+  */
+ static int __set_cpus_allowed_ptr(struct task_struct *p,
+                                 const struct cpumask *new_mask, u32 flags)
+ {
+       struct rq_flags rf;
+       struct rq *rq;
+       rq = task_rq_lock(p, &rf);
+       return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+ }
  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
        return __set_cpus_allowed_ptr(p, new_mask, 0);
  }
  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  
+ /*
+  * Change a given task's CPU affinity to the intersection of its current
+  * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+  * and pointing @p->user_cpus_ptr to a copy of the old mask.
+  * If the resulting mask is empty, leave the affinity unchanged and return
+  * -EINVAL.
+  */
+ static int restrict_cpus_allowed_ptr(struct task_struct *p,
+                                    struct cpumask *new_mask,
+                                    const struct cpumask *subset_mask)
+ {
+       struct cpumask *user_mask = NULL;
+       struct rq_flags rf;
+       struct rq *rq;
+       int err;
+       if (!p->user_cpus_ptr) {
+               user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+               if (!user_mask)
+                       return -ENOMEM;
+       }
+       rq = task_rq_lock(p, &rf);
+       /*
+        * Forcefully restricting the affinity of a deadline task is
+        * likely to cause problems, so fail and noisily override the
+        * mask entirely.
+        */
+       if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+               err = -EPERM;
+               goto err_unlock;
+       }
+       if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+               err = -EINVAL;
+               goto err_unlock;
+       }
+       /*
+        * We're about to butcher the task affinity, so keep track of what
+        * the user asked for in case we're able to restore it later on.
+        */
+       if (user_mask) {
+               cpumask_copy(user_mask, p->cpus_ptr);
+               p->user_cpus_ptr = user_mask;
+       }
+       return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+ err_unlock:
+       task_rq_unlock(rq, p, &rf);
+       kfree(user_mask);
+       return err;
+ }
+ /*
+  * Restrict the CPU affinity of task @p so that it is a subset of
+  * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
+  * old affinity mask. If the resulting mask is empty, we warn and walk
+  * up the cpuset hierarchy until we find a suitable mask.
+  */
+ void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+ {
+       cpumask_var_t new_mask;
+       const struct cpumask *override_mask = task_cpu_possible_mask(p);
+       alloc_cpumask_var(&new_mask, GFP_KERNEL);
+       /*
+        * __migrate_task() can fail silently in the face of concurrent
+        * offlining of the chosen destination CPU, so take the hotplug
+        * lock to ensure that the migration succeeds.
+        */
+       cpus_read_lock();
+       if (!cpumask_available(new_mask))
+               goto out_set_mask;
+       if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+               goto out_free_mask;
+       /*
+        * We failed to find a valid subset of the affinity mask for the
+        * task, so override it based on its cpuset hierarchy.
+        */
+       cpuset_cpus_allowed(p, new_mask);
+       override_mask = new_mask;
+ out_set_mask:
+       if (printk_ratelimit()) {
+               printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+                               task_pid_nr(p), p->comm,
+                               cpumask_pr_args(override_mask));
+       }
+       WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+ out_free_mask:
+       cpus_read_unlock();
+       free_cpumask_var(new_mask);
+ }
+ static int
+ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+ /*
+  * Restore the affinity of a task @p which was previously restricted by a
+  * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+  * @p->user_cpus_ptr.
+  *
+  * It is the caller's responsibility to serialise this with any calls to
+  * force_compatible_cpus_allowed_ptr(@p).
+  */
+ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+ {
+       struct cpumask *user_mask = p->user_cpus_ptr;
+       unsigned long flags;
+       /*
+        * Try to restore the old affinity mask. If this fails, then
+        * we free the mask explicitly to avoid it being inherited across
+        * a subsequent fork().
+        */
+       if (!user_mask || !__sched_setaffinity(p, user_mask))
+               return;
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       user_mask = clear_user_cpus_ptr(p);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+       kfree(user_mask);
+ }
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
  #ifdef CONFIG_SCHED_DEBUG
@@@ -3112,9 -3313,7 +3316,7 @@@ static int select_fallback_rq(int cpu, 
  
                /* Look for allowed, online CPU in same node. */
                for_each_cpu(dest_cpu, nodemask) {
-                       if (!cpu_active(dest_cpu))
-                               continue;
-                       if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+                       if (is_cpu_allowed(p, dest_cpu))
                                return dest_cpu;
                }
        }
                /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
-                       if (IS_ENABLED(CONFIG_CPUSETS)) {
-                               cpuset_cpus_allowed_fallback(p);
+                       if (cpuset_cpus_allowed_fallback(p)) {
                                state = possible;
                                break;
                        }
                         *
                         * More yuck to audit.
                         */
-                       do_set_cpus_allowed(p, cpu_possible_mask);
+                       do_set_cpus_allowed(p, task_cpu_possible_mask(p));
                        state = fail;
                        break;
                case fail:
                        BUG();
                        break;
@@@ -4097,7 -4294,7 +4297,7 @@@ int sched_fork(unsigned long clone_flag
                } else if (PRIO_TO_NICE(p->static_prio) < 0)
                        p->static_prio = NICE_TO_PRIO(0);
  
 -              p->prio = p->normal_prio = __normal_prio(p);
 +              p->prio = p->normal_prio = p->static_prio;
                set_load_weight(p, false);
  
                /*
@@@ -4549,7 -4746,6 +4749,7 @@@ static struct rq *finish_task_switch(st
        vtime_task_switch(prev);
        perf_event_task_sched_in(prev, current);
        finish_task(prev);
 +      tick_nohz_task_switch();
        finish_lock_switch(rq);
        finish_arch_post_lock_switch();
        kcov_finish_switch(current);
                put_task_struct_rcu_user(prev);
        }
  
 -      tick_nohz_task_switch();
        return rq;
  }
  
@@@ -5660,11 -5857,9 +5860,9 @@@ static bool try_steal_cookie(int this, 
                if (p->core_occupation > dst->idle->core_occupation)
                        goto next;
  
-               p->on_rq = TASK_ON_RQ_MIGRATING;
                deactivate_task(src, p, 0);
                set_task_cpu(p, this);
                activate_task(dst, p, 0);
-               p->on_rq = TASK_ON_RQ_QUEUED;
  
                resched_curr(dst);
  
@@@ -6339,18 -6534,6 +6537,18 @@@ int default_wake_function(wait_queue_en
  }
  EXPORT_SYMBOL(default_wake_function);
  
 +static void __setscheduler_prio(struct task_struct *p, int prio)
 +{
 +      if (dl_prio(prio))
 +              p->sched_class = &dl_sched_class;
 +      else if (rt_prio(prio))
 +              p->sched_class = &rt_sched_class;
 +      else
 +              p->sched_class = &fair_sched_class;
 +
 +      p->prio = prio;
 +}
 +
  #ifdef CONFIG_RT_MUTEXES
  
  static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@@ -6466,19 -6649,22 +6664,19 @@@ void rt_mutex_setprio(struct task_struc
                } else {
                        p->dl.pi_se = &p->dl;
                }
 -              p->sched_class = &dl_sched_class;
        } else if (rt_prio(prio)) {
                if (dl_prio(oldprio))
                        p->dl.pi_se = &p->dl;
                if (oldprio < prio)
                        queue_flag |= ENQUEUE_HEAD;
 -              p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
                        p->dl.pi_se = &p->dl;
                if (rt_prio(oldprio))
                        p->rt.timeout = 0;
 -              p->sched_class = &fair_sched_class;
        }
  
 -      p->prio = prio;
 +      __setscheduler_prio(p, prio);
  
        if (queued)
                enqueue_task(rq, p, queue_flag);
@@@ -6831,6 -7017,35 +7029,6 @@@ static void __setscheduler_params(struc
        set_load_weight(p, true);
  }
  
 -/* Actually do priority change: must hold pi & rq lock. */
 -static void __setscheduler(struct rq *rq, struct task_struct *p,
 -                         const struct sched_attr *attr, bool keep_boost)
 -{
 -      /*
 -       * If params can't change scheduling class changes aren't allowed
 -       * either.
 -       */
 -      if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
 -              return;
 -
 -      __setscheduler_params(p, attr);
 -
 -      /*
 -       * Keep a potential priority boosting if called from
 -       * sched_setscheduler().
 -       */
 -      p->prio = normal_prio(p);
 -      if (keep_boost)
 -              p->prio = rt_effective_prio(p, p->prio);
 -
 -      if (dl_prio(p->prio))
 -              p->sched_class = &dl_sched_class;
 -      else if (rt_prio(p->prio))
 -              p->sched_class = &rt_sched_class;
 -      else
 -              p->sched_class = &fair_sched_class;
 -}
 -
  /*
   * Check the target process has a UID that matches the current process's:
   */
@@@ -6851,8 -7066,10 +7049,8 @@@ static int __sched_setscheduler(struct 
                                const struct sched_attr *attr,
                                bool user, bool pi)
  {
 -      int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 -                    MAX_RT_PRIO - 1 - attr->sched_priority;
 -      int retval, oldprio, oldpolicy = -1, queued, running;
 -      int new_effective_prio, policy = attr->sched_policy;
 +      int oldpolicy = -1, policy = attr->sched_policy;
 +      int retval, oldprio, newprio, queued, running;
        const struct sched_class *prev_class;
        struct callback_head *head;
        struct rq_flags rf;
@@@ -7050,7 -7267,6 +7248,7 @@@ change
        p->sched_reset_on_fork = reset_on_fork;
        oldprio = p->prio;
  
 +      newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
        if (pi) {
                /*
                 * Take priority boosted tasks into account. If the new
                 * the runqueue. This will be done when the task deboost
                 * itself.
                 */
 -              new_effective_prio = rt_effective_prio(p, newprio);
 -              if (new_effective_prio == oldprio)
 +              newprio = rt_effective_prio(p, newprio);
 +              if (newprio == oldprio)
                        queue_flags &= ~DEQUEUE_MOVE;
        }
  
  
        prev_class = p->sched_class;
  
 -      __setscheduler(rq, p, attr, pi);
 +      if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
 +              __setscheduler_params(p, attr);
 +              __setscheduler_prio(p, newprio);
 +      }
        __setscheduler_uclamp(p, attr);
  
        if (queued) {
@@@ -7300,6 -7513,16 +7498,16 @@@ err_size
        return -E2BIG;
  }
  
+ static void get_params(struct task_struct *p, struct sched_attr *attr)
+ {
+       if (task_has_dl_policy(p))
+               __getparam_dl(p, attr);
+       else if (task_has_rt_policy(p))
+               attr->sched_priority = p->rt_priority;
+       else
+               attr->sched_nice = task_nice(p);
+ }
  /**
   * sys_sched_setscheduler - set/change the scheduler policy and RT priority
   * @pid: the pid in question.
@@@ -7361,6 -7584,8 +7569,8 @@@ SYSCALL_DEFINE3(sched_setattr, pid_t, p
        rcu_read_unlock();
  
        if (likely(p)) {
+               if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+                       get_params(p, &attr);
                retval = sched_setattr(p, &attr);
                put_task_struct(p);
        }
@@@ -7509,12 -7734,8 +7719,8 @@@ SYSCALL_DEFINE4(sched_getattr, pid_t, p
        kattr.sched_policy = p->policy;
        if (p->sched_reset_on_fork)
                kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
-       if (task_has_dl_policy(p))
-               __getparam_dl(p, &kattr);
-       else if (task_has_rt_policy(p))
-               kattr.sched_priority = p->rt_priority;
-       else
-               kattr.sched_nice = task_nice(p);
+       get_params(p, &kattr);
+       kattr.sched_flags &= SCHED_FLAG_ALL;
  
  #ifdef CONFIG_UCLAMP_TASK
        /*
@@@ -7535,9 -7756,76 +7741,76 @@@ out_unlock
        return retval;
  }
  
- long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ #ifdef CONFIG_SMP
+ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
  {
+       int ret = 0;
+       /*
+        * If the task isn't a deadline task or admission control is
+        * disabled then we don't care about affinity changes.
+        */
+       if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+               return 0;
+       /*
+        * Since bandwidth control happens on root_domain basis,
+        * if admission test is enabled, we only admit -deadline
+        * tasks allowed to run on all the CPUs in the task's
+        * root_domain.
+        */
+       rcu_read_lock();
+       if (!cpumask_subset(task_rq(p)->rd->span, mask))
+               ret = -EBUSY;
+       rcu_read_unlock();
+       return ret;
+ }
+ #endif
+ static int
+ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+ {
+       int retval;
        cpumask_var_t cpus_allowed, new_mask;
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+               return -ENOMEM;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, mask, cpus_allowed);
+       retval = dl_task_check_affinity(p, new_mask);
+       if (retval)
+               goto out_free_new_mask;
+ again:
+       retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+       if (retval)
+               goto out_free_new_mask;
+       cpuset_cpus_allowed(p, cpus_allowed);
+       if (!cpumask_subset(new_mask, cpus_allowed)) {
+               /*
+                * We must have raced with a concurrent cpuset update.
+                * Just reset the cpumask to the cpuset's cpus_allowed.
+                */
+               cpumask_copy(new_mask, cpus_allowed);
+               goto again;
+       }
+ out_free_new_mask:
+       free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+       return retval;
+ }
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ {
        struct task_struct *p;
        int retval;
  
                retval = -EINVAL;
                goto out_put_task;
        }
-       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
-               retval = -ENOMEM;
-               goto out_put_task;
-       }
-       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
-               retval = -ENOMEM;
-               goto out_free_cpus_allowed;
-       }
-       retval = -EPERM;
        if (!check_same_owner(p)) {
                rcu_read_lock();
                if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
                        rcu_read_unlock();
-                       goto out_free_new_mask;
+                       retval = -EPERM;
+                       goto out_put_task;
                }
                rcu_read_unlock();
        }
  
        retval = security_task_setscheduler(p);
        if (retval)
-               goto out_free_new_mask;
-       cpuset_cpus_allowed(p, cpus_allowed);
-       cpumask_and(new_mask, in_mask, cpus_allowed);
-       /*
-        * Since bandwidth control happens on root_domain basis,
-        * if admission test is enabled, we only admit -deadline
-        * tasks allowed to run on all the CPUs in the task's
-        * root_domain.
-        */
- #ifdef CONFIG_SMP
-       if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
-               rcu_read_lock();
-               if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
-                       retval = -EBUSY;
-                       rcu_read_unlock();
-                       goto out_free_new_mask;
-               }
-               rcu_read_unlock();
-       }
- #endif
- again:
-       retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+               goto out_put_task;
  
-       if (!retval) {
-               cpuset_cpus_allowed(p, cpus_allowed);
-               if (!cpumask_subset(new_mask, cpus_allowed)) {
-                       /*
-                        * We must have raced with a concurrent cpuset
-                        * update. Just reset the cpus_allowed to the
-                        * cpuset's cpus_allowed
-                        */
-                       cpumask_copy(new_mask, cpus_allowed);
-                       goto again;
-               }
-       }
- out_free_new_mask:
-       free_cpumask_var(new_mask);
- out_free_cpus_allowed:
-       free_cpumask_var(cpus_allowed);
+       retval = __sched_setaffinity(p, in_mask);
  out_put_task:
        put_task_struct(p);
        return retval;
@@@ -9804,7 -10046,7 +10031,7 @@@ static int tg_set_cfs_bandwidth(struct 
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
-       get_online_cpus();
+       cpus_read_lock();
        mutex_lock(&cfs_constraints_mutex);
        ret = __cfs_schedulable(tg, period, quota);
        if (ret)
                cfs_bandwidth_usage_dec();
  out_unlock:
        mutex_unlock(&cfs_constraints_mutex);
-       put_online_cpus();
+       cpus_read_unlock();
  
        return ret;
  }
@@@ -10099,6 -10341,20 +10326,20 @@@ static u64 cpu_rt_period_read_uint(stru
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+                              struct cftype *cft)
+ {
+       return css_tg(css)->idle;
+ }
+ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+                               struct cftype *cft, s64 idle)
+ {
+       return sched_group_set_idle(css_tg(css), idle);
+ }
+ #endif
  static struct cftype cpu_legacy_files[] = {
  #ifdef CONFIG_FAIR_GROUP_SCHED
        {
                .read_u64 = cpu_shares_read_u64,
                .write_u64 = cpu_shares_write_u64,
        },
+       {
+               .name = "idle",
+               .read_s64 = cpu_idle_read_s64,
+               .write_s64 = cpu_idle_write_s64,
+       },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
        {
@@@ -10313,6 -10574,12 +10559,12 @@@ static struct cftype cpu_files[] = 
                .read_s64 = cpu_weight_nice_read_s64,
                .write_s64 = cpu_weight_nice_write_s64,
        },
+       {
+               .name = "idle",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_s64 = cpu_idle_read_s64,
+               .write_s64 = cpu_idle_write_s64,
+       },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
        {
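
Alongside the affinity and priority rework, the hunks just above register a new "idle" cftype for the cpu controller (both cgroup v1 and v2), backed by sched_group_set_idle() in kernel/sched/fair.c. A hypothetical userspace sketch of flipping a group to SCHED_IDLE through that file, which the cgroup core exposes as cpu.idle; the mount point and group name are assumptions for illustration.

    #include <stdio.h>

    int main(void)
    {
            /* Assumed cgroup v2 path; adjust to the actual hierarchy. */
            const char *path = "/sys/fs/cgroup/background/cpu.idle";
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            /* 1 = idle group, 0 = normal; other values are rejected, see sched_group_set_idle(). */
            fputs("1\n", f);
            fclose(f);
            return 0;
    }
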
diff --combined kernel/sched/fair.c
index 44c452072a1b05f91354e1f673f83f28f5deca3a,6cd05f1d77ef14b9adac7dee71b16092445e8829..5aa3cfd15a2e8e228e5e720f7ebdd230e72e450d
@@@ -431,6 -431,23 +431,23 @@@ find_matching_se(struct sched_entity **
        }
  }
  
+ static int tg_is_idle(struct task_group *tg)
+ {
+       return tg->idle > 0;
+ }
+ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+ {
+       return cfs_rq->idle > 0;
+ }
+ static int se_is_idle(struct sched_entity *se)
+ {
+       if (entity_is_task(se))
+               return task_has_idle_policy(task_of(se));
+       return cfs_rq_is_idle(group_cfs_rq(se));
+ }
  #else /* !CONFIG_FAIR_GROUP_SCHED */
  
  #define for_each_sched_entity(se) \
@@@ -468,6 -485,21 +485,21 @@@ find_matching_se(struct sched_entity **
  {
  }
  
+ static int tg_is_idle(struct task_group *tg)
+ {
+       return 0;
+ }
+ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+ {
+       return 0;
+ }
+ static int se_is_idle(struct sched_entity *se)
+ {
+       return 0;
+ }
  #endif        /* CONFIG_FAIR_GROUP_SCHED */
  
  static __always_inline
@@@ -1486,7 -1518,7 +1518,7 @@@ static inline bool is_core_idle(int cpu
                if (cpu == sibling)
                        continue;
  
-               if (!idle_cpu(cpu))
+               if (!idle_cpu(sibling))
                        return false;
        }
  #endif
@@@ -3037,9 -3069,8 +3069,9 @@@ enqueue_load_avg(struct cfs_rq *cfs_rq
  static inline void
  dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 +      u32 divider = get_pelt_divider(&se->avg);
        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 -      sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
 +      cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
  }
  #else
  static inline void
@@@ -3256,31 -3287,6 +3288,31 @@@ static inline void cfs_rq_util_change(s
  
  #ifdef CONFIG_SMP
  #ifdef CONFIG_FAIR_GROUP_SCHED
 +/*
 + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
 + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
 + * bottom-up, we only have to test whether the cfs_rq before us on the list
 + * is our child.
 + * If cfs_rq is not on the list, test whether a child needs it to be added to
 + * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
 + */
 +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
 +{
 +      struct cfs_rq *prev_cfs_rq;
 +      struct list_head *prev;
 +
 +      if (cfs_rq->on_list) {
 +              prev = cfs_rq->leaf_cfs_rq_list.prev;
 +      } else {
 +              struct rq *rq = rq_of(cfs_rq);
 +
 +              prev = rq->tmp_alone_branch;
 +      }
 +
 +      prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
 +
 +      return (prev_cfs_rq->tg->parent == cfs_rq->tg);
 +}
  
  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  {
        if (cfs_rq->avg.runnable_sum)
                return false;
  
 +      if (child_cfs_rq_on_list(cfs_rq))
 +              return false;
 +
        /*
         * _avg must be null when _sum are null because _avg = _sum / divider
         * Make sure that rounding and/or propagation of PELT values never
@@@ -4841,6 -4844,9 +4873,9 @@@ static bool throttle_cfs_rq(struct cfs_
  
                dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
  
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
                qcfs_rq->h_nr_running -= task_delta;
                qcfs_rq->idle_h_nr_running -= idle_task_delta;
  
                update_load_avg(qcfs_rq, se, 0);
                se_update_runnable(se);
  
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
                qcfs_rq->h_nr_running -= task_delta;
                qcfs_rq->idle_h_nr_running -= idle_task_delta;
        }
@@@ -4904,39 -4913,45 +4942,45 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
        task_delta = cfs_rq->h_nr_running;
        idle_task_delta = cfs_rq->idle_h_nr_running;
        for_each_sched_entity(se) {
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                if (se->on_rq)
                        break;
-               cfs_rq = cfs_rq_of(se);
-               enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+               enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
  
-               cfs_rq->h_nr_running += task_delta;
-               cfs_rq->idle_h_nr_running += idle_task_delta;
+               qcfs_rq->h_nr_running += task_delta;
+               qcfs_rq->idle_h_nr_running += idle_task_delta;
  
                /* end evaluation on encountering a throttled cfs_rq */
-               if (cfs_rq_throttled(cfs_rq))
+               if (cfs_rq_throttled(qcfs_rq))
                        goto unthrottle_throttle;
        }
  
        for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
-               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_load_avg(qcfs_rq, se, UPDATE_TG);
                se_update_runnable(se);
  
-               cfs_rq->h_nr_running += task_delta;
-               cfs_rq->idle_h_nr_running += idle_task_delta;
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
  
+               qcfs_rq->h_nr_running += task_delta;
+               qcfs_rq->idle_h_nr_running += idle_task_delta;
  
                /* end evaluation on encountering a throttled cfs_rq */
-               if (cfs_rq_throttled(cfs_rq))
+               if (cfs_rq_throttled(qcfs_rq))
                        goto unthrottle_throttle;
  
                /*
                 * One parent has been throttled and cfs_rq removed from the
                 * list. Add it back to not break the leaf list.
                 */
-               if (throttled_hierarchy(cfs_rq))
-                       list_add_leaf_cfs_rq(cfs_rq);
+               if (throttled_hierarchy(qcfs_rq))
+                       list_add_leaf_cfs_rq(qcfs_rq);
        }
  
        /* At this point se is NULL and we are at root level*/
@@@ -4949,9 -4964,9 +4993,9 @@@ unthrottle_throttle
         * assertion below.
         */
        for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
-               if (list_add_leaf_cfs_rq(cfs_rq))
+               if (list_add_leaf_cfs_rq(qcfs_rq))
                        break;
        }
  
@@@ -5082,7 -5097,7 +5126,7 @@@ static const u64 cfs_bandwidth_slack_pe
  static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
  {
        struct hrtimer *refresh_timer = &cfs_b->period_timer;
 -      u64 remaining;
 +      s64 remaining;
  
        /* if the call-back is running a quota refresh is already occurring */
        if (hrtimer_callback_running(refresh_timer))
  
        /* is a quota refresh about to occur? */
        remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
 -      if (remaining < min_expire)
 +      if (remaining < (s64)min_expire)
                return 1;
  
        return 0;
@@@ -5574,6 -5589,9 +5618,9 @@@ enqueue_task_fair(struct rq *rq, struc
                cfs_rq->h_nr_running++;
                cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
                        goto enqueue_throttle;
                cfs_rq->h_nr_running++;
                cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
                        goto enqueue_throttle;
@@@ -5668,6 -5689,9 +5718,9 @@@ static void dequeue_task_fair(struct r
                cfs_rq->h_nr_running--;
                cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
                        goto dequeue_throttle;
                cfs_rq->h_nr_running--;
                cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
                        goto dequeue_throttle;
@@@ -6249,7 -6276,7 +6305,7 @@@ static int select_idle_cpu(struct task_
                time = cpu_clock(this);
        }
  
-       for_each_cpu_wrap(cpu, cpus, target) {
+       for_each_cpu_wrap(cpu, cpus, target + 1) {
                if (has_idle_core) {
                        i = select_idle_core(p, cpu, cpus, &idle_cpu);
                        if ((unsigned int)i < nr_cpumask_bits)
@@@ -6376,6 -6403,7 +6432,7 @@@ static int select_idle_sibling(struct t
  
        /* Check a recently used CPU as a potential idle candidate: */
        recent_used_cpu = p->recent_used_cpu;
+       p->recent_used_cpu = prev;
        if (recent_used_cpu != prev &&
            recent_used_cpu != target &&
            cpus_share_cache(recent_used_cpu, target) &&
@@@ -6902,9 -6930,6 +6959,6 @@@ select_task_rq_fair(struct task_struct 
        } else if (wake_flags & WF_TTWU) { /* XXX always ? */
                /* Fast path */
                new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-               if (want_affine)
-                       current->recent_used_cpu = cpu;
        }
        rcu_read_unlock();
  
@@@ -7041,24 -7066,22 +7095,22 @@@ wakeup_preempt_entity(struct sched_enti
  
  static void set_last_buddy(struct sched_entity *se)
  {
-       if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
-               return;
        for_each_sched_entity(se) {
                if (SCHED_WARN_ON(!se->on_rq))
                        return;
+               if (se_is_idle(se))
+                       return;
                cfs_rq_of(se)->last = se;
        }
  }
  
  static void set_next_buddy(struct sched_entity *se)
  {
-       if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
-               return;
        for_each_sched_entity(se) {
                if (SCHED_WARN_ON(!se->on_rq))
                        return;
+               if (se_is_idle(se))
+                       return;
                cfs_rq_of(se)->next = se;
        }
  }
@@@ -7079,6 -7102,7 +7131,7 @@@ static void check_preempt_wakeup(struc
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        int scale = cfs_rq->nr_running >= sched_nr_latency;
        int next_buddy_marked = 0;
+       int cse_is_idle, pse_is_idle;
  
        if (unlikely(se == pse))
                return;
                return;
  
        find_matching_se(&se, &pse);
-       update_curr(cfs_rq_of(se));
        BUG_ON(!pse);
+       cse_is_idle = se_is_idle(se);
+       pse_is_idle = se_is_idle(pse);
+       /*
+        * Preempt an idle group in favor of a non-idle group (and don't preempt
+        * in the inverse case).
+        */
+       if (cse_is_idle && !pse_is_idle)
+               goto preempt;
+       if (cse_is_idle != pse_is_idle)
+               return;
+       update_curr(cfs_rq_of(se));
        if (wakeup_preempt_entity(se, pse) == 1) {
                /*
                 * Bias pick_next to pick the sched entity that is
@@@ -10217,9 -10254,11 +10283,11 @@@ static inline int on_null_domain(struc
  static inline int find_new_ilb(void)
  {
        int ilb;
+       const struct cpumask *hk_mask;
+       hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
  
-       for_each_cpu_and(ilb, nohz.idle_cpus_mask,
-                             housekeeping_cpumask(HK_FLAG_MISC)) {
+       for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
  
                if (ilb == smp_processor_id())
                        continue;
@@@ -11416,10 -11455,12 +11484,12 @@@ void init_tg_cfs_entry(struct task_grou
  
  static DEFINE_MUTEX(shares_mutex);
  
- int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
  {
        int i;
  
+       lockdep_assert_held(&shares_mutex);
        /*
         * We can't change the weight of the root cgroup.
         */
  
        shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
  
-       mutex_lock(&shares_mutex);
        if (tg->shares == shares)
-               goto done;
+               return 0;
  
        tg->shares = shares;
        for_each_possible_cpu(i) {
                rq_unlock_irqrestore(rq, &rf);
        }
  
- done:
+       return 0;
+ }
+ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+ {
+       int ret;
+       mutex_lock(&shares_mutex);
+       if (tg_is_idle(tg))
+               ret = -EINVAL;
+       else
+               ret = __sched_group_set_shares(tg, shares);
+       mutex_unlock(&shares_mutex);
+       return ret;
+ }
+ int sched_group_set_idle(struct task_group *tg, long idle)
+ {
+       int i;
+       if (tg == &root_task_group)
+               return -EINVAL;
+       if (idle < 0 || idle > 1)
+               return -EINVAL;
+       mutex_lock(&shares_mutex);
+       if (tg->idle == idle) {
+               mutex_unlock(&shares_mutex);
+               return 0;
+       }
+       tg->idle = idle;
+       for_each_possible_cpu(i) {
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se = tg->se[i];
+               struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+               bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+               long idle_task_delta;
+               struct rq_flags rf;
+               rq_lock_irqsave(rq, &rf);
+               grp_cfs_rq->idle = idle;
+               if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+                       goto next_cpu;
+               idle_task_delta = grp_cfs_rq->h_nr_running -
+                                 grp_cfs_rq->idle_h_nr_running;
+               if (!cfs_rq_is_idle(grp_cfs_rq))
+                       idle_task_delta *= -1;
+               for_each_sched_entity(se) {
+                       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+                       if (!se->on_rq)
+                               break;
+                       cfs_rq->idle_h_nr_running += idle_task_delta;
+                       /* Already accounted at parent level and above. */
+                       if (cfs_rq_is_idle(cfs_rq))
+                               break;
+               }
+ next_cpu:
+               rq_unlock_irqrestore(rq, &rf);
+       }
+       /* Idle groups have minimum weight. */
+       if (tg_is_idle(tg))
+               __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+       else
+               __sched_group_set_shares(tg, NICE_0_LOAD);
        mutex_unlock(&shares_mutex);
        return 0;
  }
  #else /* CONFIG_FAIR_GROUP_SCHED */
  
  void free_fair_sched_group(struct task_group *tg) { }
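
The headline in fair.c is SCHED_IDLE support for task groups: cfs_rq and task_group grow an idle flag, idle entities are skipped as last/next buddies, wakeup preemption favors non-idle groups over idle ones, and sched_group_set_idle() pins an idle group to the minimum weight. A rough illustration of what that minimum weight means for CPU time, assuming the kernel's WEIGHT_IDLEPRIO (3) and nice-0 weight (1024); both values are assumptions taken from the scheduler's weight tables, and this is userspace arithmetic only.

    #include <stdio.h>

    int main(void)
    {
            const double idle_weight  = 3.0;    /* assumed WEIGHT_IDLEPRIO */
            const double nice0_weight = 1024.0; /* assumed nice-0 weight   */
            const double total = idle_weight + nice0_weight;

            /* Two always-runnable sibling groups: one idle, one default. */
            printf("idle group share    : %5.2f%%\n", 100.0 * idle_weight / total);
            printf("default group share : %5.2f%%\n", 100.0 * nice0_weight / total);
            return 0;
    }
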
diff --combined kernel/sched/sched.h
index 14a41a243f7baf308cfbda57c3d3a8c6b3e0d9a3,e7e2bba5b5207793aba1b0ab8a0a05d5eca26ce8..a9a660c6e08ac89b0f8759fff3b5394bef57b263
@@@ -227,6 -227,8 +227,8 @@@ static inline void update_avg(u64 *avg
   */
  #define SCHED_FLAG_SUGOV      0x10000000
  
+ #define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
  static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
  {
  #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
@@@ -394,6 -396,9 +396,9 @@@ struct task_group 
        struct cfs_rq           **cfs_rq;
        unsigned long           shares;
  
+       /* A positive value indicates that this is a SCHED_IDLE group. */
+       int                     idle;
  #ifdef        CONFIG_SMP
        /*
         * load_avg can be heavily contended at clock tick time, so put
@@@ -503,6 -508,8 +508,8 @@@ extern void sched_move_task(struct task
  #ifdef CONFIG_FAIR_GROUP_SCHED
  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
  
+ extern int sched_group_set_idle(struct task_group *tg, long idle);
  #ifdef CONFIG_SMP
  extern void set_task_rq_fair(struct sched_entity *se,
                             struct cfs_rq *prev, struct cfs_rq *next);
@@@ -599,6 -606,9 +606,9 @@@ struct cfs_rq 
        struct list_head        leaf_cfs_rq_list;
        struct task_group       *tg;    /* group that "owns" this runqueue */
  
+       /* Locally cached copy of our task_group's idle value */
+       int                     idle;
  #ifdef CONFIG_CFS_BANDWIDTH
        int                     runtime_enabled;
        s64                     runtime_remaining;
@@@ -2234,6 -2244,7 +2244,7 @@@ extern struct task_struct *pick_next_ta
  #define SCA_CHECK             0x01
  #define SCA_MIGRATE_DISABLE   0x02
  #define SCA_MIGRATE_ENABLE    0x04
+ #define SCA_USER              0x08
  
  #ifdef CONFIG_SMP
  
@@@ -2385,6 -2396,21 +2396,21 @@@ extern void check_preempt_curr(struct r
  extern const_debug unsigned int sysctl_sched_nr_migrate;
  extern const_debug unsigned int sysctl_sched_migration_cost;
  
+ #ifdef CONFIG_SCHED_DEBUG
+ extern unsigned int sysctl_sched_latency;
+ extern unsigned int sysctl_sched_min_granularity;
+ extern unsigned int sysctl_sched_wakeup_granularity;
+ extern int sysctl_resched_latency_warn_ms;
+ extern int sysctl_resched_latency_warn_once;
+ extern unsigned int sysctl_sched_tunable_scaling;
+ extern unsigned int sysctl_numa_balancing_scan_delay;
+ extern unsigned int sysctl_numa_balancing_scan_period_min;
+ extern unsigned int sysctl_numa_balancing_scan_period_max;
+ extern unsigned int sysctl_numa_balancing_scan_size;
+ #endif
  #ifdef CONFIG_SCHED_HRTICK
  
  /*
@@@ -2818,27 -2844,20 +2844,27 @@@ static __always_inlin
  unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
                                  struct task_struct *p)
  {
 -      unsigned long min_util;
 -      unsigned long max_util;
 +      unsigned long min_util = 0;
 +      unsigned long max_util = 0;
  
        if (!static_branch_likely(&sched_uclamp_used))
                return util;
  
 -      min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
 -      max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
 -
        if (p) {
 -              min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
 -              max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
 +              min_util = uclamp_eff_value(p, UCLAMP_MIN);
 +              max_util = uclamp_eff_value(p, UCLAMP_MAX);
 +
 +              /*
 +               * Ignore last runnable task's max clamp, as this task will
 +               * reset it. Similarly, no need to read the rq's min clamp.
 +               */
 +              if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
 +                      goto out;
        }
  
 +      min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
 +      max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
 +out:
        /*
         * Since CPU's {min,max}_util clamps are MAX aggregated considering
         * RUNNABLE tasks with _different_ clamps, we can end up with an
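
The last hunk rewrites the aggregation in uclamp_rq_util_with(): when a task is passed it now starts from the task's effective clamps and only folds in the rq-wide clamp values when the rq is not flagged UCLAMP_FLAG_IDLE, so a task waking an idle rq is clamped purely by its own values rather than by stale rq state. A small userspace mock of just that aggregation step, with illustrative numbers; the final clamping of util happens in the part of the function beyond this hunk and is not modelled here.

    #include <stdio.h>

    static unsigned long umax(unsigned long a, unsigned long b)
    {
            return a > b ? a : b;
    }

    static void aggregate(unsigned long task_min, unsigned long task_max,
                          unsigned long rq_min, unsigned long rq_max,
                          int rq_idle)
    {
            unsigned long min_util = task_min;
            unsigned long max_util = task_max;

            /* With UCLAMP_FLAG_IDLE set, the rq's stale clamps are ignored. */
            if (!rq_idle) {
                    min_util = umax(min_util, rq_min);
                    max_util = umax(max_util, rq_max);
            }
            printf("rq_idle=%d -> min_util=%lu max_util=%lu\n",
                   rq_idle, min_util, max_util);
    }

    int main(void)
    {
            aggregate(200, 800, 512, 300, 0);   /* busy rq: 512 / 800 */
            aggregate(200, 800, 512, 300, 1);   /* idle rq: 200 / 800 */
            return 0;
    }
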