Merge tag 'sysctl-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/mcgrof...

author Linus Torvalds <[email protected]>

Tue, 13 Dec 2022 22:16:44 +0000 (14:16 -0800)

committer Linus Torvalds <[email protected]>

Tue, 13 Dec 2022 22:16:44 +0000 (14:16 -0800)
author Linus Torvalds <[email protected]>
Tue, 13 Dec 2022 22:16:44 +0000 (14:16 -0800)
committer Linus Torvalds <[email protected]>
Tue, 13 Dec 2022 22:16:44 +0000 (14:16 -0800)
diff --combined kernel/sched/core.c

index 78b2d5cabcc5dd47d3f63ea557a720f086a8081a,64f9242328a650ff296ed04032ee2da8e0e6d847..9e3c89c253cbf563c9c91ba14021e77342a34938
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -1392,7 -1392,7 +1392,7 @@@ static inline void uclamp_idle_reset(st
         if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
                 return;
   
- -      WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+ +      uclamp_rq_set(rq, clamp_id, clamp_value);
   }
   
   static inline
@@@ -1543,8 -1543,8 +1543,8 @@@ static inline void uclamp_rq_inc_id(str
         if (bucket->tasks == 1 || uc_se->value > bucket->value)
                 bucket->value = uc_se->value;
   
- -      if (uc_se->value > READ_ONCE(uc_rq->value))
- -              WRITE_ONCE(uc_rq->value, uc_se->value);
+ +      if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+ +              uclamp_rq_set(rq, clamp_id, uc_se->value);
   }
   
   /*
@@@ -1610,7 -1610,7 +1610,7 @@@ static inline void uclamp_rq_dec_id(str
         if (likely(bucket->tasks))
                 return;
   
- -      rq_clamp = READ_ONCE(uc_rq->value);
+ +      rq_clamp = uclamp_rq_get(rq, clamp_id);
         /*
          * Defensive programming: this should never happen. If it happens,
          * e.g. due to future modification, warn and fixup the expected value.
@@@ -1618,7 -1618,7 +1618,7 @@@
         SCHED_WARN_ON(bucket->value > rq_clamp);
         if (bucket->value >= rq_clamp) {
                 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
- -              WRITE_ONCE(uc_rq->value, bkt_clamp);
+ +              uclamp_rq_set(rq, clamp_id, bkt_clamp);
         }
   }
   
@@@ -2053,7 -2053,7 +2053,7 @@@ static inline void enqueue_task(struct 
   
         if (!(flags & ENQUEUE_RESTORE)) {
                 sched_info_enqueue(rq, p);
- -              psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+ +              psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
         }
   
         uclamp_rq_inc(rq, p);
@@@ -2189,18 -2189,14 +2189,18 @@@ void check_preempt_curr(struct rq *rq, 
   #ifdef CONFIG_SMP
   
   static void
- -__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+ +__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
   
   static int __set_cpus_allowed_ptr(struct task_struct *p,
- -                                const struct cpumask *new_mask,
- -                                u32 flags);
+ +                                struct affinity_context *ctx);
   
   static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
   {
+ +      struct affinity_context ac = {
+ +              .new_mask  = cpumask_of(rq->cpu),
+ +              .flags     = SCA_MIGRATE_DISABLE,
+ +      };
+ +
         if (likely(!p->migration_disabled))
                 return;
   
@@@ -2210,7 -2206,7 +2210,7 @@@
         /*
          * Violates locking rules! see comment in __do_set_cpus_allowed().
          */
- -      __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+ +      __do_set_cpus_allowed(p, &ac);
   }
   
   void migrate_disable(void)
@@@ -2232,10 -2228,6 +2232,10 @@@ EXPORT_SYMBOL_GPL(migrate_disable)
   void migrate_enable(void)
   {
         struct task_struct *p = current;
+ +      struct affinity_context ac = {
+ +              .new_mask  = &p->cpus_mask,
+ +              .flags     = SCA_MIGRATE_ENABLE,
+ +      };
   
         if (p->migration_disabled > 1) {
                 p->migration_disabled--;
@@@ -2251,7 -2243,7 +2251,7 @@@
          */
         preempt_disable();
         if (p->cpus_ptr != &p->cpus_mask)
- -              __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ +              __set_cpus_allowed_ptr(p, &ac);
         /*
          * Mustn't clear migration_disabled() until cpus_ptr points back at the
          * regular cpus_mask, otherwise things that race (eg.
@@@ -2531,25 -2523,19 +2531,25 @@@ out_unlock
    * sched_class::set_cpus_allowed must do the below, but is not required to
    * actually call this function.
    */
- -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+ +void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
   {
- -      if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
- -              p->cpus_ptr = new_mask;
+ +      if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ +              p->cpus_ptr = ctx->new_mask;
                 return;
         }
   
- -      cpumask_copy(&p->cpus_mask, new_mask);
- -      p->nr_cpus_allowed = cpumask_weight(new_mask);
+ +      cpumask_copy(&p->cpus_mask, ctx->new_mask);
+ +      p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+ +
+ +      /*
+ +       * Swap in a new user_cpus_ptr if SCA_USER flag set
+ +       */
+ +      if (ctx->flags & SCA_USER)
+ +              swap(p->user_cpus_ptr, ctx->user_mask);
   }
   
   static void
- -__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+ +__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
   {
         struct rq *rq = task_rq(p);
         bool queued, running;
@@@ -2566,7 -2552,7 +2566,7 @@@
          *
          * XXX do further audits, this smells like something putrid.
          */
- -      if (flags & SCA_MIGRATE_DISABLE)
+ +      if (ctx->flags & SCA_MIGRATE_DISABLE)
                 SCHED_WARN_ON(!p->on_cpu);
         else
                 lockdep_assert_held(&p->pi_lock);
@@@ -2585,7 -2571,7 +2585,7 @@@
         if (running)
                 put_prev_task(rq, p);
   
- -      p->sched_class->set_cpus_allowed(p, new_mask, flags);
+ +      p->sched_class->set_cpus_allowed(p, ctx);
   
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@@ -2593,27 -2579,14 +2593,27 @@@
                 set_next_task(rq, p);
   }
   
+ +/*
+ + * Used for kthread_bind() and select_fallback_rq(), in both cases the user
+ + * affinity (if any) should be destroyed too.
+ + */
   void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
   {
- -      __do_set_cpus_allowed(p, new_mask, 0);
+ +      struct affinity_context ac = {
+ +              .new_mask  = new_mask,
+ +              .user_mask = NULL,
+ +              .flags     = SCA_USER,  /* clear the user requested mask */
+ +      };
+ +
+ +      __do_set_cpus_allowed(p, &ac);
+ +      kfree(ac.user_mask);
   }
   
   int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
                       int node)
   {
+ +      unsigned long flags;
+ +
         if (!src->user_cpus_ptr)
                 return 0;
   
@@@ -2621,10 -2594,7 +2621,10 @@@
         if (!dst->user_cpus_ptr)
                 return -ENOMEM;
   
+ +      /* Use pi_lock to protect content of user_cpus_ptr */
+ +      raw_spin_lock_irqsave(&src->pi_lock, flags);
         cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ +      raw_spin_unlock_irqrestore(&src->pi_lock, flags);
         return 0;
   }
   
@@@ -2720,8 -2690,6 +2720,8 @@@ void release_user_cpus_ptr(struct task_
    */
   static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
                             int dest_cpu, unsigned int flags)
+ +      __releases(rq->lock)
+ +      __releases(p->pi_lock)
   {
         struct set_affinity_pending my_pending = { }, *pending = NULL;
         bool stop_pending, complete = false;
@@@ -2864,7 -2832,8 +2864,7 @@@
    * Called with both p->pi_lock and rq->lock held; drops both before returning.
    */
   static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
- -                                       const struct cpumask *new_mask,
- -                                       u32 flags,
+ +                                       struct affinity_context *ctx,
                                          struct rq *rq,
                                          struct rq_flags *rf)
         __releases(rq->lock)
@@@ -2873,6 -2842,7 +2873,6 @@@
         const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
         const struct cpumask *cpu_valid_mask = cpu_active_mask;
         bool kthread = p->flags & PF_KTHREAD;
- -      struct cpumask *user_mask = NULL;
         unsigned int dest_cpu;
         int ret = 0;
   
@@@ -2892,7 -2862,7 +2892,7 @@@
                 cpu_valid_mask = cpu_online_mask;
         }
   
- -      if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+ +      if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
                 ret = -EINVAL;
                 goto out;
         }
@@@ -2901,18 -2871,18 +2901,18 @@@
          * Must re-check here, to close a race against __kthread_bind(),
          * sched_setaffinity() is not guaranteed to observe the flag.
          */
- -      if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
+ +      if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
                 ret = -EINVAL;
                 goto out;
         }
   
- -      if (!(flags & SCA_MIGRATE_ENABLE)) {
- -              if (cpumask_equal(&p->cpus_mask, new_mask))
+ +      if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
+ +              if (cpumask_equal(&p->cpus_mask, ctx->new_mask))
                         goto out;
   
                 if (WARN_ON_ONCE(p == current &&
                                  is_migration_disabled(p) &&
- -                               !cpumask_test_cpu(task_cpu(p), new_mask))) {
+ +                               !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
                         ret = -EBUSY;
                         goto out;
                 }
@@@ -2923,15 -2893,22 +2923,15 @@@
          * for groups of tasks (ie. cpuset), so that load balancing is not
          * immediately required to distribute the tasks within their new mask.
          */
- -      dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
+ +      dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
         if (dest_cpu >= nr_cpu_ids) {
                 ret = -EINVAL;
                 goto out;
         }
   
- -      __do_set_cpus_allowed(p, new_mask, flags);
- -
- -      if (flags & SCA_USER)
- -              user_mask = clear_user_cpus_ptr(p);
+ +      __do_set_cpus_allowed(p, ctx);
   
- -      ret = affine_move_task(rq, p, rf, dest_cpu, flags);
- -
- -      kfree(user_mask);
- -
- -      return ret;
+ +      return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
   
   out:
         task_rq_unlock(rq, p, rf);
@@@ -2949,41 -2926,25 +2949,41 @@@
    * call is not atomic; no spinlocks may be held.
    */
   static int __set_cpus_allowed_ptr(struct task_struct *p,
- -                                const struct cpumask *new_mask, u32 flags)
+ +                                struct affinity_context *ctx)
   {
         struct rq_flags rf;
         struct rq *rq;
   
         rq = task_rq_lock(p, &rf);
- -      return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+ +      /*
+ +       * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+ +       * flags are set.
+ +       */
+ +      if (p->user_cpus_ptr &&
+ +          !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+ +          cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+ +              ctx->new_mask = rq->scratch_mask;
+ +
+ +      return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
   }
   
   int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
   {
- -      return __set_cpus_allowed_ptr(p, new_mask, 0);
+ +      struct affinity_context ac = {
+ +              .new_mask  = new_mask,
+ +              .flags     = 0,
+ +      };
+ +
+ +      return __set_cpus_allowed_ptr(p, &ac);
   }
   EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
   
   /*
    * Change a given task's CPU affinity to the intersection of its current
- - * affinity mask and @subset_mask, writing the resulting mask to @new_mask
- - * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ + * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
+ + * If user_cpus_ptr is defined, use it as the basis for restricting CPU
+ + * affinity or use cpu_online_mask instead.
+ + *
    * If the resulting mask is empty, leave the affinity unchanged and return
    * -EINVAL.
    */
@@@ -2991,14 -2952,17 +2991,14 @@@ static int restrict_cpus_allowed_ptr(st
                                      struct cpumask *new_mask,
                                      const struct cpumask *subset_mask)
   {
- -      struct cpumask *user_mask = NULL;
+ +      struct affinity_context ac = {
+ +              .new_mask  = new_mask,
+ +              .flags     = 0,
+ +      };
         struct rq_flags rf;
         struct rq *rq;
         int err;
   
- -      if (!p->user_cpus_ptr) {
- -              user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
- -              if (!user_mask)
- -                      return -ENOMEM;
- -      }
- -
         rq = task_rq_lock(p, &rf);
   
         /*
@@@ -3011,21 -2975,31 +3011,21 @@@
                 goto err_unlock;
         }
   
- -      if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+ +      if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
                 err = -EINVAL;
                 goto err_unlock;
         }
   
- -      /*
- -       * We're about to butcher the task affinity, so keep track of what
- -       * the user asked for in case we're able to restore it later on.
- -       */
- -      if (user_mask) {
- -              cpumask_copy(user_mask, p->cpus_ptr);
- -              p->user_cpus_ptr = user_mask;
- -      }
- -
- -      return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+ +      return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
   
   err_unlock:
         task_rq_unlock(rq, p, &rf);
- -      kfree(user_mask);
         return err;
   }
   
   /*
    * Restrict the CPU affinity of task @p so that it is a subset of
- - * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ + * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
    * old affinity mask. If the resulting mask is empty, we warn and walk
    * up the cpuset hierarchy until we find a suitable mask.
    */
@@@ -3069,29 -3043,34 +3069,29 @@@ out_free_mask
   }
   
   static int
- -__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+ +__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
   
   /*
    * Restore the affinity of a task @p which was previously restricted by a
- - * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
- - * @p->user_cpus_ptr.
+ + * call to force_compatible_cpus_allowed_ptr().
    *
    * It is the caller's responsibility to serialise this with any calls to
    * force_compatible_cpus_allowed_ptr(@p).
    */
   void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
   {
- -      struct cpumask *user_mask = p->user_cpus_ptr;
- -      unsigned long flags;
+ +      struct affinity_context ac = {
+ +              .new_mask  = task_user_cpus(p),
+ +              .flags     = 0,
+ +      };
+ +      int ret;
   
         /*
- -       * Try to restore the old affinity mask. If this fails, then
- -       * we free the mask explicitly to avoid it being inherited across
- -       * a subsequent fork().
+ +       * Try to restore the old affinity mask with __sched_setaffinity().
+ +       * Cpuset masking will be done there too.
          */
- -      if (!user_mask || !__sched_setaffinity(p, user_mask))
- -              return;
- -
- -      raw_spin_lock_irqsave(&p->pi_lock, flags);
- -      user_mask = clear_user_cpus_ptr(p);
- -      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- -
- -      kfree(user_mask);
+ +      ret = __sched_setaffinity(p, &ac);
+ +      WARN_ON_ONCE(ret);
   }
   
   void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@@ -3569,9 -3548,10 +3569,9 @@@ void sched_set_stop_task(int cpu, struc
   #else /* CONFIG_SMP */
   
   static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- -                                       const struct cpumask *new_mask,
- -                                       u32 flags)
+ +                                       struct affinity_context *ctx)
   {
- -      return set_cpus_allowed_ptr(p, new_mask);
+ +      return set_cpus_allowed_ptr(p, ctx->new_mask);
   }
   
   static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
@@@ -3739,6 -3719,13 +3739,6 @@@ void sched_ttwu_pending(void *arg
         if (!llist)
                 return;
   
- -      /*
- -       * rq::ttwu_pending racy indication of out-standing wakeups.
- -       * Races such that false-negatives are possible, since they
- -       * are shorter lived that false-positives would be.
- -       */
- -      WRITE_ONCE(rq->ttwu_pending, 0);
- -
         rq_lock_irqsave(rq, &rf);
         update_rq_clock(rq);
   
@@@ -3752,17 -3739,6 +3752,17 @@@
                 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
         }
   
+ +      /*
+ +       * Must be after enqueueing at least one task such that
+ +       * idle_cpu() does not observe a false-negative -- if it does,
+ +       * it is possible for select_idle_siblings() to stack a number
+ +       * of tasks on this CPU during that window.
+ +       *
+ +       * It is ok to clear ttwu_pending when another task is pending.
+ +       * We will receive an IPI after local irqs are enabled and then enqueue it.
+ +       * Since now nr_running > 0, idle_cpu() will always get the correct result.
+ +       */
+ +      WRITE_ONCE(rq->ttwu_pending, 0);
         rq_unlock_irqrestore(rq, &rf);
   }
   
@@@ -4224,40 -4200,6 +4224,40 @@@ out
         return success;
   }
   
+ +static bool __task_needs_rq_lock(struct task_struct *p)
+ +{
+ +      unsigned int state = READ_ONCE(p->__state);
+ +
+ +      /*
+ +       * Since p->pi_lock blocks try_to_wake_up(), we don't need rq->lock when
+ +       * the task is blocked. Make sure to check @state since ttwu() can drop
+ +       * locks at the end, see ttwu_queue_wakelist().
+ +       */
+ +      if (state == TASK_RUNNING || state == TASK_WAKING)
+ +              return true;
+ +
+ +      /*
+ +       * Ensure we load p->on_rq after p->__state, otherwise it would be
+ +       * possible to, falsely, observe p->on_rq == 0.
+ +       *
+ +       * See try_to_wake_up() for a longer comment.
+ +       */
+ +      smp_rmb();
+ +      if (p->on_rq)
+ +              return true;
+ +
+ +#ifdef CONFIG_SMP
+ +      /*
+ +       * Ensure the task has finished __schedule() and will not be referenced
+ +       * anymore. Again, see try_to_wake_up() for a longer comment.
+ +       */
+ +      smp_rmb();
+ +      smp_cond_load_acquire(&p->on_cpu, !VAL);
+ +#endif
+ +
+ +      return false;
+ +}
+ +
   /**
    * task_call_func - Invoke a function on task in fixed state
    * @p: Process for which the function is to be invoked, can be @current.
@@@ -4275,12 -4217,28 +4275,12 @@@
   int task_call_func(struct task_struct *p, task_call_f func, void *arg)
   {
         struct rq *rq = NULL;
- -      unsigned int state;
         struct rq_flags rf;
         int ret;
   
         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
   
- -      state = READ_ONCE(p->__state);
- -
- -      /*
- -       * Ensure we load p->on_rq after p->__state, otherwise it would be
- -       * possible to, falsely, observe p->on_rq == 0.
- -       *
- -       * See try_to_wake_up() for a longer comment.
- -       */
- -      smp_rmb();
- -
- -      /*
- -       * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
- -       * the task is blocked. Make sure to check @state since ttwu() can drop
- -       * locks at the end, see ttwu_queue_wakelist().
- -       */
- -      if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq)
+ +      if (__task_needs_rq_lock(p))
                 rq = __task_rq_lock(p, &rf);
   
         /*
@@@ -4443,7 -4401,7 +4443,7 @@@ static void reset_memory_tiering(void
         }
   }
   
- int sysctl_numa_balancing(struct ctl_table *table, int write,
+ static int sysctl_numa_balancing(struct ctl_table *table, int write,
                           void *buffer, size_t *lenp, loff_t *ppos)
   {
         struct ctl_table t;
@@@ -4570,6 -4528,17 +4570,17 @@@ static struct ctl_table sched_core_sysc
                 .proc_handler   = sysctl_sched_uclamp_handler,
         },
   #endif /* CONFIG_UCLAMP_TASK */
+ #ifdef CONFIG_NUMA_BALANCING
+       {
+               .procname       = "numa_balancing",
+               .data           = NULL, /* filled in by handler */
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = sysctl_numa_balancing,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_FOUR,
+       },
+ #endif /* CONFIG_NUMA_BALANCING */
         {}
   };
   static int __init sched_core_sysctl_init(void)
@@@ -8130,7 -8099,7 +8141,7 @@@ int dl_task_check_affinity(struct task_
   #endif
   
   static int
- -__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+ +__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
   {
         int retval;
         cpumask_var_t cpus_allowed, new_mask;
@@@ -8144,16 -8113,13 +8155,16 @@@
         }
   
         cpuset_cpus_allowed(p, cpus_allowed);
- -      cpumask_and(new_mask, mask, cpus_allowed);
+ +      cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+ +
+ +      ctx->new_mask = new_mask;
+ +      ctx->flags |= SCA_CHECK;
   
         retval = dl_task_check_affinity(p, new_mask);
         if (retval)
                 goto out_free_new_mask;
- -again:
- -      retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+ +
+ +      retval = __set_cpus_allowed_ptr(p, ctx);
         if (retval)
                 goto out_free_new_mask;
   
@@@ -8164,24 -8130,7 +8175,24 @@@
                  * Just reset the cpumask to the cpuset's cpus_allowed.
                  */
                 cpumask_copy(new_mask, cpus_allowed);
- -              goto again;
+ +
+ +              /*
+ +               * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ +               * will restore the previous user_cpus_ptr value.
+ +               *
+ +               * In the unlikely event a previous user_cpus_ptr exists,
+ +               * we need to further restrict the mask to what is allowed
+ +               * by that old user_cpus_ptr.
+ +               */
+ +              if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ +                      bool empty = !cpumask_and(new_mask, new_mask,
+ +                                                ctx->user_mask);
+ +
+ +                      if (WARN_ON_ONCE(empty))
+ +                              cpumask_copy(new_mask, cpus_allowed);
+ +              }
+ +              __set_cpus_allowed_ptr(p, ctx);
+ +              retval = -EINVAL;
         }
   
   out_free_new_mask:
@@@ -8193,8 -8142,6 +8204,8 @@@ out_free_cpus_allowed
   
   long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
   {
+ +      struct affinity_context ac;
+ +      struct cpumask *user_mask;
         struct task_struct *p;
         int retval;
   
@@@ -8229,21 -8176,7 +8240,21 @@@
         if (retval)
                 goto out_put_task;
   
- -      retval = __sched_setaffinity(p, in_mask);
+ +      user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+ +      if (!user_mask) {
+ +              retval = -ENOMEM;
+ +              goto out_put_task;
+ +      }
+ +      cpumask_copy(user_mask, in_mask);
+ +      ac = (struct affinity_context){
+ +              .new_mask  = in_mask,
+ +              .user_mask = user_mask,
+ +              .flags     = SCA_USER,
+ +      };
+ +
+ +      retval = __sched_setaffinity(p, &ac);
+ +      kfree(ac.user_mask);
+ +
   out_put_task:
         put_task_struct(p);
         return retval;
@@@ -9024,12 -8957,6 +9035,12 @@@ void show_state_filter(unsigned int sta
    */
   void __init init_idle(struct task_struct *idle, int cpu)
   {
+ +#ifdef CONFIG_SMP
+ +      struct affinity_context ac = (struct affinity_context) {
+ +              .new_mask  = cpumask_of(cpu),
+ +              .flags     = 0,
+ +      };
+ +#endif
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
   
@@@ -9054,7 -8981,7 +9065,7 @@@
          *
          * And since this is boot we can forgo the serialization.
          */
- -      set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
+ +      set_cpus_allowed_common(idle, &ac);
   #endif
         /*
          * We're having a chicken and egg problem, even though we are
@@@ -9841,7 -9768,6 +9852,7 @@@ void __init sched_init(void
   
                 rq->core_cookie = 0UL;
   #endif
+ +              zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
         }
   
         set_load_weight(&init_task, false);
diff --combined kernel/sched/fair.c

index 0cd1d0f7c1bd7c288392185894a61099cafd9eb0,8e029a6460bb8936d28f4fc3a84cf62b366c616b..c36aa54ae071a746bc9d585a33228c731e5de025
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -178,6 -178,11 +178,11 @@@ int __weak arch_asym_cpu_priority(int c
   static unsigned int sysctl_sched_cfs_bandwidth_slice          = 5000UL;
   #endif
   
+ #ifdef CONFIG_NUMA_BALANCING
+ /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+ #endif
+ 
   #ifdef CONFIG_SYSCTL
   static struct ctl_table sched_fair_sysctls[] = {
         {
@@@ -197,6 -202,16 +202,16 @@@
                 .extra1         = SYSCTL_ONE,
         },
   #endif
+ #ifdef CONFIG_NUMA_BALANCING
+       {
+               .procname       = "numa_balancing_promote_rate_limit_MBps",
+               .data           = &sysctl_numa_balancing_promote_rate_limit,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+       },
+ #endif /* CONFIG_NUMA_BALANCING */
         {}
   };
   
@@@ -1094,9 -1109,6 +1109,6 @@@ unsigned int sysctl_numa_balancing_scan
   /* The page with hint page fault latency < threshold in ms is considered hot */
   unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
   
- /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
- unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
- 
   struct numa_group {
         refcount_t refcount;
   
@@@ -2964,7 -2976,7 +2976,7 @@@ static void task_numa_work(struct callb
         }
   
         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
- -      if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ +      if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
                 return;
   
         /*
@@@ -4280,16 -4292,14 +4292,16 @@@ static inline unsigned long task_util_e
   }
   
   #ifdef CONFIG_UCLAMP_TASK
- -static inline unsigned long uclamp_task_util(struct task_struct *p)
+ +static inline unsigned long uclamp_task_util(struct task_struct *p,
+ +                                           unsigned long uclamp_min,
+ +                                           unsigned long uclamp_max)
   {
- -      return clamp(task_util_est(p),
- -                   uclamp_eff_value(p, UCLAMP_MIN),
- -                   uclamp_eff_value(p, UCLAMP_MAX));
+ +      return clamp(task_util_est(p), uclamp_min, uclamp_max);
   }
   #else
- -static inline unsigned long uclamp_task_util(struct task_struct *p)
+ +static inline unsigned long uclamp_task_util(struct task_struct *p,
+ +                                           unsigned long uclamp_min,
+ +                                           unsigned long uclamp_max)
   {
         return task_util_est(p);
   }
@@@ -4428,139 -4438,10 +4440,139 @@@ done
         trace_sched_util_est_se_tp(&p->se);
   }
   
- -static inline int task_fits_capacity(struct task_struct *p,
- -                                   unsigned long capacity)
+ +static inline int util_fits_cpu(unsigned long util,
+ +                              unsigned long uclamp_min,
+ +                              unsigned long uclamp_max,
+ +                              int cpu)
   {
- -      return fits_capacity(uclamp_task_util(p), capacity);
+ +      unsigned long capacity_orig, capacity_orig_thermal;
+ +      unsigned long capacity = capacity_of(cpu);
+ +      bool fits, uclamp_max_fits;
+ +
+ +      /*
+ +       * Check if the real util fits without any uclamp boost/cap applied.
+ +       */
+ +      fits = fits_capacity(util, capacity);
+ +
+ +      if (!uclamp_is_used())
+ +              return fits;
+ +
+ +      /*
+ +       * We must use capacity_orig_of() for comparing against uclamp_min and
+ +       * uclamp_max. We only care about capacity pressure (by using
+ +       * capacity_of()) for comparing against the real util.
+ +       *
+ +       * If a task is boosted to 1024 for example, we don't want a tiny
+ +       * pressure to skew the check whether it fits a CPU or not.
+ +       *
+ +       * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+ +       * should fit a little cpu even if there's some pressure.
+ +       *
+ +       * Only exception is for thermal pressure since it has a direct impact
+ +       * on available OPP of the system.
+ +       *
+ +       * We honour it for uclamp_min only as a drop in performance level
+ +       * could result in not getting the requested minimum performance level.
+ +       *
+ +       * For uclamp_max, we can tolerate a drop in performance level as the
+ +       * goal is to cap the task. So it's okay if it's getting less.
+ +       *
+ +       * In case of capacity inversion we should honour the inverted capacity
+ +       * for both uclamp_min and uclamp_max all the time.
+ +       */
+ +      capacity_orig = cpu_in_capacity_inversion(cpu);
+ +      if (capacity_orig) {
+ +              capacity_orig_thermal = capacity_orig;
+ +      } else {
+ +              capacity_orig = capacity_orig_of(cpu);
+ +              capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
+ +      }
+ +
+ +      /*
+ +       * We want to force a task to fit a cpu as implied by uclamp_max.
+ +       * But we do have some corner cases to cater for..
+ +       *
+ +       *
+ +       *                                 C=z
+ +       *   |                             ___
+ +       *   |                  C=y       |   |
+ +       *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _  uclamp_max
+ +       *   |      C=x        |   |      |   |
+ +       *   |      ___        |   |      |   |
+ +       *   |     |   |       |   |      |   |    (util somewhere in this region)
+ +       *   |     |   |       |   |      |   |
+ +       *   |     |   |       |   |      |   |
+ +       *   +----------------------------------------
+ +       *         cpu0        cpu1       cpu2
+ +       *
+ +       *   In the above example if a task is capped to a specific performance
+ +       *   point, y, then when:
+ +       *
+ +       *   * util = 80% of x then it does not fit on cpu0 and should migrate
+ +       *     to cpu1
+ +       *   * util = 80% of y then it is forced to fit on cpu1 to honour
+ +       *     uclamp_max request.
+ +       *
+ +       *   which is what we're enforcing here. A task always fits if
+ +       *   uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+ +       *   the normal upmigration rules should still apply.
+ +       *
+ +       *   Only exception is when we are on max capacity, then we need to be
+ +       *   careful not to block overutilized state. This is so because:
+ +       *
+ +       *     1. There's no concept of capping at max_capacity! We can't go
+ +       *        beyond this performance level anyway.
+ +       *     2. The system is being saturated when we're operating near
+ +       *        max capacity, it doesn't make sense to block overutilized.
+ +       */
+ +      uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+ +      uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
+ +      fits = fits || uclamp_max_fits;
+ +
+ +      /*
+ +       *
+ +       *                                 C=z
+ +       *   |                             ___       (region a, capped, util >= uclamp_max)
+ +       *   |                  C=y       |   |
+ +       *   |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ +       *   |      C=x        |   |      |   |
+ +       *   |      ___        |   |      |   |      (region b, uclamp_min <= util <= uclamp_max)
+ +       *   |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+ +       *   |     |   |       |   |      |   |
+ +       *   |     |   |       |   |      |   |      (region c, boosted, util < uclamp_min)
+ +       *   +----------------------------------------
+ +       *         cpu0        cpu1       cpu2
+ +       *
+ +       * a) If util > uclamp_max, then we're capped, we don't care about
+ +       *    actual fitness value here. We only care if uclamp_max fits
+ +       *    capacity without taking margin/pressure into account.
+ +       *    See comment above.
+ +       *
+ +       * b) If uclamp_min <= util <= uclamp_max, then the normal
+ +       *    fits_capacity() rules apply. Except we need to ensure that we
+ +       *    enforce we remain within uclamp_max, see comment above.
+ +       *
+ +       * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+ +       *    need to take into account the boosted value fits the CPU without
+ +       *    taking margin/pressure into account.
+ +       *
+ +       * Cases (a) and (b) are handled in the 'fits' variable already. We
+ +       * just need to consider an extra check for case (c) after ensuring we
+ +       * handle the case uclamp_min > uclamp_max.
+ +       */
+ +      uclamp_min = min(uclamp_min, uclamp_max);
+ +      if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
+ +              fits = fits && (uclamp_min <= capacity_orig_thermal);
+ +
+ +      return fits;
+ +}
+ +
+ +static inline int task_fits_cpu(struct task_struct *p, int cpu)
+ +{
+ +      unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ +      unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ +      unsigned long util = task_util_est(p);
+ +      return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
   }
   
   static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@@ -4573,7 -4454,7 +4585,7 @@@
                 return;
         }
   
- -      if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ +      if (task_fits_cpu(p, cpu_of(rq))) {
                 rq->misfit_task_load = 0;
                 return;
         }
@@@ -5993,10 -5874,7 +6005,10 @@@ static inline void hrtick_update(struc
   #ifdef CONFIG_SMP
   static inline bool cpu_overutilized(int cpu)
   {
- -      return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
+ +      unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ +      unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ +
+ +      return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
   }
   
   static inline void update_overutilized_status(struct rq *rq)
@@@ -6788,23 -6666,21 +6800,23 @@@ static int select_idle_cpu(struct task_
   static int
   select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
   {
- -      unsigned long task_util, best_cap = 0;
+ +      unsigned long task_util, util_min, util_max, best_cap = 0;
         int cpu, best_cpu = -1;
         struct cpumask *cpus;
   
         cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
         cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
   
- -      task_util = uclamp_task_util(p);
+ +      task_util = task_util_est(p);
+ +      util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ +      util_max = uclamp_eff_value(p, UCLAMP_MAX);
   
         for_each_cpu_wrap(cpu, cpus, target) {
                 unsigned long cpu_cap = capacity_of(cpu);
   
                 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
                         continue;
- -              if (fits_capacity(task_util, cpu_cap))
+ +              if (util_fits_cpu(task_util, util_min, util_max, cpu))
                         return cpu;
   
                 if (cpu_cap > best_cap) {
@@@ -6816,13 -6692,10 +6828,13 @@@
         return best_cpu;
   }
   
- -static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
+ +static inline bool asym_fits_cpu(unsigned long util,
+ +                               unsigned long util_min,
+ +                               unsigned long util_max,
+ +                               int cpu)
   {
         if (sched_asym_cpucap_active())
- -              return fits_capacity(task_util, capacity_of(cpu));
+ +              return util_fits_cpu(util, util_min, util_max, cpu);
   
         return true;
   }
@@@ -6834,7 -6707,7 +6846,7 @@@ static int select_idle_sibling(struct t
   {
         bool has_idle_core = false;
         struct sched_domain *sd;
- -      unsigned long task_util;
+ +      unsigned long task_util, util_min, util_max;
         int i, recent_used_cpu;
   
         /*
@@@ -6843,9 -6716,7 +6855,9 @@@
          */
         if (sched_asym_cpucap_active()) {
                 sync_entity_load_avg(&p->se);
- -              task_util = uclamp_task_util(p);
+ +              task_util = task_util_est(p);
+ +              util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ +              util_max = uclamp_eff_value(p, UCLAMP_MAX);
         }
   
         /*
@@@ -6854,7 -6725,7 +6866,7 @@@
         lockdep_assert_irqs_disabled();
   
         if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
- -          asym_fits_capacity(task_util, target))
+ +          asym_fits_cpu(task_util, util_min, util_max, target))
                 return target;
   
         /*
@@@ -6862,7 -6733,7 +6874,7 @@@
          */
         if (prev != target && cpus_share_cache(prev, target) &&
             (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
- -          asym_fits_capacity(task_util, prev))
+ +          asym_fits_cpu(task_util, util_min, util_max, prev))
                 return prev;
   
         /*
@@@ -6877,7 -6748,7 +6889,7 @@@
             in_task() &&
             prev == smp_processor_id() &&
             this_rq()->nr_running <= 1 &&
- -          asym_fits_capacity(task_util, prev)) {
+ +          asym_fits_cpu(task_util, util_min, util_max, prev)) {
                 return prev;
         }
   
@@@ -6889,7 -6760,7 +6901,7 @@@
             cpus_share_cache(recent_used_cpu, target) &&
             (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
             cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
- -          asym_fits_capacity(task_util, recent_used_cpu)) {
+ +          asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
                 return recent_used_cpu;
         }
   
@@@ -7185,8 -7056,6 +7197,8 @@@ static int find_energy_efficient_cpu(st
   {
         struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
         unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+ +      unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ +      unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
         struct root_domain *rd = this_rq()->rd;
         int cpu, best_energy_cpu, target = -1;
         struct sched_domain *sd;
@@@ -7211,7 -7080,7 +7223,7 @@@
         target = prev_cpu;
   
         sync_entity_load_avg(&p->se);
- -      if (!task_util_est(p))
+ +      if (!uclamp_task_util(p, p_util_min, p_util_max))
                 goto unlock;
   
         eenv_task_busy_time(&eenv, p, prev_cpu);
@@@ -7219,9 -7088,7 +7231,9 @@@
         for (; pd; pd = pd->next) {
                 unsigned long cpu_cap, cpu_thermal_cap, util;
                 unsigned long cur_delta, max_spare_cap = 0;
- -              bool compute_prev_delta = false;
+ +              unsigned long rq_util_min, rq_util_max;
+ +              unsigned long util_min, util_max;
+ +              unsigned long prev_spare_cap = 0;
                 int max_spare_cap_cpu = -1;
                 unsigned long base_energy;
   
@@@ -7257,45 -7124,26 +7269,45 @@@
                          * much capacity we can get out of the CPU; this is
                          * aligned with sched_cpu_util().
                          */
- -                      util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
- -                      if (!fits_capacity(util, cpu_cap))
+ +                      if (uclamp_is_used()) {
+ +                              if (uclamp_rq_is_idle(cpu_rq(cpu))) {
+ +                                      util_min = p_util_min;
+ +                                      util_max = p_util_max;
+ +                              } else {
+ +                                      /*
+ +                                       * Open code uclamp_rq_util_with() except for
+ +                                       * the clamp() part. Ie: apply max aggregation
+ +                                       * only. util_fits_cpu() logic needs to
+ +                                       * operate on non-clamped util but must use the
+ +                                       * max-aggregated uclamp_{min, max}.
+ +                                       */
+ +                                      rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ +                                      rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ +
+ +                                      util_min = max(rq_util_min, p_util_min);
+ +                                      util_max = max(rq_util_max, p_util_max);
+ +                              }
+ +                      }
+ +                      if (!util_fits_cpu(util, util_min, util_max, cpu))
                                 continue;
   
                         lsub_positive(&cpu_cap, util);
   
                         if (cpu == prev_cpu) {
                                 /* Always use prev_cpu as a candidate. */
- -                              compute_prev_delta = true;
+ +                              prev_spare_cap = cpu_cap;
                         } else if (cpu_cap > max_spare_cap) {
                                 /*
                                  * Find the CPU with the maximum spare capacity
- -                               * in the performance domain.
+ +                               * among the remaining CPUs in the performance
+ +                               * domain.
                                  */
                                 max_spare_cap = cpu_cap;
                                 max_spare_cap_cpu = cpu;
                         }
                 }
   
- -              if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+ +              if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
                         continue;
   
                 eenv_pd_busy_time(&eenv, cpus, p);
@@@ -7303,7 -7151,7 +7315,7 @@@
                 base_energy = compute_energy(&eenv, pd, cpus, p, -1);
   
                 /* Evaluate the energy impact of using prev_cpu. */
- -              if (compute_prev_delta) {
+ +              if (prev_spare_cap > 0) {
                         prev_delta = compute_energy(&eenv, pd, cpus, p,
                                                     prev_cpu);
                         /* CPU utilization has changed */
@@@ -7314,7 -7162,7 +7326,7 @@@
                 }
   
                 /* Evaluate the energy impact of using max_spare_cap_cpu. */
- -              if (max_spare_cap_cpu >= 0) {
+ +              if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
                         cur_delta = compute_energy(&eenv, pd, cpus, p,
                                                    max_spare_cap_cpu);
                         /* CPU utilization has changed */
@@@ -8440,7 -8288,7 +8452,7 @@@ static int detach_tasks(struct lb_env *
   
                 case migrate_misfit:
                         /* This is not a misfit task */
- -                      if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ +                      if (task_fits_cpu(p, env->src_cpu))
                                 goto next;
   
                         env->imbalance = 0;
@@@ -8829,73 -8677,16 +8841,73 @@@ static unsigned long scale_rt_capacity(
   
   static void update_cpu_capacity(struct sched_domain *sd, int cpu)
   {
+ +      unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);
         unsigned long capacity = scale_rt_capacity(cpu);
         struct sched_group *sdg = sd->groups;
+ +      struct rq *rq = cpu_rq(cpu);
   
- -      cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
+ +      rq->cpu_capacity_orig = capacity_orig;
   
         if (!capacity)
                 capacity = 1;
   
- -      cpu_rq(cpu)->cpu_capacity = capacity;
- -      trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+ +      rq->cpu_capacity = capacity;
+ +
+ +      /*
+ +       * Detect if the performance domain is in capacity inversion state.
+ +       *
+ +       * Capacity inversion happens when another perf domain with equal or
+ +       * lower capacity_orig_of() ends up having higher capacity than this
+ +       * domain after subtracting thermal pressure.
+ +       *
+ +       * We only take into account thermal pressure in this detection as it's
+ +       * the only metric that actually results in *real* reduction of
+ +       * capacity due to performance points (OPPs) being dropped/becoming
+ +       * unreachable due to thermal throttling.
+ +       *
+ +       * We assume:
+ +       *   * That all cpus in a perf domain have the same capacity_orig
+ +       *     (same uArch).
+ +       *   * Thermal pressure will impact all cpus in this perf domain
+ +       *     equally.
+ +       */
+ +      if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ +              unsigned long inv_cap = capacity_orig - thermal_load_avg(rq);
+ +              struct perf_domain *pd = rcu_dereference(rq->rd->pd);
+ +
+ +              rq->cpu_capacity_inverted = 0;
+ +
+ +              for (; pd; pd = pd->next) {
+ +                      struct cpumask *pd_span = perf_domain_span(pd);
+ +                      unsigned long pd_cap_orig, pd_cap;
+ +
+ +                      cpu = cpumask_any(pd_span);
+ +                      pd_cap_orig = arch_scale_cpu_capacity(cpu);
+ +
+ +                      if (capacity_orig < pd_cap_orig)
+ +                              continue;
+ +
+ +                      /*
+ +                       * Handle the case where multiple perf domains have the
+ +                       * same capacity_orig but one of them is under higher
+ +                       * thermal pressure. We record it as capacity
+ +                       * inversion.
+ +                       */
+ +                      if (capacity_orig == pd_cap_orig) {
+ +                              pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu));
+ +
+ +                              if (pd_cap > inv_cap) {
+ +                                      rq->cpu_capacity_inverted = inv_cap;
+ +                                      break;
+ +                              }
+ +                      } else if (pd_cap_orig > inv_cap) {
+ +                              rq->cpu_capacity_inverted = inv_cap;
+ +                              break;
+ +                      }
+ +              }
+ +      }
+ +
+ +      trace_sched_cpu_capacity_tp(rq);
   
         sdg->sgc->capacity = capacity;
         sdg->sgc->min_capacity = capacity;
@@@ -9502,10 -9293,6 +9514,10 @@@ static inline void update_sg_wakeup_sta
   
         memset(sgs, 0, sizeof(*sgs));
   
+ +      /* Assume that task can't fit any CPU of the group */
+ +      if (sd->flags & SD_ASYM_CPUCAPACITY)
+ +              sgs->group_misfit_task_load = 1;
+ +
         for_each_cpu(i, sched_group_span(group)) {
                 struct rq *rq = cpu_rq(i);
                 unsigned int local;
@@@ -9525,12 -9312,12 +9537,12 @@@
                 if (!nr_running && idle_cpu_without(i, p))
                         sgs->idle_cpus++;
   
- -      }
+ +              /* Check if task fits in the CPU */
+ +              if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ +                  sgs->group_misfit_task_load &&
+ +                  task_fits_cpu(p, i))
+ +                      sgs->group_misfit_task_load = 0;
   
- -      /* Check if task fits in the group */
- -      if (sd->flags & SD_ASYM_CPUCAPACITY &&
- -          !task_fits_capacity(p, group->sgc->max_capacity)) {
- -              sgs->group_misfit_task_load = 1;
         }
   
         sgs->group_capacity = group->sgc->capacity;
diff --combined kernel/sysctl.c

index c6d9dec11b749de9b66d88498236a4e00f7cbf87,42ac27f1eb19f9216527e5fe4279ecbe6b2863e3..a4d8e19e231a5cf3ff68a4834878e8f70a6ea5b3
--- 1/kernel/sysctl.c
--- 2/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -267,14 -267,13 +267,14 @@@ int proc_dostring(struct ctl_table *tab
                         ppos);
   }
   
- -static size_t proc_skip_spaces(char **buf)
+ +static void proc_skip_spaces(char **buf, size_t *size)
   {
- -      size_t ret;
- -      char *tmp = skip_spaces(*buf);
- -      ret = tmp - *buf;
- -      *buf = tmp;
- -      return ret;
+ +      while (*size) {
+ +              if (!isspace(**buf))
+ +                      break;
+ +              (*size)--;
+ +              (*buf)++;
+ +      }
   }
   
   static void proc_skip_char(char **buf, size_t *size, const char v)
@@@ -343,12 -342,13 +343,12 @@@ static int proc_get_long(char **buf, si
                           unsigned long *val, bool *neg,
                           const char *perm_tr, unsigned perm_tr_len, char *tr)
   {
- -      int len;
         char *p, tmp[TMPBUFLEN];
+ +      ssize_t len = *size;
   
- -      if (!*size)
+ +      if (len <= 0)
                 return -EINVAL;
   
- -      len = *size;
         if (len > TMPBUFLEN - 1)
                 len = TMPBUFLEN - 1;
   
@@@ -521,7 -521,7 +521,7 @@@ static int __do_proc_dointvec(void *tbl
                 bool neg;
   
                 if (write) {
- -                      left -= proc_skip_spaces(&p);
+ +                      proc_skip_spaces(&p, &left);
   
                         if (!left)
                                 break;
@@@ -548,7 -548,7 +548,7 @@@
         if (!write && !first && left && !err)
                 proc_put_char(&buffer, &left, '\n');
         if (write && !err && left)
- -              left -= proc_skip_spaces(&p);
+ +              proc_skip_spaces(&p, &left);
         if (write && first)
                 return err ? : -EINVAL;
         *lenp -= left;
@@@ -590,7 -590,7 +590,7 @@@ static int do_proc_douintvec_w(unsigne
         if (left > PAGE_SIZE - 1)
                 left = PAGE_SIZE - 1;
   
- -      left -= proc_skip_spaces(&p);
+ +      proc_skip_spaces(&p, &left);
         if (!left) {
                 err = -EINVAL;
                 goto out_free;
@@@ -610,7 -610,7 +610,7 @@@
         }
   
         if (!err && left)
- -              left -= proc_skip_spaces(&p);
+ +              proc_skip_spaces(&p, &left);
   
   out_free:
         if (err)
@@@ -1075,7 -1075,7 +1075,7 @@@ static int __do_proc_doulongvec_minmax(
                 if (write) {
                         bool neg;
   
- -                      left -= proc_skip_spaces(&p);
+ +                      proc_skip_spaces(&p, &left);
                         if (!left)
                                 break;
   
@@@ -1104,7 -1104,7 +1104,7 @@@
         if (!write && !first && left && !err)
                 proc_put_char(&buffer, &left, '\n');
         if (write && !err)
- -              left -= proc_skip_spaces(&p);
+ +              proc_skip_spaces(&p, &left);
         if (write && first)
                 return err ? : -EINVAL;
         *lenp -= left;
@@@ -1633,25 -1633,6 +1633,6 @@@ int proc_do_static_key(struct ctl_tabl
   }
   
   static struct ctl_table kern_table[] = {
- #ifdef CONFIG_NUMA_BALANCING
-       {
-               .procname       = "numa_balancing",
-               .data           = NULL, /* filled in by handler */
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = sysctl_numa_balancing,
-               .extra1         = SYSCTL_ZERO,
-               .extra2         = SYSCTL_FOUR,
-       },
-       {
-               .procname       = "numa_balancing_promote_rate_limit_MBps",
-               .data           = &sysctl_numa_balancing_promote_rate_limit,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = SYSCTL_ZERO,
-       },
- #endif /* CONFIG_NUMA_BALANCING */
         {
                 .procname       = "panic",
                 .data           = &panic_timeout,
author	Linus Torvalds <[email protected]>
	Tue, 13 Dec 2022 22:16:44 +0000 (14:16 -0800)
committer	Linus Torvalds <[email protected]>
	Tue, 13 Dec 2022 22:16:44 +0000 (14:16 -0800)
		1	2
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history