Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <[email protected]>

Wed, 26 Dec 2018 22:56:10 +0000 (14:56 -0800)

committer Linus Torvalds <[email protected]>

Wed, 26 Dec 2018 22:56:10 +0000 (14:56 -0800)
author Linus Torvalds <[email protected]>
Wed, 26 Dec 2018 22:56:10 +0000 (14:56 -0800)
committer Linus Torvalds <[email protected]>
Wed, 26 Dec 2018 22:56:10 +0000 (14:56 -0800)
diff --combined include/linux/sched.h

index 4f1db3ef62a9032fb17b34bee4ded192fd9feb0d,b8c7ba0e3796f09f85841806f1371e9b270c13d6..89541d248893e2e04b3e1849956a43b8b4c9c12d
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -176,7 -176,7 +176,7 @@@ struct task_group
    * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
    *
    * However, with slightly different timing the wakeup TASK_RUNNING store can
-  * also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not
+  * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
    * a problem either because that will result in one extra go around the loop
    * and our @cond test will save the day.
    *
@@@ -515,7 -515,7 +515,7 @@@ struct sched_dl_entity 
   
         /*
          * Actual scheduling parameters. Initialized with the values above,
-        * they are continously updated during task execution. Note that
+        * they are continuously updated during task execution. Note that
          * the remaining runtime could be < 0 in case we are in overrun.
          */
         s64                             runtime;        /* Remaining runtime for this instance  */
@@@ -572,10 -572,8 +572,10 @@@ union rcu_special 
         struct {
                 u8                      blocked;
                 u8                      need_qs;
+ +              u8                      exp_hint; /* Hint for performance. */
+ +              u8                      pad; /* No garbage from compiler! */
         } b; /* Bits. */
- -      u16 s; /* Set of bits. */
+ +      u32 s; /* Set of bits. */
   };
   
   enum perf_event_task_context {
@@@ -995,7 -993,7 +995,7 @@@ struct task_struct 
         /* cg_list protected by css_set_lock and tsk->alloc_lock: */
         struct list_head                cg_list;
   #endif
- -#ifdef CONFIG_INTEL_RDT
+ +#ifdef CONFIG_RESCTRL
         u32                             closid;
         u32                             rmid;
   #endif
diff --combined kernel/sched/core.c

index a5b7f1c9f24f1e6c8765221cab2757f1d59d3ddd,e4ca15d75541f95013f7fdd3f8756f92a73f67da..f6692017337032f4cc69f684d1a5039780b9fd34
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -697,7 -697,7 +697,7 @@@ static void set_load_weight(struct task
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
-       if (idle_policy(p->policy)) {
+       if (task_has_idle_policy(p)) {
                 load->weight = scale_load(WEIGHT_IDLEPRIO);
                 load->inv_weight = WMULT_IDLEPRIO;
                 p->se.runnable_weight = load->weight;
@@@ -2857,7 -2857,7 +2857,7 @@@ unsigned long nr_running(void
    * preemption, thus the result might have a time-of-check-to-time-of-use
    * race.  The caller is responsible to use it correctly, for example:
    *
-  * - from a non-preemptable section (of course)
+  * - from a non-preemptible section (of course)
    *
    * - from a thread that is bound to a single CPU
    *
@@@ -4191,7 -4191,7 +4191,7 @@@ recheck
                  * Treat SCHED_IDLE as nice 20. Only allow a switch to
                  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                  */
-               if (idle_policy(p->policy) && !idle_policy(policy)) {
+               if (task_has_idle_policy(p) && !idle_policy(policy)) {
                         if (!can_nice(p, task_nice(p)))
                                 return -EPERM;
                 }
@@@ -5783,7 -5783,7 +5783,7 @@@ int sched_cpu_deactivate(unsigned int c
          *
          * Do sync before park smpboot threads to take care the rcu boost case.
          */
- -      synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ +      synchronize_rcu();
   
   #ifdef CONFIG_SCHED_SMT
         /*
diff --combined kernel/sched/cpufreq_schedutil.c

index 626ddd4ffa4333723a405e639258c44d9dd55dd0,c2e53d1a314303776d60ef556103a935e9fb8c20..033ec7c45f13f37f52dd33466911993e111b7ac0
--- 1/kernel/sched/cpufreq_schedutil.c
--- 2/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@@ -1,15 -1,19 +1,16 @@@
+ +// SPDX-License-Identifier: GPL-2.0
   /*
    * CPUFreq governor based on scheduler-provided CPU utilization data.
    *
    * Copyright (C) 2016, Intel Corporation
    * Author: Rafael J. Wysocki <[email protected]>
- - *
- - * This program is free software; you can redistribute it and/or modify
- - * it under the terms of the GNU General Public License version 2 as
- - * published by the Free Software Foundation.
    */
   
   #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   
   #include "sched.h"
   
+ #include <linux/sched/cpufreq.h>
   #include <trace/events/power.h>
   
   struct sugov_tunables {
@@@ -164,7 -168,7 +165,7 @@@ static unsigned int get_next_freq(struc
         unsigned int freq = arch_scale_freq_invariant() ?
                                 policy->cpuinfo.max_freq : policy->cur;
   
-       freq = (freq + (freq >> 2)) * util / max;
+       freq = map_util_freq(util, freq, max);
   
         if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
                 return sg_policy->next_freq;
@@@ -194,15 -198,13 +195,13 @@@
    * based on the task model parameters and gives the minimal utilization
    * required to meet deadlines.
    */
- static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
+                                 unsigned long max, enum schedutil_type type)
   {
-       struct rq *rq = cpu_rq(sg_cpu->cpu);
-       unsigned long util, irq, max;
- 
-       sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
-       sg_cpu->bw_dl = cpu_bw_dl(rq);
+       unsigned long dl_util, util, irq;
+       struct rq *rq = cpu_rq(cpu);
   
-       if (rt_rq_is_runnable(&rq->rt))
+       if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
                 return max;
   
         /*
@@@ -220,21 -222,30 +219,30 @@@
          * utilization (PELT windows are synchronized) we can directly add them
          * to obtain the CPU's actual utilization.
          */
-       util = cpu_util_cfs(rq);
+       util = util_cfs;
         util += cpu_util_rt(rq);
   
+       dl_util = cpu_util_dl(rq);
+ 
         /*
-        * We do not make cpu_util_dl() a permanent part of this sum because we
-        * want to use cpu_bw_dl() later on, but we need to check if the
-        * CFS+RT+DL sum is saturated (ie. no idle time) such that we select
-        * f_max when there is no idle time.
+        * For frequency selection we do not make cpu_util_dl() a permanent part
+        * of this sum because we want to use cpu_bw_dl() later on, but we need
+        * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+        * that we select f_max when there is no idle time.
          *
          * NOTE: numerical errors or stop class might cause us to not quite hit
          * saturation when we should -- something for later.
          */
-       if ((util + cpu_util_dl(rq)) >= max)
+       if (util + dl_util >= max)
                 return max;
   
+       /*
+        * OTOH, for energy computation we need the estimated running time, so
+        * include util_dl and ignore dl_bw.
+        */
+       if (type == ENERGY_UTIL)
+               util += dl_util;
+ 
         /*
          * There is still idle time; further improve the number by using the
          * irq metric. Because IRQ/steal time is hidden from the task clock we
@@@ -257,7 -268,22 +265,22 @@@
          * bw_dl as requested freq. However, cpufreq is not yet ready for such
          * an interface. So, we only do the latter for now.
          */
-       return min(max, util + sg_cpu->bw_dl);
+       if (type == FREQUENCY_UTIL)
+               util += cpu_bw_dl(rq);
+ 
+       return min(max, util);
+ }
+ 
+ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+ {
+       struct rq *rq = cpu_rq(sg_cpu->cpu);
+       unsigned long util = cpu_util_cfs(rq);
+       unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ 
+       sg_cpu->max = max;
+       sg_cpu->bw_dl = cpu_bw_dl(rq);
+ 
+       return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
   }
   
   /**
@@@ -598,7 -624,7 +621,7 @@@ static struct kobj_type sugov_tunables_
   
   /********************** cpufreq governor interface *********************/
   
- static struct cpufreq_governor schedutil_gov;
+ struct cpufreq_governor schedutil_gov;
   
   static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
   {
@@@ -857,7 -883,7 +880,7 @@@ static void sugov_limits(struct cpufreq
         sg_policy->need_freq_update = true;
   }
   
- static struct cpufreq_governor schedutil_gov = {
+ struct cpufreq_governor schedutil_gov = {
         .name                   = "schedutil",
         .owner                  = THIS_MODULE,
         .dynamic_switching      = true,
@@@ -880,3 -906,36 +903,36 @@@ static int __init sugov_register(void
         return cpufreq_register_governor(&schedutil_gov);
   }
   fs_initcall(sugov_register);
+ 
+ #ifdef CONFIG_ENERGY_MODEL
+ extern bool sched_energy_update;
+ extern struct mutex sched_energy_mutex;
+ 
+ static void rebuild_sd_workfn(struct work_struct *work)
+ {
+       mutex_lock(&sched_energy_mutex);
+       sched_energy_update = true;
+       rebuild_sched_domains();
+       sched_energy_update = false;
+       mutex_unlock(&sched_energy_mutex);
+ }
+ static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+ 
+ /*
+  * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+  * on governor changes to make sure the scheduler knows about it.
+  */
+ void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+                                 struct cpufreq_governor *old_gov)
+ {
+       if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
+               /*
+                * When called from the cpufreq_register_driver() path, the
+                * cpu_hotplug_lock is already held, so use a work item to
+                * avoid nested locking in rebuild_sched_domains().
+                */
+               schedule_work(&rebuild_sd_work);
+       }
+ 
+ }
+ #endif
diff --combined kernel/sched/fair.c

index db514993565b2274b0b02466dd7ffc03b7e66140,ca469646ebe12d76a002fbbd457e4abb36a0454c..1c1cfbf6ba0c99e026d5672c1d9879e483e189b0
--- 1/kernel/sched/fair.c
--- 2/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -38,7 -38,7 +38,7 @@@
    * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
    */
   unsigned int sysctl_sched_latency                     = 6000000ULL;
- unsigned int normalized_sysctl_sched_latency          = 6000000ULL;
+ static unsigned int normalized_sysctl_sched_latency   = 6000000ULL;
   
   /*
    * The initial- and re-scaling of tunables is configurable
@@@ -58,8 -58,8 +58,8 @@@ enum sched_tunable_scaling sysctl_sched
    *
    * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
    */
- unsigned int sysctl_sched_min_granularity             = 750000ULL;
- unsigned int normalized_sysctl_sched_min_granularity  = 750000ULL;
+ unsigned int sysctl_sched_min_granularity                     = 750000ULL;
+ static unsigned int normalized_sysctl_sched_min_granularity   = 750000ULL;
   
   /*
    * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
@@@ -81,8 -81,8 +81,8 @@@ unsigned int sysctl_sched_child_runs_fi
    *
    * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
    */
- unsigned int sysctl_sched_wakeup_granularity          = 1000000UL;
- unsigned int normalized_sysctl_sched_wakeup_granularity       = 1000000UL;
+ unsigned int sysctl_sched_wakeup_granularity                  = 1000000UL;
+ static unsigned int normalized_sysctl_sched_wakeup_granularity        = 1000000UL;
   
   const_debug unsigned int sysctl_sched_migration_cost  = 500000UL;
   
@@@ -116,7 -116,7 +116,7 @@@ unsigned int sysctl_sched_cfs_bandwidth
    *
    * (default: ~20%)
    */
- unsigned int capacity_margin                          = 1280;
+ static unsigned int capacity_margin                   = 1280;
   
   static inline void update_load_add(struct load_weight *lw, unsigned long inc)
   {
@@@ -703,9 -703,9 +703,9 @@@ void init_entity_runnable_average(struc
         memset(sa, 0, sizeof(*sa));
   
         /*
-        * Tasks are intialized with full load to be seen as heavy tasks until
+        * Tasks are initialized with full load to be seen as heavy tasks until
          * they get a chance to stabilize to their real load level.
-        * Group entities are intialized with zero load to reflect the fact that
+        * Group entities are initialized with zero load to reflect the fact that
          * nothing has been attached to the task group yet.
          */
         if (entity_is_task(se))
@@@ -2734,6 -2734,17 +2734,17 @@@ account_entity_dequeue(struct cfs_rq *c
         WRITE_ONCE(*ptr, res);                                  \
   } while (0)
   
+ /*
+  * Remove and clamp on negative, from a local variable.
+  *
+  * A variant of sub_positive(), which does not use explicit load-store
+  * and is thus optimized for local variable updates.
+  */
+ #define lsub_positive(_ptr, _val) do {                                \
+       typeof(_ptr) ptr = (_ptr);                              \
+       *ptr -= min_t(typeof(*ptr), *ptr, _val);                \
+ } while (0)
+ 
   #ifdef CONFIG_SMP
   static inline void
   enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@@ -3604,7 -3615,7 +3615,7 @@@ static inline unsigned long _task_util_
   {
         struct util_est ue = READ_ONCE(p->se.avg.util_est);
   
-       return max(ue.ewma, ue.enqueued);
+       return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
   }
   
   static inline unsigned long task_util_est(struct task_struct *p)
@@@ -3622,7 -3633,7 +3633,7 @@@ static inline void util_est_enqueue(str
   
         /* Update root cfs_rq's estimated utilization */
         enqueued  = cfs_rq->avg.util_est.enqueued;
-       enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+       enqueued += _task_util_est(p);
         WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
   }
   
@@@ -3650,8 -3661,7 +3661,7 @@@ util_est_dequeue(struct cfs_rq *cfs_rq
   
         /* Update root cfs_rq's estimated utilization */
         ue.enqueued  = cfs_rq->avg.util_est.enqueued;
-       ue.enqueued -= min_t(unsigned int, ue.enqueued,
-                            (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+       ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
         WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
   
         /*
@@@ -3966,8 -3976,8 +3976,8 @@@ dequeue_entity(struct cfs_rq *cfs_rq, s
         /*
          * When dequeuing a sched_entity, we must:
          *   - Update loads to have both entity and cfs_rq synced with now.
-        *   - Substract its load from the cfs_rq->runnable_avg.
-        *   - Substract its previous weight from cfs_rq->load.weight.
+        *   - Subtract its load from the cfs_rq->runnable_avg.
+        *   - Subtract its previous weight from cfs_rq->load.weight.
          *   - For group entity, update its weight to reflect the new share
          *     of its group cfs_rq.
          */
@@@ -4640,7 -4650,7 +4650,7 @@@ static int do_sched_cfs_period_timer(st
                 cfs_b->distribute_running = 0;
                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
   
-               cfs_b->runtime -= min(runtime, cfs_b->runtime);
+               lsub_positive(&cfs_b->runtime, runtime);
         }
   
         /*
@@@ -4774,7 -4784,7 +4784,7 @@@ static void do_sched_cfs_slack_timer(st
   
         raw_spin_lock(&cfs_b->lock);
         if (expires == cfs_b->runtime_expires)
-               cfs_b->runtime -= min(runtime, cfs_b->runtime);
+               lsub_positive(&cfs_b->runtime, runtime);
         cfs_b->distribute_running = 0;
         raw_spin_unlock(&cfs_b->lock);
   }
@@@ -5072,6 -5082,24 +5082,24 @@@ static inline void hrtick_update(struc
   }
   #endif
   
+ #ifdef CONFIG_SMP
+ static inline unsigned long cpu_util(int cpu);
+ static unsigned long capacity_of(int cpu);
+ 
+ static inline bool cpu_overutilized(int cpu)
+ {
+       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+ }
+ 
+ static inline void update_overutilized_status(struct rq *rq)
+ {
+       if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+               WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+ }
+ #else
+ static inline void update_overutilized_status(struct rq *rq) { }
+ #endif
+ 
   /*
    * The enqueue_task method is called before nr_running is
    * increased. Here we update the fair scheduling stats and
@@@ -5129,8 -5157,26 +5157,26 @@@ enqueue_task_fair(struct rq *rq, struc
                 update_cfs_group(se);
         }
   
-       if (!se)
+       if (!se) {
                 add_nr_running(rq, 1);
+               /*
+                * Since new tasks are assigned an initial util_avg equal to
+                * half of the spare capacity of their CPU, tiny tasks have the
+                * ability to cross the overutilized threshold, which will
+                * result in the load balancer ruining all the task placement
+                * done by EAS. As a way to mitigate that effect, do not account
+                * for the first enqueue operation of new tasks during the
+                * overutilized flag detection.
+                *
+                * A better way of solving this problem would be to wait for
+                * the PELT signals of tasks to converge before taking them
+                * into account, but that is not straightforward to implement,
+                * and the following generally works well enough in practice.
+                */
+               if (flags & ENQUEUE_WAKEUP)
+                       update_overutilized_status(rq);
+ 
+       }
   
         hrtick_update(rq);
   }
@@@ -6241,7 -6287,7 +6287,7 @@@ static unsigned long cpu_util_without(i
         util = READ_ONCE(cfs_rq->avg.util_avg);
   
         /* Discount task's util from CPU's util */
-       util -= min_t(unsigned int, util, task_util(p));
+       lsub_positive(&util, task_util(p));
   
         /*
          * Covered cases:
@@@ -6290,10 -6336,9 +6336,9 @@@
                  * properly fix the execl regression and it helps in further
                  * reducing the chances for the above race.
                  */
-               if (unlikely(task_on_rq_queued(p) || current == p)) {
-                       estimated -= min_t(unsigned int, estimated,
-                                          (_task_util_est(p) | UTIL_AVG_UNCHANGED));
-               }
+               if (unlikely(task_on_rq_queued(p) || current == p))
+                       lsub_positive(&estimated, _task_util_est(p));
+ 
                 util = max(util, estimated);
         }
   
@@@ -6332,6 -6377,213 +6377,213 @@@ static int wake_cap(struct task_struct 
         return !task_fits_capacity(p, min_cap);
   }
   
+ /*
+  * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
+  * to @dst_cpu.
+  */
+ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+ {
+       struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+       unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+ 
+       /*
+        * If @p migrates from @cpu to another, remove its contribution. Or,
+        * if @p migrates from another CPU to @cpu, add its contribution. In
+        * the other cases, @cpu is not impacted by the migration, so the
+        * util_avg should already be correct.
+        */
+       if (task_cpu(p) == cpu && dst_cpu != cpu)
+               sub_positive(&util, task_util(p));
+       else if (task_cpu(p) != cpu && dst_cpu == cpu)
+               util += task_util(p);
+ 
+       if (sched_feat(UTIL_EST)) {
+               util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+ 
+               /*
+                * During wake-up, the task isn't enqueued yet and doesn't
+                * appear in the cfs_rq->avg.util_est.enqueued of any rq,
+                * so just add it (if needed) to "simulate" what will be
+                * cpu_util() after the task has been enqueued.
+                */
+               if (dst_cpu == cpu)
+                       util_est += _task_util_est(p);
+ 
+               util = max(util, util_est);
+       }
+ 
+       return min(util, capacity_orig_of(cpu));
+ }
+ 
+ /*
+  * compute_energy(): Estimates the energy that would be consumed if @p was
+  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
+  * landscape of the CPUs after the task migration, and uses the Energy Model
+  * to compute what would be the energy if we decided to actually migrate that
+  * task.
+  */
+ static long
+ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+ {
+       long util, max_util, sum_util, energy = 0;
+       int cpu;
+ 
+       for (; pd; pd = pd->next) {
+               max_util = sum_util = 0;
+               /*
+                * The capacity state of CPUs of the current rd can be driven by
+                * CPUs of another rd if they belong to the same performance
+                * domain. So, account for the utilization of these CPUs too
+                * by masking pd with cpu_online_mask instead of the rd span.
+                *
+                * If an entire performance domain is outside of the current rd,
+                * it will not appear in its pd list and will not be accounted
+                * by compute_energy().
+                */
+               for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
+                       util = cpu_util_next(cpu, p, dst_cpu);
+                       util = schedutil_energy_util(cpu, util);
+                       max_util = max(util, max_util);
+                       sum_util += util;
+               }
+ 
+               energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+       }
+ 
+       return energy;
+ }
+ 
+ /*
+  * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
+  * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
+  * spare capacity in each performance domain and uses it as a potential
+  * candidate to execute the task. Then, it uses the Energy Model to figure
+  * out which of the CPU candidates is the most energy-efficient.
+  *
+  * The rationale for this heuristic is as follows. In a performance domain,
+  * all the most energy efficient CPU candidates (according to the Energy
+  * Model) are those for which we'll request a low frequency. When there are
+  * several CPUs for which the frequency request will be the same, we don't
+  * have enough data to break the tie between them, because the Energy Model
+  * only includes active power costs. With this model, if we assume that
+  * frequency requests follow utilization (e.g. using schedutil), the CPU with
+  * the maximum spare capacity in a performance domain is guaranteed to be among
+  * the best candidates of the performance domain.
+  *
+  * In practice, it could be preferable from an energy standpoint to pack
+  * small tasks on a CPU in order to let other CPUs go in deeper idle states,
+  * but that could also hurt our chances to go cluster idle, and we have no
+  * way to tell with the current Energy Model if this is actually a good
+  * idea or not. So, find_energy_efficient_cpu() basically favors
+  * cluster-packing, and spreading inside a cluster. That should at least be
+  * a good thing for latency, and this is consistent with the idea that most
+  * of the energy savings of EAS come from the asymmetry of the system, and
+  * not so much from breaking the tie between identical CPUs. That's also the
+  * reason why EAS is enabled in the topology code only for systems where
+  * SD_ASYM_CPUCAPACITY is set.
+  *
+  * NOTE: Forkees are not accepted in the energy-aware wake-up path because
+  * they don't have any useful utilization data yet and it's not possible to
+  * forecast their impact on energy consumption. Consequently, they will be
+  * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+  * to be energy-inefficient in some use-cases. The alternative would be to
+  * bias new tasks towards specific types of CPUs first, or to try to infer
+  * their util_avg from the parent task, but those heuristics could hurt
+  * other use-cases too. So, until someone finds a better way to solve this,
+  * let's keep things simple by re-using the existing slow path.
+  */
+ 
+ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
+ {
+       unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+       struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+       int cpu, best_energy_cpu = prev_cpu;
+       struct perf_domain *head, *pd;
+       unsigned long cpu_cap, util;
+       struct sched_domain *sd;
+ 
+       rcu_read_lock();
+       pd = rcu_dereference(rd->pd);
+       if (!pd || READ_ONCE(rd->overutilized))
+               goto fail;
+       head = pd;
+ 
+       /*
+        * Energy-aware wake-up happens on the lowest sched_domain starting
+        * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
+        */
+       sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+       while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+               sd = sd->parent;
+       if (!sd)
+               goto fail;
+ 
+       sync_entity_load_avg(&p->se);
+       if (!task_util_est(p))
+               goto unlock;
+ 
+       for (; pd; pd = pd->next) {
+               unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+               int max_spare_cap_cpu = -1;
+ 
+               for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
+                       if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+                               continue;
+ 
+                       /* Skip CPUs that will be overutilized. */
+                       util = cpu_util_next(cpu, p, cpu);
+                       cpu_cap = capacity_of(cpu);
+                       if (cpu_cap * 1024 < util * capacity_margin)
+                               continue;
+ 
+                       /* Always use prev_cpu as a candidate. */
+                       if (cpu == prev_cpu) {
+                               prev_energy = compute_energy(p, prev_cpu, head);
+                               best_energy = min(best_energy, prev_energy);
+                               continue;
+                       }
+ 
+                       /*
+                        * Find the CPU with the maximum spare capacity in
+                        * the performance domain
+                        */
+                       spare_cap = cpu_cap - util;
+                       if (spare_cap > max_spare_cap) {
+                               max_spare_cap = spare_cap;
+                               max_spare_cap_cpu = cpu;
+                       }
+               }
+ 
+               /* Evaluate the energy impact of using this CPU. */
+               if (max_spare_cap_cpu >= 0) {
+                       cur_energy = compute_energy(p, max_spare_cap_cpu, head);
+                       if (cur_energy < best_energy) {
+                               best_energy = cur_energy;
+                               best_energy_cpu = max_spare_cap_cpu;
+                       }
+               }
+       }
+ unlock:
+       rcu_read_unlock();
+ 
+       /*
+        * Pick the best CPU if prev_cpu cannot be used, or if it saves at
+        * least ~6% (1/16th) of the energy used by prev_cpu.
+        */
+       if (prev_energy == ULONG_MAX)
+               return best_energy_cpu;
+ 
+       if ((prev_energy - best_energy) > (prev_energy >> 4))
+               return best_energy_cpu;
+ 
+       return prev_cpu;
+ 
+ fail:
+       rcu_read_unlock();
+ 
+       return -1;
+ }
+ 
   /*
    * select_task_rq_fair: Select target runqueue for the waking task in domains
    * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@@ -6355,8 -6607,16 +6607,16 @@@ select_task_rq_fair(struct task_struct 
   
         if (sd_flag & SD_BALANCE_WAKE) {
                 record_wakee(p);
-               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
-                             && cpumask_test_cpu(cpu, &p->cpus_allowed);
+ 
+               if (static_branch_unlikely(&sched_energy_present)) {
+                       new_cpu = find_energy_efficient_cpu(p, prev_cpu);
+                       if (new_cpu >= 0)
+                               return new_cpu;
+                       new_cpu = prev_cpu;
+               }
+ 
+               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+                             cpumask_test_cpu(cpu, &p->cpus_allowed);
         }
   
         rcu_read_lock();
@@@ -6520,7 -6780,7 +6780,7 @@@ wakeup_preempt_entity(struct sched_enti
   
   static void set_last_buddy(struct sched_entity *se)
   {
-       if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+       if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
                 return;
   
         for_each_sched_entity(se) {
@@@ -6532,7 -6792,7 +6792,7 @@@
   
   static void set_next_buddy(struct sched_entity *se)
   {
-       if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+       if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
                 return;
   
         for_each_sched_entity(se) {
@@@ -6590,8 -6850,8 +6850,8 @@@ static void check_preempt_wakeup(struc
                 return;
   
         /* Idle tasks are by definition preempted by non-idle tasks. */
-       if (unlikely(curr->policy == SCHED_IDLE) &&
-           likely(p->policy != SCHED_IDLE))
+       if (unlikely(task_has_idle_policy(curr)) &&
+           likely(!task_has_idle_policy(p)))
                 goto preempt;
   
         /*
@@@ -7012,7 -7272,7 +7272,7 @@@ static int task_hot(struct task_struct 
         if (p->sched_class != &fair_sched_class)
                 return 0;
   
-       if (unlikely(p->policy == SCHED_IDLE))
+       if (unlikely(task_has_idle_policy(p)))
                 return 0;
   
         /*
@@@ -7896,16 -8156,16 +8156,16 @@@ static bool update_nohz_stats(struct r
    * update_sg_lb_stats - Update sched_group's statistics for load balancing.
    * @env: The load balancing environment.
    * @group: sched_group whose statistics are to be updated.
-  * @load_idx: Load index of sched_domain of this_cpu for load calc.
-  * @local_group: Does group contain this_cpu.
    * @sgs: variable to hold the statistics for this group.
-  * @overload: Indicate pullable load (e.g. >1 runnable task).
+  * @sg_status: Holds flag indicating the status of the sched_group
    */
   static inline void update_sg_lb_stats(struct lb_env *env,
-                       struct sched_group *group, int load_idx,
-                       int local_group, struct sg_lb_stats *sgs,
-                       bool *overload)
+                                     struct sched_group *group,
+                                     struct sg_lb_stats *sgs,
+                                     int *sg_status)
   {
+       int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+       int load_idx = get_sd_load_idx(env->sd, env->idle);
         unsigned long load;
         int i, nr_running;
   
@@@ -7929,7 -8189,10 +8189,10 @@@
   
                 nr_running = rq->nr_running;
                 if (nr_running > 1)
-                       *overload = true;
+                       *sg_status |= SG_OVERLOAD;
+ 
+               if (cpu_overutilized(i))
+                       *sg_status |= SG_OVERUTILIZED;
   
   #ifdef CONFIG_NUMA_BALANCING
                 sgs->nr_numa_running += rq->nr_numa_running;
@@@ -7945,7 -8208,7 +8208,7 @@@
                 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
                     sgs->group_misfit_task_load < rq->misfit_task_load) {
                         sgs->group_misfit_task_load = rq->misfit_task_load;
-                       *overload = 1;
+                       *sg_status |= SG_OVERLOAD;
                 }
         }
   
@@@ -8090,17 -8353,14 +8353,14 @@@ static inline void update_sd_lb_stats(s
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
         struct sg_lb_stats tmp_sgs;
-       int load_idx;
-       bool overload = false;
         bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+       int sg_status = 0;
   
   #ifdef CONFIG_NO_HZ_COMMON
         if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
                 env->flags |= LBF_NOHZ_STATS;
   #endif
   
-       load_idx = get_sd_load_idx(env->sd, env->idle);
- 
         do {
                 struct sg_lb_stats *sgs = &tmp_sgs;
                 int local_group;
@@@ -8115,8 -8375,7 +8375,7 @@@
                                 update_group_capacity(env->sd, env->dst_cpu);
                 }
   
-               update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-                                               &overload);
+               update_sg_lb_stats(env, sg, sgs, &sg_status);
   
                 if (local_group)
                         goto next_group;
@@@ -8165,9 -8424,15 +8424,15 @@@ next_group
                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
   
         if (!env->sd->parent) {
+               struct root_domain *rd = env->dst_rq->rd;
+ 
                 /* update overload indicator if we are at root domain */
-               if (READ_ONCE(env->dst_rq->rd->overload) != overload)
-                       WRITE_ONCE(env->dst_rq->rd->overload, overload);
+               WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+ 
+               /* Update over-utilization (tipping point, U >= 0) indicator */
+               WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+       } else if (sg_status & SG_OVERUTILIZED) {
+               WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
         }
   }
   
@@@ -8394,6 -8659,14 +8659,14 @@@ static struct sched_group *find_busiest
          * this level.
          */
         update_sd_lb_stats(env, &sds);
+ 
+       if (static_branch_unlikely(&sched_energy_present)) {
+               struct root_domain *rd = env->dst_rq->rd;
+ 
+               if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+                       goto out_balanced;
+       }
+ 
         local = &sds.local_stat;
         busiest = &sds.busiest_stat;
   
@@@ -8910,13 -9183,22 +9183,22 @@@ out_all_pinned
         sd->nr_balance_failed = 0;
   
   out_one_pinned:
+       ld_moved = 0;
+ 
+       /*
+        * idle_balance() disregards balance intervals, so we could repeatedly
+        * reach this code, which would lead to balance_interval skyrocketing
+        * in a short amount of time. Skip the balance_interval increase logic
+        * to avoid that.
+        */
+       if (env.idle == CPU_NEWLY_IDLE)
+               goto out;
+ 
         /* tune up the balancing interval */
-       if (((env.flags & LBF_ALL_PINNED) &&
-                       sd->balance_interval < MAX_PINNED_INTERVAL) ||
-                       (sd->balance_interval < sd->max_interval))
+       if ((env.flags & LBF_ALL_PINNED &&
+            sd->balance_interval < MAX_PINNED_INTERVAL) ||
+           sd->balance_interval < sd->max_interval)
                 sd->balance_interval *= 2;
- 
-       ld_moved = 0;
   out:
         return ld_moved;
   }
@@@ -9281,7 -9563,7 +9563,7 @@@ static void nohz_balancer_kick(struct r
                 }
         }
   
-       sd = rcu_dereference(per_cpu(sd_asym, cpu));
+       sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
         if (sd) {
                 for_each_cpu(i, sched_domain_span(sd)) {
                         if (i == cpu ||
@@@ -9533,7 -9815,9 +9815,7 @@@ static bool nohz_idle_balance(struct r
                 return false;
         }
   
- -      /*
- -       * barrier, pairs with nohz_balance_enter_idle(), ensures ...
- -       */
+ +      /* could be _relaxed() */
         flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
         if (!(flags & NOHZ_KICK_MASK))
                 return false;
@@@ -9783,6 -10067,7 +10065,7 @@@ static void task_tick_fair(struct rq *r
                 task_tick_numa(rq, curr);
   
         update_misfit_status(curr, rq);
+       update_overutilized_status(task_rq(curr));
   }
   
   /*
author	Linus Torvalds <[email protected]>
	Wed, 26 Dec 2018 22:56:10 +0000 (14:56 -0800)
committer	Linus Torvalds <[email protected]>
	Wed, 26 Dec 2018 22:56:10 +0000 (14:56 -0800)
		1	2
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/cpufreq_schedutil.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/fair.c	patch \|	diff1 \|	diff2 \|	blob \| history