struct task_struct;
+ #ifdef CONFIG_PROVE_RCU
+ extern int lockdep_tasklist_lock_is_held(void);
+ #endif /* #ifdef CONFIG_PROVE_RCU */
+
extern void sched_init(void);
extern void sched_init_smp(void);
extern asmlinkage void schedule_tail(struct task_struct *prev);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
extern int select_nohz_load_balancer(int cpu);
extern int get_nohz_load_balancer(void);
+extern int nohz_ratelimit(int cpu);
#else
static inline int select_nohz_load_balancer(int cpu)
{
return 0;
}
+
+static inline int nohz_ratelimit(int cpu)
+{
+ return 0;
+}
#endif
/*
static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
#endif
- #if USE_SPLIT_PTLOCKS
- /*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
- #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
- #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
- #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
- #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
- #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
- #else /* !USE_SPLIT_PTLOCKS */
- /*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
- #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
- #define get_mm_counter(mm, member) ((mm)->_##member)
- #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
- #define inc_mm_counter(mm, member) (mm)->_##member++
- #define dec_mm_counter(mm, member) (mm)->_##member--
-
- #endif /* !USE_SPLIT_PTLOCKS */
-
- #define get_mm_rss(mm) \
- (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
- #define update_hiwater_rss(mm) do { \
- unsigned long _rss = get_mm_rss(mm); \
- if ((mm)->hiwater_rss < _rss) \
- (mm)->hiwater_rss = _rss; \
- } while (0)
- #define update_hiwater_vm(mm) do { \
- if ((mm)->hiwater_vm < (mm)->total_vm) \
- (mm)->hiwater_vm = (mm)->total_vm; \
- } while (0)
-
- static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
- {
- return max(mm->hiwater_rss, get_mm_rss(mm));
- }
-
- static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
- struct mm_struct *mm)
- {
- unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
-
- if (*maxrss < hiwater_rss)
- *maxrss = hiwater_rss;
- }
-
- static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
- {
- return max(mm->hiwater_vm, mm->total_vm);
- }
extern void set_dumpable(struct mm_struct *mm, int value);
extern int get_dumpable(struct mm_struct *mm);
unsigned long weight, inv_weight;
};
-/*
- * CFS stats for a schedulable entity (task, task-group etc)
- *
- * Current field usage histogram:
- *
- * 4 se->block_start
- * 4 se->run_node
- * 4 se->sleep_start
- * 6 se->load.weight
- */
-struct sched_entity {
- struct load_weight load; /* for load-balancing */
- struct rb_node run_node;
- struct list_head group_node;
- unsigned int on_rq;
-
- u64 exec_start;
- u64 sum_exec_runtime;
- u64 vruntime;
- u64 prev_sum_exec_runtime;
-
- u64 last_wakeup;
- u64 avg_overlap;
-
- u64 nr_migrations;
-
- u64 start_runtime;
- u64 avg_wakeup;
-
#ifdef CONFIG_SCHEDSTATS
+struct sched_statistics {
u64 wait_start;
u64 wait_max;
u64 wait_count;
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
+};
+#endif
+
+struct sched_entity {
+ struct load_weight load; /* for load-balancing */
+ struct rb_node run_node;
+ struct list_head group_node;
+ unsigned int on_rq;
+
+ u64 exec_start;
+ u64 sum_exec_runtime;
+ u64 vruntime;
+ u64 prev_sum_exec_runtime;
+
+ u64 nr_migrations;
+
+#ifdef CONFIG_SCHEDSTATS
+ struct sched_statistics statistics;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
struct plist_node pushable_tasks;
struct mm_struct *mm, *active_mm;
-
+ #if defined(SPLIT_RSS_COUNTING)
+ struct task_rss_stat rss_stat;
+ #endif
/* task state */
int exit_state;
int exit_code, exit_signal;
struct list_head *scm_work_list;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- /* Index of current stored adress in ret_stack */
+ /* Index of current stored address in ret_stack */
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
static inline void thread_group_cputime_init(struct signal_struct *sig)
{
- sig->cputimer.cputime = INIT_CPUTIME;
spin_lock_init(&sig->cputimer.lock);
- sig->cputimer.running = 0;
}
static inline void thread_group_cputime_free(struct signal_struct *sig)
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
+ u64 nohz_stamp;
unsigned char in_nohz_recently;
#endif
+ unsigned int skip_clock_update;
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (test_tsk_need_resched(p))
+ rq->skip_clock_update = 1;
}
static inline int cpu_of(struct rq *rq)
inline void update_rq_clock(struct rq *rq)
{
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ if (!rq->skip_clock_update)
+ rq->clock = sched_clock_cpu(cpu_of(rq));
}
/*
if (!tsk_is_polling(rq->idle))
smp_send_reschedule(cpu);
}
+
+int nohz_ratelimit(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 diff = rq->clock - rq->nohz_stamp;
+
+ rq->nohz_stamp = rq->clock;
+
+ return diff < (NSEC_PER_SEC / HZ) >> 1;
+}
+
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
- static __read_mostly unsigned long *update_shares_data;
+ static __read_mostly unsigned long __percpu *update_shares_data;
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
}
}
- update_rq_clock(rq1);
- update_rq_clock(rq2);
}
/*
static void
enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
{
- if (wakeup)
- p->se.start_runtime = p->se.sum_exec_runtime;
-
+ update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, wakeup, head);
p->se.on_rq = 1;
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
- if (sleep) {
- if (p->se.last_wakeup) {
- update_avg(&p->se.avg_overlap,
- p->se.sum_exec_runtime - p->se.last_wakeup);
- p->se.last_wakeup = 0;
- } else {
- update_avg(&p->se.avg_wakeup,
- sysctl_sched_wakeup_granularity);
- }
- }
-
+ update_rq_clock(rq);
sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, sleep);
p->se.on_rq = 0;
unsigned long flags;
struct rq *rq;
- if (!sched_feat(SYNC_WAKEUPS))
- wake_flags &= ~WF_SYNC;
-
this_cpu = get_cpu();
smp_wmb();
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
if (!(p->state & state))
goto out;
rq = cpu_rq(cpu);
raw_spin_lock(&rq->lock);
- update_rq_clock(rq);
/*
* We migrated the task without holding either rq->lock, however
out_activate:
#endif /* CONFIG_SMP */
- schedstat_inc(p, se.nr_wakeups);
+ schedstat_inc(p, se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.nr_wakeups_sync);
+ schedstat_inc(p, se.statistics.nr_wakeups_sync);
if (orig_cpu != cpu)
- schedstat_inc(p, se.nr_wakeups_migrate);
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
if (cpu == this_cpu)
- schedstat_inc(p, se.nr_wakeups_local);
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
else
- schedstat_inc(p, se.nr_wakeups_remote);
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
activate_task(rq, p, 1);
success = 1;
- /*
- * Only attribute actual wakeups done by this task.
- */
- if (!in_interrupt()) {
- struct sched_entity *se = ¤t->se;
- u64 sample = se->sum_exec_runtime;
-
- if (se->last_wakeup)
- sample -= se->last_wakeup;
- else
- sample -= se->start_runtime;
- update_avg(&se->avg_wakeup, sample);
-
- se->last_wakeup = se->sum_exec_runtime;
- }
-
out_running:
trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, wake_flags);
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
- p->se.last_wakeup = 0;
- p->se.avg_overlap = 0;
- p->se.start_runtime = 0;
- p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.wait_max = 0;
- p->se.wait_count = 0;
- p->se.wait_sum = 0;
-
- p->se.sleep_start = 0;
- p->se.sleep_max = 0;
- p->se.sum_sleep_runtime = 0;
-
- p->se.block_start = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
-
- p->se.nr_migrations_cold = 0;
- p->se.nr_failed_migrations_affine = 0;
- p->se.nr_failed_migrations_running = 0;
- p->se.nr_failed_migrations_hot = 0;
- p->se.nr_forced_migrations = 0;
-
- p->se.nr_wakeups = 0;
- p->se.nr_wakeups_sync = 0;
- p->se.nr_wakeups_migrate = 0;
- p->se.nr_wakeups_local = 0;
- p->se.nr_wakeups_remote = 0;
- p->se.nr_wakeups_affine = 0;
- p->se.nr_wakeups_affine_attempts = 0;
- p->se.nr_wakeups_passive = 0;
- p->se.nr_wakeups_idle = 0;
-
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
INIT_LIST_HEAD(&p->rt.run_list);
{
unsigned long flags;
struct rq *rq;
- int cpu = get_cpu();
+ int cpu __maybe_unused = get_cpu();
#ifdef CONFIG_SMP
/*
BUG_ON(p->state != TASK_WAKING);
p->state = TASK_RUNNING;
- update_rq_clock(rq);
activate_task(rq, p, 0);
trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, WF_FORK);
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->state == TASK_RUNNING) {
- u64 runtime = prev->se.sum_exec_runtime;
-
- runtime -= prev->se.prev_sum_exec_runtime;
- runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-
- /*
- * In order to avoid avg_overlap growing stale when we are
- * indeed overlapping and hence not getting put to sleep, grow
- * the avg_overlap on preemption.
- *
- * We use the average preemption runtime because that
- * correlates to the amount of cache footprint a task can
- * build up.
- */
- update_avg(&prev->se.avg_overlap, runtime);
- }
+ if (prev->se.on_rq)
+ update_rq_clock(rq);
+ rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
oldprio = p->prio;
prev_class = p->sched_class;
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
/* convert nice value [19,-20] to rlimit style value [1,40] */
int nice_rlim = 20 - nice;
- return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
}
if (!lock_task_sighand(p, &flags))
return -ESRCH;
- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
unlock_task_sighand(p, &flags);
/* can't set/change the rt policy */
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
- update_rq_clock(rq);
on_rq = p->se.on_rq;
running = task_current(rq, p);
if (on_rq)
int ret;
cpumask_var_t mask;
- if (len < cpumask_size())
+ if (len < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
return -EINVAL;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
ret = -EFAULT;
else
- ret = cpumask_size();
+ ret = retlen;
}
free_cpumask_var(mask);
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
- update_rq_clock(rq);
activate_task(rq, p, 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
for ( ; ; ) {
if (!rq->nr_running)
break;
- update_rq_clock(rq);
next = pick_next_task(rq);
if (!next)
break;
rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
deactivate_task(rq, rq->idle, 0);
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
rq->idle->sched_class = &idle_sched_class;
#ifdef CONFIG_SCHED_MC
static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
char *page)
{
return sprintf(page, "%u\n", sched_mc_power_savings);
}
static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
#ifdef CONFIG_SCHED_SMT
static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+ struct sysdev_class_attribute *attr,
char *page)
{
return sprintf(page, "%u\n", sched_smt_power_savings);
}
static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+ struct sysdev_class_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
{
int on_rq;
- update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
p->se.exec_start = 0;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.sleep_start = 0;
- p->se.block_start = 0;
+ p->se.statistics.wait_start = 0;
+ p->se.statistics.sleep_start = 0;
+ p->se.statistics.block_start = 0;
#endif
if (!rt_task(p)) {
rq = task_rq_lock(tsk, &flags);
- update_rq_clock(rq);
-
running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
- u64 *cpuusage;
+ u64 __percpu *cpuusage;
struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
-unsigned int sysctl_sched_latency = 5000000ULL;
-unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 1000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+unsigned int sysctl_sched_min_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
-static unsigned int sched_nr_latency = 5;
+static unsigned int sched_nr_latency = 3;
/*
* After fork, child runs first. If set to 0 (default) then
{
unsigned long delta_exec_weighted;
- schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+ schedstat_set(curr->statistics.exec_max,
+ max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+ schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
/*
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_max, max(se->wait_max,
- rq_of(cfs_rq)->clock - se->wait_start));
- schedstat_set(se->wait_count, se->wait_count + 1);
- schedstat_set(se->wait_sum, se->wait_sum +
- rq_of(cfs_rq)->clock - se->wait_start);
+ schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+ rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+ schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->wait_start);
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
}
#endif
- schedstat_set(se->wait_start, 0);
+ schedstat_set(se->statistics.wait_start, 0);
}
static inline void
if (entity_is_task(se))
tsk = task_of(se);
- if (se->sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+ if (se->statistics.sleep_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->sleep_max))
- se->sleep_max = delta;
+ if (unlikely(delta > se->statistics.sleep_max))
+ se->statistics.sleep_max = delta;
- se->sleep_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.sleep_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
- if (se->block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+ if (se->statistics.block_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->block_max))
- se->block_max = delta;
+ if (unlikely(delta > se->statistics.block_max))
+ se->statistics.block_max = delta;
- se->block_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.block_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
- se->iowait_sum += delta;
- se->iowait_count++;
+ se->statistics.iowait_sum += delta;
+ se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
- if (!initial && sched_feat(FAIR_SLEEPERS)) {
+ if (!initial) {
unsigned long thresh = sysctl_sched_latency;
- /*
- * Convert the sleeper threshold into virtual time.
- * SCHED_IDLE is a special sub-class. We care about
- * fairness only relative to other SCHED_IDLE tasks,
- * all of which have the same weight.
- */
- if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
- task_of(se)->policy != SCHED_IDLE))
- thresh = calc_delta_fair(thresh, se);
-
/*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- se->sleep_start = rq_of(cfs_rq)->clock;
+ se->statistics.sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->block_start = rq_of(cfs_rq)->clock;
+ se->statistics.block_start = rq_of(cfs_rq)->clock;
}
#endif
}
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->slice_max = max(se->slice_max,
+ se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- struct task_struct *curr = current;
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
- if (sync) {
- if (sched_feat(SYNC_LESS) &&
- (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
- } else {
- if (sched_feat(SYNC_MORE) &&
- (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost))
- sync = 1;
- }
-
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
if (sync && balanced)
return 1;
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
if (balanced ||
* there is no bad imbalance.
*/
schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
return 1;
}
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int new_cpu = cpu;
- int want_affine = 0;
+ int want_affine = 0, cpu_idle = !current->pid;
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE) {
- if (sched_feat(AFFINE_WAKEUPS) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed))
want_affine = 1;
new_cpu = prev_cpu;
}
* If there's an idle sibling in this domain, make that
* the wake_affine target instead of the current cpu.
*/
- if (tmp->flags & SD_SHARE_PKG_RESOURCES)
+ if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES)
target = select_idle_sibling(p, tmp, target);
if (target >= 0) {
if (tmp->flags & SD_WAKE_AFFINE) {
affine_sd = tmp;
want_affine = 0;
+ if (target != cpu)
+ cpu_idle = 1;
}
cpu = target;
}
sd = tmp;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (sched_feat(LB_SHARES_UPDATE)) {
/*
* Pick the largest domain to update shares over
if (tmp)
update_shares(tmp);
}
+#endif
- if (affine_sd && wake_affine(affine_sd, p, sync))
- return cpu;
+ if (affine_sd) {
+ if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+ return cpu;
+ }
while (sd) {
int load_idx = sd->forkexec_idx;
}
#endif /* CONFIG_SMP */
-/*
- * Adaptive granularity
- *
- * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- * with the limit of wakeup_gran -- when it never does a wakeup.
- *
- * So the smaller avg_wakeup is the faster we want this task to preempt,
- * but we don't want to treat the preemptee unfairly and therefore allow it
- * to run for at least the amount of time we'd like to run.
- *
- * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- *
- * NOTE: we use *nr_running to scale with load, this nicely matches the
- * degrading latency on load.
- */
-static unsigned long
-adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
-{
- u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
- u64 gran = 0;
-
- if (this_run < expected_wakeup)
- gran = expected_wakeup - this_run;
-
- return min_t(s64, gran, sysctl_sched_wakeup_granularity);
-}
-
static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
- if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
- gran = adaptive_gran(curr, se);
-
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
*/
- if (sched_feat(ASYM_GRAN)) {
- /*
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- if (unlikely(se->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, se);
- } else {
- if (unlikely(curr->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, curr);
- }
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, se);
return gran;
}
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int sync = wake_flags & WF_SYNC;
int scale = cfs_rq->nr_running >= sched_nr_latency;
if (unlikely(rt_prio(p->prio)))
if (unlikely(curr->policy == SCHED_IDLE))
goto preempt;
- if (sched_feat(WAKEUP_SYNC) && sync)
- goto preempt;
-
- if (sched_feat(WAKEUP_OVERLAP) &&
- se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost)
- goto preempt;
-
if (!sched_feat(WAKEUP_PREEMPT))
return;
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
- schedstat_inc(p, se.nr_failed_migrations_affine);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
return 0;
}
*all_pinned = 0;
if (task_running(rq, p)) {
- schedstat_inc(p, se.nr_failed_migrations_running);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
- schedstat_inc(p, se.nr_forced_migrations);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
if (tsk_cache_hot) {
- schedstat_inc(p, se.nr_failed_migrations_hot);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
return 1;
/* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq);
- update_rq_clock(busiest_rq);
- update_rq_clock(target_rq);
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
static inline int on_null_domain(int cpu)
{
- return !rcu_dereference(cpu_rq(cpu)->sd);
+ return !rcu_dereference_sched(cpu_rq(cpu)->sd);
}
/*
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
- schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+ schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
if (!p->signal)
return;
- soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
- hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
+ /* max may change after cur was read, this will be fixed next tick */
+ soft = task_rlimit(p, RLIMIT_RTTIME);
+ hard = task_rlimit_max(p, RLIMIT_RTTIME);
if (soft != RLIM_INFINITY) {
unsigned long next;