#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/fs_parser.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;
+enum proc_mem_force {
+ PROC_MEM_FORCE_ALWAYS,
+ PROC_MEM_FORCE_PTRACE,
+ PROC_MEM_FORCE_NEVER
+};
+
+static enum proc_mem_force proc_mem_force_override __ro_after_init =
+ IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
+ IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
+ PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+ { "always", PROC_MEM_FORCE_ALWAYS },
+ { "ptrace", PROC_MEM_FORCE_PTRACE },
+ { "never", PROC_MEM_FORCE_NEVER },
+ { }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ /*
+ * lookup_constant() defaults to proc_mem_force_override to preserve
+ * the initial Kconfig choice in case an invalid param gets passed.
+ */
+ proc_mem_force_override = lookup_constant(proc_mem_force_table,
+ buf, proc_mem_force_override);
+
+ return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
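+
+/*
+ * Usage sketch (illustration only): the Kconfig default is overridden
+ * at boot with one of the proc_mem_force_table strings above, e.g. on
+ * the kernel command line:
+ *
+ *   proc_mem.force_override=ptrace
+ *
+ * An unrecognized value keeps the Kconfig choice, since
+ * lookup_constant() falls back to proc_mem_force_override.
+ */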
+
struct pid_entry {
const char *name;
unsigned int len;
static int mem_open(struct inode *inode, struct file *file)
{
- int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+ return -EINVAL;
+ return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}
- return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+ struct task_struct *task;
+ bool ptrace_active = false;
+
+ switch (proc_mem_force_override) {
+ case PROC_MEM_FORCE_NEVER:
+ return false;
+ case PROC_MEM_FORCE_PTRACE:
+ task = get_proc_task(file_inode(file));
+ if (task) {
+ ptrace_active = READ_ONCE(task->ptrace) &&
+ READ_ONCE(task->mm) == mm &&
+ READ_ONCE(task->parent) == current;
+ put_task_struct(task);
+ }
+ return ptrace_active;
+ default:
+ return true;
+ }
}
static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+ flags = write ? FOLL_WRITE : 0;
+ if (proc_mem_foll_force(file, mm))
+ flags |= FOLL_FORCE;
while (count > 0) {
size_t this_len = min_t(size_t, count, PAGE_SIZE);
.write = mem_write,
.open = mem_open,
.release = mem_release,
+ .fop_flags = FOP_UNSIGNED_OFFSET,
};
static int environ_open(struct inode *inode, struct file *file)
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
- d_set_d_op(dentry, &tid_map_files_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_map_files_dentry_operations);
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
if (!tp->sighand)
return ERR_PTR(-ESRCH);
- return seq_list_start(&tp->task->signal->posix_timers, *pos);
+ return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+ return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
[SIGEV_THREAD] = "thread",
};
- timer = list_entry((struct list_head *)v, struct k_itimer, list);
+ timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
notify = timer->it_sigev_notify;
seq_printf(m, "ID: %d\n", timer->it_id);
}
task_lock(p);
- if (task_is_realtime(p))
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
++ if (rt_or_dl_task_policy(p))
+ slack_ns = 0;
+ else if (slack_ns == 0)
+ slack_ns = p->default_timer_slack_ns;
+ p->timer_slack_ns = slack_ns;
task_unlock(p);
out:
if (!dir_emit_dots(file, ctx))
return 0;
- /* f_version caches the tgid value that the last readdir call couldn't
- * return. lseek aka telldir automagically resets f_version to 0.
+ /* We cache the tgid value that the last readdir call couldn't
+ * return and lseek resets it to 0.
*/
ns = proc_pid_ns(inode->i_sb);
- tid = (int)file->f_version;
- file->f_version = 0;
+ tid = (int)(intptr_t)file->private_data;
+ file->private_data = NULL;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readdir call */
- file->f_version = (u64)tid;
+ file->private_data = (void *)(intptr_t)tid;
put_task_struct(task);
break;
}
return 0;
}
+/*
+ * proc_task_readdir() sets @file->private_data to a positive integer
+ * value, so casting that to u64 is safe. generic_llseek_cookie() will
+ * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
+ * here to catch any unexpected change in behavior either in
+ * proc_task_readdir() or generic_llseek_cookie().
+ */
+static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ u64 cookie = (u64)(intptr_t)file->private_data;
+ loff_t off;
+
+ off = generic_llseek_cookie(file, offset, whence, &cookie);
+ WARN_ON_ONCE(cookie > INT_MAX);
+ file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
+ return off;
+}
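+
+/*
+ * Userspace view, as a sketch (libc behaviour assumed): seekdir() ends
+ * up in proc_dir_llseek(), which stashes the cookie back into
+ * ->private_data, so a partially consumed stream over e.g.
+ * /proc/self/task resumes at the saved tid:
+ *
+ *   DIR *d = opendir("/proc/self/task");
+ *   struct dirent *de = readdir(d);
+ *   long pos = telldir(d);
+ *   seekdir(d, pos);   // cookie restored, serialized by f_pos_lock
+ *   de = readdir(d);   // continues from the saved position
+ */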
+
static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
.iterate_shared = proc_task_readdir,
- .llseek = generic_file_llseek,
+ .llseek = proc_dir_llseek,
};
void __init set_proc_pid_nlink(void)
* Special states are those that do not use the normal wait-loop pattern. See
* the comment with set_special_state().
*/
- #define is_special_task_state(state) \
- ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
+ #define is_special_task_state(state) \
+ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \
+ TASK_DEAD | TASK_FROZEN))
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
# define debug_normal_state_change(state_value) \
struct rb_node run_node;
u64 deadline;
u64 min_vruntime;
+ u64 min_slice;
struct list_head group_node;
- unsigned int on_rq;
+ unsigned char on_rq;
+ unsigned char sched_delayed;
+ unsigned char rel_deadline;
+ unsigned char custom_slice;
+ /* hole */
u64 exec_start;
u64 sum_exec_runtime;
*
* @dl_overrun tells if the task asked to be informed about runtime
* overruns.
+ *
+ * @dl_server tells if this is a server entity.
+ *
+ * @dl_defer tells if this is a deferred or regular server. For
+ * now, only the defer server exists.
+ *
+ * @dl_defer_armed tells if the deferrable server is waiting
+ * for the replenishment timer to activate it.
+ *
+ * @dl_defer_running tells if the deferrable server is actually
+ * running, skipping the defer phase.
*/
unsigned int dl_throttled : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
unsigned int dl_server : 1;
+ unsigned int dl_defer : 1;
+ unsigned int dl_defer_armed : 1;
+ unsigned int dl_defer_running : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
*/
struct rq *rq;
dl_server_has_tasks_f server_has_tasks;
- dl_server_pick_f server_pick;
+ dl_server_pick_f server_pick_task;
#ifdef CONFIG_RT_MUTEXES
/*
/* Sequence number to catch updates: */
seqcount_spinlock_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
- int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
{
int prio = task->prio;
- if (!rt_prio(prio))
+ if (!rt_or_dl_prio(prio))
return DEFAULT_PRIO;
return prio;
* Note that RT tasks are excluded from same priority (lateral)
* steals to prevent the introduction of an unbounded latency.
*/
- if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
+ if (rt_or_dl_prio(waiter->tree.prio))
return false;
return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
}
static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_base *lock,
struct rt_mutex_waiter *w)
{
/*
if (build_ww_mutex() && w->ww_ctx)
return;
- /*
- * Yell loudly and stop the task right here.
- */
+ raw_spin_unlock_irq(&lock->wait_lock);
+
WARN(1, "rtmutex deadlock detected\n");
+
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
rt_mutex_schedule();
} else {
__set_current_state(TASK_RUNNING);
remove_waiter(lock, waiter);
- rt_mutex_handle_deadlock(ret, chwalk, waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
}
/*
if (p->sched_class == &stop_sched_class) /* trumps deadline */
return -2;
- if (rt_prio(p->prio)) /* includes deadline */
+ if (p->dl_server)
+ return -1; /* deadline */
+
+ if (rt_or_dl_prio(p->prio))
return p->prio; /* [-1, 99] */
if (p->sched_class == &idle_sched_class)
if (-pb < -pa)
return false;
- if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
- return !dl_time_before(a->dl.deadline, b->dl.deadline);
+ if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */
+ const struct sched_dl_entity *a_dl, *b_dl;
+
+ a_dl = &a->dl;
+ /*
+ * Since,'a' and 'b' can be CFS tasks served by DL server,
+ * __task_prio() can return -1 (for DL) even for those. In that
+ * case, get to the dl_server's DL entity.
+ */
+ if (a->dl_server)
+ a_dl = a->dl_server;
+
+ b_dl = &b->dl;
+ if (b->dl_server)
+ b_dl = b->dl_server;
+
+ return !dl_time_before(a_dl->deadline, b_dl->deadline);
+ }
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return cfs_prio_less(a, b, in_fi);
void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
+ if (p->se.sched_delayed)
+ return;
+
rq->core->core_task_seq++;
if (!p->core_cookie)
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
+ if (p->se.sched_delayed)
+ return;
+
rq->core->core_task_seq++;
if (sched_core_enqueued(p)) {
* dequeued by migrating while the constrained task continues to run.
* E.g. going from 2->1 without going through pick_next_task().
*/
- if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+ if (__need_bw_check(rq, rq->curr)) {
if (cfs_task_bw_constrained(rq->curr))
return false;
}
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
if (unlikely(!p->sched_class->uclamp_enabled))
return;
+ if (p->se.sched_delayed)
+ return;
+
for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
}
- uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+ /*
+ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
+ * ->sched_delayed.
+ */
+ uclamp_rq_inc(rq, p);
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
}
- void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+ /*
+ * Must only return false when DEQUEUE_SLEEP.
+ */
+ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p, flags);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
+ /*
+ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
+ * and mark the task ->sched_delayed.
+ */
uclamp_rq_dec(rq, p);
- p->sched_class->dequeue_task(rq, p, flags);
+ return p->sched_class->dequeue_task(rq, p, flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+ SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+ /*
+ * Code explicitly relies on TASK_ON_RQ_MIGRATING being set *before*
+ * dequeue_task() and cleared *after* enqueue_task().
+ */
+
dequeue_task(rq, p, flags);
}
+ static void block_task(struct rq *rq, struct task_struct *p, int flags)
+ {
+ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
+ __block_task(rq, p);
+ }
+
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
struct task_struct *p = current;
if (p->migration_disabled) {
+ #ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Warn about overflow half-way through the range.
+ */
+ WARN_ON_ONCE((s16)p->migration_disabled < 0);
+ #endif
p->migration_disabled++;
return;
}
.flags = SCA_MIGRATE_ENABLE,
};
+ #ifdef CONFIG_DEBUG_PREEMPT
+ /*
+ * Check both overflow from migrate_disable() and superfluous
+ * migrate_enable().
+ */
+ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+ return;
+ #endif
+
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
- if (WARN_ON_ONCE(!p->migration_disabled))
- return;
-
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
rq->idle_stamp = 0;
}
#endif
-
- p->dl_server = NULL;
}
/*
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ if (p->se.sched_delayed)
+ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
if (!task_on_cpu(rq, p)) {
/*
* When on_rq && !on_cpu the task is preempted, see if
* it should preempt the task that is current now.
*/
- update_rq_clock(rq);
wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
* case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
+ * Specifically, given current runs ttwu(), we must be before
+ * schedule()'s block_task(), as such this must not observe
+ * sched_delayed.
+ *
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
+ SCHED_WARN_ON(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
p->se.vlag = 0;
- p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
+ /* A delayed task cannot be in clone(). */
+ SCHED_WARN_ON(p->se.sched_delayed);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
p->prio = p->normal_prio = p->static_prio;
set_load_weight(p, false);
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
/*
* We don't need the reset flag anymore after the fork. It has
update_rq_clock(rq);
post_init_entity_util_avg(p);
- activate_task(rq, p, ENQUEUE_NOCLOCK);
+ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, WF_FORK);
#ifdef CONFIG_SMP
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
- SCHED_WARN_ON(ct_state() == CONTEXT_USER);
+ SCHED_WARN_ON(ct_state() == CT_STATE_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq()->sched_count);
}
- static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
- struct rq_flags *rf)
+ static void prev_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
{
#ifdef CONFIG_SMP
const struct sched_class *class;
break;
}
#endif
-
- put_prev_task(rq, prev);
}
/*
const struct sched_class *class;
struct task_struct *p;
+ rq->dl_server = NULL;
+
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
/* Assume the next prioritized class is idle_sched_class */
if (!p) {
- put_prev_task(rq, prev);
- p = pick_next_task_idle(rq);
+ p = pick_task_idle(rq);
+ put_prev_set_next_task(rq, prev, p);
}
- /*
- * This is the fast path; it cannot be a DL server pick;
- * therefore even if @p == @prev, ->dl_server must be NULL.
- */
- if (p->dl_server)
- p->dl_server = NULL;
-
return p;
}
restart:
- put_prev_task_balance(rq, prev, rf);
-
- /*
- * We've updated @prev and no longer need the server link, clear it.
- * Must be done before ->pick_next_task() because that can (re)set
- * ->dl_server.
- */
- if (prev->dl_server)
- prev->dl_server = NULL;
+ prev_balance(rq, prev, rf);
for_each_class(class) {
- p = class->pick_next_task(rq);
- if (p)
- return p;
+ if (class->pick_next_task) {
+ p = class->pick_next_task(rq, prev);
+ if (p)
+ return p;
+ } else {
+ p = class->pick_task(rq);
+ if (p) {
+ put_prev_set_next_task(rq, prev, p);
+ return p;
+ }
+ }
}
BUG(); /* The idle class should always have a runnable task. */
const struct sched_class *class;
struct task_struct *p;
+ rq->dl_server = NULL;
+
for_each_class(class) {
p = class->pick_task(rq);
if (p)
* another cpu during offline.
*/
rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
return __pick_next_task(rq, prev, rf);
}
WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
next = rq->core_pick;
- if (next != prev) {
- put_prev_task(rq, prev);
- set_next_task(rq, next);
- }
-
+ rq->dl_server = rq->core_dl_server;
rq->core_pick = NULL;
- goto out;
+ rq->core_dl_server = NULL;
+ goto out_set_next;
}
- put_prev_task_balance(rq, prev, rf);
+ prev_balance(rq, prev, rf);
smt_mask = cpu_smt_mask(cpu);
need_sync = !!rq->core->core_cookie;
next = pick_task(rq);
if (!next->core_cookie) {
rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
/*
* For robustness, update the min_vruntime_fi for
* unconstrained picks as well.
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
- p = rq_i->core_pick = pick_task(rq_i);
+ rq_i->core_pick = p = pick_task(rq_i);
+ rq_i->core_dl_server = rq_i->dl_server;
+
if (!max || prio_less(max, p, fi_before))
max = p;
}
}
rq_i->core_pick = p;
+ rq_i->core_dl_server = NULL;
if (p == rq_i->idle) {
if (rq_i->nr_running) {
if (i == cpu) {
rq_i->core_pick = NULL;
+ rq_i->core_dl_server = NULL;
continue;
}
if (rq_i->curr == rq_i->core_pick) {
rq_i->core_pick = NULL;
+ rq_i->core_dl_server = NULL;
continue;
}
}
out_set_next:
- set_next_task(rq, next);
- out:
+ put_prev_set_next_task(rq, prev, next);
if (rq->core->core_forceidle_count && next == rq->idle)
queue_core_balance(rq);
* Constants for the sched_mode argument of __schedule().
*
* The mode argument allows RT enabled kernels to differentiate a
- * preemption from blocking on an 'sleeping' spin/rwlock. Note that
- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
- * optimize the AND operation out and just check for zero.
+ * preemption from blocking on an 'sleeping' spin/rwlock.
*/
- #define SM_NONE 0x0
- #define SM_PREEMPT 0x1
- #define SM_RTLOCK_WAIT 0x2
-
- #ifndef CONFIG_PREEMPT_RT
- # define SM_MASK_PREEMPT (~0U)
- #else
- # define SM_MASK_PREEMPT SM_PREEMPT
- #endif
+ #define SM_IDLE (-1)
+ #define SM_NONE 0
+ #define SM_PREEMPT 1
+ #define SM_RTLOCK_WAIT 2
/*
* __schedule() is the main scheduler function.
*
* WARNING: must be called with preemption disabled!
*/
- static void __sched notrace __schedule(unsigned int sched_mode)
+ static void __sched notrace __schedule(int sched_mode)
{
struct task_struct *prev, *next;
+ /*
+ * On a PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+ * as a preemption by schedule_debug() and RCU.
+ */
+ bool preempt = sched_mode > SM_NONE;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev, !!sched_mode);
+ schedule_debug(prev, preempt);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch(!!sched_mode);
+ rcu_note_context_switch(preempt);
/*
* Make sure that signal_pending_state()->signal_pending() below
switch_count = &prev->nivcsw;
+ /* Task state changes only consider SM_PREEMPT as preemption */
+ preempt = sched_mode == SM_PREEMPT;
+
/*
* We must load prev->state once (task_struct::state is volatile), such
* that we form a control dependency vs deactivate_task() below.
*/
prev_state = READ_ONCE(prev->__state);
- if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+ if (sched_mode == SM_IDLE) {
+ if (!rq->nr_running) {
+ next = prev;
+ goto picked;
+ }
+ } else if (!preempt && prev_state) {
if (signal_pending_state(prev_state, prev)) {
WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
+ int flags = DEQUEUE_NOCLOCK;
+
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
!(prev_state & TASK_FROZEN);
- if (prev->sched_contributes_to_load)
- rq->nr_uninterruptible++;
+ if (unlikely(is_special_task_state(prev_state)))
+ flags |= DEQUEUE_SPECIAL;
/*
* __schedule() ttwu()
*
* After this, schedule() must not care about p->state any more.
*/
- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-
- if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
- delayacct_blkio_start();
- }
+ block_task(rq, prev, flags);
}
switch_count = &prev->nvcsw;
}
next = pick_next_task(rq, prev, &rf);
+ picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+ trace_sched_switch(preempt, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
}
}
- static __always_inline void __schedule_loop(unsigned int sched_mode)
+ static __always_inline void __schedule_loop(int sched_mode)
{
do {
preempt_disable();
*/
WARN_ON_ONCE(current->__state);
do {
- __schedule(SM_NONE);
+ __schedule(SM_IDLE);
} while (need_resched());
}
* we find a better solution.
*
* NB: There are buggy callers of this function. Ideally we
- * should warn if prev_state != CONTEXT_USER, but that will trigger
+ * should warn if prev_state != CT_STATE_USER, but that will trigger
* too frequently to make sense yet.
*/
enum ctx_state prev_state = exception_enter();
#endif /* CONFIG_RT_GROUP_SCHED */
}
- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
-
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * This is required for init cpu because rt.c:__enable_runtime()
+ * starts working after scheduler_running, which is not the case
+ * yet.
+ */
+ rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
#ifdef CONFIG_SMP
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+ fair_server_init(rq);
#ifdef CONFIG_SCHED_CORE
rq->core = rq;
rq->core_pick = NULL;
+ rq->core_dl_server = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
rq->core_forceidle_count = 0;
}
set_load_weight(&init_task, false);
+ init_task.se.slice = sysctl_sched_base_slice;
/*
* The boot idle thread does lazy MMU switching as well:
schedstat_set(p->stats.sleep_start, 0);
schedstat_set(p->stats.block_start, 0);
- if (!dl_task(p) && !rt_task(p)) {
+ if (!rt_or_dl_task(p)) {
/*
* Renice negative nice level userspace
* tasks back to 0:
void dump_cpu_task(int cpu)
{
- if (cpu == smp_processor_id() && in_hardirq()) {
+ if (in_hardirq() && cpu == smp_processor_id()) {
struct pt_regs *regs;
regs = get_irq_regs();
static int se_is_idle(struct sched_entity *se)
{
- return 0;
+ return task_has_idle_policy(task_of(se));
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
}
/* ensure we never gain time by being placed backwards. */
- u64_u32_store(cfs_rq->min_vruntime,
- __update_min_vruntime(cfs_rq, vruntime));
+ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+ }
+
+ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
+ {
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 min_slice = ~0ULL;
+
+ if (curr && curr->on_rq)
+ min_slice = curr->slice;
+
+ if (root)
+ min_slice = min(min_slice, root->min_slice);
+
+ return min_slice;
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
}
}
+ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
+ {
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->min_slice < se->min_slice)
+ se->min_slice = rse->min_slice;
+ }
+ }
+
/*
* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
u64 old_min_vruntime = se->min_vruntime;
+ u64 old_min_slice = se->min_slice;
struct rb_node *node = &se->run_node;
se->min_vruntime = se->vruntime;
__min_vruntime_update(se, node->rb_right);
__min_vruntime_update(se, node->rb_left);
- return se->min_vruntime == old_min_vruntime;
+ se->min_slice = se->slice;
+ __min_slice_update(se, node->rb_right);
+ __min_slice_update(se, node->rb_left);
+
+ return se->min_vruntime == old_min_vruntime &&
+ se->min_slice == old_min_slice;
}
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
{
avg_vruntime_add(cfs_rq, se);
se->min_vruntime = se->vruntime;
+ se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
__entity_less, &min_vruntime_cb);
}
* XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
* this is probably good enough.
*/
- static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if ((s64)(se->vruntime - se->deadline) < 0)
- return;
+ return false;
/*
* For EEVDF the virtual time slope is determined by w_i (iow.
* nice) while the request time r_i is determined by
* sysctl_sched_base_slice.
*/
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
/*
* EEVDF: vd_i = ve_i + r_i / w_i
/*
* The task has consumed its request, reschedule.
*/
- if (cfs_rq->nr_running > 1) {
- resched_curr(rq_of(cfs_rq));
- clear_buddies(cfs_rq, se);
- }
+ return true;
}
#include "pelt.h"
dl_server_update(p->dl_server, delta_exec);
}
+ static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+ {
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (curr->vlag == curr->deadline)
+ return false;
+
+ return !entity_eligible(cfs_rq, curr);
+ }
+
+ static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
+ struct sched_entity *pse, struct sched_entity *se)
+ {
+ if (!sched_feat(PREEMPT_SHORT))
+ return false;
+
+ if (pse->slice >= se->slice)
+ return false;
+
+ if (!entity_eligible(cfs_rq, pse))
+ return false;
+
+ if (entity_before(pse, se))
+ return true;
+
+ if (!entity_eligible(cfs_rq, se))
+ return true;
+
+ return false;
+ }
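+
+ /*
+ * Illustrative scenario (numbers assumed, not from the source): current
+ * (se) runs with a 3ms slice when a 0.1ms-slice task (pse) wakes up.
+ * Since pse->slice < se->slice and pse is eligible, do_preempt_short()
+ * allows the preemption if pse's deadline is earlier, or if se itself
+ * is no longer eligible.
+ */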
+
/*
* Used by other classes to account runtime.
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
+ struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
+ bool resched;
if (unlikely(!curr))
return;
- delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+ delta_exec = update_curr_se(rq, curr);
if (unlikely(delta_exec <= 0))
return;
curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_deadline(cfs_rq, curr);
+ resched = update_deadline(cfs_rq, curr);
update_min_vruntime(cfs_rq);
- if (entity_is_task(curr))
- update_curr_task(task_of(curr), delta_exec);
+ if (entity_is_task(curr)) {
+ struct task_struct *p = task_of(curr);
+
+ update_curr_task(p, delta_exec);
+
+ /*
+ * Any fair task that runs outside of fair_server should
+ * account against fair_server such that it can account for
+ * this time and possibly avoid running this period.
+ */
+ if (p->dl_server != &rq->fair_server)
+ dl_server_update(&rq->fair_server, delta_exec);
+ }
account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+ if (rq->nr_running == 1)
+ return;
+
+ if (resched || did_preempt_short(cfs_rq, curr)) {
+ resched_curr(rq);
+ clear_buddies(cfs_rq, curr);
+ }
}
static void update_curr_fair(struct rq *rq)
u64 vslice, vruntime = avg_vruntime(cfs_rq);
s64 lag = 0;
- se->slice = sysctl_sched_base_slice;
+ if (!se->custom_slice)
+ se->slice = sysctl_sched_base_slice;
vslice = calc_delta_fair(se->slice, se);
/*
se->vruntime = vruntime - lag;
+ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+ se->deadline += se->vruntime;
+ se->rel_deadline = 0;
+ return;
+ }
+
/*
* When joining the competition; the existing tasks will be,
* on average, halfway through their slice, as such start tasks
static inline bool cfs_bandwidth_used(void);
+ static void
+ requeue_delayed_entity(struct sched_entity *se);
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
- static void
+ static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+ {
+ se->sched_delayed = 0;
+ if (sched_feat(DELAY_ZERO) && se->vlag > 0)
+ se->vlag = 0;
+ }
+
+ static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- int action = UPDATE_TG;
+ bool sleep = flags & DEQUEUE_SLEEP;
+ update_curr(cfs_rq);
+
+ if (flags & DEQUEUE_DELAYED) {
+ SCHED_WARN_ON(!se->sched_delayed);
+ } else {
+ bool delay = sleep;
+ /*
+ * DELAY_DEQUEUE relies on spurious wakeups; special task
+ * states must not suffer spurious wakeups, so exempt them.
+ */
+ if (flags & DEQUEUE_SPECIAL)
+ delay = false;
+
+ SCHED_WARN_ON(delay && se->sched_delayed);
+
+ if (sched_feat(DELAY_DEQUEUE) && delay &&
+ !entity_eligible(cfs_rq, se)) {
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+ update_load_avg(cfs_rq, se, 0);
+ se->sched_delayed = 1;
+ return false;
+ }
+ }
+
+ int action = UPDATE_TG;
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
action |= DO_DETACH;
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
-
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
clear_buddies(cfs_rq, se);
update_entity_lag(cfs_rq, se);
+ if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
+ }
+
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
+ if (flags & DEQUEUE_DELAYED)
+ finish_delayed_dequeue_entity(se);
+
if (cfs_rq->nr_running == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+ return true;
}
static void
}
update_stats_curr_start(cfs_rq, se);
+ SCHED_WARN_ON(cfs_rq->curr);
cfs_rq->curr = se;
/*
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
+
/*
* Pick the next process, keeping these things in mind, in this order:
* 1) keep things fair between processes/task groups
* 4) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
- pick_next_entity(struct cfs_rq *cfs_rq)
+ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
/*
* Enabling NEXT_BUDDY will affect latency but not fairness.
*/
if (sched_feat(NEXT_BUDDY) &&
- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+ /* ->next will never be delayed */
+ SCHED_WARN_ON(cfs_rq->next->sched_delayed);
return cfs_rq->next;
+ }
- return pick_eevdf(cfs_rq);
+ struct sched_entity *se = pick_eevdf(cfs_rq);
+ if (se->sched_delayed) {
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ SCHED_WARN_ON(se->sched_delayed);
+ SCHED_WARN_ON(se->on_rq);
+ return NULL;
+ }
+ return se;
}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
+ SCHED_WARN_ON(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
+ long rq_h_nr_running = rq->cfs.h_nr_running;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ int flags;
+
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
goto done;
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ /*
+ * Abuse SPECIAL to avoid delayed dequeue in this instance.
+ * This avoids teaching dequeue_entities() about throttled
+ * entities and keeps things relatively simple.
+ */
+ flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
+ if (se->sched_delayed)
+ flags |= DEQUEUE_DELAYED;
+ dequeue_entity(qcfs_rq, se, flags);
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, task_delta);
+ /* Stop the fair server if throttling resulted in no runnable tasks */
+ if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ dl_server_stop(&rq->fair_server);
done:
/*
* Note: distribution will already see us throttled via the
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta;
+ long rq_h_nr_running = rq->cfs.h_nr_running;
se = cfs_rq->tg->se[cpu_of(rq)];
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (se->on_rq)
+ if (se->on_rq) {
+ SCHED_WARN_ON(se->sched_delayed);
break;
+ }
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
goto unthrottle_throttle;
}
+ /* Start the fair server if un-throttling resulted in new runnable tasks */
+ if (!rq_h_nr_running && rq->cfs.h_nr_running)
+ dl_server_start(&rq->fair_server);
+
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
{
int cpu = cpu_of(rq);
- if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+ if (!cfs_bandwidth_used())
return;
if (!tick_nohz_full_cpu(cpu))
}
#endif
+ static void
+ requeue_delayed_entity(struct sched_entity *se)
+ {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * se->sched_delayed should imply: se->on_rq == 1.
+ * Because a delayed entity is one that is still on
+ * the runqueue, competing until it is eligible.
+ */
+ SCHED_WARN_ON(!se->sched_delayed);
+ SCHED_WARN_ON(!se->on_rq);
+
+ if (sched_feat(DELAY_ZERO)) {
+ update_entity_lag(cfs_rq, se);
+ if (se->vlag > 0) {
+ cfs_rq->nr_running--;
+ if (se != cfs_rq->curr)
+ __dequeue_entity(cfs_rq, se);
+ se->vlag = 0;
+ place_entity(cfs_rq, se, 0);
+ if (se != cfs_rq->curr)
+ __enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_running++;
+ }
+ }
+
+ update_load_avg(cfs_rq, se, 0);
+ se->sched_delayed = 0;
+ }
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
+ int rq_h_nr_running = rq->cfs.h_nr_running;
+ u64 slice = 0;
/*
* The code below (indirectly) updates schedutil which looks at
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- util_est_enqueue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ util_est_enqueue(&rq->cfs, p);
+
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
/*
* If in_iowait is set, the code below may not trigger any cpufreq
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
for_each_sched_entity(se) {
- if (se->on_rq)
+ if (se->on_rq) {
+ if (se->sched_delayed)
+ requeue_delayed_entity(se);
break;
+ }
cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Basically set the slice of group entries to the min_slice of
+ * their respective cfs_rq. This ensures the group can service
+ * its entities in the desired time-frame.
+ */
+ if (slice) {
+ se->slice = slice;
+ se->custom_slice = 1;
+ }
enqueue_entity(cfs_rq, se, flags);
+ slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
se_update_runnable(se);
update_cfs_group(se);
+ se->slice = slice;
+ slice = cfs_rq_min_slice(cfs_rq);
+
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
goto enqueue_throttle;
}
+ if (!rq_h_nr_running && rq->cfs.h_nr_running) {
+ /* Account for idle runtime */
+ if (!rq->nr_running)
+ dl_server_update_idle_time(rq, rq->curr);
+ dl_server_start(&rq->fair_server);
+ }
+
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);
static void set_next_buddy(struct sched_entity *se);
/*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
+ * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+ * failing half-way through and resume the dequeue later.
+ *
+ * Returns:
+ * -1 - dequeue delayed
+ * 0 - dequeue throttled
+ * 1 - dequeue complete
*/
- static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se;
- int task_sleep = flags & DEQUEUE_SLEEP;
- int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ int rq_h_nr_running = rq->cfs.h_nr_running;
+ bool task_sleep = flags & DEQUEUE_SLEEP;
+ bool task_delayed = flags & DEQUEUE_DELAYED;
+ struct task_struct *p = NULL;
+ int idle_h_nr_running = 0;
+ int h_nr_running = 0;
+ struct cfs_rq *cfs_rq;
+ u64 slice = 0;
- util_est_dequeue(&rq->cfs, p);
+ if (entity_is_task(se)) {
+ p = task_of(se);
+ h_nr_running = 1;
+ idle_h_nr_running = task_has_idle_policy(p);
+ } else {
+ cfs_rq = group_cfs_rq(se);
+ slice = cfs_rq_min_slice(cfs_rq);
+ }
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, flags);
- cfs_rq->h_nr_running--;
+ if (!dequeue_entity(cfs_rq, se, flags)) {
+ if (p && &p->se == se)
+ return -1;
+
+ break;
+ }
+
+ cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
+ return 0;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
+ slice = cfs_rq_min_slice(cfs_rq);
+
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
/*
break;
}
flags |= DEQUEUE_SLEEP;
+ flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
}
for_each_sched_entity(se) {
se_update_runnable(se);
update_cfs_group(se);
- cfs_rq->h_nr_running--;
+ se->slice = slice;
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ cfs_rq->h_nr_running -= h_nr_running;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ idle_h_nr_running = h_nr_running;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
- goto dequeue_throttle;
-
+ return 0;
}
- /* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, 1);
+ sub_nr_running(rq, h_nr_running);
+
+ if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
- dequeue_throttle:
- util_est_update(&rq->cfs, p, task_sleep);
+ if (p && task_delayed) {
+ SCHED_WARN_ON(!task_sleep);
+ SCHED_WARN_ON(p->on_rq != 1);
+
+ /* Fix-up what dequeue_task_fair() skipped */
+ hrtick_update(rq);
+
+ /* Fix-up what block_task() skipped. */
+ __block_task(rq, p);
+ }
+
+ return 1;
+ }
+
+ /*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ {
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ util_est_dequeue(&rq->cfs, p);
+
+ if (dequeue_entities(rq, &p->se, flags) < 0) {
+ util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
+ return false;
+ }
+
+ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
hrtick_update(rq);
+ return true;
}
#ifdef CONFIG_SMP
return cpu_util(cpu, p, -1, 0);
}
+ /*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max)
+ {
+ unsigned long util, irq, scale;
+ struct rq *rq = cpu_rq(cpu);
+
+ scale = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time from the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
+
+ /*
+ * Because the time spent on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ util += cpu_util_dl(rq);
+
+ /*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+ if (util >= scale)
+ return scale;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, scale);
+ util += irq;
+
+ return min(scale, util);
+ }
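+
+ /*
+ * Worked example of the IRQ scaling above, with assumed numbers (not
+ * from the source): scale = 1024, irq = 256, util = 512.
+ *
+ *   U' = irq + (scale - irq) / scale * U
+ *      = 256 + (768 / 1024) * 512
+ *      = 256 + 384 = 640
+ *
+ * i.e. a half-busy task clock on a CPU losing a quarter of its
+ * capacity to IRQ/steal time reports an effective utilization of
+ * 640/1024.
+ */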
+
+ unsigned long sched_cpu_util(int cpu)
+ {
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+ }
+
/*
* energy_env - Utilization landscape for energy estimation.
* @task_busy_time: Utilization contribution by the task for which we test the
static void task_dead_fair(struct task_struct *p)
{
- remove_entity_load_avg(&p->se);
+ struct sched_entity *se = &p->se;
+
+ if (se->sched_delayed) {
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ task_rq_unlock(rq, p, &rf);
+ }
+
+ remove_entity_load_avg(se);
}
/*
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- if (rq->nr_running)
+ if (sched_fair_runnable(rq))
return 1;
return sched_balance_newidle(rq, rf) != 0;
if (test_tsk_need_resched(curr))
return;
- /* Idle tasks are by definition preempted by non-idle tasks. */
- if (unlikely(task_has_idle_policy(curr)) &&
- likely(!task_has_idle_policy(p)))
- goto preempt;
-
- /*
- * Batch and idle tasks do not preempt non-idle tasks (their preemption
- * is driven by the tick):
- */
- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+ if (!sched_feat(WAKEUP_PREEMPTION))
return;
find_matching_se(&se, &pse);
pse_is_idle = se_is_idle(pse);
/*
- * Preempt an idle group in favor of a non-idle group (and don't preempt
+ * Preempt an idle entity in favor of a non-idle entity (and don't preempt
* in the inverse case).
*/
if (cse_is_idle && !pse_is_idle)
if (cse_is_idle != pse_is_idle)
return;
+ /*
+ * BATCH and IDLE tasks do not preempt others.
+ */
+ if (unlikely(p->policy != SCHED_NORMAL))
+ return;
+
cfs_rq = cfs_rq_of(se);
update_curr(cfs_rq);
+ /*
+ * If @p has a shorter slice than current and @p is eligible, override
+ * current's slice protection in order to allow preemption.
+ *
+ * Note that even if @p does not turn out to be the most eligible
+ * task at this moment, current's slice protection will be lost.
+ */
+ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
+ se->vlag = se->deadline + 1;
/*
- * XXX pick_eevdf(cfs_rq) != se ?
+ * If @p has become the most eligible task, force preemption.
*/
if (pick_eevdf(cfs_rq) == pse)
goto preempt;
resched_curr(rq);
}
- #ifdef CONFIG_SMP
static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
return NULL;
do {
- struct sched_entity *curr = cfs_rq->curr;
-
- /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
- if (curr) {
- if (curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
+ /* Might not have done put_prev_entity() */
+ if (cfs_rq->curr && cfs_rq->curr->on_rq)
+ update_curr(cfs_rq);
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
- goto again;
- }
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto again;
- se = pick_next_entity(cfs_rq);
+ se = pick_next_entity(rq, cfs_rq);
+ if (!se)
+ goto again;
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
return task_of(se);
}
- #endif
+
+ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
+ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
int new_tasks;
again:
- if (!sched_fair_runnable(rq))
+ p = pick_task_fair(rq);
+ if (!p)
goto idle;
+ se = &p->se;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (!prev || prev->sched_class != &fair_sched_class)
+ if (prev->sched_class != &fair_sched_class)
goto simple;
+ __put_prev_set_next_dl_server(rq, prev, p);
+
/*
* Because of the set_next_buddy() in dequeue_task_fair() it is rather
* likely that a next task is from the same cgroup as the current.
*
* Therefore attempt to avoid putting and setting the entire cgroup
* hierarchy, only change the part that actually changes.
- */
-
- do {
- struct sched_entity *curr = cfs_rq->curr;
-
- /*
- * Since we got here without doing put_prev_entity() we also
- * have to consider cfs_rq->curr. If it is still a runnable
- * entity, update_curr() will update its vruntime, otherwise
- * forget we've ever seen it.
- */
- if (curr) {
- if (curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
-
- /*
- * This call to check_cfs_rq_runtime() will do the
- * throttle and dequeue its entity in the parent(s).
- * Therefore the nr_running test will indeed
- * be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
- cfs_rq = &rq->cfs;
-
- if (!cfs_rq->nr_running)
- goto idle;
-
- goto simple;
- }
- }
-
- se = pick_next_entity(cfs_rq);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
-
- p = task_of(se);
-
- /*
+ *
* Since we haven't yet done put_prev_entity and if the selected task
* is a different task than we started out with, try and touch the
* least amount of cfs_rqs.
*/
if (prev != p) {
struct sched_entity *pse = &prev->se;
+ struct cfs_rq *cfs_rq;
while (!(cfs_rq = is_same_group(se, pse))) {
int se_depth = se->depth;
put_prev_entity(cfs_rq, pse);
set_next_entity(cfs_rq, se);
- }
-
- goto done;
- simple:
- #endif
- if (prev)
- put_prev_task(rq, prev);
- do {
- se = pick_next_entity(cfs_rq);
- set_next_entity(cfs_rq, se);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
+ __set_next_task_fair(rq, p, true);
+ }
- p = task_of(se);
+ return p;
- done: __maybe_unused;
- #ifdef CONFIG_SMP
- /*
- * Move the next running task to the front of
- * the list, so our cfs_tasks list becomes MRU
- * one.
- */
- list_move(&p->se.group_node, &rq->cfs_tasks);
+ simple:
#endif
-
- if (hrtick_enabled_fair(rq))
- hrtick_start_fair(rq, p);
-
- update_misfit_status(p, rq);
- sched_fair_update_stop_tick(rq, p);
-
+ put_prev_set_next_task(rq, prev, p);
return p;
idle:
return NULL;
}
- static struct task_struct *__pick_next_task_fair(struct rq *rq)
+ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
- return pick_next_task_fair(rq, NULL, NULL);
+ return pick_next_task_fair(rq, prev, NULL);
+ }
+
+ static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
+ {
+ return !!dl_se->rq->cfs.nr_running;
+ }
+
+ static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
+ {
+ return pick_task_fair(dl_se->rq);
+ }
+
+ void fair_server_init(struct rq *rq)
+ {
+ struct sched_dl_entity *dl_se = &rq->fair_server;
+
+ init_dl_entity(dl_se);
+
+ dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
}
/*
* Account for a descheduled task:
*/
- static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ /* hw_pressure doesn't care about invariance */
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_hw_load_avg(now, rq, hw_pressure) |
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
* - indirectly from a remote scheduler_tick() for NOHZ idle balancing
* through the SMP cross-call nohz_csd_func()
*/
-static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
+static __latent_entropy void sched_balance_softirq(void)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance;
*/
static void task_fork_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se, *curr;
- struct cfs_rq *cfs_rq;
- struct rq *rq = this_rq();
- struct rq_flags rf;
-
- rq_lock(rq, &rf);
- update_rq_clock(rq);
-
set_task_max_allowed_capacity(p);
-
- cfs_rq = task_cfs_rq(current);
- curr = cfs_rq->curr;
- if (curr)
- update_curr(cfs_rq);
- place_entity(cfs_rq, se, ENQUEUE_INITIAL);
- rq_unlock(rq, &rf);
}
/*
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
+ /*
+ * Since this is called after changing class, this is a little weird
+ * and we cannot use DEQUEUE_DELAYED.
+ */
+ if (p->se.sched_delayed) {
+ /* First, dequeue it from its new class' structures */
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
+ /*
+ * Now, clean up the fair_sched_class side of things
+ * related to sched_delayed being true and that wasn't done
+ * due to the generic dequeue not using DEQUEUE_DELAYED.
+ */
+ finish_delayed_dequeue_entity(&p->se);
+ p->se.rel_deadline = 0;
+ __block_task(rq, p);
+ }
}
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ SCHED_WARN_ON(p->se.sched_delayed);
+
attach_task_cfs_rq(p);
set_task_max_allowed_capacity(p);
}
}
- /* Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
- static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
list_move(&se->group_node, &rq->cfs_tasks);
}
#endif
+ if (!first)
+ return;
+
+ SCHED_WARN_ON(se->sched_delayed);
+
+ if (hrtick_enabled_fair(rq))
+ hrtick_start_fair(rq, p);
+
+ update_misfit_status(p, rq);
+ sched_fair_update_stop_tick(rq, p);
+ }
+
+ /*
+ * Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+ {
+ struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
+
+ __set_next_task_fair(rq, p, first);
}
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
void unregister_fair_sched_group(struct task_group *tg)
{
- unsigned long flags;
- struct rq *rq;
int cpu;
destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(cpu) {
- if (tg->se[cpu])
- remove_entity_load_avg(tg->se[cpu]);
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ struct sched_entity *se = tg->se[cpu];
+ struct rq *rq = cpu_rq(cpu);
+
+ if (se) {
+ if (se->sched_delayed) {
+ guard(rq_lock_irqsave)(rq);
+ if (se->sched_delayed) {
+ update_rq_clock(rq);
+ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+ }
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
+ remove_entity_load_avg(se);
+ }
/*
* Only empty task groups can be destroyed; so we can speculatively
* check on_list without danger of it being re-added.
*/
- if (!tg->cfs_rq[cpu]->on_list)
- continue;
-
- rq = cpu_rq(cpu);
-
- raw_spin_rq_lock_irqsave(rq, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ if (cfs_rq->on_list) {
+ guard(rq_lock_irqsave)(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
}
}
.wakeup_preempt = check_preempt_wakeup_fair,
+ .pick_task = pick_task_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
.balance = balance_fair,
- .pick_task = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
* keep the priority unchanged. Otherwise, update priority
* to the normal priority:
*/
- if (!rt_prio(p->prio))
+ if (!rt_or_dl_prio(p->prio))
return p->normal_prio;
return p->prio;
}
#endif
- #ifdef CONFIG_SMP
- /*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the IRQ utilization.
- *
- * The DL bandwidth number OTOH is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
- unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long *min,
- unsigned long *max)
- {
- unsigned long util, irq, scale;
- struct rq *rq = cpu_rq(cpu);
-
- scale = arch_scale_cpu_capacity(cpu);
-
- /*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= scale)) {
- if (min)
- *min = scale;
- if (max)
- *max = scale;
- return scale;
- }
-
- if (min) {
- /*
- * The minimum utilization returns the highest level between:
- * - the computed DL bandwidth needed with the IRQ pressure which
- * steals time to the deadline task.
- * - The minimum performance requirement for CFS and/or RT.
- */
- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-
- /*
- * When an RT task is runnable and uclamp is not used, we must
- * ensure that the task will run at maximum compute capacity.
- */
- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
- *min = max(*min, scale);
- }
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- */
- util = util_cfs + cpu_util_rt(rq);
- util += cpu_util_dl(rq);
-
- /*
- * The maximum hint is a soft bandwidth requirement, which can be lower
- * than the actual utilization because of uclamp_max requirements.
- */
- if (max)
- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-
- if (util >= scale)
- return scale;
-
- /*
- * There is still idle time; further improve the number by using the
- * IRQ metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, scale);
- util += irq;
-
- return min(scale, util);
- }
-
- unsigned long sched_cpu_util(int cpu)
- {
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
- }
- #endif /* CONFIG_SMP */
-
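For reference, the scaling step in the comment removed above is linear in the non-IRQ share of capacity. With illustrative numbers, scale = 1024, irq = 128 and util = 512:

	U' = irq + (scale - irq) / scale * util
	   = 128 + (896 / 1024) * 512
	   = 128 + 448 = 576

i.e. 12.5% of the CPU is charged to IRQ/steal time and the task utilization is compressed into the remaining 87.5%.
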
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
p->policy = policy;
- if (dl_policy(policy))
+ if (dl_policy(policy)) {
__setparam_dl(p, attr);
- else if (fair_policy(policy))
+ } else if (fair_policy(policy)) {
p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ p->se.custom_slice = 1;
+ p->se.slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ p->se.custom_slice = 0;
+ p->se.slice = sysctl_sched_base_slice;
+ }
+ }
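
With the hunk above, a plain SCHED_OTHER task can request a custom slice through sched_attr::sched_runtime, clamped to [0.1ms, 100ms]. A hedged userspace sketch (glibc has no wrapper, so the struct is declared by hand and the call goes through syscall()):

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	struct sched_attr {
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;	/* for SCHED_OTHER: requested slice, ns */
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr attr = {
			.size          = sizeof(attr),
			.sched_policy  = 0,			/* SCHED_OTHER */
			.sched_runtime = 3 * 1000 * 1000,	/* ask for a 3ms slice */
		};

		if (syscall(SYS_sched_setattr, 0, &attr, 0))
			perror("sched_setattr");
		return 0;
	}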
- if (task_is_realtime(p)) {
+ /* rt-policy tasks do not have a timerslack */
+ if (rt_or_dl_task_policy(p)) {
+ p->timer_slack_ns = 0;
+ } else if (p->timer_slack_ns == 0) {
+ /* when switching back to non-rt policy, restore timerslack */
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ }
+
/*
* __sched_setscheduler() ensures attr->sched_priority == 0 when
* !rt_policy. Always setting this ensures that things like
* but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ if (fair_policy(policy) &&
+ (attr->sched_nice != task_nice(p) ||
+ (attr->sched_runtime != p->se.slice)))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
+ if (p->se.custom_slice)
+ attr.sched_runtime = p->se.slice;
+
/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
static void get_params(struct task_struct *p, struct sched_attr *attr)
{
- if (task_has_dl_policy(p))
+ if (task_has_dl_policy(p)) {
__getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
+ } else if (task_has_rt_policy(p)) {
attr->sched_priority = p->rt_priority;
- else
+ } else {
attr->sched_nice = task_nice(p);
+ attr->sched_runtime = p->se.slice;
+ }
}
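
The matching readback falls out of get_params(): for a fair task, sched_getattr() now reports the effective slice in sched_runtime. A short follow-on to the sketch above:

	struct sched_attr out = {};

	if (!syscall(SYS_sched_getattr, 0, &out, sizeof(out), 0))
		printf("slice: %llu ns\n",
		       (unsigned long long)out.sched_runtime);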
/**
error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
- if (task_is_realtime(current))
+ if (rt_or_dl_task_policy(current))
+ break;
if (arg2 <= 0)
current->timer_slack_ns =
current->default_timer_slack_ns;
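
Seen from userspace, the combined effect of this hunk and the setscheduler change above: an rt/dl-policy caller's PR_SET_TIMERSLACK request appears to be skipped and its slack stays pinned at 0, while PR_GET_TIMERSLACK reports the current slack as the prctl return value. A small demo:

	#include <stdio.h>
	#include <sys/prctl.h>

	int main(void)
	{
		prctl(PR_SET_TIMERSLACK, 50000);	/* request 50us */
		/* prints 50000 under SCHED_OTHER; an rt-policy task stays at 0 */
		printf("slack: %d ns\n", prctl(PR_GET_TIMERSLACK));
		return 0;
	}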
/*
* CONFIG_TIME_LOW_RES indicates that the system has no way to return
* granular time values. For relative timers we add hrtimer_resolution
- * (i.e. one jiffie) to prevent short timeouts.
+ * (i.e. one jiffy) to prevent short timeouts.
*/
timer->is_rel = mode & HRTIMER_MODE_REL;
if (timer->is_rel)
}
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
+ __acquires(&base->softirq_expiry_lock)
{
spin_lock(&base->softirq_expiry_lock);
}
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
+ __releases(&base->softirq_expiry_lock)
{
spin_unlock(&base->softirq_expiry_lock);
}
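
The __acquires/__releases markers are sparse context annotations: with a normal compiler they vanish, while sparse uses them to check that lock acquire/release pairs balance across function boundaries. Roughly, following include/linux/compiler_types.h:

	#ifdef __CHECKER__
	# define __acquires(x)	__attribute__((context(x, 0, 1)))
	# define __releases(x)	__attribute__((context(x, 1, 0)))
	#else
	# define __acquires(x)
	# define __releases(x)
	#endif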
}
}
-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
+static __latent_entropy void hrtimer_run_softirq(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
unsigned long flags;
* expiry.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
- if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
+ if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
mode |= HRTIMER_MODE_HARD;
}
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
- u64 slack;
-
- slack = current->timer_slack_ns;
- if (rt_or_dl_task(current))
- slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
+ hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
goto out;
/**
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
+ * @delta: slack in expires timeout (ktime_t)
* @mode: timer mode
* @clock_id: timer clock to be used
*/
return -EINTR;
}
- /*
- * Override any slack passed by the user if under
- * rt contraints.
- */
- if (rt_or_dl_task(current))
- delta = 0;
-
hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_sleeper_start_expires(&t, mode);
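
For context, a typical kernel-side caller of this helper family (a sketch assuming sleepable process context): the task sets its state first, then sleeps until expires with an explicit coalescing window passed as delta.

	ktime_t expires = ktime_add_us(ktime_get(), 500);

	set_current_state(TASK_INTERRUPTIBLE);
	/* sleep until 'expires', allowing up to 100us of slack */
	schedule_hrtimeout_range(&expires, 100 * NSEC_PER_USEC,
				 HRTIMER_MODE_ABS);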
/**
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
- * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks
+ * @delta: slack in expires timeout (ktime_t)
* @mode: timer mode
*
* Make the current task sleep until the given expiry time has
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
+
+ /*
+ * The page is isolated and accounted for.
+ * Mark the codetag as empty to avoid accounting error
+ * when the page is freed by unpoison_memory().
+ */
+ clear_page_tag_ref(page);
return false;
}
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
-#ifdef CONFIG_UNACCEPTED_MEMORY
- unusable_free += zone_page_state(z, NR_UNACCEPTED);
-#endif
return unusable_free;
}
}
}
+ cond_accept_memory(zone, order);
+
/*
* Detect whether the number of free pages is below high
* watermark. If so, we will decrease pcp->high and free
gfp_mask)) {
int ret;
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order))
+ goto try_this_zone;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
return page;
} else {
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order))
+ goto try_this_zone;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
*/
if (alloc_flags & ALLOC_MIN_RESERVE)
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(current)) && in_task())
+ } else if (unlikely(rt_or_dl_task(current)) && in_task())
alloc_flags |= ALLOC_MIN_RESERVE;
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
for_each_online_pgdat(pgdat)
pgdat->per_cpu_nodestats =
alloc_percpu(struct per_cpu_nodestat);
- store_early_perpage_metadata();
}
__meminit void zone_pcp_init(struct zone *zone)
void free_reserved_page(struct page *page)
{
- if (mem_alloc_profiling_enabled()) {
- union codetag_ref *ref = get_page_tag_ref(page);
-
- if (ref) {
- set_codetag_empty(ref);
- put_page_tag_ref(ref);
- }
- }
+ clear_page_tag_ref(page);
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
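
The open-coded sequence deleted above is presumably what the new clear_page_tag_ref() helper wraps; reconstructed from the removed lines, it is roughly:

	static inline void clear_page_tag_ref(struct page *page)
	{
		if (mem_alloc_profiling_enabled()) {
			union codetag_ref *ref = get_page_tag_ref(page);

			if (ref) {
				set_codetag_empty(ref);
				put_page_tag_ref(ref);
			}
		}
	}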
struct page *page;
bool last;
- if (list_empty(&zone->unaccepted_pages))
- return false;
-
spin_lock_irqsave(&zone->lock, flags);
page = list_first_entry_or_null(&zone->unaccepted_pages,
struct page, lru);
return true;
}
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
long to_accept;
- int ret = false;
+ bool ret = false;
+
+ if (!has_unaccepted_memory())
+ return false;
+
+ if (list_empty(&zone->unaccepted_pages))
+ return false;
/* How much to accept to get to high watermark? */
to_accept = high_wmark_pages(zone) -
(zone_page_state(zone, NR_FREE_PAGES) -
- __zone_watermark_unusable_free(zone, order, 0));
+ __zone_watermark_unusable_free(zone, order, 0) -
+ zone_page_state(zone, NR_UNACCEPTED));
- /* Accept at least one page */
- do {
+ while (to_accept > 0) {
if (!try_to_accept_memory_one(zone))
break;
ret = true;
to_accept -= MAX_ORDER_NR_PAGES;
- } while (to_accept > 0);
+ }
return ret;
}
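
To make the watermark arithmetic concrete (illustrative numbers): with high_wmark = 8192 pages, NR_FREE_PAGES = 10240, an unusable reserve of 512 and NR_UNACCEPTED = 4096,

	to_accept = 8192 - (10240 - 512 - 4096) = 2560 pages

and with MAX_ORDER_NR_PAGES = 1024 (e.g. order-10 on x86-64) the loop accepts three max-order chunks (2560 -> 1536 -> 512 -> done). Subtracting NR_UNACCEPTED here, rather than counting it in __zone_watermark_unusable_free(), lets the zone lazily accept just enough memory to stay above the high watermark.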
{
}
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
return false;
}