From: Peter Zijlstra
Date: Fri, 20 Aug 2021 10:33:05 +0000 (+0200)
Subject: Merge branch 'sched/core'
X-Git-Tag: v5.15-rc1~145^2^2~7
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/c94d89fafa49ba70fedbb01cb52dfbbdd7dc0986?hp=-c

Merge branch 'sched/core'
---

c94d89fafa49ba70fedbb01cb52dfbbdd7dc0986
diff --combined include/linux/sched.h
index ec8d07d88641,3bb9fecfdaa1..6ecd02e2ca1e
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -748,6 -748,7 +748,7 @@@ struct task_struct
  	unsigned int		policy;
  	int			nr_cpus_allowed;
  	const cpumask_t		*cpus_ptr;
+ 	cpumask_t		*user_cpus_ptr;
  	cpumask_t		cpus_mask;
  	void			*migration_pending;
  #ifdef CONFIG_SMP
@@@ -1007,6 -1008,7 +1008,6 @@@
  	/* Signal handlers: */
  	struct signal_struct		*signal;
  	struct sighand_struct __rcu	*sighand;
 -	struct sigqueue			*sigqueue_cache;
  	sigset_t			blocked;
  	sigset_t			real_blocked;
  	/* Restored if set_restore_sigmask() was used: */
@@@ -1705,6 -1707,11 +1706,11 @@@ extern int task_can_attach(struct task_
  #ifdef CONFIG_SMP
  extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
  extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
+ extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
+ extern void release_user_cpus_ptr(struct task_struct *p);
+ extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
+ extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
+ extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
  #else
  static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  {
@@@ -1715,6 -1722,21 +1721,21 @@@ static inline int set_cpus_allowed_ptr(
  		return -EINVAL;
  	return 0;
  }
+ static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
+ {
+ 	if (src->user_cpus_ptr)
+ 		return -EINVAL;
+ 	return 0;
+ }
+ static inline void release_user_cpus_ptr(struct task_struct *p)
+ {
+ 	WARN_ON(p->user_cpus_ptr);
+ }
+ 
+ static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+ {
+ 	return 0;
+ }
  #endif
  
  extern int yield_to(struct task_struct *p, bool preempt);
@@@ -2028,8 -2050,6 +2049,8 @@@ static inline void set_task_cpu(struct 
  
  #endif /* CONFIG_SMP */
  
 +extern bool sched_task_on_rq(struct task_struct *p);
 +
  /*
   * In order to reduce various lock holder preemption latencies provide an
   * interface to see if a vCPU is currently running or not.
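The five helpers declared above are kernel-internal, but the user-visible contract they serve is the ordinary affinity syscall pair. As a point of reference, a minimal userspace round-trip over that interface; pinning to CPU 0 is an arbitrary choice for illustration:

    /* Round-trip a task's affinity through the syscall interface that
     * set_cpus_allowed_ptr() ultimately backs. Illustrative only. */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        cpu_set_t mask;

        /* Read the current mask for this task (pid 0 == self). */
        if (sched_getaffinity(0, sizeof(mask), &mask))
            return 1;
        printf("allowed CPUs: %d\n", CPU_COUNT(&mask));

        /* Pin to CPU 0; the kernel rejects an empty intersection. */
        CPU_ZERO(&mask);
        CPU_SET(0, &mask);
        return sched_setaffinity(0, sizeof(mask), &mask) ? 1 : 0;
    }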
diff --combined include/linux/wait.h
index 6598ae35e1b5,99c5f05718cd..93dab0e9580f
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@@ -56,7 -56,7 +56,7 @@@ struct task_struct
  
  #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {				\
  	.lock		= __SPIN_LOCK_UNLOCKED(name.lock),		\
 -	.head		= { &(name).head, &(name).head } }
 +	.head		= LIST_HEAD_INIT(name.head) }
  
  #define DECLARE_WAIT_QUEUE_HEAD(name) \
  	struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@@ -1136,7 -1136,7 +1136,7 @@@ do {									\
   * Waitqueues which are removed from the waitqueue_head at wakeup time
   */
  void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 -void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
  long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
  void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
  long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
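The initializer change is cosmetic: LIST_HEAD_INIT(name.head) expands to exactly the self-referencing pair the old open-coded `{ &(name).head, &(name).head }` spelled out. A standalone sketch of that expansion with a stripped-down list_head:

    /* Stripped-down model of the kernel's circular list head; an empty
     * list is a head pointing at itself in both directions. */
    #include <assert.h>

    struct list_head {
        struct list_head *next, *prev;
    };

    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    static struct list_head demo = LIST_HEAD_INIT(demo);

    int main(void)
    {
        /* Equivalent to the old open-coded initializer. */
        assert(demo.next == &demo && demo.prev == &demo);
        return 0;
    }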
diff --combined kernel/fork.c
index bc94b2cc5995,5d7addf0c41a..bd0e165b8397
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@@ -446,6 -446,7 +446,7 @@@ void put_task_stack(struct task_struct 
  
  void free_task(struct task_struct *tsk)
  {
+ 	release_user_cpus_ptr(tsk);
  	scs_release(tsk);
  
  #ifndef CONFIG_THREAD_INFO_IN_TASK
@@@ -825,14 -826,9 +826,14 @@@ void __init fork_init(void
  	init_task.signal->rlim[RLIMIT_SIGPENDING] =
  		init_task.signal->rlim[RLIMIT_NPROC];
  
 -	for (i = 0; i < UCOUNT_COUNTS; i++)
 +	for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
  		init_user_ns.ucount_max[i] = max_threads/2;
  
 +	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
 +	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
 +	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
 +	set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
 +
  #ifdef CONFIG_VMAP_STACK
  	cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL,
  			  free_vm_stack_cache);
@@@ -924,6 -920,7 +925,7 @@@ static struct task_struct *dup_task_str
  #endif
  
  	if (orig->cpus_ptr == &orig->cpus_mask)
  		tsk->cpus_ptr = &tsk->cpus_mask;
+ 	dup_user_cpus_ptr(tsk, orig, node);
  
  	/*
  	 * One for the user space visible state that goes away when reaped.
@@@ -1035,6 -1032,7 +1037,6 @@@ static struct mm_struct *mm_init(struc
  	mm_pgtables_bytes_init(mm);
  	mm->map_count = 0;
  	mm->locked_vm = 0;
 -	atomic_set(&mm->has_pinned, 0);
  	atomic64_set(&mm->pinned_vm, 0);
  	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
  	spin_lock_init(&mm->page_table_lock);
@@@ -1982,7 -1980,8 +1984,7 @@@ static __latent_entropy struct task_str
  	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
  #endif
  	retval = -EAGAIN;
 -	if (atomic_read(&p->real_cred->user->processes) >=
 -			task_rlimit(p, RLIMIT_NPROC)) {
 +	if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
  		if (p->real_cred->user != INIT_USER &&
  		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
  			goto bad_fork_free;
@@@ -2012,6 -2011,7 +2014,6 @@@
  	spin_lock_init(&p->alloc_lock);
  
  	init_sigpending(&p->pending);
 -	p->sigqueue_cache = NULL;
  
  	p->utime = p->stime = p->gtime = 0;
  #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
@@@ -2391,7 -2391,7 +2393,7 @@@ bad_fork_cleanup_threadgroup_lock
  #endif
  	delayacct_tsk_free(p);
  bad_fork_cleanup_count:
 -	atomic_dec(&p->cred->user->processes);
 +	dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
  	exit_creds(p);
  bad_fork_free:
  	WRITE_ONCE(p->__state, TASK_DEAD);
@@@ -3004,12 -3004,6 +3006,12 @@@ int ksys_unshare(unsigned long unshare_
  	if (err)
  		goto bad_unshare_cleanup_cred;
  
 +	if (new_cred) {
 +		err = set_cred_ucounts(new_cred);
 +		if (err)
 +			goto bad_unshare_cleanup_cred;
 +	}
 +
  	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
  		if (do_sysvsem) {
  			/*
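The fork-side changes swap the old per-user process counter for the ucounts machinery, but the limit being consulted is still RLIMIT_NPROC and the failure mode is still -EAGAIN from fork(). A small userspace illustration of the ceiling that the is_ucounts_overlimit() check enforces:

    /* Query the RLIMIT_NPROC ceiling checked in copy_process();
     * illustrative only. */
    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rlimit lim;

        if (getrlimit(RLIMIT_NPROC, &lim))
            return 1;
        /* fork() fails with EAGAIN once the user's task count hits this. */
        printf("soft=%llu hard=%llu\n",
               (unsigned long long)lim.rlim_cur,
               (unsigned long long)lim.rlim_max);
        return 0;
    }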
diff --combined kernel/sched/core.c
index 20ffcc044134,a22cc3c156ce..8dc67166aa6c
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -993,6 -993,7 +993,7 @@@ int get_nohz_timer_target(void
  {
  	int i, cpu = smp_processor_id(), default_cpu = -1;
  	struct sched_domain *sd;
+ 	const struct cpumask *hk_mask;
  
  	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
  		if (!idle_cpu(cpu))
@@@ -1000,10 -1001,11 +1001,11 @@@
  		default_cpu = cpu;
  	}
  
+ 	hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+ 
  	rcu_read_lock();
  	for_each_domain(cpu, sd) {
- 		for_each_cpu_and(i, sched_domain_span(sd),
- 			housekeeping_cpumask(HK_FLAG_TIMER)) {
+ 		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
  			if (cpu == i)
  				continue;
  
@@@ -1619,6 -1621,23 +1621,23 @@@ static inline void uclamp_rq_dec(struc
  	uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+ static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ 				      enum uclamp_id clamp_id)
+ {
+ 	if (!p->uclamp[clamp_id].active)
+ 		return;
+ 
+ 	uclamp_rq_dec_id(rq, p, clamp_id);
+ 	uclamp_rq_inc_id(rq, p, clamp_id);
+ 
+ 	/*
+ 	 * Make sure to clear the idle flag if we've transiently reached 0
+ 	 * active tasks on rq.
+ 	 */
+ 	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ 		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+ }
+ 
  static inline void
  uclamp_update_active(struct task_struct *p)
  {
@@@ -1642,12 -1661,8 +1661,8 @@@
  	 * affecting a valid clamp bucket, the next time it's enqueued,
  	 * it will already see the updated clamp bucket value.
  	 */
- 	for_each_clamp_id(clamp_id) {
- 		if (p->uclamp[clamp_id].active) {
- 			uclamp_rq_dec_id(rq, p, clamp_id);
- 			uclamp_rq_inc_id(rq, p, clamp_id);
- 		}
- 	}
+ 	for_each_clamp_id(clamp_id)
+ 		uclamp_rq_reinc_id(rq, p, clamp_id);
  
  	task_rq_unlock(rq, p, &rf);
  }
@@@ -1928,11 -1943,6 +1943,11 @@@ static inline void uclamp_post_fork(str
  static inline void init_uclamp(void) { }
  #endif /* CONFIG_UCLAMP_TASK */
  
 +bool sched_task_on_rq(struct task_struct *p)
 +{
 +	return task_on_rq_queued(p);
 +}
 +
  static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
  	if (!(flags & ENQUEUE_NOCLOCK))
@@@ -1981,18 -1991,12 +1996,18 @@@ void deactivate_task(struct rq *rq, str
  	dequeue_task(rq, p, flags);
  }
  
 -/*
 - * __normal_prio - return the priority that is based on the static prio
 - */
 -static inline int __normal_prio(struct task_struct *p)
 +static inline int __normal_prio(int policy, int rt_prio, int nice)
  {
 -	return p->static_prio;
 +	int prio;
 +
 +	if (dl_policy(policy))
 +		prio = MAX_DL_PRIO - 1;
 +	else if (rt_policy(policy))
 +		prio = MAX_RT_PRIO - 1 - rt_prio;
 +	else
 +		prio = NICE_TO_PRIO(nice);
 +
 +	return prio;
  }
  
  /*
@@@ -2004,7 -2008,15 +2019,7 @@@
   */
  static inline int normal_prio(struct task_struct *p)
  {
 -	int prio;
 -
 -	if (task_has_dl_policy(p))
 -		prio = MAX_DL_PRIO-1;
 -	else if (task_has_rt_policy(p))
 -		prio = MAX_RT_PRIO-1 - p->rt_priority;
 -	else
 -		prio = __normal_prio(p);
 -	return prio;
 +	return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
  }
  
  /*
@@@ -2161,7 -2173,7 +2176,7 @@@ static inline bool is_cpu_allowed(struc
  
  	/* Non kernel threads are not allowed during either online or offline. */
  	if (!(p->flags & PF_KTHREAD))
- 		return cpu_active(cpu);
+ 		return cpu_active(cpu) && task_cpu_possible(cpu, p);
  
  	/* KTHREAD_IS_PER_CPU is always allowed. */
  	if (kthread_is_per_cpu(p))
@@@ -2468,6 -2480,34 +2483,34 @@@ void do_set_cpus_allowed(struct task_st
  	__do_set_cpus_allowed(p, new_mask, 0);
  }
  
+ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ 		      int node)
+ {
+ 	if (!src->user_cpus_ptr)
+ 		return 0;
+ 
+ 	dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+ 	if (!dst->user_cpus_ptr)
+ 		return -ENOMEM;
+ 
+ 	cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ 	return 0;
+ }
+ 
+ static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+ {
+ 	struct cpumask *user_mask = NULL;
+ 
+ 	swap(p->user_cpus_ptr, user_mask);
+ 
+ 	return user_mask;
+ }
+ 
+ void release_user_cpus_ptr(struct task_struct *p)
+ {
+ 	kfree(clear_user_cpus_ptr(p));
+ }
+ 
  /*
   * This function is wildly self concurrent; here be dragons.
  *
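clear_user_cpus_ptr() uses swap() so the task's pointer is detached first and the old mask can be kfree()d after any lock is dropped. The same detach-then-free idiom in plain C, with hypothetical names:

    /* Detach-then-free: swap the owned pointer out under whatever lock
     * protects it, free it afterwards. All names here are hypothetical. */
    #include <stdlib.h>

    struct task { unsigned long *user_mask; /* + lock, elided */ };

    static unsigned long *clear_mask(struct task *t)
    {
        unsigned long *old = t->user_mask;  /* swap(t->user_mask, old) */

        t->user_mask = NULL;
        return old;                         /* caller frees outside the lock */
    }

    static void release_mask(struct task *t)
    {
        free(clear_mask(t));                /* free(NULL) is a no-op, like kfree */
    }

    int main(void)
    {
        struct task t = { 0 };

        release_mask(&t);                   /* safe even with nothing stashed */
        return 0;
    }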
@@@ -2685,28 -2725,26 +2728,26 @@@ static int affine_move_task(struct rq *
  }
  
  /*
-  * Change a given task's CPU affinity. Migrate the thread to a
-  * proper CPU and schedule it away if the CPU it's executing on
-  * is removed from the allowed bitmask.
-  *
-  * NOTE: the caller must have a valid reference to the task, the
-  * task must not exit() & deallocate itself prematurely. The
-  * call is not atomic; no spinlocks may be held.
+  * Called with both p->pi_lock and rq->lock held; drops both before returning.
   */
- static int __set_cpus_allowed_ptr(struct task_struct *p,
- 				  const struct cpumask *new_mask,
- 				  u32 flags)
+ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ 					 const struct cpumask *new_mask,
+ 					 u32 flags,
+ 					 struct rq *rq,
+ 					 struct rq_flags *rf)
+ 	__releases(rq->lock)
+ 	__releases(p->pi_lock)
  {
+ 	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
  	const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ 	bool kthread = p->flags & PF_KTHREAD;
+ 	struct cpumask *user_mask = NULL;
  	unsigned int dest_cpu;
- 	struct rq_flags rf;
- 	struct rq *rq;
  	int ret = 0;
  
- 	rq = task_rq_lock(p, &rf);
  	update_rq_clock(rq);
  
- 	if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ 	if (kthread || is_migration_disabled(p)) {
  		/*
  		 * Kernel threads are allowed on online && !active CPUs,
  		 * however, during cpu-hot-unplug, even these might get pushed
@@@ -2720,6 -2758,11 +2761,11 @@@
  		cpu_valid_mask = cpu_online_mask;
  	}
  
+ 	if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+ 		ret = -EINVAL;
+ 		goto out;
+ 	}
+ 
  	/*
  	 * Must re-check here, to close a race against __kthread_bind(),
  	 * sched_setaffinity() is not guaranteed to observe the flag.
@@@ -2754,20 -2797,178 +2800,178 @@@
  
  	__do_set_cpus_allowed(p, new_mask, flags);
  
- 	return affine_move_task(rq, p, &rf, dest_cpu, flags);
+ 	if (flags & SCA_USER)
+ 		user_mask = clear_user_cpus_ptr(p);
+ 
+ 	ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+ 
+ 	kfree(user_mask);
+ 
+ 	return ret;
  
  out:
- 	task_rq_unlock(rq, p, &rf);
+ 	task_rq_unlock(rq, p, rf);
  
  	return ret;
  }
  
+ /*
+  * Change a given task's CPU affinity. Migrate the thread to a
+  * proper CPU and schedule it away if the CPU it's executing on
+  * is removed from the allowed bitmask.
+  *
+  * NOTE: the caller must have a valid reference to the task, the
+  * task must not exit() & deallocate itself prematurely. The
+  * call is not atomic; no spinlocks may be held.
+  */
+ static int __set_cpus_allowed_ptr(struct task_struct *p,
+ 				  const struct cpumask *new_mask, u32 flags)
+ {
+ 	struct rq_flags rf;
+ 	struct rq *rq;
+ 
+ 	rq = task_rq_lock(p, &rf);
+ 	return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+ }
+ 
  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
  	return __set_cpus_allowed_ptr(p, new_mask, 0);
  }
  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  
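For user threads the locked helper now rejects any mask that strays outside task_cpu_possible_mask() with -EINVAL. The subset test itself is plain mask arithmetic; in userspace terms, with glibc's cpu_set_t standing in for struct cpumask:

    /* cpumask_subset(a, b) is "a & b == a", modelled with the glibc
     * CPU_* macros standing in for the kernel cpumask API. */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <assert.h>

    static int subset(const cpu_set_t *a, const cpu_set_t *b)
    {
        cpu_set_t and;

        CPU_AND(&and, a, b);
        return CPU_EQUAL(&and, a);  /* a & b == a  =>  a is a subset of b */
    }

    int main(void)
    {
        cpu_set_t new, allowed;

        CPU_ZERO(&new);     CPU_SET(2, &new);
        CPU_ZERO(&allowed); CPU_SET(0, &allowed); CPU_SET(1, &allowed);
        assert(!subset(&new, &allowed));  /* would be rejected: -EINVAL */
        return 0;
    }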
+ /*
+  * Change a given task's CPU affinity to the intersection of its current
+  * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+  * and pointing @p->user_cpus_ptr to a copy of the old mask.
+  * If the resulting mask is empty, leave the affinity unchanged and return
+  * -EINVAL.
+  */
+ static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ 				     struct cpumask *new_mask,
+ 				     const struct cpumask *subset_mask)
+ {
+ 	struct cpumask *user_mask = NULL;
+ 	struct rq_flags rf;
+ 	struct rq *rq;
+ 	int err;
+ 
+ 	if (!p->user_cpus_ptr) {
+ 		user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+ 		if (!user_mask)
+ 			return -ENOMEM;
+ 	}
+ 
+ 	rq = task_rq_lock(p, &rf);
+ 
+ 	/*
+ 	 * Forcefully restricting the affinity of a deadline task is
+ 	 * likely to cause problems, so fail and noisily override the
+ 	 * mask entirely.
+ 	 */
+ 	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ 		err = -EPERM;
+ 		goto err_unlock;
+ 	}
+ 
+ 	if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+ 		err = -EINVAL;
+ 		goto err_unlock;
+ 	}
+ 
+ 	/*
+ 	 * We're about to butcher the task affinity, so keep track of what
+ 	 * the user asked for in case we're able to restore it later on.
+ 	 */
+ 	if (user_mask) {
+ 		cpumask_copy(user_mask, p->cpus_ptr);
+ 		p->user_cpus_ptr = user_mask;
+ 	}
+ 
+ 	return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+ 
+ err_unlock:
+ 	task_rq_unlock(rq, p, &rf);
+ 	kfree(user_mask);
+ 	return err;
+ }
+ 
+ /*
+  * Restrict the CPU affinity of task @p so that it is a subset of
+  * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
+  * old affinity mask. If the resulting mask is empty, we warn and walk
+  * up the cpuset hierarchy until we find a suitable mask.
+  */
+ void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+ {
+ 	cpumask_var_t new_mask;
+ 	const struct cpumask *override_mask = task_cpu_possible_mask(p);
+ 
+ 	alloc_cpumask_var(&new_mask, GFP_KERNEL);
+ 
+ 	/*
+ 	 * __migrate_task() can fail silently in the face of concurrent
+ 	 * offlining of the chosen destination CPU, so take the hotplug
+ 	 * lock to ensure that the migration succeeds.
+ 	 */
+ 	cpus_read_lock();
+ 	if (!cpumask_available(new_mask))
+ 		goto out_set_mask;
+ 
+ 	if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ 		goto out_free_mask;
+ 
+ 	/*
+ 	 * We failed to find a valid subset of the affinity mask for the
+ 	 * task, so override it based on its cpuset hierarchy.
+ 	 */
+ 	cpuset_cpus_allowed(p, new_mask);
+ 	override_mask = new_mask;
+ 
+ out_set_mask:
+ 	if (printk_ratelimit()) {
+ 		printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ 				task_pid_nr(p), p->comm,
+ 				cpumask_pr_args(override_mask));
+ 	}
+ 
+ 	WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+ out_free_mask:
+ 	cpus_read_unlock();
+ 	free_cpumask_var(new_mask);
+ }
+ 
+ static int
+ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+ 
+ /*
+  * Restore the affinity of a task @p which was previously restricted by a
+  * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+  * @p->user_cpus_ptr.
+  *
+  * It is the caller's responsibility to serialise this with any calls to
+  * force_compatible_cpus_allowed_ptr(@p).
+  */
+ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+ {
+ 	struct cpumask *user_mask = p->user_cpus_ptr;
+ 	unsigned long flags;
+ 
+ 	/*
+ 	 * Try to restore the old affinity mask. If this fails, then
+ 	 * we free the mask explicitly to avoid it being inherited across
+ 	 * a subsequent fork().
+ 	 */
+ 	if (!user_mask || !__sched_setaffinity(p, user_mask))
+ 		return;
+ 
+ 	raw_spin_lock_irqsave(&p->pi_lock, flags);
+ 	user_mask = clear_user_cpus_ptr(p);
+ 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ 
+ 	kfree(user_mask);
+ }
+ 
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
  #ifdef CONFIG_SCHED_DEBUG
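force_compatible_cpus_allowed_ptr() and relax_compatible_cpus_allowed_ptr() are a save/restore pair: the forced narrowing stashes the user's request in p->user_cpus_ptr and the relax path replays it through __sched_setaffinity() before freeing it. That lifecycle, reduced to a userspace model; every name here is hypothetical:

    /* Save -> restrict -> restore lifecycle of the user-requested mask,
     * modelled in userspace with bitmasks instead of cpumasks. */
    #include <assert.h>
    #include <stdlib.h>

    struct task { unsigned long cpus_mask; unsigned long *user_mask; };

    static void force_restrict(struct task *t, unsigned long subset)
    {
        if (!t->user_mask) {                 /* stash what the user asked for */
            t->user_mask = malloc(sizeof(*t->user_mask));
            *t->user_mask = t->cpus_mask;
        }
        t->cpus_mask &= subset;
    }

    static void relax(struct task *t)
    {
        if (!t->user_mask)
            return;
        t->cpus_mask = *t->user_mask;        /* replay the saved request */
        free(t->user_mask);
        t->user_mask = NULL;
    }

    int main(void)
    {
        struct task t = { .cpus_mask = 0xff, .user_mask = NULL };

        force_restrict(&t, 0x0f);
        assert(t.cpus_mask == 0x0f);
        relax(&t);
        assert(t.cpus_mask == 0xff && !t.user_mask);
        return 0;
    }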
@@@ -3112,9 -3313,7 +3316,7 @@@ static int select_fallback_rq(int cpu, 
  
  			/* Look for allowed, online CPU in same node. */
  			for_each_cpu(dest_cpu, nodemask) {
- 				if (!cpu_active(dest_cpu))
- 					continue;
- 				if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+ 				if (is_cpu_allowed(p, dest_cpu))
  					return dest_cpu;
  			}
  		}
@@@ -3131,8 -3330,7 +3333,7 @@@
  		/* No more Mr. Nice Guy. */
  		switch (state) {
  		case cpuset:
- 			if (IS_ENABLED(CONFIG_CPUSETS)) {
- 				cpuset_cpus_allowed_fallback(p);
+ 			if (cpuset_cpus_allowed_fallback(p)) {
  				state = possible;
  				break;
  			}
@@@ -3144,10 -3342,9 +3345,9 @@@
  			 *
  			 * More yuck to audit.
  			 */
- 			do_set_cpus_allowed(p, cpu_possible_mask);
+ 			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
  			state = fail;
  			break;
- 
  		case fail:
  			BUG();
  			break;
@@@ -4097,7 -4294,7 +4297,7 @@@ int sched_fork(unsigned long clone_flag
  		} else if (PRIO_TO_NICE(p->static_prio) < 0)
  			p->static_prio = NICE_TO_PRIO(0);
  
 -		p->prio = p->normal_prio = __normal_prio(p);
 +		p->prio = p->normal_prio = p->static_prio;
  		set_load_weight(p, false);
  
  		/*
@@@ -4549,7 -4746,6 +4749,7 @@@ static struct rq *finish_task_switch(st
  	vtime_task_switch(prev);
  	perf_event_task_sched_in(prev, current);
  	finish_task(prev);
 +	tick_nohz_task_switch();
  	finish_lock_switch(rq);
  	finish_arch_post_lock_switch();
  	kcov_finish_switch(current);
@@@ -4595,6 -4791,7 +4795,6 @@@
  		put_task_struct_rcu_user(prev);
  	}
  
 -	tick_nohz_task_switch();
  	return rq;
  }
  
@@@ -5660,11 -5857,9 +5860,9 @@@ static bool try_steal_cookie(int this, 
  		if (p->core_occupation > dst->idle->core_occupation)
  			goto next;
  
- 		p->on_rq = TASK_ON_RQ_MIGRATING;
  		deactivate_task(src, p, 0);
  		set_task_cpu(p, this);
  		activate_task(dst, p, 0);
- 		p->on_rq = TASK_ON_RQ_QUEUED;
  
  		resched_curr(dst);
  
@@@ -6339,18 -6534,6 +6537,18 @@@ int default_wake_function(wait_queue_en
  }
  EXPORT_SYMBOL(default_wake_function);
  
 +static void __setscheduler_prio(struct task_struct *p, int prio)
 +{
 +	if (dl_prio(prio))
 +		p->sched_class = &dl_sched_class;
 +	else if (rt_prio(prio))
 +		p->sched_class = &rt_sched_class;
 +	else
 +		p->sched_class = &fair_sched_class;
 +
 +	p->prio = prio;
 +}
 +
  #ifdef CONFIG_RT_MUTEXES
  
  static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
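__setscheduler_prio() above and the reworked __normal_prio() earlier both operate on the kernel's single priority scale: -1 for deadline, 0..99 for RT, 100..139 for nice levels. The mapping is fixed arithmetic, mirrored here in userspace with the constants spelled out (MAX_RT_PRIO is 100 and the default prio 120 in the kernel's headers):

    /* Userspace mirror of __normal_prio()'s arithmetic; stand-in policy
     * ids, kernel constants MAX_RT_PRIO = 100 and DEFAULT_PRIO = 120. */
    #include <assert.h>

    #define MAX_DL_PRIO     0
    #define MAX_RT_PRIO     100
    #define NICE_TO_PRIO(n) ((n) + 120)

    enum { POL_NORMAL, POL_FIFO, POL_DEADLINE };

    static int normal_prio(int policy, int rt_prio, int nice)
    {
        if (policy == POL_DEADLINE)
            return MAX_DL_PRIO - 1;           /* -1: above all RT */
        if (policy == POL_FIFO)
            return MAX_RT_PRIO - 1 - rt_prio; /* rt 1..99 -> 98..0 */
        return NICE_TO_PRIO(nice);            /* nice -20..19 -> 100..139 */
    }

    int main(void)
    {
        assert(normal_prio(POL_DEADLINE, 0, 0) == -1);
        assert(normal_prio(POL_FIFO, 99, 0) == 0);    /* highest RT */
        assert(normal_prio(POL_NORMAL, 0, 0) == 120); /* nice 0 */
        return 0;
    }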
@@@ -6466,19 -6649,22 +6664,19 @@@ void rt_mutex_setprio(struct task_struc
  		} else {
  			p->dl.pi_se = &p->dl;
  		}
 -		p->sched_class = &dl_sched_class;
  	} else if (rt_prio(prio)) {
  		if (dl_prio(oldprio))
  			p->dl.pi_se = &p->dl;
  		if (oldprio < prio)
  			queue_flag |= ENQUEUE_HEAD;
 -		p->sched_class = &rt_sched_class;
  	} else {
  		if (dl_prio(oldprio))
  			p->dl.pi_se = &p->dl;
  		if (rt_prio(oldprio))
  			p->rt.timeout = 0;
 -		p->sched_class = &fair_sched_class;
  	}
  
 -	p->prio = prio;
 +	__setscheduler_prio(p, prio);
  
  	if (queued)
  		enqueue_task(rq, p, queue_flag);
@@@ -6831,6 -7017,35 +7029,6 @@@ static void __setscheduler_params(struc
  	set_load_weight(p, true);
  }
  
 -/* Actually do priority change: must hold pi & rq lock. */
 -static void __setscheduler(struct rq *rq, struct task_struct *p,
 -			   const struct sched_attr *attr, bool keep_boost)
 -{
 -	/*
 -	 * If params can't change scheduling class changes aren't allowed
 -	 * either.
 -	 */
 -	if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
 -		return;
 -
 -	__setscheduler_params(p, attr);
 -
 -	/*
 -	 * Keep a potential priority boosting if called from
 -	 * sched_setscheduler().
 -	 */
 -	p->prio = normal_prio(p);
 -	if (keep_boost)
 -		p->prio = rt_effective_prio(p, p->prio);
 -
 -	if (dl_prio(p->prio))
 -		p->sched_class = &dl_sched_class;
 -	else if (rt_prio(p->prio))
 -		p->sched_class = &rt_sched_class;
 -	else
 -		p->sched_class = &fair_sched_class;
 -}
 -
  /*
   * Check the target process has a UID that matches the current process's:
   */
@@@ -6851,8 -7066,10 +7049,8 @@@ static int __sched_setscheduler(struct 
  				const struct sched_attr *attr,
  				bool user, bool pi)
  {
 -	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 -		      MAX_RT_PRIO - 1 - attr->sched_priority;
 -	int retval, oldprio, oldpolicy = -1, queued, running;
 -	int new_effective_prio, policy = attr->sched_policy;
 +	int oldpolicy = -1, policy = attr->sched_policy;
 +	int retval, oldprio, newprio, queued, running;
  	const struct sched_class *prev_class;
  	struct callback_head *head;
  	struct rq_flags rf;
@@@ -7050,7 -7267,6 +7248,7 @@@ change
  	p->sched_reset_on_fork = reset_on_fork;
  	oldprio = p->prio;
  
 +	newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
  	if (pi) {
  		/*
  		 * Take priority boosted tasks into account. If the new
@@@ -7059,8 -7275,8 +7257,8 @@@
  		 * the runqueue. This will be done when the task deboost
  		 * itself.
  		 */
 -		new_effective_prio = rt_effective_prio(p, newprio);
 -		if (new_effective_prio == oldprio)
 +		newprio = rt_effective_prio(p, newprio);
 +		if (newprio == oldprio)
  			queue_flags &= ~DEQUEUE_MOVE;
  	}
  
@@@ -7073,10 -7289,7 +7271,10 @@@
  
  	prev_class = p->sched_class;
  
 -	__setscheduler(rq, p, attr, pi);
 +	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
 +		__setscheduler_params(p, attr);
 +		__setscheduler_prio(p, newprio);
 +	}
  	__setscheduler_uclamp(p, attr);
  
  	if (queued) {
@@@ -7300,6 -7513,16 +7498,16 @@@ err_size
  	return -E2BIG;
  }
  
+ static void get_params(struct task_struct *p, struct sched_attr *attr)
+ {
+ 	if (task_has_dl_policy(p))
+ 		__getparam_dl(p, attr);
+ 	else if (task_has_rt_policy(p))
+ 		attr->sched_priority = p->rt_priority;
+ 	else
+ 		attr->sched_nice = task_nice(p);
+ }
+ 
  /**
   * sys_sched_setscheduler - set/change the scheduler policy and RT priority
   * @pid: the pid in question.
@@@ -7361,6 -7584,8 +7569,8 @@@ SYSCALL_DEFINE3(sched_setattr, pid_t, p
  	rcu_read_unlock();
  
  	if (likely(p)) {
+ 		if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ 			get_params(p, &attr);
  		retval = sched_setattr(p, &attr);
  		put_task_struct(p);
  	}
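The KEEP_PARAMS handling matters to sched_setattr(2) callers: with SCHED_FLAG_KEEP_PARAMS the kernel now back-fills the attr with the task's current parameters via get_params() rather than applying whatever the caller left in the struct. For reference, a minimal raw-syscall caller; glibc provides no wrapper, so the struct is declared by hand following the uapi layout (trimmed to the base fields):

    /* Minimal sched_setattr(2) caller; struct layout follows
     * include/uapi/linux/sched/types.h, base fields only. */
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime, sched_deadline, sched_period;
    };

    int main(void)
    {
        struct sched_attr attr = {
            .size         = sizeof(attr),
            .sched_policy = 0,   /* SCHED_NORMAL */
            .sched_nice   = 5,
        };

        /* pid 0 == self; the final argument is flags, currently 0. */
        return syscall(SYS_sched_setattr, 0, &attr, 0) ? 1 : 0;
    }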
@@@ -7509,12 -7734,8 +7719,8 @@@ SYSCALL_DEFINE4(sched_getattr, pid_t, p
  	kattr.sched_policy = p->policy;
  	if (p->sched_reset_on_fork)
  		kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- 	if (task_has_dl_policy(p))
- 		__getparam_dl(p, &kattr);
- 	else if (task_has_rt_policy(p))
- 		kattr.sched_priority = p->rt_priority;
- 	else
- 		kattr.sched_nice = task_nice(p);
+ 	get_params(p, &kattr);
+ 	kattr.sched_flags &= SCHED_FLAG_ALL;
  
  #ifdef CONFIG_UCLAMP_TASK
  	/*
@@@ -7535,9 -7756,76 +7741,76 @@@ out_unlock
  	return retval;
  }
  
- long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ #ifdef CONFIG_SMP
+ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
  {
+ 	int ret = 0;
+ 
+ 	/*
+ 	 * If the task isn't a deadline task or admission control is
+ 	 * disabled then we don't care about affinity changes.
+ 	 */
+ 	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ 		return 0;
+ 
+ 	/*
+ 	 * Since bandwidth control happens on root_domain basis,
+ 	 * if admission test is enabled, we only admit -deadline
+ 	 * tasks allowed to run on all the CPUs in the task's
+ 	 * root_domain.
+ 	 */
+ 	rcu_read_lock();
+ 	if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ 		ret = -EBUSY;
+ 	rcu_read_unlock();
+ 	return ret;
+ }
+ #endif
+ 
+ static int
+ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+ {
+ 	int retval;
+ 	cpumask_var_t cpus_allowed, new_mask;
+ 
+ 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ 		return -ENOMEM;
+ 
+ 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ 		retval = -ENOMEM;
+ 		goto out_free_cpus_allowed;
+ 	}
+ 
+ 	cpuset_cpus_allowed(p, cpus_allowed);
+ 	cpumask_and(new_mask, mask, cpus_allowed);
+ 
+ 	retval = dl_task_check_affinity(p, new_mask);
+ 	if (retval)
+ 		goto out_free_new_mask;
+ again:
+ 	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+ 	if (retval)
+ 		goto out_free_new_mask;
+ 
+ 	cpuset_cpus_allowed(p, cpus_allowed);
+ 	if (!cpumask_subset(new_mask, cpus_allowed)) {
+ 		/*
+ 		 * We must have raced with a concurrent cpuset update.
+ 		 * Just reset the cpumask to the cpuset's cpus_allowed.
+ 		 */
+ 		cpumask_copy(new_mask, cpus_allowed);
+ 		goto again;
+ 	}
+ 
+ out_free_new_mask:
+ 	free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+ 	free_cpumask_var(cpus_allowed);
+ 	return retval;
+ }
+ 
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ {
  	struct task_struct *p;
  	int retval;
  
@@@ -7557,68 -7845,22 +7830,22 @@@
  		retval = -EINVAL;
  		goto out_put_task;
  	}
- 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- 		retval = -ENOMEM;
- 		goto out_put_task;
- 	}
- 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- 		retval = -ENOMEM;
- 		goto out_free_cpus_allowed;
- 	}
- 	retval = -EPERM;
+ 
  	if (!check_same_owner(p)) {
  		rcu_read_lock();
  		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
  			rcu_read_unlock();
- 			goto out_free_new_mask;
+ 			retval = -EPERM;
+ 			goto out_put_task;
  		}
  		rcu_read_unlock();
  	}
  
  	retval = security_task_setscheduler(p);
  	if (retval)
- 		goto out_free_new_mask;
- 
- 
- 	cpuset_cpus_allowed(p, cpus_allowed);
- 	cpumask_and(new_mask, in_mask, cpus_allowed);
- 
- 	/*
- 	 * Since bandwidth control happens on root_domain basis,
- 	 * if admission test is enabled, we only admit -deadline
- 	 * tasks allowed to run on all the CPUs in the task's
- 	 * root_domain.
- 	 */
- #ifdef CONFIG_SMP
- 	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- 		rcu_read_lock();
- 		if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- 			retval = -EBUSY;
- 			rcu_read_unlock();
- 			goto out_free_new_mask;
- 		}
- 		rcu_read_unlock();
- 	}
- #endif
- again:
- 	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+ 		goto out_put_task;
  
- 	if (!retval) {
- 		cpuset_cpus_allowed(p, cpus_allowed);
- 		if (!cpumask_subset(new_mask, cpus_allowed)) {
- 			/*
- 			 * We must have raced with a concurrent cpuset
- 			 * update. Just reset the cpus_allowed to the
- 			 * cpuset's cpus_allowed
- 			 */
- 			cpumask_copy(new_mask, cpus_allowed);
- 			goto again;
- 		}
- 	}
- out_free_new_mask:
- 	free_cpumask_var(new_mask);
- out_free_cpus_allowed:
- 	free_cpumask_var(cpus_allowed);
+ 	retval = __sched_setaffinity(p, in_mask);
  out_put_task:
  	put_task_struct(p);
  	return retval;
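__sched_setaffinity() commits the new mask first and only then re-reads the cpuset-allowed set, looping if a concurrent cpuset update invalidated the result. That commit-then-revalidate shape, stripped of the scheduler specifics; a plain variable stands in for the concurrently-updated cpuset state, so this is a sketch of the pattern rather than kernel code:

    /* Commit-then-revalidate retry loop, as in the "again:" path of
     * __sched_setaffinity(); illustrative model only. */
    #include <assert.h>

    static unsigned long cpuset_allowed = 0x0f; /* may change concurrently */
    static unsigned long committed;

    static void set_mask(unsigned long requested)
    {
        unsigned long new_mask = requested & cpuset_allowed;

        for (;;) {
            committed = new_mask;           /* commit */
            /* Re-read; if a racing update shrank the set, retry. */
            if ((new_mask & cpuset_allowed) == new_mask)
                break;
            new_mask = cpuset_allowed;
        }
    }

    int main(void)
    {
        set_mask(0x03);
        assert(committed == 0x03);
        return 0;
    }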
@@@ -9804,7 -10046,7 +10031,7 @@@ static int tg_set_cfs_bandwidth(struct 
  	 * Prevent race between setting of cfs_rq->runtime_enabled and
  	 * unthrottle_offline_cfs_rqs().
  	 */
- 	get_online_cpus();
+ 	cpus_read_lock();
  	mutex_lock(&cfs_constraints_mutex);
  	ret = __cfs_schedulable(tg, period, quota);
  	if (ret)
@@@ -9848,7 -10090,7 +10075,7 @@@
  		cfs_bandwidth_usage_dec();
  out_unlock:
  	mutex_unlock(&cfs_constraints_mutex);
- 	put_online_cpus();
+ 	cpus_read_unlock();
  
  	return ret;
  }
@@@ -10099,6 -10341,20 +10326,20 @@@ static u64 cpu_rt_period_read_uint(stru
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ 			     struct cftype *cft)
+ {
+ 	return css_tg(css)->idle;
+ }
+ 
+ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ 			      struct cftype *cft, s64 idle)
+ {
+ 	return sched_group_set_idle(css_tg(css), idle);
+ }
+ #endif
+ 
  static struct cftype cpu_legacy_files[] = {
  #ifdef CONFIG_FAIR_GROUP_SCHED
  	{
@@@ -10106,6 -10362,11 +10347,11 @@@
  		.read_u64 = cpu_shares_read_u64,
  		.write_u64 = cpu_shares_write_u64,
  	},
+ 	{
+ 		.name = "idle",
+ 		.read_s64 = cpu_idle_read_s64,
+ 		.write_s64 = cpu_idle_write_s64,
+ 	},
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
  	{
@@@ -10313,6 -10574,12 +10559,12 @@@ static struct cftype cpu_files[] = 
  		.read_s64 = cpu_weight_nice_read_s64,
  		.write_s64 = cpu_weight_nice_write_s64,
  	},
+ 	{
+ 		.name = "idle",
+ 		.flags = CFTYPE_NOT_ON_ROOT,
+ 		.read_s64 = cpu_idle_read_s64,
+ 		.write_s64 = cpu_idle_write_s64,
+ 	},
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
  	{
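Both cftype tables expose the new knob as a per-group "idle" file, so from userspace this is a plain write of 0 or 1 to cpu.idle. A minimal sketch; the cgroup v2 mount point and group name below are assumptions for illustration:

    /* Flip a cgroup to SCHED_IDLE semantics by writing its cpu.idle
     * file; path and group name are assumed for illustration. */
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/sys/fs/cgroup/demo/cpu.idle", O_WRONLY);

        if (fd < 0)
            return 1;
        /* cpu_idle_write_s64() accepts only 0 or 1. */
        if (write(fd, "1", 1) != 1)
            return 1;
        return close(fd) ? 1 : 0;
    }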
diff --combined kernel/sched/fair.c
index 44c452072a1b,6cd05f1d77ef..5aa3cfd15a2e
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@@ -431,6 -431,23 +431,23 @@@ find_matching_se(struct sched_entity **
  	}
  }
  
+ static int tg_is_idle(struct task_group *tg)
+ {
+ 	return tg->idle > 0;
+ }
+ 
+ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+ {
+ 	return cfs_rq->idle > 0;
+ }
+ 
+ static int se_is_idle(struct sched_entity *se)
+ {
+ 	if (entity_is_task(se))
+ 		return task_has_idle_policy(task_of(se));
+ 	return cfs_rq_is_idle(group_cfs_rq(se));
+ }
+ 
  #else	/* !CONFIG_FAIR_GROUP_SCHED */
  
  #define for_each_sched_entity(se) \
@@@ -468,6 -485,21 +485,21 @@@ find_matching_se(struct sched_entity **
  {
  }
  
+ static int tg_is_idle(struct task_group *tg)
+ {
+ 	return 0;
+ }
+ 
+ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+ {
+ 	return 0;
+ }
+ 
+ static int se_is_idle(struct sched_entity *se)
+ {
+ 	return 0;
+ }
+ 
  #endif	/* CONFIG_FAIR_GROUP_SCHED */
  
  static __always_inline
@@@ -1486,7 -1518,7 +1518,7 @@@ static inline bool is_core_idle(int cpu
  		if (cpu == sibling)
  			continue;
  
- 		if (!idle_cpu(cpu))
+ 		if (!idle_cpu(sibling))
  			return false;
  	}
  #endif
@@@ -3037,9 -3069,8 +3069,9 @@@ enqueue_load_avg(struct cfs_rq *cfs_rq
  static inline void
  dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 +	u32 divider = get_pelt_divider(&se->avg);
  	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 -	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
 +	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
  }
  #else
  static inline void
@@@ -3256,31 -3287,6 +3288,31 @@@ static inline void cfs_rq_util_change(s
  
  #ifdef CONFIG_SMP
  #ifdef CONFIG_FAIR_GROUP_SCHED
 +/*
 + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
 + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
 + * bottom-up, we only have to test whether the cfs_rq before us on the list
 + * is our child.
 + * If cfs_rq is not on the list, test whether a child needs to be added to
 + * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
 + */
 +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
 +{
 +	struct cfs_rq *prev_cfs_rq;
 +	struct list_head *prev;
 +
 +	if (cfs_rq->on_list) {
 +		prev = cfs_rq->leaf_cfs_rq_list.prev;
 +	} else {
 +		struct rq *rq = rq_of(cfs_rq);
 +
 +		prev = rq->tmp_alone_branch;
 +	}
 +
 +	prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
 +
 +	return (prev_cfs_rq->tg->parent == cfs_rq->tg);
 +}
 +
  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  {
@@@ -3296,9 -3302,6 +3328,9 @@@
  	if (cfs_rq->avg.runnable_sum)
  		return false;
  
 +	if (child_cfs_rq_on_list(cfs_rq))
 +		return false;
 +
  	/*
  	 * _avg must be null when _sum are null because _avg = _sum / divider
  	 * Make sure that rounding and/or propagation of PELT values never
@@@ -4841,6 -4844,9 +4873,9 @@@ static bool throttle_cfs_rq(struct cfs_
  
  		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
  
+ 		if (cfs_rq_is_idle(group_cfs_rq(se)))
+ 			idle_task_delta = cfs_rq->h_nr_running;
+ 
  		qcfs_rq->h_nr_running -= task_delta;
  		qcfs_rq->idle_h_nr_running -= idle_task_delta;
  
@@@ -4860,6 -4866,9 +4895,9 @@@
  		update_load_avg(qcfs_rq, se, 0);
  		se_update_runnable(se);
  
+ 		if (cfs_rq_is_idle(group_cfs_rq(se)))
+ 			idle_task_delta = cfs_rq->h_nr_running;
+ 
  		qcfs_rq->h_nr_running -= task_delta;
  		qcfs_rq->idle_h_nr_running -= idle_task_delta;
  	}
@@@ -4904,39 -4913,45 +4942,45 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
  	task_delta = cfs_rq->h_nr_running;
  	idle_task_delta = cfs_rq->idle_h_nr_running;
  	for_each_sched_entity(se) {
+ 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ 
  		if (se->on_rq)
  			break;
- 		cfs_rq = cfs_rq_of(se);
- 		enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ 		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+ 
+ 		if (cfs_rq_is_idle(group_cfs_rq(se)))
+ 			idle_task_delta = cfs_rq->h_nr_running;
  
- 		cfs_rq->h_nr_running += task_delta;
- 		cfs_rq->idle_h_nr_running += idle_task_delta;
+ 		qcfs_rq->h_nr_running += task_delta;
+ 		qcfs_rq->idle_h_nr_running += idle_task_delta;
  
  		/* end evaluation on encountering a throttled cfs_rq */
- 		if (cfs_rq_throttled(cfs_rq))
+ 		if (cfs_rq_throttled(qcfs_rq))
  			goto unthrottle_throttle;
  	}
  
  	for_each_sched_entity(se) {
- 		cfs_rq = cfs_rq_of(se);
+ 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
- 		update_load_avg(cfs_rq, se, UPDATE_TG);
+ 		update_load_avg(qcfs_rq, se, UPDATE_TG);
  		se_update_runnable(se);
  
+ 		if (cfs_rq_is_idle(group_cfs_rq(se)))
+ 			idle_task_delta = cfs_rq->h_nr_running;
+ 
- 		cfs_rq->h_nr_running += task_delta;
- 		cfs_rq->idle_h_nr_running += idle_task_delta;
+ 		qcfs_rq->h_nr_running += task_delta;
+ 		qcfs_rq->idle_h_nr_running += idle_task_delta;
  
  		/* end evaluation on encountering a throttled cfs_rq */
- 		if (cfs_rq_throttled(cfs_rq))
+ 		if (cfs_rq_throttled(qcfs_rq))
  			goto unthrottle_throttle;
  
  		/*
  		 * One parent has been throttled and cfs_rq removed from the
  		 * list. Add it back to not break the leaf list.
  		 */
- 		if (throttled_hierarchy(cfs_rq))
- 			list_add_leaf_cfs_rq(cfs_rq);
+ 		if (throttled_hierarchy(qcfs_rq))
+ 			list_add_leaf_cfs_rq(qcfs_rq);
  	}
  
  	/* At this point se is NULL and we are at root level*/
@@@ -4949,9 -4964,9 +4993,9 @@@ unthrottle_throttle
  	 * assertion below.
  	 */
  	for_each_sched_entity(se) {
- 		cfs_rq = cfs_rq_of(se);
+ 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
- 		if (list_add_leaf_cfs_rq(cfs_rq))
+ 		if (list_add_leaf_cfs_rq(qcfs_rq))
  			break;
  	}
  
@@@ -5082,7 -5097,7 +5126,7 @@@ static const u64 cfs_bandwidth_slack_pe
  static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
  {
  	struct hrtimer *refresh_timer = &cfs_b->period_timer;
- 	u64 remaining;
+ 	s64 remaining;
  
  	/* if the call-back is running a quota refresh is already occurring */
  	if (hrtimer_callback_running(refresh_timer))
@@@ -5090,7 -5105,7 +5134,7 @@@
  
  	/* is a quota refresh about to occur? */
  	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- 	if (remaining < min_expire)
+ 	if (remaining < (s64)min_expire)
  		return 1;
  
  	return 0;
  }
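The u64-to-s64 change above is the whole fix: hrtimer_expires_remaining() goes negative once the period timer is past due, and comparing that value as u64 silently turns the negative remainder into a huge unsigned number, inverting the test. The pitfall in miniature:

    /* Why runtime_refresh_within() needed s64: a negative remaining
     * time compared as u64 wraps to a huge value. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t  remaining  = -5;  /* timer already expired */
        uint64_t min_expire = 10;

        assert(!((uint64_t)remaining < min_expire)); /* u64: wrongly false */
        assert(remaining < (int64_t)min_expire);     /* s64: correctly true */
        return 0;
    }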
@@@ -5574,6 -5589,9 +5618,9 @@@ enqueue_task_fair(struct rq *rq, struc 
  		cfs_rq->h_nr_running++;
  		cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
+ 		if (cfs_rq_is_idle(cfs_rq))
+ 			idle_h_nr_running = 1;
+ 
  		/* end evaluation on encountering a throttled cfs_rq */
  		if (cfs_rq_throttled(cfs_rq))
  			goto enqueue_throttle;
@@@ -5591,6 -5609,9 +5638,9 @@@
  		cfs_rq->h_nr_running++;
  		cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
+ 		if (cfs_rq_is_idle(cfs_rq))
+ 			idle_h_nr_running = 1;
+ 
  		/* end evaluation on encountering a throttled cfs_rq */
  		if (cfs_rq_throttled(cfs_rq))
  			goto enqueue_throttle;
@@@ -5668,6 -5689,9 +5718,9 @@@ static void dequeue_task_fair(struct r 
  		cfs_rq->h_nr_running--;
  		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
+ 		if (cfs_rq_is_idle(cfs_rq))
+ 			idle_h_nr_running = 1;
+ 
  		/* end evaluation on encountering a throttled cfs_rq */
  		if (cfs_rq_throttled(cfs_rq))
  			goto dequeue_throttle;
@@@ -5697,6 -5721,9 +5750,9 @@@
  		cfs_rq->h_nr_running--;
  		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
+ 		if (cfs_rq_is_idle(cfs_rq))
+ 			idle_h_nr_running = 1;
+ 
  		/* end evaluation on encountering a throttled cfs_rq */
  		if (cfs_rq_throttled(cfs_rq))
  			goto dequeue_throttle;
@@@ -6249,7 -6276,7 +6305,7 @@@ static int select_idle_cpu(struct task_
  		time = cpu_clock(this);
  	}
  
- 	for_each_cpu_wrap(cpu, cpus, target) {
+ 	for_each_cpu_wrap(cpu, cpus, target + 1) {
  		if (has_idle_core) {
  			i = select_idle_core(p, cpu, cpus, &idle_cpu);
  			if ((unsigned int)i < nr_cpumask_bits)
@@@ -6376,6 -6403,7 +6432,7 @@@ static int select_idle_sibling(struct t
  
  	/* Check a recently used CPU as a potential idle candidate: */
  	recent_used_cpu = p->recent_used_cpu;
+ 	p->recent_used_cpu = prev;
  	if (recent_used_cpu != prev &&
  	    recent_used_cpu != target &&
  	    cpus_share_cache(recent_used_cpu, target) &&
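Starting the wrapped scan at target + 1 keeps select_idle_cpu() from re-testing the target CPU first on every pass. The visit order of such a wrapped scan, for a hypothetical 8-CPU mask:

    /* for_each_cpu_wrap(cpu, cpus, start) walks start, start+1, ...,
     * wrapping to 0; starting at target + 1 puts the target last. */
    #include <stdio.h>

    int main(void)
    {
        const int ncpus = 8, target = 3;

        for (int k = 0; k < ncpus; k++)
            printf("%d ", (target + 1 + k) % ncpus); /* 4 5 6 7 0 1 2 3 */
        printf("\n");
        return 0;
    }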
@@@ -6902,9 -6930,6 +6959,6 @@@ select_task_rq_fair(struct task_struct 
  	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
  		/* Fast path */
  		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- 
- 		if (want_affine)
- 			current->recent_used_cpu = cpu;
  	}
  	rcu_read_unlock();
  
@@@ -7041,24 -7066,22 +7095,22 @@@ wakeup_preempt_entity(struct sched_enti
  
  static void set_last_buddy(struct sched_entity *se)
  {
- 	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- 		return;
- 
  	for_each_sched_entity(se) {
  		if (SCHED_WARN_ON(!se->on_rq))
  			return;
+ 		if (se_is_idle(se))
+ 			return;
  		cfs_rq_of(se)->last = se;
  	}
  }
  
  static void set_next_buddy(struct sched_entity *se)
  {
- 	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- 		return;
- 
  	for_each_sched_entity(se) {
  		if (SCHED_WARN_ON(!se->on_rq))
  			return;
+ 		if (se_is_idle(se))
+ 			return;
  		cfs_rq_of(se)->next = se;
  	}
  }
@@@ -7079,6 -7102,7 +7131,7 @@@ static void check_preempt_wakeup(struc 
  	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
  	int scale = cfs_rq->nr_running >= sched_nr_latency;
  	int next_buddy_marked = 0;
+ 	int cse_is_idle, pse_is_idle;
  
  	if (unlikely(se == pse))
  		return;
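The cse_is_idle/pse_is_idle pair declared above feeds the preemption test in the next hunk: an idle entity must not preempt non-idle work, extending to groups what SCHED_IDLE tasks already get. For reference, a single task opts into those semantics with one call:

    /* Demote the calling thread to SCHED_IDLE; such a task is preempted
     * by, and never preempts, normal-policy work. */
    #define _GNU_SOURCE
    #include <sched.h>

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 0 }; /* must be 0 */

        return sched_setscheduler(0, SCHED_IDLE, &sp) ? 1 : 0;
    }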
@@@ -7123,8 -7147,21 +7176,21 @@@
  		return;
  
  	find_matching_se(&se, &pse);
- 	update_curr(cfs_rq_of(se));
  	BUG_ON(!pse);
+ 
+ 	cse_is_idle = se_is_idle(se);
+ 	pse_is_idle = se_is_idle(pse);
+ 
+ 	/*
+ 	 * Preempt an idle group in favor of a non-idle group (and don't preempt
+ 	 * in the inverse case).
+ 	 */
+ 	if (cse_is_idle && !pse_is_idle)
+ 		goto preempt;
+ 	if (cse_is_idle != pse_is_idle)
+ 		return;
+ 
+ 	update_curr(cfs_rq_of(se));
  	if (wakeup_preempt_entity(se, pse) == 1) {
  		/*
  		 * Bias pick_next to pick the sched entity that is
@@@ -10217,9 -10254,11 +10283,11 @@@ static inline int on_null_domain(struc 
  static inline int find_new_ilb(void)
  {
  	int ilb;
+ 	const struct cpumask *hk_mask;
+ 
+ 	hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
  
- 	for_each_cpu_and(ilb, nohz.idle_cpus_mask,
- 			 housekeeping_cpumask(HK_FLAG_MISC)) {
+ 	for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
  
  		if (ilb == smp_processor_id())
  			continue;
@@@ -11416,10 -11455,12 +11484,12 @@@ void init_tg_cfs_entry(struct task_grou
  
  static DEFINE_MUTEX(shares_mutex);
  
- int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
  {
  	int i;
  
+ 	lockdep_assert_held(&shares_mutex);
+ 
  	/*
  	 * We can't change the weight of the root cgroup.
  	 */
@@@ -11428,9 -11469,8 +11498,8 @@@
  	if (!tg->se[0])
  		return -EINVAL;
  
  	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
  
- 	mutex_lock(&shares_mutex);
  	if (tg->shares == shares)
- 		goto done;
+ 		return 0;
  
  	tg->shares = shares;
  	for_each_possible_cpu(i) {
@@@ -11448,10 -11488,88 +11517,88 @@@
  		rq_unlock_irqrestore(rq, &rf);
  	}
  
- done:
+ 	return 0;
+ }
+ 
+ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+ {
+ 	int ret;
+ 
+ 	mutex_lock(&shares_mutex);
+ 	if (tg_is_idle(tg))
+ 		ret = -EINVAL;
+ 	else
+ 		ret = __sched_group_set_shares(tg, shares);
+ 	mutex_unlock(&shares_mutex);
+ 
+ 	return ret;
+ }
+ 
+ int sched_group_set_idle(struct task_group *tg, long idle)
+ {
+ 	int i;
+ 
+ 	if (tg == &root_task_group)
+ 		return -EINVAL;
+ 
+ 	if (idle < 0 || idle > 1)
+ 		return -EINVAL;
+ 
+ 	mutex_lock(&shares_mutex);
+ 
+ 	if (tg->idle == idle) {
+ 		mutex_unlock(&shares_mutex);
+ 		return 0;
+ 	}
+ 
+ 	tg->idle = idle;
+ 
+ 	for_each_possible_cpu(i) {
+ 		struct rq *rq = cpu_rq(i);
+ 		struct sched_entity *se = tg->se[i];
+ 		struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+ 		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+ 		long idle_task_delta;
+ 		struct rq_flags rf;
+ 
+ 		rq_lock_irqsave(rq, &rf);
+ 
+ 		grp_cfs_rq->idle = idle;
+ 		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+ 			goto next_cpu;
+ 
+ 		idle_task_delta = grp_cfs_rq->h_nr_running -
+ 				  grp_cfs_rq->idle_h_nr_running;
+ 		if (!cfs_rq_is_idle(grp_cfs_rq))
+ 			idle_task_delta *= -1;
+ 
+ 		for_each_sched_entity(se) {
+ 			struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ 
+ 			if (!se->on_rq)
+ 				break;
+ 
+ 			cfs_rq->idle_h_nr_running += idle_task_delta;
+ 
+ 			/* Already accounted at parent level and above. */
+ 			if (cfs_rq_is_idle(cfs_rq))
+ 				break;
+ 		}
+ 
+ next_cpu:
+ 		rq_unlock_irqrestore(rq, &rf);
+ 	}
+ 
+ 	/* Idle groups have minimum weight. */
+ 	if (tg_is_idle(tg))
+ 		__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+ 	else
+ 		__sched_group_set_shares(tg, NICE_0_LOAD);
+ 
  	mutex_unlock(&shares_mutex);
  	return 0;
  }
+ 
  #else /* CONFIG_FAIR_GROUP_SCHED */
  
  void free_fair_sched_group(struct task_group *tg) { }
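sched_group_set_idle() pins an idle group's weight to WEIGHT_IDLEPRIO (3) versus NICE_0_LOAD (1024) for a normal group, which is what makes such groups nearly invisible to the fair-share math. The resulting CPU share against a single nice-0 competitor, using those two kernel constants:

    /* CPU share of a SCHED_IDLE group next to one nice-0 group:
     * weights 3 vs 1024, per the kernel's fair-class constants. */
    #include <stdio.h>

    int main(void)
    {
        const double WEIGHT_IDLEPRIO = 3, NICE_0_LOAD = 1024;

        printf("idle group share: %.2f%%\n",
               100.0 * WEIGHT_IDLEPRIO / (WEIGHT_IDLEPRIO + NICE_0_LOAD));
        return 0;  /* ~0.29% */
    }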
diff --combined kernel/sched/sched.h
index 14a41a243f7b,e7e2bba5b520..a9a660c6e08a
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -227,6 -227,8 +227,8 @@@ static inline void update_avg(u64 *avg
   */
  #define SCHED_FLAG_SUGOV	0x10000000
  
+ #define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+ 
  static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
  {
  #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
@@@ -394,6 -396,9 +396,9 @@@ struct task_group 
  	struct cfs_rq		**cfs_rq;
  	unsigned long		shares;
  
+ 	/* A positive value indicates that this is a SCHED_IDLE group. */
+ 	int			idle;
+ 
  #ifdef CONFIG_SMP
  	/*
  	 * load_avg can be heavily contended at clock tick time, so put
@@@ -503,6 -508,8 +508,8 @@@ extern void sched_move_task(struct task
  #ifdef CONFIG_FAIR_GROUP_SCHED
  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
  
+ extern int sched_group_set_idle(struct task_group *tg, long idle);
+ 
  #ifdef CONFIG_SMP
  extern void set_task_rq_fair(struct sched_entity *se,
  			     struct cfs_rq *prev, struct cfs_rq *next);
@@@ -599,6 -606,9 +606,9 @@@ struct cfs_rq 
  	struct list_head	leaf_cfs_rq_list;
  	struct task_group	*tg;	/* group that "owns" this runqueue */
  
+ 	/* Locally cached copy of our task_group's idle value */
+ 	int			idle;
+ 
  #ifdef CONFIG_CFS_BANDWIDTH
  	int			runtime_enabled;
  	s64			runtime_remaining;
@@@ -2234,6 -2244,7 +2244,7 @@@ extern struct task_struct *pick_next_ta
  #define SCA_CHECK		0x01
  #define SCA_MIGRATE_DISABLE	0x02
  #define SCA_MIGRATE_ENABLE	0x04
+ #define SCA_USER		0x08
  
  #ifdef CONFIG_SMP
  
@@@ -2385,6 -2396,21 +2396,21 @@@ extern void check_preempt_curr(struct r
  extern const_debug unsigned int sysctl_sched_nr_migrate;
  extern const_debug unsigned int sysctl_sched_migration_cost;
  
+ #ifdef CONFIG_SCHED_DEBUG
+ extern unsigned int sysctl_sched_latency;
+ extern unsigned int sysctl_sched_min_granularity;
+ extern unsigned int sysctl_sched_wakeup_granularity;
+ extern int sysctl_resched_latency_warn_ms;
+ extern int sysctl_resched_latency_warn_once;
+ 
+ extern unsigned int sysctl_sched_tunable_scaling;
+ 
+ extern unsigned int sysctl_numa_balancing_scan_delay;
+ extern unsigned int sysctl_numa_balancing_scan_period_min;
+ extern unsigned int sysctl_numa_balancing_scan_period_max;
+ extern unsigned int sysctl_numa_balancing_scan_size;
+ #endif
+ 
  #ifdef CONFIG_SCHED_HRTICK
  
  /*
@@@ -2818,27 -2844,20 +2844,27 @@@ static __always_inlin
  unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
  				  struct task_struct *p)
  {
 -	unsigned long min_util;
 -	unsigned long max_util;
 +	unsigned long min_util = 0;
 +	unsigned long max_util = 0;
  
  	if (!static_branch_likely(&sched_uclamp_used))
  		return util;
  
 -	min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
 -	max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
 -
  	if (p) {
 -		min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
 -		max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
 +		min_util = uclamp_eff_value(p, UCLAMP_MIN);
 +		max_util = uclamp_eff_value(p, UCLAMP_MAX);
 +
 +		/*
 +		 * Ignore last runnable task's max clamp, as this task will
 +		 * reset it. Similarly, no need to read the rq's min clamp.
 +		 */
 +		if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
 +			goto out;
  	}
  
 +	min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
 +	max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
 +out:
  	/*
  	 * Since CPU's {min,max}_util clamps are MAX aggregated considering
  	 * RUNNABLE tasks with _different_ clamps, we can end up with an
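uclamp_rq_util_with() boils down to clamping a utilization figure between the max-aggregated minimum and maximum of the runqueue and, optionally, one task; the UCLAMP_FLAG_IDLE early-out merely skips the rq-side values when they are known to be stale. The arithmetic core, as a sketch rather than kernel code:

    /* Core arithmetic of uclamp_rq_util_with(): max-aggregate the rq
     * and task clamps, then clamp the utilization. Sketch only. */
    #include <assert.h>

    static unsigned long max_ul(unsigned long a, unsigned long b)
    {
        return a > b ? a : b;
    }

    static unsigned long clamp_util(unsigned long util,
                                    unsigned long rq_min, unsigned long rq_max,
                                    unsigned long p_min, unsigned long p_max)
    {
        unsigned long lo = max_ul(rq_min, p_min);
        unsigned long hi = max_ul(rq_max, p_max);

        if (util < lo)
            return lo;
        return util > hi ? hi : util;
    }

    int main(void)
    {
        assert(clamp_util(300, 0, 1024, 512, 1024) == 512); /* boosted */
        assert(clamp_util(900, 0, 512, 0, 256) == 512);     /* capped  */
        return 0;
    }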