goto repeat;
}
+ /*
+ * Note that if this function returns a valid task_struct pointer (!NULL)
+ * task->usage must remain >0 for the duration of the RCU critical section.
+ */
+ struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+ {
+ struct sighand_struct *sighand;
+ struct task_struct *task;
+
+ /*
+ * We need to verify that release_task() was not called and thus
+ * delayed_put_task_struct() can't run and drop the last reference
+ * before rcu_read_unlock(). We check task->sighand != NULL,
+ * but we can read the already freed and reused memory.
+ */
+ retry:
+ task = rcu_dereference(*ptask);
+ if (!task)
+ return NULL;
+
+ probe_kernel_address(&task->sighand, sighand);
+
+ /*
+ * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+ * was already freed we can not miss the preceding update of this
+ * pointer.
+ */
+ smp_rmb();
+ if (unlikely(task != READ_ONCE(*ptask)))
+ goto retry;
+
+ /*
+ * We've re-checked that "task == *ptask", now we have two different
+ * cases:
+ *
+ * 1. This is actually the same task/task_struct. In this case
+ * sighand != NULL tells us it is still alive.
+ *
+ * 2. This is another task which got the same memory for task_struct.
+ * We can't know this of course, and we can not trust
+ * sighand != NULL.
+ *
+ * In this case we actually return a random value, but this is
+ * correct.
+ *
+ * If we return NULL - we can pretend that we actually noticed that
+ * *ptask was updated when the previous task has exited. Or pretend
+ * that probe_slab_address(&sighand) reads NULL.
+ *
+ * If we return the new task (because sighand is not NULL for any
+ * reason) - this is fine too. This (new) task can't go away before
+ * another gp pass.
+ *
+ * And note: We could even eliminate the false positive if re-read
+ * task->sighand once again to avoid the falsely NULL. But this case
+ * is very unlikely so we don't care.
+ */
+ if (!sighand)
+ return NULL;
+
+ return task;
+ }
+
+ struct task_struct *try_get_task_struct(struct task_struct **ptask)
+ {
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = task_rcu_dereference(ptask);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+
+ return task;
+ }
+
/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
exit_signals(tsk); /* sets PF_EXITING */
/*
- * tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
+ * Ensure that all new tsk->pi_lock acquisitions must observe
+ * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
*/
smp_mb();
+ /*
+ * Ensure that we must observe the pi_state in exit_mm() ->
+ * mm_release() -> exit_pi_state_list().
+ */
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic())) {
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_acquire(!X->on_cpu)
+ * 2) smp_cond_load_acquire(!X->on_cpu)
*
* Example:
*
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
- * smp_cond_acquire(!X->on_cpu);
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
* X->state = WAKING
* set_task_cpu(X,2)
*
* This means that any means of doing remote wakeups must order the CPU doing
* the wakeup against the CPU the task is going to end up running on. This,
* however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
*
*/
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_cond_acquire(!p->on_cpu);
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
struct rq_flags rf;
struct rq *rq;
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
/*
* Since this CPU is going 'away' for a while, fold any nr_active delta
* we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable.
+ * nr_active count is stable. We need to take the teardown thread which
+ * is calling this into account, so we hand in adjust = 1 to the load
+ * calculation.
*
* Also see the comment "Global load-average calculations".
*/
static void calc_load_migrate(struct rq *rq)
{
- long delta = calc_load_fold_active(rq);
+ long delta = calc_load_fold_active(rq, 1);
if (delta)
atomic_long_add(delta, &calc_load_tasks);
}
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
update_max_interval();
}
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
}
/* rcu callback to free various structures associated with a task group */
spin_unlock_irqrestore(&task_group_lock, flags);
}
- /* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
- void sched_move_task(struct task_struct *tsk)
+ static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &rf);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+ }
+
+ /*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+ void sched_move_task(struct task_struct *tsk)
+ {
+ int queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
sched_free_group(tg);
}
+ /*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if its
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
-extern long calc_load_fold_active(struct rq *this_rq);
+extern long calc_load_fold_active(struct rq *this_rq, long adjust);
#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+ extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
- int throttled, throttle_count, throttle_uptodate;
+ int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the smp_cond_acquire() in try_to_wake_up().
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
void (*update_curr) (struct rq *rq);
+ #define TASK_SET_GROUP 0
+ #define TASK_MOVE_GROUP 1
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_change_group) (struct task_struct *p, int type);
#endif
};
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
-
- static inline void account_reset_rq(struct rq *rq)
- {
- #ifdef CONFIG_IRQ_TIME_ACCOUNTING
- rq->prev_irq_time = 0;
- #endif
- #ifdef CONFIG_PARAVIRT
- rq->prev_steal_time = 0;
- #endif
- #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- rq->prev_steal_time_rq = 0;
- #endif
- }