Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <[email protected]>

Mon, 25 Jul 2016 20:59:34 +0000 (13:59 -0700)

committer Linus Torvalds <[email protected]>

Mon, 25 Jul 2016 20:59:34 +0000 (13:59 -0700)
author Linus Torvalds <[email protected]>
Mon, 25 Jul 2016 20:59:34 +0000 (13:59 -0700)
committer Linus Torvalds <[email protected]>
Mon, 25 Jul 2016 20:59:34 +0000 (13:59 -0700)
diff --combined kernel/exit.c

index 0b40791b9e70259b50458b4c67e1d3325a7a4b6b,2fb4d44c51b1f9c06346d6885d056406dc7a6079..84ae830234f8fea6328690ebf977a7063aa91097
--- 1/kernel/exit.c
--- 2/kernel/exit.c
+++ b/kernel/exit.c
@@@ -210,6 -210,82 +210,82 @@@ repeat
                 goto repeat;
   }
   
+ /*
+  * Note that if this function returns a valid task_struct pointer (!NULL)
+  * task->usage must remain >0 for the duration of the RCU critical section.
+  */
+ struct task_struct *task_rcu_dereference(struct task_struct **ptask)
+ {
+       struct sighand_struct *sighand;
+       struct task_struct *task;
+ 
+       /*
+        * We need to verify that release_task() was not called and thus
+        * delayed_put_task_struct() can't run and drop the last reference
+        * before rcu_read_unlock(). We check task->sighand != NULL,
+        * but we can read the already freed and reused memory.
+        */
+ retry:
+       task = rcu_dereference(*ptask);
+       if (!task)
+               return NULL;
+ 
+       probe_kernel_address(&task->sighand, sighand);
+ 
+       /*
+        * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+        * was already freed we can not miss the preceding update of this
+        * pointer.
+        */
+       smp_rmb();
+       if (unlikely(task != READ_ONCE(*ptask)))
+               goto retry;
+ 
+       /*
+        * We've re-checked that "task == *ptask", now we have two different
+        * cases:
+        *
+        * 1. This is actually the same task/task_struct. In this case
+        *    sighand != NULL tells us it is still alive.
+        *
+        * 2. This is another task which got the same memory for task_struct.
+        *    We can't know this of course, and we can not trust
+        *    sighand != NULL.
+        *
+        *    In this case we actually return a random value, but this is
+        *    correct.
+        *
+        *    If we return NULL - we can pretend that we actually noticed that
+        *    *ptask was updated when the previous task has exited. Or pretend
+        *    that probe_kernel_address(&task->sighand, sighand) reads NULL.
+        *
+        *    If we return the new task (because sighand is not NULL for any
+        *    reason) - this is fine too. This (new) task can't go away before
+        *    another gp pass.
+        *
+        *    And note: We could even eliminate the false positive if we re-read
+        *    task->sighand once again to avoid the false NULL. But this case
+        *    is very unlikely so we don't care.
+        */
+       if (!sighand)
+               return NULL;
+ 
+       return task;
+ }
+ 
+ struct task_struct *try_get_task_struct(struct task_struct **ptask)
+ {
+       struct task_struct *task;
+ 
+       rcu_read_lock();
+       task = task_rcu_dereference(ptask);
+       if (task)
+               get_task_struct(task);
+       rcu_read_unlock();
+ 
+       return task;
+ }
+ 
   /*
    * Determine if a process group is "orphaned", according to the POSIX
    * definition in 2.2.2.52.  Orphaned process groups are not to be affected
@@@ -700,14 -776,10 +776,14 @@@ void do_exit(long code
   
         exit_signals(tsk);  /* sets PF_EXITING */
         /*
- -       * tsk->flags are checked in the futex code to protect against
- -       * an exiting task cleaning up the robust pi futexes.
+ +       * Ensure that all new tsk->pi_lock acquisitions must observe
+ +       * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
          */
         smp_mb();
+ +      /*
+ +       * Ensure that we must observe the pi_state in exit_mm() ->
+ +       * mm_release() -> exit_pi_state_list().
+ +       */
         raw_spin_unlock_wait(&tsk->pi_lock);
   
         if (unlikely(in_atomic())) {
diff --combined kernel/sched/core.c

index af0ef74df23c657563ba567fff72c23936228880,4e9617a7e7d95718fb58aecc1175013a7aae8b2d..5c883fe8e44016df1109e8f66dd73377dfecb5e9
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -1937,7 -1937,7 +1937,7 @@@ static void ttwu_queue(struct task_stru
    * chain to provide order. Instead we do:
    *
    *   1) smp_store_release(X->on_cpu, 0)
- - *   2) smp_cond_acquire(!X->on_cpu)
+ + *   2) smp_cond_load_acquire(&X->on_cpu, !VAL)
    *
    * Example:
    *
@@@ -1948,7 -1948,7 +1948,7 @@@
    *   sched-out X
    *   smp_store_release(X->on_cpu, 0);
    *
- - *                    smp_cond_acquire(!X->on_cpu);
+ + *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
    *                    X->state = WAKING
    *                    set_task_cpu(X,2)
    *
@@@ -1974,7 -1974,7 +1974,7 @@@
    * This means that any means of doing remote wakeups must order the CPU doing
    * the wakeup against the CPU the task is going to end up running on. This,
    * however, is already required for the regular Program-Order guarantee above,
- - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ + * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
    *
    */
   
@@@ -2047,7 -2047,7 +2047,7 @@@ try_to_wake_up(struct task_struct *p, u
          * This ensures that tasks getting woken will be fully ordered against
          * their previous state and preserve Program Order.
          */
- -      smp_cond_acquire(!p->on_cpu);
+ +      smp_cond_load_acquire(&p->on_cpu, !VAL);
   
         p->sched_contributes_to_load = !!task_contributes_to_load(p);
         p->state = TASK_WAKING;
@@@ -2342,11 -2342,11 +2342,11 @@@ int sched_fork(unsigned long clone_flag
   
         __sched_fork(clone_flags, p);
         /*
-        * We mark the process as running here. This guarantees that
+        * We mark the process as NEW here. This guarantees that
          * nobody will actually run it, and a signal or other external
          * event cannot wake it up and insert it on the runqueue either.
          */
-       p->state = TASK_RUNNING;
+       p->state = TASK_NEW;
   
         /*
          * Make sure we do not leak PI boosting priority to the child.
@@@ -2383,8 -2383,7 +2383,7 @@@
                 p->sched_class = &fair_sched_class;
         }
   
-       if (p->sched_class->task_fork)
-               p->sched_class->task_fork(p);
+       init_entity_runnable_average(&p->se);
   
         /*
          * The child is not yet in the pid-hash so no cgroup attach races,
@@@ -2394,7 -2393,13 +2393,13 @@@
          * Silence PROVE_RCU.
          */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
-       set_task_cpu(p, cpu);
+       /*
+        * We're setting the cpu for the first time, we don't migrate,
+        * so use __set_task_cpu().
+        */
+       __set_task_cpu(p, cpu);
+       if (p->sched_class->task_fork)
+               p->sched_class->task_fork(p);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
   #ifdef CONFIG_SCHED_INFO
@@@ -2526,16 -2531,18 +2531,18 @@@ void wake_up_new_task(struct task_struc
         struct rq_flags rf;
         struct rq *rq;
   
-       /* Initialize new task's runnable average */
-       init_entity_runnable_average(&p->se);
         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+       p->state = TASK_RUNNING;
   #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
          *  - cpus_allowed can change in the fork path
          *  - any previously selected cpu might disappear through hotplug
+        *
+        * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+        * as we're not fully set-up yet.
          */
-       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+       __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
   #endif
         rq = __task_rq_lock(p, &rf);
         post_init_entity_util_avg(&p->se);
@@@ -3161,6 -3168,9 +3168,9 @@@ static noinline void __schedule_bug(str
                 pr_cont("\n");
         }
   #endif
+       if (panic_on_warn)
+               panic("scheduling while atomic\n");
+ 
         dump_stack();
         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
   }
@@@ -4752,7 -4762,8 +4762,8 @@@ out_unlock
    * @len: length in bytes of the bitmask pointed to by user_mask_ptr
    * @user_mask_ptr: user-space pointer to hold the current cpu mask
    *
-  * Return: 0 on success. An error code otherwise.
+  * Return: size of CPU mask copied to user_mask_ptr on success. An
+  * error code otherwise.
    */
   SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
                 unsigned long __user *, user_mask_ptr)
@@@ -5394,15 -5405,13 +5405,15 @@@ void idle_task_exit(void
   /*
    * Since this CPU is going 'away' for a while, fold any nr_active delta
    * we might have. Assumes we're called after migrate_tasks() so that the
- - * nr_active count is stable.
+ + * nr_active count is stable. We need to take the teardown thread which
+ + * is calling this into account, so we hand in adjust = 1 to the load
+ + * calculation.
    *
    * Also see the comment "Global load-average calculations".
    */
   static void calc_load_migrate(struct rq *rq)
   {
- -      long delta = calc_load_fold_active(rq);
+ +      long delta = calc_load_fold_active(rq, 1);
         if (delta)
                 atomic_long_add(delta, &calc_load_tasks);
   }
@@@ -7233,7 -7242,6 +7244,6 @@@ static void sched_rq_cpu_starting(unsig
         struct rq *rq = cpu_rq(cpu);
   
         rq->calc_load_update = calc_load_update;
-       account_reset_rq(rq);
         update_max_interval();
   }
   
@@@ -7713,6 -7721,8 +7723,8 @@@ void sched_online_group(struct task_gro
         INIT_LIST_HEAD(&tg->children);
         list_add_rcu(&tg->siblings, &parent->children);
         spin_unlock_irqrestore(&task_group_lock, flags);
+ 
+       online_fair_sched_group(tg);
   }
   
   /* rcu callback to free various structures associated with a task group */
@@@ -7741,27 -7751,9 +7753,9 @@@ void sched_offline_group(struct task_gr
         spin_unlock_irqrestore(&task_group_lock, flags);
   }
   
- /* change task's runqueue when it moves between groups.
-  *    The caller of this function should have put the task in its new group
-  *    by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
-  *    reflect its new group.
-  */
- void sched_move_task(struct task_struct *tsk)
+ static void sched_change_group(struct task_struct *tsk, int type)
   {
         struct task_group *tg;
-       int queued, running;
-       struct rq_flags rf;
-       struct rq *rq;
- 
-       rq = task_rq_lock(tsk, &rf);
- 
-       running = task_current(rq, tsk);
-       queued = task_on_rq_queued(tsk);
- 
-       if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
-       if (unlikely(running))
-               put_prev_task(rq, tsk);
   
         /*
          * All callers are synchronized by task_rq_lock(); we do not use RCU
@@@ -7774,11 -7766,37 +7768,37 @@@
         tsk->sched_task_group = tg;
   
   #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk);
+       if (tsk->sched_class->task_change_group)
+               tsk->sched_class->task_change_group(tsk, type);
         else
   #endif
                 set_task_rq(tsk, task_cpu(tsk));
+ }
+ 
+ /*
+  * Change task's runqueue when it moves between groups.
+  *
+  * The caller of this function should have put the task in its new group by
+  * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+  * its new group.
+  */
+ void sched_move_task(struct task_struct *tsk)
+ {
+       int queued, running;
+       struct rq_flags rf;
+       struct rq *rq;
+ 
+       rq = task_rq_lock(tsk, &rf);
+ 
+       running = task_current(rq, tsk);
+       queued = task_on_rq_queued(tsk);
+ 
+       if (queued)
+               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+       if (unlikely(running))
+               put_prev_task(rq, tsk);
+ 
+       sched_change_group(tsk, TASK_MOVE_GROUP);
   
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
@@@ -8206,15 -8224,27 +8226,27 @@@ static void cpu_cgroup_css_free(struct 
         sched_free_group(tg);
   }
   
+ /*
+  * This is called before wake_up_new_task(), therefore we really only
+  * have to set its group bits, all the other stuff does not apply.
+  */
   static void cpu_cgroup_fork(struct task_struct *task)
   {
-       sched_move_task(task);
+       struct rq_flags rf;
+       struct rq *rq;
+ 
+       rq = task_rq_lock(task, &rf);
+ 
+       sched_change_group(task, TASK_SET_GROUP);
+ 
+       task_rq_unlock(rq, task, &rf);
   }
   
   static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
   {
         struct task_struct *task;
         struct cgroup_subsys_state *css;
+       int ret = 0;
   
         cgroup_taskset_for_each(task, css, tset) {
   #ifdef CONFIG_RT_GROUP_SCHED
@@@ -8225,8 -8255,24 +8257,24 @@@
                 if (task->sched_class != &fair_sched_class)
                         return -EINVAL;
   #endif
+               /*
+                * Serialize against wake_up_new_task() such that if it's
+                * running, we're sure to observe its full state.
+                */
+               raw_spin_lock_irq(&task->pi_lock);
+               /*
+                * Avoid calling sched_move_task() before wake_up_new_task()
+                * has happened. This would lead to problems with PELT, due to
+                * move wanting to detach+attach while we're not attached yet.
+                */
+               if (task->state == TASK_NEW)
+                       ret = -EINVAL;
+               raw_spin_unlock_irq(&task->pi_lock);
+ 
+               if (ret)
+                       break;
         }
-       return 0;
+       return ret;
   }
   
   static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --combined kernel/sched/sched.h

index 81283592942bf8f9954b65e5364fbcf03646da59,f44da95c70cde1360cc8292ed8a61eed66fa0396..c64fc5114004f6a893a1bf942a49f3df1d6c8fea
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -28,7 -28,7 +28,7 @@@ extern unsigned long calc_load_update
   extern atomic_long_t calc_load_tasks;
   
   extern void calc_global_load_tick(struct rq *this_rq);
- -extern long calc_load_fold_active(struct rq *this_rq);
+ +extern long calc_load_fold_active(struct rq *this_rq, long adjust);
   
   #ifdef CONFIG_SMP
   extern void cpu_load_update_active(struct rq *this_rq);
@@@ -321,6 -321,7 +321,7 @@@ extern int tg_nop(struct task_group *tg
   
   extern void free_fair_sched_group(struct task_group *tg);
   extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+ extern void online_fair_sched_group(struct task_group *tg);
   extern void unregister_fair_sched_group(struct task_group *tg);
   extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                         struct sched_entity *se, int cpu,
@@@ -437,7 -438,7 +438,7 @@@ struct cfs_rq 
   
         u64 throttled_clock, throttled_clock_task;
         u64 throttled_clock_task_time;
-       int throttled, throttle_count, throttle_uptodate;
+       int throttled, throttle_count;
         struct list_head throttled_list;
   #endif /* CONFIG_CFS_BANDWIDTH */
   #endif /* CONFIG_FAIR_GROUP_SCHED */
@@@ -1113,7 -1114,7 +1114,7 @@@ static inline void finish_lock_switch(s
          * In particular, the load of prev->state in finish_task_switch() must
          * happen before this.
          *
- -       * Pairs with the smp_cond_acquire() in try_to_wake_up().
+ +       * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
          */
         smp_store_release(&prev->on_cpu, 0);
   #endif
@@@ -1246,8 -1247,11 +1247,11 @@@ struct sched_class 
   
         void (*update_curr) (struct rq *rq);
   
+ #define TASK_SET_GROUP  0
+ #define TASK_MOVE_GROUP       1
+ 
   #ifdef CONFIG_FAIR_GROUP_SCHED
-       void (*task_move_group) (struct task_struct *p);
+       void (*task_change_group) (struct task_struct *p, int type);
   #endif
   };
   
@@@ -1809,16 -1813,3 +1813,3 @@@ static inline void cpufreq_trigger_upda
   #else /* arch_scale_freq_capacity */
   #define arch_scale_freq_invariant()   (false)
   #endif
- 
- static inline void account_reset_rq(struct rq *rq)
- {
- #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-       rq->prev_irq_time = 0;
- #endif
- #ifdef CONFIG_PARAVIRT
-       rq->prev_steal_time = 0;
- #endif
- #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       rq->prev_steal_time_rq = 0;
- #endif
- }
author	Linus Torvalds <[email protected]>
	Mon, 25 Jul 2016 20:59:34 +0000 (13:59 -0700)
committer	Linus Torvalds <[email protected]>
	Mon, 25 Jul 2016 20:59:34 +0000 (13:59 -0700)
		1	2
kernel/exit.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history