sched/fair: Prevent dead task groups from regaining cfs_rq's

[linux.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index cec173a5fc5e84b755ce10586dc6eb8bdaa95646..862af1db22ab3b09e26778da56bd49da1488c056 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9719,6 +9719,22 @@ static void sched_free_group(struct task_group *tg)
         kmem_cache_free(task_group_cache, tg);
  }
  
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+       sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+       unregister_fair_sched_group(tg);
+       unregister_rt_sched_group(tg);
+       /*
+        * We have to wait for yet another RCU grace period to expire, as
+        * print_cfs_stats() might run concurrently.
+        */
+       call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
  /* allocate runqueue etc for a new task group */
  struct task_group *sched_create_group(struct task_group *parent)
  {
@@ -9762,25 +9778,35 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
  }
  
  /* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
  {
         /* Now it should be safe to free those cfs_rqs: */
-       sched_free_group(container_of(rhp, struct task_group, rcu));
+       sched_unregister_group(container_of(rhp, struct task_group, rcu));
  }
  
  void sched_destroy_group(struct task_group *tg)
  {
         /* Wait for possible concurrent references to cfs_rqs complete: */
-       call_rcu(&tg->rcu, sched_free_group_rcu);
+       call_rcu(&tg->rcu, sched_unregister_group_rcu);
  }
  
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
  {
         unsigned long flags;
  
-       /* End participation in shares distribution: */
-       unregister_fair_sched_group(tg);
-
+       /*
+        * Unlink first, to prevent walk_tg_tree_from() from finding us (via
+        * sched_cfs_period_timer()).
+        *
+        * For this to be effective, we have to wait for all pending users of
+        * this task group to leave their RCU critical section to ensure no new
+        * user will see our dying task group any more. Specifically ensure
+        * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+        *
+        * We therefore defer calling unregister_fair_sched_group() to
+        * sched_unregister_group() which is guaranteed to get called only after the
+        * current RCU grace period has expired.
+        */
         spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
@@ -9899,7 +9925,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
  {
         struct task_group *tg = css_tg(css);
  
-       sched_offline_group(tg);
+       sched_release_group(tg);
  }
  
  static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -9909,7 +9935,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
         /*
          * Relies on the RCU grace period between css_released() and this.
          */
-       sched_free_group(tg);
+       sched_unregister_group(tg);
  }
  
  /*