Merge branch 'for-4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <[email protected]>

Wed, 2 Sep 2015 15:04:23 +0000 (08:04 -0700)

committer Linus Torvalds <[email protected]>

Wed, 2 Sep 2015 15:04:23 +0000 (08:04 -0700)
author Linus Torvalds <[email protected]>
Wed, 2 Sep 2015 15:04:23 +0000 (08:04 -0700)
committer Linus Torvalds <[email protected]>
Wed, 2 Sep 2015 15:04:23 +0000 (08:04 -0700)
diff --combined init/Kconfig

index ba1e6eaf4c36e72bdf29d0b683b9c118fb0b38db,2184b34cbf73f9e1e79bf19222938ab8a4161506..bb9b4dd55889f0605b07ddfc73cc105d72b18908
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -538,6 -538,15 +538,6 @@@ config RCU_STALL_COMMO
   config CONTEXT_TRACKING
          bool
   
- -config RCU_USER_QS
- -      bool
- -      help
- -        This option sets hooks on kernel / userspace boundaries and
- -        puts RCU in extended quiescent state when the CPU runs in
- -        userspace. It means that when a CPU runs in userspace, it is
- -        excluded from the global RCU state machine and thus doesn't
- -        try to keep the timer tick on for RCU.
- -
   config CONTEXT_TRACKING_FORCE
         bool "Force context tracking"
         depends on CONTEXT_TRACKING
@@@ -698,7 -707,6 +698,7 @@@ config RCU_BOOST_DELA
   config RCU_NOCB_CPU
         bool "Offload RCU callback processing from boot-selected CPUs"
         depends on TREE_RCU || PREEMPT_RCU
+ +      depends on RCU_EXPERT || NO_HZ_FULL
         default n
         help
           Use this option to reduce OS jitter for aggressive HPC or
@@@ -947,6 -955,22 +947,22 @@@ config CGROUP_FREEZE
           Provides a way to freeze and unfreeze all tasks in a
           cgroup.
   
+ config CGROUP_PIDS
+       bool "PIDs cgroup subsystem"
+       help
+         Provides enforcement of process number limits in the scope of a
+         cgroup. Any attempt to fork more processes than is allowed in the
+         cgroup will fail. PIDs are fundamentally a global resource because it
+         is fairly trivial to reach PID exhaustion before you reach even a
+         conservative kmemcg limit. As a result, it is possible to grind a
+         system to a halt without being limited by other cgroup policies. The
+         PIDs cgroup subsystem is designed to stop this from happening.
+ 
+         It should be noted that organisational operations (such as attaching
+         to a cgroup hierarchy) will *not* be blocked by the PIDs subsystem,
+         since the PIDs limit only affects a process's ability to fork, not to
+         attach to a cgroup.
+ 
   config CGROUP_DEVICE
         bool "Device controller for cgroups"
         help
diff --combined kernel/cgroup.c

index b89f3168411bc1e9f4f96a42fe5732f3e36c1d41,4ec1b7ee5de8c71ead9916a048ff5b23356db8a2..f3f5cd5e2c0d9ccd8b954e9191cd9169d53c32d7
--- 1/kernel/cgroup.c
--- 2/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@@ -107,8 -107,8 +107,8 @@@ static DEFINE_SPINLOCK(release_agent_pa
   struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
   
   #define cgroup_assert_mutex_or_rcu_locked()                           \
- -      rcu_lockdep_assert(rcu_read_lock_held() ||                      \
- -                         lockdep_is_held(&cgroup_mutex),              \
+ +      RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
+ +                         !lockdep_is_held(&cgroup_mutex),             \
                            "cgroup_mutex or RCU read lock required");
   
   /*
@@@ -145,6 -145,7 +145,7 @@@ static const char *cgroup_subsys_name[
    * part of that cgroup.
    */
   struct cgroup_root cgrp_dfl_root;
+ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
   
   /*
    * The default hierarchy always exists but is hidden until mounted for the
@@@ -186,6 -187,9 +187,9 @@@ static u64 css_serial_nr_next = 1
   static unsigned long have_fork_callback __read_mostly;
   static unsigned long have_exit_callback __read_mostly;
   
+ /* Ditto for the can_fork callback. */
+ static unsigned long have_canfork_callback __read_mostly;
+ 
   static struct cftype cgroup_dfl_base_files[];
   static struct cftype cgroup_legacy_base_files[];
   
@@@ -207,7 -211,7 +211,7 @@@ static int cgroup_idr_alloc(struct idr 
   
         idr_preload(gfp_mask);
         spin_lock_bh(&cgroup_idr_lock);
-       ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+       ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
         spin_unlock_bh(&cgroup_idr_lock);
         idr_preload_end();
         return ret;
@@@ -1027,10 -1031,13 +1031,13 @@@ static const struct file_operations pro
   static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
                               char *buf)
   {
+       struct cgroup_subsys *ss = cft->ss;
+ 
         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
             !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
-                        cft->ss->name, cft->name);
+                        cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+                        cft->name);
         else
                 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
         return buf;
@@@ -1332,9 -1339,10 +1339,10 @@@ static int cgroup_show_options(struct s
         struct cgroup_subsys *ss;
         int ssid;
   
-       for_each_subsys(ss, ssid)
-               if (root->subsys_mask & (1 << ssid))
-                       seq_printf(seq, ",%s", ss->name);
+       if (root != &cgrp_dfl_root)
+               for_each_subsys(ss, ssid)
+                       if (root->subsys_mask & (1 << ssid))
+                               seq_printf(seq, ",%s", ss->legacy_name);
         if (root->flags & CGRP_ROOT_NOPREFIX)
                 seq_puts(seq, ",noprefix");
         if (root->flags & CGRP_ROOT_XATTR)
@@@ -1447,7 -1455,7 +1455,7 @@@ static int parse_cgroupfs_options(char 
                 }
   
                 for_each_subsys(ss, i) {
-                       if (strcmp(token, ss->name))
+                       if (strcmp(token, ss->legacy_name))
                                 continue;
                         if (ss->disabled)
                                 continue;
@@@ -1666,7 -1674,7 +1674,7 @@@ static int cgroup_setup_root(struct cgr
   
         lockdep_assert_held(&cgroup_mutex);
   
-       ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+       ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
         if (ret < 0)
                 goto out;
         root_cgrp->id = ret;
@@@ -4579,7 -4587,7 +4587,7 @@@ static int create_css(struct cgroup *cg
         if (err)
                 goto err_free_css;
   
-       err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+       err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
         if (err < 0)
                 goto err_free_percpu_ref;
         css->id = err;
@@@ -4656,7 -4664,7 +4664,7 @@@ static int cgroup_mkdir(struct kernfs_n
          * Temporarily set the pointer to NULL, so idr_find() won't return
          * a half-baked cgroup.
          */
-       cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+       cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
         if (cgrp->id < 0) {
                 ret = -ENOMEM;
                 goto out_cancel_ref;
@@@ -4955,6 -4963,7 +4963,7 @@@ static void __init cgroup_init_subsys(s
   
         have_fork_callback |= (bool)ss->fork << ss->id;
         have_exit_callback |= (bool)ss->exit << ss->id;
+       have_canfork_callback |= (bool)ss->can_fork << ss->id;
   
         /* At system boot, before all subsystems have been
          * registered, no tasks have been forked, so we don't
@@@ -4993,6 -5002,8 +5002,8 @@@ int __init cgroup_init_early(void
   
                 ss->id = i;
                 ss->name = cgroup_subsys_name[i];
+               if (!ss->legacy_name)
+                       ss->legacy_name = cgroup_subsys_name[i];
   
                 if (ss->early_init)
                         cgroup_init_subsys(ss, true);
@@@ -5136,9 -5147,11 +5147,11 @@@ int proc_cgroup_show(struct seq_file *m
                         continue;
   
                 seq_printf(m, "%d:", root->hierarchy_id);
-               for_each_subsys(ss, ssid)
-                       if (root->subsys_mask & (1 << ssid))
-                               seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+               if (root != &cgrp_dfl_root)
+                       for_each_subsys(ss, ssid)
+                               if (root->subsys_mask & (1 << ssid))
+                                       seq_printf(m, "%s%s", count++ ? "," : "",
+                                                  ss->legacy_name);
                 if (strlen(root->name))
                         seq_printf(m, "%sname=%s", count ? "," : "",
                                    root->name);
@@@ -5178,7 -5191,7 +5191,7 @@@ static int proc_cgroupstats_show(struc
   
         for_each_subsys(ss, i)
                 seq_printf(m, "%s\t%d\t%d\t%d\n",
-                          ss->name, ss->root->hierarchy_id,
+                          ss->legacy_name, ss->root->hierarchy_id,
                            atomic_read(&ss->root->nr_cgrps), !ss->disabled);
   
         mutex_unlock(&cgroup_mutex);
@@@ -5197,6 -5210,19 +5210,19 @@@ static const struct file_operations pro
         .release = single_release,
   };
   
+ static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+ {
+       if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+               return &ss_priv[i - CGROUP_CANFORK_START];
+       return NULL;
+ }
+ 
+ static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+ {
+       void **private = subsys_canfork_priv_p(ss_priv, i);
+       return private ? *private : NULL;
+ }
+ 
   /**
    * cgroup_fork - initialize cgroup related fields during copy_process()
    * @child: pointer to task_struct of forking parent process.
@@@ -5211,6 -5237,57 +5237,57 @@@ void cgroup_fork(struct task_struct *ch
         INIT_LIST_HEAD(&child->cg_list);
   }
   
+ /**
+  * cgroup_can_fork - called on a new task before the process is exposed
+  * @child: the task in question.
+  *
+  * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+  * returns an error, the fork aborts with that error code. This allows for
+  * a cgroup subsystem to conditionally allow or deny new forks.
+  */
+ int cgroup_can_fork(struct task_struct *child,
+                   void *ss_priv[CGROUP_CANFORK_COUNT])
+ {
+       struct cgroup_subsys *ss;
+       int i, j, ret;
+ 
+       for_each_subsys_which(ss, i, &have_canfork_callback) {
+               ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+               if (ret)
+                       goto out_revert;
+       }
+ 
+       return 0;
+ 
+ out_revert:
+       for_each_subsys(ss, j) {
+               if (j >= i)
+                       break;
+               if (ss->cancel_fork)
+                       ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+       }
+ 
+       return ret;
+ }
+ 
+ /**
+  * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+  * @child: the task in question
+  *
+  * This calls the cancel_fork() callbacks if a fork failed *after*
+  * cgroup_can_fork() succeeded.
+  */
+ void cgroup_cancel_fork(struct task_struct *child,
+                       void *ss_priv[CGROUP_CANFORK_COUNT])
+ {
+       struct cgroup_subsys *ss;
+       int i;
+ 
+       for_each_subsys(ss, i)
+               if (ss->cancel_fork)
+                       ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+ }
+ 
   /**
    * cgroup_post_fork - called on a new task after adding it to the task list
    * @child: the task in question
@@@ -5221,7 -5298,8 +5298,8 @@@
    * cgroup_task_iter_start() - to guarantee that the new task ends up on its
    * list.
    */
- void cgroup_post_fork(struct task_struct *child)
+ void cgroup_post_fork(struct task_struct *child,
+                     void *old_ss_priv[CGROUP_CANFORK_COUNT])
   {
         struct cgroup_subsys *ss;
         int i;
@@@ -5266,7 -5344,7 +5344,7 @@@
          * and addition to css_set.
          */
         for_each_subsys_which(ss, i, &have_fork_callback)
-               ss->fork(child);
+               ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
   }
   
   /**
@@@ -5400,12 -5478,14 +5478,14 @@@ static int __init cgroup_disable(char *
                         continue;
   
                 for_each_subsys(ss, i) {
-                       if (!strcmp(token, ss->name)) {
-                               ss->disabled = 1;
-                               printk(KERN_INFO "Disabling %s control group"
-                                       " subsystem\n", ss->name);
-                               break;
-                       }
+                       if (strcmp(token, ss->name) &&
+                           strcmp(token, ss->legacy_name))
+                               continue;
+ 
+                       ss->disabled = 1;
+                       printk(KERN_INFO "Disabling %s control group subsystem\n",
+                              ss->name);
+                       break;
                 }
         }
         return 1;
diff --combined kernel/fork.c

index 2b1a61cddc1954fc6be6d2c8a69c836e6caca33c,40e3af12c55e5c225dda47349890f3c10e1bf68e..03aa2e6de7a4e90696c003792641d2c3a150cd02
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -287,11 -287,6 +287,11 @@@ static void set_max_threads(unsigned in
         max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
   }
   
+ +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ +/* Initialized by the architecture: */
+ +int arch_task_struct_size __read_mostly;
+ +#endif
+ +
   void __init fork_init(void)
   {
   #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@@ -300,7 -295,7 +300,7 @@@
   #endif
         /* create a slab on which task_structs can be allocated */
         task_struct_cachep =
- -              kmem_cache_create("task_struct", sizeof(struct task_struct),
+ +              kmem_cache_create("task_struct", arch_task_struct_size,
                         ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
   #endif
   
@@@ -1072,7 -1067,6 +1072,7 @@@ static int copy_sighand(unsigned long c
         rcu_assign_pointer(tsk->sighand, sig);
         if (!sig)
                 return -ENOMEM;
+ +
         atomic_set(&sig->count, 1);
         memcpy(sig->action, current->sighand->action, sizeof(sig->action));
         return 0;
@@@ -1134,7 -1128,6 +1134,7 @@@ static int copy_signal(unsigned long cl
         init_sigpending(&sig->shared_pending);
         INIT_LIST_HEAD(&sig->posix_timers);
         seqlock_init(&sig->stats_lock);
+ +      prev_cputime_init(&sig->prev_cputime);
   
         hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         sig->real_timer.function = it_real_fn;
@@@ -1246,6 -1239,7 +1246,7 @@@ static struct task_struct *copy_process
   {
         int retval;
         struct task_struct *p;
+       void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
   
         if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                 return ERR_PTR(-EINVAL);
@@@ -1280,9 -1274,10 +1281,9 @@@
   
         /*
          * If the new process will be in a different pid or user namespace
- -       * do not allow it to share a thread group or signal handlers or
- -       * parent with the forking task.
+ +       * do not allow it to share a thread group with the forking task.
          */
- -      if (clone_flags & CLONE_SIGHAND) {
+ +      if (clone_flags & CLONE_THREAD) {
                 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                     (task_active_pid_ns(current) !=
                                 current->nsproxy->pid_ns_for_children))
@@@ -1341,8 -1336,9 +1342,8 @@@
   
         p->utime = p->stime = p->gtime = 0;
         p->utimescaled = p->stimescaled = 0;
- -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- -      p->prev_cputime.utime = p->prev_cputime.stime = 0;
- -#endif
+ +      prev_cputime_init(&p->prev_cputime);
+ +
   #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
         seqlock_init(&p->vtime_seqlock);
         p->vtime_snap = 0;
@@@ -1517,6 -1513,16 +1518,16 @@@
         INIT_LIST_HEAD(&p->thread_group);
         p->task_works = NULL;
   
+       /*
+        * Ensure that the cgroup subsystem policies allow the new process to be
+        * forked. It should be noted that the new process's css_set can be changed
+        * between here and cgroup_post_fork() if an organisation operation is in
+        * progress.
+        */
+       retval = cgroup_can_fork(p, cgrp_ss_priv);
+       if (retval)
+               goto bad_fork_free_pid;
+ 
         /*
          * Make it visible to the rest of the system, but dont wake it up yet.
          * Need tasklist lock for parent etc handling!
@@@ -1553,7 -1559,7 +1564,7 @@@
                 spin_unlock(&current->sighand->siglock);
                 write_unlock_irq(&tasklist_lock);
                 retval = -ERESTARTNOINTR;
-               goto bad_fork_free_pid;
+               goto bad_fork_cancel_cgroup;
         }
   
         if (likely(p->pid)) {
@@@ -1595,7 -1601,7 +1606,7 @@@
         write_unlock_irq(&tasklist_lock);
   
         proc_fork_connector(p);
-       cgroup_post_fork(p);
+       cgroup_post_fork(p, cgrp_ss_priv);
         if (clone_flags & CLONE_THREAD)
                 threadgroup_change_end(current);
         perf_event_fork(p);
@@@ -1605,6 -1611,8 +1616,8 @@@
   
         return p;
   
+ bad_fork_cancel_cgroup:
+       cgroup_cancel_fork(p, cgrp_ss_priv);
   bad_fork_free_pid:
         if (pid != &init_struct_pid)
                 free_pid(pid);
@@@ -1871,21 -1879,13 +1884,21 @@@ static int check_unshare_flags(unsigne
                                 CLONE_NEWUSER|CLONE_NEWPID))
                 return -EINVAL;
         /*
- -       * Not implemented, but pretend it works if there is nothing to
- -       * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
- -       * needs to unshare vm.
+ +       * Not implemented, but pretend it works if there is nothing
+ +       * to unshare.  Note that unsharing the address space or the
+ +       * signal handlers also needs to unshare the signal queues (aka
+ +       * CLONE_THREAD).
          */
         if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
- -              /* FIXME: get_task_mm() increments ->mm_users */
- -              if (atomic_read(&current->mm->mm_users) > 1)
+ +              if (!thread_group_empty(current))
+ +                      return -EINVAL;
+ +      }
+ +      if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
+ +              if (atomic_read(&current->sighand->count) > 1)
+ +                      return -EINVAL;
+ +      }
+ +      if (unshare_flags & CLONE_VM) {
+ +              if (!current_is_single_threaded())
                         return -EINVAL;
         }
   
@@@ -1949,21 -1949,20 +1962,21 @@@ SYSCALL_DEFINE1(unshare, unsigned long
         int err;
   
         /*
- -       * If unsharing a user namespace must also unshare the thread.
+ +       * If unsharing a user namespace must also unshare the thread group
+ +       * and unshare the filesystem root and working directories.
          */
         if (unshare_flags & CLONE_NEWUSER)
                 unshare_flags |= CLONE_THREAD | CLONE_FS;
- -      /*
- -       * If unsharing a thread from a thread group, must also unshare vm.
- -       */
- -      if (unshare_flags & CLONE_THREAD)
- -              unshare_flags |= CLONE_VM;
         /*
          * If unsharing vm, must also unshare signal handlers.
          */
         if (unshare_flags & CLONE_VM)
                 unshare_flags |= CLONE_SIGHAND;
+ +      /*
+ +       * If unsharing signal handlers, must also unshare the signal queues.
+ +       */
+ +      if (unshare_flags & CLONE_SIGHAND)
+ +              unshare_flags |= CLONE_THREAD;
         /*
          * If unsharing namespace, must also unshare filesystem information.
          */
diff --combined kernel/sched/core.c

index 8b864ecee0e187c58a7903058ae1c0fa57b4d0c3,d811652fe6f598633ca9a955b7aace4fe5b923c5..d8420c233ff76268cdff1d1add89a05e1b89d9c2
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -1151,45 -1151,15 +1151,45 @@@ static int migration_cpu_stop(void *dat
         return 0;
   }
   
- -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+ +/*
+ + * sched_class::set_cpus_allowed must do the below, but is not required to
+ + * actually call this function.
+ + */
+ +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
   {
- -      if (p->sched_class->set_cpus_allowed)
- -              p->sched_class->set_cpus_allowed(p, new_mask);
- -
         cpumask_copy(&p->cpus_allowed, new_mask);
         p->nr_cpus_allowed = cpumask_weight(new_mask);
   }
   
+ +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+ +{
+ +      struct rq *rq = task_rq(p);
+ +      bool queued, running;
+ +
+ +      lockdep_assert_held(&p->pi_lock);
+ +
+ +      queued = task_on_rq_queued(p);
+ +      running = task_current(rq, p);
+ +
+ +      if (queued) {
+ +              /*
+ +               * Because __kthread_bind() calls this on blocked tasks without
+ +               * holding rq->lock.
+ +               */
+ +              lockdep_assert_held(&rq->lock);
+ +              dequeue_task(rq, p, 0);
+ +      }
+ +      if (running)
+ +              put_prev_task(rq, p);
+ +
+ +      p->sched_class->set_cpus_allowed(p, new_mask);
+ +
+ +      if (running)
+ +              p->sched_class->set_curr_task(rq);
+ +      if (queued)
+ +              enqueue_task(rq, p, 0);
+ +}
+ +
   /*
    * Change a given task's CPU affinity. Migrate the thread to a
    * proper CPU and schedule it away if the CPU it's executing on
@@@ -1199,8 -1169,7 +1199,8 @@@
    * task must not exit() & deallocate itself prematurely. The
    * call is not atomic; no spinlocks may be held.
    */
- -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+ +static int __set_cpus_allowed_ptr(struct task_struct *p,
+ +                                const struct cpumask *new_mask, bool check)
   {
         unsigned long flags;
         struct rq *rq;
@@@ -1209,15 -1178,6 +1209,15 @@@
   
         rq = task_rq_lock(p, &flags);
   
+ +      /*
+ +       * Must re-check here, to close a race against __kthread_bind(),
+ +       * sched_setaffinity() is not guaranteed to observe the flag.
+ +       */
+ +      if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
         if (cpumask_equal(&p->cpus_allowed, new_mask))
                 goto out;
   
@@@ -1254,11 -1214,6 +1254,11 @@@ out
   
         return ret;
   }
+ +
+ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+ +{
+ +      return __set_cpus_allowed_ptr(p, new_mask, false);
+ +}
   EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
   
   void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@@ -1640,15 -1595,6 +1640,15 @@@ static void update_avg(u64 *avg, u64 sa
         s64 diff = sample - *avg;
         *avg += diff >> 3;
   }
+ +
+ +#else
+ +
+ +static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ +                                       const struct cpumask *new_mask, bool check)
+ +{
+ +      return set_cpus_allowed_ptr(p, new_mask);
+ +}
+ +
   #endif /* CONFIG_SMP */
   
   static void
@@@ -1708,9 -1654,9 +1708,9 @@@ static voi
   ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
   {
         check_preempt_curr(rq, p, wake_flags);
- -      trace_sched_wakeup(p, true);
- -
         p->state = TASK_RUNNING;
+ +      trace_sched_wakeup(p);
+ +
   #ifdef CONFIG_SMP
         if (p->sched_class->task_woken) {
                 /*
@@@ -1928,8 -1874,6 +1928,8 @@@ try_to_wake_up(struct task_struct *p, u
         if (!(p->state & state))
                 goto out;
   
+ +      trace_sched_waking(p);
+ +
         success = 1; /* we're going to change ->state */
         cpu = task_cpu(p);
   
@@@ -2005,8 -1949,6 +2005,8 @@@ static void try_to_wake_up_local(struc
         if (!(p->state & TASK_NORMAL))
                 goto out;
   
+ +      trace_sched_waking(p);
+ +
         if (!task_on_rq_queued(p))
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
   
@@@ -2074,6 -2016,9 +2074,6 @@@ static void __sched_fork(unsigned long 
         p->se.prev_sum_exec_runtime     = 0;
         p->se.nr_migrations             = 0;
         p->se.vruntime                  = 0;
- -#ifdef CONFIG_SMP
- -      p->se.avg.decay_count           = 0;
- -#endif
         INIT_LIST_HEAD(&p->se.group_node);
   
   #ifdef CONFIG_SCHEDSTATS
@@@ -2255,8 -2200,8 +2255,8 @@@ unsigned long to_ratio(u64 period, u64 
   #ifdef CONFIG_SMP
   inline struct dl_bw *dl_bw_of(int i)
   {
- -      rcu_lockdep_assert(rcu_read_lock_sched_held(),
- -                         "sched RCU must be held");
+ +      RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ +                       "sched RCU must be held");
         return &cpu_rq(i)->rd->dl_bw;
   }
   
@@@ -2265,8 -2210,8 +2265,8 @@@ static inline int dl_bw_cpus(int i
         struct root_domain *rd = cpu_rq(i)->rd;
         int cpus = 0;
   
- -      rcu_lockdep_assert(rcu_read_lock_sched_held(),
- -                         "sched RCU must be held");
+ +      RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ +                       "sched RCU must be held");
         for_each_cpu_and(i, rd->span, cpu_active_mask)
                 cpus++;
   
@@@ -2358,11 -2303,11 +2358,11 @@@ void wake_up_new_task(struct task_struc
   #endif
   
         /* Initialize new task's runnable average */
- -      init_task_runnable_average(p);
+ +      init_entity_runnable_average(&p->se);
         rq = __task_rq_lock(p);
         activate_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_QUEUED;
- -      trace_sched_wakeup_new(p, true);
+ +      trace_sched_wakeup_new(p);
         check_preempt_curr(rq, p, WF_FORK);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_woken)
@@@ -2524,6 -2469,7 +2524,6 @@@ static struct rq *finish_task_switch(st
          */
         prev_state = prev->state;
         vtime_task_switch(prev);
- -      finish_arch_switch(prev);
         perf_event_task_sched_in(prev, current);
         finish_lock_switch(rq, prev);
         finish_arch_post_lock_switch();
@@@ -2543,7 -2489,7 +2543,7 @@@
                 put_task_struct(prev);
         }
   
- -      tick_nohz_task_switch(current);
+ +      tick_nohz_task_switch();
         return rq;
   }
   
@@@ -4394,7 -4340,7 +4394,7 @@@ long sched_setaffinity(pid_t pid, cons
         }
   #endif
   again:
- -      retval = set_cpus_allowed_ptr(p, new_mask);
+ +      retval = __set_cpus_allowed_ptr(p, new_mask, true);
   
         if (!retval) {
                 cpuset_cpus_allowed(p, cpus_allowed);
@@@ -4546,7 -4492,7 +4546,7 @@@ SYSCALL_DEFINE0(sched_yield
   
   int __sched _cond_resched(void)
   {
- -      if (should_resched()) {
+ +      if (should_resched(0)) {
                 preempt_schedule_common();
                 return 1;
         }
@@@ -4564,7 -4510,7 +4564,7 @@@ EXPORT_SYMBOL(_cond_resched)
    */
   int __cond_resched_lock(spinlock_t *lock)
   {
- -      int resched = should_resched();
+ +      int resched = should_resched(PREEMPT_LOCK_OFFSET);
         int ret = 0;
   
         lockdep_assert_held(lock);
@@@ -4586,7 -4532,7 +4586,7 @@@ int __sched __cond_resched_softirq(void
   {
         BUG_ON(!in_softirq());
   
- -      if (should_resched()) {
+ +      if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
                 local_bh_enable();
                 preempt_schedule_common();
                 local_bh_disable();
@@@ -4919,8 -4865,7 +4919,8 @@@ void init_idle(struct task_struct *idle
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
   
- -      raw_spin_lock_irqsave(&rq->lock, flags);
+ +      raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ +      raw_spin_lock(&rq->lock);
   
         __sched_fork(0, idle);
         idle->state = TASK_RUNNING;
@@@ -4946,8 -4891,7 +4946,8 @@@
   #if defined(CONFIG_SMP)
         idle->on_cpu = 1;
   #endif
- -      raw_spin_unlock_irqrestore(&rq->lock, flags);
+ +      raw_spin_unlock(&rq->lock);
+ +      raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
   
         /* Set the preempt count _outside_ the spinlocks! */
         init_idle_preempt_count(idle, cpu);
@@@ -5367,7 -5311,8 +5367,7 @@@ static void register_sched_domain_sysct
   /* may be called multiple times per register */
   static void unregister_sched_domain_sysctl(void)
   {
- -      if (sd_sysctl_header)
- -              unregister_sysctl_table(sd_sysctl_header);
+ +      unregister_sysctl_table(sd_sysctl_header);
         sd_sysctl_header = NULL;
         if (sd_ctl_dir[0].child)
                 sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@@ -5488,14 -5433,6 +5488,14 @@@ static int sched_cpu_active(struct noti
         case CPU_STARTING:
                 set_cpu_rq_start_time();
                 return NOTIFY_OK;
+ +      case CPU_ONLINE:
+ +              /*
+ +               * At this point a starting CPU has marked itself as online via
+ +               * set_cpu_online(). But it might not yet have marked itself
+ +               * as active, which is essential from here on.
+ +               *
+ +               * Thus, fall-through and help the starting CPU along.
+ +               */
         case CPU_DOWN_FAILED:
                 set_cpu_active((long)hcpu, true);
                 return NOTIFY_OK;
@@@ -6508,10 -6445,8 +6508,10 @@@ static void init_numa_topology_type(voi
   
         n = sched_max_numa_distance;
   
- -      if (n <= 1)
+ +      if (sched_domains_numa_levels <= 1) {
                 sched_numa_topology_type = NUMA_DIRECT;
+ +              return;
+ +      }
   
         for_each_online_node(a) {
                 for_each_online_node(b) {
@@@ -8133,7 -8068,7 +8133,7 @@@ static void cpu_cgroup_css_offline(stru
         sched_offline_group(tg);
   }
   
- static void cpu_cgroup_fork(struct task_struct *task)
+ static void cpu_cgroup_fork(struct task_struct *task, void *private)
   {
         sched_move_task(task);
   }
author	Linus Torvalds <[email protected]>
	Wed, 2 Sep 2015 15:04:23 +0000 (08:04 -0700)
committer	Linus Torvalds <[email protected]>
	Wed, 2 Sep 2015 15:04:23 +0000 (08:04 -0700)
		1	2
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history