Merge branch 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author	Linus Torvalds <[email protected]>
	Thu, 7 Sep 2017 05:25:25 +0000 (22:25 -0700)
committer	Linus Torvalds <[email protected]>
	Thu, 7 Sep 2017 05:25:25 +0000 (22:25 -0700)
Pull cgroup updates from Tejun Heo:
 "Several notable changes this cycle:

   - Thread mode was merged. This will be used for cgroup2 support for
     CPU and possibly other controllers. Unfortunately, CPU controller
     cgroup2 support didn't make this pull request but most contentions
     have been resolved and the support is likely to be merged before
     the next merge window.

   - cgroup.stat now shows the number of descendant cgroups.

   - cpuset now can enable the easier-to-configure v2 behavior on v1
     hierarchy"

* 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits)
  cpuset: Allow v2 behavior in v1 cgroup
  cgroup: Add mount flag to enable cpuset to use v2 behavior in v1 cgroup
  cgroup: remove unneeded checks
  cgroup: misc changes
  cgroup: short-circuit cset_cgroup_from_root() on the default hierarchy
  cgroup: re-use the parent pointer in cgroup_destroy_locked()
  cgroup: add cgroup.stat interface with basic hierarchy stats
  cgroup: implement hierarchy limits
  cgroup: keep track of number of descent cgroups
  cgroup: add comment to cgroup_enable_threaded()
  cgroup: remove unnecessary empty check when enabling threaded mode
  cgroup: update debug controller to print out thread mode information
  cgroup: implement cgroup v2 thread support
  cgroup: implement CSS_TASK_ITER_THREADED
  cgroup: introduce cgroup->dom_cgrp and threaded css_set handling
  cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS
  cgroup: reorganize cgroup.procs / task write path
  cgroup: replace css_set walking populated test with testing cgrp->nr_populated_csets
  cgroup: distinguish local and children populated states
  cgroup: remove now unused list_head @pending in cgroup_apply_cftypes()
  ...

kernel/cgroup/cgroup-internal.h
kernel/cgroup/cgroup.c
kernel/cgroup/cpuset.c
kernel/events/core.c
mm/memcontrol.c

diff --combined kernel/cgroup/cgroup-internal.h
index 8b4c3c2f2509d2b49666410b011bf604926194f3,c167a40278e63b86f0e04f22167e3280751a28d0..5151ff256c2945ec3a6b274b4eae7c2221d81a98
@@@ -33,9 -33,6 +33,9 @@@ struct cgroup_taskset 
        struct list_head        src_csets;
        struct list_head        dst_csets;
  
 +      /* the number of tasks in the set */
 +      int                     nr_tasks;
 +
        /* the subsys currently being processed */
        int                     ssid;
  
@@@ -156,6 -153,8 +156,8 @@@ static inline void get_css_set(struct c
  
  bool cgroup_ssid_enabled(int ssid);
  bool cgroup_on_dfl(const struct cgroup *cgrp);
+ bool cgroup_is_thread_root(struct cgroup *cgrp);
+ bool cgroup_is_threaded(struct cgroup *cgrp);
  
  struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
  struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@@ -173,7 -172,7 +175,7 @@@ struct dentry *cgroup_do_mount(struct f
                               struct cgroup_root *root, unsigned long magic,
                               struct cgroup_namespace *ns);
  
- bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
  void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
  void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
                            struct cgroup_mgctx *mgctx);
@@@ -183,10 -182,10 +185,10 @@@ int cgroup_migrate(struct task_struct *
  
  int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup);
- ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
-                            size_t nbytes, loff_t off, bool threadgroup);
- ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
-                          loff_t off);
+ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+       __acquires(&cgroup_threadgroup_rwsem);
+ void cgroup_procs_write_finish(struct task_struct *task)
+       __releases(&cgroup_threadgroup_rwsem);
  
  void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
  
diff --combined kernel/cgroup/cgroup.c
index f64fc967a9efd94f65e7fcf73c0d1a851c09bc1a,1591e9b20122e762ffebf2b0b97c5f6eb95790a8..4f2196a00953f2ce7b2c25893315f66aa721ad12
@@@ -162,6 -162,9 +162,9 @@@ static u16 cgrp_dfl_inhibit_ss_mask
  /* some controllers are implicitly enabled on the default hierarchy */
  static u16 cgrp_dfl_implicit_ss_mask;
  
+ /* some controllers can be threaded on the default hierarchy */
+ static u16 cgrp_dfl_threaded_ss_mask;
  /* The list of hierarchy roots */
  LIST_HEAD(cgroup_roots);
  static int cgroup_root_count;
@@@ -316,13 -319,87 +319,87 @@@ static void cgroup_idr_remove(struct id
        spin_unlock_bh(&cgroup_idr_lock);
  }
  
- static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+ static bool cgroup_has_tasks(struct cgroup *cgrp)
  {
-       struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+       return cgrp->nr_populated_csets;
+ }
  
-       if (parent_css)
-               return container_of(parent_css, struct cgroup, self);
-       return NULL;
+ bool cgroup_is_threaded(struct cgroup *cgrp)
+ {
+       return cgrp->dom_cgrp != cgrp;
+ }
+ /* can @cgrp host both domain and threaded children? */
+ static bool cgroup_is_mixable(struct cgroup *cgrp)
+ {
+       /*
+        * Root isn't under domain level resource control exempting it from
+        * the no-internal-process constraint, so it can serve as a thread
+        * root and a parent of resource domains at the same time.
+        */
+       return !cgroup_parent(cgrp);
+ }
+ /* can @cgrp become a thread root? should always be true for a thread root */
+ static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+ {
+       /* mixables don't care */
+       if (cgroup_is_mixable(cgrp))
+               return true;
+       /* domain roots can't be nested under threaded */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+       /* can only have either domain or threaded children */
+       if (cgrp->nr_populated_domain_children)
+               return false;
+       /* and no domain controllers can be enabled */
+       if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+               return false;
+       return true;
+ }
+ /* is @cgrp root of a threaded subtree? */
+ bool cgroup_is_thread_root(struct cgroup *cgrp)
+ {
+       /* thread root should be a domain */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+       /* a domain w/ threaded children is a thread root */
+       if (cgrp->nr_threaded_children)
+               return true;
+       /*
+        * A domain which has tasks and explicit threaded controllers
+        * enabled is a thread root.
+        */
+       if (cgroup_has_tasks(cgrp) &&
+           (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+               return true;
+       return false;
+ }
+ /* a domain which isn't connected to the root w/o breakage can't be used */
+ static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+ {
+       /* the cgroup itself can be a thread root */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+       /* but the ancestors can't be unless mixable */
+       while ((cgrp = cgroup_parent(cgrp))) {
+               if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+                       return false;
+               if (cgroup_is_threaded(cgrp))
+                       return false;
+       }
+       return true;
  }
  
  /* subsystems visibly enabled on a cgroup */
@@@ -331,8 -408,14 +408,14 @@@ static u16 cgroup_control(struct cgrou
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 root_ss_mask = cgrp->root->subsys_mask;
  
-       if (parent)
-               return parent->subtree_control;
+       if (parent) {
+               u16 ss_mask = parent->subtree_control;
+               /* threaded cgroups can only have threaded controllers */
+               if (cgroup_is_threaded(cgrp))
+                       ss_mask &= cgrp_dfl_threaded_ss_mask;
+               return ss_mask;
+       }
  
        if (cgroup_on_dfl(cgrp))
                root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@@ -345,8 -428,14 +428,14 @@@ static u16 cgroup_ss_mask(struct cgrou
  {
        struct cgroup *parent = cgroup_parent(cgrp);
  
-       if (parent)
-               return parent->subtree_ss_mask;
+       if (parent) {
+               u16 ss_mask = parent->subtree_ss_mask;
+               /* threaded cgroups can only have threaded controllers */
+               if (cgroup_is_threaded(cgrp))
+                       ss_mask &= cgrp_dfl_threaded_ss_mask;
+               return ss_mask;
+       }
  
        return cgrp->root->subsys_mask;
  }
@@@ -436,22 -525,12 +525,12 @@@ out_unlock
        return css;
  }
  
- static void __maybe_unused cgroup_get(struct cgroup *cgrp)
- {
-       css_get(&cgrp->self);
- }
  static void cgroup_get_live(struct cgroup *cgrp)
  {
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
        css_get(&cgrp->self);
  }
  
- static bool cgroup_tryget(struct cgroup *cgrp)
- {
-       return css_tryget(&cgrp->self);
- }
  struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
  {
        struct cgroup *cgrp = of->kn->parent->priv;
@@@ -560,9 -639,11 +639,11 @@@ EXPORT_SYMBOL_GPL(of_css)
   */
  struct css_set init_css_set = {
        .refcount               = REFCOUNT_INIT(1),
+       .dom_cset               = &init_css_set,
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
+       .threaded_csets         = LIST_HEAD_INIT(init_css_set.threaded_csets),
        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
  
  static int css_set_count      = 1;    /* 1 for init_css_set */
  
+ static bool css_set_threaded(struct css_set *cset)
+ {
+       return cset->dom_cset != cset;
+ }
  /**
   * css_set_populated - does a css_set contain any tasks?
   * @cset: target css_set
@@@ -587,39 -673,48 +673,48 @@@ static bool css_set_populated(struct cs
  }
  
  /**
-  * cgroup_update_populated - updated populated count of a cgroup
+  * cgroup_update_populated - update the populated count of a cgroup
   * @cgrp: the target cgroup
   * @populated: inc or dec populated count
   *
   * One of the css_sets associated with @cgrp is either getting its first
-  * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
-  * count is propagated towards root so that a given cgroup's populated_cnt
-  * is zero iff the cgroup and all its descendants don't contain any tasks.
+  * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
+  * count is propagated towards root so that a given cgroup's
+  * nr_populated_children is zero iff none of its descendants contain any
+  * tasks.
   *
-  * @cgrp's interface file "cgroup.populated" is zero if
-  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
-  * changes from or to zero, userland is notified that the content of the
-  * interface file has changed.  This can be used to detect when @cgrp and
-  * its descendants become populated or empty.
+  * @cgrp's interface file "cgroup.populated" is zero if both
+  * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+  * 1 otherwise.  When the sum changes from or to zero, userland is notified
+  * that the content of the interface file has changed.  This can be used to
+  * detect when @cgrp and its descendants become populated or empty.
   */
  static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
  {
+       struct cgroup *child = NULL;
+       int adj = populated ? 1 : -1;
        lockdep_assert_held(&css_set_lock);
  
        do {
-               bool trigger;
+               bool was_populated = cgroup_is_populated(cgrp);
  
-               if (populated)
-                       trigger = !cgrp->populated_cnt++;
-               else
-                       trigger = !--cgrp->populated_cnt;
+               if (!child) {
+                       cgrp->nr_populated_csets += adj;
+               } else {
+                       if (cgroup_is_threaded(child))
+                               cgrp->nr_populated_threaded_children += adj;
+                       else
+                               cgrp->nr_populated_domain_children += adj;
+               }
  
-               if (!trigger)
+               if (was_populated == cgroup_is_populated(cgrp))
                        break;
  
                cgroup1_check_for_release(cgrp);
                cgroup_file_notify(&cgrp->events_file);
  
+               child = cgrp;
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
  }
   * @populated: whether @cset is populated or depopulated
   *
   * @cset is either getting the first task or losing the last.  Update the
-  * ->populated_cnt of all associated cgroups accordingly.
+  * populated counters of all associated cgroups accordingly.
   */
  static void css_set_update_populated(struct css_set *cset, bool populated)
  {
   * css_set, @from_cset can be NULL.  If @task is being disassociated
   * instead of moved, @to_cset can be NULL.
   *
-  * This function automatically handles populated_cnt updates and
+  * This function automatically handles populated counter updates and
   * css_task_iter adjustments but the caller is responsible for managing
   * @from_cset and @to_cset's reference counts.
   */
@@@ -737,6 -832,8 +832,8 @@@ void put_css_set_locked(struct css_set 
        if (!refcount_dec_and_test(&cset->refcount))
                return;
  
+       WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
        /* This css_set is dead. unlink it and release cgroup and css refs */
        for_each_subsys(ss, ssid) {
                list_del(&cset->e_cset_node[ssid]);
                kfree(link);
        }
  
+       if (css_set_threaded(cset)) {
+               list_del(&cset->threaded_csets_node);
+               put_css_set_locked(cset->dom_cset);
+       }
        kfree_rcu(cset, rcu_head);
  }
  
@@@ -771,6 -873,7 +873,7 @@@ static bool compare_css_sets(struct css
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
  {
+       struct cgroup *new_dfl_cgrp;
        struct list_head *l1, *l2;
  
        /*
        if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
                return false;
  
+       /* @cset's domain should match the default cgroup's */
+       if (cgroup_on_dfl(new_cgrp))
+               new_dfl_cgrp = new_cgrp;
+       else
+               new_dfl_cgrp = old_cset->dfl_cgrp;
+       if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+               return false;
        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies.  As different cgroups may
@@@ -988,9 -1101,11 +1101,11 @@@ static struct css_set *find_css_set(str
        }
  
        refcount_set(&cset->refcount, 1);
+       cset->dom_cset = cset;
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
+       INIT_LIST_HEAD(&cset->threaded_csets);
        INIT_HLIST_NODE(&cset->hlist);
        INIT_LIST_HEAD(&cset->cgrp_links);
        INIT_LIST_HEAD(&cset->mg_preload_node);
  
        spin_unlock_irq(&css_set_lock);
  
+       /*
+        * If @cset should be threaded, look up the matching dom_cset and
+        * link them up.  We first fully initialize @cset then look for the
+        * dom_cset.  It's simpler this way and safe as @cset is guaranteed
+        * to stay empty until we return.
+        */
+       if (cgroup_is_threaded(cset->dfl_cgrp)) {
+               struct css_set *dcset;
+               dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+               if (!dcset) {
+                       put_css_set(cset);
+                       return NULL;
+               }
+               spin_lock_irq(&css_set_lock);
+               cset->dom_cset = dcset;
+               list_add_tail(&cset->threaded_csets_node,
+                             &dcset->threaded_csets);
+               spin_unlock_irq(&css_set_lock);
+       }
        return cset;
  }
  
@@@ -1155,6 -1292,8 +1292,8 @@@ static struct cgroup *cset_cgroup_from_
  
        if (cset == &init_css_set) {
                res = &root->cgrp;
+       } else if (root == &cgrp_dfl_root) {
+               res = cset->dfl_cgrp;
        } else {
                struct cgrp_cset_link *link;
  
@@@ -1670,6 -1809,9 +1809,9 @@@ static void init_cgroup_housekeeping(st
        mutex_init(&cgrp->pidlist_mutex);
        cgrp->self.cgroup = cgrp;
        cgrp->self.flags |= CSS_ONLINE;
+       cgrp->dom_cgrp = cgrp;
+       cgrp->max_descendants = INT_MAX;
+       cgrp->max_depth = INT_MAX;
  
        for_each_subsys(ss, ssid)
                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@@ -2006,8 -2148,6 +2148,8 @@@ static void cgroup_migrate_add_task(str
        if (!cset->mg_src_cgrp)
                return;
  
 +      mgctx->tset.nr_tasks++;
 +
        list_move_tail(&task->cg_list, &cset->mg_tasks);
        if (list_empty(&cset->mg_node))
                list_add_tail(&cset->mg_node,
@@@ -2096,19 -2236,21 +2238,19 @@@ static int cgroup_migrate_execute(struc
        struct css_set *cset, *tmp_cset;
        int ssid, failed_ssid, ret;
  
 -      /* methods shouldn't be called if no task is actually migrating */
 -      if (list_empty(&tset->src_csets))
 -              return 0;
 -
        /* check that we can legitimately attach to the cgroup */
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ss->can_attach) {
 -                      tset->ssid = ssid;
 -                      ret = ss->can_attach(tset);
 -                      if (ret) {
 -                              failed_ssid = ssid;
 -                              goto out_cancel_attach;
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ss->can_attach) {
 +                              tset->ssid = ssid;
 +                              ret = ss->can_attach(tset);
 +                              if (ret) {
 +                                      failed_ssid = ssid;
 +                                      goto out_cancel_attach;
 +                              }
                        }
 -              }
 -      } while_each_subsys_mask();
 +              } while_each_subsys_mask();
 +      }
  
        /*
         * Now that we're guaranteed success, proceed to move all tasks to
         */
        tset->csets = &tset->dst_csets;
  
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ss->attach) {
 -                      tset->ssid = ssid;
 -                      ss->attach(tset);
 -              }
 -      } while_each_subsys_mask();
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ss->attach) {
 +                              tset->ssid = ssid;
 +                              ss->attach(tset);
 +                      }
 +              } while_each_subsys_mask();
 +      }
  
        ret = 0;
        goto out_release_tset;
  
  out_cancel_attach:
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ssid == failed_ssid)
 -                      break;
 -              if (ss->cancel_attach) {
 -                      tset->ssid = ssid;
 -                      ss->cancel_attach(tset);
 -              }
 -      } while_each_subsys_mask();
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ssid == failed_ssid)
 +                              break;
 +                      if (ss->cancel_attach) {
 +                              tset->ssid = ssid;
 +                              ss->cancel_attach(tset);
 +                      }
 +              } while_each_subsys_mask();
 +      }
  out_release_tset:
        spin_lock_irq(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
  }
  
  /**
-  * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+  * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
   * @dst_cgrp: destination cgroup to test
   *
-  * On the default hierarchy, except for the root, subtree_control must be
-  * zero for migration destination cgroups with tasks so that child cgroups
-  * don't compete against tasks.
+  * On the default hierarchy, except for the mixable, (possible) thread root
+  * and threaded cgroups, subtree_control must be zero for migration
+  * destination cgroups with tasks so that child cgroups don't compete
+  * against tasks.
   */
- bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
  {
-       return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
-               !dst_cgrp->subtree_control;
+       /* v1 doesn't have any restriction */
+       if (!cgroup_on_dfl(dst_cgrp))
+               return 0;
+       /* verify @dst_cgrp can host resources */
+       if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+               return -EOPNOTSUPP;
+       /* mixables don't care */
+       if (cgroup_is_mixable(dst_cgrp))
+               return 0;
+       /*
+        * If @dst_cgrp is already or can become a thread root or is
+        * threaded, it doesn't matter.
+        */
+       if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+               return 0;
+       /* apply no-internal-process constraint */
+       if (dst_cgrp->subtree_control)
+               return -EBUSY;
+       return 0;
  }
  
  /**
@@@ -2387,8 -2548,9 +2552,9 @@@ int cgroup_attach_task(struct cgroup *d
        struct task_struct *task;
        int ret;
  
-       if (!cgroup_may_migrate_to(dst_cgrp))
-               return -EBUSY;
+       ret = cgroup_migrate_vet_dst(dst_cgrp);
+       if (ret)
+               return ret;
  
        /* look up all src csets */
        spin_lock_irq(&css_set_lock);
        return ret;
  }
  
- static int cgroup_procs_write_permission(struct task_struct *task,
-                                        struct cgroup *dst_cgrp,
-                                        struct kernfs_open_file *of)
- {
-       struct super_block *sb = of->file->f_path.dentry->d_sb;
-       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
-       struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
-       struct cgroup *src_cgrp, *com_cgrp;
-       struct inode *inode;
-       int ret;
-       if (!cgroup_on_dfl(dst_cgrp)) {
-               const struct cred *cred = current_cred();
-               const struct cred *tcred = get_task_cred(task);
-               /*
-                * even if we're attaching all tasks in the thread group,
-                * we only need to check permissions on one of them.
-                */
-               if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
-                   uid_eq(cred->euid, tcred->uid) ||
-                   uid_eq(cred->euid, tcred->suid))
-                       ret = 0;
-               else
-                       ret = -EACCES;
-               put_cred(tcred);
-               return ret;
-       }
-       /* find the source cgroup */
-       spin_lock_irq(&css_set_lock);
-       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-       spin_unlock_irq(&css_set_lock);
-       /* and the common ancestor */
-       com_cgrp = src_cgrp;
-       while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
-               com_cgrp = cgroup_parent(com_cgrp);
-       /* %current should be authorized to migrate to the common ancestor */
-       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
-       if (!inode)
-               return -ENOMEM;
-       ret = inode_permission(inode, MAY_WRITE);
-       iput(inode);
-       if (ret)
-               return ret;
-       /*
-        * If namespaces are delegation boundaries, %current must be able
-        * to see both source and destination cgroups from its namespace.
-        */
-       if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
-           (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
-            !cgroup_is_descendant(dst_cgrp, root_cgrp)))
-               return -ENOENT;
-       return 0;
- }
- /*
-  * Find the task_struct of the task to attach by vpid and pass it along to the
-  * function to attach either it or all tasks in its threadgroup. Will lock
-  * cgroup_mutex and threadgroup.
-  */
- ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
-                            size_t nbytes, loff_t off, bool threadgroup)
+ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+       __acquires(&cgroup_threadgroup_rwsem)
  {
        struct task_struct *tsk;
-       struct cgroup_subsys *ss;
-       struct cgroup *cgrp;
        pid_t pid;
-       int ssid, ret;
  
        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
-               return -EINVAL;
-       cgrp = cgroup_kn_lock_live(of->kn, false);
-       if (!cgrp)
-               return -ENODEV;
+               return ERR_PTR(-EINVAL);
  
        percpu_down_write(&cgroup_threadgroup_rwsem);
        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
-                       ret = -ESRCH;
-                       goto out_unlock_rcu;
+                       tsk = ERR_PTR(-ESRCH);
+                       goto out_unlock_threadgroup;
                }
        } else {
                tsk = current;
         * cgroup with no rt_runtime allocated.  Just say no.
         */
        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
-               ret = -EINVAL;
-               goto out_unlock_rcu;
+               tsk = ERR_PTR(-EINVAL);
+               goto out_unlock_threadgroup;
        }
  
        get_task_struct(tsk);
+       goto out_unlock_rcu;
+ out_unlock_threadgroup:
+       percpu_up_write(&cgroup_threadgroup_rwsem);
+ out_unlock_rcu:
        rcu_read_unlock();
+       return tsk;
+ }
  
-       ret = cgroup_procs_write_permission(tsk, cgrp, of);
-       if (!ret)
-               ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+ void cgroup_procs_write_finish(struct task_struct *task)
+       __releases(&cgroup_threadgroup_rwsem)
+ {
+       struct cgroup_subsys *ss;
+       int ssid;
  
-       put_task_struct(tsk);
-       goto out_unlock_threadgroup;
+       /* release reference from cgroup_procs_write_start() */
+       put_task_struct(task);
  
- out_unlock_rcu:
-       rcu_read_unlock();
- out_unlock_threadgroup:
        percpu_up_write(&cgroup_threadgroup_rwsem);
        for_each_subsys(ss, ssid)
                if (ss->post_attach)
                        ss->post_attach();
-       cgroup_kn_unlock(of->kn);
-       return ret ?: nbytes;
- }
- ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
-                          loff_t off)
- {
-       return __cgroup_procs_write(of, buf, nbytes, off, true);
  }
  
  static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@@ -2891,6 -2978,46 +2982,46 @@@ static void cgroup_finalize_control(str
        cgroup_apply_control_disable(cgrp);
  }
  
+ static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+ {
+       u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+       /* if nothing is getting enabled, nothing to worry about */
+       if (!enable)
+               return 0;
+       /* can @cgrp host any resources? */
+       if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+               return -EOPNOTSUPP;
+       /* mixables don't care */
+       if (cgroup_is_mixable(cgrp))
+               return 0;
+       if (domain_enable) {
+               /* can't enable domain controllers inside a thread subtree */
+               if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+                       return -EOPNOTSUPP;
+       } else {
+               /*
+                * Threaded controllers can handle internal competitions
+                * and are always allowed inside a (prospective) thread
+                * subtree.
+                */
+               if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+                       return 0;
+       }
+       /*
+        * Controllers can't be enabled for a cgroup with tasks to avoid
+        * child cgroups competing against tasks.
+        */
+       if (cgroup_has_tasks(cgrp))
+               return -EBUSY;
+       return 0;
+ }
  /* change the enabled child controllers for a cgroup in the default hierarchy */
  static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                goto out_unlock;
        }
  
-       /*
-        * Except for the root, subtree_control must be zero for a cgroup
-        * with tasks so that child cgroups don't compete against tasks.
-        */
-       if (enable && cgroup_parent(cgrp)) {
-               struct cgrp_cset_link *link;
-               /*
-                * Because namespaces pin csets too, @cgrp->cset_links
-                * might not be empty even when @cgrp is empty.  Walk and
-                * verify each cset.
-                */
-               spin_lock_irq(&css_set_lock);
-               ret = 0;
-               list_for_each_entry(link, &cgrp->cset_links, cset_link) {
-                       if (css_set_populated(link->cset)) {
-                               ret = -EBUSY;
-                               break;
-                       }
-               }
-               spin_unlock_irq(&css_set_lock);
-               if (ret)
-                       goto out_unlock;
-       }
+       ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+       if (ret)
+               goto out_unlock;
  
        /* save and update control masks and prepare csses */
        cgroup_save_control(cgrp);
        cgrp->subtree_control &= ~disable;
  
        ret = cgroup_apply_control(cgrp);
 -
        cgroup_finalize_control(cgrp, ret);
 +      if (ret)
 +              goto out_unlock;
  
        kernfs_activate(cgrp->kn);
 -      ret = 0;
  out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
  }
  
+ /**
+  * cgroup_enable_threaded - make @cgrp threaded
+  * @cgrp: the target cgroup
+  *
+  * Called when "threaded" is written to the cgroup.type interface file and
+  * tries to make @cgrp threaded and join the parent's resource domain.
+  * This function is never called on the root cgroup as cgroup.type doesn't
+  * exist on it.
+  */
+ static int cgroup_enable_threaded(struct cgroup *cgrp)
+ {
+       struct cgroup *parent = cgroup_parent(cgrp);
+       struct cgroup *dom_cgrp = parent->dom_cgrp;
+       int ret;
+       lockdep_assert_held(&cgroup_mutex);
+       /* noop if already threaded */
+       if (cgroup_is_threaded(cgrp))
+               return 0;
+       /* we're joining the parent's domain, ensure its validity */
+       if (!cgroup_is_valid_domain(dom_cgrp) ||
+           !cgroup_can_be_thread_root(dom_cgrp))
+               return -EOPNOTSUPP;
+       /*
+        * The following shouldn't cause actual migrations and should
+        * always succeed.
+        */
+       cgroup_save_control(cgrp);
+       cgrp->dom_cgrp = dom_cgrp;
+       ret = cgroup_apply_control(cgrp);
+       if (!ret)
+               parent->nr_threaded_children++;
+       else
+               cgrp->dom_cgrp = cgrp;
+       cgroup_finalize_control(cgrp, ret);
+       return ret;
+ }
+ static int cgroup_type_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       if (cgroup_is_threaded(cgrp))
+               seq_puts(seq, "threaded\n");
+       else if (!cgroup_is_valid_domain(cgrp))
+               seq_puts(seq, "domain invalid\n");
+       else if (cgroup_is_thread_root(cgrp))
+               seq_puts(seq, "domain threaded\n");
+       else
+               seq_puts(seq, "domain\n");
+       return 0;
+ }
+ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+                                size_t nbytes, loff_t off)
+ {
+       struct cgroup *cgrp;
+       int ret;
+       /* only switching to threaded mode is supported */
+       if (strcmp(strstrip(buf), "threaded"))
+               return -EINVAL;
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+       /* threaded can only be enabled */
+       ret = cgroup_enable_threaded(cgrp);
+       cgroup_kn_unlock(of->kn);
+       return ret ?: nbytes;
+ }
+ static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       int descendants = READ_ONCE(cgrp->max_descendants);
+       if (descendants == INT_MAX)
+               seq_puts(seq, "max\n");
+       else
+               seq_printf(seq, "%d\n", descendants);
+       return 0;
+ }
+ static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+                                          char *buf, size_t nbytes, loff_t off)
+ {
+       struct cgroup *cgrp;
+       int descendants;
+       ssize_t ret;
+       buf = strstrip(buf);
+       if (!strcmp(buf, "max")) {
+               descendants = INT_MAX;
+       } else {
+               ret = kstrtoint(buf, 0, &descendants);
+               if (ret)
+                       return ret;
+       }
+       if (descendants < 0)
+               return -ERANGE;
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+       cgrp->max_descendants = descendants;
+       cgroup_kn_unlock(of->kn);
+       return nbytes;
+ }
+ static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       int depth = READ_ONCE(cgrp->max_depth);
+       if (depth == INT_MAX)
+               seq_puts(seq, "max\n");
+       else
+               seq_printf(seq, "%d\n", depth);
+       return 0;
+ }
+ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes, loff_t off)
+ {
+       struct cgroup *cgrp;
+       ssize_t ret;
+       int depth;
+       buf = strstrip(buf);
+       if (!strcmp(buf, "max")) {
+               depth = INT_MAX;
+       } else {
+               ret = kstrtoint(buf, 0, &depth);
+               if (ret)
+                       return ret;
+       }
+       if (depth < 0)
+               return -ERANGE;
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+       cgrp->max_depth = depth;
+       cgroup_kn_unlock(of->kn);
+       return nbytes;
+ }
  static int cgroup_events_show(struct seq_file *seq, void *v)
  {
        seq_printf(seq, "populated %d\n",
        return 0;
  }
  
+ static int cgroup_stat_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgroup = seq_css(seq)->cgroup;
+       seq_printf(seq, "nr_descendants %d\n",
+                  cgroup->nr_descendants);
+       seq_printf(seq, "nr_dying_descendants %d\n",
+                  cgroup->nr_dying_descendants);
+       return 0;
+ }
  static int cgroup_file_open(struct kernfs_open_file *of)
  {
        struct cftype *cft = of->kn->priv;
@@@ -3234,7 -3515,6 +3519,6 @@@ restart
  
  static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
  {
-       LIST_HEAD(pending);
        struct cgroup_subsys *ss = cfts[0].ss;
        struct cgroup *root = &ss->root->cgrp;
        struct cgroup_subsys_state *css;
@@@ -3659,6 -3939,58 +3943,58 @@@ bool css_has_online_children(struct cgr
        return ret;
  }
  
+ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+ {
+       struct list_head *l;
+       struct cgrp_cset_link *link;
+       struct css_set *cset;
+       lockdep_assert_held(&css_set_lock);
+       /* find the next threaded cset */
+       if (it->tcset_pos) {
+               l = it->tcset_pos->next;
+               if (l != it->tcset_head) {
+                       it->tcset_pos = l;
+                       return container_of(l, struct css_set,
+                                           threaded_csets_node);
+               }
+               it->tcset_pos = NULL;
+       }
+       /* find the next cset */
+       l = it->cset_pos;
+       l = l->next;
+       if (l == it->cset_head) {
+               it->cset_pos = NULL;
+               return NULL;
+       }
+       if (it->ss) {
+               cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+       } else {
+               link = list_entry(l, struct cgrp_cset_link, cset_link);
+               cset = link->cset;
+       }
+       it->cset_pos = l;
+       /* initialize threaded css_set walking */
+       if (it->flags & CSS_TASK_ITER_THREADED) {
+               if (it->cur_dcset)
+                       put_css_set_locked(it->cur_dcset);
+               it->cur_dcset = cset;
+               get_css_set(cset);
+               it->tcset_head = &cset->threaded_csets;
+               it->tcset_pos = &cset->threaded_csets;
+       }
+       return cset;
+ }
  /**
   * css_task_iter_advance_css_set - advance a task itererator to the next css_set
   * @it: the iterator to advance
   */
  static void css_task_iter_advance_css_set(struct css_task_iter *it)
  {
-       struct list_head *l = it->cset_pos;
-       struct cgrp_cset_link *link;
        struct css_set *cset;
  
        lockdep_assert_held(&css_set_lock);
  
        /* Advance to the next non-empty css_set */
        do {
-               l = l->next;
-               if (l == it->cset_head) {
-                       it->cset_pos = NULL;
+               cset = css_task_iter_next_css_set(it);
+               if (!cset) {
                        it->task_pos = NULL;
                        return;
                }
-               if (it->ss) {
-                       cset = container_of(l, struct css_set,
-                                           e_cset_node[it->ss->id]);
-               } else {
-                       link = list_entry(l, struct cgrp_cset_link, cset_link);
-                       cset = link->cset;
-               }
        } while (!css_set_populated(cset));
  
-       it->cset_pos = l;
        if (!list_empty(&cset->tasks))
                it->task_pos = cset->tasks.next;
        else
@@@ -3732,6 -4051,7 +4055,7 @@@ static void css_task_iter_advance(struc
        lockdep_assert_held(&css_set_lock);
        WARN_ON_ONCE(!l);
  
+ repeat:
        /*
         * Advance iterator to find next entry.  cset->tasks is consumed
         * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
                css_task_iter_advance_css_set(it);
        else
                it->task_pos = l;
+       /* if PROCS, skip over tasks which aren't group leaders */
+       if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+           !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+                                           cg_list)))
+               goto repeat;
  }
  
  /**
   * css_task_iter_start - initiate task iteration
   * @css: the css to walk tasks of
+  * @flags: CSS_TASK_ITER_* flags
   * @it: the task iterator to use
   *
   * Initiate iteration through the tasks of @css.  The caller can call
   * returns NULL.  On completion of iteration, css_task_iter_end() must be
   * called.
   */
- void css_task_iter_start(struct cgroup_subsys_state *css,
+ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it)
  {
        /* no one should try to iterate before mounting cgroups */
        spin_lock_irq(&css_set_lock);
  
        it->ss = css->ss;
+       it->flags = flags;
  
        if (it->ss)
                it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@@ -3826,6 -4154,9 +4158,9 @@@ void css_task_iter_end(struct css_task_
                spin_unlock_irq(&css_set_lock);
        }
  
+       if (it->cur_dcset)
+               put_css_set(it->cur_dcset);
        if (it->cur_task)
                put_task_struct(it->cur_task);
  }
@@@ -3842,16 -4173,12 +4177,12 @@@ static void *cgroup_procs_next(struct s
  {
        struct kernfs_open_file *of = s->private;
        struct css_task_iter *it = of->priv;
-       struct task_struct *task;
  
-       do {
-               task = css_task_iter_next(it);
-       } while (task && !thread_group_leader(task));
-       return task;
+       return css_task_iter_next(it);
  }
  
- static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+                                 unsigned int iter_flags)
  {
        struct kernfs_open_file *of = s->private;
        struct cgroup *cgrp = seq_css(s)->cgroup;
                if (!it)
                        return ERR_PTR(-ENOMEM);
                of->priv = it;
-               css_task_iter_start(&cgrp->self, it);
+               css_task_iter_start(&cgrp->self, iter_flags, it);
        } else if (!(*pos)++) {
                css_task_iter_end(it);
-               css_task_iter_start(&cgrp->self, it);
+               css_task_iter_start(&cgrp->self, iter_flags, it);
        }
  
        return cgroup_procs_next(s, NULL, NULL);
  }
  
+ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+ {
+       struct cgroup *cgrp = seq_css(s)->cgroup;
+       /*
+        * All processes of a threaded subtree belong to the domain cgroup
+        * of the subtree.  Only threads can be distributed across the
+        * subtree.  Reject reads on cgroup.procs in the subtree proper.
+        * They're always empty anyway.
+        */
+       if (cgroup_is_threaded(cgrp))
+               return ERR_PTR(-EOPNOTSUPP);
+       return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+                                           CSS_TASK_ITER_THREADED);
+ }
  static int cgroup_procs_show(struct seq_file *s, void *v)
  {
-       seq_printf(s, "%d\n", task_tgid_vnr(v));
+       seq_printf(s, "%d\n", task_pid_vnr(v));
        return 0;
  }
  
+ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+                                        struct cgroup *dst_cgrp,
+                                        struct super_block *sb)
+ {
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+       struct cgroup *com_cgrp = src_cgrp;
+       struct inode *inode;
+       int ret;
+       lockdep_assert_held(&cgroup_mutex);
+       /* find the common ancestor */
+       while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+               com_cgrp = cgroup_parent(com_cgrp);
+       /* %current should be authorized to migrate to the common ancestor */
+       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+       if (!inode)
+               return -ENOMEM;
+       ret = inode_permission(inode, MAY_WRITE);
+       iput(inode);
+       if (ret)
+               return ret;
+       /*
+        * If namespaces are delegation boundaries, %current must be able
+        * to see both source and destination cgroups from its namespace.
+        */
+       if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+           (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+            !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+               return -ENOENT;
+       return 0;
+ }
+ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+                                 char *buf, size_t nbytes, loff_t off)
+ {
+       struct cgroup *src_cgrp, *dst_cgrp;
+       struct task_struct *task;
+       ssize_t ret;
+       dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!dst_cgrp)
+               return -ENODEV;
+       task = cgroup_procs_write_start(buf, true);
+       ret = PTR_ERR_OR_ZERO(task);
+       if (ret)
+               goto out_unlock;
+       /* find the source cgroup */
+       spin_lock_irq(&css_set_lock);
+       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+       spin_unlock_irq(&css_set_lock);
+       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+                                           of->file->f_path.dentry->d_sb);
+       if (ret)
+               goto out_finish;
+       ret = cgroup_attach_task(dst_cgrp, task, true);
+ out_finish:
+       cgroup_procs_write_finish(task);
+ out_unlock:
+       cgroup_kn_unlock(of->kn);
+       return ret ?: nbytes;
+ }
+ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+ {
+       return __cgroup_procs_start(s, pos, 0);
+ }
+ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes, loff_t off)
+ {
+       struct cgroup *src_cgrp, *dst_cgrp;
+       struct task_struct *task;
+       ssize_t ret;
+       buf = strstrip(buf);
+       dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!dst_cgrp)
+               return -ENODEV;
+       task = cgroup_procs_write_start(buf, false);
+       ret = PTR_ERR_OR_ZERO(task);
+       if (ret)
+               goto out_unlock;
+       /* find the source cgroup */
+       spin_lock_irq(&css_set_lock);
+       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+       spin_unlock_irq(&css_set_lock);
+       /* thread migrations follow the cgroup.procs delegation rule */
+       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+                                           of->file->f_path.dentry->d_sb);
+       if (ret)
+               goto out_finish;
+       /* and must be contained in the same domain */
+       ret = -EOPNOTSUPP;
+       if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+               goto out_finish;
+       ret = cgroup_attach_task(dst_cgrp, task, false);
+ out_finish:
+       cgroup_procs_write_finish(task);
+ out_unlock:
+       cgroup_kn_unlock(of->kn);
+       return ret ?: nbytes;
+ }
  /* cgroup core interface files for the default hierarchy */
  static struct cftype cgroup_base_files[] = {
+       {
+               .name = "cgroup.type",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cgroup_type_show,
+               .write = cgroup_type_write,
+       },
        {
                .name = "cgroup.procs",
                .flags = CFTYPE_NS_DELEGATABLE,
                .seq_show = cgroup_procs_show,
                .write = cgroup_procs_write,
        },
+       {
+               .name = "cgroup.threads",
+               .release = cgroup_procs_release,
+               .seq_start = cgroup_threads_start,
+               .seq_next = cgroup_procs_next,
+               .seq_show = cgroup_procs_show,
+               .write = cgroup_threads_write,
+       },
        {
                .name = "cgroup.controllers",
                .seq_show = cgroup_controllers_show,
                .file_offset = offsetof(struct cgroup, events_file),
                .seq_show = cgroup_events_show,
        },
+       {
+               .name = "cgroup.max.descendants",
+               .seq_show = cgroup_max_descendants_show,
+               .write = cgroup_max_descendants_write,
+       },
+       {
+               .name = "cgroup.max.depth",
+               .seq_show = cgroup_max_depth_show,
+               .write = cgroup_max_depth_write,
+       },
+       {
+               .name = "cgroup.stat",
+               .seq_show = cgroup_stat_show,
+       },
        { }     /* terminate */
  };
  
@@@ -4011,9 -4505,15 +4509,15 @@@ static void css_release_work_fn(struct 
                if (ss->css_released)
                        ss->css_released(css);
        } else {
+               struct cgroup *tcgrp;
                /* cgroup release path */
                trace_cgroup_release(cgrp);
  
+               for (tcgrp = cgroup_parent(cgrp); tcgrp;
+                    tcgrp = cgroup_parent(tcgrp))
+                       tcgrp->nr_dying_descendants--;
                cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
                cgrp->id = -1;
  
@@@ -4100,6 -4600,9 +4604,6 @@@ static void offline_css(struct cgroup_s
        if (!(css->flags & CSS_ONLINE))
                return;
  
 -      if (ss->css_reset)
 -              ss->css_reset(css);
 -
        if (ss->css_offline)
                ss->css_offline(css);
  
@@@ -4209,9 -4712,13 +4713,13 @@@ static struct cgroup *cgroup_create(str
        cgrp->root = root;
        cgrp->level = level;
  
-       for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+       for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
  
+               if (tcgrp != cgrp)
+                       tcgrp->nr_descendants++;
+       }
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  
@@@ -4252,6 -4759,29 +4760,29 @@@ out_free_cgrp
        return ERR_PTR(ret);
  }
  
+ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+ {
+       struct cgroup *cgroup;
+       int ret = false;
+       int level = 1;
+       lockdep_assert_held(&cgroup_mutex);
+       for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+               if (cgroup->nr_descendants >= cgroup->max_descendants)
+                       goto fail;
+               if (level > cgroup->max_depth)
+                       goto fail;
+               level++;
+       }
+       ret = true;
+ fail:
+       return ret;
+ }
  int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
  {
        struct cgroup *parent, *cgrp;
        if (!parent)
                return -ENODEV;
  
+       if (!cgroup_check_hierarchy_limits(parent)) {
+               ret = -EAGAIN;
+               goto out_unlock;
+       }
        cgrp = cgroup_create(parent);
        if (IS_ERR(cgrp)) {
                ret = PTR_ERR(cgrp);
@@@ -4417,6 -4952,7 +4953,7 @@@ static void kill_css(struct cgroup_subs
  static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
+       struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *css;
        struct cgrp_cset_link *link;
        int ssid;
         */
        kernfs_remove(cgrp->kn);
  
-       cgroup1_check_for_release(cgroup_parent(cgrp));
+       if (parent && cgroup_is_threaded(cgrp))
+               parent->nr_threaded_children--;
+       for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+               tcgrp->nr_descendants--;
+               tcgrp->nr_dying_descendants++;
+       }
+       cgroup1_check_for_release(parent);
  
        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);
@@@ -4656,11 -5200,17 +5201,17 @@@ int __init cgroup_init(void
  
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
  
+               /* implicit controllers must be threaded too */
+               WARN_ON(ss->implicit_on_dfl && !ss->threaded);
                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
  
+               if (ss->threaded)
+                       cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
  
                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);
 +
 +              mutex_lock(&cgroup_mutex);
 +              css_populate_dir(init_css_set.subsys[ssid]);
 +              mutex_unlock(&cgroup_mutex);
        }
  
        /* init_css_set.subsys[] has been updated, re-hash */
diff --combined kernel/cgroup/cpuset.c
index e7485786db9b3e8e881d045028d288d3b0cb71b1,f3539a41c49df3224047148b73bff8dae263f14e..67230ecf2ce155a36a045b0bb1c080224521a63a
@@@ -56,7 -56,6 +56,7 @@@
  #include <linux/time64.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
 +#include <linux/oom.h>
  
  #include <linux/uaccess.h>
  #include <linux/atomic.h>
@@@ -64,7 -63,6 +64,7 @@@
  #include <linux/cgroup.h>
  #include <linux/wait.h>
  
 +DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
  DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
  
  /* See "Frequency meter" comments, below. */
@@@ -300,6 -298,16 +300,16 @@@ static DECLARE_WORK(cpuset_hotplug_work
  
  static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
  
+ /*
+  * Cgroup v2 behavior is used when on default hierarchy or the
+  * cgroup_v2_mode flag is set.
+  */
+ static inline bool is_in_v2_mode(void)
+ {
+       return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+             (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+ }
  /*
   * This is ugly, but preserves the userspace API for existing cpuset
   * users. If someone tries to mount the "cpuset" filesystem, we
@@@ -490,8 -498,7 +500,7 @@@ static int validate_change(struct cpuse
  
        /* On legacy hiearchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
-       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-           !is_cpuset_subset(trial, par))
+       if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
                goto out;
  
        /*
@@@ -578,13 -585,6 +587,13 @@@ static void update_domain_attr_tree(str
        rcu_read_unlock();
  }
  
 +/* Must be called with cpuset_mutex held.  */
 +static inline int nr_cpusets(void)
 +{
 +      /* jump label reference count + the top-level cpuset */
 +      return static_key_count(&cpusets_enabled_key.key) + 1;
 +}
 +
  /*
   * generate_sched_domains()
   *
@@@ -870,7 -870,7 +879,7 @@@ static void update_tasks_cpumask(struc
        struct css_task_iter it;
        struct task_struct *task;
  
-       css_task_iter_start(&cs->css, &it);
+       css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                set_cpus_allowed_ptr(task, cs->effective_cpus);
        css_task_iter_end(&it);
@@@ -904,8 -904,7 +913,7 @@@ static void update_cpumasks_hier(struc
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs.
                 */
-               if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-                   cpumask_empty(new_cpus))
+               if (is_in_v2_mode() && cpumask_empty(new_cpus))
                        cpumask_copy(new_cpus, parent->effective_cpus);
  
                /* Skip the whole subtree if the cpumask remains the same. */
                cpumask_copy(cp->effective_cpus, new_cpus);
                spin_unlock_irq(&callback_lock);
  
-               WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+               WARN_ON(!is_in_v2_mode() &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
  
                update_tasks_cpumask(cp);
@@@ -1100,7 -1099,7 +1108,7 @@@ static void update_tasks_nodemask(struc
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
         */
-       css_task_iter_start(&cs->css, &it);
+       css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                struct mm_struct *mm;
                bool migrate;
@@@ -1158,8 -1157,7 +1166,7 @@@ static void update_nodemasks_hier(struc
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some MEMs.
                 */
-               if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-                   nodes_empty(*new_mems))
+               if (is_in_v2_mode() && nodes_empty(*new_mems))
                        *new_mems = parent->effective_mems;
  
                /* Skip the whole subtree if the nodemask remains the same. */
                cp->effective_mems = *new_mems;
                spin_unlock_irq(&callback_lock);
  
-               WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+               WARN_ON(!is_in_v2_mode() &&
                        !nodes_equal(cp->mems_allowed, cp->effective_mems));
  
                update_tasks_nodemask(cp);
@@@ -1293,7 -1291,7 +1300,7 @@@ static void update_tasks_flags(struct c
        struct css_task_iter it;
        struct task_struct *task;
  
-       css_task_iter_start(&cs->css, &it);
+       css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                cpuset_update_task_spread_flag(cs, task);
        css_task_iter_end(&it);
@@@ -1468,7 -1466,7 +1475,7 @@@ static int cpuset_can_attach(struct cgr
  
        /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
-       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+       if (!is_in_v2_mode() &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;
  
@@@ -1900,7 -1898,6 +1907,7 @@@ static struct cftype files[] = 
        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
 +              .private = FILE_MEMORY_PRESSURE,
        },
  
        {
@@@ -1987,7 -1984,7 +1994,7 @@@ static int cpuset_css_online(struct cgr
        cpuset_inc();
  
        spin_lock_irq(&callback_lock);
-       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+       if (is_in_v2_mode()) {
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
        }
@@@ -2064,7 -2061,7 +2071,7 @@@ static void cpuset_bind(struct cgroup_s
        mutex_lock(&cpuset_mutex);
        spin_lock_irq(&callback_lock);
  
-       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+       if (is_in_v2_mode()) {
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                top_cpuset.mems_allowed = node_possible_map;
        } else {
@@@ -2258,7 -2255,7 +2265,7 @@@ retry
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);
  
-       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+       if (is_in_v2_mode())
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
@@@ -2289,7 -2286,7 +2296,7 @@@ static void cpuset_hotplug_workfn(struc
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
-       bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+       bool on_dfl = is_in_v2_mode();
  
        mutex_lock(&cpuset_mutex);
  
@@@ -2352,7 -2349,13 +2359,7 @@@ void cpuset_update_active_cpus(void
         * We're inside cpu hotplug critical region which usually nests
         * inside cgroup synchronization.  Bounce actual hotplug processing
         * to a work item to avoid reverse locking order.
 -       *
 -       * We still need to do partition_sched_domains() synchronously;
 -       * otherwise, the scheduler will get confused and put tasks to the
 -       * dead CPU.  Fall back to the default single domain.
 -       * cpuset_hotplug_workfn() will rebuild it as necessary.
         */
 -      partition_sched_domains(1, NULL, NULL);
        schedule_work(&cpuset_hotplug_work);
  }
  
@@@ -2501,12 -2504,12 +2508,12 @@@ static struct cpuset *nearest_hardwall_
   * If we're in interrupt, yes, we can always allocate.  If @node is set in
   * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
   * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 - * yes.  If current has access to memory reserves due to TIF_MEMDIE, yes.
 + * yes.  If current has access to memory reserves as an oom victim, yes.
   * Otherwise, no.
   *
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
   * and do not allow allocations outside the current task's cpuset
 - * unless the task has been OOM killed as is marked TIF_MEMDIE.
 + * unless the task has been OOM killed.
   * GFP_KERNEL allocations are not so marked, so can escape to the
   * nearest enclosing hardwalled ancestor cpuset.
   *
   * affect that:
   *    in_interrupt - any node ok (current task context irrelevant)
   *    GFP_ATOMIC   - any node ok
 - *    TIF_MEMDIE   - any node ok
 + *    tsk_is_oom_victim   - any node ok
   *    GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
   *    GFP_USER     - only nodes in the current task's mems_allowed ok.
   */
@@@ -2547,7 -2550,7 +2554,7 @@@ bool __cpuset_node_allowed(int node, gf
         * Allow tasks that have access to memory reserves because they have
         * been OOM killed to get memory anywhere.
         */
 -      if (unlikely(test_thread_flag(TIF_MEMDIE)))
 +      if (unlikely(tsk_is_oom_victim(current)))
                return true;
        if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
                return false;
diff --combined kernel/events/core.c
index fb415e3d824bdeb966762268a18b7c1e39354e59,ec78247da3100fb9d0590af1574d679292a145c5..3e691b75b2db2eab410208b7312687270e1fe765
@@@ -1249,31 -1249,26 +1249,31 @@@ unclone_ctx(struct perf_event_context *
        return parent_ctx;
  }
  
 -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 +static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
 +                              enum pid_type type)
  {
 +      u32 nr;
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;
  
 -      return task_tgid_nr_ns(p, event->ns);
 +      nr = __task_pid_nr_ns(p, type, event->ns);
 +      /* avoid returning -1 if it is the idle thread or runs in another ns */
 +      if (!nr && !pid_alive(p))
 +              nr = -1;
 +      return nr;
  }
  
 -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
  {
 -      /*
 -       * only top level events have the pid namespace they were created in
 -       */
 -      if (event->parent)
 -              event = event->parent;
 +      return perf_event_pid_type(event, p, __PIDTYPE_TGID);
 +}
  
 -      return task_pid_nr_ns(p, event->ns);
 +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 +{
 +      return perf_event_pid_type(event, p, PIDTYPE_PID);
  }
  
  /*
@@@ -1457,13 -1452,6 +1457,13 @@@ static enum event_type_t get_event_type
  
        lockdep_assert_held(&ctx->lock);
  
 +      /*
 +       * It's 'group type', really, because if our group leader is
 +       * pinned, so are we.
 +       */
 +      if (event->group_leader != event)
 +              event = event->group_leader;
 +
        event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
        if (!ctx->task)
                event_type |= EVENT_CPU;
@@@ -1575,9 -1563,6 +1575,9 @@@ static void __perf_event_header_size(st
        if (sample_type & PERF_SAMPLE_TRANSACTION)
                size += sizeof(data->txn);
  
 +      if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 +              size += sizeof(data->phys_addr);
 +
        event->header_size = size;
  }
  
@@@ -2225,33 -2210,6 +2225,33 @@@ static int group_can_go_on(struct perf_
        return can_add_hw;
  }
  
 +/*
 + * Complement to update_event_times(). This computes the tstamp_* values to
 + * continue 'enabled' state from @now, and effectively discards the time
 + * between the prior tstamp_stopped and now (as we were in the OFF state, or
 + * just switched (context) time base).
 + *
 + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
 + * cannot have been scheduled in yet. And going into INACTIVE state means
 + * '@event->tstamp_stopped = @now'.
 + *
 + * Thus given the rules of update_event_times():
 + *
 + *   total_time_enabled = tstamp_stopped - tstamp_enabled
 + *   total_time_running = tstamp_stopped - tstamp_running
 + *
 + * We can insert 'tstamp_stopped == now' and reverse them to compute new
 + * tstamp_* values.
 + */
 +static void __perf_event_enable_time(struct perf_event *event, u64 now)
 +{
 +      WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
 +
 +      event->tstamp_stopped = now;
 +      event->tstamp_enabled = now - event->total_time_enabled;
 +      event->tstamp_running = now - event->total_time_running;
 +}
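The tstamp reconstruction in __perf_event_enable_time() can be sanity-checked with ordinary arithmetic: plugging tstamp_stopped == now into the update_event_times() rules and solving for the other two timestamps must reproduce the previously accumulated totals. A small stand-alone check (plain integers, no perf internals; the example numbers are arbitrary):

    /*
     * Check of the reconstruction above: with tstamp_stopped = now,
     *   total_time_enabled = tstamp_stopped - tstamp_enabled
     *   total_time_running = tstamp_stopped - tstamp_running
     * must still give back the totals accumulated before the event
     * was turned off.
     */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t total_enabled = 40, total_running = 25, now = 1000;

            uint64_t tstamp_stopped = now;
            uint64_t tstamp_enabled = now - total_enabled;
            uint64_t tstamp_running = now - total_running;

            assert(tstamp_stopped - tstamp_enabled == total_enabled);
            assert(tstamp_stopped - tstamp_running == total_running);
            return 0;
    }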
 +
  static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
  {
  
        list_add_event(event, ctx);
        perf_group_attach(event);
 -      event->tstamp_enabled = tstamp;
 -      event->tstamp_running = tstamp;
 -      event->tstamp_stopped = tstamp;
 +      /*
 +       * We can be called with event->state == STATE_OFF when we create with
 +       * .disabled = 1. In that case the IOC_ENABLE will call this function.
 +       */
 +      if (event->state == PERF_EVENT_STATE_INACTIVE)
 +              __perf_event_enable_time(event, tstamp);
  }
  
  static void ctx_sched_out(struct perf_event_context *ctx,
@@@ -2509,11 -2464,10 +2509,11 @@@ static void __perf_event_mark_enabled(s
        u64 tstamp = perf_event_time(event);
  
        event->state = PERF_EVENT_STATE_INACTIVE;
 -      event->tstamp_enabled = tstamp - event->total_time_enabled;
 +      __perf_event_enable_time(event, tstamp);
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +              /* XXX should not be > INACTIVE if event isn't */
                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 -                      sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 +                      __perf_event_enable_time(sub, tstamp);
        }
  }
  
@@@ -3219,13 -3173,6 +3219,13 @@@ static void perf_event_context_sched_in
                return;
  
        perf_ctx_lock(cpuctx, ctx);
 +      /*
 +       * We must check ctx->nr_events while holding ctx->lock, such
 +       * that we serialize against perf_install_in_context().
 +       */
 +      if (!ctx->nr_events)
 +              goto unlock;
 +
        perf_pmu_disable(ctx->pmu);
        /*
         * We want to keep the following priority order:
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
 +
 +unlock:
        perf_ctx_unlock(cpuctx, ctx);
  }
  
@@@ -3673,7 -3618,10 +3673,7 @@@ unlock
  
  static inline u64 perf_event_count(struct perf_event *event)
  {
 -      if (event->pmu->count)
 -              return event->pmu->count(event);
 -
 -      return __perf_event_count(event);
 +      return local64_read(&event->count) + atomic64_read(&event->child_count);
  }
  
  /*
@@@ -3704,6 -3652,15 +3704,6 @@@ int perf_event_read_local(struct perf_e
                goto out;
        }
  
 -      /*
 -       * It must not have a pmu::count method, those are not
 -       * NMI safe.
 -       */
 -      if (event->pmu->count) {
 -              ret = -EOPNOTSUPP;
 -              goto out;
 -      }
 -
        /* If this is a per-task event, it must be for current */
        if ((event->attach_state & PERF_ATTACH_TASK) &&
            event->hw.target != current) {
@@@ -4421,9 -4378,7 +4421,9 @@@ EXPORT_SYMBOL_GPL(perf_event_read_value
  static int __perf_read_group_add(struct perf_event *leader,
                                        u64 read_format, u64 *values)
  {
 +      struct perf_event_context *ctx = leader->ctx;
        struct perf_event *sub;
 +      unsigned long flags;
        int n = 1; /* skip @nr */
        int ret;
  
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 +      raw_spin_lock_irqsave(&ctx->lock, flags);
 +
        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
        }
  
 +      raw_spin_unlock_irqrestore(&ctx->lock, flags);
        return 0;
  }
  
@@@ -5126,7 -5078,7 +5126,7 @@@ static void perf_mmap_open(struct vm_ar
                atomic_inc(&event->rb->aux_mmap_count);
  
        if (event->pmu->event_mapped)
 -              event->pmu->event_mapped(event);
 +              event->pmu->event_mapped(event, vma->vm_mm);
  }
  
  static void perf_pmu_output_stop(struct perf_event *event);
@@@ -5149,7 -5101,7 +5149,7 @@@ static void perf_mmap_close(struct vm_a
        unsigned long size = perf_data_size(rb);
  
        if (event->pmu->event_unmapped)
 -              event->pmu->event_unmapped(event);
 +              event->pmu->event_unmapped(event, vma->vm_mm);
  
        /*
         * rb->aux_mmap_count will always drop before rb->mmap_count and
@@@ -5447,7 -5399,7 +5447,7 @@@ aux_unlock
        vma->vm_ops = &perf_mmap_vmops;
  
        if (event->pmu->event_mapped)
 -              event->pmu->event_mapped(event);
 +              event->pmu->event_mapped(event, vma->vm_mm);
  
        return ret;
  }
@@@ -6008,9 -5960,6 +6008,9 @@@ void perf_output_sample(struct perf_out
                }
        }
  
 +      if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 +              perf_output_put(handle, data->phys_addr);
 +
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;
  
        }
  }
  
 +static u64 perf_virt_to_phys(u64 virt)
 +{
 +      u64 phys_addr = 0;
 +      struct page *p = NULL;
 +
 +      if (!virt)
 +              return 0;
 +
 +      if (virt >= TASK_SIZE) {
 +              /* If it's vmalloc()d memory, leave phys_addr as 0 */
 +              if (virt_addr_valid((void *)(uintptr_t)virt) &&
 +                  !(virt >= VMALLOC_START && virt < VMALLOC_END))
 +                      phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
 +      } else {
 +              /*
 +               * Walk the page tables for a user address.
 +               * Interrupts are disabled, which prevents any teardown
 +               * of the page tables.
 +               * Try the IRQ-safe __get_user_pages_fast() first;
 +               * if that fails, leave phys_addr as 0.
 +               */
 +              if ((current->mm != NULL) &&
 +                  (__get_user_pages_fast(virt, 1, 0, &p) == 1))
 +                      phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
 +
 +              if (p)
 +                      put_page(p);
 +      }
 +
 +      return phys_addr;
 +}
 +
  void perf_prepare_sample(struct perf_event_header *header,
                         struct perf_sample_data *data,
                         struct perf_event *event,
  
                header->size += size;
        }
 +
 +      if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 +              data->phys_addr = perf_virt_to_phys(data->addr);
  }
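Together these hunks add the PERF_SAMPLE_PHYS_ADDR sample field: it is sized in __perf_event_header_size(), resolved here in perf_prepare_sample() via perf_virt_to_phys(), and written out in perf_output_sample(). A minimal sketch of a user-space consumer requesting it (error handling trimmed; the event choice is arbitrary, and a later hunk in perf_event_open() restricts the field to privileged users under perf_event_paranoid):

    /*
     * Sketch: open a sampling event whose records carry physical
     * addresses.  PERF_SAMPLE_PHYS_ADDR needs uapi headers from this
     * release or later.
     */
    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_PAGE_FAULTS;
            attr.sample_period = 1000;
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                               PERF_SAMPLE_PHYS_ADDR;

            /* pid = 0 (self), cpu = -1 (any), no group leader, no flags */
            int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            return fd < 0 ? 1 : 0;
    }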
  
  static void __always_inline
@@@ -7330,11 -7244,6 +7330,11 @@@ static void perf_log_throttle(struct pe
        perf_output_end(&handle);
  }
  
 +void perf_event_itrace_started(struct perf_event *event)
 +{
 +      event->attach_state |= PERF_ATTACH_ITRACE;
 +}
 +
  static void perf_log_itrace_start(struct perf_event *event)
  {
        struct perf_output_handle handle;
                event = event->parent;
  
        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
 -          event->hw.itrace_started)
 +          event->attach_state & PERF_ATTACH_ITRACE)
                return;
  
        rec.header.type = PERF_RECORD_ITRACE_START;
@@@ -7412,6 -7321,21 +7412,6 @@@ int perf_event_account_interrupt(struc
        return __perf_event_account_interrupt(event, 1);
  }
  
 -static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
 -{
 -      /*
 -       * Due to interrupt latency (AKA "skid"), we may enter the
 -       * kernel before taking an overflow, even if the PMU is only
 -       * counting user events.
 -       * To avoid leaking information to userspace, we must always
 -       * reject kernel samples when exclude_kernel is set.
 -       */
 -      if (event->attr.exclude_kernel && !user_mode(regs))
 -              return false;
 -
 -      return true;
 -}
 -
  /*
   * Generic event overflow handling, sampling.
   */
@@@ -7432,6 -7356,12 +7432,6 @@@ static int __perf_event_overflow(struc
  
        ret = __perf_event_account_interrupt(event, throttle);
  
 -      /*
 -       * For security, drop the skid kernel samples if necessary.
 -       */
 -      if (!sample_is_allowed(event, regs))
 -              return ret;
 -
        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
@@@ -7954,15 -7884,16 +7954,15 @@@ void perf_trace_run_bpf_submit(void *ra
                }
        }
        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
 -                    rctx, task);
 +                    rctx, task, NULL);
  }
  EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
  
  void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
 -                 struct task_struct *task)
 +                 struct task_struct *task, struct perf_event *event)
  {
        struct perf_sample_data data;
 -      struct perf_event *event;
  
        struct perf_raw_record raw = {
                .frag = {
  
        perf_trace_buf_update(record, event_type);
  
 -      hlist_for_each_entry_rcu(event, head, hlist_entry) {
 +      /* Use the given event instead of the hlist */
 +      if (event) {
                if (perf_tp_event_match(event, &data, regs))
                        perf_swevent_event(event, count, &data, regs);
 +      } else {
 +              hlist_for_each_entry_rcu(event, head, hlist_entry) {
 +                      if (perf_tp_event_match(event, &data, regs))
 +                              perf_swevent_event(event, count, &data, regs);
 +              }
        }
  
        /*
@@@ -8134,7 -8059,7 +8134,7 @@@ static void perf_event_free_bpf_handler
  
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  {
 -      bool is_kprobe, is_tracepoint;
 +      bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
  
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
  
        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 -      if (!is_kprobe && !is_tracepoint)
 +      is_syscall_tp = is_syscall_trace_event(event->tp_event);
 +      if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
                /* bpf programs can only be attached to u/kprobe or tracepoint */
                return -EINVAL;
  
                return PTR_ERR(prog);
  
        if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
 -          (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 +          (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
 +          (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
                /* valid fd, but invalid bpf program type */
                bpf_prog_put(prog);
                return -EINVAL;
        }
  
 -      if (is_tracepoint) {
 +      if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);
  
                if (prog->aux->max_ctx_offset > off) {
@@@ -9666,8 -9589,6 +9666,8 @@@ static int perf_copy_attr(struct perf_e
        if (ret)
                return -EFAULT;
  
 +      attr->size = size;
 +
        if (attr->__reserved_1)
                return -EINVAL;
  
@@@ -9940,11 -9861,6 +9940,11 @@@ SYSCALL_DEFINE5(perf_event_open
                        return -EINVAL;
        }
  
 +      /* Only privileged users can get physical addresses */
 +      if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
 +          perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
 +              return -EACCES;
 +
        if (!attr.sample_max_stack)
                attr.sample_max_stack = sysctl_perf_event_max_stack;
  
                        goto err_context;
  
                /*
 -               * Do not allow to attach to a group in a different
 -               * task or CPU context:
 +               * Make sure we're both events for the same CPU;
 +               * grouping events for different CPUs is broken; since
 +               * you can never concurrently schedule them anyhow.
                 */
 -              if (move_group) {
 -                      /*
 -                       * Make sure we're both on the same task, or both
 -                       * per-cpu events.
 -                       */
 -                      if (group_leader->ctx->task != ctx->task)
 -                              goto err_context;
 +              if (group_leader->cpu != event->cpu)
 +                      goto err_context;
  
 -                      /*
 -                       * Make sure we're both events for the same CPU;
 -                       * grouping events for different CPUs is broken; since
 -                       * you can never concurrently schedule them anyhow.
 -                       */
 -                      if (group_leader->cpu != event->cpu)
 -                              goto err_context;
 -              } else {
 -                      if (group_leader->ctx != ctx)
 -                              goto err_context;
 -              }
 +              /*
 +               * Make sure we're both on the same task, or both
 +               * per-CPU events.
 +               */
 +              if (group_leader->ctx->task != ctx->task)
 +                      goto err_context;
 +
 +              /*
 +               * Do not allow attaching to a group in a different task
 +               * or CPU context. If we're moving SW events, we'll fix
 +               * this up later, so allow that.
 +               */
 +              if (!move_group && group_leader->ctx != ctx)
 +                      goto err_context;
  
                /*
                 * Only a group leader can be exclusive or pinned
@@@ -11293,5 -11210,6 +11293,6 @@@ struct cgroup_subsys perf_event_cgrp_su
         * controller is not mounted on a legacy hierarchy.
         */
        .implicit_on_dfl = true,
+       .threaded       = true,
  };
  #endif /* CONFIG_CGROUP_PERF */
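Setting .threaded on perf_event_cgrp_subsys lets the controller remain enabled in cgroups that have been switched to the new thread mode, so per-thread events can still be scoped by cgroup there. A rough sketch of flipping a cgroup2 directory into a threaded domain from user space (the path is an example; the "threaded" keyword for cgroup.type comes from the thread-mode patches in this pull):

    /*
     * Sketch: mark an existing cgroup2 directory as threaded.
     * Assumes cgroup2 is mounted and the directory already exists.
     */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/fs/cgroup/mygrp/cgroup.type", "w");

            if (!f) {
                    perror("open cgroup.type");
                    return 1;
            }
            if (fputs("threaded", f) == EOF)
                    perror("write cgroup.type");
            return fclose(f) ? 1 : 0;
    }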
diff --combined mm/memcontrol.c
index ad15850ee157e3b309045a53ad9ba733745869e1,2b2f071f914b5143c45b0192bfefe7daaaa86539..6532b219b22239a268783d399a7ffe0385ee4ccf
@@@ -550,12 -550,10 +550,12 @@@ mem_cgroup_largest_soft_limit_node(stru
   * value, and reading all cpu value can be performance bottleneck in some
   * common workload, threshold and synchronization as vmstat[] should be
   * implemented.
 + *
 + * The parameter idx can be of type enum memcg_event_item or vm_event_item.
   */
  
  static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
 -                                    enum memcg_event_item event)
 +                                    int event)
  {
        unsigned long val = 0;
        int cpu;
@@@ -919,7 -917,7 +919,7 @@@ int mem_cgroup_scan_tasks(struct mem_cg
                struct css_task_iter it;
                struct task_struct *task;
  
-               css_task_iter_start(&iter->css, &it);
+               css_task_iter_start(&iter->css, 0, &it);
                while (!ret && (task = css_task_iter_next(&it)))
                        ret = fn(task, arg);
                css_task_iter_end(&it);
@@@ -1613,13 -1611,9 +1613,13 @@@ cleanup
   * @page: the page
   *
   * This function protects unlocked LRU pages from being moved to
 - * another cgroup and stabilizes their page->mem_cgroup binding.
 + * another cgroup.
 + *
 + * It ensures the lifetime of the returned memcg. The caller is responsible
 + * for the lifetime of the page; __unlock_page_memcg() is available
 + * when @page might get freed inside the locked section.
   */
 -void lock_page_memcg(struct page *page)
 +struct mem_cgroup *lock_page_memcg(struct page *page)
  {
        struct mem_cgroup *memcg;
        unsigned long flags;
         * The RCU lock is held throughout the transaction.  The fast
         * path can get away without acquiring the memcg->move_lock
         * because page moving starts with an RCU grace period.
 -       */
 +       *
 +       * The RCU lock also protects the memcg from being freed when
 +       * the page state that is going to change is the only thing
 +       * preventing the page itself from being freed. E.g. writeback
 +       * doesn't hold a page reference and relies on PG_writeback to
 +       * keep off truncation, migration and so forth.
 +       */
        rcu_read_lock();
  
        if (mem_cgroup_disabled())
 -              return;
 +              return NULL;
  again:
        memcg = page->mem_cgroup;
        if (unlikely(!memcg))
 -              return;
 +              return NULL;
  
        if (atomic_read(&memcg->moving_account) <= 0)
 -              return;
 +              return memcg;
  
        spin_lock_irqsave(&memcg->move_lock, flags);
        if (memcg != page->mem_cgroup) {
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
  
 -      return;
 +      return memcg;
  }
  EXPORT_SYMBOL(lock_page_memcg);
  
  /**
 - * unlock_page_memcg - unlock a page->mem_cgroup binding
 - * @page: the page
 + * __unlock_page_memcg - unlock and unpin a memcg
 + * @memcg: the memcg
 + *
 + * Unlock and unpin a memcg returned by lock_page_memcg().
   */
 -void unlock_page_memcg(struct page *page)
 +void __unlock_page_memcg(struct mem_cgroup *memcg)
  {
 -      struct mem_cgroup *memcg = page->mem_cgroup;
 -
        if (memcg && memcg->move_lock_task == current) {
                unsigned long flags = memcg->move_lock_flags;
  
  
        rcu_read_unlock();
  }
 +
 +/**
 + * unlock_page_memcg - unlock a page->mem_cgroup binding
 + * @page: the page
 + */
 +void unlock_page_memcg(struct page *page)
 +{
 +      __unlock_page_memcg(page->mem_cgroup);
 +}
  EXPORT_SYMBOL(unlock_page_memcg);
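The repurposed API above makes the memcg, not the page, the token for unlocking: lock_page_memcg() pins and returns it, and __unlock_page_memcg() takes it back, so a caller may legitimately drop its last reference to the page inside the critical section. A kernel-context fragment (not stand-alone) of the intended calling pattern:

    /*
     * Kernel-context fragment: the returned memcg is what gets handed
     * back to __unlock_page_memcg(), which stays safe even if @page
     * can be freed inside the locked section.
     */
    struct mem_cgroup *memcg;

    memcg = lock_page_memcg(page);
    /* ... update page state that memcg accounting depends on ... */
    __unlock_page_memcg(memcg);

Callers that do hold the page across the whole section can keep using the unlock_page_memcg(page) wrapper unchanged.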
  
  /*
@@@ -1917,7 -1896,7 +1917,7 @@@ retry
         * bypass the last charges so that they can exit quickly and
         * free their memory.
         */
 -      if (unlikely(test_thread_flag(TIF_MEMDIE) ||
 +      if (unlikely(tsk_is_oom_victim(current) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
                goto force;
@@@ -4321,8 -4300,6 +4321,8 @@@ static void mem_cgroup_css_offline(stru
        }
        spin_unlock(&memcg->event_list_lock);
  
 +      memcg->low = 0;
 +
        memcg_offline_kmem(memcg);
        wb_memcg_offline(memcg);
  
@@@ -4639,11 -4616,8 +4639,11 @@@ static enum mc_target_type get_mctgt_ty
                if (!ret || !target)
                        put_page(page);
        }
 -      /* There is a swap entry and a page doesn't exist or isn't charged */
 -      if (ent.val && !ret &&
 +      /*
 +       * There is a swap entry and a page doesn't exist or isn't charged.
 +       * But we cannot move a tail-page in a THP.
 +       */
 +      if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
                ret = MC_TARGET_SWAP;
                if (target)
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
 - * We don't consider swapping or file mapped pages because THP does not
 - * support them for now.
 + * We don't consider PMD mapped swapping or file mapped pages because THP does
 + * not support them for now.
   * Caller should make sure that pmd_trans_huge(pmd) is true.
   */
  static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@@ -5430,7 -5404,7 +5430,7 @@@ int mem_cgroup_try_charge(struct page *
                 * in turn serializes uncharging.
                 */
                VM_BUG_ON_PAGE(!PageLocked(page), page);
 -              if (page->mem_cgroup)
 +              if (compound_head(page)->mem_cgroup)
                        goto out;
  
                if (do_swap_account) {
@@@ -5913,7 -5887,6 +5913,7 @@@ static struct mem_cgroup *mem_cgroup_id
  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  {
        struct mem_cgroup *memcg, *swap_memcg;
 +      unsigned int nr_entries;
        unsigned short oldid;
  
        VM_BUG_ON_PAGE(PageLRU(page), page);
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
 -      oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
 +      nr_entries = hpage_nr_pages(page);
 +      /* Get references for the tail pages, too */
 +      if (nr_entries > 1)
 +              mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
 +      oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
 +                                 nr_entries);
        VM_BUG_ON_PAGE(oldid, page);
 -      mem_cgroup_swap_statistics(swap_memcg, 1);
 +      mem_cgroup_swap_statistics(swap_memcg, nr_entries);
  
        page->mem_cgroup = NULL;
  
        if (!mem_cgroup_is_root(memcg))
 -              page_counter_uncharge(&memcg->memory, 1);
 +              page_counter_uncharge(&memcg->memory, nr_entries);
  
        if (memcg != swap_memcg) {
                if (!mem_cgroup_is_root(swap_memcg))
 -                      page_counter_charge(&swap_memcg->memsw, 1);
 -              page_counter_uncharge(&memcg->memsw, 1);
 +                      page_counter_charge(&swap_memcg->memsw, nr_entries);
 +              page_counter_uncharge(&memcg->memsw, nr_entries);
        }
  
        /*
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
 -      mem_cgroup_charge_statistics(memcg, page, false, -1);
 +      mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
 +                                   -nr_entries);
        memcg_check_events(memcg, page);
  
        if (!mem_cgroup_is_root(memcg))