Merge branch 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author	Linus Torvalds <[email protected]>
	Thu, 7 Sep 2017 05:25:25 +0000 (22:25 -0700)
committer	Linus Torvalds <[email protected]>
	Thu, 7 Sep 2017 05:25:25 +0000 (22:25 -0700)
Pull cgroup updates from Tejun Heo:
 "Several notable changes this cycle:

   - Thread mode was merged. This will be used for cgroup2 support for
     CPU and possibly other controllers. Unfortunately, CPU controller
     cgroup2 support didn't make this pull request but most contentions
     have been resolved and the support is likely to be merged before
     the next merge window.

   - cgroup.stat now shows the number of descendant cgroups.

   - cpuset now can enable the easier-to-configure v2 behavior on v1
     hierarchy"

* 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits)
  cpuset: Allow v2 behavior in v1 cgroup
  cgroup: Add mount flag to enable cpuset to use v2 behavior in v1 cgroup
  cgroup: remove unneeded checks
  cgroup: misc changes
  cgroup: short-circuit cset_cgroup_from_root() on the default hierarchy
  cgroup: re-use the parent pointer in cgroup_destroy_locked()
  cgroup: add cgroup.stat interface with basic hierarchy stats
  cgroup: implement hierarchy limits
  cgroup: keep track of number of descent cgroups
  cgroup: add comment to cgroup_enable_threaded()
  cgroup: remove unnecessary empty check when enabling threaded mode
  cgroup: update debug controller to print out thread mode information
  cgroup: implement cgroup v2 thread support
  cgroup: implement CSS_TASK_ITER_THREADED
  cgroup: introduce cgroup->dom_cgrp and threaded css_set handling
  cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS
  cgroup: reorganize cgroup.procs / task write path
  cgroup: replace css_set walking populated test with testing cgrp->nr_populated_csets
  cgroup: distinguish local and children populated states
  cgroup: remove now unused list_head @pending in cgroup_apply_cftypes()
  ...

kernel/cgroup/cgroup-internal.h
kernel/cgroup/cgroup.c
kernel/cgroup/cpuset.c
kernel/events/core.c
mm/memcontrol.c

diff --combined kernel/cgroup/cgroup-internal.h
index 8b4c3c2f2509d2b49666410b011bf604926194f3,c167a40278e63b86f0e04f22167e3280751a28d0..5151ff256c2945ec3a6b274b4eae7c2221d81a98
@@@ -33,9 -33,6 +33,9 @@@ struct cgroup_taskset 
        struct list_head        src_csets;
        struct list_head        dst_csets;
  
 +      /* the number of tasks in the set */
 +      int                     nr_tasks;
 +
        /* the subsys currently being processed */
        int                     ssid;
  
@@@ -156,6 -153,8 +156,8 @@@ static inline void get_css_set(struct c
  
  bool cgroup_ssid_enabled(int ssid);
  bool cgroup_on_dfl(const struct cgroup *cgrp);
+ bool cgroup_is_thread_root(struct cgroup *cgrp);
+ bool cgroup_is_threaded(struct cgroup *cgrp);
  
  struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
  struct cgroup *task_cgroup_from_root(struct task_struct *task,
@@@ -173,7 -172,7 +175,7 @@@ struct dentry *cgroup_do_mount(struct f
                               struct cgroup_root *root, unsigned long magic,
                               struct cgroup_namespace *ns);
  
- bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
  void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
  void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
                            struct cgroup_mgctx *mgctx);
@@@ -183,10 -182,10 +185,10 @@@ int cgroup_migrate(struct task_struct *
  
  int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup);
- ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
-                            size_t nbytes, loff_t off, bool threadgroup);
- ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
-                          loff_t off);
+ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+       __acquires(&cgroup_threadgroup_rwsem);
+ void cgroup_procs_write_finish(struct task_struct *task)
+       __releases(&cgroup_threadgroup_rwsem);
  
  void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
  
diff --combined kernel/cgroup/cgroup.c
index f64fc967a9efd94f65e7fcf73c0d1a851c09bc1a,1591e9b20122e762ffebf2b0b97c5f6eb95790a8..4f2196a00953f2ce7b2c25893315f66aa721ad12
@@@ -162,6 -162,9 +162,9 @@@ static u16 cgrp_dfl_inhibit_ss_mask
  /* some controllers are implicitly enabled on the default hierarchy */
  static u16 cgrp_dfl_implicit_ss_mask;
  
+ /* some controllers can be threaded on the default hierarchy */
+ static u16 cgrp_dfl_threaded_ss_mask;
  /* The list of hierarchy roots */
  LIST_HEAD(cgroup_roots);
  static int cgroup_root_count;
@@@ -316,13 -319,87 +319,87 @@@ static void cgroup_idr_remove(struct id
        spin_unlock_bh(&cgroup_idr_lock);
  }
  
- static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+ static bool cgroup_has_tasks(struct cgroup *cgrp)
  {
-       struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+       return cgrp->nr_populated_csets;
+ }
  
-       if (parent_css)
-               return container_of(parent_css, struct cgroup, self);
-       return NULL;
+ bool cgroup_is_threaded(struct cgroup *cgrp)
+ {
+       return cgrp->dom_cgrp != cgrp;
+ }
+ /* can @cgrp host both domain and threaded children? */
+ static bool cgroup_is_mixable(struct cgroup *cgrp)
+ {
+       /*
+        * Root isn't under domain level resource control exempting it from
+        * the no-internal-process constraint, so it can serve as a thread
+        * root and a parent of resource domains at the same time.
+        */
+       return !cgroup_parent(cgrp);
+ }
+ /* can @cgrp become a thread root? should always be true for a thread root */
+ static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+ {
+       /* mixables don't care */
+       if (cgroup_is_mixable(cgrp))
+               return true;
+       /* domain roots can't be nested under threaded */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+       /* can only have either domain or threaded children */
+       if (cgrp->nr_populated_domain_children)
+               return false;
+       /* and no domain controllers can be enabled */
+       if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+               return false;
+       return true;
+ }
+ /* is @cgrp root of a threaded subtree? */
+ bool cgroup_is_thread_root(struct cgroup *cgrp)
+ {
+       /* thread root should be a domain */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+       /* a domain w/ threaded children is a thread root */
+       if (cgrp->nr_threaded_children)
+               return true;
+       /*
+        * A domain which has tasks and explicit threaded controllers
+        * enabled is a thread root.
+        */
+       if (cgroup_has_tasks(cgrp) &&
+           (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+               return true;
+       return false;
+ }
+ /* a domain which isn't connected to the root w/o breakage can't be used */
+ static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+ {
+       /* the cgroup itself can be a thread root */
+       if (cgroup_is_threaded(cgrp))
+               return false;
+       /* but the ancestors can't be unless mixable */
+       while ((cgrp = cgroup_parent(cgrp))) {
+               if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+                       return false;
+               if (cgroup_is_threaded(cgrp))
+                       return false;
+       }
+       return true;
  }
  
  /* subsystems visibly enabled on a cgroup */
@@@ -331,8 -408,14 +408,14 @@@ static u16 cgroup_control(struct cgrou
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 root_ss_mask = cgrp->root->subsys_mask;
  
-       if (parent)
-               return parent->subtree_control;
+       if (parent) {
+               u16 ss_mask = parent->subtree_control;
+               /* threaded cgroups can only have threaded controllers */
+               if (cgroup_is_threaded(cgrp))
+                       ss_mask &= cgrp_dfl_threaded_ss_mask;
+               return ss_mask;
+       }
  
        if (cgroup_on_dfl(cgrp))
                root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@@ -345,8 -428,14 +428,14 @@@ static u16 cgroup_ss_mask(struct cgrou
  {
        struct cgroup *parent = cgroup_parent(cgrp);
  
-       if (parent)
-               return parent->subtree_ss_mask;
+       if (parent) {
+               u16 ss_mask = parent->subtree_ss_mask;
+               /* threaded cgroups can only have threaded controllers */
+               if (cgroup_is_threaded(cgrp))
+                       ss_mask &= cgrp_dfl_threaded_ss_mask;
+               return ss_mask;
+       }
  
        return cgrp->root->subsys_mask;
  }
@@@ -436,22 -525,12 +525,12 @@@ out_unlock
        return css;
  }
  
- static void __maybe_unused cgroup_get(struct cgroup *cgrp)
- {
-       css_get(&cgrp->self);
- }
  static void cgroup_get_live(struct cgroup *cgrp)
  {
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
        css_get(&cgrp->self);
  }
  
- static bool cgroup_tryget(struct cgroup *cgrp)
- {
-       return css_tryget(&cgrp->self);
- }
  struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
  {
        struct cgroup *cgrp = of->kn->parent->priv;
@@@ -560,9 -639,11 +639,11 @@@ EXPORT_SYMBOL_GPL(of_css)
   */
  struct css_set init_css_set = {
        .refcount               = REFCOUNT_INIT(1),
+       .dom_cset               = &init_css_set,
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
+       .threaded_csets         = LIST_HEAD_INIT(init_css_set.threaded_csets),
        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
  
  static int css_set_count      = 1;    /* 1 for init_css_set */
  
+ static bool css_set_threaded(struct css_set *cset)
+ {
+       return cset->dom_cset != cset;
+ }
  /**
   * css_set_populated - does a css_set contain any tasks?
   * @cset: target css_set
@@@ -587,39 -673,48 +673,48 @@@ static bool css_set_populated(struct cs
  }
  
  /**
-  * cgroup_update_populated - updated populated count of a cgroup
+  * cgroup_update_populated - update the populated count of a cgroup
   * @cgrp: the target cgroup
   * @populated: inc or dec populated count
   *
   * One of the css_sets associated with @cgrp is either getting its first
-  * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
-  * count is propagated towards root so that a given cgroup's populated_cnt
-  * is zero iff the cgroup and all its descendants don't contain any tasks.
+  * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
+  * count is propagated towards root so that a given cgroup's
+  * nr_populated_children is zero iff none of its descendants contain any
+  * tasks.
   *
-  * @cgrp's interface file "cgroup.populated" is zero if
-  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
-  * changes from or to zero, userland is notified that the content of the
-  * interface file has changed.  This can be used to detect when @cgrp and
-  * its descendants become populated or empty.
+  * @cgrp's interface file "cgroup.populated" is zero if both
+  * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+  * 1 otherwise.  When the sum changes from or to zero, userland is notified
+  * that the content of the interface file has changed.  This can be used to
+  * detect when @cgrp and its descendants become populated or empty.
   */
  static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
  {
+       struct cgroup *child = NULL;
+       int adj = populated ? 1 : -1;
        lockdep_assert_held(&css_set_lock);
  
        do {
-               bool trigger;
+               bool was_populated = cgroup_is_populated(cgrp);
  
-               if (populated)
-                       trigger = !cgrp->populated_cnt++;
-               else
-                       trigger = !--cgrp->populated_cnt;
+               if (!child) {
+                       cgrp->nr_populated_csets += adj;
+               } else {
+                       if (cgroup_is_threaded(child))
+                               cgrp->nr_populated_threaded_children += adj;
+                       else
+                               cgrp->nr_populated_domain_children += adj;
+               }
  
-               if (!trigger)
+               if (was_populated == cgroup_is_populated(cgrp))
                        break;
  
                cgroup1_check_for_release(cgrp);
                cgroup_file_notify(&cgrp->events_file);
  
+               child = cgrp;
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
  }
   * @populated: whether @cset is populated or depopulated
   *
   * @cset is either getting the first task or losing the last.  Update the
-  * ->populated_cnt of all associated cgroups accordingly.
+  * populated counters of all associated cgroups accordingly.
   */
  static void css_set_update_populated(struct css_set *cset, bool populated)
  {
   * css_set, @from_cset can be NULL.  If @task is being disassociated
   * instead of moved, @to_cset can be NULL.
   *
-  * This function automatically handles populated_cnt updates and
+  * This function automatically handles populated counter updates and
   * css_task_iter adjustments but the caller is responsible for managing
   * @from_cset and @to_cset's reference counts.
   */
@@@ -737,6 -832,8 +832,8 @@@ void put_css_set_locked(struct css_set 
        if (!refcount_dec_and_test(&cset->refcount))
                return;
  
+       WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
        /* This css_set is dead. unlink it and release cgroup and css refs */
        for_each_subsys(ss, ssid) {
                list_del(&cset->e_cset_node[ssid]);
                kfree(link);
        }
  
+       if (css_set_threaded(cset)) {
+               list_del(&cset->threaded_csets_node);
+               put_css_set_locked(cset->dom_cset);
+       }
        kfree_rcu(cset, rcu_head);
  }
  
@@@ -771,6 -873,7 +873,7 @@@ static bool compare_css_sets(struct css
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
  {
+       struct cgroup *new_dfl_cgrp;
        struct list_head *l1, *l2;
  
        /*
        if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
                return false;
  
+       /* @cset's domain should match the default cgroup's */
+       if (cgroup_on_dfl(new_cgrp))
+               new_dfl_cgrp = new_cgrp;
+       else
+               new_dfl_cgrp = old_cset->dfl_cgrp;
+       if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+               return false;
        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies.  As different cgroups may
@@@ -988,9 -1101,11 +1101,11 @@@ static struct css_set *find_css_set(str
        }
  
        refcount_set(&cset->refcount, 1);
+       cset->dom_cset = cset;
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
+       INIT_LIST_HEAD(&cset->threaded_csets);
        INIT_HLIST_NODE(&cset->hlist);
        INIT_LIST_HEAD(&cset->cgrp_links);
        INIT_LIST_HEAD(&cset->mg_preload_node);
  
        spin_unlock_irq(&css_set_lock);
  
+       /*
+        * If @cset should be threaded, look up the matching dom_cset and
+        * link them up.  We first fully initialize @cset then look for the
+        * dom_cset.  It's simpler this way and safe as @cset is guaranteed
+        * to stay empty until we return.
+        */
+       if (cgroup_is_threaded(cset->dfl_cgrp)) {
+               struct css_set *dcset;
+               dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+               if (!dcset) {
+                       put_css_set(cset);
+                       return NULL;
+               }
+               spin_lock_irq(&css_set_lock);
+               cset->dom_cset = dcset;
+               list_add_tail(&cset->threaded_csets_node,
+                             &dcset->threaded_csets);
+               spin_unlock_irq(&css_set_lock);
+       }
        return cset;
  }
  
@@@ -1155,6 -1292,8 +1292,8 @@@ static struct cgroup *cset_cgroup_from_
  
        if (cset == &init_css_set) {
                res = &root->cgrp;
+       } else if (root == &cgrp_dfl_root) {
+               res = cset->dfl_cgrp;
        } else {
                struct cgrp_cset_link *link;
  
@@@ -1670,6 -1809,9 +1809,9 @@@ static void init_cgroup_housekeeping(st
        mutex_init(&cgrp->pidlist_mutex);
        cgrp->self.cgroup = cgrp;
        cgrp->self.flags |= CSS_ONLINE;
+       cgrp->dom_cgrp = cgrp;
+       cgrp->max_descendants = INT_MAX;
+       cgrp->max_depth = INT_MAX;
  
        for_each_subsys(ss, ssid)
                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@@ -2006,8 -2148,6 +2148,8 @@@ static void cgroup_migrate_add_task(str
        if (!cset->mg_src_cgrp)
                return;
  
 +      mgctx->tset.nr_tasks++;
 +
        list_move_tail(&task->cg_list, &cset->mg_tasks);
        if (list_empty(&cset->mg_node))
                list_add_tail(&cset->mg_node,
@@@ -2096,19 -2236,21 +2238,19 @@@ static int cgroup_migrate_execute(struc
        struct css_set *cset, *tmp_cset;
        int ssid, failed_ssid, ret;
  
 -      /* methods shouldn't be called if no task is actually migrating */
 -      if (list_empty(&tset->src_csets))
 -              return 0;
 -
        /* check that we can legitimately attach to the cgroup */
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ss->can_attach) {
 -                      tset->ssid = ssid;
 -                      ret = ss->can_attach(tset);
 -                      if (ret) {
 -                              failed_ssid = ssid;
 -                              goto out_cancel_attach;
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ss->can_attach) {
 +                              tset->ssid = ssid;
 +                              ret = ss->can_attach(tset);
 +                              if (ret) {
 +                                      failed_ssid = ssid;
 +                                      goto out_cancel_attach;
 +                              }
                        }
 -              }
 -      } while_each_subsys_mask();
 +              } while_each_subsys_mask();
 +      }
  
        /*
         * Now that we're guaranteed success, proceed to move all tasks to
         */
        tset->csets = &tset->dst_csets;
  
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ss->attach) {
 -                      tset->ssid = ssid;
 -                      ss->attach(tset);
 -              }
 -      } while_each_subsys_mask();
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ss->attach) {
 +                              tset->ssid = ssid;
 +                              ss->attach(tset);
 +                      }
 +              } while_each_subsys_mask();
 +      }
  
        ret = 0;
        goto out_release_tset;
  
  out_cancel_attach:
 -      do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 -              if (ssid == failed_ssid)
 -                      break;
 -              if (ss->cancel_attach) {
 -                      tset->ssid = ssid;
 -                      ss->cancel_attach(tset);
 -              }
 -      } while_each_subsys_mask();
 +      if (tset->nr_tasks) {
 +              do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
 +                      if (ssid == failed_ssid)
 +                              break;
 +                      if (ss->cancel_attach) {
 +                              tset->ssid = ssid;
 +                              ss->cancel_attach(tset);
 +                      }
 +              } while_each_subsys_mask();
 +      }
  out_release_tset:
        spin_lock_irq(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
  }
  
  /**
-  * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+  * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
   * @dst_cgrp: destination cgroup to test
   *
-  * On the default hierarchy, except for the root, subtree_control must be
-  * zero for migration destination cgroups with tasks so that child cgroups
-  * don't compete against tasks.
+  * On the default hierarchy, except for the mixable, (possible) thread root
+  * and threaded cgroups, subtree_control must be zero for migration
+  * destination cgroups with tasks so that child cgroups don't compete
+  * against tasks.
   */
- bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
  {
-       return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
-               !dst_cgrp->subtree_control;
+       /* v1 doesn't have any restriction */
+       if (!cgroup_on_dfl(dst_cgrp))
+               return 0;
+       /* verify @dst_cgrp can host resources */
+       if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+               return -EOPNOTSUPP;
+       /* mixables don't care */
+       if (cgroup_is_mixable(dst_cgrp))
+               return 0;
+       /*
+        * If @dst_cgrp is already or can become a thread root or is
+        * threaded, it doesn't matter.
+        */
+       if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+               return 0;
+       /* apply no-internal-process constraint */
+       if (dst_cgrp->subtree_control)
+               return -EBUSY;
+       return 0;
  }
  
  /**
@@@ -2387,8 -2548,9 +2552,9 @@@ int cgroup_attach_task(struct cgroup *d
        struct task_struct *task;
        int ret;
  
-       if (!cgroup_may_migrate_to(dst_cgrp))
-               return -EBUSY;
+       ret = cgroup_migrate_vet_dst(dst_cgrp);
+       if (ret)
+               return ret;
  
        /* look up all src csets */
        spin_lock_irq(&css_set_lock);
        return ret;
  }
  
- static int cgroup_procs_write_permission(struct task_struct *task,
-                                        struct cgroup *dst_cgrp,
-                                        struct kernfs_open_file *of)
- {
-       struct super_block *sb = of->file->f_path.dentry->d_sb;
-       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
-       struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
-       struct cgroup *src_cgrp, *com_cgrp;
-       struct inode *inode;
-       int ret;
-       if (!cgroup_on_dfl(dst_cgrp)) {
-               const struct cred *cred = current_cred();
-               const struct cred *tcred = get_task_cred(task);
-               /*
-                * even if we're attaching all tasks in the thread group,
-                * we only need to check permissions on one of them.
-                */
-               if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
-                   uid_eq(cred->euid, tcred->uid) ||
-                   uid_eq(cred->euid, tcred->suid))
-                       ret = 0;
-               else
-                       ret = -EACCES;
-               put_cred(tcred);
-               return ret;
-       }
-       /* find the source cgroup */
-       spin_lock_irq(&css_set_lock);
-       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-       spin_unlock_irq(&css_set_lock);
-       /* and the common ancestor */
-       com_cgrp = src_cgrp;
-       while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
-               com_cgrp = cgroup_parent(com_cgrp);
-       /* %current should be authorized to migrate to the common ancestor */
-       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
-       if (!inode)
-               return -ENOMEM;
-       ret = inode_permission(inode, MAY_WRITE);
-       iput(inode);
-       if (ret)
-               return ret;
-       /*
-        * If namespaces are delegation boundaries, %current must be able
-        * to see both source and destination cgroups from its namespace.
-        */
-       if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
-           (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
-            !cgroup_is_descendant(dst_cgrp, root_cgrp)))
-               return -ENOENT;
-       return 0;
- }
- /*
-  * Find the task_struct of the task to attach by vpid and pass it along to the
-  * function to attach either it or all tasks in its threadgroup. Will lock
-  * cgroup_mutex and threadgroup.
-  */
- ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
-                            size_t nbytes, loff_t off, bool threadgroup)
+ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
+       __acquires(&cgroup_threadgroup_rwsem)
  {
        struct task_struct *tsk;
-       struct cgroup_subsys *ss;
-       struct cgroup *cgrp;
        pid_t pid;
-       int ssid, ret;
  
        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
-               return -EINVAL;
-       cgrp = cgroup_kn_lock_live(of->kn, false);
-       if (!cgrp)
-               return -ENODEV;
+               return ERR_PTR(-EINVAL);
  
        percpu_down_write(&cgroup_threadgroup_rwsem);
        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
-                       ret = -ESRCH;
-                       goto out_unlock_rcu;
+                       tsk = ERR_PTR(-ESRCH);
+                       goto out_unlock_threadgroup;
                }
        } else {
                tsk = current;
         * cgroup with no rt_runtime allocated.  Just say no.
         */
        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
-               ret = -EINVAL;
-               goto out_unlock_rcu;
+               tsk = ERR_PTR(-EINVAL);
+               goto out_unlock_threadgroup;
        }
  
        get_task_struct(tsk);
+       goto out_unlock_rcu;
+ out_unlock_threadgroup:
+       percpu_up_write(&cgroup_threadgroup_rwsem);
+ out_unlock_rcu:
        rcu_read_unlock();
+       return tsk;
+ }
  
-       ret = cgroup_procs_write_permission(tsk, cgrp, of);
-       if (!ret)
-               ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+ void cgroup_procs_write_finish(struct task_struct *task)
+       __releases(&cgroup_threadgroup_rwsem)
+ {
+       struct cgroup_subsys *ss;
+       int ssid;
  
-       put_task_struct(tsk);
-       goto out_unlock_threadgroup;
+       /* release reference from cgroup_procs_write_start() */
+       put_task_struct(task);
  
- out_unlock_rcu:
-       rcu_read_unlock();
- out_unlock_threadgroup:
        percpu_up_write(&cgroup_threadgroup_rwsem);
        for_each_subsys(ss, ssid)
                if (ss->post_attach)
                        ss->post_attach();
-       cgroup_kn_unlock(of->kn);
-       return ret ?: nbytes;
- }
- ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
-                          loff_t off)
- {
-       return __cgroup_procs_write(of, buf, nbytes, off, true);
  }
  
  static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
@@@ -2891,6 -2978,46 +2982,46 @@@ static void cgroup_finalize_control(str
        cgroup_apply_control_disable(cgrp);
  }
  
+ static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+ {
+       u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+       /* if nothing is getting enabled, nothing to worry about */
+       if (!enable)
+               return 0;
+       /* can @cgrp host any resources? */
+       if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+               return -EOPNOTSUPP;
+       /* mixables don't care */
+       if (cgroup_is_mixable(cgrp))
+               return 0;
+       if (domain_enable) {
+               /* can't enable domain controllers inside a thread subtree */
+               if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+                       return -EOPNOTSUPP;
+       } else {
+               /*
+                * Threaded controllers can handle internal competitions
+                * and are always allowed inside a (prospective) thread
+                * subtree.
+                */
+               if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+                       return 0;
+       }
+       /*
+        * Controllers can't be enabled for a cgroup with tasks to avoid
+        * child cgroups competing against tasks.
+        */
+       if (cgroup_has_tasks(cgrp))
+               return -EBUSY;
+       return 0;
+ }
  /* change the enabled child controllers for a cgroup in the default hierarchy */
  static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                goto out_unlock;
        }
  
-       /*
-        * Except for the root, subtree_control must be zero for a cgroup
-        * with tasks so that child cgroups don't compete against tasks.
-        */
-       if (enable && cgroup_parent(cgrp)) {
-               struct cgrp_cset_link *link;
-               /*
-                * Because namespaces pin csets too, @cgrp->cset_links
-                * might not be empty even when @cgrp is empty.  Walk and
-                * verify each cset.
-                */
-               spin_lock_irq(&css_set_lock);
-               ret = 0;
-               list_for_each_entry(link, &cgrp->cset_links, cset_link) {
-                       if (css_set_populated(link->cset)) {
-                               ret = -EBUSY;
-                               break;
-                       }
-               }
-               spin_unlock_irq(&css_set_lock);
-               if (ret)
-                       goto out_unlock;
-       }
+       ret = cgroup_vet_subtree_control_enable(cgrp, enable);
+       if (ret)
+               goto out_unlock;
  
        /* save and update control masks and prepare csses */
        cgroup_save_control(cgrp);
        cgrp->subtree_control &= ~disable;
  
        ret = cgroup_apply_control(cgrp);
 -
        cgroup_finalize_control(cgrp, ret);
 +      if (ret)
 +              goto out_unlock;
  
        kernfs_activate(cgrp->kn);
 -      ret = 0;
  out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
  }
  
+ /**
+  * cgroup_enable_threaded - make @cgrp threaded
+  * @cgrp: the target cgroup
+  *
+  * Called when "threaded" is written to the cgroup.type interface file and
+  * tries to make @cgrp threaded and join the parent's resource domain.
+  * This function is never called on the root cgroup as cgroup.type doesn't
+  * exist on it.
+  */
+ static int cgroup_enable_threaded(struct cgroup *cgrp)
+ {
+       struct cgroup *parent = cgroup_parent(cgrp);
+       struct cgroup *dom_cgrp = parent->dom_cgrp;
+       int ret;
+       lockdep_assert_held(&cgroup_mutex);
+       /* noop if already threaded */
+       if (cgroup_is_threaded(cgrp))
+               return 0;
+       /* we're joining the parent's domain, ensure its validity */
+       if (!cgroup_is_valid_domain(dom_cgrp) ||
+           !cgroup_can_be_thread_root(dom_cgrp))
+               return -EOPNOTSUPP;
+       /*
+        * The following shouldn't cause actual migrations and should
+        * always succeed.
+        */
+       cgroup_save_control(cgrp);
+       cgrp->dom_cgrp = dom_cgrp;
+       ret = cgroup_apply_control(cgrp);
+       if (!ret)
+               parent->nr_threaded_children++;
+       else
+               cgrp->dom_cgrp = cgrp;
+       cgroup_finalize_control(cgrp, ret);
+       return ret;
+ }
+ static int cgroup_type_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       if (cgroup_is_threaded(cgrp))
+               seq_puts(seq, "threaded\n");
+       else if (!cgroup_is_valid_domain(cgrp))
+               seq_puts(seq, "domain invalid\n");
+       else if (cgroup_is_thread_root(cgrp))
+               seq_puts(seq, "domain threaded\n");
+       else
+               seq_puts(seq, "domain\n");
+       return 0;
+ }
+ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+                                size_t nbytes, loff_t off)
+ {
+       struct cgroup *cgrp;
+       int ret;
+       /* only switching to threaded mode is supported */
+       if (strcmp(strstrip(buf), "threaded"))
+               return -EINVAL;
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+       /* threaded can only be enabled */
+       ret = cgroup_enable_threaded(cgrp);
+       cgroup_kn_unlock(of->kn);
+       return ret ?: nbytes;
+ }
+ static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       int descendants = READ_ONCE(cgrp->max_descendants);
+       if (descendants == INT_MAX)
+               seq_puts(seq, "max\n");
+       else
+               seq_printf(seq, "%d\n", descendants);
+       return 0;
+ }
+ static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
+                                          char *buf, size_t nbytes, loff_t off)
+ {
+       struct cgroup *cgrp;
+       int descendants;
+       ssize_t ret;
+       buf = strstrip(buf);
+       if (!strcmp(buf, "max")) {
+               descendants = INT_MAX;
+       } else {
+               ret = kstrtoint(buf, 0, &descendants);
+               if (ret)
+                       return ret;
+       }
+       if (descendants < 0)
+               return -ERANGE;
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+       cgrp->max_descendants = descendants;
+       cgroup_kn_unlock(of->kn);
+       return nbytes;
+ }
+ static int cgroup_max_depth_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       int depth = READ_ONCE(cgrp->max_depth);
+       if (depth == INT_MAX)
+               seq_puts(seq, "max\n");
+       else
+               seq_printf(seq, "%d\n", depth);
+       return 0;
+ }
+ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes, loff_t off)
+ {
+       struct cgroup *cgrp;
+       ssize_t ret;
+       int depth;
+       buf = strstrip(buf);
+       if (!strcmp(buf, "max")) {
+               depth = INT_MAX;
+       } else {
+               ret = kstrtoint(buf, 0, &depth);
+               if (ret)
+                       return ret;
+       }
+       if (depth < 0)
+               return -ERANGE;
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+       cgrp->max_depth = depth;
+       cgroup_kn_unlock(of->kn);
+       return nbytes;
+ }
  static int cgroup_events_show(struct seq_file *seq, void *v)
  {
        seq_printf(seq, "populated %d\n",
        return 0;
  }
  
+ static int cgroup_stat_show(struct seq_file *seq, void *v)
+ {
+       struct cgroup *cgroup = seq_css(seq)->cgroup;
+       seq_printf(seq, "nr_descendants %d\n",
+                  cgroup->nr_descendants);
+       seq_printf(seq, "nr_dying_descendants %d\n",
+                  cgroup->nr_dying_descendants);
+       return 0;
+ }
  static int cgroup_file_open(struct kernfs_open_file *of)
  {
        struct cftype *cft = of->kn->priv;
@@@ -3234,7 -3515,6 +3519,6 @@@ restart
  
  static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
  {
-       LIST_HEAD(pending);
        struct cgroup_subsys *ss = cfts[0].ss;
        struct cgroup *root = &ss->root->cgrp;
        struct cgroup_subsys_state *css;
@@@ -3659,6 -3939,58 +3943,58 @@@ bool css_has_online_children(struct cgr
        return ret;
  }
  
+ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
+ {
+       struct list_head *l;
+       struct cgrp_cset_link *link;
+       struct css_set *cset;
+       lockdep_assert_held(&css_set_lock);
+       /* find the next threaded cset */
+       if (it->tcset_pos) {
+               l = it->tcset_pos->next;
+               if (l != it->tcset_head) {
+                       it->tcset_pos = l;
+                       return container_of(l, struct css_set,
+                                           threaded_csets_node);
+               }
+               it->tcset_pos = NULL;
+       }
+       /* find the next cset */
+       l = it->cset_pos;
+       l = l->next;
+       if (l == it->cset_head) {
+               it->cset_pos = NULL;
+               return NULL;
+       }
+       if (it->ss) {
+               cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
+       } else {
+               link = list_entry(l, struct cgrp_cset_link, cset_link);
+               cset = link->cset;
+       }
+       it->cset_pos = l;
+       /* initialize threaded css_set walking */
+       if (it->flags & CSS_TASK_ITER_THREADED) {
+               if (it->cur_dcset)
+                       put_css_set_locked(it->cur_dcset);
+               it->cur_dcset = cset;
+               get_css_set(cset);
+               it->tcset_head = &cset->threaded_csets;
+               it->tcset_pos = &cset->threaded_csets;
+       }
+       return cset;
+ }
  /**
   * css_task_iter_advance_css_set - advance a task itererator to the next css_set
   * @it: the iterator to advance
   */
  static void css_task_iter_advance_css_set(struct css_task_iter *it)
  {
-       struct list_head *l = it->cset_pos;
-       struct cgrp_cset_link *link;
        struct css_set *cset;
  
        lockdep_assert_held(&css_set_lock);
  
        /* Advance to the next non-empty css_set */
        do {
-               l = l->next;
-               if (l == it->cset_head) {
-                       it->cset_pos = NULL;
+               cset = css_task_iter_next_css_set(it);
+               if (!cset) {
                        it->task_pos = NULL;
                        return;
                }
-               if (it->ss) {
-                       cset = container_of(l, struct css_set,
-                                           e_cset_node[it->ss->id]);
-               } else {
-                       link = list_entry(l, struct cgrp_cset_link, cset_link);
-                       cset = link->cset;
-               }
        } while (!css_set_populated(cset));
  
-       it->cset_pos = l;
        if (!list_empty(&cset->tasks))
                it->task_pos = cset->tasks.next;
        else
@@@ -3732,6 -4051,7 +4055,7 @@@ static void css_task_iter_advance(struc
        lockdep_assert_held(&css_set_lock);
        WARN_ON_ONCE(!l);
  
+ repeat:
        /*
         * Advance iterator to find next entry.  cset->tasks is consumed
         * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
                css_task_iter_advance_css_set(it);
        else
                it->task_pos = l;
+       /* if PROCS, skip over tasks which aren't group leaders */
+       if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
+           !thread_group_leader(list_entry(it->task_pos, struct task_struct,
+                                           cg_list)))
+               goto repeat;
  }
  
  /**
   * css_task_iter_start - initiate task iteration
   * @css: the css to walk tasks of
+  * @flags: CSS_TASK_ITER_* flags
   * @it: the task iterator to use
   *
   * Initiate iteration through the tasks of @css.  The caller can call
   * returns NULL.  On completion of iteration, css_task_iter_end() must be
   * called.
   */
- void css_task_iter_start(struct cgroup_subsys_state *css,
+ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it)
  {
        /* no one should try to iterate before mounting cgroups */
        spin_lock_irq(&css_set_lock);
  
        it->ss = css->ss;
+       it->flags = flags;
  
        if (it->ss)
                it->cset_pos = &css->cgroup->e_csets[css->ss->id];
@@@ -3826,6 -4154,9 +4158,9 @@@ void css_task_iter_end(struct css_task_
                spin_unlock_irq(&css_set_lock);
        }
  
+       if (it->cur_dcset)
+               put_css_set(it->cur_dcset);
        if (it->cur_task)
                put_task_struct(it->cur_task);
  }
@@@ -3842,16 -4173,12 +4177,12 @@@ static void *cgroup_procs_next(struct s
  {
        struct kernfs_open_file *of = s->private;
        struct css_task_iter *it = of->priv;
-       struct task_struct *task;
  
-       do {
-               task = css_task_iter_next(it);
-       } while (task && !thread_group_leader(task));
-       return task;
+       return css_task_iter_next(it);
  }
  
- static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
+                                 unsigned int iter_flags)
  {
        struct kernfs_open_file *of = s->private;
        struct cgroup *cgrp = seq_css(s)->cgroup;
                if (!it)
                        return ERR_PTR(-ENOMEM);
                of->priv = it;
-               css_task_iter_start(&cgrp->self, it);
+               css_task_iter_start(&cgrp->self, iter_flags, it);
        } else if (!(*pos)++) {
                css_task_iter_end(it);
-               css_task_iter_start(&cgrp->self, it);
+               css_task_iter_start(&cgrp->self, iter_flags, it);
        }
  
        return cgroup_procs_next(s, NULL, NULL);
  }
  
+ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
+ {
+       struct cgroup *cgrp = seq_css(s)->cgroup;
+       /*
+        * All processes of a threaded subtree belong to the domain cgroup
+        * of the subtree.  Only threads can be distributed across the
+        * subtree.  Reject reads on cgroup.procs in the subtree proper.
+        * They're always empty anyway.
+        */
+       if (cgroup_is_threaded(cgrp))
+               return ERR_PTR(-EOPNOTSUPP);
+       return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
+                                           CSS_TASK_ITER_THREADED);
+ }
  static int cgroup_procs_show(struct seq_file *s, void *v)
  {
-       seq_printf(s, "%d\n", task_tgid_vnr(v));
+       seq_printf(s, "%d\n", task_pid_vnr(v));
        return 0;
  }
  
+ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
+                                        struct cgroup *dst_cgrp,
+                                        struct super_block *sb)
+ {
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+       struct cgroup *com_cgrp = src_cgrp;
+       struct inode *inode;
+       int ret;
+       lockdep_assert_held(&cgroup_mutex);
+       /* find the common ancestor */
+       while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+               com_cgrp = cgroup_parent(com_cgrp);
+       /* %current should be authorized to migrate to the common ancestor */
+       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+       if (!inode)
+               return -ENOMEM;
+       ret = inode_permission(inode, MAY_WRITE);
+       iput(inode);
+       if (ret)
+               return ret;
+       /*
+        * If namespaces are delegation boundaries, %current must be able
+        * to see both source and destination cgroups from its namespace.
+        */
+       if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+           (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
+            !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
+               return -ENOENT;
+       return 0;
+ }
+ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+                                 char *buf, size_t nbytes, loff_t off)
+ {
+       struct cgroup *src_cgrp, *dst_cgrp;
+       struct task_struct *task;
+       ssize_t ret;
+       dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!dst_cgrp)
+               return -ENODEV;
+       task = cgroup_procs_write_start(buf, true);
+       ret = PTR_ERR_OR_ZERO(task);
+       if (ret)
+               goto out_unlock;
+       /* find the source cgroup */
+       spin_lock_irq(&css_set_lock);
+       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+       spin_unlock_irq(&css_set_lock);
+       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+                                           of->file->f_path.dentry->d_sb);
+       if (ret)
+               goto out_finish;
+       ret = cgroup_attach_task(dst_cgrp, task, true);
+ out_finish:
+       cgroup_procs_write_finish(task);
+ out_unlock:
+       cgroup_kn_unlock(of->kn);
+       return ret ?: nbytes;
+ }
+ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
+ {
+       return __cgroup_procs_start(s, pos, 0);
+ }
+ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
+                                   char *buf, size_t nbytes, loff_t off)
+ {
+       struct cgroup *src_cgrp, *dst_cgrp;
+       struct task_struct *task;
+       ssize_t ret;
+       buf = strstrip(buf);
+       dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!dst_cgrp)
+               return -ENODEV;
+       task = cgroup_procs_write_start(buf, false);
+       ret = PTR_ERR_OR_ZERO(task);
+       if (ret)
+               goto out_unlock;
+       /* find the source cgroup */
+       spin_lock_irq(&css_set_lock);
+       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+       spin_unlock_irq(&css_set_lock);
+       /* thread migrations follow the cgroup.procs delegation rule */
+       ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
+                                           of->file->f_path.dentry->d_sb);
+       if (ret)
+               goto out_finish;
+       /* and must be contained in the same domain */
+       ret = -EOPNOTSUPP;
+       if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
+               goto out_finish;
+       ret = cgroup_attach_task(dst_cgrp, task, false);
+ out_finish:
+       cgroup_procs_write_finish(task);
+ out_unlock:
+       cgroup_kn_unlock(of->kn);
+       return ret ?: nbytes;
+ }
  /* cgroup core interface files for the default hierarchy */
  static struct cftype cgroup_base_files[] = {
+       {
+               .name = "cgroup.type",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cgroup_type_show,
+               .write = cgroup_type_write,
+       },
        {
                .name = "cgroup.procs",
                .flags = CFTYPE_NS_DELEGATABLE,
                .seq_show = cgroup_procs_show,
                .write = cgroup_procs_write,
        },
+       {
+               .name = "cgroup.threads",
+               .release = cgroup_procs_release,
+               .seq_start = cgroup_threads_start,
+               .seq_next = cgroup_procs_next,
+               .seq_show = cgroup_procs_show,
+               .write = cgroup_threads_write,
+       },
        {
                .name = "cgroup.controllers",
                .seq_show = cgroup_controllers_show,
                .file_offset = offsetof(struct cgroup, events_file),
                .seq_show = cgroup_events_show,
        },
+       {
+               .name = "cgroup.max.descendants",
+               .seq_show = cgroup_max_descendants_show,
+               .write = cgroup_max_descendants_write,
+       },
+       {
+               .name = "cgroup.max.depth",
+               .seq_show = cgroup_max_depth_show,
+               .write = cgroup_max_depth_write,
+       },
+       {
+               .name = "cgroup.stat",
+               .seq_show = cgroup_stat_show,
+       },
        { }     /* terminate */
  };
  
@@@ -4011,9 -4505,15 +4509,15 @@@ static void css_release_work_fn(struct 
                if (ss->css_released)
                        ss->css_released(css);
        } else {
+               struct cgroup *tcgrp;
                /* cgroup release path */
                trace_cgroup_release(cgrp);
  
+               for (tcgrp = cgroup_parent(cgrp); tcgrp;
+                    tcgrp = cgroup_parent(tcgrp))
+                       tcgrp->nr_dying_descendants--;
                cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
                cgrp->id = -1;
  
@@@ -4100,6 -4600,9 +4604,6 @@@ static void offline_css(struct cgroup_s
        if (!(css->flags & CSS_ONLINE))
                return;
  
 -      if (ss->css_reset)
 -              ss->css_reset(css);
 -
        if (ss->css_offline)
                ss->css_offline(css);
  
@@@ -4209,9 -4712,13 +4713,13 @@@ static struct cgroup *cgroup_create(str
        cgrp->root = root;
        cgrp->level = level;
  
-       for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+       for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
  
+               if (tcgrp != cgrp)
+                       tcgrp->nr_descendants++;
+       }
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
  
@@@ -4252,6 -4759,29 +4760,29 @@@ out_free_cgrp
        return ERR_PTR(ret);
  }
  
+ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
+ {
+       struct cgroup *cgroup;
+       int ret = false;
+       int level = 1;
+       lockdep_assert_held(&cgroup_mutex);
+       for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
+               if (cgroup->nr_descendants >= cgroup->max_descendants)
+                       goto fail;
+               if (level > cgroup->max_depth)
+                       goto fail;
+               level++;
+       }
+       ret = true;
+ fail:
+       return ret;
+ }
  int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
  {
        struct cgroup *parent, *cgrp;
        if (!parent)
                return -ENODEV;
  
+       if (!cgroup_check_hierarchy_limits(parent)) {
+               ret = -EAGAIN;
+               goto out_unlock;
+       }
        cgrp = cgroup_create(parent);
        if (IS_ERR(cgrp)) {
                ret = PTR_ERR(cgrp);
@@@ -4417,6 -4952,7 +4953,7 @@@ static void kill_css(struct cgroup_subs
  static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
+       struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *css;
        struct cgrp_cset_link *link;
        int ssid;
         */
        kernfs_remove(cgrp->kn);
  
-       cgroup1_check_for_release(cgroup_parent(cgrp));
+       if (parent && cgroup_is_threaded(cgrp))
+               parent->nr_threaded_children--;
+       for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+               tcgrp->nr_descendants--;
+               tcgrp->nr_dying_descendants++;
+       }
+       cgroup1_check_for_release(parent);
  
        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);
@@@ -4656,11 -5200,17 +5201,17 @@@ int __init cgroup_init(void
  
                cgrp_dfl_root.subsys_mask |= 1 << ss->id;
  
+               /* implicit controllers must be threaded too */
+               WARN_ON(ss->implicit_on_dfl && !ss->threaded);
                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
  
+               if (ss->threaded)
+                       cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
  
                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);
 +
 +              mutex_lock(&cgroup_mutex);
 +              css_populate_dir(init_css_set.subsys[ssid]);
 +              mutex_unlock(&cgroup_mutex);
        }
  
        /* init_css_set.subsys[] has been updated, re-hash */
diff --combined kernel/cgroup/cpuset.c
index e7485786db9b3e8e881d045028d288d3b0cb71b1,f3539a41c49df3224047148b73bff8dae263f14e..67230ecf2ce155a36a045b0bb1c080224521a63a
@@@ -56,7 -56,6 +56,7 @@@
  #include <linux/time64.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
 +#include <linux/oom.h>
  
  #include <linux/uaccess.h>
  #include <linux/atomic.h>
@@@ -64,7 -63,6 +64,7 @@@
  #include <linux/cgroup.h>
  #include <linux/wait.h>
  
 +DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
  DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
  
  /* See "Frequency meter" comments, below. */
@@@ -300,6 -298,16 +300,16 @@@ static DECLARE_WORK(cpuset_hotplug_work
  
  static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
  
+ /*
+  * Cgroup v2 behavior is used when on default hierarchy or the
+  * cgroup_v2_mode flag is set.
+  */
+ static inline bool is_in_v2_mode(void)
+ {
+       return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+             (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+ }
  /*
   * This is ugly, but preserves the userspace API for existing cpuset
   * users. If someone tries to mount the "cpuset" filesystem, we
@@@ -490,8 -498,7 +500,7 @@@ static int validate_change(struct cpuse
  
        /* On legacy hiearchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
-       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-           !is_cpuset_subset(trial, par))
+       if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
                goto out;
  
        /*
@@@ -578,13 -585,6 +587,13 @@@ static void update_domain_attr_tree(str
        rcu_read_unlock();
  }
  
 +/* Must be called with cpuset_mutex held.  */
 +static inline int nr_cpusets(void)
 +{
 +      /* jump label reference count + the top-level cpuset */
 +      return static_key_count(&cpusets_enabled_key.key) + 1;
 +}
 +
  /*
   * generate_sched_domains()
   *
@@@ -870,7 -870,7 +879,7 @@@ static void update_tasks_cpumask(struc
        struct css_task_iter it;
        struct task_struct *task;
  
-       css_task_iter_start(&cs->css, &it);
+       css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                set_cpus_allowed_ptr(task, cs->effective_cpus);
        css_task_iter_end(&it);
@@@ -904,8 -904,7 +913,7 @@@ static void update_cpumasks_hier(struc
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs.
                 */
-               if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-                   cpumask_empty(new_cpus))
+               if (is_in_v2_mode() && cpumask_empty(new_cpus))
                        cpumask_copy(new_cpus, parent->effective_cpus);
  
                /* Skip the whole subtree if the cpumask remains the same. */
                cpumask_copy(cp->effective_cpus, new_cpus);
                spin_unlock_irq(&callback_lock);
  
-               WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+               WARN_ON(!is_in_v2_mode() &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
  
                update_tasks_cpumask(cp);
@@@ -1100,7 -1099,7 +1108,7 @@@ static void update_tasks_nodemask(struc
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
         */
-       css_task_iter_start(&cs->css, &it);
+       css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                struct mm_struct *mm;
                bool migrate;
@@@ -1158,8 -1157,7 +1166,7 @@@ static void update_nodemasks_hier(struc
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some MEMs.
                 */
-               if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-                   nodes_empty(*new_mems))
+               if (is_in_v2_mode() && nodes_empty(*new_mems))
                        *new_mems = parent->effective_mems;
  
                /* Skip the whole subtree if the nodemask remains the same. */
                cp->effective_mems = *new_mems;
                spin_unlock_irq(&callback_lock);
  
-               WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+               WARN_ON(!is_in_v2_mode() &&
                        !nodes_equal(cp->mems_allowed, cp->effective_mems));
  
                update_tasks_nodemask(cp);
@@@ -1293,7 -1291,7 +1300,7 @@@ static void update_tasks_flags(struct c
        struct css_task_iter it;
        struct task_struct *task;
  
-       css_task_iter_start(&cs->css, &it);
+       css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                cpuset_update_task_spread_flag(cs, task);
        css_task_iter_end(&it);
@@@ -1468,7 -1466,7 +1475,7 @@@ static int cpuset_can_attach(struct cgr
  
        /* allow moving tasks into an empty cpuset if on default hierarchy */
        ret = -ENOSPC;
-       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+       if (!is_in_v2_mode() &&
            (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                goto out_unlock;
  
@@@ -1900,7 -1898,6 +1907,7 @@@ static struct cftype files[] = 
        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
 +              .private = FILE_MEMORY_PRESSURE,
        },
  
        {
@@@ -1987,7 -1984,7 +1994,7 @@@ static int cpuset_css_online(struct cgr
        cpuset_inc();
  
        spin_lock_irq(&callback_lock);
-       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+       if (is_in_v2_mode()) {
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
        }
@@@ -2064,7 -2061,7 +2071,7 @@@ static void cpuset_bind(struct cgroup_s
        mutex_lock(&cpuset_mutex);
        spin_lock_irq(&callback_lock);
  
-       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+       if (is_in_v2_mode()) {
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                top_cpuset.mems_allowed = node_possible_map;
        } else {
@@@ -2258,7 -2255,7 +2265,7 @@@ retry
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);
  
-       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+       if (is_in_v2_mode())
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
@@@ -2289,7 -2286,7 +2296,7 @@@ static void cpuset_hotplug_workfn(struc
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
-       bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+       bool on_dfl = is_in_v2_mode();
  
        mutex_lock(&cpuset_mutex);
  
@@@ -2352,7 -2349,13 +2359,7 @@@ void cpuset_update_active_cpus(void
         * We're inside cpu hotplug critical region which usually nests
         * inside cgroup synchronization.  Bounce actual hotplug processing
         * to a work item to avoid reverse locking order.
 -       *
 -       * We still need to do partition_sched_domains() synchronously;
 -       * otherwise, the scheduler will get confused and put tasks to the
 -       * dead CPU.  Fall back to the default single domain.
 -       * cpuset_hotplug_workfn() will rebuild it as necessary.
         */
 -      partition_sched_domains(1, NULL, NULL);
        schedule_work(&cpuset_hotplug_work);
  }
  
@@@ -2501,12 -2504,12 +2508,12 @@@ static struct cpuset *nearest_hardwall_
   * If we're in interrupt, yes, we can always allocate.  If @node is set in
   * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
   * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 - * yes.  If current has access to memory reserves due to TIF_MEMDIE, yes.
 + * yes.  If current has access to memory reserves as an oom victim, yes.
   * Otherwise, no.
   *
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
   * and do not allow allocations outside the current task's cpuset
 - * unless the task has been OOM killed as is marked TIF_MEMDIE.
 + * unless the task has been OOM killed.
   * GFP_KERNEL allocations are not so marked, so can escape to the
   * nearest enclosing hardwalled ancestor cpuset.
   *
   * affect that:
   *    in_interrupt - any node ok (current task context irrelevant)
   *    GFP_ATOMIC   - any node ok
 - *    TIF_MEMDIE   - any node ok
 + *    tsk_is_oom_victim   - any node ok
   *    GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
   *    GFP_USER     - only nodes in the current task's mems_allowed ok.
   */
@@@ -2547,7 -2550,7 +2554,7 @@@ bool __cpuset_node_allowed(int node, gf
         * Allow tasks that have access to memory reserves because they have
         * been OOM killed to get memory anywhere.
         */
 -      if (unlikely(test_thread_flag(TIF_MEMDIE)))
 +      if (unlikely(tsk_is_oom_victim(current)))
                return true;
        if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
                return false;
diff --combined kernel/events/core.c
index fb415e3d824bdeb966762268a18b7c1e39354e59,ec78247da3100fb9d0590af1574d679292a145c5..3e691b75b2db2eab410208b7312687270e1fe765
@@@ -1249,31 -1249,26 +1249,31 @@@ unclone_ctx(struct perf_event_context *
        return parent_ctx;
  }
  
 -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 +static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
 +                              enum pid_type type)
  {
 +      u32 nr;
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;
  
 -      return task_tgid_nr_ns(p, event->ns);
 +      nr = __task_pid_nr_ns(p, type, event->ns);
 +      /* avoid returning -1 if it is the idle thread or runs in another ns */
 +      if (!nr && !pid_alive(p))
 +              nr = -1;
 +      return nr;
  }
  
 -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
  {
 -      /*
 -       * only top level events have the pid namespace they were created in
 -       */
 -      if (event->parent)
 -              event = event->parent;
 +      return perf_event_pid_type(event, p, __PIDTYPE_TGID);
 +}
  
 -      return task_pid_nr_ns(p, event->ns);
 +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 +{
 +      return perf_event_pid_type(event, p, PIDTYPE_PID);
  }
  
  /*
@@@ -1457,13 -1452,6 +1457,13 @@@ static enum event_type_t get_event_type
  
        lockdep_assert_held(&ctx->lock);
  
 +      /*
 +       * It's 'group type', really, because if our group leader is
 +       * pinned, so are we.
 +       */
 +      if (event->group_leader != event)
 +              event = event->group_leader;
 +
        event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
        if (!ctx->task)
                event_type |= EVENT_CPU;
@@@ -1575,9 -1563,6 +1575,9 @@@ static void __perf_event_header_size(st
        if (sample_type & PERF_SAMPLE_TRANSACTION)
                size += sizeof(data->txn);
  
 +      if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 +              size += sizeof(data->phys_addr);
 +
        event->header_size = size;
  }
  
@@@ -2225,33 -2210,6 +2225,33 @@@ static int group_can_go_on(struct perf_
        return can_add_hw;
  }
  
 +/*
 + * Complement to update_event_times(). This computes the tstamp_* values to
 + * continue 'enabled' state from @now, and effectively discards the time
 + * between the prior tstamp_stopped and now (as we were in the OFF state, or
 + * just switched (context) time base).
 + *
 + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
 + * cannot have been scheduled in yet. And going into INACTIVE state means
 + * '@event->tstamp_stopped = @now'.
 + *
 + * Thus given the rules of update_event_times():
 + *
 + *   total_time_enabled = tstamp_stopped - tstamp_enabled
 + *   total_time_running = tstamp_stopped - tstamp_running
 + *
 + * We can insert 'tstamp_stopped == now' and reverse them to compute new
 + * tstamp_* values.
 + */
 +static void __perf_event_enable_time(struct perf_event *event, u64 now)
 +{
 +      WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
 +
 +      event->tstamp_stopped = now;
 +      event->tstamp_enabled = now - event->total_time_enabled;
 +      event->tstamp_running = now - event->total_time_running;
 +}
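The tstamp reconstruction in __perf_event_enable_time() can be sanity-checked with ordinary arithmetic: plugging tstamp_stopped == now into the update_event_times() rules and solving for the other two timestamps must reproduce the previously accumulated totals. A small stand-alone check (plain integers, no perf internals; the example numbers are arbitrary):

    /*
     * Check of the reconstruction above: with tstamp_stopped = now,
     *   total_time_enabled = tstamp_stopped - tstamp_enabled
     *   total_time_running = tstamp_stopped - tstamp_running
     * must still give back the totals accumulated before the event
     * was turned off.
     */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t total_enabled = 40, total_running = 25, now = 1000;

            uint64_t tstamp_stopped = now;
            uint64_t tstamp_enabled = now - total_enabled;
            uint64_t tstamp_running = now - total_running;

            assert(tstamp_stopped - tstamp_enabled == total_enabled);
            assert(tstamp_stopped - tstamp_running == total_running);
            return 0;
    }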
 +
  static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
  {
  
        list_add_event(event, ctx);
        perf_group_attach(event);
 -      event->tstamp_enabled = tstamp;
 -      event->tstamp_running = tstamp;
 -      event->tstamp_stopped = tstamp;
 +      /*
 +       * We can be called with event->state == STATE_OFF when we create with
 +       * .disabled = 1. In that case the IOC_ENABLE will call this function.
 +       */
 +      if (event->state == PERF_EVENT_STATE_INACTIVE)
 +              __perf_event_enable_time(event, tstamp);
  }
  
  static void ctx_sched_out(struct perf_event_context *ctx,
@@@ -2509,11 -2464,10 +2509,11 @@@ static void __perf_event_mark_enabled(s
        u64 tstamp = perf_event_time(event);
  
        event->state = PERF_EVENT_STATE_INACTIVE;
 -      event->tstamp_enabled = tstamp - event->total_time_enabled;
 +      __perf_event_enable_time(event, tstamp);
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +              /* XXX should not be > INACTIVE if event isn't */
                if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 -                      sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 +                      __perf_event_enable_time(sub, tstamp);
        }
  }
  
@@@ -3219,13 -3173,6 +3219,13 @@@ static void perf_event_context_sched_in
                return;
  
        perf_ctx_lock(cpuctx, ctx);
 +      /*
 +       * We must check ctx->nr_events while holding ctx->lock, such
 +       * that we serialize against perf_install_in_context().
 +       */
 +      if (!ctx->nr_events)
 +              goto unlock;
 +
        perf_pmu_disable(ctx->pmu);
        /*
         * We want to keep the following priority order:
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
 +
 +unlock:
        perf_ctx_unlock(cpuctx, ctx);
  }
  
@@@ -3673,7 -3618,10 +3673,7 @@@ unlock
  
  static inline u64 perf_event_count(struct perf_event *event)
  {
 -      if (event->pmu->count)
 -              return event->pmu->count(event);
 -
 -      return __perf_event_count(event);
 +      return local64_read(&event->count) + atomic64_read(&event->child_count);
  }
  
  /*
@@@ -3704,6 -3652,15 +3704,6 @@@ int perf_event_read_local(struct perf_e
                goto out;
        }
  
 -      /*
 -       * It must not have a pmu::count method, those are not
 -       * NMI safe.
 -       */
 -      if (event->pmu->count) {
 -              ret = -EOPNOTSUPP;
 -              goto out;
 -      }
 -
        /* If this is a per-task event, it must be for current */
        if ((event->attach_state & PERF_ATTACH_TASK) &&
            event->hw.target != current) {
@@@ -4421,9 -4378,7 +4421,9 @@@ EXPORT_SYMBOL_GPL(perf_event_read_value
  static int __perf_read_group_add(struct perf_event *leader,
                                        u64 read_format, u64 *values)
  {
 +      struct perf_event_context *ctx = leader->ctx;
        struct perf_event *sub;
 +      unsigned long flags;
        int n = 1; /* skip @nr */
        int ret;
  
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 +      raw_spin_lock_irqsave(&ctx->lock, flags);
 +
        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
        }
  
 +      raw_spin_unlock_irqrestore(&ctx->lock, flags);
        return 0;
  }
  
@@@ -5126,7 -5078,7 +5126,7 @@@ static void perf_mmap_open(struct vm_ar
                atomic_inc(&event->rb->aux_mmap_count);
  
        if (event->pmu->event_mapped)
 -              event->pmu->event_mapped(event);
 +              event->pmu->event_mapped(event, vma->vm_mm);
  }
  
  static void perf_pmu_output_stop(struct perf_event *event);
@@@ -5149,7 -5101,7 +5149,7 @@@ static void perf_mmap_close(struct vm_a
        unsigned long size = perf_data_size(rb);
  
        if (event->pmu->event_unmapped)
 -              event->pmu->event_unmapped(event);
 +              event->pmu->event_unmapped(event, vma->vm_mm);
  
        /*
         * rb->aux_mmap_count will always drop before rb->mmap_count and
@@@ -5447,7 -5399,7 +5447,7 @@@ aux_unlock
        vma->vm_ops = &perf_mmap_vmops;
  
        if (event->pmu->event_mapped)
 -              event->pmu->event_mapped(event);
 +              event->pmu->event_mapped(event, vma->vm_mm);
  
        return ret;
  }
@@@ -6008,9 -5960,6 +6008,9 @@@ void perf_output_sample(struct perf_out
                }
        }
  
 +      if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 +              perf_output_put(handle, data->phys_addr);
 +
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;
  
        }
  }
  
 +static u64 perf_virt_to_phys(u64 virt)
 +{
 +      u64 phys_addr = 0;
 +      struct page *p = NULL;
 +
 +      if (!virt)
 +              return 0;
 +
 +      if (virt >= TASK_SIZE) {
 +              /* If it's vmalloc()d memory, leave phys_addr as 0 */
 +              if (virt_addr_valid((void *)(uintptr_t)virt) &&
 +                  !(virt >= VMALLOC_START && virt < VMALLOC_END))
 +                      phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
 +      } else {
 +              /*
 +               * Walk the page tables for a user address.
 +               * Interrupts are disabled, which prevents any teardown
 +               * of the page tables.
 +               * Try the IRQ-safe __get_user_pages_fast() first;
 +               * if that fails, leave phys_addr as 0.
 +               */
 +              if ((current->mm != NULL) &&
 +                  (__get_user_pages_fast(virt, 1, 0, &p) == 1))
 +                      phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
 +
 +              if (p)
 +                      put_page(p);
 +      }
 +
 +      return phys_addr;
 +}
 +
  void perf_prepare_sample(struct perf_event_header *header,
                         struct perf_sample_data *data,
                         struct perf_event *event,
  
                header->size += size;
        }
 +
 +      if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 +              data->phys_addr = perf_virt_to_phys(data->addr);
  }
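Together these hunks add the PERF_SAMPLE_PHYS_ADDR sample field: it is sized in __perf_event_header_size(), resolved here in perf_prepare_sample() via perf_virt_to_phys(), and written out in perf_output_sample(). A minimal sketch of a user-space consumer requesting it (error handling trimmed; the event choice is arbitrary, and a later hunk in perf_event_open() restricts the field to privileged users under perf_event_paranoid):

    /*
     * Sketch: open a sampling event whose records carry physical
     * addresses.  PERF_SAMPLE_PHYS_ADDR needs uapi headers from this
     * release or later.
     */
    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_PAGE_FAULTS;
            attr.sample_period = 1000;
            attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                               PERF_SAMPLE_PHYS_ADDR;

            /* pid = 0 (self), cpu = -1 (any), no group leader, no flags */
            int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            return fd < 0 ? 1 : 0;
    }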
  
  static void __always_inline
@@@ -7330,11 -7244,6 +7330,11 @@@ static void perf_log_throttle(struct pe
        perf_output_end(&handle);
  }
  
 +void perf_event_itrace_started(struct perf_event *event)
 +{
 +      event->attach_state |= PERF_ATTACH_ITRACE;
 +}
 +
  static void perf_log_itrace_start(struct perf_event *event)
  {
        struct perf_output_handle handle;
                event = event->parent;
  
        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
 -          event->hw.itrace_started)
 +          event->attach_state & PERF_ATTACH_ITRACE)
                return;
  
        rec.header.type = PERF_RECORD_ITRACE_START;
@@@ -7412,6 -7321,21 +7412,6 @@@ int perf_event_account_interrupt(struc
        return __perf_event_account_interrupt(event, 1);
  }
  
 -static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
 -{
 -      /*
 -       * Due to interrupt latency (AKA "skid"), we may enter the
 -       * kernel before taking an overflow, even if the PMU is only
 -       * counting user events.
 -       * To avoid leaking information to userspace, we must always
 -       * reject kernel samples when exclude_kernel is set.
 -       */
 -      if (event->attr.exclude_kernel && !user_mode(regs))
 -              return false;
 -
 -      return true;
 -}
 -
  /*
   * Generic event overflow handling, sampling.
   */
@@@ -7432,6 -7356,12 +7432,6 @@@ static int __perf_event_overflow(struc
  
        ret = __perf_event_account_interrupt(event, throttle);
  
 -      /*
 -       * For security, drop the skid kernel samples if necessary.
 -       */
 -      if (!sample_is_allowed(event, regs))
 -              return ret;
 -
        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
@@@ -7954,15 -7884,16 +7954,15 @@@ void perf_trace_run_bpf_submit(void *ra
                }
        }
        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
 -                    rctx, task);
 +                    rctx, task, NULL);
  }
  EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
  
  void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
 -                 struct task_struct *task)
 +                 struct task_struct *task, struct perf_event *event)
  {
        struct perf_sample_data data;
 -      struct perf_event *event;
  
        struct perf_raw_record raw = {
                .frag = {
  
        perf_trace_buf_update(record, event_type);
  
 -      hlist_for_each_entry_rcu(event, head, hlist_entry) {
 +      /* Use the given event instead of the hlist */
 +      if (event) {
                if (perf_tp_event_match(event, &data, regs))
                        perf_swevent_event(event, count, &data, regs);
 +      } else {
 +              hlist_for_each_entry_rcu(event, head, hlist_entry) {
 +                      if (perf_tp_event_match(event, &data, regs))
 +                              perf_swevent_event(event, count, &data, regs);
 +              }
        }
  
        /*
@@@ -8134,7 -8059,7 +8134,7 @@@ static void perf_event_free_bpf_handler
  
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  {
 -      bool is_kprobe, is_tracepoint;
 +      bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
  
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
  
        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 -      if (!is_kprobe && !is_tracepoint)
 +      is_syscall_tp = is_syscall_trace_event(event->tp_event);
 +      if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
                /* bpf programs can only be attached to u/kprobe or tracepoint */
                return -EINVAL;
  
                return PTR_ERR(prog);
  
        if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
 -          (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 +          (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
 +          (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
                /* valid fd, but invalid bpf program type */
                bpf_prog_put(prog);
                return -EINVAL;
        }
  
 -      if (is_tracepoint) {
 +      if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);
  
                if (prog->aux->max_ctx_offset > off) {
@@@ -9666,8 -9589,6 +9666,8 @@@ static int perf_copy_attr(struct perf_e
        if (ret)
                return -EFAULT;
  
 +      attr->size = size;
 +
        if (attr->__reserved_1)
                return -EINVAL;
  
@@@ -9940,11 -9861,6 +9940,11 @@@ SYSCALL_DEFINE5(perf_event_open
                        return -EINVAL;
        }
  
 +      /* Only privileged users can get physical addresses */
 +      if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
 +          perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
 +              return -EACCES;
 +
        if (!attr.sample_max_stack)
                attr.sample_max_stack = sysctl_perf_event_max_stack;
  
                        goto err_context;
  
                /*
 -               * Do not allow to attach to a group in a different
 -               * task or CPU context:
 +               * Make sure we're both events for the same CPU;
 +               * grouping events for different CPUs is broken; since
 +               * you can never concurrently schedule them anyhow.
                 */
 -              if (move_group) {
 -                      /*
 -                       * Make sure we're both on the same task, or both
 -                       * per-cpu events.
 -                       */
 -                      if (group_leader->ctx->task != ctx->task)
 -                              goto err_context;
 +              if (group_leader->cpu != event->cpu)
 +                      goto err_context;
  
 -                      /*
 -                       * Make sure we're both events for the same CPU;
 -                       * grouping events for different CPUs is broken; since
 -                       * you can never concurrently schedule them anyhow.
 -                       */
 -                      if (group_leader->cpu != event->cpu)
 -                              goto err_context;
 -              } else {
 -                      if (group_leader->ctx != ctx)
 -                              goto err_context;
 -              }
 +              /*
 +               * Make sure we're both on the same task, or both
 +               * per-CPU events.
 +               */
 +              if (group_leader->ctx->task != ctx->task)
 +                      goto err_context;
 +
 +              /*
 +               * Do not allow attaching to a group in a different task
 +               * or CPU context. If we're moving SW events, we'll fix
 +               * this up later, so allow that.
 +               */
 +              if (!move_group && group_leader->ctx != ctx)
 +                      goto err_context;
  
                /*
                 * Only a group leader can be exclusive or pinned
@@@ -11293,5 -11210,6 +11293,6 @@@ struct cgroup_subsys perf_event_cgrp_su
         * controller is not mounted on a legacy hierarchy.
         */
        .implicit_on_dfl = true,
+       .threaded       = true,
  };
  #endif /* CONFIG_CGROUP_PERF */
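Setting .threaded on perf_event_cgrp_subsys lets the controller remain enabled in cgroups that have been switched to the new thread mode, so per-thread events can still be scoped by cgroup there. A rough sketch of flipping a cgroup2 directory into a threaded domain from user space (the path is an example; the "threaded" keyword for cgroup.type comes from the thread-mode patches in this pull):

    /*
     * Sketch: mark an existing cgroup2 directory as threaded.
     * Assumes cgroup2 is mounted and the directory already exists.
     */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/fs/cgroup/mygrp/cgroup.type", "w");

            if (!f) {
                    perror("open cgroup.type");
                    return 1;
            }
            if (fputs("threaded", f) == EOF)
                    perror("write cgroup.type");
            return fclose(f) ? 1 : 0;
    }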
diff --combined mm/memcontrol.c
index ad15850ee157e3b309045a53ad9ba733745869e1,2b2f071f914b5143c45b0192bfefe7daaaa86539..6532b219b22239a268783d399a7ffe0385ee4ccf
@@@ -550,12 -550,10 +550,12 @@@ mem_cgroup_largest_soft_limit_node(stru
   * value, and reading all cpu value can be performance bottleneck in some
   * common workload, threshold and synchronization as vmstat[] should be
   * implemented.
 + *
 + * The parameter idx can be of type enum memcg_event_item or vm_event_item.
   */
  
  static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
 -                                    enum memcg_event_item event)
 +                                    int event)
  {
        unsigned long val = 0;
        int cpu;
@@@ -919,7 -917,7 +919,7 @@@ int mem_cgroup_scan_tasks(struct mem_cg
                struct css_task_iter it;
                struct task_struct *task;
  
-               css_task_iter_start(&iter->css, &it);
+               css_task_iter_start(&iter->css, 0, &it);
                while (!ret && (task = css_task_iter_next(&it)))
                        ret = fn(task, arg);
                css_task_iter_end(&it);
@@@ -1613,13 -1611,9 +1613,13 @@@ cleanup
   * @page: the page
   *
   * This function protects unlocked LRU pages from being moved to
 - * another cgroup and stabilizes their page->mem_cgroup binding.
 + * another cgroup.
 + *
 + * It ensures the lifetime of the returned memcg. The caller is responsible
 + * for the lifetime of the page; __unlock_page_memcg() is available
 + * when @page might get freed inside the locked section.
   */
 -void lock_page_memcg(struct page *page)
 +struct mem_cgroup *lock_page_memcg(struct page *page)
  {
        struct mem_cgroup *memcg;
        unsigned long flags;
         * The RCU lock is held throughout the transaction.  The fast
         * path can get away without acquiring the memcg->move_lock
         * because page moving starts with an RCU grace period.
 -       */
 +       *
 +       * The RCU lock also protects the memcg from being freed when
 +       * the page state that is going to change is the only thing
 +       * preventing the page itself from being freed. E.g. writeback
 +       * doesn't hold a page reference and relies on PG_writeback to
 +       * keep off truncation, migration and so forth.
 +       */
        rcu_read_lock();
  
        if (mem_cgroup_disabled())
 -              return;
 +              return NULL;
  again:
        memcg = page->mem_cgroup;
        if (unlikely(!memcg))
 -              return;
 +              return NULL;
  
        if (atomic_read(&memcg->moving_account) <= 0)
 -              return;
 +              return memcg;
  
        spin_lock_irqsave(&memcg->move_lock, flags);
        if (memcg != page->mem_cgroup) {
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
  
 -      return;
 +      return memcg;
  }
  EXPORT_SYMBOL(lock_page_memcg);
  
  /**
 - * unlock_page_memcg - unlock a page->mem_cgroup binding
 - * @page: the page
 + * __unlock_page_memcg - unlock and unpin a memcg
 + * @memcg: the memcg
 + *
 + * Unlock and unpin a memcg returned by lock_page_memcg().
   */
 -void unlock_page_memcg(struct page *page)
 +void __unlock_page_memcg(struct mem_cgroup *memcg)
  {
 -      struct mem_cgroup *memcg = page->mem_cgroup;
 -
        if (memcg && memcg->move_lock_task == current) {
                unsigned long flags = memcg->move_lock_flags;
  
  
        rcu_read_unlock();
  }
 +
 +/**
 + * unlock_page_memcg - unlock a page->mem_cgroup binding
 + * @page: the page
 + */
 +void unlock_page_memcg(struct page *page)
 +{
 +      __unlock_page_memcg(page->mem_cgroup);
 +}
  EXPORT_SYMBOL(unlock_page_memcg);
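The repurposed API above makes the memcg, not the page, the token for unlocking: lock_page_memcg() pins and returns it, and __unlock_page_memcg() takes it back, so a caller may legitimately drop its last reference to the page inside the critical section. A kernel-context fragment (not stand-alone) of the intended calling pattern:

    /*
     * Kernel-context fragment: the returned memcg is what gets handed
     * back to __unlock_page_memcg(), which stays safe even if @page
     * can be freed inside the locked section.
     */
    struct mem_cgroup *memcg;

    memcg = lock_page_memcg(page);
    /* ... update page state that memcg accounting depends on ... */
    __unlock_page_memcg(memcg);

Callers that do hold the page across the whole section can keep using the unlock_page_memcg(page) wrapper unchanged.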
  
  /*
@@@ -1917,7 -1896,7 +1917,7 @@@ retry
         * bypass the last charges so that they can exit quickly and
         * free their memory.
         */
 -      if (unlikely(test_thread_flag(TIF_MEMDIE) ||
 +      if (unlikely(tsk_is_oom_victim(current) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
                goto force;
@@@ -4321,8 -4300,6 +4321,8 @@@ static void mem_cgroup_css_offline(stru
        }
        spin_unlock(&memcg->event_list_lock);
  
 +      memcg->low = 0;
 +
        memcg_offline_kmem(memcg);
        wb_memcg_offline(memcg);
  
@@@ -4639,11 -4616,8 +4639,11 @@@ static enum mc_target_type get_mctgt_ty
                if (!ret || !target)
                        put_page(page);
        }
 -      /* There is a swap entry and a page doesn't exist or isn't charged */
 -      if (ent.val && !ret &&
 +      /*
 +       * There is a swap entry and a page doesn't exist or isn't charged.
 +       * But we cannot move a tail-page in a THP.
 +       */
 +      if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
                ret = MC_TARGET_SWAP;
                if (target)
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
 - * We don't consider swapping or file mapped pages because THP does not
 - * support them for now.
 + * We don't consider PMD mapped swapping or file mapped pages because THP does
 + * not support them for now.
   * Caller should make sure that pmd_trans_huge(pmd) is true.
   */
  static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@@ -5430,7 -5404,7 +5430,7 @@@ int mem_cgroup_try_charge(struct page *
                 * in turn serializes uncharging.
                 */
                VM_BUG_ON_PAGE(!PageLocked(page), page);
 -              if (page->mem_cgroup)
 +              if (compound_head(page)->mem_cgroup)
                        goto out;
  
                if (do_swap_account) {
@@@ -5913,7 -5887,6 +5913,7 @@@ static struct mem_cgroup *mem_cgroup_id
  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  {
        struct mem_cgroup *memcg, *swap_memcg;
 +      unsigned int nr_entries;
        unsigned short oldid;
  
        VM_BUG_ON_PAGE(PageLRU(page), page);
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
 -      oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
 +      nr_entries = hpage_nr_pages(page);
 +      /* Get references for the tail pages, too */
 +      if (nr_entries > 1)
 +              mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
 +      oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
 +                                 nr_entries);
        VM_BUG_ON_PAGE(oldid, page);
 -      mem_cgroup_swap_statistics(swap_memcg, 1);
 +      mem_cgroup_swap_statistics(swap_memcg, nr_entries);
  
        page->mem_cgroup = NULL;
  
        if (!mem_cgroup_is_root(memcg))
 -              page_counter_uncharge(&memcg->memory, 1);
 +              page_counter_uncharge(&memcg->memory, nr_entries);
  
        if (memcg != swap_memcg) {
                if (!mem_cgroup_is_root(swap_memcg))
 -                      page_counter_charge(&swap_memcg->memsw, 1);
 -              page_counter_uncharge(&memcg->memsw, 1);
 +                      page_counter_charge(&swap_memcg->memsw, nr_entries);
 +              page_counter_uncharge(&memcg->memsw, nr_entries);
        }
  
        /*
         * only synchronisation we have for updating the per-CPU variables.
         */
        VM_BUG_ON(!irqs_disabled());
 -      mem_cgroup_charge_statistics(memcg, page, false, -1);
 +      mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
 +                                   -nr_entries);
        memcg_check_events(memcg, page);
  
        if (!mem_cgroup_is_root(memcg))