Merge branch 'for-5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <[email protected]>
Wed, 23 Mar 2022 19:43:35 +0000 (12:43 -0700)
committer Linus Torvalds <[email protected]>
Wed, 23 Mar 2022 19:43:35 +0000 (12:43 -0700)
diff --combined kernel/cgroup/cgroup.c

index a557eea7166fbd5e56be6300cacfb49d2a3a18cf,37c49e1a672f734a0e9d708db37cf9f5debec594..795c2818e2a33258951ce6298cb240af8766cef4
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -3643,12 -3643,6 +3643,12 @@@ static ssize_t cgroup_pressure_write(st
         cgroup_get(cgrp);
         cgroup_kn_unlock(of->kn);
   
+ +      /* Allow only one trigger per file descriptor */
+ +      if (ctx->psi.trigger) {
+ +              cgroup_put(cgrp);
+ +              return -EBUSY;
+ +      }
+ +
         psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
         new = psi_trigger_create(psi, buf, nbytes, res);
         if (IS_ERR(new)) {
@@@ -3656,7 -3650,8 +3656,7 @@@
                 return PTR_ERR(new);
         }
   
- -      psi_trigger_replace(&ctx->psi.trigger, new);
- -
+ +      smp_store_release(&ctx->psi.trigger, new);
         cgroup_put(cgrp);
   
         return nbytes;
@@@ -3695,7 -3690,7 +3695,7 @@@ static void cgroup_pressure_release(str
   {
         struct cgroup_file_ctx *ctx = of->priv;
   
- -      psi_trigger_replace(&ctx->psi.trigger, NULL);
+ +      psi_trigger_destroy(ctx->psi.trigger);
   }
   
   bool cgroup_psi_enabled(void)
@@@ -6166,20 -6161,6 +6166,20 @@@ static int cgroup_css_set_fork(struct k
         if (ret)
                 goto err;
   
+ +      /*
+ +       * Spawning a task directly into a cgroup works by passing a file
+ +       * descriptor to the target cgroup directory. This can even be an O_PATH
+ +       * file descriptor. But it can never be a cgroup.procs file descriptor.
+ +       * This was done on purpose so spawning into a cgroup could be
+ +       * conceptualized as an atomic
+ +       *
+ +       *   fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ +       *   write(fd, <child-pid>, ...);
+ +       *
+ +       * sequence, i.e. it's a shorthand for the caller opening and writing
+ +       * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ +       * to always use the caller's credentials.
+ +       */
         ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
                                         !(kargs->flags & CLONE_THREAD),
                                         current->nsproxy->cgroup_ns);
@@@ -6243,6 -6224,7 +6243,7 @@@ static void cgroup_css_set_put_fork(str
   /**
    * cgroup_can_fork - called on a new task before the process is exposed
    * @child: the child process
+  * @kargs: the arguments passed to create the child process
    *
    * This prepares a new css_set for the child process which the child will
    * be attached to in cgroup_post_fork().
@@@ -6305,6 -6287,7 +6306,7 @@@ void cgroup_cancel_fork(struct task_str
   /**
    * cgroup_post_fork - finalize cgroup setup for the child process
    * @child: the child process
+  * @kargs: the arguments passed to create the child process
    *
    * Attach the child process to its css_set calling the subsystem fork()
    * callbacks.
diff --combined kernel/cgroup/cpuset.c

index ef88cc366bb8421e1a6cac7c16f1296d839fd6fc,8324a8fd71bbd6a3b3a8332894e0172816d4b61f..9390bfd9f1cd382e6e08a9d12df051fd3722c636
--- 1/kernel/cgroup/cpuset.c
--- 2/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@@ -71,7 -71,7 +71,7 @@@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled
   
   /*
    * There could be abnormal cpuset configurations for cpu or memory
-  * node binding, add this key to provide a quick low-cost judgement
+  * node binding, add this key to provide a quick low-cost judgment
    * of the situation.
    */
   DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
@@@ -590,35 -590,6 +590,35 @@@ static inline void free_cpuset(struct c
         kfree(cs);
   }
   
+ +/*
+ + * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ + *                            behavior.
+ + */
+ +static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+ +{
+ +      struct cgroup_subsys_state *css;
+ +      struct cpuset *c, *par;
+ +      int ret;
+ +
+ +      WARN_ON_ONCE(!rcu_read_lock_held());
+ +
+ +      /* Each of our child cpusets must be a subset of us */
+ +      ret = -EBUSY;
+ +      cpuset_for_each_child(c, css, cur)
+ +              if (!is_cpuset_subset(c, trial))
+ +                      goto out;
+ +
+ +      /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ +      ret = -EACCES;
+ +      par = parent_cs(cur);
+ +      if (par && !is_cpuset_subset(trial, par))
+ +              goto out;
+ +
+ +      ret = 0;
+ +out:
+ +      return ret;
+ +}
+ +
   /*
    * validate_change() - Used to validate that any proposed cpuset change
    *                   follows the structural rules for cpusets.
@@@ -643,21 -614,20 +643,21 @@@ static int validate_change(struct cpuse
   {
         struct cgroup_subsys_state *css;
         struct cpuset *c, *par;
- -      int ret;
- -
- -      /* The checks don't apply to root cpuset */
- -      if (cur == &top_cpuset)
- -              return 0;
+ +      int ret = 0;
   
         rcu_read_lock();
- -      par = parent_cs(cur);
   
- -      /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- -      ret = -EACCES;
- -      if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
+ +      if (!is_in_v2_mode())
+ +              ret = validate_change_legacy(cur, trial);
+ +      if (ret)
+ +              goto out;
+ +
+ +      /* Remaining checks don't apply to root cpuset */
+ +      if (cur == &top_cpuset)
                 goto out;
   
+ +      par = parent_cs(cur);
+ +
         /*
          * If either I or some sibling (!= me) is exclusive, we can't
          * overlap
@@@ -833,7 -803,7 +833,7 @@@ static int generate_sched_domains(cpuma
                         update_domain_attr_tree(dattr, &top_cpuset);
                 }
                 cpumask_and(doms[0], top_cpuset.effective_cpus,
- -                          housekeeping_cpumask(HK_FLAG_DOMAIN));
+ +                          housekeeping_cpumask(HK_TYPE_DOMAIN));
   
                 goto done;
         }
@@@ -863,7 -833,7 +863,7 @@@
                 if (!cpumask_empty(cp->cpus_allowed) &&
                     !(is_sched_load_balance(cp) &&
                       cpumask_intersects(cp->cpus_allowed,
- -                                       housekeeping_cpumask(HK_FLAG_DOMAIN))))
+ +                                       housekeeping_cpumask(HK_TYPE_DOMAIN))))
                         continue;
   
                 if (root_load_balance &&
@@@ -952,7 -922,7 +952,7 @@@ restart
   
                         if (apn == b->pn) {
                                 cpumask_or(dp, dp, b->effective_cpus);
- -                              cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ +                              cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
                                 if (dattr)
                                         update_domain_attr_tree(dattr + nslot, b);
   
@@@ -1181,7 -1151,7 +1181,7 @@@ enum subparts_cmd 
    * effective_cpus. The function will return 0 if all the CPUs listed in
    * cpus_allowed can be granted or an error code will be returned.
    *
-  * For partcmd_disable, the cpuset is being transofrmed from a partition
+  * For partcmd_disable, the cpuset is being transformed from a partition
    * root back to a non-partition root. Any CPUs in cpus_allowed that are in
    * parent's subparts_cpus will be taken away from that cpumask and put back
    * into parent's effective_cpus. 0 should always be returned.
@@@ -1205,7 -1175,9 +1205,7 @@@
    *
    * Because of the implicit cpu exclusive nature of a partition root,
    * cpumask changes that violates the cpu exclusivity rule will not be
- - * permitted when checked by validate_change(). The validate_change()
- - * function will also prevent any changes to the cpu list if it is not
- - * a superset of children's cpu lists.
+ + * permitted when checked by validate_change().
    */
   static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                                           struct cpumask *newmask,
@@@ -1550,15 -1522,10 +1550,15 @@@ static void update_sibling_cpumasks(str
         struct cpuset *sibling;
         struct cgroup_subsys_state *pos_css;
   
+ +      percpu_rwsem_assert_held(&cpuset_rwsem);
+ +
         /*
          * Check all its siblings and call update_cpumasks_hier()
          * if their use_parent_ecpus flag is set in order for them
          * to use the right effective_cpus value.
+ +       *
+ +       * The update_cpumasks_hier() function may sleep. So we have to
+ +       * release the RCU read lock before calling it.
          */
         rcu_read_lock();
         cpuset_for_each_child(sibling, pos_css, parent) {
@@@ -1566,13 -1533,8 +1566,13 @@@
                         continue;
                 if (!sibling->use_parent_ecpus)
                         continue;
+ +              if (!css_tryget_online(&sibling->css))
+ +                      continue;
   
+ +              rcu_read_unlock();
                 update_cpumasks_hier(sibling, tmp);
+ +              rcu_read_lock();
+ +              css_put(&sibling->css);
         }
         rcu_read_unlock();
   }
@@@ -1645,7 -1607,8 +1645,7 @@@ static int update_cpumask(struct cpuse
          * Make sure that subparts_cpus is a subset of cpus_allowed.
          */
         if (cs->nr_subparts_cpus) {
- -              cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
- -                             cs->cpus_allowed);
+ +              cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
                 cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
         }
         spin_unlock_irq(&callback_lock);
@@@ -2027,7 -1990,7 +2027,7 @@@ out
   }
   
   /*
-  * update_prstate - update partititon_root_state
+  * update_prstate - update partition_root_state
    * cs: the cpuset to update
    * new_prs: new partition root state
    *
@@@ -2289,7 -2252,6 +2289,7 @@@ static void cpuset_attach(struct cgroup
         cgroup_taskset_first(tset, &css);
         cs = css_cs(css);
   
+ +      cpus_read_lock();
         percpu_down_write(&cpuset_rwsem);
   
         guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@@ -2343,7 -2305,6 +2343,7 @@@
                 wake_up(&cpuset_attach_wq);
   
         percpu_up_write(&cpuset_rwsem);
+ +      cpus_read_unlock();
   }
   
   /* The various types of files and directories in a cpuset file system */
@@@ -2879,7 -2840,7 +2879,7 @@@ static int cpuset_css_online(struct cgr
         /*
          * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
          * set.  This flag handling is implemented in cgroup core for
-        * histrical reasons - the flag may be specified during mount.
+        * historical reasons - the flag may be specified during mount.
          *
          * Currently, if any sibling cpusets have exclusive cpus or mem, we
          * refuse to clone the configuration - thereby refusing the task to
@@@ -3076,7 -3037,7 +3076,7 @@@ hotplug_update_tasks_legacy(struct cpus
   
         /*
          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
-        * as the tasks will be migratecd to an ancestor.
+        * as the tasks will be migrated to an ancestor.
          */
         if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
                 update_tasks_cpumask(cs);
@@@ -3524,8 -3485,8 +3524,8 @@@ static struct cpuset *nearest_hardwall_
         return cs;
   }
   
- -/**
- - * cpuset_node_allowed - Can we allocate on a memory node?
+ +/*
+ + * __cpuset_node_allowed - Can we allocate on a memory node?
    * @node: is this an allowed node?
    * @gfp_mask: memory allocation flags
    *
@@@ -3696,8 -3657,8 +3696,8 @@@ void cpuset_print_current_mems_allowed(
   
   int cpuset_memory_pressure_enabled __read_mostly;
   
- -/**
- - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ +/*
+ + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
    *
    * Keep a running average of the rate of synchronous (direct)
    * page reclaim efforts initiated by tasks in each cpuset.
@@@ -3712,7 -3673,7 +3712,7 @@@
    * "memory_pressure".  Value displayed is an integer
    * representing the recent rate of entry into the synchronous
    * (direct) page reclaim by any task attached to the cpuset.
- - **/
+ + */
   
   void __cpuset_memory_pressure_bump(void)
   {
author	Linus Torvalds <[email protected]>
	Wed, 23 Mar 2022 19:43:35 +0000 (12:43 -0700)
committer	Linus Torvalds <[email protected]>
	Wed, 23 Mar 2022 19:43:35 +0000 (12:43 -0700)
		1	2
kernel/cgroup/cgroup.c	patch | diff1 | diff2 | blob | history
kernel/cgroup/cpuset.c	patch | diff1 | diff2 | blob | history