Merge tag 'cgroup-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <[email protected]>

Tue, 27 Jun 2023 23:54:21 +0000 (16:54 -0700)

committer Linus Torvalds <[email protected]>

Tue, 27 Jun 2023 23:54:21 +0000 (16:54 -0700)
author Linus Torvalds <[email protected]>
Tue, 27 Jun 2023 23:54:21 +0000 (16:54 -0700)
committer Linus Torvalds <[email protected]>
Tue, 27 Jun 2023 23:54:21 +0000 (16:54 -0700)
diff --combined Documentation/admin-guide/cgroup-v2.rst

index c63358c38a1dda832990bc7c4ab2c27f189d0ddd,d9f3768a10dbf32cc21677a1c6bf31b6b943fa83..9badcb21db6fbe33f3b1c1ddae9ffee5ee00a12d
--- 1/Documentation/admin-guide/cgroup-v2.rst
--- 2/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@@ -1213,25 -1213,23 +1213,25 @@@ PAGE_SIZE multiple when read back
         A read-write single value file which exists on non-root
         cgroups.  The default is "max".
   
- -      Memory usage throttle limit.  This is the main mechanism to
- -      control memory usage of a cgroup.  If a cgroup's usage goes
+ +      Memory usage throttle limit.  If a cgroup's usage goes
         over the high boundary, the processes of the cgroup are
         throttled and put under heavy reclaim pressure.
   
         Going over the high limit never invokes the OOM killer and
- -      under extreme conditions the limit may be breached.
+ +      under extreme conditions the limit may be breached. The high
+ +      limit should be used in scenarios where an external process
+ +      monitors the limited cgroup to alleviate heavy reclaim
+ +      pressure.
   
     memory.max
         A read-write single value file which exists on non-root
         cgroups.  The default is "max".
   
- -      Memory usage hard limit.  This is the final protection
- -      mechanism.  If a cgroup's memory usage reaches this limit and
- -      can't be reduced, the OOM killer is invoked in the cgroup.
- -      Under certain circumstances, the usage may go over the limit
- -      temporarily.
+ +      Memory usage hard limit.  This is the main mechanism to limit
+ +      memory usage of a cgroup.  If a cgroup's memory usage reaches
+ +      this limit and can't be reduced, the OOM killer is invoked in
+ +      the cgroup. Under certain circumstances, the usage may go
+ +      over the limit temporarily.
   
         In default configuration regular 0-order allocations always
         succeed unless OOM killer chooses current task as a victim.
@@@ -1240,6 -1238,10 +1240,6 @@@
         Caller could retry them differently, return into userspace
         as -ENOMEM or silently ignore in cases like disk readahead.
   
- -      This is the ultimate protection mechanism.  As long as the
- -      high limit is used and monitored properly, this limit's
- -      utility is limited to providing the final safety net.
- -
     memory.reclaim
         A write-only nested-keyed file which exists for all cgroups.
   
@@@ -2022,33 -2024,31 +2022,33 @@@ that attribute
     no-change
         Do not modify the I/O priority class.
   
- -  none-to-rt
- -      For requests that do not have an I/O priority class (NONE),
- -      change the I/O priority class into RT. Do not modify
- -      the I/O priority class of other requests.
+ +  promote-to-rt
+ +      For requests that have a non-RT I/O priority class, change it into RT.
+ +      Also change the priority level of these requests to 4. Do not modify
+ +      the I/O priority of requests that have priority class RT.
   
     restrict-to-be
         For requests that do not have an I/O priority class or that have I/O
- -      priority class RT, change it into BE. Do not modify the I/O priority
- -      class of requests that have priority class IDLE.
+ +      priority class RT, change it into BE. Also change the priority level
+ +      of these requests to 0. Do not modify the I/O priority class of
+ +      requests that have priority class IDLE.
   
     idle
         Change the I/O priority class of all requests into IDLE, the lowest
         I/O priority class.
   
+ +  none-to-rt
+ +      Deprecated. Just an alias for promote-to-rt.
+ +
   The following numerical values are associated with the I/O priority policies:
   
- -+-------------+---+
- -| no-change   | 0 |
- -+-------------+---+
- -| none-to-rt  | 1 |
- -+-------------+---+
- -| rt-to-be    | 2 |
- -+-------------+---+
- -| all-to-idle | 3 |
- -+-------------+---+
+ ++----------------+---+
+ +| no-change      | 0 |
+ ++----------------+---+
+ +| rt-to-be       | 2 |
+ ++----------------+---+
+ +| all-to-idle    | 3 |
+ ++----------------+---+
   
   The numerical value that corresponds to each I/O priority class is as follows:
   
@@@ -2064,13 -2064,9 +2064,13 @@@
   
   The algorithm to set the I/O priority class for a request is as follows:
   
- -- Translate the I/O priority class policy into a number.
- -- Change the request I/O priority class into the maximum of the I/O priority
- -  class policy number and the numerical I/O priority class.
+ +- If I/O priority class policy is promote-to-rt, change the request I/O
+ +  priority class to IOPRIO_CLASS_RT and change the request I/O priority
+ +  level to 4.
+ +- If I/O priority class policy is not promote-to-rt, translate the I/O priority
+ +  class policy into a number, then change the request I/O priority class
+ +  into the maximum of the I/O priority class policy number and the numerical
+ +  I/O priority class.
   
   PID
   ---
@@@ -2443,7 -2439,7 +2443,7 @@@ Miscellaneous controller provides 3 int
           res_b 10
   
     misc.current
-         A read-only flat-keyed file shown in the non-root cgroups.  It shows
+         A read-only flat-keyed file shown in all cgroups.  It shows
           the current usage of the resources in the cgroup and its children.::
   
           $ cat misc.current
diff --combined include/linux/sched.h

index 1292d38d66cc5c320d29f1f65eee11d12b4b7c5b,2553918f0b619196c2d4a3aae0f0da7c7f973c48..b0011c50da4ff2c1e9fc93e60204ff5f6251034e
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -1852,7 -1852,9 +1852,9 @@@ current_restore_flags(unsigned long ori
   }
   
   extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
- extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus);
+ extern int task_can_attach(struct task_struct *p);
+ extern int dl_bw_alloc(int cpu, u64 dl_bw);
+ extern void dl_bw_free(int cpu, u64 dl_bw);
   #ifdef CONFIG_SMP
   extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
   extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
@@@ -2006,12 -2008,15 +2008,12 @@@ static __always_inline void scheduler_i
          */
         preempt_fold_need_resched();
   }
- -extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
   #else
   static inline void scheduler_ipi(void) { }
- -static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
- -{
- -      return 1;
- -}
   #endif
   
+ +extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
+ +
   /*
    * Set thread flags in other task's structures.
    * See asm/thread_info.h for TIF_xxxx flags available:
diff --combined kernel/cgroup/cgroup-v1.c

index 5407241dbb45f97b721c81824cd5b4ed410f59cf,d55216c4cc2def70c570a426a28593d4aed4cad5..83044312bc413fc5367d816b76079f971eb33db3
--- 1/kernel/cgroup/cgroup-v1.c
--- 2/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@@ -108,7 -108,7 +108,7 @@@ int cgroup_transfer_tasks(struct cgrou
   
         cgroup_lock();
   
- -      percpu_down_write(&cgroup_threadgroup_rwsem);
+ +      cgroup_attach_lock(true);
   
         /* all tasks in @from are being moved, all csets are source */
         spin_lock_irq(&css_set_lock);
@@@ -144,7 -144,7 +144,7 @@@
         } while (task && !ret);
   out_err:
         cgroup_migrate_finish(&mgctx);
- -      percpu_up_write(&cgroup_threadgroup_rwsem);
+ +      cgroup_attach_unlock(true);
         cgroup_unlock();
         return ret;
   }
@@@ -563,7 -563,7 +563,7 @@@ static ssize_t cgroup_release_agent_wri
         if (!cgrp)
                 return -ENODEV;
         spin_lock(&release_agent_path_lock);
-       strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+       strscpy(cgrp->root->release_agent_path, strstrip(buf),
                 sizeof(cgrp->root->release_agent_path));
         spin_unlock(&release_agent_path_lock);
         cgroup_kn_unlock(of->kn);
@@@ -797,7 -797,7 +797,7 @@@ void cgroup1_release_agent(struct work_
                 goto out_free;
   
         spin_lock(&release_agent_path_lock);
-       strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+       strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
         spin_unlock(&release_agent_path_lock);
         if (!agentbuf[0])
                 goto out_free;
diff --combined kernel/cgroup/cgroup.c

index 8f917f682f52cb57fac64dbe436f12f489d46b88,065bebb4af9b9414a59566b20ceb8b91ec7e4dd2..bfe3cd8ccf3668416a544594eb8eea55258cbf92
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -57,6 -57,7 +57,7 @@@
   #include <linux/file.h>
   #include <linux/fs_parser.h>
   #include <linux/sched/cputime.h>
+ #include <linux/sched/deadline.h>
   #include <linux/psi.h>
   #include <net/sock.h>
   
@@@ -312,8 -313,6 +313,6 @@@ bool cgroup_ssid_enabled(int ssid
    *   masks of ancestors.
    *
    * - blkcg: blk-throttle becomes properly hierarchical.
-  *
-  * - debug: disallowed on the default hierarchy.
    */
   bool cgroup_on_dfl(const struct cgroup *cgrp)
   {
@@@ -356,7 -355,7 +355,7 @@@ static bool cgroup_has_tasks(struct cgr
         return cgrp->nr_populated_csets;
   }
   
- bool cgroup_is_threaded(struct cgroup *cgrp)
+ static bool cgroup_is_threaded(struct cgroup *cgrp)
   {
         return cgrp->dom_cgrp != cgrp;
   }
@@@ -395,7 -394,7 +394,7 @@@ static bool cgroup_can_be_thread_root(s
   }
   
   /* is @cgrp root of a threaded subtree? */
- bool cgroup_is_thread_root(struct cgroup *cgrp)
+ static bool cgroup_is_thread_root(struct cgroup *cgrp)
   {
         /* thread root should be a domain */
         if (cgroup_is_threaded(cgrp))
@@@ -618,7 -617,7 +617,7 @@@ EXPORT_SYMBOL_GPL(cgroup_get_e_css)
   static void cgroup_get_live(struct cgroup *cgrp)
   {
         WARN_ON_ONCE(cgroup_is_dead(cgrp));
-       css_get(&cgrp->self);
+       cgroup_get(cgrp);
   }
   
   /**
@@@ -689,21 -688,6 +688,6 @@@ EXPORT_SYMBOL_GPL(of_css)
                                 lockdep_is_held(&cgroup_mutex)))) { }   \
                 else
   
- /**
-  * for_each_e_css - iterate all effective css's of a cgroup
-  * @css: the iteration cursor
-  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
-  * @cgrp: the target cgroup to iterate css's of
-  *
-  * Should be called under cgroup_[tree_]mutex.
-  */
- #define for_each_e_css(css, ssid, cgrp)                                           \
-       for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)            \
-               if (!((css) = cgroup_e_css_by_mask(cgrp,                    \
-                                                  cgroup_subsys[(ssid)]))) \
-                       ;                                                   \
-               else
- 
   /**
    * do_each_subsys_mask - filter for_each_subsys with a bitmask
    * @ss: the iteration cursor
@@@ -1798,7 -1782,7 +1782,7 @@@ int rebind_subsystems(struct cgroup_roo
   {
         struct cgroup *dcgrp = &dst_root->cgrp;
         struct cgroup_subsys *ss;
- -      int ssid, i, ret;
+ +      int ssid, ret;
         u16 dfl_disable_ss_mask = 0;
   
         lockdep_assert_held(&cgroup_mutex);
@@@ -1842,8 -1826,7 +1826,8 @@@
                 struct cgroup_root *src_root = ss->root;
                 struct cgroup *scgrp = &src_root->cgrp;
                 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
- -              struct css_set *cset;
+ +              struct css_set *cset, *cset_pos;
+ +              struct css_task_iter *it;
   
                 WARN_ON(!css || cgroup_css(dcgrp, ss));
   
@@@ -1861,22 -1844,9 +1845,22 @@@
                 css->cgroup = dcgrp;
   
                 spin_lock_irq(&css_set_lock);
- -              hash_for_each(css_set_table, i, cset, hlist)
+ +              WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
+ +              list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
+ +                                       e_cset_node[ss->id]) {
                         list_move_tail(&cset->e_cset_node[ss->id],
                                        &dcgrp->e_csets[ss->id]);
+ +                      /*
+ +                       * All css_sets of scgrp are moved to dcgrp together, in the
+ +                       * same order; patch in-flight iterators to preserve correct
+ +                       * iteration. Since an iterator is always advanced right away
+ +                       * and finishes when it->cset_pos meets it->cset_head, updating
+ +                       * only it->cset_head is enough here.
+ +                       */
+ +                      list_for_each_entry(it, &cset->task_iters, iters_node)
+ +                              if (it->cset_head == &scgrp->e_csets[ss->id])
+ +                                      it->cset_head = &dcgrp->e_csets[ss->id];
+ +              }
                 spin_unlock_irq(&css_set_lock);
   
                 if (ss->css_rstat_flush) {
@@@ -2392,45 -2362,6 +2376,6 @@@ int cgroup_path_ns(struct cgroup *cgrp
   }
   EXPORT_SYMBOL_GPL(cgroup_path_ns);
   
- /**
-  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
-  * @task: target task
-  * @buf: the buffer to write the path into
-  * @buflen: the length of the buffer
-  *
-  * Determine @task's cgroup on the first (the one with the lowest non-zero
-  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
-  * function grabs cgroup_mutex and shouldn't be used inside locks used by
-  * cgroup controller callbacks.
-  *
-  * Return value is the same as kernfs_path().
-  */
- int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
- {
-       struct cgroup_root *root;
-       struct cgroup *cgrp;
-       int hierarchy_id = 1;
-       int ret;
- 
-       cgroup_lock();
-       spin_lock_irq(&css_set_lock);
- 
-       root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
- 
-       if (root) {
-               cgrp = task_cgroup_from_root(task, root);
-               ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
-       } else {
-               /* if no hierarchy exists, everyone is in "/" */
-               ret = strscpy(buf, "/", buflen);
-       }
- 
-       spin_unlock_irq(&css_set_lock);
-       cgroup_unlock();
-       return ret;
- }
- EXPORT_SYMBOL_GPL(task_cgroup_path);
- 
   /**
    * cgroup_attach_lock - Lock for ->attach()
    * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
@@@ -2885,9 -2816,9 +2830,9 @@@ int cgroup_migrate(struct task_struct *
         struct task_struct *task;
   
         /*
-        * Prevent freeing of tasks while we take a snapshot. Tasks that are
-        * already PF_EXITING could be freed from underneath us unless we
-        * take an rcu_read_lock.
+        * The following thread iteration should be inside an RCU critical
+        * section to prevent tasks from being freed while taking the snapshot.
+        * spin_lock_irq() implies RCU critical section here.
          */
         spin_lock_irq(&css_set_lock);
         task = leader;
@@@ -3891,14 -3822,6 +3836,14 @@@ static __poll_t cgroup_pressure_poll(st
         return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
   }
   
+ +static int cgroup_pressure_open(struct kernfs_open_file *of)
+ +{
+ +      if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+ +              return -EPERM;
+ +
+ +      return 0;
+ +}
+ +
   static void cgroup_pressure_release(struct kernfs_open_file *of)
   {
         struct cgroup_file_ctx *ctx = of->priv;
@@@ -5298,7 -5221,6 +5243,7 @@@ static struct cftype cgroup_psi_files[
         {
                 .name = "io.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
+ +              .open = cgroup_pressure_open,
                 .seq_show = cgroup_io_pressure_show,
                 .write = cgroup_io_pressure_write,
                 .poll = cgroup_pressure_poll,
@@@ -5307,7 -5229,6 +5252,7 @@@
         {
                 .name = "memory.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
+ +              .open = cgroup_pressure_open,
                 .seq_show = cgroup_memory_pressure_show,
                 .write = cgroup_memory_pressure_write,
                 .poll = cgroup_pressure_poll,
@@@ -5316,7 -5237,6 +5261,7 @@@
         {
                 .name = "cpu.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
+ +              .open = cgroup_pressure_open,
                 .seq_show = cgroup_cpu_pressure_show,
                 .write = cgroup_cpu_pressure_write,
                 .poll = cgroup_pressure_poll,
@@@ -5326,7 -5246,6 +5271,7 @@@
         {
                 .name = "irq.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ +              .open = cgroup_pressure_open,
                 .seq_show = cgroup_irq_pressure_show,
                 .write = cgroup_irq_pressure_write,
                 .poll = cgroup_pressure_poll,
@@@ -6512,18 -6431,19 +6457,18 @@@ err
   static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
         __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
   {
+ +      struct cgroup *cgrp = kargs->cgrp;
+ +      struct css_set *cset = kargs->cset;
+ +
         cgroup_threadgroup_change_end(current);
   
- -      if (kargs->flags & CLONE_INTO_CGROUP) {
- -              struct cgroup *cgrp = kargs->cgrp;
- -              struct css_set *cset = kargs->cset;
+ +      if (cset) {
+ +              put_css_set(cset);
+ +              kargs->cset = NULL;
+ +      }
   
+ +      if (kargs->flags & CLONE_INTO_CGROUP) {
                 cgroup_unlock();
- -
- -              if (cset) {
- -                      put_css_set(cset);
- -                      kargs->cset = NULL;
- -              }
- -
                 if (cgrp) {
                         cgroup_put(cgrp);
                         kargs->cgrp = NULL;
@@@ -6708,6 -6628,9 +6653,9 @@@ void cgroup_exit(struct task_struct *ts
         list_add_tail(&tsk->cg_list, &cset->dying_tasks);
         cset->nr_tasks--;
   
+       if (dl_task(tsk))
+               dec_dl_tasks_cs(tsk);
+ 
         WARN_ON_ONCE(cgroup_task_frozen(tsk));
         if (unlikely(!(tsk->flags & PF_KTHREAD) &&
                      test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
diff --combined kernel/sched/core.c

index 23203daf71281f697d41657d3fb387de87345057,ed0d7381b2ec92c900e000b7e1b3bd6e927e9f49..c52c2eba7c739fc92fdf98664eb645cac3c5876e
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -2213,154 -2213,6 +2213,154 @@@ void check_preempt_curr(struct rq *rq, 
                 rq_clock_skip_update(rq);
   }
   
+ +static __always_inline
+ +int __task_state_match(struct task_struct *p, unsigned int state)
+ +{
+ +      if (READ_ONCE(p->__state) & state)
+ +              return 1;
+ +
+ +#ifdef CONFIG_PREEMPT_RT
+ +      if (READ_ONCE(p->saved_state) & state)
+ +              return -1;
+ +#endif
+ +      return 0;
+ +}
+ +
+ +static __always_inline
+ +int task_state_match(struct task_struct *p, unsigned int state)
+ +{
+ +#ifdef CONFIG_PREEMPT_RT
+ +      int match;
+ +
+ +      /*
+ +       * Serialize against current_save_and_set_rtlock_wait_state() and
+ +       * current_restore_rtlock_saved_state().
+ +       */
+ +      raw_spin_lock_irq(&p->pi_lock);
+ +      match = __task_state_match(p, state);
+ +      raw_spin_unlock_irq(&p->pi_lock);
+ +
+ +      return match;
+ +#else
+ +      return __task_state_match(p, state);
+ +#endif
+ +}
+ +
+ +/*
+ + * wait_task_inactive - wait for a thread to unschedule.
+ + *
+ + * Wait for the thread to block in any of the states set in @match_state.
+ + * If it changes, i.e. @p might have woken up, then return zero.  When we
+ + * succeed in waiting for @p to be off its CPU, we return a positive number
+ + * (its total switch count).  If a second call a short while later returns the
+ + * same number, the caller can be sure that @p has remained unscheduled the
+ + * whole time.
+ + *
+ + * The caller must ensure that the task *will* unschedule sometime soon,
+ + * else this function might spin for a *long* time. This function can't
+ + * be called with interrupts off, or it may introduce deadlock with
+ + * smp_call_function() if an IPI is sent by the same process we are
+ + * waiting to become inactive.
+ + */
+ +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+ +{
+ +      int running, queued, match;
+ +      struct rq_flags rf;
+ +      unsigned long ncsw;
+ +      struct rq *rq;
+ +
+ +      for (;;) {
+ +              /*
+ +               * We do the initial early heuristics without holding
+ +               * any task-queue locks at all. We'll only try to get
+ +               * the runqueue lock when things look like they will
+ +               * work out!
+ +               */
+ +              rq = task_rq(p);
+ +
+ +              /*
+ +               * If the task is actively running on another CPU
+ +               * still, just relax and busy-wait without holding
+ +               * any locks.
+ +               *
+ +               * NOTE! Since we don't hold any locks, it's not
+ +               * even sure that "rq" stays as the right runqueue!
+ +               * But we don't care, since "task_on_cpu()" will
+ +               * return false if the runqueue has changed and p
+ +               * is actually now running somewhere else!
+ +               */
+ +              while (task_on_cpu(rq, p)) {
+ +                      if (!task_state_match(p, match_state))
+ +                              return 0;
+ +                      cpu_relax();
+ +              }
+ +
+ +              /*
+ +               * Ok, time to look more closely! We need the rq
+ +               * lock now, to be *sure*. If we're wrong, we'll
+ +               * just go back and repeat.
+ +               */
+ +              rq = task_rq_lock(p, &rf);
+ +              trace_sched_wait_task(p);
+ +              running = task_on_cpu(rq, p);
+ +              queued = task_on_rq_queued(p);
+ +              ncsw = 0;
+ +              if ((match = __task_state_match(p, match_state))) {
+ +                      /*
+ +                       * When matching on p->saved_state, consider this task
+ +                       * still queued so it will wait.
+ +                       */
+ +                      if (match < 0)
+ +                              queued = 1;
+ +                      ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ +              }
+ +              task_rq_unlock(rq, p, &rf);
+ +
+ +              /*
+ +               * If it changed from the expected state, bail out now.
+ +               */
+ +              if (unlikely(!ncsw))
+ +                      break;
+ +
+ +              /*
+ +               * Was it really running after all now that we
+ +               * checked with the proper locks actually held?
+ +               *
+ +               * Oops. Go back and try again..
+ +               */
+ +              if (unlikely(running)) {
+ +                      cpu_relax();
+ +                      continue;
+ +              }
+ +
+ +              /*
+ +               * It's not enough that it's not actively running,
+ +               * it must be off the runqueue _entirely_, and not
+ +               * preempted!
+ +               *
+ +               * So if it was still runnable (but just not actively
+ +               * running right now), it's preempted, and we should
+ +               * yield - it could be a while.
+ +               */
+ +              if (unlikely(queued)) {
+ +                      ktime_t to = NSEC_PER_SEC / HZ;
+ +
+ +                      set_current_state(TASK_UNINTERRUPTIBLE);
+ +                      schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+ +                      continue;
+ +              }
+ +
+ +              /*
+ +               * Ahh, all good. It wasn't running, and it wasn't
+ +               * runnable, which means that it will never become
+ +               * running in the future either. We're all done!
+ +               */
+ +              break;
+ +      }
+ +
+ +      return ncsw;
+ +}
+ +
   #ifdef CONFIG_SMP
   
   static void
@@@ -2546,6 -2398,7 +2546,6 @@@ static struct rq *__migrate_task(struc
         if (!is_cpu_allowed(p, dest_cpu))
                 return rq;
   
- -      update_rq_clock(rq);
         rq = move_queued_task(rq, rf, p, dest_cpu);
   
         return rq;
@@@ -2603,12 -2456,10 +2603,12 @@@ static int migration_cpu_stop(void *dat
                                 goto out;
                 }
   
- -              if (task_on_rq_queued(p))
+ +              if (task_on_rq_queued(p)) {
+ +                      update_rq_clock(rq);
                         rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- -              else
+ +              } else {
                         p->wake_cpu = arg->dest_cpu;
+ +              }
   
                 /*
                  * XXX __migrate_task() can fail, at which point we might end
@@@ -3490,6 -3341,114 +3490,6 @@@ out
   }
   #endif /* CONFIG_NUMA_BALANCING */
   
- -/*
- - * wait_task_inactive - wait for a thread to unschedule.
- - *
- - * Wait for the thread to block in any of the states set in @match_state.
- - * If it changes, i.e. @p might have woken up, then return zero.  When we
- - * succeed in waiting for @p to be off its CPU, we return a positive number
- - * (its total switch count).  If a second call a short while later returns the
- - * same number, the caller can be sure that @p has remained unscheduled the
- - * whole time.
- - *
- - * The caller must ensure that the task *will* unschedule sometime soon,
- - * else this function might spin for a *long* time. This function can't
- - * be called with interrupts off, or it may introduce deadlock with
- - * smp_call_function() if an IPI is sent by the same process we are
- - * waiting to become inactive.
- - */
- -unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
- -{
- -      int running, queued;
- -      struct rq_flags rf;
- -      unsigned long ncsw;
- -      struct rq *rq;
- -
- -      for (;;) {
- -              /*
- -               * We do the initial early heuristics without holding
- -               * any task-queue locks at all. We'll only try to get
- -               * the runqueue lock when things look like they will
- -               * work out!
- -               */
- -              rq = task_rq(p);
- -
- -              /*
- -               * If the task is actively running on another CPU
- -               * still, just relax and busy-wait without holding
- -               * any locks.
- -               *
- -               * NOTE! Since we don't hold any locks, it's not
- -               * even sure that "rq" stays as the right runqueue!
- -               * But we don't care, since "task_on_cpu()" will
- -               * return false if the runqueue has changed and p
- -               * is actually now running somewhere else!
- -               */
- -              while (task_on_cpu(rq, p)) {
- -                      if (!(READ_ONCE(p->__state) & match_state))
- -                              return 0;
- -                      cpu_relax();
- -              }
- -
- -              /*
- -               * Ok, time to look more closely! We need the rq
- -               * lock now, to be *sure*. If we're wrong, we'll
- -               * just go back and repeat.
- -               */
- -              rq = task_rq_lock(p, &rf);
- -              trace_sched_wait_task(p);
- -              running = task_on_cpu(rq, p);
- -              queued = task_on_rq_queued(p);
- -              ncsw = 0;
- -              if (READ_ONCE(p->__state) & match_state)
- -                      ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- -              task_rq_unlock(rq, p, &rf);
- -
- -              /*
- -               * If it changed from the expected state, bail out now.
- -               */
- -              if (unlikely(!ncsw))
- -                      break;
- -
- -              /*
- -               * Was it really running after all now that we
- -               * checked with the proper locks actually held?
- -               *
- -               * Oops. Go back and try again..
- -               */
- -              if (unlikely(running)) {
- -                      cpu_relax();
- -                      continue;
- -              }
- -
- -              /*
- -               * It's not enough that it's not actively running,
- -               * it must be off the runqueue _entirely_, and not
- -               * preempted!
- -               *
- -               * So if it was still runnable (but just not actively
- -               * running right now), it's preempted, and we should
- -               * yield - it could be a while.
- -               */
- -              if (unlikely(queued)) {
- -                      ktime_t to = NSEC_PER_SEC / HZ;
- -
- -                      set_current_state(TASK_UNINTERRUPTIBLE);
- -                      schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
- -                      continue;
- -              }
- -
- -              /*
- -               * Ahh, all good. It wasn't running, and it wasn't
- -               * runnable, which means that it will never become
- -               * running in the future either. We're all done!
- -               */
- -              break;
- -      }
- -
- -      return ncsw;
- -}
- -
   /***
    * kick_process - kick a running thread to enter/exit the kernel
    * @p: the to-be-kicked thread
@@@ -4044,14 -4003,15 +4044,14 @@@ static void ttwu_queue(struct task_stru
   static __always_inline
   bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
   {
+ +      int match;
+ +
         if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
                 WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
                              state != TASK_RTLOCK_WAIT);
         }
   
- -      if (READ_ONCE(p->__state) & state) {
- -              *success = 1;
- -              return true;
- -      }
+ +      *success = !!(match = __task_state_match(p, state));
   
   #ifdef CONFIG_PREEMPT_RT
         /*
@@@ -4067,10 -4027,12 +4067,10 @@@
          * p::saved_state to TASK_RUNNING so any further tests will
          * not result in false positives vs. @success
          */
- -      if (p->saved_state & state) {
+ +      if (match < 0)
                 p->saved_state = TASK_RUNNING;
- -              *success = 1;
- -      }
   #endif
- -      return false;
+ +      return match > 0;
   }
   
   /*
@@@ -5670,9 -5632,6 +5670,9 @@@ void scheduler_tick(void
   
         perf_event_task_tick();
   
+ +      if (curr->flags & PF_WQ_WORKER)
+ +              wq_worker_tick(curr);
+ +
   #ifdef CONFIG_SMP
         rq->idle_balance = idle_cpu(cpu);
         trigger_load_balance(rq);
@@@ -7631,6 -7590,7 +7631,7 @@@ static int __sched_setscheduler(struct 
         int reset_on_fork;
         int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
         struct rq *rq;
+       bool cpuset_locked = false;
   
         /* The pi code expects interrupts enabled */
         BUG_ON(pi && in_interrupt());
@@@ -7680,8 -7640,14 +7681,14 @@@ recheck
                         return retval;
         }
   
-       if (pi)
-               cpuset_read_lock();
+       /*
+        * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+        * information.
+        */
+       if (dl_policy(policy) || dl_policy(p->policy)) {
+               cpuset_locked = true;
+               cpuset_lock();
+       }
   
         /*
          * Make sure no PI-waiters arrive (or leave) while we are
@@@ -7757,8 -7723,8 +7764,8 @@@ change
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
                 task_rq_unlock(rq, p, &rf);
-               if (pi)
-                       cpuset_read_unlock();
+               if (cpuset_locked)
+                       cpuset_unlock();
                 goto recheck;
         }
   
@@@ -7825,7 -7791,8 +7832,8 @@@
         task_rq_unlock(rq, p, &rf);
   
         if (pi) {
-               cpuset_read_unlock();
+               if (cpuset_locked)
+                       cpuset_unlock();
                 rt_mutex_adjust_pi(p);
         }
   
@@@ -7837,8 -7804,8 +7845,8 @@@
   
   unlock:
         task_rq_unlock(rq, p, &rf);
-       if (pi)
-               cpuset_read_unlock();
+       if (cpuset_locked)
+               cpuset_unlock();
         return retval;
   }
   
@@@ -9327,8 -9294,7 +9335,7 @@@ int cpuset_cpumask_can_shrink(const str
         return ret;
   }
   
- int task_can_attach(struct task_struct *p,
-                   const struct cpumask *cs_effective_cpus)
+ int task_can_attach(struct task_struct *p)
   {
         int ret = 0;
   
@@@ -9341,21 -9307,9 +9348,9 @@@
          * success of set_cpus_allowed_ptr() on all attached tasks
          * before cpus_mask may be changed.
          */
-       if (p->flags & PF_NO_SETAFFINITY) {
+       if (p->flags & PF_NO_SETAFFINITY)
                 ret = -EINVAL;
-               goto out;
-       }
   
-       if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
-                                             cs_effective_cpus)) {
-               int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
- 
-               if (unlikely(cpu >= nr_cpu_ids))
-                       return -EINVAL;
-               ret = dl_cpu_busy(cpu, p);
-       }
- 
- out:
         return ret;
   }
   
@@@ -9589,7 -9543,6 +9584,7 @@@ void set_rq_offline(struct rq *rq
         if (rq->online) {
                 const struct sched_class *class;
   
+ +              update_rq_clock(rq);
                 for_each_class(class) {
                         if (class->rq_offline)
                                 class->rq_offline(rq);
@@@ -9638,7 -9591,7 +9633,7 @@@ static void cpuset_cpu_active(void
   static int cpuset_cpu_inactive(unsigned int cpu)
   {
         if (!cpuhp_tasks_frozen) {
-               int ret = dl_cpu_busy(cpu, NULL);
+               int ret = dl_bw_check_overflow(cpu);
   
                 if (ret)
                         return ret;
@@@ -9731,6 -9684,7 +9726,6 @@@ int sched_cpu_deactivate(unsigned int c
   
         rq_lock_irqsave(rq, &rf);
         if (rq->rd) {
- -              update_rq_clock(rq);
                 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                 set_rq_offline(rq);
         }
@@@ -11533,7 -11487,7 +11528,7 @@@ void call_trace_sched_update_nr_running
   
   #ifdef CONFIG_SCHED_MM_CID
   
- -/**
+ +/*
    * @cid_lock: Guarantee forward-progress of cid allocation.
    *
    * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
@@@ -11542,7 -11496,7 +11537,7 @@@
    */
   DEFINE_RAW_SPINLOCK(cid_lock);
   
- -/**
+ +/*
    * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
    *
    * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
diff --combined kernel/sched/deadline.c

index e41a36bd66a6218b447847d57c2ae64fdbb87f4f,166c3e6eae6173fac30ca8ea3c899850aa516e70..58b542bf2893436185aed5f3c67b9d0ebef96eb2
--- 1/kernel/sched/deadline.c
--- 2/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@@ -16,6 -16,8 +16,8 @@@
    *                    Fabio Checconi <[email protected]>
    */
   
+ #include <linux/cpuset.h>
+ 
   /*
    * Default limits for DL period; on the top end we guard against small util
    * tasks still getting ridiculously long effective runtimes, on the bottom end we
@@@ -489,6 -491,13 +491,6 @@@ static inline int is_leftmost(struct ta
   
   static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
   
- -void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
- -{
- -      raw_spin_lock_init(&dl_b->dl_runtime_lock);
- -      dl_b->dl_period = period;
- -      dl_b->dl_runtime = runtime;
- -}
- -
   void init_dl_bw(struct dl_bw *dl_b)
   {
         raw_spin_lock_init(&dl_b->lock);
@@@ -1253,39 -1262,43 +1255,39 @@@ int dl_runtime_exceeded(struct sched_dl
   }
   
   /*
- - * This function implements the GRUB accounting rule:
- - * according to the GRUB reclaiming algorithm, the runtime is
- - * not decreased as "dq = -dt", but as
- - * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ + * This function implements the GRUB accounting rule. According to the
+ + * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
+ + * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
    * where u is the utilization of the task, Umax is the maximum reclaimable
    * utilization, Uinact is the (per-runqueue) inactive utilization, computed
    * as the difference between the "total runqueue utilization" and the
- - * runqueue active utilization, and Uextra is the (per runqueue) extra
+ + * "runqueue active utilization", and Uextra is the (per runqueue) extra
    * reclaimable utilization.
- - * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- - * multiplied by 2^BW_SHIFT, the result has to be shifted right by
- - * BW_SHIFT.
- - * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
- - * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
- - * Since delta is a 64 bit variable, to have an overflow its value
- - * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- - * So, overflow is not an issue here.
+ + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ + * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ + * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
+ + * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ + * Since delta is a 64 bit variable, to have an overflow its value should be
+ + * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ + * not an issue here.
    */
   static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
   {
- -      u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
         u64 u_act;
- -      u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+ +      u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
   
         /*
- -       * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
- -       * we compare u_inact + rq->dl.extra_bw with
- -       * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
- -       * u_inact + rq->dl.extra_bw can be larger than
- -       * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
- -       * leading to wrong results)
+ +       * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+ +       * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+ +       * can be larger than u_max. So, u_max - u_inact - u_extra would be
+ +       * negative leading to wrong results.
          */
- -      if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
- -              u_act = u_act_min;
+ +      if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+ +              u_act = dl_se->dl_bw;
         else
- -              u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+ +              u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
   
+ +      u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
         return (delta * u_act) >> BW_SHIFT;
   }
   
@@@ -2585,6 -2598,12 +2587,12 @@@ static void switched_from_dl(struct rq 
         if (task_on_rq_queued(p) && p->dl.dl_runtime)
                 task_non_contending(p);
   
+       /*
+        * In case a task is setscheduled out from SCHED_DEADLINE we need to
+        * keep track of that on its cpuset (for correct bandwidth tracking).
+        */
+       dec_dl_tasks_cs(p);
+ 
         if (!task_on_rq_queued(p)) {
                 /*
                  * Inactive timer is armed. However, p is leaving DEADLINE and
@@@ -2625,6 -2644,12 +2633,12 @@@ static void switched_to_dl(struct rq *r
         if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
                 put_task_struct(p);
   
+       /*
+        * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+        * track of that on its cpuset (for correct bandwidth tracking).
+        */
+       inc_dl_tasks_cs(p);
+ 
         /* If p is not queued we will update its parameters at next wakeup. */
         if (!task_on_rq_queued(p)) {
                 add_rq_bw(&p->dl, &rq->dl);
@@@ -2784,12 -2809,12 +2798,12 @@@ static void init_dl_rq_bw_ratio(struct 
   {
         if (global_rt_runtime() == RUNTIME_INF) {
                 dl_rq->bw_ratio = 1 << RATIO_SHIFT;
- -              dl_rq->extra_bw = 1 << BW_SHIFT;
+ +              dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
         } else {
                 dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
                           global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
- -              dl_rq->extra_bw = to_ratio(global_rt_period(),
- -                                                  global_rt_runtime());
+ +              dl_rq->max_bw = dl_rq->extra_bw =
+ +                      to_ratio(global_rt_period(), global_rt_runtime());
         }
   }
   
@@@ -3033,26 -3058,38 +3047,38 @@@ int dl_cpuset_cpumask_can_shrink(const 
         return ret;
   }
   
- int dl_cpu_busy(int cpu, struct task_struct *p)
+ enum dl_bw_request {
+       dl_bw_req_check_overflow = 0,
+       dl_bw_req_alloc,
+       dl_bw_req_free
+ };
+ 
+ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
   {
-       unsigned long flags, cap;
+       unsigned long flags;
         struct dl_bw *dl_b;
-       bool overflow;
+       bool overflow = 0;
   
         rcu_read_lock_sched();
         dl_b = dl_bw_of(cpu);
         raw_spin_lock_irqsave(&dl_b->lock, flags);
-       cap = dl_bw_capacity(cpu);
-       overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
   
-       if (!overflow && p) {
-               /*
-                * We reserve space for this task in the destination
-                * root_domain, as we can't fail after this point.
-                * We will free resources in the source root_domain
-                * later on (see set_cpus_allowed_dl()).
-                */
-               __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+       if (req == dl_bw_req_free) {
+               __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+       } else {
+               unsigned long cap = dl_bw_capacity(cpu);
+ 
+               overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+ 
+               if (req == dl_bw_req_alloc && !overflow) {
+                       /*
+                        * We reserve space in the destination
+                        * root_domain, as we can't fail after this point.
+                        * We will free resources in the source root_domain
+                        * later on (see set_cpus_allowed_dl()).
+                        */
+                       __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+               }
         }
   
         raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@@ -3060,6 -3097,21 +3086,21 @@@
   
         return overflow ? -EBUSY : 0;
   }
+ 
+ int dl_bw_check_overflow(int cpu)
+ {
+       return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+ }
+ 
+ int dl_bw_alloc(int cpu, u64 dl_bw)
+ {
+       return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+ }
+ 
+ void dl_bw_free(int cpu, u64 dl_bw)
+ {
+       dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
+ }
   #endif
   
   #ifdef CONFIG_SCHED_DEBUG
diff --combined kernel/sched/sched.h

index 50d4b61aef3aff70ce3db35cd84e2affb5468e2a,0ad712811e3552d97c48340df5a480eff9cce082..e93e006a942b9088406ccc0f9a1a5edb2d006e2c
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -286,6 -286,12 +286,6 @@@ struct rt_bandwidth 
   
   void __dl_clear_params(struct task_struct *p);
   
- -struct dl_bandwidth {
- -      raw_spinlock_t          dl_runtime_lock;
- -      u64                     dl_runtime;
- -      u64                     dl_period;
- -};
- -
   static inline int dl_bandwidth_enabled(void)
   {
         return sysctl_sched_rt_runtime >= 0;
@@@ -324,7 -330,7 +324,7 @@@ extern void __getparam_dl(struct task_s
   extern bool __checkparam_dl(const struct sched_attr *attr);
   extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
   extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
- extern int  dl_cpu_busy(int cpu, struct task_struct *p);
+ extern int  dl_bw_check_overflow(int cpu);
   
   #ifdef CONFIG_CGROUP_SCHED
   
@@@ -747,12 -753,6 +747,12 @@@ struct dl_rq 
         u64                     this_bw;
         u64                     extra_bw;
   
+ +      /*
+ +       * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
+ +       * tasks of this rq. Used in calculation of reclaimable bandwidth (GRUB).
+ +       */
+ +      u64                     max_bw;
+ +
         /*
          * Inverse of the fraction of CPU utilization that can be reclaimed
          * by the GRUB algorithm.
@@@ -1546,28 -1546,6 +1546,28 @@@ static inline void rq_clock_cancel_skip
         rq->clock_update_flags &= ~RQCF_REQ_SKIP;
   }
   
+ +/*
+ + * During cpu offlining and rq wide unthrottling, we can trigger
+ + * an update_rq_clock() for several cfs and rt runqueues (Typically
+ + * when using list_for_each_entry_*)
+ + * rq_clock_start_loop_update() can be called after updating the clock
+ + * once and before iterating over the list to prevent multiple updates.
+ + * After the iterative traversal, we need to call rq_clock_stop_loop_update()
+ + * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ + */
+ +static inline void rq_clock_start_loop_update(struct rq *rq)
+ +{
+ +      lockdep_assert_rq_held(rq);
+ +      SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ +      rq->clock_update_flags |= RQCF_ACT_SKIP;
+ +}
+ +
+ +static inline void rq_clock_stop_loop_update(struct rq *rq)
+ +{
+ +      lockdep_assert_rq_held(rq);
+ +      rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+ +}
+ +
   struct rq_flags {
         unsigned long flags;
         struct pin_cookie cookie;
@@@ -1794,13 -1772,6 +1794,13 @@@ queue_balance_callback(struct rq *rq
         for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
                         __sd; __sd = __sd->parent)
   
+ +/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+ +static const unsigned int SD_SHARED_CHILD_MASK =
+ +#include <linux/sched/sd_flags.h>
+ +0;
+ +#undef SD_FLAG
+ +
   /**
    * highest_flag_domain - Return highest sched_domain containing flag.
    * @cpu:      The CPU whose highest level of sched domain is to
@@@ -1808,25 -1779,16 +1808,25 @@@
    * @flag:     The flag to check for the highest sched_domain
    *            for the given CPU.
    *
- - * Returns the highest sched_domain of a CPU which contains the given flag.
+ + * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ + * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
    */
   static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
   {
         struct sched_domain *sd, *hsd = NULL;
   
         for_each_domain(cpu, sd) {
- -              if (!(sd->flags & flag))
+ +              if (sd->flags & flag) {
+ +                      hsd = sd;
+ +                      continue;
+ +              }
+ +
+ +              /*
+ +               * Stop the search if @flag is known to be shared at lower
+ +               * levels. It will not be found further up.
+ +               */
+ +              if (flag & SD_SHARED_CHILD_MASK)
                         break;
- -              hsd = sd;
         }
   
         return hsd;
@@@ -2416,6 -2378,7 +2416,6 @@@ extern struct rt_bandwidth def_rt_bandw
   extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
   extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
   
- -extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
   extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
   extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
   
@@@ -2983,9 -2946,53 +2983,9 @@@ static inline unsigned long cpu_util_dl
         return READ_ONCE(rq->avg_dl.util_avg);
   }
   
- -/**
- - * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
- - * @cpu: the CPU to get the utilization for.
- - *
- - * The unit of the return value must be the same as the one of CPU capacity
- - * so that CPU utilization can be compared with CPU capacity.
- - *
- - * CPU utilization is the sum of running time of runnable tasks plus the
- - * recent utilization of currently non-runnable tasks on that CPU.
- - * It represents the amount of CPU capacity currently used by CFS tasks in
- - * the range [0..max CPU capacity] with max CPU capacity being the CPU
- - * capacity at f_max.
- - *
- - * The estimated CPU utilization is defined as the maximum between CPU
- - * utilization and sum of the estimated utilization of the currently
- - * runnable tasks on that CPU. It preserves a utilization "snapshot" of
- - * previously-executed tasks, which helps better deduce how busy a CPU will
- - * be when a long-sleeping task wakes up. The contribution to CPU utilization
- - * of such a task would be significantly decayed at this point of time.
- - *
- - * CPU utilization can be higher than the current CPU capacity
- - * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
- - * of rounding errors as well as task migrations or wakeups of new tasks.
- - * CPU utilization has to be capped to fit into the [0..max CPU capacity]
- - * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
- - * could be seen as over-utilized even though CPU1 has 20% of spare CPU
- - * capacity. CPU utilization is allowed to overshoot current CPU capacity
- - * though since this is useful for predicting the CPU capacity required
- - * after task migrations (scheduler-driven DVFS).
- - *
- - * Return: (Estimated) utilization for the specified CPU.
- - */
- -static inline unsigned long cpu_util_cfs(int cpu)
- -{
- -      struct cfs_rq *cfs_rq;
- -      unsigned long util;
- -
- -      cfs_rq = &cpu_rq(cpu)->cfs;
- -      util = READ_ONCE(cfs_rq->avg.util_avg);
   
- -      if (sched_feat(UTIL_EST)) {
- -              util = max_t(unsigned long, util,
- -                           READ_ONCE(cfs_rq->avg.util_est.enqueued));
- -      }
- -
- -      return min(util, capacity_orig_of(cpu));
- -}
+ +extern unsigned long cpu_util_cfs(int cpu);
+ +extern unsigned long cpu_util_cfs_boost(int cpu);
   
   static inline unsigned long cpu_util_rt(struct rq *rq)
   {
author	Linus Torvalds <[email protected]>
	Tue, 27 Jun 2023 23:54:21 +0000 (16:54 -0700)
committer	Linus Torvalds <[email protected]>
	Tue, 27 Jun 2023 23:54:21 +0000 (16:54 -0700)
		1	2
Documentation/admin-guide/cgroup-v2.rst	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup-v1.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/deadline.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history