Merge tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <[email protected]>
Thu, 19 Sep 2024 13:55:58 +0000 (15:55 +0200)
committer Linus Torvalds <[email protected]>
Thu, 19 Sep 2024 13:55:58 +0000 (15:55 +0200)
Pull scheduler updates from Ingo Molnar:

 - Implement the SCHED_DEADLINE server infrastructure - Daniel Bristot
   de Oliveira's last major contribution to the kernel:

     "SCHED_DEADLINE servers can help fixing starvation issues of low
      priority tasks (e.g., SCHED_OTHER) when higher priority tasks
      monopolize CPU cycles. Today we have RT Throttling; DEADLINE
      servers should be able to replace and improve that."

   (Daniel Bristot de Oliveira, Peter Zijlstra, Joel Fernandes, Youssef
   Esmat, Huang Shijie)
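
   As a minimal userspace illustration (not part of this series) of the
   starvation scenario quoted above: a SCHED_FIFO busy loop pinned to one
   CPU will starve SCHED_OTHER tasks on that CPU unless some bandwidth
   mechanism (previously RT throttling, now the fair deadline server)
   briefly preempts it.

      #define _GNU_SOURCE
      #include <sched.h>
      #include <stdio.h>

      int main(void)
      {
              cpu_set_t set;
              struct sched_param sp = { .sched_priority = 50 };

              CPU_ZERO(&set);
              CPU_SET(0, &set);
              if (sched_setaffinity(0, sizeof(set), &set))
                      perror("sched_setaffinity");

              /* Needs CAP_SYS_NICE; from here on this loop owns CPU 0. */
              if (sched_setscheduler(0, SCHED_FIFO, &sp))
                      perror("sched_setscheduler");

              for (;;)
                      ;       /* never blocks, never yields */
      }

   The fair server's bandwidth is tunable per CPU (runtime/period knobs,
   exposed via debugfs in the posted series), reserving a small part of
   each period for the fair class instead of the old global RT throttle.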

 - Preparatory changes for sched_ext integration:
     - Use set_next_task(.first) where required
     - Fix up set_next_task() implementations
     - Clean up DL server vs. core sched
     - Split up put_prev_task_balance()
     - Rework pick_next_task()
     - Combine the last put_prev_task() and the first set_next_task()
     - Rework dl_server
     - Add put_prev_task(.next)

   (Peter Zijlstra, with a fix by Tejun Heo)

 - Complete the EEVDF transition and refine EEVDF scheduling:
     - Implement delayed dequeue
     - Allow shorter slices to wakeup-preempt
      - Use sched_attr::sched_runtime to set request/slice suggestion
        (see the sketch after this list)
     - Document the new feature flags
     - Remove unused and duplicate-functionality fields
     - Simplify & unify pick_next_task_fair()
     - Misc debuggability enhancements

   (Peter Zijlstra, with fixes/cleanups by Dietmar Eggemann, Valentin
   Schneider and Chuyi Zhou)
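
   For the sched_attr::sched_runtime item in the list above, here is a
   sketch of requesting a custom slice from userspace. It assumes the
   semantics described in this log: for a SCHED_OTHER task, a non-zero
   sched_runtime is taken as a slice suggestion (and, per the posted
   patches, clamped to a sane range by the kernel). glibc has no
   sched_setattr() wrapper, so the struct below mirrors the UAPI layout
   and the raw syscall is used.

      #include <stdint.h>
      #include <stdio.h>
      #include <string.h>
      #include <unistd.h>
      #include <sys/syscall.h>

      struct sched_attr {              /* mirrors <linux/sched/types.h> */
              uint32_t size;
              uint32_t sched_policy;
              uint64_t sched_flags;
              int32_t  sched_nice;
              uint32_t sched_priority;
              uint64_t sched_runtime;  /* ns; for SCHED_OTHER: slice hint */
              uint64_t sched_deadline;
              uint64_t sched_period;
              uint32_t sched_util_min;
              uint32_t sched_util_max;
      };

      int main(void)
      {
              struct sched_attr attr;

              memset(&attr, 0, sizeof(attr));
              attr.size          = sizeof(attr);
              attr.sched_policy  = 0;                   /* SCHED_OTHER */
              attr.sched_runtime = 3ULL * 1000 * 1000;  /* suggest ~3ms slices */

              if (syscall(SYS_sched_setattr, 0, &attr, 0))
                      perror("sched_setattr");
              return 0;
      }

   Combined with the "allow shorter slices to wakeup-preempt" change, a
   latency-sensitive task can hint that it prefers shorter, more frequent
   slices than the default sysctl_sched_base_slice.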

 - Initialize the vruntime of a new task when it is first enqueued,
   resulting in a significant decrease in the latency of newly woken
   tasks (Zhang Qiao)

 - Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
   (K Prateek Nayak, Peter Zijlstra)

 - Clean up and clarify the usage of rt_task()
   (Qais Yousef)

 - Preempt SCHED_IDLE entities in strict cgroup hierarchies
   (Tianchen Ding)

 - Clarify the documentation of time units for deadline scheduler
   parameters (Christian Loehle)
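
   As a worked reminder of those units: sched_runtime, sched_deadline and
   sched_period are all given in nanoseconds. A minimal sketch, using the
   same raw sched_setattr() syscall and UAPI struct layout as above (the
   values are arbitrary and admission control may still reject them):

      #include <stdint.h>
      #include <stdio.h>
      #include <string.h>
      #include <unistd.h>
      #include <sys/syscall.h>

      #define SCHED_DEADLINE 6

      struct sched_attr {              /* mirrors <linux/sched/types.h> */
              uint32_t size;
              uint32_t sched_policy;
              uint64_t sched_flags;
              int32_t  sched_nice;
              uint32_t sched_priority;
              uint64_t sched_runtime;  /* nanoseconds */
              uint64_t sched_deadline; /* nanoseconds */
              uint64_t sched_period;   /* nanoseconds */
              uint32_t sched_util_min;
              uint32_t sched_util_max;
      };

      int main(void)
      {
              struct sched_attr attr;

              memset(&attr, 0, sizeof(attr));
              attr.size           = sizeof(attr);
              attr.sched_policy   = SCHED_DEADLINE;
              attr.sched_runtime  =  5ULL * 1000 * 1000;  /*  5 ms */
              attr.sched_deadline = 10ULL * 1000 * 1000;  /* 10 ms */
              attr.sched_period   = 16ULL * 1000 * 1000;  /* 16 ms */

              /* Needs privilege; the requested bandwidth must pass admission. */
              if (syscall(SYS_sched_setattr, 0, &attr, 0))
                      perror("sched_setattr");
              return 0;
      }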

 - Remove the HZ_BW chicken-bit feature flag introduced a year ago;
   the original change seems to be working fine (Phil Auld)

 - Misc fixes and cleanups (Chen Yu, Dan Carpenter, Huang Shijie,
   Peilin He, Qais Yousef and Vincent Guittot)

* tag 'sched-core-2024-09-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (64 commits)
  sched/cpufreq: Use NSEC_PER_MSEC for deadline task
  cpufreq/cppc: Use NSEC_PER_MSEC for deadline task
  sched/deadline: Clarify nanoseconds in uapi
  sched/deadline: Convert schedtool example to chrt
  sched/debug: Fix the runnable tasks output
  sched: Fix sched_delayed vs sched_core
  kernel/sched: Fix util_est accounting for DELAY_DEQUEUE
  kthread: Fix task state in kthread worker if being frozen
  sched/pelt: Use rq_clock_task() for hw_pressure
  sched/fair: Move effective_cpu_util() and effective_cpu_util() in fair.c
  sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()
  sched: Add put_prev_task(.next)
  sched: Rework dl_server
  sched: Combine the last put_prev_task() and the first set_next_task()
  sched: Rework pick_next_task()
  sched: Split up put_prev_task_balance()
  sched: Clean up DL server vs core sched
  sched: Fixup set_next_task() implementations
  sched: Use set_next_task(.first) where required
  sched/fair: Properly deactivate sched_delayed task upon class change
  ...

fs/proc/base.c
include/linux/sched.h
kernel/locking/rtmutex.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/syscalls.c
kernel/sys.c
kernel/time/hrtimer.c
mm/page_alloc.c

diff --combined fs/proc/base.c
index e7810f3bd522d580ef96ef0522d1394c76b2f30d,72a1acd03675cc77da7320a14426f40e6fc9513f..b31283d81c52ea2a984519dac166d9bcbb7c99a8
@@@ -85,7 -85,6 +85,7 @@@
  #include <linux/elf.h>
  #include <linux/pid_namespace.h>
  #include <linux/user_namespace.h>
 +#include <linux/fs_parser.h>
  #include <linux/fs_struct.h>
  #include <linux/slab.h>
  #include <linux/sched/autogroup.h>
  static u8 nlink_tid __ro_after_init;
  static u8 nlink_tgid __ro_after_init;
  
 +enum proc_mem_force {
 +      PROC_MEM_FORCE_ALWAYS,
 +      PROC_MEM_FORCE_PTRACE,
 +      PROC_MEM_FORCE_NEVER
 +};
 +
 +static enum proc_mem_force proc_mem_force_override __ro_after_init =
 +      IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
 +      IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
 +      PROC_MEM_FORCE_ALWAYS;
 +
 +static const struct constant_table proc_mem_force_table[] __initconst = {
 +      { "always", PROC_MEM_FORCE_ALWAYS },
 +      { "ptrace", PROC_MEM_FORCE_PTRACE },
 +      { "never", PROC_MEM_FORCE_NEVER },
 +      { }
 +};
 +
 +static int __init early_proc_mem_force_override(char *buf)
 +{
 +      if (!buf)
 +              return -EINVAL;
 +
 +      /*
 +       * lookup_constant() defaults to proc_mem_force_override to preserve
 +       * the initial Kconfig choice in case an invalid param gets passed.
 +       */
 +      proc_mem_force_override = lookup_constant(proc_mem_force_table,
 +                                                buf, proc_mem_force_override);
 +
 +      return 0;
 +}
 +early_param("proc_mem.force_override", early_proc_mem_force_override);
 +
  struct pid_entry {
        const char *name;
        unsigned int len;
@@@ -862,31 -827,12 +862,31 @@@ static int __mem_open(struct inode *ino
  
  static int mem_open(struct inode *inode, struct file *file)
  {
 -      int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
 -
 -      /* OK to pass negative loff_t, we can catch out-of-range */
 -      file->f_mode |= FMODE_UNSIGNED_OFFSET;
 +      if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
 +              return -EINVAL;
 +      return __mem_open(inode, file, PTRACE_MODE_ATTACH);
 +}
  
 -      return ret;
 +static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
 +{
 +      struct task_struct *task;
 +      bool ptrace_active = false;
 +
 +      switch (proc_mem_force_override) {
 +      case PROC_MEM_FORCE_NEVER:
 +              return false;
 +      case PROC_MEM_FORCE_PTRACE:
 +              task = get_proc_task(file_inode(file));
 +              if (task) {
 +                      ptrace_active = READ_ONCE(task->ptrace) &&
 +                                      READ_ONCE(task->mm) == mm &&
 +                                      READ_ONCE(task->parent) == current;
 +                      put_task_struct(task);
 +              }
 +              return ptrace_active;
 +      default:
 +              return true;
 +      }
  }
  
  static ssize_t mem_rw(struct file *file, char __user *buf,
        if (!mmget_not_zero(mm))
                goto free;
  
 -      flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
 +      flags = write ? FOLL_WRITE : 0;
 +      if (proc_mem_foll_force(file, mm))
 +              flags |= FOLL_FORCE;
  
        while (count > 0) {
                size_t this_len = min_t(size_t, count, PAGE_SIZE);
@@@ -988,7 -932,6 +988,7 @@@ static const struct file_operations pro
        .write          = mem_write,
        .open           = mem_open,
        .release        = mem_release,
 +      .fop_flags      = FOP_UNSIGNED_OFFSET,
  };
  
  static int environ_open(struct inode *inode, struct file *file)
@@@ -2333,8 -2276,8 +2333,8 @@@ proc_map_files_instantiate(struct dentr
        inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;
  
 -      d_set_d_op(dentry, &tid_map_files_dentry_operations);
 -      return d_splice_alias(inode, dentry);
 +      return proc_splice_unmountable(inode, dentry,
 +                                     &tid_map_files_dentry_operations);
  }
  
  static struct dentry *proc_map_files_lookup(struct inode *dir,
@@@ -2513,13 -2456,13 +2513,13 @@@ static void *timers_start(struct seq_fi
        if (!tp->sighand)
                return ERR_PTR(-ESRCH);
  
 -      return seq_list_start(&tp->task->signal->posix_timers, *pos);
 +      return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
  }
  
  static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
  {
        struct timers_private *tp = m->private;
 -      return seq_list_next(v, &tp->task->signal->posix_timers, pos);
 +      return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
  }
  
  static void timers_stop(struct seq_file *m, void *v)
@@@ -2548,7 -2491,7 +2548,7 @@@ static int show_timer(struct seq_file *
                [SIGEV_THREAD] = "thread",
        };
  
 -      timer = list_entry((struct list_head *)v, struct k_itimer, list);
 +      timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
        notify = timer->it_sigev_notify;
  
        seq_printf(m, "ID: %d\n", timer->it_id);
@@@ -2626,11 -2569,10 +2626,11 @@@ static ssize_t timerslack_ns_write(stru
        }
  
        task_lock(p);
-       if (task_is_realtime(p))
 -      if (slack_ns == 0)
 -              p->timer_slack_ns = p->default_timer_slack_ns;
 -      else
 -              p->timer_slack_ns = slack_ns;
++      if (rt_or_dl_task_policy(p))
 +              slack_ns = 0;
 +      else if (slack_ns == 0)
 +              slack_ns = p->default_timer_slack_ns;
 +      p->timer_slack_ns = slack_ns;
        task_unlock(p);
  
  out:
@@@ -3928,12 -3870,12 +3928,12 @@@ static int proc_task_readdir(struct fil
        if (!dir_emit_dots(file, ctx))
                return 0;
  
 -      /* f_version caches the tgid value that the last readdir call couldn't
 -       * return. lseek aka telldir automagically resets f_version to 0.
 +      /* We cache the tgid value that the last readdir call couldn't
 +       * return and lseek resets it to 0.
         */
        ns = proc_pid_ns(inode->i_sb);
 -      tid = (int)file->f_version;
 -      file->f_version = 0;
 +      tid = (int)(intptr_t)file->private_data;
 +      file->private_data = NULL;
        for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
             task;
             task = next_tid(task), ctx->pos++) {
                                proc_task_instantiate, task, NULL)) {
                        /* returning this tgid failed, save it as the first
                         * pid for the next readdir call */
 -                      file->f_version = (u64)tid;
 +                      file->private_data = (void *)(intptr_t)tid;
                        put_task_struct(task);
                        break;
                }
@@@ -3973,24 -3915,6 +3973,24 @@@ static int proc_task_getattr(struct mnt
        return 0;
  }
  
 +/*
 + * proc_task_readdir() sets @file->private_data to a positive integer
 + * value, so casting that to u64 is safe. generic_llseek_cookie() will
 + * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
 + * here to catch any unexpected change in behavior either in
 + * proc_task_readdir() or generic_llseek_cookie().
 + */
 +static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
 +{
 +      u64 cookie = (u64)(intptr_t)file->private_data;
 +      loff_t off;
 +
 +      off = generic_llseek_cookie(file, offset, whence, &cookie);
 +      WARN_ON_ONCE(cookie > INT_MAX);
 +      file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
 +      return off;
 +}
 +
  static const struct inode_operations proc_task_inode_operations = {
        .lookup         = proc_task_lookup,
        .getattr        = proc_task_getattr,
  static const struct file_operations proc_task_operations = {
        .read           = generic_read_dir,
        .iterate_shared = proc_task_readdir,
 -      .llseek         = generic_file_llseek,
 +      .llseek         = proc_dir_llseek,
  };
  
  void __init set_proc_pid_nlink(void)
diff --combined include/linux/sched.h
index 3773c1c8f099a0653c978801871ef024bddaaf08,57cf27a3045c536a582d7ba9b53d443cb021a999..a1d0c7cab25c7d8e8f423fad9bb3aa6a6574130e
@@@ -149,8 -149,9 +149,9 @@@ struct user_event_mm
   * Special states are those that do not use the normal wait-loop pattern. See
   * the comment with set_special_state().
   */
- #define is_special_task_state(state)                          \
-       ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
+ #define is_special_task_state(state)                                  \
+       ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED |      \
+                   TASK_DEAD | TASK_FROZEN))
  
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  # define debug_normal_state_change(state_value)                               \
@@@ -541,9 -542,14 +542,14 @@@ struct sched_entity 
        struct rb_node                  run_node;
        u64                             deadline;
        u64                             min_vruntime;
+       u64                             min_slice;
  
        struct list_head                group_node;
-       unsigned int                    on_rq;
+       unsigned char                   on_rq;
+       unsigned char                   sched_delayed;
+       unsigned char                   rel_deadline;
+       unsigned char                   custom_slice;
+                                       /* hole */
  
        u64                             exec_start;
        u64                             sum_exec_runtime;
@@@ -639,12 -645,26 +645,26 @@@ struct sched_dl_entity 
         *
         * @dl_overrun tells if the task asked to be informed about runtime
         * overruns.
+        *
+        * @dl_server tells if this is a server entity.
+        *
+        * @dl_defer tells if this is a deferred or regular server. For
+        * now only defer server exists.
+        *
+        * @dl_defer_armed tells if the deferrable server is waiting
+        * for the replenishment timer to activate it.
+        *
+        * @dl_defer_running tells if the deferrable server is actually
+        * running, skipping the defer phase.
         */
        unsigned int                    dl_throttled      : 1;
        unsigned int                    dl_yielded        : 1;
        unsigned int                    dl_non_contending : 1;
        unsigned int                    dl_overrun        : 1;
        unsigned int                    dl_server         : 1;
+       unsigned int                    dl_defer          : 1;
+       unsigned int                    dl_defer_armed    : 1;
+       unsigned int                    dl_defer_running  : 1;
  
        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         */
        struct rq                       *rq;
        dl_server_has_tasks_f           server_has_tasks;
-       dl_server_pick_f                server_pick;
+       dl_server_pick_f                server_pick_task;
  
  #ifdef CONFIG_RT_MUTEXES
        /*
@@@ -1243,6 -1263,7 +1263,6 @@@ struct task_struct 
        /* Sequence number to catch updates: */
        seqcount_spinlock_t             mems_allowed_seq;
        int                             cpuset_mem_spread_rotor;
 -      int                             cpuset_slab_spread_rotor;
  #endif
  #ifdef CONFIG_CGROUPS
        /* Control Group info protected by css_set_lock: */
diff --combined kernel/locking/rtmutex.c
index fba1229f1de669d55b2f195f2a73e1a684299c7a,c2a530d704b420be77e7f8d78f9154f8c9ced52c..ebebd0eec7f636088548a7ff43b9b6174a9c1cfb
@@@ -347,7 -347,7 +347,7 @@@ static __always_inline int __waiter_pri
  {
        int prio = task->prio;
  
-       if (!rt_prio(prio))
+       if (!rt_or_dl_prio(prio))
                return DEFAULT_PRIO;
  
        return prio;
@@@ -435,7 -435,7 +435,7 @@@ static inline bool rt_mutex_steal(struc
         * Note that RT tasks are excluded from same priority (lateral)
         * steals to prevent the introduction of an unbounded latency.
         */
-       if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
+       if (rt_or_dl_prio(waiter->tree.prio))
                return false;
  
        return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
@@@ -1644,7 -1644,6 +1644,7 @@@ static int __sched rt_mutex_slowlock_bl
  }
  
  static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
 +                                           struct rt_mutex_base *lock,
                                             struct rt_mutex_waiter *w)
  {
        /*
        if (build_ww_mutex() && w->ww_ctx)
                return;
  
 -      /*
 -       * Yell loudly and stop the task right here.
 -       */
 +      raw_spin_unlock_irq(&lock->wait_lock);
 +
        WARN(1, "rtmutex deadlock detected\n");
 +
        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);
                rt_mutex_schedule();
@@@ -1714,7 -1713,7 +1714,7 @@@ static int __sched __rt_mutex_slowlock(
        } else {
                __set_current_state(TASK_RUNNING);
                remove_waiter(lock, waiter);
 -              rt_mutex_handle_deadlock(ret, chwalk, waiter);
 +              rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
        }
  
        /*
diff --combined kernel/sched/core.c
index 1d7f5941bcdced440653e69f54f013ea4269949c,b4c5d83e54d487e25d6f156b302a12a1b9b48bb2..a7af49b3a337b33fb96aee52eb2f5063466a02e9
@@@ -163,7 -163,10 +163,10 @@@ static inline int __task_prio(const str
        if (p->sched_class == &stop_sched_class) /* trumps deadline */
                return -2;
  
-       if (rt_prio(p->prio)) /* includes deadline */
+       if (p->dl_server)
+               return -1; /* deadline */
+       if (rt_or_dl_prio(p->prio))
                return p->prio; /* [-1, 99] */
  
        if (p->sched_class == &idle_sched_class)
@@@ -192,8 -195,24 +195,24 @@@ static inline bool prio_less(const stru
        if (-pb < -pa)
                return false;
  
-       if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
-               return !dl_time_before(a->dl.deadline, b->dl.deadline);
+       if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */
+               const struct sched_dl_entity *a_dl, *b_dl;
+               a_dl = &a->dl;
+               /*
+                * Since,'a' and 'b' can be CFS tasks served by DL server,
+                * __task_prio() can return -1 (for DL) even for those. In that
+                * case, get to the dl_server's DL entity.
+                */
+               if (a->dl_server)
+                       a_dl = a->dl_server;
+               b_dl = &b->dl;
+               if (b->dl_server)
+                       b_dl = b->dl_server;
+               return !dl_time_before(a_dl->deadline, b_dl->deadline);
+       }
  
        if (pa == MAX_RT_PRIO + MAX_NICE)       /* fair */
                return cfs_prio_less(a, b, in_fi);
@@@ -240,6 -259,9 +259,9 @@@ static inline int rb_sched_core_cmp(con
  
  void sched_core_enqueue(struct rq *rq, struct task_struct *p)
  {
+       if (p->se.sched_delayed)
+               return;
        rq->core->core_task_seq++;
  
        if (!p->core_cookie)
  
  void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
  {
+       if (p->se.sched_delayed)
+               return;
        rq->core->core_task_seq++;
  
        if (sched_core_enqueued(p)) {
@@@ -1269,7 -1294,7 +1294,7 @@@ bool sched_can_stop_tick(struct rq *rq
         * dequeued by migrating while the constrained task continues to run.
         * E.g. going from 2->1 without going through pick_next_task().
         */
-       if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
+       if (__need_bw_check(rq, rq->curr)) {
                if (cfs_task_bw_constrained(rq->curr))
                        return false;
        }
@@@ -1672,6 -1697,9 +1697,9 @@@ static inline void uclamp_rq_inc(struc
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
  
+       if (p->se.sched_delayed)
+               return;
        for_each_clamp_id(clamp_id)
                uclamp_rq_inc_id(rq, p, clamp_id);
  
@@@ -1696,6 -1724,9 +1724,9 @@@ static inline void uclamp_rq_dec(struc
        if (unlikely(!p->sched_class->uclamp_enabled))
                return;
  
+       if (p->se.sched_delayed)
+               return;
        for_each_clamp_id(clamp_id)
                uclamp_rq_dec_id(rq, p, clamp_id);
  }
@@@ -1975,14 -2006,21 +2006,21 @@@ void enqueue_task(struct rq *rq, struc
                psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
        }
  
-       uclamp_rq_inc(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
+       /*
+        * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
+        * ->sched_delayed.
+        */
+       uclamp_rq_inc(rq, p);
  
        if (sched_core_enabled(rq))
                sched_core_enqueue(rq, p);
  }
  
- void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+ /*
+  * Must only return false when DEQUEUE_SLEEP.
+  */
+ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        if (sched_core_enabled(rq))
                sched_core_dequeue(rq, p, flags);
                psi_dequeue(p, flags & DEQUEUE_SLEEP);
        }
  
+       /*
+        * Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
+        * and mark the task ->sched_delayed.
+        */
        uclamp_rq_dec(rq, p);
-       p->sched_class->dequeue_task(rq, p, flags);
+       return p->sched_class->dequeue_task(rq, p, flags);
  }
  
  void activate_task(struct rq *rq, struct task_struct *p, int flags)
  
  void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  {
-       WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+       SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+       WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
        ASSERT_EXCLUSIVE_WRITER(p->on_rq);
  
+       /*
+        * Code explicitly relies on TASK_ON_RQ_MIGRATING being set *before*
+        * dequeue_task() and cleared *after* enqueue_task().
+        */
        dequeue_task(rq, p, flags);
  }
  
+ static void block_task(struct rq *rq, struct task_struct *p, int flags)
+ {
+       if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags))
+               __block_task(rq, p);
+ }
  /**
   * task_curr - is this task currently executing on a CPU?
   * @p: the task in question.
@@@ -2233,6 -2288,12 +2288,12 @@@ void migrate_disable(void
        struct task_struct *p = current;
  
        if (p->migration_disabled) {
+ #ifdef CONFIG_DEBUG_PREEMPT
+               /*
+                * Warn about overflow half-way through the range.
+                */
+               WARN_ON_ONCE((s16)p->migration_disabled < 0);
+ #endif
                p->migration_disabled++;
                return;
        }
@@@ -2251,14 -2312,20 +2312,20 @@@ void migrate_enable(void
                .flags     = SCA_MIGRATE_ENABLE,
        };
  
+ #ifdef CONFIG_DEBUG_PREEMPT
+       /*
+        * Check both overflow from migrate_disable() and superfluous
+        * migrate_enable().
+        */
+       if (WARN_ON_ONCE((s16)p->migration_disabled <= 0))
+               return;
+ #endif
        if (p->migration_disabled > 1) {
                p->migration_disabled--;
                return;
        }
  
-       if (WARN_ON_ONCE(!p->migration_disabled))
-               return;
        /*
         * Ensure stop_task runs either before or after this, and that
         * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
@@@ -3607,8 -3674,6 +3674,6 @@@ ttwu_do_activate(struct rq *rq, struct 
                rq->idle_stamp = 0;
        }
  #endif
-       p->dl_server = NULL;
  }
  
  /*
@@@ -3644,12 -3709,14 +3709,14 @@@ static int ttwu_runnable(struct task_st
  
        rq = __task_rq_lock(p, &rf);
        if (task_on_rq_queued(p)) {
+               update_rq_clock(rq);
+               if (p->se.sched_delayed)
+                       enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
                if (!task_on_cpu(rq, p)) {
                        /*
                         * When on_rq && !on_cpu the task is preempted, see if
                         * it should preempt the task that is current now.
                         */
-                       update_rq_clock(rq);
                        wakeup_preempt(rq, p, wake_flags);
                }
                ttwu_do_wakeup(p);
@@@ -4029,11 -4096,16 +4096,16 @@@ int try_to_wake_up(struct task_struct *
                 * case the whole 'p->on_rq && ttwu_runnable()' case below
                 * without taking any locks.
                 *
+                * Specifically, given current runs ttwu() we must be before
+                * schedule()'s block_task(), as such this must not observe
+                * sched_delayed.
+                *
                 * In particular:
                 *  - we rely on Program-Order guarantees for all the ordering,
                 *  - we're serialized against set_special_state() by virtue of
                 *    it disabling IRQs (this allows not taking ->pi_lock).
                 */
+               SCHED_WARN_ON(p->se.sched_delayed);
                if (!ttwu_state_match(p, state, &success))
                        goto out;
  
@@@ -4322,9 -4394,11 +4394,11 @@@ static void __sched_fork(unsigned long 
        p->se.nr_migrations             = 0;
        p->se.vruntime                  = 0;
        p->se.vlag                      = 0;
-       p->se.slice                     = sysctl_sched_base_slice;
        INIT_LIST_HEAD(&p->se.group_node);
  
+       /* A delayed task cannot be in clone(). */
+       SCHED_WARN_ON(p->se.sched_delayed);
  #ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq                    = NULL;
  #endif
@@@ -4572,6 -4646,8 +4646,8 @@@ int sched_fork(unsigned long clone_flag
  
                p->prio = p->normal_prio = p->static_prio;
                set_load_weight(p, false);
+               p->se.custom_slice = 0;
+               p->se.slice = sysctl_sched_base_slice;
  
                /*
                 * We don't need the reset flag anymore after the fork. It has
@@@ -4686,7 -4762,7 +4762,7 @@@ void wake_up_new_task(struct task_struc
        update_rq_clock(rq);
        post_init_entity_util_avg(p);
  
-       activate_task(rq, p, ENQUEUE_NOCLOCK);
+       activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
        trace_sched_wakeup_new(p);
        wakeup_preempt(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
@@@ -5762,15 -5838,15 +5838,15 @@@ static inline void schedule_debug(struc
                preempt_count_set(PREEMPT_DISABLED);
        }
        rcu_sleep_check();
 -      SCHED_WARN_ON(ct_state() == CONTEXT_USER);
 +      SCHED_WARN_ON(ct_state() == CT_STATE_USER);
  
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));
  
        schedstat_inc(this_rq()->sched_count);
  }
  
- static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
-                                 struct rq_flags *rf)
+ static void prev_balance(struct rq *rq, struct task_struct *prev,
+                        struct rq_flags *rf)
  {
  #ifdef CONFIG_SMP
        const struct sched_class *class;
                        break;
        }
  #endif
-       put_prev_task(rq, prev);
  }
  
  /*
@@@ -5800,6 -5874,8 +5874,8 @@@ __pick_next_task(struct rq *rq, struct 
        const struct sched_class *class;
        struct task_struct *p;
  
+       rq->dl_server = NULL;
        /*
         * Optimization: we know that if all tasks are in the fair class we can
         * call that function directly, but only if the @prev task wasn't of a
  
                /* Assume the next prioritized class is idle_sched_class */
                if (!p) {
-                       put_prev_task(rq, prev);
-                       p = pick_next_task_idle(rq);
+                       p = pick_task_idle(rq);
+                       put_prev_set_next_task(rq, prev, p);
                }
  
-               /*
-                * This is the fast path; it cannot be a DL server pick;
-                * therefore even if @p == @prev, ->dl_server must be NULL.
-                */
-               if (p->dl_server)
-                       p->dl_server = NULL;
                return p;
        }
  
  restart:
-       put_prev_task_balance(rq, prev, rf);
-       /*
-        * We've updated @prev and no longer need the server link, clear it.
-        * Must be done before ->pick_next_task() because that can (re)set
-        * ->dl_server.
-        */
-       if (prev->dl_server)
-               prev->dl_server = NULL;
+       prev_balance(rq, prev, rf);
  
        for_each_class(class) {
-               p = class->pick_next_task(rq);
-               if (p)
-                       return p;
+               if (class->pick_next_task) {
+                       p = class->pick_next_task(rq, prev);
+                       if (p)
+                               return p;
+               } else {
+                       p = class->pick_task(rq);
+                       if (p) {
+                               put_prev_set_next_task(rq, prev, p);
+                               return p;
+                       }
+               }
        }
  
        BUG(); /* The idle class should always have a runnable task. */
@@@ -5873,6 -5942,8 +5942,8 @@@ static inline struct task_struct *pick_
        const struct sched_class *class;
        struct task_struct *p;
  
+       rq->dl_server = NULL;
        for_each_class(class) {
                p = class->pick_task(rq);
                if (p)
@@@ -5911,6 -5982,7 +5982,7 @@@ pick_next_task(struct rq *rq, struct ta
                 * another cpu during offline.
                 */
                rq->core_pick = NULL;
+               rq->core_dl_server = NULL;
                return __pick_next_task(rq, prev, rf);
        }
  
                WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
  
                next = rq->core_pick;
-               if (next != prev) {
-                       put_prev_task(rq, prev);
-                       set_next_task(rq, next);
-               }
+               rq->dl_server = rq->core_dl_server;
                rq->core_pick = NULL;
-               goto out;
+               rq->core_dl_server = NULL;
+               goto out_set_next;
        }
  
-       put_prev_task_balance(rq, prev, rf);
+       prev_balance(rq, prev, rf);
  
        smt_mask = cpu_smt_mask(cpu);
        need_sync = !!rq->core->core_cookie;
                next = pick_task(rq);
                if (!next->core_cookie) {
                        rq->core_pick = NULL;
+                       rq->core_dl_server = NULL;
                        /*
                         * For robustness, update the min_vruntime_fi for
                         * unconstrained picks as well.
                if (i != cpu && (rq_i != rq->core || !core_clock_updated))
                        update_rq_clock(rq_i);
  
-               p = rq_i->core_pick = pick_task(rq_i);
+               rq_i->core_pick = p = pick_task(rq_i);
+               rq_i->core_dl_server = rq_i->dl_server;
                if (!max || prio_less(max, p, fi_before))
                        max = p;
        }
                }
  
                rq_i->core_pick = p;
+               rq_i->core_dl_server = NULL;
  
                if (p == rq_i->idle) {
                        if (rq_i->nr_running) {
  
                if (i == cpu) {
                        rq_i->core_pick = NULL;
+                       rq_i->core_dl_server = NULL;
                        continue;
                }
  
  
                if (rq_i->curr == rq_i->core_pick) {
                        rq_i->core_pick = NULL;
+                       rq_i->core_dl_server = NULL;
                        continue;
                }
  
        }
  
  out_set_next:
-       set_next_task(rq, next);
- out:
+       put_prev_set_next_task(rq, prev, next);
        if (rq->core->core_forceidle_count && next == rq->idle)
                queue_core_balance(rq);
  
@@@ -6342,19 -6416,12 +6416,12 @@@ pick_next_task(struct rq *rq, struct ta
   * Constants for the sched_mode argument of __schedule().
   *
   * The mode argument allows RT enabled kernels to differentiate a
-  * preemption from blocking on an 'sleeping' spin/rwlock. Note that
-  * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
-  * optimize the AND operation out and just check for zero.
+  * preemption from blocking on an 'sleeping' spin/rwlock.
   */
- #define SM_NONE                       0x0
- #define SM_PREEMPT            0x1
- #define SM_RTLOCK_WAIT                0x2
- #ifndef CONFIG_PREEMPT_RT
- # define SM_MASK_PREEMPT      (~0U)
- #else
- # define SM_MASK_PREEMPT      SM_PREEMPT
- #endif
+ #define SM_IDLE                       (-1)
+ #define SM_NONE                       0
+ #define SM_PREEMPT            1
+ #define SM_RTLOCK_WAIT                2
  
  /*
   * __schedule() is the main scheduler function.
   *
   * WARNING: must be called with preemption disabled!
   */
- static void __sched notrace __schedule(unsigned int sched_mode)
+ static void __sched notrace __schedule(int sched_mode)
  {
        struct task_struct *prev, *next;
+       /*
+        * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+        * as a preemption by schedule_debug() and RCU.
+        */
+       bool preempt = sched_mode > SM_NONE;
        unsigned long *switch_count;
        unsigned long prev_state;
        struct rq_flags rf;
        rq = cpu_rq(cpu);
        prev = rq->curr;
  
-       schedule_debug(prev, !!sched_mode);
+       schedule_debug(prev, preempt);
  
        if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
                hrtick_clear(rq);
  
        local_irq_disable();
-       rcu_note_context_switch(!!sched_mode);
+       rcu_note_context_switch(preempt);
  
        /*
         * Make sure that signal_pending_state()->signal_pending() below
  
        switch_count = &prev->nivcsw;
  
+       /* Task state changes only considers SM_PREEMPT as preemption */
+       preempt = sched_mode == SM_PREEMPT;
        /*
         * We must load prev->state once (task_struct::state is volatile), such
         * that we form a control dependency vs deactivate_task() below.
         */
        prev_state = READ_ONCE(prev->__state);
-       if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+       if (sched_mode == SM_IDLE) {
+               if (!rq->nr_running) {
+                       next = prev;
+                       goto picked;
+               }
+       } else if (!preempt && prev_state) {
                if (signal_pending_state(prev_state, prev)) {
                        WRITE_ONCE(prev->__state, TASK_RUNNING);
                } else {
+                       int flags = DEQUEUE_NOCLOCK;
                        prev->sched_contributes_to_load =
                                (prev_state & TASK_UNINTERRUPTIBLE) &&
                                !(prev_state & TASK_NOLOAD) &&
                                !(prev_state & TASK_FROZEN);
  
-                       if (prev->sched_contributes_to_load)
-                               rq->nr_uninterruptible++;
+                       if (unlikely(is_special_task_state(prev_state)))
+                               flags |= DEQUEUE_SPECIAL;
  
                        /*
                         * __schedule()                 ttwu()
                         *
                         * After this, schedule() must not care about p->state any more.
                         */
-                       deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-                       if (prev->in_iowait) {
-                               atomic_inc(&rq->nr_iowait);
-                               delayacct_blkio_start();
-                       }
+                       block_task(rq, prev, flags);
                }
                switch_count = &prev->nvcsw;
        }
  
        next = pick_next_task(rq, prev, &rf);
+ picked:
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
  #ifdef CONFIG_SCHED_DEBUG
                psi_account_irqtime(rq, prev, next);
                psi_sched_switch(prev, next, !task_on_rq_queued(prev));
  
-               trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+               trace_sched_switch(preempt, prev, next, prev_state);
  
                /* Also unlocks the rq: */
                rq = context_switch(rq, prev, next, &rf);
@@@ -6599,7 -6677,7 +6677,7 @@@ static void sched_update_worker(struct 
        }
  }
  
- static __always_inline void __schedule_loop(unsigned int sched_mode)
+ static __always_inline void __schedule_loop(int sched_mode)
  {
        do {
                preempt_disable();
@@@ -6644,7 -6722,7 +6722,7 @@@ void __sched schedule_idle(void
         */
        WARN_ON_ONCE(current->__state);
        do {
-               __schedule(SM_NONE);
+               __schedule(SM_IDLE);
        } while (need_resched());
  }
  
@@@ -6658,7 -6736,7 +6736,7 @@@ asmlinkage __visible void __sched sched
         * we find a better solution.
         *
         * NB: There are buggy callers of this function.  Ideally we
 -       * should warn if prev_state != CONTEXT_USER, but that will trigger
 +       * should warn if prev_state != CT_STATE_USER, but that will trigger
         * too frequently to make sense yet.
         */
        enum ctx_state prev_state = exception_enter();
@@@ -8228,8 -8306,6 +8306,6 @@@ void __init sched_init(void
  #endif /* CONFIG_RT_GROUP_SCHED */
        }
  
-       init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
  #ifdef CONFIG_SMP
        init_defrootdomain();
  #endif
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
-               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
  #ifdef CONFIG_RT_GROUP_SCHED
+               /*
+                * This is required for init cpu because rt.c:__enable_runtime()
+                * starts working after scheduler_running, which is not the case
+                * yet.
+                */
+               rq->rt.rt_runtime = global_rt_runtime();
                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
  #endif
  #ifdef CONFIG_SMP
  #endif /* CONFIG_SMP */
                hrtick_rq_init(rq);
                atomic_set(&rq->nr_iowait, 0);
+               fair_server_init(rq);
  
  #ifdef CONFIG_SCHED_CORE
                rq->core = rq;
                rq->core_pick = NULL;
+               rq->core_dl_server = NULL;
                rq->core_enabled = 0;
                rq->core_tree = RB_ROOT;
                rq->core_forceidle_count = 0;
        }
  
        set_load_weight(&init_task, false);
+       init_task.se.slice = sysctl_sched_base_slice;
  
        /*
         * The boot idle thread does lazy MMU switching as well:
@@@ -8548,7 -8632,7 +8632,7 @@@ void normalize_rt_tasks(void
                schedstat_set(p->stats.sleep_start, 0);
                schedstat_set(p->stats.block_start, 0);
  
-               if (!dl_task(p) && !rt_task(p)) {
+               if (!rt_or_dl_task(p)) {
                        /*
                         * Renice negative nice level userspace
                         * tasks back to 0:
@@@ -9752,7 -9836,7 +9836,7 @@@ struct cgroup_subsys cpu_cgrp_subsys = 
  
  void dump_cpu_task(int cpu)
  {
 -      if (cpu == smp_processor_id() && in_hardirq()) {
 +      if (in_hardirq() && cpu == smp_processor_id()) {
                struct pt_regs *regs;
  
                regs = get_irq_regs();
diff --combined kernel/sched/fair.c
index 8dc9385f6da4d0ca86c11c4569fad31efcbee771,922d690316617309102cc238a928730160fc06b9..b9784e13e6b6ea9b6b748cbf646b094c492fdf75
@@@ -511,7 -511,7 +511,7 @@@ static int cfs_rq_is_idle(struct cfs_r
  
  static int se_is_idle(struct sched_entity *se)
  {
-       return 0;
+       return task_has_idle_policy(task_of(se));
  }
  
  #endif        /* CONFIG_FAIR_GROUP_SCHED */
@@@ -779,8 -779,22 +779,22 @@@ static void update_min_vruntime(struct 
        }
  
        /* ensure we never gain time by being placed backwards. */
-       u64_u32_store(cfs_rq->min_vruntime,
-                     __update_min_vruntime(cfs_rq, vruntime));
+       cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+ }
+ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
+ {
+       struct sched_entity *root = __pick_root_entity(cfs_rq);
+       struct sched_entity *curr = cfs_rq->curr;
+       u64 min_slice = ~0ULL;
+       if (curr && curr->on_rq)
+               min_slice = curr->slice;
+       if (root)
+               min_slice = min(min_slice, root->min_slice);
+       return min_slice;
  }
  
  static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@@ -799,19 -813,34 +813,34 @@@ static inline void __min_vruntime_updat
        }
  }
  
+ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
+ {
+       if (node) {
+               struct sched_entity *rse = __node_2_se(node);
+               if (rse->min_slice < se->min_slice)
+                       se->min_slice = rse->min_slice;
+       }
+ }
  /*
   * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
   */
  static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
  {
        u64 old_min_vruntime = se->min_vruntime;
+       u64 old_min_slice = se->min_slice;
        struct rb_node *node = &se->run_node;
  
        se->min_vruntime = se->vruntime;
        __min_vruntime_update(se, node->rb_right);
        __min_vruntime_update(se, node->rb_left);
  
-       return se->min_vruntime == old_min_vruntime;
+       se->min_slice = se->slice;
+       __min_slice_update(se, node->rb_right);
+       __min_slice_update(se, node->rb_left);
+       return se->min_vruntime == old_min_vruntime &&
+              se->min_slice == old_min_slice;
  }
  
  RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
@@@ -824,6 -853,7 +853,7 @@@ static void __enqueue_entity(struct cfs
  {
        avg_vruntime_add(cfs_rq, se);
        se->min_vruntime = se->vruntime;
+       se->min_slice = se->slice;
        rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
                                __entity_less, &min_vruntime_cb);
  }
@@@ -974,17 -1004,18 +1004,18 @@@ static void clear_buddies(struct cfs_r
   * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
   * this is probably good enough.
   */
- static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
        if ((s64)(se->vruntime - se->deadline) < 0)
-               return;
+               return false;
  
        /*
         * For EEVDF the virtual time slope is determined by w_i (iow.
         * nice) while the request time r_i is determined by
         * sysctl_sched_base_slice.
         */
-       se->slice = sysctl_sched_base_slice;
+       if (!se->custom_slice)
+               se->slice = sysctl_sched_base_slice;
  
        /*
         * EEVDF: vd_i = ve_i + r_i / w_i
        /*
         * The task has consumed its request, reschedule.
         */
-       if (cfs_rq->nr_running > 1) {
-               resched_curr(rq_of(cfs_rq));
-               clear_buddies(cfs_rq, se);
-       }
+       return true;
  }
  
  #include "pelt.h"
@@@ -1135,6 -1163,38 +1163,38 @@@ static inline void update_curr_task(str
                dl_server_update(p->dl_server, delta_exec);
  }
  
+ static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+ {
+       if (!sched_feat(PREEMPT_SHORT))
+               return false;
+       if (curr->vlag == curr->deadline)
+               return false;
+       return !entity_eligible(cfs_rq, curr);
+ }
+ static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
+                                   struct sched_entity *pse, struct sched_entity *se)
+ {
+       if (!sched_feat(PREEMPT_SHORT))
+               return false;
+       if (pse->slice >= se->slice)
+               return false;
+       if (!entity_eligible(cfs_rq, pse))
+               return false;
+       if (entity_before(pse, se))
+               return true;
+       if (!entity_eligible(cfs_rq, se))
+               return true;
+       return false;
+ }
  /*
   * Used by other classes to account runtime.
   */
@@@ -1156,23 -1216,44 +1216,44 @@@ s64 update_curr_common(struct rq *rq
  static void update_curr(struct cfs_rq *cfs_rq)
  {
        struct sched_entity *curr = cfs_rq->curr;
+       struct rq *rq = rq_of(cfs_rq);
        s64 delta_exec;
+       bool resched;
  
        if (unlikely(!curr))
                return;
  
-       delta_exec = update_curr_se(rq_of(cfs_rq), curr);
+       delta_exec = update_curr_se(rq, curr);
        if (unlikely(delta_exec <= 0))
                return;
  
        curr->vruntime += calc_delta_fair(delta_exec, curr);
-       update_deadline(cfs_rq, curr);
+       resched = update_deadline(cfs_rq, curr);
        update_min_vruntime(cfs_rq);
  
-       if (entity_is_task(curr))
-               update_curr_task(task_of(curr), delta_exec);
+       if (entity_is_task(curr)) {
+               struct task_struct *p = task_of(curr);
+               update_curr_task(p, delta_exec);
+               /*
+                * Any fair task that runs outside of fair_server should
+                * account against fair_server such that it can account for
+                * this time and possibly avoid running this period.
+                */
+               if (p->dl_server != &rq->fair_server)
+                       dl_server_update(&rq->fair_server, delta_exec);
+       }
  
        account_cfs_rq_runtime(cfs_rq, delta_exec);
+       if (rq->nr_running == 1)
+               return;
+       if (resched || did_preempt_short(cfs_rq, curr)) {
+               resched_curr(rq);
+               clear_buddies(cfs_rq, curr);
+       }
  }
  
  static void update_curr_fair(struct rq *rq)
@@@ -5178,7 -5259,8 +5259,8 @@@ place_entity(struct cfs_rq *cfs_rq, str
        u64 vslice, vruntime = avg_vruntime(cfs_rq);
        s64 lag = 0;
  
-       se->slice = sysctl_sched_base_slice;
+       if (!se->custom_slice)
+               se->slice = sysctl_sched_base_slice;
        vslice = calc_delta_fair(se->slice, se);
  
        /*
  
        se->vruntime = vruntime - lag;
  
+       if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+               se->deadline += se->vruntime;
+               se->rel_deadline = 0;
+               return;
+       }
        /*
         * When joining the competition; the existing tasks will be,
         * on average, halfway through their slice, as such start tasks
@@@ -5278,6 -5366,9 +5366,9 @@@ static inline int cfs_rq_throttled(stru
  
  static inline bool cfs_bandwidth_used(void);
  
+ static void
+ requeue_delayed_entity(struct sched_entity *se);
  static void
  enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
@@@ -5365,19 -5456,47 +5456,47 @@@ static void clear_buddies(struct cfs_r
  
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
  
- static void
+ static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
+ {
+       se->sched_delayed = 0;
+       if (sched_feat(DELAY_ZERO) && se->vlag > 0)
+               se->vlag = 0;
+ }
+ static bool
  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  {
-       int action = UPDATE_TG;
+       bool sleep = flags & DEQUEUE_SLEEP;
  
+       update_curr(cfs_rq);
+       if (flags & DEQUEUE_DELAYED) {
+               SCHED_WARN_ON(!se->sched_delayed);
+       } else {
+               bool delay = sleep;
+               /*
+                * DELAY_DEQUEUE relies on spurious wakeups, special task
+                * states must not suffer spurious wakeups, exempt them.
+                */
+               if (flags & DEQUEUE_SPECIAL)
+                       delay = false;
+               SCHED_WARN_ON(delay && se->sched_delayed);
+               if (sched_feat(DELAY_DEQUEUE) && delay &&
+                   !entity_eligible(cfs_rq, se)) {
+                       if (cfs_rq->next == se)
+                               cfs_rq->next = NULL;
+                       update_load_avg(cfs_rq, se, 0);
+                       se->sched_delayed = 1;
+                       return false;
+               }
+       }
+       int action = UPDATE_TG;
        if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
                action |= DO_DETACH;
  
-       /*
-        * Update run-time statistics of the 'current'.
-        */
-       update_curr(cfs_rq);
        /*
         * When dequeuing a sched_entity, we must:
         *   - Update loads to have both entity and cfs_rq synced with now.
        clear_buddies(cfs_rq, se);
  
        update_entity_lag(cfs_rq, se);
+       if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
+               se->deadline -= se->vruntime;
+               se->rel_deadline = 1;
+       }
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
        se->on_rq = 0;
        if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
                update_min_vruntime(cfs_rq);
  
+       if (flags & DEQUEUE_DELAYED)
+               finish_delayed_dequeue_entity(se);
        if (cfs_rq->nr_running == 0)
                update_idle_cfs_rq_clock_pelt(cfs_rq);
+       return true;
  }
  
  static void
@@@ -5441,6 -5570,7 +5570,7 @@@ set_next_entity(struct cfs_rq *cfs_rq, 
        }
  
        update_stats_curr_start(cfs_rq, se);
+       SCHED_WARN_ON(cfs_rq->curr);
        cfs_rq->curr = se;
  
        /*
        se->prev_sum_exec_runtime = se->sum_exec_runtime;
  }
  
+ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
  /*
   * Pick the next process, keeping these things in mind, in this order:
   * 1) keep things fair between processes/task groups
   * 4) do not run the "skip" process, if something else is available
   */
  static struct sched_entity *
- pick_next_entity(struct cfs_rq *cfs_rq)
+ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
  {
        /*
         * Enabling NEXT_BUDDY will affect latency but not fairness.
         */
        if (sched_feat(NEXT_BUDDY) &&
-           cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+           cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
+               /* ->next will never be delayed */
+               SCHED_WARN_ON(cfs_rq->next->sched_delayed);
                return cfs_rq->next;
+       }
  
-       return pick_eevdf(cfs_rq);
+       struct sched_entity *se = pick_eevdf(cfs_rq);
+       if (se->sched_delayed) {
+               dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+               SCHED_WARN_ON(se->sched_delayed);
+               SCHED_WARN_ON(se->on_rq);
+               return NULL;
+       }
+       return se;
  }
  
  static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@@ -5502,6 -5644,7 +5644,7 @@@ static void put_prev_entity(struct cfs_
                /* in !on_rq case, update occurred at dequeue */
                update_load_avg(cfs_rq, prev, 0);
        }
+       SCHED_WARN_ON(cfs_rq->curr != prev);
        cfs_rq->curr = NULL;
  }
  
@@@ -5765,6 -5908,7 +5908,7 @@@ static bool throttle_cfs_rq(struct cfs_
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        long task_delta, idle_task_delta, dequeue = 1;
+       long rq_h_nr_running = rq->cfs.h_nr_running;
  
        raw_spin_lock(&cfs_b->lock);
        /* This will start the period timer if necessary */
        idle_task_delta = cfs_rq->idle_h_nr_running;
        for_each_sched_entity(se) {
                struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+               int flags;
                /* throttled entity or throttle-on-deactivate */
                if (!se->on_rq)
                        goto done;
  
-               dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+               /*
+                * Abuse SPECIAL to avoid delayed dequeue in this instance.
+                * This avoids teaching dequeue_entities() about throttled
+                * entities and keeps things relatively simple.
+                */
+               flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
+               if (se->sched_delayed)
+                       flags |= DEQUEUE_DELAYED;
+               dequeue_entity(qcfs_rq, se, flags);
  
                if (cfs_rq_is_idle(group_cfs_rq(se)))
                        idle_task_delta = cfs_rq->h_nr_running;
        /* At this point se is NULL and we are at root level*/
        sub_nr_running(rq, task_delta);
  
+       /* Stop the fair server if throttling resulted in no runnable tasks */
+       if (rq_h_nr_running && !rq->cfs.h_nr_running)
+               dl_server_stop(&rq->fair_server);
  done:
        /*
         * Note: distribution will already see us throttled via the
@@@ -5854,6 -6011,7 +6011,7 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
        struct sched_entity *se;
        long task_delta, idle_task_delta;
+       long rq_h_nr_running = rq->cfs.h_nr_running;
  
        se = cfs_rq->tg->se[cpu_of(rq)];
  
        for_each_sched_entity(se) {
                struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
-               if (se->on_rq)
+               if (se->on_rq) {
+                       SCHED_WARN_ON(se->sched_delayed);
                        break;
+               }
                enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
  
                if (cfs_rq_is_idle(group_cfs_rq(se)))
                        goto unthrottle_throttle;
        }
  
+       /* Start the fair server if un-throttling resulted in new runnable tasks */
+       if (!rq_h_nr_running && rq->cfs.h_nr_running)
+               dl_server_start(&rq->fair_server);
        /* At this point se is NULL and we are at root level*/
        add_nr_running(rq, task_delta);
  
@@@ -6555,7 -6719,7 +6719,7 @@@ static void sched_fair_update_stop_tick
  {
        int cpu = cpu_of(rq);
  
-       if (!sched_feat(HZ_BW) || !cfs_bandwidth_used())
+       if (!cfs_bandwidth_used())
                return;
  
        if (!tick_nohz_full_cpu(cpu))
@@@ -6738,6 -6902,37 +6902,37 @@@ static int sched_idle_cpu(int cpu
  }
  #endif
  
+ static void
+ requeue_delayed_entity(struct sched_entity *se)
+ {
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       /*
+        * se->sched_delayed should imply: se->on_rq == 1.
+        * Because a delayed entity is one that is still on
+        * the runqueue competing until eligibility.
+        */
+       SCHED_WARN_ON(!se->sched_delayed);
+       SCHED_WARN_ON(!se->on_rq);
+       if (sched_feat(DELAY_ZERO)) {
+               update_entity_lag(cfs_rq, se);
+               if (se->vlag > 0) {
+                       cfs_rq->nr_running--;
+                       if (se != cfs_rq->curr)
+                               __dequeue_entity(cfs_rq, se);
+                       se->vlag = 0;
+                       place_entity(cfs_rq, se, 0);
+                       if (se != cfs_rq->curr)
+                               __enqueue_entity(cfs_rq, se);
+                       cfs_rq->nr_running++;
+               }
+       }
+       update_load_avg(cfs_rq, se, 0);
+       se->sched_delayed = 0;
+ }
  /*
   * The enqueue_task method is called before nr_running is
   * increased. Here we update the fair scheduling stats and
@@@ -6750,6 -6945,8 +6945,8 @@@ enqueue_task_fair(struct rq *rq, struc
        struct sched_entity *se = &p->se;
        int idle_h_nr_running = task_has_idle_policy(p);
        int task_new = !(flags & ENQUEUE_WAKEUP);
+       int rq_h_nr_running = rq->cfs.h_nr_running;
+       u64 slice = 0;
  
        /*
         * The code below (indirectly) updates schedutil which looks at
         * Let's add the task's estimated utilization to the cfs_rq's
         * estimated utilization, before we update schedutil.
         */
-       util_est_enqueue(&rq->cfs, p);
+       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+               util_est_enqueue(&rq->cfs, p);
+       if (flags & ENQUEUE_DELAYED) {
+               requeue_delayed_entity(se);
+               return;
+       }
  
        /*
         * If in_iowait is set, the code below may not trigger any cpufreq
                cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
  
        for_each_sched_entity(se) {
-               if (se->on_rq)
+               if (se->on_rq) {
+                       if (se->sched_delayed)
+                               requeue_delayed_entity(se);
                        break;
+               }
                cfs_rq = cfs_rq_of(se);
+               /*
+                * Basically set the slice of group entities to the min_slice of
+                * their respective cfs_rq. This ensures the group can service
+                * its entities in the desired time-frame.
+                */
+               if (slice) {
+                       se->slice = slice;
+                       se->custom_slice = 1;
+               }
                enqueue_entity(cfs_rq, se, flags);
+               slice = cfs_rq_min_slice(cfs_rq);
  
                cfs_rq->h_nr_running++;
                cfs_rq->idle_h_nr_running += idle_h_nr_running;
                se_update_runnable(se);
                update_cfs_group(se);
  
+               se->slice = slice;
+               slice = cfs_rq_min_slice(cfs_rq);
                cfs_rq->h_nr_running++;
                cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
                        goto enqueue_throttle;
        }
  
+       if (!rq_h_nr_running && rq->cfs.h_nr_running) {
+               /* Account for idle runtime */
+               if (!rq->nr_running)
+                       dl_server_update_idle_time(rq, rq->curr);
+               dl_server_start(&rq->fair_server);
+       }
        /* At this point se is NULL and we are at root level */
        add_nr_running(rq, 1);
  
@@@ -6833,36 -7060,59 +7060,59 @@@ enqueue_throttle
  static void set_next_buddy(struct sched_entity *se);
  
  /*
-  * The dequeue_task method is called before nr_running is
-  * decreased. We remove the task from the rbtree and
-  * update the fair scheduling stats:
+  * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
+  * failing half-way through and resume the dequeue later.
+  *
+  * Returns:
+  * -1 - dequeue delayed
+  *  0 - dequeue throttled
+  *  1 - dequeue complete
   */
- static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  {
-       struct cfs_rq *cfs_rq;
-       struct sched_entity *se = &p->se;
-       int task_sleep = flags & DEQUEUE_SLEEP;
-       int idle_h_nr_running = task_has_idle_policy(p);
        bool was_sched_idle = sched_idle_rq(rq);
+       int rq_h_nr_running = rq->cfs.h_nr_running;
+       bool task_sleep = flags & DEQUEUE_SLEEP;
+       bool task_delayed = flags & DEQUEUE_DELAYED;
+       struct task_struct *p = NULL;
+       int idle_h_nr_running = 0;
+       int h_nr_running = 0;
+       struct cfs_rq *cfs_rq;
+       u64 slice = 0;
  
-       util_est_dequeue(&rq->cfs, p);
+       if (entity_is_task(se)) {
+               p = task_of(se);
+               h_nr_running = 1;
+               idle_h_nr_running = task_has_idle_policy(p);
+       } else {
+               cfs_rq = group_cfs_rq(se);
+               slice = cfs_rq_min_slice(cfs_rq);
+       }
  
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
-               dequeue_entity(cfs_rq, se, flags);
  
-               cfs_rq->h_nr_running--;
+               if (!dequeue_entity(cfs_rq, se, flags)) {
+                       if (p && &p->se == se)
+                               return -1;
+                       break;
+               }
+               cfs_rq->h_nr_running -= h_nr_running;
                cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
                if (cfs_rq_is_idle(cfs_rq))
-                       idle_h_nr_running = 1;
+                       idle_h_nr_running = h_nr_running;
  
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
-                       goto dequeue_throttle;
+                       return 0;
  
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
+                       slice = cfs_rq_min_slice(cfs_rq);
                        /* Avoid re-evaluating load for this entity: */
                        se = parent_entity(se);
                        /*
                        break;
                }
                flags |= DEQUEUE_SLEEP;
+               flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
        }
  
        for_each_sched_entity(se) {
                se_update_runnable(se);
                update_cfs_group(se);
  
-               cfs_rq->h_nr_running--;
+               se->slice = slice;
+               slice = cfs_rq_min_slice(cfs_rq);
+               cfs_rq->h_nr_running -= h_nr_running;
                cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
                if (cfs_rq_is_idle(cfs_rq))
-                       idle_h_nr_running = 1;
+                       idle_h_nr_running = h_nr_running;
  
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
-                       goto dequeue_throttle;
+                       return 0;
        }
  
-       /* At this point se is NULL and we are at root level*/
-       sub_nr_running(rq, 1);
+       sub_nr_running(rq, h_nr_running);
+       if (rq_h_nr_running && !rq->cfs.h_nr_running)
+               dl_server_stop(&rq->fair_server);
  
        /* balance early to pull high priority tasks */
        if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
                rq->next_balance = jiffies;
  
- dequeue_throttle:
-       util_est_update(&rq->cfs, p, task_sleep);
+       if (p && task_delayed) {
+               SCHED_WARN_ON(!task_sleep);
+               SCHED_WARN_ON(p->on_rq != 1);
+               /* Fix-up what dequeue_task_fair() skipped */
+               hrtick_update(rq);
+               /* Fix-up what block_task() skipped. */
+               __block_task(rq, p);
+       }
+       return 1;
+ }
+ /*
+  * The dequeue_task method is called before nr_running is
+  * decreased. We remove the task from the rbtree and
+  * update the fair scheduling stats:
+  */
+ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ {
+       if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+               util_est_dequeue(&rq->cfs, p);
+       if (dequeue_entities(rq, &p->se, flags) < 0) {
+               util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
+               return false;
+       }
+       util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
        hrtick_update(rq);
+       return true;
  }
  
  #ifdef CONFIG_SMP
@@@ -7802,6 -8086,105 +8086,105 @@@ static unsigned long cpu_util_without(i
        return cpu_util(cpu, p, -1, 0);
  }
  
+ /*
+  * This function computes an effective utilization for the given CPU, to be
+  * used for frequency selection given the linear relation: f = u * f_max.
+  *
+  * The scheduler tracks the following metrics:
+  *
+  *   cpu_util_{cfs,rt,dl,irq}()
+  *   cpu_bw_dl()
+  *
+  * Where the cfs,rt and dl util numbers are tracked with the same metric and
+  * synchronized windows and are thus directly comparable.
+  *
+  * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+  * which excludes things like IRQ and steal-time. These latter are then accrued
+  * in the IRQ utilization.
+  *
+  * The DL bandwidth number OTOH is not a measured metric but a value computed
+  * based on the task model parameters and gives the minimal utilization
+  * required to meet deadlines.
+  */
+ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+                                unsigned long *min,
+                                unsigned long *max)
+ {
+       unsigned long util, irq, scale;
+       struct rq *rq = cpu_rq(cpu);
+       scale = arch_scale_cpu_capacity(cpu);
+       /*
+        * Early check to see if IRQ/steal time saturates the CPU, can be
+        * because of inaccuracies in how we track these -- see
+        * update_irq_load_avg().
+        */
+       irq = cpu_util_irq(rq);
+       if (unlikely(irq >= scale)) {
+               if (min)
+                       *min = scale;
+               if (max)
+                       *max = scale;
+               return scale;
+       }
+       if (min) {
+               /*
+                * The minimum utilization returns the highest level between:
+                * - the computed DL bandwidth needed with the IRQ pressure which
+                *   steals time from the deadline task.
+                * - The minimum performance requirement for CFS and/or RT.
+                */
+               *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+               /*
+                * When an RT task is runnable and uclamp is not used, we must
+                * ensure that the task will run at maximum compute capacity.
+                */
+               if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+                       *min = max(*min, scale);
+       }
+       /*
+        * Because the time spent on RT/DL tasks is visible as 'lost' time to
+        * CFS tasks and we use the same metric to track the effective
+        * utilization (PELT windows are synchronized) we can directly add them
+        * to obtain the CPU's actual utilization.
+        */
+       util = util_cfs + cpu_util_rt(rq);
+       util += cpu_util_dl(rq);
+       /*
+        * The maximum hint is a soft bandwidth requirement, which can be lower
+        * than the actual utilization because of uclamp_max requirements.
+        */
+       if (max)
+               *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+       if (util >= scale)
+               return scale;
+       /*
+        * There is still idle time; further improve the number by using the
+        * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+        * need to scale the task numbers:
+        *
+        *              max - irq
+        *   U' = irq + --------- * U
+        *                 max
+        */
+       util = scale_irq_capacity(util, irq, scale);
+       util += irq;
+       return min(scale, util);
+ }
+ unsigned long sched_cpu_util(int cpu)
+ {
+       return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+ }
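
As a worked example of the scaling step documented above, with made-up numbers: scale = 1024, irq = 128 and a combined CFS+RT+DL utilization of 512 give U' = 128 + 512 * (1024 - 128) / 1024 = 576. A stand-alone sketch of the same arithmetic:

/* Stand-alone illustration of the U' = irq + (max - irq)/max * U step. */
#include <stdio.h>

static unsigned long toy_scale_irq_capacity(unsigned long util,
                                            unsigned long irq,
                                            unsigned long max)
{
        util *= (max - irq);
        util /= max;
        return util;
}

int main(void)
{
        unsigned long scale = 1024, irq = 128, util = 512; /* made-up values */

        util = toy_scale_irq_capacity(util, irq, scale) + irq;
        if (util > scale)
                util = scale;

        printf("effective utilization: %lu of %lu\n", util, scale);
        return 0;
}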
  /*
   * energy_env - Utilization landscape for energy estimation.
   * @task_busy_time: Utilization contribution by the task for which we test the
@@@ -8286,7 -8669,21 +8669,21 @@@ static void migrate_task_rq_fair(struc
  
  static void task_dead_fair(struct task_struct *p)
  {
-       remove_entity_load_avg(&p->se);
+       struct sched_entity *se = &p->se;
+       if (se->sched_delayed) {
+               struct rq_flags rf;
+               struct rq *rq;
+               rq = task_rq_lock(p, &rf);
+               if (se->sched_delayed) {
+                       update_rq_clock(rq);
+                       dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+               }
+               task_rq_unlock(rq, p, &rf);
+       }
+       remove_entity_load_avg(se);
  }
  
  /*
@@@ -8322,7 -8719,7 +8719,7 @@@ static void set_cpus_allowed_fair(struc
  static int
  balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
-       if (rq->nr_running)
+       if (sched_fair_runnable(rq))
                return 1;
  
        return sched_balance_newidle(rq, rf) != 0;
@@@ -8381,16 -8778,7 +8778,7 @@@ static void check_preempt_wakeup_fair(s
        if (test_tsk_need_resched(curr))
                return;
  
-       /* Idle tasks are by definition preempted by non-idle tasks. */
-       if (unlikely(task_has_idle_policy(curr)) &&
-           likely(!task_has_idle_policy(p)))
-               goto preempt;
-       /*
-        * Batch and idle tasks do not preempt non-idle tasks (their preemption
-        * is driven by the tick):
-        */
-       if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
+       if (!sched_feat(WAKEUP_PREEMPTION))
                return;
  
        find_matching_se(&se, &pse);
        pse_is_idle = se_is_idle(pse);
  
        /*
-        * Preempt an idle group in favor of a non-idle group (and don't preempt
+        * Preempt an idle entity in favor of a non-idle entity (and don't preempt
         * in the inverse case).
         */
        if (cse_is_idle && !pse_is_idle)
        if (cse_is_idle != pse_is_idle)
                return;
  
+       /*
+        * BATCH and IDLE tasks do not preempt others.
+        */
+       if (unlikely(p->policy != SCHED_NORMAL))
+               return;
        cfs_rq = cfs_rq_of(se);
        update_curr(cfs_rq);
+       /*
+        * If @p has a shorter slice than current and @p is eligible, override
+        * current's slice protection in order to allow preemption.
+        *
+        * Note that even if @p does not turn out to be the most eligible
+        * task at this moment, current's slice protection will be lost.
+        */
+       if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
+               se->vlag = se->deadline + 1;
  
        /*
-        * XXX pick_eevdf(cfs_rq) != se ?
+        * If @p has become the most eligible task, force preemption.
         */
        if (pick_eevdf(cfs_rq) == pse)
                goto preempt;
@@@ -8423,7 -8826,6 +8826,6 @@@ preempt
        resched_curr(rq);
  }
  
- #ifdef CONFIG_SMP
  static struct task_struct *pick_task_fair(struct rq *rq)
  {
        struct sched_entity *se;
@@@ -8435,95 -8837,58 +8837,58 @@@ again
                return NULL;
  
        do {
-               struct sched_entity *curr = cfs_rq->curr;
-               /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
-               if (curr) {
-                       if (curr->on_rq)
-                               update_curr(cfs_rq);
-                       else
-                               curr = NULL;
+               /* Might not have done put_prev_entity() */
+               if (cfs_rq->curr && cfs_rq->curr->on_rq)
+                       update_curr(cfs_rq);
  
-                       if (unlikely(check_cfs_rq_runtime(cfs_rq)))
-                               goto again;
-               }
+               if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+                       goto again;
  
-               se = pick_next_entity(cfs_rq);
+               se = pick_next_entity(rq, cfs_rq);
+               if (!se)
+                       goto again;
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
  
        return task_of(se);
  }
- #endif
+ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
+ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
  
  struct task_struct *
  pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
-       struct cfs_rq *cfs_rq = &rq->cfs;
        struct sched_entity *se;
        struct task_struct *p;
        int new_tasks;
  
  again:
-       if (!sched_fair_runnable(rq))
+       p = pick_task_fair(rq);
+       if (!p)
                goto idle;
+       se = &p->se;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-       if (!prev || prev->sched_class != &fair_sched_class)
+       if (prev->sched_class != &fair_sched_class)
                goto simple;
  
+       __put_prev_set_next_dl_server(rq, prev, p);
        /*
         * Because of the set_next_buddy() in dequeue_task_fair() it is rather
         * likely that a next task is from the same cgroup as the current.
         *
         * Therefore attempt to avoid putting and setting the entire cgroup
         * hierarchy, only change the part that actually changes.
-        */
-       do {
-               struct sched_entity *curr = cfs_rq->curr;
-               /*
-                * Since we got here without doing put_prev_entity() we also
-                * have to consider cfs_rq->curr. If it is still a runnable
-                * entity, update_curr() will update its vruntime, otherwise
-                * forget we've ever seen it.
-                */
-               if (curr) {
-                       if (curr->on_rq)
-                               update_curr(cfs_rq);
-                       else
-                               curr = NULL;
-                       /*
-                        * This call to check_cfs_rq_runtime() will do the
-                        * throttle and dequeue its entity in the parent(s).
-                        * Therefore the nr_running test will indeed
-                        * be correct.
-                        */
-                       if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
-                               cfs_rq = &rq->cfs;
-                               if (!cfs_rq->nr_running)
-                                       goto idle;
-                               goto simple;
-                       }
-               }
-               se = pick_next_entity(cfs_rq);
-               cfs_rq = group_cfs_rq(se);
-       } while (cfs_rq);
-       p = task_of(se);
-       /*
+        *
         * Since we haven't yet done put_prev_entity and if the selected task
         * is a different task than we started out with, try and touch the
         * least amount of cfs_rqs.
         */
        if (prev != p) {
                struct sched_entity *pse = &prev->se;
+               struct cfs_rq *cfs_rq;
  
                while (!(cfs_rq = is_same_group(se, pse))) {
                        int se_depth = se->depth;
  
                put_prev_entity(cfs_rq, pse);
                set_next_entity(cfs_rq, se);
-       }
-       goto done;
- simple:
- #endif
-       if (prev)
-               put_prev_task(rq, prev);
  
-       do {
-               se = pick_next_entity(cfs_rq);
-               set_next_entity(cfs_rq, se);
-               cfs_rq = group_cfs_rq(se);
-       } while (cfs_rq);
+               __set_next_task_fair(rq, p, true);
+       }
  
-       p = task_of(se);
+       return p;
  
- done: __maybe_unused;
- #ifdef CONFIG_SMP
-       /*
-        * Move the next running task to the front of
-        * the list, so our cfs_tasks list becomes MRU
-        * one.
-        */
-       list_move(&p->se.group_node, &rq->cfs_tasks);
+ simple:
  #endif
-       if (hrtick_enabled_fair(rq))
-               hrtick_start_fair(rq, p);
-       update_misfit_status(p, rq);
-       sched_fair_update_stop_tick(rq, p);
+       put_prev_set_next_task(rq, prev, p);
        return p;
  
  idle:
        return NULL;
  }
  
- static struct task_struct *__pick_next_task_fair(struct rq *rq)
+ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
  {
-       return pick_next_task_fair(rq, NULL, NULL);
+       return pick_next_task_fair(rq, prev, NULL);
+ }
+ static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
+ {
+       return !!dl_se->rq->cfs.nr_running;
+ }
+ static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
+ {
+       return pick_task_fair(dl_se->rq);
+ }
+ void fair_server_init(struct rq *rq)
+ {
+       struct sched_dl_entity *dl_se = &rq->fair_server;
+       init_dl_entity(dl_se);
+       dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
  }
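
fair_server_init() attaches a per-CPU deadline entity whose job is to run fair tasks via the two callbacks above. The series also exposes per-CPU knobs for this server through debugfs; the path and unit below are assumptions based on this patch set (they may be absent or located elsewhere on a given build), so treat the sketch as illustrative only:

/* Read the (assumed) fair-server runtime knob for CPU 0.
 * Path is an assumption about this series' debugfs interface; adjust or
 * remove if your kernel exposes it elsewhere. Requires debugfs mounted. */
#include <stdio.h>

int main(void)
{
        const char *path = "/sys/kernel/debug/sched/fair_server/cpu0/runtime";
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("cpu0 fair server runtime: %s", buf);
        fclose(f);
        return 0;
}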
  
  /*
   * Account for a descheduled task:
   */
- static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
  {
        struct sched_entity *se = &prev->se;
        struct cfs_rq *cfs_rq;
@@@ -9360,9 -9721,10 +9721,10 @@@ static bool __update_blocked_others(str
  
        hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
  
+       /* hw_pressure doesn't care about invariance */
        decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
                  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
-                 update_hw_load_avg(now, rq, hw_pressure) |
+                 update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
                  update_irq_load_avg(rq, 0);
  
        if (others_have_blocked(rq))
   * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
   *   through the SMP cross-call nohz_csd_func()
   */
 -static __latent_entropy void sched_balance_softirq(struct softirq_action *h)
 +static __latent_entropy void sched_balance_softirq(void)
  {
        struct rq *this_rq = this_rq();
        enum cpu_idle_type idle = this_rq->idle_balance;
@@@ -12702,22 -13064,7 +13064,7 @@@ static void task_tick_fair(struct rq *r
   */
  static void task_fork_fair(struct task_struct *p)
  {
-       struct sched_entity *se = &p->se, *curr;
-       struct cfs_rq *cfs_rq;
-       struct rq *rq = this_rq();
-       struct rq_flags rf;
-       rq_lock(rq, &rf);
-       update_rq_clock(rq);
        set_task_max_allowed_capacity(p);
-       cfs_rq = task_cfs_rq(current);
-       curr = cfs_rq->curr;
-       if (curr)
-               update_curr(cfs_rq);
-       place_entity(cfs_rq, se, ENQUEUE_INITIAL);
-       rq_unlock(rq, &rf);
  }
  
  /*
@@@ -12829,10 -13176,28 +13176,28 @@@ static void attach_task_cfs_rq(struct t
  static void switched_from_fair(struct rq *rq, struct task_struct *p)
  {
        detach_task_cfs_rq(p);
+       /*
+        * Since this is called after changing class, this is a little weird
+        * and we cannot use DEQUEUE_DELAYED.
+        */
+       if (p->se.sched_delayed) {
+               /* First, dequeue it from its new class' structures */
+               dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
+               /*
+                * Now, clean up the fair_sched_class side of things
+                * related to sched_delayed being true and that wasn't done
+                * due to the generic dequeue not using DEQUEUE_DELAYED.
+                */
+               finish_delayed_dequeue_entity(&p->se);
+               p->se.rel_deadline = 0;
+               __block_task(rq, p);
+       }
  }
  
  static void switched_to_fair(struct rq *rq, struct task_struct *p)
  {
+       SCHED_WARN_ON(p->se.sched_delayed);
        attach_task_cfs_rq(p);
  
        set_task_max_allowed_capacity(p);
        }
  }
  
- /* Account for a task changing its policy or group.
-  *
-  * This routine is mostly called to set cfs_rq->curr field when a task
-  * migrates between groups/classes.
-  */
- static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
  {
        struct sched_entity *se = &p->se;
  
                list_move(&se->group_node, &rq->cfs_tasks);
        }
  #endif
+       if (!first)
+               return;
+       SCHED_WARN_ON(se->sched_delayed);
+       if (hrtick_enabled_fair(rq))
+               hrtick_start_fair(rq, p);
+       update_misfit_status(p, rq);
+       sched_fair_update_stop_tick(rq, p);
+ }
+ /*
+  * Account for a task changing its policy or group.
+  *
+  * This routine is mostly called to set cfs_rq->curr field when a task
+  * migrates between groups/classes.
+  */
+ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
+ {
+       struct sched_entity *se = &p->se;
  
        for_each_sched_entity(se) {
                struct cfs_rq *cfs_rq = cfs_rq_of(se);
                /* ensure bandwidth has been allocated on our new cfs_rq */
                account_cfs_rq_runtime(cfs_rq, 0);
        }
+       __set_next_task_fair(rq, p, first);
  }
  
  void init_cfs_rq(struct cfs_rq *cfs_rq)
  {
        cfs_rq->tasks_timeline = RB_ROOT_CACHED;
-       u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
+       cfs_rq->min_vruntime = (u64)(-(1LL << 20));
  #ifdef CONFIG_SMP
        raw_spin_lock_init(&cfs_rq->removed.lock);
  #endif
@@@ -12983,28 -13366,35 +13366,35 @@@ void online_fair_sched_group(struct tas
  
  void unregister_fair_sched_group(struct task_group *tg)
  {
-       unsigned long flags;
-       struct rq *rq;
        int cpu;
  
        destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
  
        for_each_possible_cpu(cpu) {
-               if (tg->se[cpu])
-                       remove_entity_load_avg(tg->se[cpu]);
+               struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+               struct sched_entity *se = tg->se[cpu];
+               struct rq *rq = cpu_rq(cpu);
+               if (se) {
+                       if (se->sched_delayed) {
+                               guard(rq_lock_irqsave)(rq);
+                               if (se->sched_delayed) {
+                                       update_rq_clock(rq);
+                                       dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+                               }
+                               list_del_leaf_cfs_rq(cfs_rq);
+                       }
+                       remove_entity_load_avg(se);
+               }
  
                /*
                 * Only empty task groups can be destroyed; so we can speculatively
                 * check on_list without danger of it being re-added.
                 */
-               if (!tg->cfs_rq[cpu]->on_list)
-                       continue;
-               rq = cpu_rq(cpu);
-               raw_spin_rq_lock_irqsave(rq, flags);
-               list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-               raw_spin_rq_unlock_irqrestore(rq, flags);
+               if (cfs_rq->on_list) {
+                       guard(rq_lock_irqsave)(rq);
+                       list_del_leaf_cfs_rq(cfs_rq);
+               }
        }
  }
  
@@@ -13194,13 -13584,13 +13584,13 @@@ DEFINE_SCHED_CLASS(fair) = 
  
        .wakeup_preempt         = check_preempt_wakeup_fair,
  
+       .pick_task              = pick_task_fair,
        .pick_next_task         = __pick_next_task_fair,
        .put_prev_task          = put_prev_task_fair,
        .set_next_task          = set_next_task_fair,
  
  #ifdef CONFIG_SMP
        .balance                = balance_fair,
-       .pick_task              = pick_task_fair,
        .select_task_rq         = select_task_rq_fair,
        .migrate_task_rq        = migrate_task_rq_fair,
  
diff --combined kernel/sched/syscalls.c
index 195d2f2834a9758ebc8c855336688d2e2520600e,c62acf509b748599c5dd92cdbbae6da510c40964..cb03c790c27a590d64c7c07acf51180331818393
@@@ -57,7 -57,7 +57,7 @@@ static int effective_prio(struct task_s
         * keep the priority unchanged. Otherwise, update priority
         * to the normal priority:
         */
-       if (!rt_prio(p->prio))
+       if (!rt_or_dl_prio(p->prio))
                return p->normal_prio;
        return p->prio;
  }
@@@ -258,107 -258,6 +258,6 @@@ int sched_core_idle_cpu(int cpu
  
  #endif
  
- #ifdef CONFIG_SMP
- /*
-  * This function computes an effective utilization for the given CPU, to be
-  * used for frequency selection given the linear relation: f = u * f_max.
-  *
-  * The scheduler tracks the following metrics:
-  *
-  *   cpu_util_{cfs,rt,dl,irq}()
-  *   cpu_bw_dl()
-  *
-  * Where the cfs,rt and dl util numbers are tracked with the same metric and
-  * synchronized windows and are thus directly comparable.
-  *
-  * The cfs,rt,dl utilization are the running times measured with rq->clock_task
-  * which excludes things like IRQ and steal-time. These latter are then accrued
-  * in the IRQ utilization.
-  *
-  * The DL bandwidth number OTOH is not a measured metric but a value computed
-  * based on the task model parameters and gives the minimal utilization
-  * required to meet deadlines.
-  */
- unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-                                unsigned long *min,
-                                unsigned long *max)
- {
-       unsigned long util, irq, scale;
-       struct rq *rq = cpu_rq(cpu);
-       scale = arch_scale_cpu_capacity(cpu);
-       /*
-        * Early check to see if IRQ/steal time saturates the CPU, can be
-        * because of inaccuracies in how we track these -- see
-        * update_irq_load_avg().
-        */
-       irq = cpu_util_irq(rq);
-       if (unlikely(irq >= scale)) {
-               if (min)
-                       *min = scale;
-               if (max)
-                       *max = scale;
-               return scale;
-       }
-       if (min) {
-               /*
-                * The minimum utilization returns the highest level between:
-                * - the computed DL bandwidth needed with the IRQ pressure which
-                *   steals time to the deadline task.
-                * - The minimum performance requirement for CFS and/or RT.
-                */
-               *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-               /*
-                * When an RT task is runnable and uclamp is not used, we must
-                * ensure that the task will run at maximum compute capacity.
-                */
-               if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
-                       *min = max(*min, scale);
-       }
-       /*
-        * Because the time spend on RT/DL tasks is visible as 'lost' time to
-        * CFS tasks and we use the same metric to track the effective
-        * utilization (PELT windows are synchronized) we can directly add them
-        * to obtain the CPU's actual utilization.
-        */
-       util = util_cfs + cpu_util_rt(rq);
-       util += cpu_util_dl(rq);
-       /*
-        * The maximum hint is a soft bandwidth requirement, which can be lower
-        * than the actual utilization because of uclamp_max requirements.
-        */
-       if (max)
-               *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-       if (util >= scale)
-               return scale;
-       /*
-        * There is still idle time; further improve the number by using the
-        * IRQ metric. Because IRQ/steal time is hidden from the task clock we
-        * need to scale the task numbers:
-        *
-        *              max - irq
-        *   U' = irq + --------- * U
-        *                 max
-        */
-       util = scale_irq_capacity(util, irq, scale);
-       util += irq;
-       return min(scale, util);
- }
- unsigned long sched_cpu_util(int cpu)
- {
-       return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
- }
- #endif /* CONFIG_SMP */
  /**
   * find_process_by_pid - find a process with a matching PID value.
   * @pid: the pid in question.
@@@ -401,19 -300,21 +300,29 @@@ static void __setscheduler_params(struc
  
        p->policy = policy;
  
-       if (dl_policy(policy))
+       if (dl_policy(policy)) {
                __setparam_dl(p, attr);
-       else if (fair_policy(policy))
+       } else if (fair_policy(policy)) {
                p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+               if (attr->sched_runtime) {
+                       p->se.custom_slice = 1;
+                       p->se.slice = clamp_t(u64, attr->sched_runtime,
+                                             NSEC_PER_MSEC/10,   /* HZ=1000 * 10 */
+                                             NSEC_PER_MSEC*100); /* HZ=100  / 10 */
+               } else {
+                       p->se.custom_slice = 0;
+                       p->se.slice = sysctl_sched_base_slice;
+               }
+       }
  
-       if (task_is_realtime(p)) {
 +      /* rt-policy tasks do not have a timerslack */
++      if (rt_or_dl_task_policy(p)) {
 +              p->timer_slack_ns = 0;
 +      } else if (p->timer_slack_ns == 0) {
 +              /* when switching back to non-rt policy, restore timerslack */
 +              p->timer_slack_ns = p->default_timer_slack_ns;
 +      }
 +
        /*
         * __sched_setscheduler() ensures attr->sched_priority == 0 when
         * !rt_policy. Always setting this ensures that things like
@@@ -708,7 -609,9 +617,9 @@@ recheck
         * but store a possible modification of reset_on_fork.
         */
        if (unlikely(policy == p->policy)) {
-               if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+               if (fair_policy(policy) &&
+                   (attr->sched_nice != task_nice(p) ||
+                    (attr->sched_runtime != p->se.slice)))
                        goto change;
                if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
                        goto change;
@@@ -854,6 -757,9 +765,9 @@@ static int _sched_setscheduler(struct t
                .sched_nice     = PRIO_TO_NICE(p->static_prio),
        };
  
+       if (p->se.custom_slice)
+               attr.sched_runtime = p->se.slice;
        /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
        if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
                attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
@@@ -1020,12 -926,14 +934,14 @@@ err_size
  
  static void get_params(struct task_struct *p, struct sched_attr *attr)
  {
-       if (task_has_dl_policy(p))
+       if (task_has_dl_policy(p)) {
                __getparam_dl(p, attr);
-       else if (task_has_rt_policy(p))
+       } else if (task_has_rt_policy(p)) {
                attr->sched_priority = p->rt_priority;
-       else
+       } else {
                attr->sched_nice = task_nice(p);
+               attr->sched_runtime = p->se.slice;
+       }
  }
  
  /**
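
The syscalls.c hunks above let a SCHED_OTHER task suggest its EEVDF slice through sched_attr::sched_runtime, clamped to roughly 0.1ms..100ms, and get_params() reports the effective value back. A hedged user-space sketch; the struct layout mirrors the sched_setattr(2) UAPI description, and the raw syscalls are used because glibc provides no wrapper:

/* Set a 2ms EEVDF slice hint on the calling SCHED_OTHER task and read it back. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;   /* for SCHED_OTHER: requested slice, ns */
        uint64_t sched_deadline;
        uint64_t sched_period;
        uint32_t sched_util_min;
        uint32_t sched_util_max;
};

int main(void)
{
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.sched_policy = SCHED_OTHER;
        attr.sched_runtime = 2000000;     /* 2ms, within the 0.1ms..100ms clamp */

        if (syscall(SYS_sched_setattr, 0, &attr, 0))
                perror("sched_setattr");

        memset(&attr, 0, sizeof(attr));
        if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0))
                perror("sched_getattr");

        printf("effective slice: %llu ns\n",
               (unsigned long long)attr.sched_runtime);
        return 0;
}

Passing sched_runtime = 0 in a later sched_setattr() call clears the custom slice and falls back to sysctl_sched_base_slice, as the __setscheduler_params() hunk above shows.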
diff --combined kernel/sys.c
index e3c4cffb520ceec414ff20cd769a2a6bfe19df2c,3a2df1bd9f640ebbd63ceda94dd8cc1c51018f1b..b7e096e1c3a13d808731f0f4e55a16a5524dcb81
@@@ -2557,8 -2557,6 +2557,8 @@@ SYSCALL_DEFINE5(prctl, int, option, uns
                        error = current->timer_slack_ns;
                break;
        case PR_SET_TIMERSLACK:
-               if (task_is_realtime(current))
++              if (rt_or_dl_task_policy(current))
 +                      break;
                if (arg2 <= 0)
                        current->timer_slack_ns =
                                        current->default_timer_slack_ns;
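
The prctl() change above makes PR_SET_TIMERSLACK a silent no-op for RT- and DL-policy tasks, whose slack is pinned to zero by the scheduler-side hunk earlier in this diff. A small probe of the behaviour from user space; the printed values simply reflect whatever the running kernel decides:

/* Probe PR_SET_TIMERSLACK / PR_GET_TIMERSLACK for the current task. */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
        /* Request 50us of timer slack; ignored if this task is RT/DL policy. */
        if (prctl(PR_SET_TIMERSLACK, 50000UL, 0, 0, 0))
                perror("PR_SET_TIMERSLACK");

        long slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);
        printf("timer slack now: %ld ns\n", slack);
        return 0;
}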
diff --combined kernel/time/hrtimer.c
index 12eb40d6290eec61632dd7cdcf921c912f9ce568,f4be3abbb47b3644713370277d96ddb0b9d0628c..cddcd08ea827f9d9b4f3dec52f94472035baf363
@@@ -1177,7 -1177,7 +1177,7 @@@ static inline ktime_t hrtimer_update_lo
        /*
         * CONFIG_TIME_LOW_RES indicates that the system has no way to return
         * granular time values. For relative timers we add hrtimer_resolution
 -       * (i.e. one jiffie) to prevent short timeouts.
 +       * (i.e. one jiffy) to prevent short timeouts.
         */
        timer->is_rel = mode & HRTIMER_MODE_REL;
        if (timer->is_rel)
@@@ -1351,13 -1351,11 +1351,13 @@@ static void hrtimer_cpu_base_init_expir
  }
  
  static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
 +      __acquires(&base->softirq_expiry_lock)
  {
        spin_lock(&base->softirq_expiry_lock);
  }
  
  static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
 +      __releases(&base->softirq_expiry_lock)
  {
        spin_unlock(&base->softirq_expiry_lock);
  }
@@@ -1759,7 -1757,7 +1759,7 @@@ static void __hrtimer_run_queues(struc
        }
  }
  
 -static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
 +static __latent_entropy void hrtimer_run_softirq(void)
  {
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
@@@ -1977,7 -1975,7 +1977,7 @@@ static void __hrtimer_init_sleeper(stru
         * expiry.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
-               if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
+               if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT))
                        mode |= HRTIMER_MODE_HARD;
        }
  
@@@ -2074,9 -2072,14 +2074,9 @@@ long hrtimer_nanosleep(ktime_t rqtp, co
        struct restart_block *restart;
        struct hrtimer_sleeper t;
        int ret = 0;
 -      u64 slack;
 -
 -      slack = current->timer_slack_ns;
 -      if (rt_or_dl_task(current))
 -              slack = 0;
  
        hrtimer_init_sleeper_on_stack(&t, clockid, mode);
 -      hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
 +      hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
        ret = do_nanosleep(&t, mode);
        if (ret != -ERESTART_RESTARTBLOCK)
                goto out;
@@@ -2246,7 -2249,7 +2246,7 @@@ void __init hrtimers_init(void
  /**
   * schedule_hrtimeout_range_clock - sleep until timeout
   * @expires:  timeout value (ktime_t)
 - * @delta:    slack in expires timeout (ktime_t) for SCHED_OTHER tasks
 + * @delta:    slack in expires timeout (ktime_t)
   * @mode:     timer mode
   * @clock_id: timer clock to be used
   */
@@@ -2273,6 -2276,13 +2273,6 @@@ schedule_hrtimeout_range_clock(ktime_t 
                return -EINTR;
        }
  
 -      /*
 -       * Override any slack passed by the user if under
 -       * rt contraints.
 -       */
 -      if (rt_or_dl_task(current))
 -              delta = 0;
 -
        hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
        hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
        hrtimer_sleeper_start_expires(&t, mode);
@@@ -2292,7 -2302,7 +2292,7 @@@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_ra
  /**
   * schedule_hrtimeout_range - sleep until timeout
   * @expires:  timeout value (ktime_t)
 - * @delta:    slack in expires timeout (ktime_t) for SCHED_OTHER tasks
 + * @delta:    slack in expires timeout (ktime_t)
   * @mode:     timer mode
   *
   * Make the current task sleep until the given expiry time has
diff --combined mm/page_alloc.c
index 91ace8ca97e21f028f8eaf94520183fad0309e08,36f8abde37510373b387f9abefec613eb092845f..0aefae4a26b207e6421b64b169362fa7cf0396d1
@@@ -287,7 -287,7 +287,7 @@@ EXPORT_SYMBOL(nr_online_nodes)
  
  static bool page_contains_unaccepted(struct page *page, unsigned int order);
  static void accept_page(struct page *page, unsigned int order);
 -static bool try_to_accept_memory(struct zone *zone, unsigned int order);
 +static bool cond_accept_memory(struct zone *zone, unsigned int order);
  static inline bool has_unaccepted_memory(void);
  static bool __free_unaccepted(struct page *page);
  
@@@ -1054,13 -1054,6 +1054,13 @@@ __always_inline bool free_pages_prepare
                reset_page_owner(page, order);
                page_table_check_free(page, order);
                pgalloc_tag_sub(page, 1 << order);
 +
 +              /*
 +               * The page is isolated and accounted for.
 +               * Mark the codetag as empty to avoid accounting error
 +               * when the page is freed by unpoison_memory().
 +               */
 +              clear_page_tag_ref(page);
                return false;
        }
  
@@@ -3079,6 -3072,9 +3079,6 @@@ static inline long __zone_watermark_unu
        if (!(alloc_flags & ALLOC_CMA))
                unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
  #endif
 -#ifdef CONFIG_UNACCEPTED_MEMORY
 -      unusable_free += zone_page_state(z, NR_UNACCEPTED);
 -#endif
  
        return unusable_free;
  }
@@@ -3372,8 -3368,6 +3372,8 @@@ retry
                        }
                }
  
 +              cond_accept_memory(zone, order);
 +
                /*
                 * Detect whether the number of free pages is below high
                 * watermark.  If so, we will decrease pcp->high and free
@@@ -3399,8 -3393,10 +3399,8 @@@ check_alloc_wmark
                                       gfp_mask)) {
                        int ret;
  
 -                      if (has_unaccepted_memory()) {
 -                              if (try_to_accept_memory(zone, order))
 -                                      goto try_this_zone;
 -                      }
 +                      if (cond_accept_memory(zone, order))
 +                              goto try_this_zone;
  
  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
                        /*
@@@ -3454,8 -3450,10 +3454,8 @@@ try_this_zone
  
                        return page;
                } else {
 -                      if (has_unaccepted_memory()) {
 -                              if (try_to_accept_memory(zone, order))
 -                                      goto try_this_zone;
 -                      }
 +                      if (cond_accept_memory(zone, order))
 +                              goto try_this_zone;
  
  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
                        /* Try again if zone has deferred pages */
@@@ -4004,7 -4002,7 +4004,7 @@@ gfp_to_alloc_flags(gfp_t gfp_mask, unsi
                 */
                if (alloc_flags & ALLOC_MIN_RESERVE)
                        alloc_flags &= ~ALLOC_CPUSET;
-       } else if (unlikely(rt_task(current)) && in_task())
+       } else if (unlikely(rt_or_dl_task(current)) && in_task())
                alloc_flags |= ALLOC_MIN_RESERVE;
  
        alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
@@@ -5757,6 -5755,7 +5757,6 @@@ void __init setup_per_cpu_pageset(void
        for_each_online_pgdat(pgdat)
                pgdat->per_cpu_nodestats =
                        alloc_percpu(struct per_cpu_nodestat);
 -      store_early_perpage_metadata();
  }
  
  __meminit void zone_pcp_init(struct zone *zone)
@@@ -5822,7 -5821,14 +5822,7 @@@ unsigned long free_reserved_area(void *
  
  void free_reserved_page(struct page *page)
  {
 -      if (mem_alloc_profiling_enabled()) {
 -              union codetag_ref *ref = get_page_tag_ref(page);
 -
 -              if (ref) {
 -                      set_codetag_empty(ref);
 -                      put_page_tag_ref(ref);
 -              }
 -      }
 +      clear_page_tag_ref(page);
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
@@@ -6945,6 -6951,9 +6945,6 @@@ static bool try_to_accept_memory_one(st
        struct page *page;
        bool last;
  
 -      if (list_empty(&zone->unaccepted_pages))
 -              return false;
 -
        spin_lock_irqsave(&zone->lock, flags);
        page = list_first_entry_or_null(&zone->unaccepted_pages,
                                        struct page, lru);
        return true;
  }
  
 -static bool try_to_accept_memory(struct zone *zone, unsigned int order)
 +static bool cond_accept_memory(struct zone *zone, unsigned int order)
  {
        long to_accept;
 -      int ret = false;
 +      bool ret = false;
 +
 +      if (!has_unaccepted_memory())
 +              return false;
 +
 +      if (list_empty(&zone->unaccepted_pages))
 +              return false;
  
        /* How much to accept to get to high watermark? */
        to_accept = high_wmark_pages(zone) -
                    (zone_page_state(zone, NR_FREE_PAGES) -
 -                  __zone_watermark_unusable_free(zone, order, 0));
 +                  __zone_watermark_unusable_free(zone, order, 0) -
 +                  zone_page_state(zone, NR_UNACCEPTED));
  
 -      /* Accept at least one page */
 -      do {
 +      while (to_accept > 0) {
                if (!try_to_accept_memory_one(zone))
                        break;
                ret = true;
                to_accept -= MAX_ORDER_NR_PAGES;
 -      } while (to_accept > 0);
 +      }
  
        return ret;
  }
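
The reworked calculation above accepts memory only while the usable, already-accepted free pages sit below the high watermark, now explicitly subtracting NR_UNACCEPTED instead of folding it into the unusable estimate. A toy version of that arithmetic with invented page counts:

/* Toy version of the "how much to accept" calculation (numbers invented). */
#include <stdio.h>

int main(void)
{
        long high_wmark = 10000;  /* pages */
        long free_pages = 12000;
        long unusable   = 1500;   /* reserves, CMA, ... */
        long unaccepted = 4000;
        long chunk      = 1024;   /* stand-in for MAX_ORDER_NR_PAGES */

        long to_accept = high_wmark - (free_pages - unusable - unaccepted);
        long accepted = 0;

        while (to_accept > 0) {
                accepted += chunk;
                to_accept -= chunk;
        }
        printf("accepted %ld pages to reach the high watermark\n", accepted);
        return 0;
}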
@@@ -7035,7 -7038,7 +7035,7 @@@ static void accept_page(struct page *pa
  {
  }
  
 -static bool try_to_accept_memory(struct zone *zone, unsigned int order)
 +static bool cond_accept_memory(struct zone *zone, unsigned int order)
  {
        return false;
  }