tools/sched_ext/scx_simple.bpf.c

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * A simple scheduler.
   4  *
   5  * By default, it operates as a simple global weighted vtime scheduler and can
   6  * be switched to FIFO scheduling. It also demonstrates the following niceties.
   7  *
   8  * - Statistics tracking how many tasks are queued to local and global dsq's.
   9  * - Termination notification for userspace.
  10  *
  11  * While very simple, this scheduler should work reasonably well on CPUs with a
  12  * uniform L3 cache topology. While preemption is not implemented, the fact that
  13  * the scheduling queue is shared across all CPUs means that whatever is at the
   14  * front of the queue is likely to be executed fairly quickly given a sufficient
  15  * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads
  16  * but comes with the usual problems with FIFO scheduling where saturating
  17  * threads can easily drown out interactive ones.
  18  *
  19  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
  20  * Copyright (c) 2022 Tejun Heo <[email protected]>
  21  * Copyright (c) 2022 David Vernet <[email protected]>
  22  */
  23 #include <scx/common.bpf.h>
  24
  25 char _license[] SEC("license") = "GPL";
  26
  27 const volatile bool fifo_sched;
  28
  29 static u64 vtime_now;
  30 UEI_DEFINE(uei);
  31
  32 /*
  33  * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues
  34  * (meaning, cannot be dispatched to with scx_bpf_dsq_insert_vtime()). We
  35  * therefore create a separate DSQ with ID 0 that we dispatch to and consume
  36  * from. If scx_simple only supported global FIFO scheduling, then we could just
  37  * use SCX_DSQ_GLOBAL.
  38  */
  39 #define SHARED_DSQ 0
  40
  41 struct {
  42         __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
  43         __uint(key_size, sizeof(u32));
  44         __uint(value_size, sizeof(u64));
  45         __uint(max_entries, 2);                 /* [local, global] */
  46 } stats SEC(".maps");
  47
  48 static void stat_inc(u32 idx)
  49 {
  50         u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
  51         if (cnt_p)
  52                 (*cnt_p)++;
  53 }
  54
  55 static inline bool vtime_before(u64 a, u64 b)
  56 {
  57         return (s64)(a - b) < 0;
  58 }
  59
  60 s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
  61 {
  62         bool is_idle = false;
  63         s32 cpu;
  64
  65         cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
  66         if (is_idle) {
  67                 stat_inc(0);    /* count local queueing */
  68                 scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
  69         }
  70
  71         return cpu;
  72 }
  73
  74 void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
  75 {
  76         stat_inc(1);    /* count global queueing */
  77
  78         if (fifo_sched) {
  79                 scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
  80         } else {
  81                 u64 vtime = p->scx.dsq_vtime;
  82
  83                 /*
  84                  * Limit the amount of budget that an idling task can accumulate
  85                  * to one slice.
  86                  */
  87                 if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
  88                         vtime = vtime_now - SCX_SLICE_DFL;
  89
  90                 scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
  91                                          enq_flags);
  92         }
  93 }
  94
  95 void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
  96 {
  97         scx_bpf_dsq_move_to_local(SHARED_DSQ);
  98 }
  99
 100 void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
 101 {
 102         if (fifo_sched)
 103                 return;
 104
 105         /*
 106          * Global vtime always progresses forward as tasks start executing. The
 107          * test and update can be performed concurrently from multiple CPUs and
 108          * thus racy. Any error should be contained and temporary. Let's just
 109          * live with it.
 110          */
 111         if (vtime_before(vtime_now, p->scx.dsq_vtime))
 112                 vtime_now = p->scx.dsq_vtime;
 113 }
 114
 115 void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
 116 {
 117         if (fifo_sched)
 118                 return;
 119
 120         /*
 121          * Scale the execution time by the inverse of the weight and charge.
 122          *
 123          * Note that the default yield implementation yields by setting
 124          * @p->scx.slice to zero and the following would treat the yielding task
 125          * as if it has consumed all its slice. If this penalizes yielding tasks
 126          * too much, determine the execution time by taking explicit timestamps
 127          * instead of depending on @p->scx.slice.
 128          */
 129         p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
 130 }
 131
 132 void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
 133 {
 134         p->scx.dsq_vtime = vtime_now;
 135 }
 136
 137 s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
 138 {
 139         return scx_bpf_create_dsq(SHARED_DSQ, -1);
 140 }
 141
 142 void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
 143 {
 144         UEI_RECORD(uei, ei);
 145 }
 146
 147 SCX_OPS_DEFINE(simple_ops,
 148                .select_cpu              = (void *)simple_select_cpu,
 149                .enqueue                 = (void *)simple_enqueue,
 150                .dispatch                = (void *)simple_dispatch,
 151                .running                 = (void *)simple_running,
 152                .stopping                = (void *)simple_stopping,
 153                .enable                  = (void *)simple_enable,
 154                .init                    = (void *)simple_init,
 155                .exit                    = (void *)simple_exit,
 156                .name                    = "simple");